1   /*
2    *  Copyright (c) 1998-2005, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: POSTagger.java,v 1.25 2005/04/26 16:27:25 niraj Exp $
12   */
13  
14  package gate.creole;
15  
16  import java.text.NumberFormat;
17  import java.util.*;
18  
19  import gate.*;
20  import gate.util.GateRuntimeException;
21  import gate.util.OffsetComparator;
22  /**
23   * This class is a wrapper for HepTag, Mark Hepple's POS tagger.
24   */
25  public class POSTagger extends AbstractLanguageAnalyser {
26  
27    public static final String
28      TAG_DOCUMENT_PARAMETER_NAME = "document";
29  
30    public static final String
31      TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
32  
33    public static final String
34      TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
35  
36    public static final String
37      TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
38  
39    public static final String
40        TAG_ENCODING_PARAMETER_NAME = "encoding";
41  
42    
43    public static final String
44      BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME = "baseTokenAnnotationType";
45  
46    public static final String
47    OUTPUT_ANNOTATION_TYPE_PARAMETER_NAME = "outputAnnotationType";
48    
49    public static final String
50    BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME = "baseSentenceAnnotationType";
51  
52    public static final String
53      TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";
54  
55    public POSTagger() {
56    }
57  
58    public Resource init()throws ResourceInstantiationException{
59      if(lexiconURL == null){
60        throw new ResourceInstantiationException(
61          "NoURL provided for the lexicon!");
62      }
63      if(rulesURL == null){
64        throw new ResourceInstantiationException(
65          "No URL provided for the rules!");
66      }
67      try{
68        tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL, encoding);
69      }catch(Exception e){
70        throw new ResourceInstantiationException(e);
71      }
72      return this;
73    }
74  
75  
76    public void execute() throws ExecutionException{
77      try{
78        //check the parameters
79        if(document == null) throw new GateRuntimeException(
80          "No document to process!");
81        if(inputASName != null && inputASName.equals("")) inputASName = null;
82        AnnotationSet inputAS = (inputASName == null) ?
83                                document.getAnnotations() :
84                                document.getAnnotations(inputASName);
85  
86        /* Addition by Niraj */
87                                
88        if(baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length()==0) {
89            throw new GateRuntimeException("No base Token Annotation Type provided!");
90        }
91  
92        if(outputASName != null && outputASName.equals("")) outputASName = null;
93        AnnotationSet outputAS = (outputASName == null) ?
94                                document.getAnnotations() :
95                                document.getAnnotations(outputASName);
96        
97        if(baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length()==0) {
98            throw new GateRuntimeException("No base Sentence Annotation Type provided!");
99        }
100       
101       if(outputAnnotationType == null || outputAnnotationType.trim().length()==0) {
102           throw new GateRuntimeException("No AnnotationType provided to store the new feature!");
103       }
104       
105       /* End of addition */
106       
107       AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
108       AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
109       if(sentencesAS != null && sentencesAS.size() > 0
110          && tokensAS != null && tokensAS.size() > 0){
111         long startTime = System.currentTimeMillis();
112         fireStatusChanged("POS tagging " + document.getName());
113         fireProgressChanged(0);
114         //prepare the input for HepTag
115         List sentenceForTagger = new ArrayList();
116         List sentencesForTagger = new ArrayList(1);
117         sentencesForTagger.add(sentenceForTagger);
118 
119         //define a comparator for annotations by start offset
120         Comparator offsetComparator = new OffsetComparator();
121 
122         //read all the tokens and all the sentences
123         List sentencesList = new ArrayList(sentencesAS);
124         Collections.sort(sentencesList, offsetComparator);
125         List tokensList = new ArrayList(tokensAS);
126         Collections.sort(tokensList, offsetComparator);
127 
128         Iterator sentencesIter = sentencesList.iterator();
129         ListIterator tokensIter = tokensList.listIterator();
130 
131         List tokensInCurrentSentence = new ArrayList();
132         Annotation currentToken = (Annotation)tokensIter.next();
133         int sentIndex = 0;
134         int sentCnt = sentencesAS.size();
135         while(sentencesIter.hasNext()){
136           Annotation currentSentence = (Annotation)sentencesIter.next();
137           tokensInCurrentSentence.clear();
138           sentenceForTagger.clear();
139           while(currentToken != null
140                 &&
141                 currentToken.getEndNode().getOffset().compareTo(
142                 currentSentence.getEndNode().getOffset()) <= 0){
143             tokensInCurrentSentence.add(currentToken);
144             sentenceForTagger.add(currentToken.getFeatures().
145                                   get(TOKEN_STRING_FEATURE_NAME));
146             currentToken = (Annotation)(tokensIter.hasNext() ?
147                                        tokensIter.next() : null);
148           }
149           //run the POS tagger
150           List taggerList = tagger.runTagger(sentencesForTagger);
151           if(taggerList != null && taggerList.size() > 0){
152             List taggerResults = (List) taggerList.get(0);
153             //add the results
154             //make sure no malfunction occurred
155             if(taggerResults.size() != tokensInCurrentSentence.size())
156               throw new GateRuntimeException(
157                   "POS Tagger malfunction: the output size (" +
158                   taggerResults.size() +
159                   ") is different from the input size (" +
160                   tokensInCurrentSentence.size() + ")!");
161             Iterator resIter = taggerResults.iterator();
162             Iterator tokIter = tokensInCurrentSentence.iterator();
163             while(resIter.hasNext()){
164                 /* Addition by Niraj */
165                 Annotation annot = (Annotation) tokIter.next();
166                 addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String[])resIter.next())[1]);
167                 /* End */
168             }
169           }
170           fireProgressChanged(sentIndex++ * 100 / sentCnt);
171         }//while(sentencesIter.hasNext())
172 
173         if(currentToken != null){
174           //we have remaining tokens after the last sentence
175           tokensInCurrentSentence.clear();
176           sentenceForTagger.clear();
177           while(currentToken != null){
178             tokensInCurrentSentence.add(currentToken);
179             sentenceForTagger.add(currentToken.getFeatures().
180                                   get(TOKEN_STRING_FEATURE_NAME));
181             currentToken = (Annotation)(tokensIter.hasNext() ?
182                                         tokensIter.next() : null);
183           }
184           //run the POS tagger
185           List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
186           //add the results
187           //make sure no malfunction accured
188           if(taggerResults.size() != tokensInCurrentSentence.size())
189             throw new GateRuntimeException(
190                 "POS Tagger malfunction: the output size (" +
191                 taggerResults.size() +
192                 ") is different from the input size (" +
193                 tokensInCurrentSentence.size() + ")!");
194           Iterator resIter = taggerResults.iterator();
195           Iterator tokIter = tokensInCurrentSentence.iterator();
196           while(resIter.hasNext()){
197               /* Addition by Niraj */
198               Annotation annot = (Annotation) tokIter.next();
199               addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String[])resIter.next())[1]);
200               /* End */
201           }
202         }//if(currentToken != null)
203         fireProcessFinished();
204         fireStatusChanged(
205           document.getName() + " tagged in " +
206           NumberFormat.getInstance().format(
207           (double)(System.currentTimeMillis() - startTime) / 1000) +
208           " seconds!");
209       }else{
210         throw new GateRuntimeException("No sentences or tokens to process!\n" +
211                                        "Please run a sentence splitter "+
212                                        "and tokeniser first!");
213       }
214 
215 //OLD version
216 /*
217       AnnotationSet as = inputAS.get(SENTENCE_ANNOTATION_TYPE);
218       if(as != null && as.size() > 0){
219         List sentences = new ArrayList(as);
220         Collections.sort(sentences, offsetComparator);
221         Iterator sentIter = sentences.iterator();
222         int sentIndex = 0;
223         int sentCnt = sentences.size();
224         long startTime= System.currentTimeMillis();
225         while(sentIter.hasNext()){
226 start = System.currentTimeMillis();
227           Annotation sentenceAnn = (Annotation)sentIter.next();
228           AnnotationSet rangeSet = inputAS.get(
229                                     sentenceAnn.getStartNode().getOffset(),
230                                     sentenceAnn.getEndNode().getOffset());
231           if(rangeSet == null) continue;
232           AnnotationSet tokensSet = rangeSet.get(TOKEN_ANNOTATION_TYPE);
233           if(tokensSet == null) continue;
234           List tokens = new ArrayList(tokensSet);
235           Collections.sort(tokens, offsetComparator);
236 
237 //          List tokens = (List)sentenceAnn.getFeatures().get("tokens");
238           List sentence = new ArrayList(tokens.size());
239           Iterator tokIter = tokens.iterator();
240           while(tokIter.hasNext()){
241             Annotation token = (Annotation)tokIter.next();
242             String text = (String)token.getFeatures().get(TOKEN_STRING_FEATURE_NAME);
243             sentence.add(text);
244           }//while(tokIter.hasNext())
245 
246           //run the POSTagger over this sentence
247           List sentences4tagger = new ArrayList(1);
248           sentences4tagger.add(sentence);
249 prepTime += System.currentTimeMillis() - start;
250 start = System.currentTimeMillis();
251           List taggerResults = tagger.runTagger(sentences4tagger);
252 posTime += System.currentTimeMillis() - start;
253 start = System.currentTimeMillis();
254           //add the results to the output annotation set
255           //we only get one sentence
256           List sentenceFromTagger = (List)taggerResults.get(0);
257           if(sentenceFromTagger.size() != sentence.size()){
258             String taggerResult = "";
259             for(int i = 0; i< sentenceFromTagger.size(); i++){
260               taggerResult += ((String[])sentenceFromTagger.get(i))[1] + ", ";
261             }
262             throw new GateRuntimeException(
263               "POS Tagger malfunction: the output size (" +
264               sentenceFromTagger.size() +
265               ") is different from the input size (" +
266               sentence.size() + ")!" +
267               "\n Input: " + sentence + "\nOutput: " + taggerResult);
268           }
269           for(int i = 0; i< sentence.size(); i++){
270             String category = ((String[])sentenceFromTagger.get(i))[1];
271             Annotation token = (Annotation)tokens.get(i);
272             token.getFeatures().
273               put(TOKEN_CATEGORY_FEATURE_NAME, category);
274           }//for(i = 0; i<= sentence.size(); i++)
275 postTime += System.currentTimeMillis() - start;
276           fireProgressChanged(sentIndex++ * 100 / sentCnt);
277         }//while(sentIter.hasNext())
278 Out.prln("POS preparation time:" + prepTime);
279 Out.prln("POS execution time:" + posTime);
280 Out.prln("POS after execution time:" + postTime);
281           fireProcessFinished();
282           long endTime = System.currentTimeMillis();
283           fireStatusChanged(document.getName() + " tagged in " +
284                           NumberFormat.getInstance().format(
285                           (double)(endTime - startTime) / 1000) + " seconds!");
286       }else{
287         throw new GateRuntimeException("No sentences to process!\n" +
288                                        "Please run a sentence splitter first!");
289       }//if(as != null && as.size() > 0)
290 */
291     }catch(Exception e){
292       throw new ExecutionException(e);
293     }
294   }
295 
296 
297   protected void addFeatures(Annotation annot, String featureName, String featureValue) throws GateRuntimeException {
298       String tempIASN = inputASName == null ? "" : inputASName;
299       String tempOASN = outputASName == null ? "" : outputASName;
300       if(outputAnnotationType.equals(baseTokenAnnotationType) && tempIASN.equals(tempOASN)) {
301           annot.getFeatures().put(featureName, featureValue);
302           return;
303       } else {
304           int start = annot.getStartNode().getOffset().intValue();
305           int end = annot.getEndNode().getOffset().intValue();
306           
307           // get the annotations of type outputAnnotationType
308           AnnotationSet outputAS = (outputASName == null) ?
309                   document.getAnnotations() :
310                   document.getAnnotations(outputASName);
311           AnnotationSet annotations = outputAS.get(outputAnnotationType);
312           if(annotations == null || annotations.size() == 0) {
313               // add new annotation
314               FeatureMap features = Factory.newFeatureMap();
315               features.put(featureName, featureValue);
316               try {
317                   outputAS.add(new Long(start), new Long(end), outputAnnotationType, features);
318               } catch(Exception e) {
319                   throw new GateRuntimeException("Invalid Offsets");
320               }
321           } else {
322               // search for the annotation if there is one with the same start and end offsets
323               ArrayList tempList = new ArrayList(annotations.get());
324               boolean found = false;
325               for(int i=0;i<tempList.size();i++) {
326                   Annotation annotation = (Annotation) tempList.get(i);
327                   if(annotation.getStartNode().getOffset().intValue() == start && annotation.getEndNode().getOffset().intValue() == end) {
328                       // this is the one
329                       annotation.getFeatures().put(featureName, featureValue);
330                       found = true;
331                       break;
332                   }
333               }
334               
335               if(!found) {
336                   // add new annotation
337                   FeatureMap features = Factory.newFeatureMap();
338                   features.put(featureName, featureValue);
339                   try {
340                       outputAS.add(new Long(start), new Long(end), outputAnnotationType, features);
341                   } catch(Exception e) {
342                       throw new GateRuntimeException("Invalid Offsets");
343                   }
344               }
345           }
346       }
347   }
348   
349   public void setLexiconURL(java.net.URL newLexiconURL) {
350     lexiconURL = newLexiconURL;
351   }
352   public java.net.URL getLexiconURL() {
353     return lexiconURL;
354   }
355   public void setRulesURL(java.net.URL newRulesURL) {
356     rulesURL = newRulesURL;
357   }
358   public void setEncoding(String encoding) {
359     this.encoding = encoding;
360   }
361 
362   public java.net.URL getRulesURL() {
363     return rulesURL;
364   }
365   public void setInputASName(String newInputASName) {
366     inputASName = newInputASName;
367   }
368   public String getInputASName() {
369     return inputASName;
370   }
371   public String getEncoding() {
372     return this.encoding;
373   }
374 
375   public String getBaseTokenAnnotationType() {
376       return this.baseTokenAnnotationType;
377   }
378   
379   public String getBaseSentenceAnnotationType() {
380       return this.baseSentenceAnnotationType;
381   }
382   
383   public String getOutputAnnotationType() {
384       return this.outputAnnotationType;
385   }
386   
387   public void setBaseTokenAnnotationType(String baseTokenAnnotationType) {
388       this.baseTokenAnnotationType = baseTokenAnnotationType;
389   }
390   
391   public void setBaseSentenceAnnotationType(String baseSentenceAnnotationtype) {
392       this.baseSentenceAnnotationType = baseSentenceAnnotationtype;
393   }
394   
395   public void setOutputAnnotationType(String outputAnnotationType) {
396       this.outputAnnotationType = outputAnnotationType;
397   }
398   
399   public String getOutputASName() {
400       return this.outputASName;
401   }
402   
403   public void setOutputASName(String outputASName) {
404       this.outputASName = outputASName;
405   }
406   
407   protected hepple.postag.POSTagger tagger;
408   private java.net.URL lexiconURL;
409   private java.net.URL rulesURL;
410   private String inputASName;
411   private String encoding;
412   /* Addition by Niraj */
413   private String baseTokenAnnotationType;
414   private String baseSentenceAnnotationType;
415   private String outputAnnotationType;
416   private String outputASName;
417   /* End of Addition */
418 }
419