1
13
14 package gate.creole;
15
16 import java.text.NumberFormat;
17 import java.util.*;
18
19 import gate.*;
20 import gate.util.GateRuntimeException;
21 import gate.util.OffsetComparator;
22
25 public class POSTagger extends AbstractLanguageAnalyser {
26
27 public static final String
28 TAG_DOCUMENT_PARAMETER_NAME = "document";
29
30 public static final String
31 TAG_INPUT_AS_PARAMETER_NAME = "inputASName";
32
33 public static final String
34 TAG_LEXICON_URL_PARAMETER_NAME = "lexiconURL";
35
36 public static final String
37 TAG_RULES_URL_PARAMETER_NAME = "rulesURL";
38
39 public static final String
40 TAG_ENCODING_PARAMETER_NAME = "encoding";
41
42
43 public static final String
44 BASE_TOKEN_ANNOTATION_TYPE_PARAMETER_NAME = "baseTokenAnnotationType";
45
46 public static final String
47 OUTPUT_ANNOTATION_TYPE_PARAMETER_NAME = "outputAnnotationType";
48
49 public static final String
50 BASE_SENTENCE_ANNOTATION_TYPE_PARAMETER_NAME = "baseSentenceAnnotationType";
51
52 public static final String
53 TAG_OUTPUT_AS_PARAMETER_NAME = "outputASName";
54
55 public POSTagger() {
56 }
57
58 public Resource init()throws ResourceInstantiationException{
59 if(lexiconURL == null){
60 throw new ResourceInstantiationException(
61 "NoURL provided for the lexicon!");
62 }
63 if(rulesURL == null){
64 throw new ResourceInstantiationException(
65 "No URL provided for the rules!");
66 }
67 try{
68 tagger = new hepple.postag.POSTagger(lexiconURL,rulesURL, encoding);
69 }catch(Exception e){
70 throw new ResourceInstantiationException(e);
71 }
72 return this;
73 }
74
75
76 public void execute() throws ExecutionException{
77 try{
78 if(document == null) throw new GateRuntimeException(
80 "No document to process!");
81 if(inputASName != null && inputASName.equals("")) inputASName = null;
82 AnnotationSet inputAS = (inputASName == null) ?
83 document.getAnnotations() :
84 document.getAnnotations(inputASName);
85
86
87
88 if(baseTokenAnnotationType == null || baseTokenAnnotationType.trim().length()==0) {
89 throw new GateRuntimeException("No base Token Annotation Type provided!");
90 }
91
92 if(outputASName != null && outputASName.equals("")) outputASName = null;
93 AnnotationSet outputAS = (outputASName == null) ?
94 document.getAnnotations() :
95 document.getAnnotations(outputASName);
96
97 if(baseSentenceAnnotationType == null || baseSentenceAnnotationType.trim().length()==0) {
98 throw new GateRuntimeException("No base Sentence Annotation Type provided!");
99 }
100
101 if(outputAnnotationType == null || outputAnnotationType.trim().length()==0) {
102 throw new GateRuntimeException("No AnnotationType provided to store the new feature!");
103 }
104
105
106
107 AnnotationSet sentencesAS = inputAS.get(baseSentenceAnnotationType);
108 AnnotationSet tokensAS = inputAS.get(baseTokenAnnotationType);
109 if(sentencesAS != null && sentencesAS.size() > 0
110 && tokensAS != null && tokensAS.size() > 0){
111 long startTime = System.currentTimeMillis();
112 fireStatusChanged("POS tagging " + document.getName());
113 fireProgressChanged(0);
114 List sentenceForTagger = new ArrayList();
116 List sentencesForTagger = new ArrayList(1);
117 sentencesForTagger.add(sentenceForTagger);
118
119 Comparator offsetComparator = new OffsetComparator();
121
122 List sentencesList = new ArrayList(sentencesAS);
124 Collections.sort(sentencesList, offsetComparator);
125 List tokensList = new ArrayList(tokensAS);
126 Collections.sort(tokensList, offsetComparator);
127
128 Iterator sentencesIter = sentencesList.iterator();
129 ListIterator tokensIter = tokensList.listIterator();
130
131 List tokensInCurrentSentence = new ArrayList();
132 Annotation currentToken = (Annotation)tokensIter.next();
133 int sentIndex = 0;
134 int sentCnt = sentencesAS.size();
135 while(sentencesIter.hasNext()){
136 Annotation currentSentence = (Annotation)sentencesIter.next();
137 tokensInCurrentSentence.clear();
138 sentenceForTagger.clear();
139 while(currentToken != null
140 &&
141 currentToken.getEndNode().getOffset().compareTo(
142 currentSentence.getEndNode().getOffset()) <= 0){
143 tokensInCurrentSentence.add(currentToken);
144 sentenceForTagger.add(currentToken.getFeatures().
145 get(TOKEN_STRING_FEATURE_NAME));
146 currentToken = (Annotation)(tokensIter.hasNext() ?
147 tokensIter.next() : null);
148 }
149 List taggerList = tagger.runTagger(sentencesForTagger);
151 if(taggerList != null && taggerList.size() > 0){
152 List taggerResults = (List) taggerList.get(0);
153 if(taggerResults.size() != tokensInCurrentSentence.size())
156 throw new GateRuntimeException(
157 "POS Tagger malfunction: the output size (" +
158 taggerResults.size() +
159 ") is different from the input size (" +
160 tokensInCurrentSentence.size() + ")!");
161 Iterator resIter = taggerResults.iterator();
162 Iterator tokIter = tokensInCurrentSentence.iterator();
163 while(resIter.hasNext()){
164
165 Annotation annot = (Annotation) tokIter.next();
166 addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String[])resIter.next())[1]);
167
168 }
169 }
170 fireProgressChanged(sentIndex++ * 100 / sentCnt);
171 }
173 if(currentToken != null){
174 tokensInCurrentSentence.clear();
176 sentenceForTagger.clear();
177 while(currentToken != null){
178 tokensInCurrentSentence.add(currentToken);
179 sentenceForTagger.add(currentToken.getFeatures().
180 get(TOKEN_STRING_FEATURE_NAME));
181 currentToken = (Annotation)(tokensIter.hasNext() ?
182 tokensIter.next() : null);
183 }
184 List taggerResults = (List)tagger.runTagger(sentencesForTagger).get(0);
186 if(taggerResults.size() != tokensInCurrentSentence.size())
189 throw new GateRuntimeException(
190 "POS Tagger malfunction: the output size (" +
191 taggerResults.size() +
192 ") is different from the input size (" +
193 tokensInCurrentSentence.size() + ")!");
194 Iterator resIter = taggerResults.iterator();
195 Iterator tokIter = tokensInCurrentSentence.iterator();
196 while(resIter.hasNext()){
197
198 Annotation annot = (Annotation) tokIter.next();
199 addFeatures(annot, TOKEN_CATEGORY_FEATURE_NAME, ((String[])resIter.next())[1]);
200
201 }
202 } fireProcessFinished();
204 fireStatusChanged(
205 document.getName() + " tagged in " +
206 NumberFormat.getInstance().format(
207 (double)(System.currentTimeMillis() - startTime) / 1000) +
208 " seconds!");
209 }else{
210 throw new GateRuntimeException("No sentences or tokens to process!\n" +
211 "Please run a sentence splitter "+
212 "and tokeniser first!");
213 }
214
215
291 }catch(Exception e){
292 throw new ExecutionException(e);
293 }
294 }
295
296
297 protected void addFeatures(Annotation annot, String featureName, String featureValue) throws GateRuntimeException {
298 String tempIASN = inputASName == null ? "" : inputASName;
299 String tempOASN = outputASName == null ? "" : outputASName;
300 if(outputAnnotationType.equals(baseTokenAnnotationType) && tempIASN.equals(tempOASN)) {
301 annot.getFeatures().put(featureName, featureValue);
302 return;
303 } else {
304 int start = annot.getStartNode().getOffset().intValue();
305 int end = annot.getEndNode().getOffset().intValue();
306
307 AnnotationSet outputAS = (outputASName == null) ?
309 document.getAnnotations() :
310 document.getAnnotations(outputASName);
311 AnnotationSet annotations = outputAS.get(outputAnnotationType);
312 if(annotations == null || annotations.size() == 0) {
313 FeatureMap features = Factory.newFeatureMap();
315 features.put(featureName, featureValue);
316 try {
317 outputAS.add(new Long(start), new Long(end), outputAnnotationType, features);
318 } catch(Exception e) {
319 throw new GateRuntimeException("Invalid Offsets");
320 }
321 } else {
322 ArrayList tempList = new ArrayList(annotations.get());
324 boolean found = false;
325 for(int i=0;i<tempList.size();i++) {
326 Annotation annotation = (Annotation) tempList.get(i);
327 if(annotation.getStartNode().getOffset().intValue() == start && annotation.getEndNode().getOffset().intValue() == end) {
328 annotation.getFeatures().put(featureName, featureValue);
330 found = true;
331 break;
332 }
333 }
334
335 if(!found) {
336 FeatureMap features = Factory.newFeatureMap();
338 features.put(featureName, featureValue);
339 try {
340 outputAS.add(new Long(start), new Long(end), outputAnnotationType, features);
341 } catch(Exception e) {
342 throw new GateRuntimeException("Invalid Offsets");
343 }
344 }
345 }
346 }
347 }
348
349 public void setLexiconURL(java.net.URL newLexiconURL) {
350 lexiconURL = newLexiconURL;
351 }
352 public java.net.URL getLexiconURL() {
353 return lexiconURL;
354 }
355 public void setRulesURL(java.net.URL newRulesURL) {
356 rulesURL = newRulesURL;
357 }
358 public void setEncoding(String encoding) {
359 this.encoding = encoding;
360 }
361
362 public java.net.URL getRulesURL() {
363 return rulesURL;
364 }
365 public void setInputASName(String newInputASName) {
366 inputASName = newInputASName;
367 }
368 public String getInputASName() {
369 return inputASName;
370 }
371 public String getEncoding() {
372 return this.encoding;
373 }
374
375 public String getBaseTokenAnnotationType() {
376 return this.baseTokenAnnotationType;
377 }
378
379 public String getBaseSentenceAnnotationType() {
380 return this.baseSentenceAnnotationType;
381 }
382
383 public String getOutputAnnotationType() {
384 return this.outputAnnotationType;
385 }
386
387 public void setBaseTokenAnnotationType(String baseTokenAnnotationType) {
388 this.baseTokenAnnotationType = baseTokenAnnotationType;
389 }
390
391 public void setBaseSentenceAnnotationType(String baseSentenceAnnotationtype) {
392 this.baseSentenceAnnotationType = baseSentenceAnnotationtype;
393 }
394
395 public void setOutputAnnotationType(String outputAnnotationType) {
396 this.outputAnnotationType = outputAnnotationType;
397 }
398
399 public String getOutputASName() {
400 return this.outputASName;
401 }
402
403 public void setOutputASName(String outputASName) {
404 this.outputASName = outputASName;
405 }
406
407 protected hepple.postag.POSTagger tagger;
408 private java.net.URL lexiconURL;
409 private java.net.URL rulesURL;
410 private String inputASName;
411 private String encoding;
412
413 private String baseTokenAnnotationType;
414 private String baseSentenceAnnotationType;
415 private String outputAnnotationType;
416 private String outputASName;
417
418 }
419