1
16
17 package gate.creole.gazetteer;
18
19 import java.util.*;
20 import gate.util.*;
21 import gate.*;
22 import gate.creole.*;
23
24
41
42 public class FlexibleGazetteer
43 extends AbstractLanguageAnalyser
44 implements ProcessingResource {
45
46
49 public FlexibleGazetteer() {
50 changedNodes = new ArrayList();
51 }
52
53
56 public Resource init() throws ResourceInstantiationException {
57 if(gazetteerInst == null)
59 throw new ResourceInstantiationException("No Gazetteer Provided!");
60
61 return this;
62 }
63
64
68 public void execute() throws ExecutionException {
69 fireProgressChanged(0);
70 fireStatusChanged("Checking Document...");
71 if (document == null) {
72 throw new ExecutionException(
73 "No document to process!"
74 );
75 }
76
77 fireStatusChanged("Creating temporary Document...");
78 StringBuffer newdocString = new StringBuffer(document.getContent().toString());
79 Document tempDoc = null;
80 boolean chineseSplit = false;
81
82 if (inputFeatureNames == null || inputFeatureNames.size() == 0) {
83 inputFeatureNames = new ArrayList();
84 }
85
86 Iterator tokenIter = getTokenIterator(document, inputAnnotationSetName);
87 long totalDeductedSpaces = 0;
88 fireStatusChanged("Replacing contents with the feature value...");
89
90 outer:while (tokenIter != null && tokenIter.hasNext()) {
91 Annotation currentToken = (Annotation) tokenIter.next();
92
93 if (currentToken.getType().equals(ANNIEConstants.
96 SPACE_TOKEN_ANNOTATION_TYPE) &&
97 ( (String) (currentToken.getFeatures().get(ANNIEConstants.
98 TOKEN_KIND_FEATURE_NAME))).equals("ChineseSplit")) {
99
100 long startOffset = currentToken.getStartNode().getOffset().
102 longValue();
103
104 long newStartOffset = startOffset - totalDeductedSpaces;
107 long newEndOffset = newStartOffset + 1;
108 NodePosition newNode = new NodePosition(startOffset, startOffset,
109 newStartOffset, newEndOffset,
110 totalDeductedSpaces);
111 chineseSplit = true;
112
113 totalDeductedSpaces--;
115 changedNodes.add(newNode);
116 newdocString = newdocString.insert( (int) newStartOffset, ' ');
117 continue outer;
118 }
119
120 inner:for (int i = 0; i < inputFeatureNames.size(); i++) {
124 String[] keyVal = ( (String) (inputFeatureNames.get(i))).split("[.]");
125
126 if (keyVal.length == 2) {
127 if (currentToken.getType().equals(keyVal[0])) {
130 FeatureMap features = currentToken.getFeatures();
131 String newTokenValue = (String) (features.get(keyVal[1]));
132
133 if (newTokenValue == null) {
135 continue;
136
137 }
138 else {
139 long startOffset = currentToken.getStartNode().getOffset().
142 longValue();
143 long endOffset = currentToken.getEndNode().getOffset().
144 longValue();
145
146 String actualString = (String) (features.get(ANNIEConstants.
148 TOKEN_STRING_FEATURE_NAME));
149
150 if (actualString.equals(newTokenValue)) {
153 break inner;
155 }
156
157 long lengthDifference = actualString.length() -
160 newTokenValue.length();
161
162 long newStartOffset = startOffset - totalDeductedSpaces;
164 long newEndOffset = newStartOffset + newTokenValue.length();
165
166 NodePosition newNode = new NodePosition(startOffset,
168 endOffset,
169 newStartOffset, newEndOffset, totalDeductedSpaces);
170 changedNodes.add(newNode);
171 totalDeductedSpaces += lengthDifference;
174
175 newdocString = newdocString.replace( (int) newStartOffset,
178 (int) newStartOffset +
179 actualString.length(),
180 newTokenValue);
181 break inner;
182 }
183 }
184 }
185 }
186 }
187
188 fireStatusChanged("New Document to be processed with Gazetteer...");
189 try {
190 FeatureMap params = Factory.newFeatureMap();
191 params.put("stringContent", newdocString.toString());
192 FeatureMap features = Factory.newFeatureMap();
193 Gate.setHiddenAttribute(features, true);
194 tempDoc = (Document) Factory.createResource("gate.corpora.DocumentImpl",
195 params, features);
196 }
197 catch (ResourceInstantiationException rie) {
198 throw new ExecutionException("Temporary document cannot be created");
199 }
200
201 FeatureMap params = Factory.newFeatureMap();
203 gazetteerInst.setDocument(tempDoc);
204 gazetteerInst.setAnnotationSetName(this.outputAnnotationSetName);
205
206 fireStatusChanged("Executing Gazetteer...");
207 gazetteerInst.execute();
208
209 fireStatusChanged("Transfering new tags to the original one...");
212 Iterator tokensIter = getTokenIterator(tempDoc, outputAnnotationSetName);
213 AnnotationSet original = (outputAnnotationSetName == null) ?
214 document.getAnnotations() :
215 document.getAnnotations(outputAnnotationSetName);
216 long totalSpaceAdded = 0;
217 long difference = 0;
218
219 int foundNode = -1;
220 while (tokensIter != null && tokensIter.hasNext()) {
221 Annotation currentToken = (Annotation) (tokensIter.next());
222 long startOffset = currentToken.getStartNode().getOffset().longValue();
223 long endOffset = currentToken.getEndNode().getOffset().longValue();
224
225 int i = foundNode + 1;
228 boolean found = false;
229 inner1:for (; i < changedNodes.size(); i++) {
230
231 NodePosition tempNode = (NodePosition) (changedNodes.get(i));
232
233 if (tempNode.getNewStartNode() > startOffset) {
237 i = i - 1;
242 break inner1;
243 }
244
245 if (tempNode.getNewStartNode() == startOffset) {
247
249 int k = i;
251 for (;
252 k >= 0 && k < changedNodes.size() &&
253 endOffset >
254 ( (NodePosition) changedNodes.get(k)).getNewStartNode(); k++)
255 ;
256 long spacesToAdd = 0;
257 if (k - 1 == i && k - 1 >= 0) {
258 spacesToAdd = (tempNode.getOldEndNode() - tempNode.getNewEndNode());
259 }
260 else if (k - 1 < 0) {
261 spacesToAdd = 0;
262 }
263 else {
264 spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
265 getOldEndNode() -
266 ( (NodePosition) changedNodes.get(k - 1)).
267 getNewEndNode();
268 }
269
270 FeatureMap newFeatureMap = currentToken.getFeatures();
273 try {
274
275 original.add(new Long(startOffset +
276 (tempNode.getOldStartNode() -
277 tempNode.getNewStartNode())),
278 new Long(endOffset + spacesToAdd),
279 ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
282 newFeatureMap);
283
284 }
285 catch (InvalidOffsetException ioe) {
286 throw new ExecutionException("Offset Error");
287 }
288 found = true;
289 foundNode = i;
290 break inner1;
291 }
292 }
293
294 if (!found) {
295 long totalStartSpaces = 0;
296 long totalEndSpaces = 0;
297
298 i = (changedNodes.size() == i) ? i - 1 : i;
301
302 int k = i;
304 for (;
305 k > 0 && k < changedNodes.size() &&
306 endOffset > ( (NodePosition) changedNodes.get(k)).getNewStartNode();
307 k++)
308 ;
309 long spacesToAdd = 0;
310 if (k - 1 == i && k - 1 >= 0) {
311 spacesToAdd = ( ( (NodePosition) changedNodes.get(i)).getOldEndNode() -
312 ( (NodePosition) changedNodes.get(i)).getNewEndNode());
313 }
314 else if (k - 1 < 0) {
315 spacesToAdd = 0;
316 }
317 else {
318 spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
319 getOldEndNode() -
320 ( (NodePosition) changedNodes.get(k - 1)).getNewEndNode();
321 }
322
323 if (i >= 0) {
324 totalStartSpaces = ( (NodePosition) changedNodes.get(i)).
328 getOldEndNode() -
329 ( (NodePosition) changedNodes.get(i)).
330 getNewEndNode();
331 totalEndSpaces = spacesToAdd;
335 foundNode = i;
336 }
337
338 FeatureMap newFeatureMap = currentToken.getFeatures();
340 try {
341 original.add(new Long(startOffset + totalStartSpaces),
342 new Long(endOffset + totalEndSpaces),
343 ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
344 newFeatureMap);
345 }
346 catch (InvalidOffsetException ioe) {
347 throw new ExecutionException("Offset Error");
348 }
349
350 }
351 }
352
353 Factory.deleteResource(tempDoc);
355 fireProcessFinished();
356 }
357
358
362 public void setDocument(gate.Document doc) {
363 this.document = doc;
364 }
365
366
370 public gate.Document getDocument() {
371 return this.document;
372 }
373
374
378 public void setOutputAnnotationSetName(String annName) {
379 this.outputAnnotationSetName = annName;
380 }
381
382
386 public String getOutputAnnotationSetName() {
387 return this.outputAnnotationSetName;
388 }
389
390
394 public void setInputAnnotationSetName(String annName) {
395 this.inputAnnotationSetName = annName;
396 }
397
398
402 public String getInputAnnotationSetName() {
403 return this.inputAnnotationSetName;
404 }
405
406
412 public void setInputFeatureNames(java.util.List inputs) {
413 this.inputFeatureNames = inputs;
414 }
415
416
421 public java.util.List getInputFeatureNames() {
422 return this.inputFeatureNames;
423 }
424
425 public Gazetteer getGazetteerInst() {
426 return this.gazetteerInst;
427 }
428
429 public void setGazetteerInst(gate.creole.gazetteer.Gazetteer gazetteerInst) {
430 this.gazetteerInst = gazetteerInst;
431 }
432
433
441 public Iterator getTokenIterator(gate.Document doc, String annotationSetName) {
442 AnnotationSet inputAs = (annotationSetName == null) ? doc.getAnnotations() :
443 doc.getAnnotations(annotationSetName);
444 AnnotationSet tempSet = inputAs.get();
445 if(tempSet == null)
446 return null;
447
448 List tokens = new ArrayList(inputAs.get());
449
450 if(tokens == null)
451 return null;
452
453 Comparator offsetComparator = new OffsetComparator();
454 Collections.sort(tokens, offsetComparator);
455 Iterator tokenIter = tokens.iterator();
456 return tokenIter;
457 }
458
459 private gate.Document document;
461 private java.lang.String outputAnnotationSetName;
462 private java.lang.String inputAnnotationSetName;
463
464 private Gazetteer gazetteerInst;
466 private java.util.List inputFeatureNames;
467
468 private ArrayList changedNodes;
470 }