gate.creole.gazetteer.FlexibleGazetteer (Java2HTML)

1   /*
2    * FlexibleGazetteer.java
3    *
4    * Copyright (c) 2004, The University of Sheffield.
5    *
6    * This file is part of GATE (see http://gate.ac.uk/), and is free
7    * software, licenced under the GNU Library General Public License,
8    * Version 2, June1991.
9    *
10   * A copy of this licence is included in the distribution in the file
11   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12   *
13   * Niraj Aswani 02/2002
14   *
15   */
16  
17  package gate.creole.gazetteer;
18  
19  import java.util.*;
20  import gate.util.*;
21  import gate.*;
22  import gate.creole.*;
23  
24  /**
25   * <p>Title: Flexible Gazetteer </p>
26       * <p> The Flexible Gazetteer provides users with the flexibility to choose </p>
27   * <p> their own customized input and an external Gazetteer. For example, </p>
28   * <p> the user might want to replace words in the text with their base </p>
29   * <p> forms (which is an output of the Morphological Analyser) or to segment </p>
30   * <p> a Chinese text (using the Chinese Tokeniser) before running the </p>
31   * <p> Gazetteer on the Chinese text. </p>
32   *
33       * <p> The Flexible Gazetteer performs lookup over a document based on the  </p>
34   * <p> values of an arbitrary feature of an arbitrary annotation type, by </p>
35   * <p> using an externally provided gazetteer. It is important to use an </p>
36   * <p> external gazetteer as this allows the use of any type of gazetteer </p>
37   * <p> (e.g. an Ontological gazetteer). </p>
38   * @author niraj aswani
39   * @version 1.0
40   */
41  
42  public class FlexibleGazetteer
43      extends AbstractLanguageAnalyser
44      implements ProcessingResource {
45  
46    /**
47     * Constructor
48     */
49    public FlexibleGazetteer() {
50      changedNodes = new ArrayList();
51    }
52  
53    /** Does the actual loading and parsing of the lists. This method must be
54     * called before the gazetteer can be used
55     */
56    public Resource init() throws ResourceInstantiationException {
57      // check for parameters
58      if(gazetteerInst == null)
59        throw new ResourceInstantiationException("No Gazetteer Provided!");
60  
61      return this;
62    }
63  
64    /**
65     * This method runs the gazetteer. It assumes that all the needed parameters
66     * are set. If they are not, an exception will be fired.
67     */
68    public void execute() throws ExecutionException {
69      fireProgressChanged(0);
70      fireStatusChanged("Checking Document...");
71      if (document == null) {
72        throw new ExecutionException(
73            "No document to process!"
74            );
75      }
76  
77      fireStatusChanged("Creating temporary Document...");
78      StringBuffer newdocString = new StringBuffer(document.getContent().toString());
79      Document tempDoc = null;
80      boolean chineseSplit = false;
81  
82      if (inputFeatureNames == null || inputFeatureNames.size() == 0) {
83        inputFeatureNames = new ArrayList();
84      }
85  
86      Iterator tokenIter = getTokenIterator(document, inputAnnotationSetName);
87      long totalDeductedSpaces = 0;
88      fireStatusChanged("Replacing contents with the feature value...");
89  
90      outer:while (tokenIter != null && tokenIter.hasNext()) {
91        Annotation currentToken = (Annotation) tokenIter.next();
92  
93        // check if it is a chinesesplit
94        // if it is, replace no space character with a single space
95        if (currentToken.getType().equals(ANNIEConstants.
96                                          SPACE_TOKEN_ANNOTATION_TYPE) &&
97            ( (String) (currentToken.getFeatures().get(ANNIEConstants.
98            TOKEN_KIND_FEATURE_NAME))).equals("ChineseSplit")) {
99  
100         // for chinese split startnode and end node are same
101         long startOffset = currentToken.getStartNode().getOffset().
102                            longValue();
103 
104         // because we are adding a space in place of chinesesplit
105         // the endoffset will become newStartOffset + 1
106         long newStartOffset = startOffset - totalDeductedSpaces;
107         long newEndOffset = newStartOffset + 1;
108         NodePosition newNode = new NodePosition(startOffset, startOffset,
109                                                 newStartOffset, newEndOffset,
110                                                 totalDeductedSpaces);
111         chineseSplit = true;
112 
113         // here is the addition of space in the document
114         totalDeductedSpaces--;
115         changedNodes.add(newNode);
116         newdocString = newdocString.insert( (int) newStartOffset, ' ');
117         continue outer;
118       }
119 
120       // search in the provided inputFeaturesNames
121       // if the current token has a feature value that user
122       // wants to paste on and replace the original string of the token
123       inner:for (int i = 0; i < inputFeatureNames.size(); i++) {
124         String[] keyVal = ( (String) (inputFeatureNames.get(i))).split("[.]");
125 
126         if (keyVal.length == 2) {
127           // val is the feature name
128           // key is the annotationName
129           if (currentToken.getType().equals(keyVal[0])) {
130             FeatureMap features = currentToken.getFeatures();
131             String newTokenValue = (String) (features.get(keyVal[1]));
132 
133             // what if provided feature doesnot exist
134             if (newTokenValue == null) {
135               continue;
136 
137             }
138             else {
139               // feature value found so we need to replace it
140               // find the start and end offsets for this token
141               long startOffset = currentToken.getStartNode().getOffset().
142                                  longValue();
143               long endOffset = currentToken.getEndNode().getOffset().
144                                longValue();
145 
146               // what is the actual string
147               String actualString = (String) (features.get(ANNIEConstants.
148                   TOKEN_STRING_FEATURE_NAME));
149 
150               // if the feature value and the actual string both are same
151               // we don't need to replace it
152               if (actualString.equals(newTokenValue)) {
153                 // there is no need to change anything for this
154                 break inner;
155               }
156 
157               // let us find the difference between the lengths of the
158               // actual string and the newTokenValue
159               long lengthDifference = actualString.length() -
160                                       newTokenValue.length();
161 
162               // so lets find the new startOffset and endOffset
163               long newStartOffset = startOffset - totalDeductedSpaces;
164               long newEndOffset = newStartOffset + newTokenValue.length();
165 
166               // and make the entry for this
167               NodePosition newNode = new NodePosition(startOffset,
168                   endOffset,
169                   newStartOffset, newEndOffset, totalDeductedSpaces);
170               changedNodes.add(newNode);
171               // how many spaces have been added or removed till the current
172               // position of the token
173               totalDeductedSpaces += lengthDifference;
174 
175               // and finally replace the actual string in the document
176               // with the new document
177               newdocString = newdocString.replace( (int) newStartOffset,
178                                                   (int) newStartOffset +
179                                                   actualString.length(),
180                                                   newTokenValue);
181               break inner;
182             }
183           }
184         }
185       }
186     }
187 
188     fireStatusChanged("New Document to be processed with Gazetteer...");
189     try {
190       FeatureMap params = Factory.newFeatureMap();
191       params.put("stringContent", newdocString.toString());
192       FeatureMap features = Factory.newFeatureMap();
193       Gate.setHiddenAttribute(features, true);
194       tempDoc = (Document) Factory.createResource("gate.corpora.DocumentImpl",
195                                                   params, features);
196     }
197     catch (ResourceInstantiationException rie) {
198       throw new ExecutionException("Temporary document cannot be created");
199     }
200 
201     // lets create the gazetteer based on the provided gazetteer name
202     FeatureMap params = Factory.newFeatureMap();
203     gazetteerInst.setDocument(tempDoc);
204     gazetteerInst.setAnnotationSetName(this.outputAnnotationSetName);
205 
206     fireStatusChanged("Executing Gazetteer...");
207     gazetteerInst.execute();
208 
209     // now the tempDoc has been looked up, we need to shift the tokens from
210     // this temp document to the original document
211     fireStatusChanged("Transfering new tags to the original one...");
212     Iterator tokensIter = getTokenIterator(tempDoc, outputAnnotationSetName);
213     AnnotationSet original = (outputAnnotationSetName == null) ?
214                              document.getAnnotations() :
215                              document.getAnnotations(outputAnnotationSetName);
216     long totalSpaceAdded = 0;
217     long difference = 0;
218 
219     int foundNode = -1;
220     while (tokensIter != null && tokensIter.hasNext()) {
221       Annotation currentToken = (Annotation) (tokensIter.next());
222       long startOffset = currentToken.getStartNode().getOffset().longValue();
223       long endOffset = currentToken.getEndNode().getOffset().longValue();
224 
225       // search through the changedNodes and if it is found we will have to
226       // find the new offsets
227       int i = foundNode + 1;
228       boolean found = false;
229       inner1:for (; i < changedNodes.size(); i++) {
230 
231         NodePosition tempNode = (NodePosition) (changedNodes.get(i));
232 
233         // all the nodes are in the sorted order based on there offsets
234         // so if we reach beyond the position of the current text
235         // under consideration, simply terminate the loop
236         if (tempNode.getNewStartNode() > startOffset) {
237           // so we lets point to the node whose startOffset
238           // is less than the startOffset of the current node
239           // this will allow us to find out how many
240           // extra spaces were added or removed before the current token
241           i = i - 1;
242           break inner1;
243         }
244 
245         // how do we know if we want to change the offset
246         if (tempNode.getNewStartNode() == startOffset) {
247           // yes it is available
248 
249           // lets find the end node
250           int k = i;
251           for (;
252                k >= 0 && k < changedNodes.size() &&
253                endOffset >
254                ( (NodePosition) changedNodes.get(k)).getNewStartNode(); k++)
255             ;
256           long spacesToAdd = 0;
257           if (k - 1 == i && k - 1 >= 0) {
258             spacesToAdd = (tempNode.getOldEndNode() - tempNode.getNewEndNode());
259           }
260           else if (k - 1 < 0) {
261             spacesToAdd = 0;
262           }
263           else {
264             spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
265                           getOldEndNode() -
266                           ( (NodePosition) changedNodes.get(k - 1)).
267                           getNewEndNode();
268           }
269 
270           // and how many to be added before the endnode
271           // as any look up notation can be for the text with one or more tokens
272           FeatureMap newFeatureMap = currentToken.getFeatures();
273           try {
274 
275             original.add(new Long(startOffset +
276                                   (tempNode.getOldStartNode() -
277                                    tempNode.getNewStartNode())),
278                          new Long(endOffset + spacesToAdd),
279                          //new Long(endOffset + (tempNode.getOldEndNode()
280                          //          - tempNode.getNewEndNode())),
281                          ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
282                          newFeatureMap);
283 
284           }
285           catch (InvalidOffsetException ioe) {
286             throw new ExecutionException("Offset Error");
287           }
288           found = true;
289           foundNode = i;
290           break inner1;
291         }
292       }
293 
294       if (!found) {
295         long totalStartSpaces = 0;
296         long totalEndSpaces = 0;
297 
298         // check if we have reached at the end of the changedNodes
299         // if yes we need to find the last node
300         i = (changedNodes.size() == i) ? i - 1 : i;
301 
302         // lets find the end node
303         int k = i;
304         for (;
305              k > 0 && k < changedNodes.size() &&
306              endOffset > ( (NodePosition) changedNodes.get(k)).getNewStartNode();
307              k++)
308           ;
309         long spacesToAdd = 0;
310         if (k - 1 == i && k - 1 >= 0) {
311           spacesToAdd = ( ( (NodePosition) changedNodes.get(i)).getOldEndNode() -
312                          ( (NodePosition) changedNodes.get(i)).getNewEndNode());
313         }
314         else if (k - 1 < 0) {
315           spacesToAdd = 0;
316         }
317         else {
318           spacesToAdd = ( (NodePosition) changedNodes.get(k - 1)).
319                         getOldEndNode() -
320                         ( (NodePosition) changedNodes.get(k - 1)).getNewEndNode();
321         }
322 
323         if (i >= 0) {
324           //totalStartSpaces = ((NodePosition)
325           // changedNodes.get(i)).getOldStartNode()
326           // - ((NodePosition) changedNodes.get(i)).getNewStartNode();
327           totalStartSpaces = ( (NodePosition) changedNodes.get(i)).
328                              getOldEndNode() -
329                              ( (NodePosition) changedNodes.get(i)).
330                              getNewEndNode();
331           //totalEndSpaces = ((NodePosition)
332           // changedNodes.get(i)).getOldEndNode() -
333           // ((NodePosition) changedNodes.get(i)).getNewEndNode();
334           totalEndSpaces = spacesToAdd;
335           foundNode = i;
336         }
337 
338         // no it is not available
339         FeatureMap newFeatureMap = currentToken.getFeatures();
340         try {
341           original.add(new Long(startOffset + totalStartSpaces),
342                        new Long(endOffset + totalEndSpaces),
343                        ANNIEConstants.LOOKUP_ANNOTATION_TYPE,
344                        newFeatureMap);
345         }
346         catch (InvalidOffsetException ioe) {
347           throw new ExecutionException("Offset Error");
348         }
349 
350       }
351     }
352 
353     // now remove the newDoc
354     Factory.deleteResource(tempDoc);
355     fireProcessFinished();
356   }
357 
358   /**
359    * Sets the document to work on
360    * @param doc
361    */
362   public void setDocument(gate.Document doc) {
363     this.document = doc;
364   }
365 
366   /**
367    * Returns the document set up by user to work on
368    * @return a {@link Document}
369    */
370   public gate.Document getDocument() {
371     return this.document;
372   }
373 
374   /**
375    * sets the outputAnnotationSetName
376    * @param annName
377    */
378   public void setOutputAnnotationSetName(String annName) {
379     this.outputAnnotationSetName = annName;
380   }
381 
382   /**
383    * Returns the outputAnnotationSetName
384    * @return a {@link String} value.
385    */
386   public String getOutputAnnotationSetName() {
387     return this.outputAnnotationSetName;
388   }
389 
390   /**
391    * sets the inputAnnotationSetName
392    * @param annName
393    */
394   public void setInputAnnotationSetName(String annName) {
395     this.inputAnnotationSetName = annName;
396   }
397 
398   /**
399    * Returns the inputAnnotationSetName
400    * @return a {@link String} value.
401    */
402   public String getInputAnnotationSetName() {
403     return this.inputAnnotationSetName;
404   }
405 
406   /**
407    * Feature names for example: Token.string, Token.root etc... Values of these
408        * features should be used to replace the actual string of these features. This
409    * method allows a user to set the name of such features
410    * @param inputs
411    */
412   public void setInputFeatureNames(java.util.List inputs) {
413     this.inputFeatureNames = inputs;
414   }
415 
416   /**
417    * Returns the feature names that are provided by the user to use their values
418    * to replace their actual strings in the document
419    * @return a {@link List} value.
420    */
421   public java.util.List getInputFeatureNames() {
422     return this.inputFeatureNames;
423   }
424 
425   public Gazetteer getGazetteerInst() {
426     return this.gazetteerInst;
427   }
428 
429   public void setGazetteerInst(gate.creole.gazetteer.Gazetteer gazetteerInst) {
430     this.gazetteerInst = gazetteerInst;
431   }
432 
433   /**
434    * This method takes the document and the annotationSetName and then creates
435    * a interator for the annotations available in the document under the
436    * provided annotationSetName
437    * @param doc
438    * @param annotationSetName
439    * @return an {@link Iterator}
440    */
441   public Iterator getTokenIterator(gate.Document doc, String annotationSetName) {
442     AnnotationSet inputAs = (annotationSetName == null) ? doc.getAnnotations() :
443                             doc.getAnnotations(annotationSetName);
444     AnnotationSet tempSet = inputAs.get();
445     if(tempSet == null)
446       return null;
447 
448     List tokens = new ArrayList(inputAs.get());
449 
450     if(tokens == null)
451       return null;
452 
453     Comparator offsetComparator = new OffsetComparator();
454     Collections.sort(tokens, offsetComparator);
455     Iterator tokenIter = tokens.iterator();
456     return tokenIter;
457   }
458 
459   // Gazetteer Runtime parameters
460   private gate.Document document;
461   private java.lang.String outputAnnotationSetName;
462   private java.lang.String inputAnnotationSetName;
463 
464   // Flexible Gazetteer parameter
465   private Gazetteer gazetteerInst;
466   private java.util.List inputFeatureNames;
467 
468   // parameters required within the program
469   private ArrayList changedNodes;
470 }