1   /*
2    * DefaultGazeteer.java
3    *
4    * Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    * This file is part of GATE (see http://gate.ac.uk/), and is free
7    * software, licenced under the GNU Library General Public License,
8    * Version 2, June1991.
9    *
10   * A copy of this licence is included in the distribution in the file
11   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
12   *
13   * Valentin Tablan, 03/07/2000
14   * borislav popov 24/03/2002
15   *
16   * $Id: DefaultGazetteer.java,v 1.51 2006/03/22 11:21:38 valyt Exp $
17   */
18  package gate.creole.gazetteer;
19  
20  import java.util.*;
21  
22  import gate.*;
23  import gate.creole.*;
24  import gate.util.*;
25  
26  /** This component is responsible for doing lists lookup. The implementaion is
27   * based on finite state machines.
28   * The phrases to be recognised should be listed in a set of files, one for
29   * each type of occurences.
30   * The gazeteer is build with the information from a file that contains the set
31   * of lists (which are files as well) and the associated type for each list.
32   * The file defining the set of lists should have the following syntax:
33   * each list definition should be written on its own line and should contain:
34   * <ol>
35   * <li>the file name (required) </li>
36   * <li>the major type (required) </li>
37   * <li>the minor type (optional)</li>
38   * <li>the language(s) (optional) </li>
39   * </ol>
40   * The elements of each definition are separated by &quot;:&quot;.
41   * The following is an example of a valid definition: <br>
42   * <code>personmale.lst:person:male:english</code>
43   * Each list file named in the lists definition file is just a list containing
44   * one entry per line.
45   * When this gazetter will be run over some input text (a Gate document) it
46   * will generate annotations of type Lookup having the attributes specified in
47   * the definition file.
48   */
49  public class DefaultGazetteer extends AbstractGazetteer {
50  
51    /** Debug flag
52     */
53    private static final boolean DEBUG = false;
54  
55    public static final String
56      DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";
57  
58    public static final String
59      DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
60  
61    public static final String
62      DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";
63  
64    public static final String
65      DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";
66  
67    public static final String
68      DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
69  
70  
71    /** a map of nodes vs gaz lists */
72    protected Map listsByNode;
73  
74    /** 
75     * Build a gazetter using the default lists from the gate resources
76     */
77    public DefaultGazetteer(){
78    }
79  
80    /** Does the actual loading and parsing of the lists. This method must be
81     * called before the gazetteer can be used
82     */
83    public Resource init()throws ResourceInstantiationException{
84      fsmStates = new HashSet();
85      initialState = new FSMState(this);
86      if(listsURL == null){
87        throw new ResourceInstantiationException (
88              "No URL provided for gazetteer creation!");
89      }
90      definition = new LinearDefinition();
91      definition.setURL(listsURL);
92      definition.load();
93      int linesCnt = definition.size();
94      listsByNode = definition.loadLists();
95      Iterator inodes = definition.iterator();
96  
97      int nodeIdx = 0;
98      LinearNode node;
99      while (inodes.hasNext()) {
100       node = (LinearNode) inodes.next();
101       fireStatusChanged("Reading " + node.toString());
102       fireProgressChanged(++nodeIdx * 100 / linesCnt);
103       readList(node,true);
104     } // while iline
105     fireProcessFinished();
106     return this;
107   }
108 
109 
110   /** Reads one lists (one file) of phrases
111    *
112    * @param node the node
113    * @param add if <b>true</b> will add the phrases found in the list to the ones
114    *     recognised by this gazetter, if <b>false</b> the phrases found in the
115    *     list will be removed from the list of phrases recognised by this
116    *     gazetteer.
117    */
118    protected void readList(LinearNode node, boolean add) 
119        throws ResourceInstantiationException{
120     String listName, majorType, minorType, languages;
121     if ( null == node ) {
122       throw new ResourceInstantiationException(" LinearNode node is null ");
123     }
124 
125     listName = node.getList();
126     majorType = node.getMajorType();
127     minorType = node.getMinorType();
128     languages = node.getLanguage();
129     GazetteerList gazList = (GazetteerList)listsByNode.get(node);
130     if (null == gazList) {
131       throw new ResourceInstantiationException("gazetteer list not found by node");
132     }
133 
134     Iterator iline = gazList.iterator();
135 
136     Lookup lookup = new Lookup(listName,majorType, minorType, languages);
137     lookup.list = node.getList();
138     if ( null != mappingDefinition){
139       MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
140       if (null!=mnode){
141         lookup.oClass = mnode.getClassID();
142         lookup.ontology = mnode.getOntologyID();
143       }
144     }//if mapping def
145 
146     String line;
147     while(iline.hasNext()){
148       line = iline.next().toString();
149       if(add)addLookup(line, lookup);
150       else removeLookup(line, lookup);
151     }
152   } // void readList(String listDesc)
153 
154   /** Adds one phrase to the list of phrases recognised by this gazetteer
155    *
156    * @param text the phrase to be added
157    * @param lookup the description of the annotation to be added when this
158    *     phrase is recognised
159    */
160 // >>> DAM, was
161 /*
162   public void addLookup(String text, Lookup lookup) {
163     Character currentChar;
164     FSMState currentState = initialState;
165     FSMState nextState;
166     Lookup oldLookup;
167     boolean isSpace;
168 
169     for(int i = 0; i< text.length(); i++) {
170       isSpace = Character.isWhitespace(text.charAt(i));
171       if(isSpace) currentChar = new Character(' ');
172       else currentChar = (caseSensitive.booleanValue()) ?
173                           new Character(text.charAt(i)) :
174                           new Character(Character.toUpperCase(text.charAt(i))) ;
175       nextState = currentState.next(currentChar);
176       if(nextState == null){
177         nextState = new FSMState(this);
178         currentState.put(currentChar, nextState);
179         if(isSpace) nextState.put(new Character(' '),nextState);
180       }
181       currentState = nextState;
182     } //for(int i = 0; i< text.length(); i++)
183 
184     currentState.addLookup(lookup);
185     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
186 
187   } // addLookup
188 */
189 // >>> DAM: TransArray optimization
190   public void addLookup(String text, Lookup lookup) {
191     char currentChar;
192     FSMState currentState = initialState;
193     FSMState nextState;
194     Lookup oldLookup;
195     boolean isSpace;
196 
197     for(int i = 0; i< text.length(); i++) {
198         currentChar = text.charAt(i);
199         isSpace = Character.isWhitespace(currentChar);
200         if(isSpace) currentChar = ' ';
201         else currentChar = (caseSensitive.booleanValue()) ?
202                           currentChar :
203                           Character.toUpperCase(currentChar) ;
204       nextState = currentState.next(currentChar);
205       if(nextState == null){
206         nextState = new FSMState(this);
207         currentState.put(currentChar, nextState);
208         if(isSpace) nextState.put(' ',nextState);
209       }
210       currentState = nextState;
211     } //for(int i = 0; i< text.length(); i++)
212 
213     currentState.addLookup(lookup);
214     //Out.println(text + "|" + lookup.majorType + "|" + lookup.minorType);
215 
216   } // addLookup
217 // >>> DAM, end
218 
219   /** Removes one phrase to the list of phrases recognised by this gazetteer
220    *
221    * @param text the phrase to be removed
222    * @param lookup the description of the annotation associated to this phrase
223    */
224 // >>> DAM, was
225 /*
226   public void removeLookup(String text, Lookup lookup) {
227     Character currentChar;
228     FSMState currentState = initialState;
229     FSMState nextState;
230     Lookup oldLookup;
231     boolean isSpace;
232 
233     for(int i = 0; i< text.length(); i++) {
234       isSpace = Character.isWhitespace(text.charAt(i));
235       if(isSpace) currentChar = new Character(' ');
236       else currentChar = new Character(text.charAt(i));
237       nextState = currentState.next(currentChar);
238       if(nextState == null) return;//nothing to remove
239       currentState = nextState;
240     } //for(int i = 0; i< text.length(); i++)
241     currentState.removeLookup(lookup);
242   } // removeLookup
243 */
244 // >>> DAM: TransArray optimization
245   public void removeLookup(String text, Lookup lookup) {
246     char currentChar;
247     FSMState currentState = initialState;
248     FSMState nextState;
249     Lookup oldLookup;
250 
251     for(int i = 0; i< text.length(); i++) {
252         currentChar = text.charAt(i);
253         if(Character.isWhitespace(currentChar)) currentChar = ' ';
254         nextState = currentState.next(currentChar);
255         if(nextState == null) return;//nothing to remove
256         currentState = nextState;
257     } //for(int i = 0; i< text.length(); i++)
258     currentState.removeLookup(lookup);
259   } // removeLookup
260 // >>> DAM, end
261 
262   /** Returns a string representation of the deterministic FSM graph using
263    * GML.
264    */
265   public String getFSMgml() {
266     String res = "graph[ \ndirected 1\n";
267     ///String nodes = "", edges = "";
268     StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
269                 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
270     Iterator fsmStatesIter = fsmStates.iterator();
271     while (fsmStatesIter.hasNext()){
272       FSMState currentState = (FSMState)fsmStatesIter.next();
273       int stateIndex = currentState.getIndex();
274       /*nodes += "node[ id " + stateIndex +
275                " label \"" + stateIndex;
276       */
277       nodes.append("node[ id ");
278       nodes.append(stateIndex);
279       nodes.append(" label \"");
280       nodes.append(stateIndex);
281 
282              if(currentState.isFinal()){
283               ///nodes += ",F\\n" + currentState.getLookupSet();
284               nodes.append(",F\\n");
285               nodes.append(currentState.getLookupSet());
286              }
287              ///nodes +=  "\"  ]\n";
288              nodes.append("\"  ]\n");
289       //edges += currentState.getEdgesGML();
290       edges.append(currentState.getEdgesGML());
291     }
292     res += nodes.toString() + edges.toString() + "]\n";
293     return res;
294   } // getFSMgml
295 
296 
297   /**
298    * Tests whether a character is internal to a word (i.e. if it's a letter or
299    * a combining mark (spacing or not)).
300    * @param ch the character to be tested
301    * @return a boolean value
302    */
303   public static boolean isWordInternal(char ch){
304     return Character.isLetter(ch) ||
305            Character.getType(ch) == Character.COMBINING_SPACING_MARK ||
306            Character.getType(ch) == Character.NON_SPACING_MARK;
307   }
308 
309   /**
310    * This method runs the gazetteer. It assumes that all the needed parameters
311    * are set. If they are not, an exception will be fired.
312    */
313   public void execute() throws ExecutionException{
314     interrupted = false;
315     AnnotationSet annotationSet;
316     //check the input
317     if(document == null) {
318       throw new ExecutionException(
319         "No document to process!"
320       );
321     }
322 
323     if(annotationSetName == null ||
324        annotationSetName.equals("")) annotationSet = document.getAnnotations();
325     else annotationSet = document.getAnnotations(annotationSetName);
326 
327     fireStatusChanged("Doing lookup in " +
328                            document.getName() + "...");
329     String content = document.getContent().toString();
330     int length = content.length();
331 // >>> DAM, was
332 /*
333     Character currentChar;
334 */
335 // >>> DAM: TransArray optimization
336     char currentChar;
337 // >>> DAM, end
338     FSMState currentState = initialState;
339     FSMState nextState;
340     FSMState lastMatchingState = null;
341     int matchedRegionEnd = 0;
342     int matchedRegionStart = 0;
343     int charIdx = 0;
344     int oldCharIdx = 0;
345     FeatureMap fm;
346     Lookup currentLookup;
347 
348 // >>> DAM, was
349 /*
350     while(charIdx < length) {
351       if(Character.isWhitespace(content.charAt(charIdx)))
352         currentChar = new Character(' ');
353       else currentChar = (caseSensitive.booleanValue()) ?
354                          new Character(content.charAt(charIdx)) :
355                          new Character(Character.toUpperCase(
356                                        content.charAt(charIdx)));
357 */
358 // >>> DAM: TransArray optimization
359     while(charIdx < length) {
360       currentChar = content.charAt(charIdx);
361       if(Character.isWhitespace(currentChar)) currentChar = ' ';
362       else currentChar = caseSensitive.booleanValue() ?
363                           currentChar :
364                           Character.toUpperCase(currentChar);
365 // >>> DAM, end
366       nextState = currentState.next(currentChar);
367       if(nextState == null) {
368         //the matching stopped
369 
370         //if we had a successful match then act on it;
371         if(lastMatchingState != null){
372           //let's add the new annotation(s)
373           Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
374 
375           while(lookupIter.hasNext()) {
376             currentLookup = (Lookup)lookupIter.next();
377             fm = Factory.newFeatureMap();
378             fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
379             if (null!= currentLookup.oClass && null!=currentLookup.ontology){
380               fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
381               fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
382             }
383             if(null != currentLookup.minorType) {
384               fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
385               if(null != currentLookup.languages)
386                 fm.put("language", currentLookup.languages);
387             }
388             try {
389               annotationSet.add(new Long(matchedRegionStart),
390                               new Long(matchedRegionEnd + 1),
391                               LOOKUP_ANNOTATION_TYPE,
392                               fm);
393             } catch(InvalidOffsetException ioe) {
394               throw new LuckyException(ioe.toString());
395             }
396           }//while(lookupIter.hasNext())
397           lastMatchingState = null;
398         }
399 
400         //reset the FSM
401         charIdx = matchedRegionStart + 1;
402         matchedRegionStart = charIdx;
403         currentState = initialState;
404 
405       } else{//go on with the matching
406         currentState = nextState;
407         //if we have a successful state then store it
408         if(currentState.isFinal() &&
409            (
410             (!wholeWordsOnly.booleanValue())
411              ||
412             ((matchedRegionStart == 0 ||
413              !isWordInternal(content.charAt(matchedRegionStart - 1)))
414              &&
415              (charIdx + 1 >= content.length()   ||
416              !isWordInternal(content.charAt(charIdx + 1)))
417             )
418            )
419           ){
420           matchedRegionEnd = charIdx;
421           lastMatchingState = currentState;
422         }
423         charIdx ++;
424         if(charIdx == content.length()){
425           //we can't go on, use the last matching state and restart matching
426           //from the next char
427           if(lastMatchingState != null){
428             //let's add the new annotation(s)
429             Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
430 
431             while(lookupIter.hasNext()) {
432               currentLookup = (Lookup)lookupIter.next();
433               fm = Factory.newFeatureMap();
434               fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
435               if (null!= currentLookup.oClass && null!=currentLookup.ontology){
436                 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
437                 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
438               }
439               if(null != currentLookup.minorType) {
440                 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
441                 if(null != currentLookup.languages)
442                   fm.put("language", currentLookup.languages);
443               }
444               try {
445                 annotationSet.add(new Long(matchedRegionStart),
446                                 new Long(matchedRegionEnd + 1),
447                                 LOOKUP_ANNOTATION_TYPE,
448                                 fm);
449               } catch(InvalidOffsetException ioe) {
450                 throw new LuckyException(ioe.toString());
451               }
452             }//while(lookupIter.hasNext())
453             lastMatchingState = null;
454           }
455 
456           //reset the FSM
457           charIdx = matchedRegionStart + 1;
458           matchedRegionStart = charIdx;
459           currentState = initialState;
460         }
461       }
462       if(charIdx - oldCharIdx > 256) {
463         fireProgressChanged((100 * charIdx )/ length );
464         oldCharIdx = charIdx;
465         if(isInterrupted()) throw new ExecutionInterruptedException(
466             "The execution of the " + getName() +
467             " gazetteer has been abruptly interrupted!");
468       }
469     } // while(charIdx < length)
470 
471     if(lastMatchingState != null) {
472       Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
473       while(lookupIter.hasNext()) {
474         currentLookup = (Lookup)lookupIter.next();
475         fm = Factory.newFeatureMap();
476         fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
477         if (null!= currentLookup.oClass && null!=currentLookup.ontology){
478           fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
479           fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
480         }
481 
482         if(null != currentLookup.minorType)
483           fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
484         try{
485           annotationSet.add(new Long(matchedRegionStart),
486                           new Long(matchedRegionEnd + 1),
487                           LOOKUP_ANNOTATION_TYPE,
488                           fm);
489         } catch(InvalidOffsetException ioe) {
490           throw new GateRuntimeException(ioe.toString());
491         }
492       }//while(lookupIter.hasNext())
493     }
494     fireProcessFinished();
495     fireStatusChanged("Lookup complete!");
496   } // execute
497 
498 
499   /** The initial state of the FSM that backs this gazetteer
500    */
501   protected FSMState initialState;
502 
503   /** A set containing all the states of the FSM backing the gazetteer
504    */
505   protected Set fsmStates;
506 
507   /**lookup <br>
508    * @param singleItem a single string to be looked up by the gazetteer
509    * @return set of the Lookups associated with the parameter*/
510   public Set lookup(String singleItem) {
511     char currentChar;
512     Set set = new HashSet();
513     FSMState currentState = initialState;
514     FSMState nextState;
515 
516     for(int i = 0; i< singleItem.length(); i++) {
517         currentChar = singleItem.charAt(i);
518         if(Character.isWhitespace(currentChar)) currentChar = ' ';
519         nextState = currentState.next(currentChar);
520         if(nextState == null) {
521           return set;
522         }
523         currentState = nextState;
524     } //for(int i = 0; i< text.length(); i++)
525     set = currentState.getLookupSet();
526     return set;
527   }
528 
529   public boolean remove(String singleItem) {
530     char currentChar;
531     FSMState currentState = initialState;
532     FSMState nextState;
533     Lookup oldLookup;
534 
535     for(int i = 0; i< singleItem.length(); i++) {
536         currentChar = singleItem.charAt(i);
537         if(Character.isWhitespace(currentChar)) currentChar = ' ';
538         nextState = currentState.next(currentChar);
539         if(nextState == null) {
540           return false;
541         }//nothing to remove
542         currentState = nextState;
543     } //for(int i = 0; i< text.length(); i++)
544     currentState.lookupSet = new HashSet();
545     return true;
546   }
547 
548   public boolean add(String singleItem, Lookup lookup) {
549     addLookup(singleItem,lookup);
550     return true;
551   }
552 
553 //>>> DAM: TransArray optimization, new CharMap implementation
554   public static interface Iter
555   {
556       public boolean hasNext();
557       public char next();
558   } // iter class
559 
560   /**
561    * class implementing the map using binary serach by char as key
562    * to retrive the coresponding object.
563    */
564   public static class CharMap
565   {
566       char[] itemsKeys = null;
567       Object[] itemsObjs = null;
568 
569       /**
570        * resize the containers by one leavaing empty elemant at position 'index'
571        */
572       void resize(int index)
573       {
574           int newsz = itemsKeys.length + 1;
575           char[] tempKeys = new char[newsz];
576           Object[] tempObjs = new Object[newsz];
577           int i;
578           for (i= 0; i < index; i++)
579           {
580               tempKeys[i] = itemsKeys[i];
581               tempObjs[i] = itemsObjs[i];
582           }
583           for (i= index+1; i < newsz; i++)
584           {
585               tempKeys[i] = itemsKeys[i-1];
586               tempObjs[i] = itemsObjs[i-1];
587           }
588 
589           itemsKeys = tempKeys;
590           itemsObjs = tempObjs;
591       } // resize
592 
593   /**
594    * get the object from the map using the char key
595    */
596       Object get(char key)
597       {
598           if (itemsKeys == null) return null;
599           int index = Arrays.binarySearch(itemsKeys, key);
600           if (index<0)
601               return null;
602           return itemsObjs[index];
603       }
604   /**
605    * put the object into the char map using the chat as the key
606    */
607       Object put(char key, Object value)
608       {
609           if (itemsKeys == null)
610           {
611               itemsKeys = new char[1];
612               itemsKeys[0] = key;
613               itemsObjs = new Object[1];
614               itemsObjs[0] = value;
615               return value;
616           }// if first time
617           int index = Arrays.binarySearch(itemsKeys, key);
618           if (index<0)
619           {
620               index = ~index;
621               resize(index);
622               itemsKeys[index] = key;
623               itemsObjs[index] = value;
624           }
625           return itemsObjs[index];
626       } // put
627   /**
628    * the keys itereator
629    * /
630       public Iter iter()
631       {
632           return new Iter()
633           {
634               int counter = 0;
635               public boolean hasNext() {return counter < itemsKeys.length;}
636               public char next() { return itemsKeys[counter];}
637           };
638       } // iter()
639    */
640 
641   }// class CharMap
642   
643 } // DefaultGazetteer