gate.creole.orthomatcher.OrthoMatcher (Java2HTML)

1   /*
2    *  OrthoMatcher.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/August/2001
12   *
13   *  $Id: OrthoMatcher.java,v 1.50 2005/02/17 10:43:50 julien Exp $
14   */
15  
16  
17  package gate.creole.orthomatcher;
18  
19  import java.io.*;
20  import java.net.URL;
21  import java.util.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.util.*;
26  
27  public class OrthoMatcher extends AbstractLanguageAnalyser
28                            implements ANNIEConstants{
29  
  /** Parameter-name constant; its value is {@code "document"}. */
  public static final String
    OM_DOCUMENT_PARAMETER_NAME = "document";

  /** Parameter-name constant; its value is {@code "annotationSetName"}. */
  public static final String
    OM_ANN_SET_PARAMETER_NAME = "annotationSetName";

  /** Parameter-name constant; its value is {@code "caseSensitive"}. */
  public static final String
    OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";

  /** Parameter-name constant; its value is {@code "annotationTypes"}. */
  public static final String
    OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";

  /** Parameter-name constant; its value is {@code "organizationType"}. */
  public static final String
    OM_ORG_TYPE_PARAMETER_NAME = "organizationType";

  /** Parameter-name constant; its value is {@code "personType"}. */
  public static final String
    OM_PERSON_TYPE_PARAMETER_NAME = "personType";

  /** Parameter-name constant; its value is {@code "extLists"}. */
  public static final String
    OM_EXT_LISTS_PARAMETER_NAME = "extLists";

  // Names of the lookup lists. NOTE(review): presumably these are the list
  // names that appear in the definition file read by init(); the code that
  // consumes them is outside this excerpt - confirm.
  protected static final String CDGLISTNAME = "cdg";
  protected static final String ALIASLISTNAME = "alias";
  protected static final String ARTLISTNAME = "def_art";
  protected static final String PREPLISTNAME = "prepos";
  protected static final String CONNECTORLISTNAME = "connector";
  protected static final String SPURLISTNAME = "spur_match";

  // NOTE(review): presumably the token "kind" value for punctuation tokens;
  // not used in the visible part of the file - confirm.
  protected static final String PUNCTUATION_VALUE = "punctuation";
  /** Leading token that stripCDG compares against, ignoring case. */
  protected static final String THE_VALUE = "The";
60  
61  
  /**
   * The name of the annotation set to process; when null or empty,
   * execute() uses the document's default annotation set.
   */
  protected String annotationSetName;

  /**
   * The annotation types whose instances are matched against each other
   * by matchNameAnnotations(); populated with defaults by the constructor.
   */
  protected List annotationTypes = new ArrayList(10);

  /** The annotation type treated as an organization. */
  protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;

  /** The annotation type treated as a person. */
  protected String personType = PERSON_ANNOTATION_TYPE;

  /**
   * The type of the not-yet-classified annotations that matchUnknown()
   * tries to match against the already processed annotations.
   */
  protected String unknownType = "Unknown";

  /**
   * Internal or external lists; when false, execute() calls buildTables()
   * on the document's annotations before matching.
   */
  protected boolean extLists = true;

  /** Whether execute() also runs matchUnknown() after the name matching. */
  protected boolean matchingUnknowns = true;

  /** This is an internal variable to indicate whether
   *  we matched using a rule that requires that
   *  the newly matched annotation matches all the others
   *  This is needed, because organizations can share
   *  first/last tokens like News and be different
   */
  private   boolean allMatchingNeeded = false;

  /**
   * Orthomatching is not case-sensitive by default: when false, annotation
   * strings are lower-cased before they are compared.
   */
  protected boolean caseSensitive = false;

  /**
   * Reusable feature map for queries; containTitle() clears it and uses it
   * to select "Lookup" annotations whose majorType is "title".
   */
  protected FeatureMap queryFM = Factory.newFeatureMap();

//  protected ExecutionException executionException;

  // name lookup tables (used for namematch)
  // they get a bigger default size, because rehashing is expensive
  protected HashMap alias = new HashMap(100);
  protected HashSet cdg = new HashSet(50);
  protected HashMap spur_match = new HashMap(100);
  protected HashMap def_art = new HashMap(20);
  protected HashMap connector = new HashMap(20);
  protected HashMap prepos = new HashMap(30);


  // the annotation set being processed by the current run of execute()
  protected AnnotationSet nameAllAnnots = null;
  // maps the ids of the annotations processed so far to their strings
  protected HashMap processedAnnots = new HashMap(150);
  // maps the ids of unknown annotations to the type they should be given
  protected HashMap annots2Remove = new HashMap(75);
  // the match lists created in this run; stored on the document by execute()
  protected List matchesDocFeature = new ArrayList();
  //maps annotation ids to array lists of tokens
  protected HashMap tokensMap = new HashMap(150);

  // the pair currently being compared, ordered by the length of their strings;
  // set by matchAnnotations() and matchOtherAnnots()
  protected Annotation shortAnnot, longAnnot;

  // the token lists (from tokensMap) of longAnnot and shortAnnot
  protected ArrayList tokensLongAnnot, tokensShortAnnot;

  /** a feature map to be used when retrieving annotations
   *  declared here so can be reused for efficiency
   *  clear() before each use
   */
  protected FeatureMap tempMap = Factory.newFeatureMap();

  /** the size of the buffer */
  private final static int BUFF_SIZE = 65000;

  /**
   * URL to the file containing the definition for this orthomatcher;
   * init() fails if it is null.
   */
  private java.net.URL definitionFileURL;

  /** The encoding used for the definition file and associated lists.*/
  private String encoding;

  /** @link dependency */
  /*#OrthoMatcher lnkOrthoMatcher;*/
137 
138   public OrthoMatcher () {
139     annotationTypes.add(organizationType);
140     annotationTypes.add(personType);
141     annotationTypes.add("Location");
142     annotationTypes.add("Date");
143   }
144 
145   /** Initialise this resource, and return it. */
146   public Resource init() throws ResourceInstantiationException {
147     //initialise the list of annotations which we will match
148     if(definitionFileURL == null){
149       throw new ResourceInstantiationException(
150                 "No URL provided for the definition file!");
151     }
152 
153     //at this point we have the definition file
154     try{
155       BufferedReader reader = new BufferedReader(
156                       new InputStreamReader(definitionFileURL.openStream(),
157                                             encoding));
158       String lineRead = null;
159       while ((lineRead = reader.readLine()) != null){
160         int index = lineRead.indexOf(":");
161         if (index != -1){
162           String nameFile = lineRead.substring(0,index);
163           String nameList = lineRead.substring(index+1,lineRead.length());
164           createAnnotList(nameFile,nameList);
165         }// if
166       }//while
167       reader.close();
168     }catch(IOException ioe){
169       throw new ResourceInstantiationException(ioe);
170     }
171 
172     return this;
173   } // init()
174 
175   /**  Run the resource. It doesn't make sense not to override
176     *  this in subclasses so the default implementation signals an
177     *  exception.
178     */
179   public void execute() throws ExecutionException{
180 
181     //check the input
182     if(document == null) {
183       throw new ExecutionException(
184         "No document for namematch!"
185       );
186     }
187 
188     // get the annotations from document
189     if ((annotationSetName == null)|| (annotationSetName.equals("")))
190       nameAllAnnots = document.getAnnotations();
191     else
192       nameAllAnnots = document.getAnnotations(annotationSetName);
193 
194     //if none found, print warning and exit
195     if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
196       Out.prln("OrthoMatcher Warning: No annotations found for processing");
197       return;
198     }
199 
200     //check if we've been run on this document before
201     //and clean the doc if needed
202     docCleanup();
203     Map matchesMap = (Map)document.getFeatures().
204                      get(DOCUMENT_COREF_FEATURE_NAME);
205 
206     // creates the cdg list from the document
207     //no need to create otherwise, coz already done in init()
208     if (!extLists)
209       buildTables(nameAllAnnots);
210 
211     //first match all name annotations
212     matchNameAnnotations();
213 
214     //then match the unknown ones to all name ones
215     if (matchingUnknowns)
216       matchUnknown();
217 
218     // set the matches of the document
219 //    determineMatchesDocument();
220     if (! matchesDocFeature.isEmpty()) {
221       if(matchesMap == null){
222         matchesMap = new HashMap();
223       }
224       matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
225       //we need to put it even if it was already present in order to triger
226       //the update events
227       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
228 
229       //cannot do clear() as this has already been put on the document
230       //so I need a new one for the next run of matcher
231       matchesDocFeature = new ArrayList();
232     }
233 
234 //    Out.prln("Processed strings" + processedAnnots.values());
235     //clean-up the internal data structures for next run
236     nameAllAnnots = null;
237     processedAnnots.clear();
238     annots2Remove.clear();
239     tokensMap.clear();
240     matchesDocFeature = new ArrayList();
241     longAnnot = null;
242     shortAnnot = null;
243     tokensLongAnnot = null;
244     tokensShortAnnot = null;
245 
246   } // run()
247 
248   protected void matchNameAnnotations() throws ExecutionException{
249     // go through all the annotation types
250     Iterator iterAnnotationTypes = annotationTypes.iterator();
251     while (iterAnnotationTypes.hasNext()) {
252       String annotationType = (String)iterAnnotationTypes.next();
253 
254       AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
255 
256       // continue if no such annotations exist
257       if ((nameAnnots == null) || nameAnnots.isEmpty())
258         continue;
259 
260       Iterator iterNames = nameAnnots.iterator();
261       while (iterNames.hasNext()) {
262         Annotation nameAnnot = (Annotation) iterNames.next();
263         Integer id = nameAnnot.getId();
264 
265         // get string and value
266         String annotString = null;
267         try {
268             annotString = document.getContent().getContent(
269             nameAnnot.getStartNode().getOffset(),
270             nameAnnot.getEndNode().getOffset()
271             ).toString();
272           // now do the reg. exp. substitutions
273           annotString = annotString.replaceAll("\\s+", " ");
274 
275         } catch (InvalidOffsetException ioe) {
276             throw new ExecutionException
277                                    ("Invalid offset of the annotation");
278         }
279         //convert to lower case if we are not doing a case sensitive match
280         if (!caseSensitive)
281           annotString = annotString.toLowerCase();
282 
283         //get the tokens
284         List tokens = new ArrayList((Set)
285                         nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
286                           nameAnnot.getStartNode().getOffset(),
287                           nameAnnot.getEndNode().getOffset()
288                         ));
289         //if no tokens to match, do nothing
290         if (tokens.isEmpty())
291           continue;
292         Collections.sort(tokens, new gate.util.OffsetComparator());
293         //check if these actually do not end after the name
294         //needed coz new tokeniser conflates
295         //strings with dashes. So British Gas-style is two tokens
296         //instead of three. So cannot match properly British Gas
297 //        tokens = checkTokens(tokens);
298         tokensMap.put(nameAnnot.getId(), tokens);
299 
300 //        Out.prln("Matching annot " + nameAnnot + ": string " + annotString);
301 
302         //first check whether we have not matched such a string already
303         //if so, just consider it matched, don't bother calling the rules
304         if (processedAnnots.containsValue(annotString)) {
305 //          Out.prln("Contained string found " + annotString);
306           updateMatches(nameAnnot, annotString);
307           processedAnnots.put(nameAnnot.getId(), annotString);
308           continue;
309         } else if (processedAnnots.isEmpty()) {
310           processedAnnots.put(nameAnnot.getId(), annotString);
311           continue;
312         }
313 
314         //if a person, then remove their title before matching
315         if (nameAnnot.getType().equals(personType))
316           annotString = containTitle(annotString, nameAnnot);
317         else if (nameAnnot.getType().equals(organizationType))
318           annotString = stripCDG(annotString, nameAnnot);
319 
320         if(null == annotString || "".equals(annotString))
321           continue;
322 
323         //otherwise try matching with previous annotations
324         matchWithPrevious(nameAnnot, annotString);
325 
326 //        Out.prln("Putting in previous " + nameAnnot + ": string " + annotString);
327         //finally add the current annotations to the processed map
328         processedAnnots.put(nameAnnot.getId(), annotString);
329       }//while through name annotations
330 
331     }//while through annotation types
332 
333   }
334 
335   protected void matchUnknown() throws ExecutionException {
336     //get all Unknown annotations
337     AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
338 
339     if ((unknownAnnots == null) || unknownAnnots.isEmpty())
340       return;
341 
342     Iterator iter = unknownAnnots.iterator();
343     //loop through the unknown annots
344     while (iter.hasNext()) {
345       Annotation unknown = (Annotation) iter.next();
346 
347       // get string and value
348       String unknownString = null;
349       try {
350           unknownString = document.getContent().getContent(
351             unknown.getStartNode().getOffset(),
352             unknown.getEndNode().getOffset()
353             ).toString();
354         // now do the reg. exp. substitutions
355           unknownString = unknownString.replaceAll("\\s+", " ");
356       } catch (InvalidOffsetException ioe) {
357           throw new ExecutionException
358                                  ("Invalid offset of the annotation");
359       }
360       //convert to lower case if we are not doing a case sensitive match
361       if (!caseSensitive)
362         unknownString = unknownString.toLowerCase();
363 
364       //get the tokens
365       List tokens = new ArrayList((Set)
366                       nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
367                         unknown.getStartNode().getOffset(),
368                         unknown.getEndNode().getOffset()
369                       ));
370       if (tokens.isEmpty())
371         continue;
372       Collections.sort(tokens, new gate.util.OffsetComparator());
373       tokensMap.put(unknown.getId(), tokens);
374 
375 
376       //first check whether we have not matched such a string already
377       //if so, just consider it matched, don't bother calling the rules
378       if (processedAnnots.containsValue(unknownString)) {
379         Annotation matchedAnnot = updateMatches(unknown, unknownString);
380 //        Out.prln("Matched " + unknown + "with string " + unknownString);
381 //        Out.prln("That's same as " + matchedAnnot);
382         if (matchedAnnot.getType().equals(unknownType)) {
383           annots2Remove.put(unknown.getId(),
384                             annots2Remove.get(matchedAnnot.getId()));
385         }
386         else
387           annots2Remove.put(unknown.getId(), matchedAnnot.getType());
388         processedAnnots.put(unknown.getId(), unknownString);
389         unknown.getFeatures().put("NMRule", unknownType);
390         continue;
391       }
392 
393       //check if we should do sub-string matching in case it's hyphenated
394       //for example US-led
395       if (tokens.size() == 1
396           && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
397         if (matchHyphenatedUnknowns(unknown, unknownString, iter))
398           continue;
399       }//if
400 
401       matchWithPrevious(unknown, unknownString);
402 
403     } //while though unknowns
404 
405     if (! annots2Remove.isEmpty()) {
406       Iterator unknownIter = annots2Remove.keySet().iterator();
407       while (unknownIter.hasNext()) {
408         Integer unknId = (Integer) unknownIter.next();
409         Annotation unknown = nameAllAnnots.get(unknId);
410         Integer newID = nameAllAnnots.add(
411           unknown.getStartNode(),
412           unknown.getEndNode(),
413           (String) annots2Remove.get(unknId),
414           unknown.getFeatures()
415         );
416         nameAllAnnots.remove(unknown);
417 
418         //change the id in the matches list
419         List mList = (List)unknown.getFeatures().
420                      get(ANNOTATION_COREF_FEATURE_NAME);
421         mList.remove(unknId);
422         mList.add(newID);
423       }//while
424     }//if
425   }
426 
427   private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
428                                        Iterator iter){
429     boolean matched = false;
430 
431     //only take the substring before the hyphen
432     int stringEnd = unknownString.indexOf("-");
433     unknownString = unknownString.substring(0, stringEnd);
434     //check if we've already matched this string
435     //because only exact match of the substring are considered
436     if (processedAnnots.containsValue(unknownString)) {
437       matched = true;
438       Annotation matchedAnnot = updateMatches(unknown, unknownString);
439       //only do the matching if not a person, because we do not match
440       //those on sub-strings
441       iter.remove();
442       String newType;
443       if (matchedAnnot.getType().equals(unknownType))
444         newType = (String)annots2Remove.get(matchedAnnot.getId());
445       else
446         newType = matchedAnnot.getType();
447 
448       Integer newID = new Integer(-1);
449       try {
450         newID = nameAllAnnots.add(
451           unknown.getStartNode().getOffset(),
452           new Long(unknown.getStartNode().getOffset().longValue()
453                   + stringEnd),
454           newType,
455           unknown.getFeatures()
456         );
457       } catch (InvalidOffsetException ex) {
458         throw new GateRuntimeException(ex.getMessage());
459       }
460       nameAllAnnots.remove(unknown);
461 
462       //change the id in the matches list
463       List mList = (List)unknown.getFeatures().
464                    get(ANNOTATION_COREF_FEATURE_NAME);
465       mList.remove(unknown.getId());
466       mList.add(newID);
467 
468     }
469     return matched;
470   }
471 
472   protected void matchWithPrevious(Annotation nameAnnot, String annotString) {
473     boolean matchedUnknown = false;
474 
475     Iterator prevIter = processedAnnots.keySet().iterator();
476     while (prevIter.hasNext()) {
477       Integer prevId = (Integer) prevIter.next();
478       Annotation prevAnnot = nameAllAnnots.get(prevId);
479 
480       //check if the two are from the same type or the new one is unknown
481       if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
482           && ! nameAnnot.getType().equals(unknownType))
483          )
484         continue;
485       //do not compare two unknown annotations either
486       //they are only matched to those of known types
487       if (  nameAnnot.getType().equals(unknownType)
488             && prevAnnot.getType().equals(unknownType))
489       continue;
490 
491       //check if we have already matched this annotation to the new one
492       if (matchedAlready(nameAnnot, prevAnnot) )
493         continue;
494 
495       //now changed to a rule, here we just match by gender
496       if (prevAnnot.getType().equals(personType)) {
497         String prevGender =
498           (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
499         String nameGender =
500           (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
501         if (   prevGender != null
502             && nameGender != null
503             && ( (nameGender.equalsIgnoreCase("female")
504                   &&
505                   prevGender.equalsIgnoreCase("male")
506                   )
507                ||
508                   (prevGender.equalsIgnoreCase("female")
509                    && nameGender.equalsIgnoreCase("male")
510                   )
511                 )
512             ) //if condition
513           continue; //we don't have a match if the two genders are different
514 
515       }//if
516 
517       //if the two annotations match
518       if (matchAnnotations(nameAnnot, annotString,  prevAnnot)) {
519 //        Out.prln("Matched " + shortName + "and " + longName);
520         updateMatches(nameAnnot, prevAnnot);
521         //if unknown annotation, we need to change to the new type
522         if (nameAnnot.getType().equals(unknownType)) {
523           matchedUnknown = true;
524           if (prevAnnot.getType().equals(unknownType))
525             annots2Remove.put(nameAnnot.getId(),
526                               annots2Remove.get(prevAnnot.getId()));
527           else
528             annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
529          //also put an attribute to indicate that
530           nameAnnot.getFeatures().put("NMRule", unknownType);
531         }//if unknown
532         break; //no need to match further
533       }//if annotations matched
534 
535     }//while through previous annotations
536 
537     if (matchedUnknown)
538       processedAnnots.put(nameAnnot.getId(), annotString);
539 
540 
541   }//matchWithPrevious
542 
543   protected boolean matchAnnotations(Annotation newAnnot, String annotString,
544                                      Annotation prevAnnot) {
545     //do not match two annotations that overlap
546     if (newAnnot.overlaps(prevAnnot))
547       return false;
548 
549     // find which annotation string of the two is longer
550     //  this is useful for some of the matching rules
551     String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
552 
553     String longName = prevAnnotString;
554     String shortName = annotString;
555     longAnnot = prevAnnot;
556     shortAnnot = newAnnot;
557 
558     if (shortName.length()>longName.length()) {
559       String temp = longName;
560       longName = shortName;
561       shortName = temp;
562       Annotation tempAnn = longAnnot;
563       longAnnot = shortAnnot;
564       shortAnnot = tempAnn;
565     }//if
566 
567     tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
568     tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
569 
570     List matchesList = (List) prevAnnot.getFeatures().
571                               get(ANNOTATION_COREF_FEATURE_NAME);
572     if (matchesList == null || matchesList.isEmpty())
573       return apply_rules_namematch(prevAnnot.getType(), shortName,longName);
574 
575     //if these two match, then let's see if all the other matching one will too
576     //that's needed, because sometimes names can share a token (e.g., first or
577     //last but not be the same
578     if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) {
579       /**
580        * Check whether we need to ensure that there is a match with the rest
581        * of the matching annotations, because the rule requires that
582        * transtivity is not assummed.
583        */
584       if (allMatchingNeeded) {
585         allMatchingNeeded = false;
586 
587         List toMatchList = new ArrayList(matchesList);
588   //      if (newAnnot.getType().equals(unknownType))
589   //        Out.prln("Matching new " + annotString + " with annots " + toMatchList);
590         toMatchList.remove(prevAnnot.getId());
591 
592         return matchOtherAnnots(toMatchList, newAnnot, annotString);
593       } else
594         return true;
595     }
596     return false;
597   }
598 
599   /** This method checkes whether the new annotation matches
600    *  all annotations given in the toMatchList (it contains ids)
601    *  The idea is that the new annotation needs to match all those,
602    *  because assuming transitivity does not always work, when
603    *  two different entities share a common token: e.g., BT Cellnet
604    *  and BT and British Telecom.
605   */
606   protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
607                                       String annotString) {
608 
609     //if the list is empty, then we're matching all right :-)
610     if (toMatchList.isEmpty())
611       return true;
612 
613     boolean matchedAll = true;
614     int i = 0;
615 
616     while (matchedAll && i < toMatchList.size()) {
617       Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
618 
619       // find which annotation string of the two is longer
620       //  this is useful for some of the matching rules
621       String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
622       if (prevAnnotString == null)
623         try {
624           prevAnnotString = document.getContent().getContent(
625             prevAnnot.getStartNode().getOffset(),
626             prevAnnot.getEndNode().getOffset()
627             ).toString();
628         } catch (InvalidOffsetException ioe) {
629           return false;
630         }//try
631 
632 
633       String longName = prevAnnotString;
634       String shortName = annotString;
635       longAnnot = prevAnnot;
636       shortAnnot = newAnnot;
637 
638       if (shortName.length()>=longName.length()) {
639         String temp = longName;
640         longName = shortName;
641         shortName = temp;
642         Annotation tempAnn = longAnnot;
643         longAnnot = shortAnnot;
644         shortAnnot = tempAnn;
645       }//if
646 
647       tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
648       tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
649 
650       matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName);
651 //      if (newAnnot.getType().equals(unknownType))
652 //        Out.prln("Loop: " + shortName + " and " + longName + ": result: " + matchedAll);
653 
654       i++;
655     }//while
656     return matchedAll;
657   }
658 
659 
660   protected boolean matchedAlready(Annotation annot1, Annotation annot2) {
661     //the two annotations are already matched if the matches list of the first
662     //contains the id of the second
663     List matchesList = (List) annot1.getFeatures().
664                        get(ANNOTATION_COREF_FEATURE_NAME);
665     if ((matchesList == null) || matchesList.isEmpty())
666       return false;
667     else if (matchesList.contains(annot2.getId()))
668       return true;
669     return false;
670   }
671 
672   protected Annotation updateMatches(Annotation newAnnot, String annotString) {
673     Annotation matchedAnnot = null;
674     Integer id;
675 
676     //first find a processed annotation with the same string
677     Iterator iter = processedAnnots.keySet().iterator();
678     while (iter.hasNext()) {
679       id = (Integer) iter.next();
680       String oldString = (String) processedAnnots.get(id);
681       if (annotString.equals(oldString)) {
682         matchedAnnot = nameAllAnnots.get(id);
683         break;
684       }//if
685     }//while
686 
687     if (matchedAnnot == null) return null;
688     //if the two matching annotations are of different type which is not
689     //unknown, do not match them
690     if (! matchedAnnot.getType().equals(newAnnot.getType())
691         && !newAnnot.getType().equals(unknownType) )
692       return matchedAnnot;
693 
694     List matchesList = (List) matchedAnnot.getFeatures().
695                        get(ANNOTATION_COREF_FEATURE_NAME);
696     if ((matchesList == null) || matchesList.isEmpty()) {
697       //no previous matches, so need to add
698       if (matchesList == null) {
699         matchesList = new ArrayList();
700         matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
701                                        matchesList);
702         matchesDocFeature.add(matchesList);
703       }//if
704       matchesList.add(matchedAnnot.getId());
705       matchesList.add(newAnnot.getId());
706     } else {
707       //just add the new annotation
708       matchesList.add(newAnnot.getId());
709     }//if
710     //add the matches list to the new annotation
711     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
712     return matchedAnnot;
713   }
714 
715   protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) {
716 
717     List matchesList = (List) prevAnnot.getFeatures().
718                               get(ANNOTATION_COREF_FEATURE_NAME);
719     if ((matchesList == null) || matchesList.isEmpty()) {
720       //no previous matches, so need to add
721       if (matchesList == null) {
722         matchesList = new ArrayList();
723         prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
724         matchesDocFeature.add(matchesList);
725       }//if
726       matchesList.add(prevAnnot.getId());
727       matchesList.add(newAnnot.getId());
728     } else {
729       //just add the new annotation
730       matchesList.add(newAnnot.getId());
731     }//if
732     //add the matches list to the new annotation
733     newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
734     //propagate the gender if two persons are matched
735     if (prevAnnot.getType().equals(personType)) {
736       String prevGender =
737         (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
738       String newGender =
739         (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
740       boolean unknownPrevGender = isUnknownGender(prevGender);
741       boolean unknownNewGender = isUnknownGender(newGender);
742       if (unknownPrevGender && !unknownNewGender)
743         prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender);
744       else if (unknownNewGender && !unknownPrevGender)
745         newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender);
746     }//if
747   }
748 
749 
750   protected void docCleanup() {
751     Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
752     if (matchesValue != null && (matchesValue instanceof Map))
753       ((Map)matchesValue).remove(nameAllAnnots.getName());
754     else if (matchesValue != null) {
755       document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap());
756     }
757 
758     //get all annotations that have a matches feature
759     HashSet fNames = new HashSet();
760     fNames.add(ANNOTATION_COREF_FEATURE_NAME);
761     AnnotationSet annots =
762                   nameAllAnnots.get(null, fNames);
763 
764 //    Out.prln("Annots to cleanup" + annots);
765 
766     if (annots == null || annots.isEmpty())
767       return;
768 
769     Iterator iter = annots.iterator();
770     while (iter.hasNext()) {
771       while (iter.hasNext())
772         ((Annotation) iter.next()).getFeatures().
773                                    remove(ANNOTATION_COREF_FEATURE_NAME);
774     } //while
775   }//cleanup
776 
777   /** return a person name without title */
778   protected String containTitle (String annotString, Annotation annot)
779                       throws ExecutionException {
780     // get the offsets
781     Long startAnnot = annot.getStartNode().getOffset();
782     Long endAnnot = annot.getEndNode().getOffset();
783 
784     // determine "Lookup" annotation set
785     queryFM.clear();
786     queryFM.put("majorType", "title");
787     AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot);
788     if (as1 == null || as1.isEmpty())
789       return annotString;
790     AnnotationSet as =
791       as1.get("Lookup", queryFM);
792     if (as !=null && ! as.isEmpty()) {
793       List titles = new ArrayList((Set)as);
794       Collections.sort(titles, new gate.util.OffsetComparator());
795 
796       Iterator iter = titles.iterator();
797       while (iter.hasNext()) {
798         Annotation titleAnn = (Annotation)(iter.next());
799 
800         //we've not found a title at the start offset,
801         //there's no point in looking further
802         //coz titles come first
803         if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
804           return annotString;
805 
806         try {
807           // the title from the current annotation
808           String annotTitle =
809             document.getContent().getContent(
810               titleAnn.getStartNode().getOffset(),
811               titleAnn.getEndNode().getOffset()
812             ).toString();
813 
814           // eliminate the title from annotation string and return the result
815           if (annotTitle.length()<annotString.length()) {
816             //remove from the array of tokens, so then we can compare properly
817             //the remaining tokens
818 //            Out.prln("Removing title from: " + annot + " with string " + annotString);
819 //            Out.prln("Tokens are" + tokensMap.get(annot.getId()));
820 //            Out.prln("Title is" + annotTitle);
821             ((ArrayList) tokensMap.get(annot.getId())).remove(0);
822             return annotString.substring(
823                                  annotTitle.length()+1,annotString.length());
824           }
825         } catch (InvalidOffsetException ioe) {
826             throw new ExecutionException
827                                ("Invalid offset of the annotation");
828         }//try
829       }// while
830     }//if
831     return annotString;
832 
833   }
834 
835   /** return an organization  without a designator and starting The*/
836   protected String stripCDG (String annotString, Annotation annot){
837 
838     ArrayList tokens = (ArrayList) tokensMap.get(annot.getId());
839 
840     //strip starting The first
841     if ( ((String) ((Annotation) tokens.get(0)
842           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
843           .equalsIgnoreCase(THE_VALUE))
844       tokens.remove(0);
845 
846     //no need to check for cdg if there is only 1 token or less
847     if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1)
848           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
849       tokens.remove(tokens.size()-1);
850 
851     StringBuffer newString = new StringBuffer(50);
852     for (int i = 0; i < tokens.size(); i++){
853       newString.append((String) ((Annotation) tokens.get(i)
854           ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
855       if (i != tokens.size()-1)
856         newString.append(" ");
857     }
858 //    Out.prln("Strip CDG returned: " + newString + "for string " + annotString);
859 
860     if (caseSensitive)
861       return newString.toString();
862 
863     return newString.toString().toLowerCase();
864   }
865 
866 /*
867   public void check() throws ExecutionException {
868     if (executionException != null) {
869       ExecutionException e = executionException;
870       executionException = null;
871       throw e;
872     }
873   } // check()
874 */
875 
876   /** if ( == false) then reads the names of files in order
877     *  to create the lookup tables
878     */
879 //  protected void createLists() throws IOException {
880 //
881 //    InputStream inputStream = Files.getGateResourceAsStream(
882 //                                              "creole/namematcher/listsNM.def");
883 //    InputStreamReader inputStreamReader = new InputStreamReader (
884 //                                                    inputStream);
885 //    BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
886 //
887 //    String lineRead = null;
888 //    while ((lineRead = bufferedReader.readLine()) != null){
889 //      int index = lineRead.indexOf(":");
890 //      if (index != -1){
891 //        String nameFile = lineRead.substring(0,index);
892 //        String nameList = lineRead.substring(index+1,lineRead.length());
893 //        createAnnotList(nameFile,nameList);
894 //      }// if
895 //    }//while
896 //    bufferedReader.close();
897 //    inputStreamReader.close();
898 //    inputStream.close();
899 //  }// createLists()
900 
901   /** creates the lookup tables */
902   protected void createAnnotList(String nameFile,String nameList)
903                                                           throws IOException{
904 
905 //    InputStream inputStream = Files.getGateResourceAsStream(
906 //                                              "creole/namematcher/"+nameFile);
907 //    InputStreamReader inputStreamReader = new InputStreamReader (
908 //                                                    inputStream);
909 //    BufferedReader bufferedReader = new BufferedReader(inputStreamReader);
910 
911     //create the relative URL
912     URL fileURL = new URL(definitionFileURL, nameFile);
913     BufferedReader bufferedReader =
914       new BufferedReader(new InputStreamReader(fileURL.openStream(),
915                          encoding));
916 
917     String lineRead = null;
918     while ((lineRead = bufferedReader.readLine()) != null){
919       if (nameList.compareTo(CDGLISTNAME)==0){
920         if (caseSensitive)
921           cdg.add(lineRead);
922         else
923           cdg.add(lineRead.toLowerCase());
924       }// if
925       else {
926         int index = lineRead.indexOf("£");
927         if (index != -1){
928           String  expr = lineRead.substring(0,index);
929           //if not case-sensitive, we need to downcase all strings
930           if (!caseSensitive)
931             expr = expr.toLowerCase();
932           String code = lineRead.substring(index+1,lineRead.length());
933           if (nameList.equals(ALIASLISTNAME))
934                             alias.put(expr, code);
935           else
936           if (nameList.equals(ARTLISTNAME))
937                             def_art.put(expr, code);
938           else
939           if (nameList.equals(PREPLISTNAME))
940                             prepos.put(expr, code);
941           else
942           if (nameList.equals(CONNECTORLISTNAME))
943                             connector.put(expr, code);
944           else
945           if (nameList.equals(SPURLISTNAME))
946                             spur_match.put(expr, code);
947 
948         }//if
949       }// else
950 
951     }//while
952   }//createAnnotList
953 
954 
955   /** apply_rules_namematch: apply rules similarly to lasie1.5's namematch */
956   private boolean apply_rules_namematch(String annotationType, String shortName,
957                                         String longName) {
958     // first apply rule for spurius matches i.e. rule0
959     if (matchRule0(longName, shortName))
960       return false;
961     if (
962          (// rules for all annotations
963           //no longer use rule1, coz I do the check for same string via the
964           //hash table
965             matchRule2(longName, shortName)
966          ||
967             matchRule3(longName, shortName)
968          ) // rules for all annotations
969          ||
970          (// rules for organisation annotations
971              ( annotationType.equals(organizationType)
972                //ACE addition
973                || annotationType.equals("Facility"))
974              &&
975              (    matchRule4(longName, shortName)
976                ||
977                   matchRule5(longName, shortName)
978                ||
979                   matchRule6(longName, shortName)
980                ||
981                   matchRule7(longName, shortName)
982                ||
983 //                  matchRule8(longName, shortName)
984 //               ||
985                   matchRule9(longName, shortName)
986                ||
987                   matchRule10(longName, shortName)
988                ||
989                   matchRule11(longName, shortName)
990                ||
991                   matchRule12(longName, shortName)
992                ||
993                   matchRule13(shortName, longName)
994               )
995            )// rules for organisation annotations
996          ||
997          (// rules for person annotations
998              (    annotationType.equals(personType))
999                &&
1000             (    matchRule4(longName, shortName)
1001               ||
1002                  matchRule5(longName, shortName)
1003               ||
1004                  matchRule14(longName, shortName)
1005               || //kalina: added this, so it matches names when contain more
1006                  //than one first and one last name
1007                  matchRule15(longName, shortName)
1008              )
1009          )// rules for person annotations
1010         ) //if
1011      return true;
1012    return false;
1013  }//apply_rules
1014
1015
1016  /** set the extLists flag */
1017  public void setExtLists(Boolean newExtLists) {
1018    extLists = newExtLists.booleanValue();
1019  }//setextLists
1020
1021  /** set the caseSensitive flag */
1022  public void setCaseSensitive(Boolean newCase) {
1023    caseSensitive = newCase.booleanValue();
1024  }//setextLists
1025
1026  /** set the annotation set name*/
1027  public void setAnnotationSetName(String newAnnotationSetName) {
1028    annotationSetName = newAnnotationSetName;
1029  }//setAnnotationSetName
1030
1031  /** set the types of the annotations*/
1032  public void setAnnotationTypes(List newType) {
1033    annotationTypes = newType;
1034  }//setAnnotationTypes
1035
1036  /** set whether to process the Unknown annotations*/
1037  public void setProcessUnknown(Boolean processOrNot) {
1038    this.matchingUnknowns = processOrNot.booleanValue();
1039  }//setAnnotationTypes
1040
1041  public void setOrganizationType(String newOrganizationType) {
1042    organizationType = newOrganizationType;
1043  }//setOrganizationType
1044
1045  public void setPersonType(String newPersonType) {
1046    personType = newPersonType;
1047  }//setPersonType
1048
1049  /**get the name of the annotation set*/
1050  public String getAnnotationSetName() {
1051    return annotationSetName;
1052  }//getAnnotationSetName
1053
1054  /** get the types of the annotation*/
1055  public List getAnnotationTypes() {
1056    return annotationTypes;
1057  }//getAnnotationTypes
1058
1059  public String getOrganizationType() {
1060    return organizationType;
1061  }
1062
1063  public String getPersonType() {
1064    return personType;
1065  }
1066
1067  public Boolean getExtLists() {
1068    return new Boolean(extLists);
1069  }
1070
1071  /** Are we running in a case-sensitive mode?*/
1072  public Boolean getCaseSensitive() {
1073    return new Boolean(caseSensitive);
1074  }
1075
1076  /** Return whether or not we're processing the Unknown annots*/
1077  public Boolean getProcessUnknown() {
1078    return new Boolean(matchingUnknowns);
1079  }
1080
1081/*
1082  public List getMatchesDocument() {
1083    return matchesDocument;
1084  }
1085*/
1086
1087  protected boolean isUnknownGender(String gender) {
1088    if (gender == null)
1089      return true;
1090    if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
1091      return false;
1092    return true;
1093
1094  } //isUnknownGender
1095
1096  /** RULE #0: If the two names are listed in table of
1097    * spurius matches then they do NOT match
1098    * Condition(s): -
1099    * Applied to: all name annotations
1100    */
1101  public boolean matchRule0(String s1,
1102           String s2) {
1103    if (spur_match.containsKey(s1)
1104        && spur_match.containsKey(s2) )
1105      return
1106        spur_match.get(s1).toString().equals(spur_match.get(s2).toString());
1107
1108    return false;
1109  }//matchRule0
1110
1111  /** RULE #1: If the two names are identical then they are the same
1112    * no longer used, because I do the check for same string via the
1113    * hash table of previous annotations
1114    * Condition(s): depend on case
1115    * Applied to: all name annotations
1116    */
1117  public boolean matchRule1(String s1,
1118           String s2,
1119           boolean matchCase) {
1120//    Out.prln("Rule1: Matching " + s1 + "and " + s2);
1121
1122    boolean matched = false;
1123    if (!matchCase)
1124        matched = s1.equalsIgnoreCase(s2);
1125    else matched =  s1.equals(s2) ;
1126//kalina: do not remove, nice for debug
1127//    if (matched && (s2.startsWith("Kenneth") || s1.startsWith("Kenneth")))
1128//        Out.prln("Rule1: Matched " + s1 + "and " + s2);
1129    return matched;
1130  }//matchRule1
1131
1132
1133  /**
1134    * RULE #2: if the two names are listed as equivalent in the
1135    * lookup table (alias) then they match
1136    * Condition(s): -
1137    * Applied to: all name annotations
1138    */
1139  public boolean matchRule2(String s1,
1140           String s2) {
1141
1142    if (alias.containsKey(s1) && alias.containsKey(s2))
1143      return (alias.get(s1).toString().equals(alias.get(s2).toString()));
1144
1145    return false;
1146  }//matchRule2
1147
1148  /**
1149    * RULE #3: adding a possessive at the end
1150    * of one name causes a match
1151    * e.g. "Standard and Poor" == "Standard and Poor's"
1152    * and also "Standard and Poor" == "Standard's"
1153    * Condition(s): case-insensitive match
1154    * Applied to: all name annotations
1155    */
1156  public boolean matchRule3(String s1, //long string
1157                             String s2) { //short string
1158
1159    if (s2.endsWith("'s") || s2.endsWith("'")
1160        ||(s1.endsWith("'s")|| s1.endsWith("'"))) {
1161
1162
1163      String s2_poss = null;
1164
1165      if (!s2.endsWith("'s")) s2_poss = s2.concat("'s");
1166      else s2_poss = s2.concat("'");
1167
1168      if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true;
1169
1170      // now check the second case i.e. "Standard and Poor" == "Standard's"
1171      String token = (String)
1172        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1173
1174      if (!token.endsWith("'s")) s2_poss = token.concat("'s");
1175      else s2_poss = token.concat("'");
1176
1177      if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true;
1178
1179    } // if (s2.endsWith("'s")
1180    return false;
1181  }//matchRule3
1182
1183  /**
1184    * RULE #4: Do all tokens other than the punctuation marks
1185    * , and . match?
1186    * e.g. "Smith, Jones" == "Smith Jones"
1187    * Condition(s): case-insensitive match
1188    * Applied to: organisation and person annotations
1189    */
1190  public boolean matchRule4(String s1,
1191           String s2) {
1192
1193    boolean allTokensMatch = true;
1194
1195    Iterator tokensLongAnnotIter = tokensLongAnnot.iterator();
1196    Iterator tokensShortAnnotIter = tokensShortAnnot.iterator();
1197    while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) {
1198      Annotation token = (Annotation) tokensLongAnnotIter.next();
1199      if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE))
1200        continue;
1201//      Out.prln("Matching" + tokensLongAnnot + " with " + tokensShortAnnot);
1202      if (! token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1203             ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) {
1204        allTokensMatch = false;
1205        break;
1206      } // if (!tokensLongAnnot.nextToken()
1207    } // while
1208//    if (allTokensMatch)
1209//      Out.prln("rule4 fired. result is: " + allTokensMatch);
1210    return allTokensMatch;
1211  }//matchRule4
1212
1213  /**
1214    * RULE #5: if the 1st token of one name
1215    * matches the second name
1216    * e.g. "Pepsi Cola" == "Pepsi"
1217    * Condition(s): case-insensitive match
1218    * Applied to: all name annotations
1219    */
1220  public boolean matchRule5(String s1,
1221           String s2) {
1222
1223    //do not match numbers by this rule
1224    if (tokensLongAnnot.size()> 1 &&
1225        ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number"))
1226      return false;
1227
1228//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick")) {
1229//      Out.prln("Rule 5: " + s1 + "and " + s2);
1230//    }
1231
1232    //require that when matching person names, the shorter one to be of length 1
1233    //for the rule to apply. In other words, avoid matching Peter Smith and
1234    //Peter Kline, because they share a Peter token.
1235    if ( (shortAnnot.getType().equals(personType)
1236         || longAnnot.getType().equals(personType)
1237         )
1238       &&
1239         tokensShortAnnot.size()>1
1240       )
1241       return false;
1242
1243    if (tokensLongAnnot.size()<=1)
1244      return false;
1245    boolean result = matchRule1((String)
1246                      ((Annotation) tokensLongAnnot.get(0)
1247                        ).getFeatures().get(TOKEN_STRING_FEATURE_NAME),
1248                      s2,
1249                      caseSensitive);
1250
1251//    if (s1.startsWith("Patrick") || s2.startsWith("Patrick"))
1252//      Out.prln("rule 5 result: " + result);
1253    return result;
1254
1255  }//matchRule5
1256
1257  /**
1258    * RULE #6: if one name is the acronym of the other
1259    * e.g. "Imperial Chemical Industries" == "ICI"
1260    * Applied to: organisation annotations only
1261    */
1262  public boolean matchRule6(String s1,
1263           String s2) {
1264
1265    int i = 0;
1266
1267    //check and if the shorted string has a space in it, then it's not
1268    //an acronym
1269    if (s2.indexOf(" ") > 0)
1270      return false;
1271
1272    //Out.prln("Acronym: Matching " + s1 + "and " + s2);
1273    StringBuffer acronym_s1 = new StringBuffer("");
1274    StringBuffer acronymDot_s1 = new StringBuffer("");
1275
1276    for ( ;i < tokensLongAnnot.size(); i++ ) {
1277      String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i)
1278                         ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1);
1279      acronym_s1.append(toAppend);
1280      acronymDot_s1.append(toAppend);
1281      acronymDot_s1.append(".");
1282    }
1283
1284    //Out.prln("Acronym dot: To Match " + acronymDot_s1 + "and " + s2);
1285    //Out.prln("Result: " + matchRule1(acronymDot_s1.toString(),s2,caseSensitive));
1286
1287    if (matchRule1(acronym_s1.toString(),s2,caseSensitive) ||
1288        matchRule1(acronymDot_s1.toString(),s2,caseSensitive) )
1289      return true;
1290
1291    return false;
1292  }//matchRule6
1293
1294  /**
1295    * RULE #7: if one of the tokens in one of the
1296    * names is in the list of separators eg. "&"
1297    * then check if the token before the separator
1298    * matches the other name
1299    * e.g. "R.H. Macy & Co." == "Macy"
1300    * Condition(s): case-sensitive match
1301    * Applied to: organisation annotations only
1302    */
1303  public boolean matchRule7(String s1,
1304           String s2) {
1305
1306    //don't try it unless the second string is just one token
1307    if (tokensShortAnnot.size() != 1)
1308      return false;
1309
1310    String previous_token = null;
1311
1312    for (int i = 0;  i < tokensLongAnnot.size(); i++ ) {
1313      if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i)
1314          ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) {
1315        previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1)
1316                                    ).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1317
1318        break;
1319      }
1320    }
1321
1322    //now match previous_token with other name
1323    if (previous_token != null) {
1324//      if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1325//        Out.prln("Rule7");
1326      return matchRule1(previous_token,s2,caseSensitive);
1327
1328    }
1329    return false;
1330  }//matchRule7
1331
1332  /**
1333   * This rule is now obsolete, as The and the trailing CDG
1334   * are stripped before matching.
1335   * DO NOT CALL!!!
1336   *
1337    * RULE #8: if the names match, ignoring The and
1338    * and trailing company designator (which have already been stripped)
1339    * e.g. "The Magic Tricks Co." == "Magic Tricks"
1340    * Condition(s): case-sensitive match
1341    * Applied to: organisation annotations only
1342    */
1343  public boolean matchRule8(String s1,
1344           String s2) {
1345    Out.prln("OrthoMatcher warning: This rule has been discontinued!");
1346/*
1347    if (s1.startsWith("The ")) s1 = s1.substring(4);
1348    if (s2.startsWith("The ")) s2 = s2.substring(4);
1349
1350    // check that cdg is not empty
1351    if (!cdg.isEmpty()) {
1352      String stringToTokenize1 = s1;
1353      StringTokenizer tokensLongAnnot = new StringTokenizer(stringToTokenize1," ");
1354
1355      String stringToTokenize2 = s2;
1356      StringTokenizer tokensShortAnnot = new StringTokenizer(stringToTokenize2," ");
1357      String token = null;
1358      String cdg1 = null;
1359      String cdg2 = null;
1360
1361      s1 = "";
1362      s2 = "";
1363
1364      //check last token of s1
1365      while (tokensLongAnnot.hasMoreTokens()) {
1366        token = tokensLongAnnot.nextToken();
1367        if (!tokensLongAnnot.hasMoreTokens()
1368            && cdg.contains(token)) cdg1=token;
1369        else s1 = s1+token;
1370      }
1371
1372      // do the same for s2
1373      while (tokensShortAnnot.hasMoreTokens()) {
1374        token = tokensShortAnnot.nextToken();
1375        if (!tokensShortAnnot.hasMoreTokens()
1376          && cdg.contains(token)) cdg2=token;
1377        else s2 = s2+token;
1378      }
1379
1380      // if the company designators are different
1381      // then they are NOT the same organisations
1382      if ((cdg1!=null && cdg2!=null)
1383    && !cdg1.equalsIgnoreCase(cdg2)) return false;
1384    }
1385    if (!s1.equals("") && !s2.equals("")) return matchRule1(s1,s2,caseSensitive);
1386*/
1387    return false;
1388
1389  }//matchRule8
1390
1391  /**
1392    * RULE #9: does one of the names match the token
1393    * just before a trailing company designator
1394    * in the other name?
1395    * The company designator has already been chopped off,
1396    * so the token before it, is in fact the last token
1397    * e.g. "R.H. Macy Co." == "Macy"
1398    * Applied to: organisation annotations only
1399    */
1400  public boolean matchRule9(String s1,
1401           String s2) {
1402
1403//    if (s1.equalsIgnoreCase("news") || s2.equalsIgnoreCase("news"))
1404//      Out.prln("Rule 9 " + s1 + " and " + s2);
1405    String s1_short = (String)
1406                      ((Annotation) tokensLongAnnot.get(
1407                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1408//    Out.prln("Converted to " + s1_short);
1409    if (tokensLongAnnot.size()>1) {
1410      boolean matched = matchRule1(s1_short, s2, caseSensitive);
1411      //we need to make sure all names match, instead of assuming transitivity,
1412      //to avoid matching BBC News with News then News with ITV News, which
1413      //by transitivity leads to BBC News matching ITV News which is not what
1414      //we want
1415      if (matched)
1416        allMatchingNeeded = true;
1417      return matched;
1418    } //if
1419
1420    return false;
1421  }//matchRule9
1422
1423  /**
1424    * RULE #10: is one name the reverse of the other
1425    * reversing around prepositions only?
1426    * e.g. "Department of Defence" == "Defence Department"
1427    * Condition(s): case-sensitive match
1428    * Applied to: organisation annotations only
1429    */
1430  public boolean matchRule10(String s1,
1431            String s2) {
1432
1433    String token = null;
1434    String previous_token = null;
1435    String next_token = null;
1436    boolean invoke_rule=false;
1437
1438    if (tokensLongAnnot.size() >= 3
1439        && tokensShortAnnot.size() >= 2) {
1440
1441      // first get the tokens before and after the preposition
1442      int i = 0;
1443      for (; i< tokensLongAnnot.size(); i++) {
1444        token = (String)
1445                  ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1446        if (prepos.containsKey(token)) {
1447          invoke_rule=true;
1448          break;
1449        }//if
1450        previous_token = token;
1451      }//while
1452
1453      if (! invoke_rule)
1454        return false;
1455
1456      if (i < tokensLongAnnot.size()
1457          && previous_token != null)
1458        next_token= (String)
1459                    ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1460      else return false;
1461
1462      String s21 = (String)
1463                    ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1464      String s22 = (String)
1465                    ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1466      // then compare (in reverse) with the first two tokens of s2
1467      if (matchRule1(next_token,(String) s21,caseSensitive)
1468          && matchRule1(previous_token, s22,caseSensitive))
1469        return true ;
1470    }//if (tokensLongAnnot.countTokens() >= 3
1471    return false;
1472  }//matchRule10
1473
1474  /**
1475    * RULE #11: does one name consist of contractions
1476    * of the first two tokens of the other name?
1477    * e.g. "Communications Satellite" == "ComSat"
1478    * and "Pan American" == "Pan Am"
1479    * Condition(s): case-sensitive match
1480    * Applied to: organisation annotations only
1481    */
1482  public boolean matchRule11(String s1,
1483            String s2) {
1484
1485
1486    // first do the easy case e.g. "Pan American" == "Pan Am"
1487
1488    String token11 = null;
1489    String token12 = null;
1490    String token21 = null;
1491    String token22 = null;
1492
1493    if (tokensLongAnnot.size() < 2)
1494      return false;
1495
1496    // 1st get the first two tokens of s1
1497    token11 = (String)
1498                ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1499    token12 = (String)
1500                ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1501
1502    // now check for the first case i.e. "Pan American" == "Pan Am"
1503    if (tokensShortAnnot.size() == 2)  {
1504
1505      token21 = (String)
1506                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1507      token22 = (String)
1508                  ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1509
1510      if (token11.startsWith(token21)
1511          && token12.startsWith(token22))
1512        return true;
1513
1514    } // if (tokensShortAnnot.countTokens() == 2)
1515
1516    // now the second case e.g.  "Communications Satellite" == "ComSat"
1517    else if (tokensShortAnnot.size()==1 && s2.length()>=3) {
1518
1519      // split the token into possible contractions
1520      // ignore case for matching
1521      for (int i=2;i<s2.length();i++) {
1522        token21=s2.substring(0,i+1);
1523        token22=s2.substring(i+1);
1524
1525        if (token11.startsWith(token21)
1526            && token12.startsWith(token22))
1527          return true;
1528      }// for
1529    } // else if
1530
1531    return false;
1532  }//matchRule11
1533
1534  /**
1535    * RULE #12: do the first and last tokens of one name
1536    * match the first and last tokens of the other?
1537    * Condition(s): case-sensitive match
1538    * Applied to: organisation annotations only
1539    */
1540  public boolean matchRule12(String s1,
1541            String s2) {
1542
1543    // first do the easy case e.g. "Pan American" == "Pan Am"
1544
1545    if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) {
1546//     Out.prln("Rule 12");
1547
1548      // get first and last tokens of s1 & s2
1549      String s1_first = (String)
1550                     ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1551      String s2_first = (String)
1552                     ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1553
1554      if (!matchRule1(s1_first,s2_first,caseSensitive))
1555        return false;
1556
1557      String s1_last = (String)
1558         ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1559      String s2_last = (String)
1560         ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1561
1562      return matchRule1(s1_last,s2_last,caseSensitive);
1563    } // if (tokensLongAnnot.countTokens()>1
1564    return false;
1565  }//matchRule12
1566
1567  /**
1568    * RULE #13: do multi-word names match except for
1569    * one token e.g.
1570    * "Second Force Recon Company" == "Force Recon Company"
1571    * Note that this rule has NOT been used in LaSIE's 1.5
1572    * namematcher
1573    * Restrictions: - remove cdg first
1574    *               - shortest name should be 2 words or more
1575    *               - if N is the number of tokens of the longest
1576    *                 name, then N-1 tokens should be matched
1577    * Condition(s): case-sensitive match
1578    * Applied to: organisation or person annotations only
1579    */
1580  public boolean matchRule13(String s1,
1581            String s2) {
1582
1583
1584    String token1 = null;
1585    String token2 = null;
1586
1587    int matched_tokens = 0, mismatches = 0;;
1588
1589    // if names < 2 words then rule is invalid
1590    if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;
1591
1592//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1593//      Out.prln("Rule 13: Matching tokens" + tokensLongAnnot);
1594//      Out.prln("with tokens " + tokensShortAnnot);
1595//    }
1596
1597    // now do the matching
1598    for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) {
1599
1600//      Out.prln("i = " + i);
1601//      Out.prln("j = " + j);
1602      if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1603           ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) {
1604        matched_tokens++;
1605        j++;
1606      } else
1607        mismatches++;
1608    } // for
1609
1610    if (matched_tokens >= tokensLongAnnot.size()-1)
1611      return true;
1612
1613    return false;
1614  }//matchRule13
1615
1616  /**
1617    * RULE #14: if the last token of one name
1618    * matches the second name
1619    * e.g. "Hamish Cunningham" == "Cunningham"
1620    * Condition(s): case-insensitive match
1621    * Applied to: all person annotations
1622    */
1623  public boolean matchRule14(String s1,
1624           String s2) {
1625
1626//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin"))
1627//      Out.prln("Rule 14 " + s1 + " and " + s2);
1628    String s1_short = (String)
1629                      ((Annotation) tokensLongAnnot.get(
1630                          tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1631//    Out.prln("Converted to " + s1_short);
1632    if (tokensLongAnnot.size()>1)
1633      return matchRule1(s1_short,
1634                      s2,
1635                      caseSensitive);
1636
1637    return false;
1638
1639  }//matchRule14
1640
1641  /**
1642    * RULE #15: does one token from a Person name appear as the other token
1643    * Note that this rule has NOT been used in LaSIE's 1.5
1644    * namematcher; added for ACE by Di's request
1645    */
1646  public boolean matchRule15(String s1,
1647            String s2) {
1648
1649    int matched_tokens = 0;
1650
1651    // if names < 2 words then rule is invalid
1652
1653//    if (s1.equalsIgnoreCase("chin") || s2.equalsIgnoreCase("chin")) {
1654//      Out.prln("Rule 15:" );
1655//      Out.prln("with tokens " + tokensShortAnnot);
1656//    }
1657
1658    // now do the matching
1659    Annotation token1, token2;
1660    for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
1661      token1 = (Annotation) tokensShortAnnot.get(i);
1662      //first check if not punctuation, because we need to skip it
1663      if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1664        continue;
1665
1666      for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) {
1667//      Out.prln("i = " + i);
1668        token2 = (Annotation) tokensLongAnnot.get(j);
1669        if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1670          continue;
1671        if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1672             token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
1673          matched_tokens++;
1674      }//for
1675    } // for
1676
1677    //19 February 2002: kalina
1678    //was originally > 0 (i.e., any match is good)
1679    //ensure that we've matched all the tokens in the short annotation
1680    //the reason for that is, because otherwise we match
1681    //Patrick Viera and Patrick Somebody - not good!
1682    if (matched_tokens == tokensShortAnnot.size())
1683      return true;
1684
1685    return false;
1686  }//matchRule15
1687
1688
1689  /** Tables for namematch info
1690    * (used by the namematch rules)
1691    */
1692  private void buildTables(AnnotationSet nameAllAnnots) {
1693
1694    //reset the tables first
1695    cdg.clear();
1696
1697    if (! extLists) {
1698    // i.e. get cdg from Lookup annotations
1699      // get all Lookup annotations
1700      tempMap.clear();
1701      tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
1702      //now get all lookup annotations which are cdg
1703      AnnotationSet nameAnnots =
1704        nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
1705
1706      if ((nameAnnots ==null) || nameAnnots.isEmpty())
1707        return;
1708
1709      Iterator iter = nameAnnots.iterator();
1710      while (iter.hasNext()) {
1711         Annotation annot = (Annotation)iter.next();
1712         // get the actual string
1713         Long offsetStartAnnot = annot.getStartNode().getOffset();
1714         Long offsetEndAnnot = annot.getEndNode().getOffset();
1715         try {
1716           gate.Document doc = nameAllAnnots.getDocument();
1717           String annotString =
1718                            doc.getContent().getContent(
1719                            offsetStartAnnot,offsetEndAnnot
1720                            ).toString();
1721                cdg.add(annotString);
1722         } catch (InvalidOffsetException ioe) {
1723             ioe.printStackTrace(Err.getPrintWriter());
1724         }
1725      }// while
1726    }//if
1727  }//buildTables
1728
1729  
1730  public void setDefinitionFileURL(java.net.URL definitionFileURL) {
1731    this.definitionFileURL = definitionFileURL;
1732  }
1733
1734  public java.net.URL getDefinitionFileURL() {
1735    return definitionFileURL;
1736  }
1737  public void setEncoding(String encoding) {
1738    this.encoding = encoding;
1739  }
1740  public String getEncoding() {
1741    return encoding;
1742  }
1743
1744
1745  private static class Class1 {
1746  }
1747} // public class OrthoMatcher
1748
1749