1
15
16
17 package gate.creole.orthomatcher;
18
19 import java.io.*;
20 import java.net.URL;
21 import java.util.*;
22
23 import gate.*;
24 import gate.creole.*;
25 import gate.util.*;
26
27 public class OrthoMatcher extends AbstractLanguageAnalyser
28 implements ANNIEConstants{
29
30 public static final String
31 OM_DOCUMENT_PARAMETER_NAME = "document";
32
33 public static final String
34 OM_ANN_SET_PARAMETER_NAME = "annotationSetName";
35
36 public static final String
37 OM_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
38
39 public static final String
40 OM_ANN_TYPES_PARAMETER_NAME = "annotationTypes";
41
42 public static final String
43 OM_ORG_TYPE_PARAMETER_NAME = "organizationType";
44
45 public static final String
46 OM_PERSON_TYPE_PARAMETER_NAME = "personType";
47
48 public static final String
49 OM_EXT_LISTS_PARAMETER_NAME = "extLists";
50
51 protected static final String CDGLISTNAME = "cdg";
52 protected static final String ALIASLISTNAME = "alias";
53 protected static final String ARTLISTNAME = "def_art";
54 protected static final String PREPLISTNAME = "prepos";
55 protected static final String CONNECTORLISTNAME = "connector";
56 protected static final String SPURLISTNAME = "spur_match";
57
58 protected static final String PUNCTUATION_VALUE = "punctuation";
59 protected static final String THE_VALUE = "The";
60
61
62
63 protected String annotationSetName;
64
65
66 protected List annotationTypes = new ArrayList(10);
67
68
69 protected String organizationType = ORGANIZATION_ANNOTATION_TYPE;
70
71
72 protected String personType = PERSON_ANNOTATION_TYPE;
73
74 protected String unknownType = "Unknown";
75
76
77 protected boolean extLists = true;
78
79
80 protected boolean matchingUnknowns = true;
81
82
88 private boolean allMatchingNeeded = false;
89
90 protected boolean caseSensitive = false;
92
93 protected FeatureMap queryFM = Factory.newFeatureMap();
94
95
97 protected HashMap alias = new HashMap(100);
100 protected HashSet cdg = new HashSet(50);
101 protected HashMap spur_match = new HashMap(100);
102 protected HashMap def_art = new HashMap(20);
103 protected HashMap connector = new HashMap(20);
104 protected HashMap prepos = new HashMap(30);
105
106
107 protected AnnotationSet nameAllAnnots = null;
108 protected HashMap processedAnnots = new HashMap(150);
109 protected HashMap annots2Remove = new HashMap(75);
110 protected List matchesDocFeature = new ArrayList();
111 protected HashMap tokensMap = new HashMap(150);
113
114 protected Annotation shortAnnot, longAnnot;
115
116 protected ArrayList tokensLongAnnot, tokensShortAnnot;
117
118
122 protected FeatureMap tempMap = Factory.newFeatureMap();
123
124
125 private final static int BUFF_SIZE = 65000;
126
127
130 private java.net.URL definitionFileURL;
131
132
133 private String encoding;
134
135
136
137
/**
 * Creates a matcher whose default annotation types are the organization
 * type, the person type, "Location" and "Date", in that order.
 */
public OrthoMatcher () {
  String[] defaultTypes = {organizationType, personType, "Location", "Date"};
  for (int i = 0; i < defaultTypes.length; i++) {
    annotationTypes.add(defaultTypes[i]);
  }
}
144
145
146 public Resource init() throws ResourceInstantiationException {
147 if(definitionFileURL == null){
149 throw new ResourceInstantiationException(
150 "No URL provided for the definition file!");
151 }
152
153 try{
155 BufferedReader reader = new BufferedReader(
156 new InputStreamReader(definitionFileURL.openStream(),
157 encoding));
158 String lineRead = null;
159 while ((lineRead = reader.readLine()) != null){
160 int index = lineRead.indexOf(":");
161 if (index != -1){
162 String nameFile = lineRead.substring(0,index);
163 String nameList = lineRead.substring(index+1,lineRead.length());
164 createAnnotList(nameFile,nameList);
165 } } reader.close();
168 }catch(IOException ioe){
169 throw new ResourceInstantiationException(ioe);
170 }
171
172 return this;
173 }
175
179 public void execute() throws ExecutionException{
180
181 if(document == null) {
183 throw new ExecutionException(
184 "No document for namematch!"
185 );
186 }
187
188 if ((annotationSetName == null)|| (annotationSetName.equals("")))
190 nameAllAnnots = document.getAnnotations();
191 else
192 nameAllAnnots = document.getAnnotations(annotationSetName);
193
194 if ((nameAllAnnots == null) || nameAllAnnots.isEmpty()) {
196 Out.prln("OrthoMatcher Warning: No annotations found for processing");
197 return;
198 }
199
200 docCleanup();
203 Map matchesMap = (Map)document.getFeatures().
204 get(DOCUMENT_COREF_FEATURE_NAME);
205
206 if (!extLists)
209 buildTables(nameAllAnnots);
210
211 matchNameAnnotations();
213
214 if (matchingUnknowns)
216 matchUnknown();
217
218 if (! matchesDocFeature.isEmpty()) {
221 if(matchesMap == null){
222 matchesMap = new HashMap();
223 }
224 matchesMap.put(nameAllAnnots.getName(), matchesDocFeature);
225 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, matchesMap);
228
229 matchesDocFeature = new ArrayList();
232 }
233
234 nameAllAnnots = null;
237 processedAnnots.clear();
238 annots2Remove.clear();
239 tokensMap.clear();
240 matchesDocFeature = new ArrayList();
241 longAnnot = null;
242 shortAnnot = null;
243 tokensLongAnnot = null;
244 tokensShortAnnot = null;
245
246 }
248 protected void matchNameAnnotations() throws ExecutionException{
249 Iterator iterAnnotationTypes = annotationTypes.iterator();
251 while (iterAnnotationTypes.hasNext()) {
252 String annotationType = (String)iterAnnotationTypes.next();
253
254 AnnotationSet nameAnnots = nameAllAnnots.get(annotationType);
255
256 if ((nameAnnots == null) || nameAnnots.isEmpty())
258 continue;
259
260 Iterator iterNames = nameAnnots.iterator();
261 while (iterNames.hasNext()) {
262 Annotation nameAnnot = (Annotation) iterNames.next();
263 Integer id = nameAnnot.getId();
264
265 String annotString = null;
267 try {
268 annotString = document.getContent().getContent(
269 nameAnnot.getStartNode().getOffset(),
270 nameAnnot.getEndNode().getOffset()
271 ).toString();
272 annotString = annotString.replaceAll("\\s+", " ");
274
275 } catch (InvalidOffsetException ioe) {
276 throw new ExecutionException
277 ("Invalid offset of the annotation");
278 }
279 if (!caseSensitive)
281 annotString = annotString.toLowerCase();
282
283 List tokens = new ArrayList((Set)
285 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
286 nameAnnot.getStartNode().getOffset(),
287 nameAnnot.getEndNode().getOffset()
288 ));
289 if (tokens.isEmpty())
291 continue;
292 Collections.sort(tokens, new gate.util.OffsetComparator());
293 tokensMap.put(nameAnnot.getId(), tokens);
299
300
302 if (processedAnnots.containsValue(annotString)) {
305 updateMatches(nameAnnot, annotString);
307 processedAnnots.put(nameAnnot.getId(), annotString);
308 continue;
309 } else if (processedAnnots.isEmpty()) {
310 processedAnnots.put(nameAnnot.getId(), annotString);
311 continue;
312 }
313
314 if (nameAnnot.getType().equals(personType))
316 annotString = containTitle(annotString, nameAnnot);
317 else if (nameAnnot.getType().equals(organizationType))
318 annotString = stripCDG(annotString, nameAnnot);
319
320 if(null == annotString || "".equals(annotString))
321 continue;
322
323 matchWithPrevious(nameAnnot, annotString);
325
326 processedAnnots.put(nameAnnot.getId(), annotString);
329 }
331 }
333 }
334
335 protected void matchUnknown() throws ExecutionException {
336 AnnotationSet unknownAnnots = nameAllAnnots.get(unknownType);
338
339 if ((unknownAnnots == null) || unknownAnnots.isEmpty())
340 return;
341
342 Iterator iter = unknownAnnots.iterator();
343 while (iter.hasNext()) {
345 Annotation unknown = (Annotation) iter.next();
346
347 String unknownString = null;
349 try {
350 unknownString = document.getContent().getContent(
351 unknown.getStartNode().getOffset(),
352 unknown.getEndNode().getOffset()
353 ).toString();
354 unknownString = unknownString.replaceAll("\\s+", " ");
356 } catch (InvalidOffsetException ioe) {
357 throw new ExecutionException
358 ("Invalid offset of the annotation");
359 }
360 if (!caseSensitive)
362 unknownString = unknownString.toLowerCase();
363
364 List tokens = new ArrayList((Set)
366 nameAllAnnots.get(TOKEN_ANNOTATION_TYPE,
367 unknown.getStartNode().getOffset(),
368 unknown.getEndNode().getOffset()
369 ));
370 if (tokens.isEmpty())
371 continue;
372 Collections.sort(tokens, new gate.util.OffsetComparator());
373 tokensMap.put(unknown.getId(), tokens);
374
375
376 if (processedAnnots.containsValue(unknownString)) {
379 Annotation matchedAnnot = updateMatches(unknown, unknownString);
380 if (matchedAnnot.getType().equals(unknownType)) {
383 annots2Remove.put(unknown.getId(),
384 annots2Remove.get(matchedAnnot.getId()));
385 }
386 else
387 annots2Remove.put(unknown.getId(), matchedAnnot.getType());
388 processedAnnots.put(unknown.getId(), unknownString);
389 unknown.getFeatures().put("NMRule", unknownType);
390 continue;
391 }
392
393 if (tokens.size() == 1
396 && "hyphen".equals(unknown.getFeatures().get(TOKEN_KIND_FEATURE_NAME))) {
397 if (matchHyphenatedUnknowns(unknown, unknownString, iter))
398 continue;
399 }
401 matchWithPrevious(unknown, unknownString);
402
403 }
405 if (! annots2Remove.isEmpty()) {
406 Iterator unknownIter = annots2Remove.keySet().iterator();
407 while (unknownIter.hasNext()) {
408 Integer unknId = (Integer) unknownIter.next();
409 Annotation unknown = nameAllAnnots.get(unknId);
410 Integer newID = nameAllAnnots.add(
411 unknown.getStartNode(),
412 unknown.getEndNode(),
413 (String) annots2Remove.get(unknId),
414 unknown.getFeatures()
415 );
416 nameAllAnnots.remove(unknown);
417
418 List mList = (List)unknown.getFeatures().
420 get(ANNOTATION_COREF_FEATURE_NAME);
421 mList.remove(unknId);
422 mList.add(newID);
423 } } }
426
427 private boolean matchHyphenatedUnknowns(Annotation unknown, String unknownString,
428 Iterator iter){
429 boolean matched = false;
430
431 int stringEnd = unknownString.indexOf("-");
433 unknownString = unknownString.substring(0, stringEnd);
434 if (processedAnnots.containsValue(unknownString)) {
437 matched = true;
438 Annotation matchedAnnot = updateMatches(unknown, unknownString);
439 iter.remove();
442 String newType;
443 if (matchedAnnot.getType().equals(unknownType))
444 newType = (String)annots2Remove.get(matchedAnnot.getId());
445 else
446 newType = matchedAnnot.getType();
447
448 Integer newID = new Integer(-1);
449 try {
450 newID = nameAllAnnots.add(
451 unknown.getStartNode().getOffset(),
452 new Long(unknown.getStartNode().getOffset().longValue()
453 + stringEnd),
454 newType,
455 unknown.getFeatures()
456 );
457 } catch (InvalidOffsetException ex) {
458 throw new GateRuntimeException(ex.getMessage());
459 }
460 nameAllAnnots.remove(unknown);
461
462 List mList = (List)unknown.getFeatures().
464 get(ANNOTATION_COREF_FEATURE_NAME);
465 mList.remove(unknown.getId());
466 mList.add(newID);
467
468 }
469 return matched;
470 }
471
472 protected void matchWithPrevious(Annotation nameAnnot, String annotString) {
473 boolean matchedUnknown = false;
474
475 Iterator prevIter = processedAnnots.keySet().iterator();
476 while (prevIter.hasNext()) {
477 Integer prevId = (Integer) prevIter.next();
478 Annotation prevAnnot = nameAllAnnots.get(prevId);
479
480 if (prevAnnot == null || (! prevAnnot.getType().equals(nameAnnot.getType())
482 && ! nameAnnot.getType().equals(unknownType))
483 )
484 continue;
485 if ( nameAnnot.getType().equals(unknownType)
488 && prevAnnot.getType().equals(unknownType))
489 continue;
490
491 if (matchedAlready(nameAnnot, prevAnnot) )
493 continue;
494
495 if (prevAnnot.getType().equals(personType)) {
497 String prevGender =
498 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
499 String nameGender =
500 (String) nameAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
501 if ( prevGender != null
502 && nameGender != null
503 && ( (nameGender.equalsIgnoreCase("female")
504 &&
505 prevGender.equalsIgnoreCase("male")
506 )
507 ||
508 (prevGender.equalsIgnoreCase("female")
509 && nameGender.equalsIgnoreCase("male")
510 )
511 )
512 ) continue;
515 }
517 if (matchAnnotations(nameAnnot, annotString, prevAnnot)) {
519 updateMatches(nameAnnot, prevAnnot);
521 if (nameAnnot.getType().equals(unknownType)) {
523 matchedUnknown = true;
524 if (prevAnnot.getType().equals(unknownType))
525 annots2Remove.put(nameAnnot.getId(),
526 annots2Remove.get(prevAnnot.getId()));
527 else
528 annots2Remove.put(nameAnnot.getId(), prevAnnot.getType());
529 nameAnnot.getFeatures().put("NMRule", unknownType);
531 } break; }
535 }
537 if (matchedUnknown)
538 processedAnnots.put(nameAnnot.getId(), annotString);
539
540
541 }
543 protected boolean matchAnnotations(Annotation newAnnot, String annotString,
544 Annotation prevAnnot) {
545 if (newAnnot.overlaps(prevAnnot))
547 return false;
548
549 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
552
553 String longName = prevAnnotString;
554 String shortName = annotString;
555 longAnnot = prevAnnot;
556 shortAnnot = newAnnot;
557
558 if (shortName.length()>longName.length()) {
559 String temp = longName;
560 longName = shortName;
561 shortName = temp;
562 Annotation tempAnn = longAnnot;
563 longAnnot = shortAnnot;
564 shortAnnot = tempAnn;
565 }
567 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
568 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
569
570 List matchesList = (List) prevAnnot.getFeatures().
571 get(ANNOTATION_COREF_FEATURE_NAME);
572 if (matchesList == null || matchesList.isEmpty())
573 return apply_rules_namematch(prevAnnot.getType(), shortName,longName);
574
575 if (apply_rules_namematch(prevAnnot.getType(), shortName,longName)) {
579
584 if (allMatchingNeeded) {
585 allMatchingNeeded = false;
586
587 List toMatchList = new ArrayList(matchesList);
588 toMatchList.remove(prevAnnot.getId());
591
592 return matchOtherAnnots(toMatchList, newAnnot, annotString);
593 } else
594 return true;
595 }
596 return false;
597 }
598
599
606 protected boolean matchOtherAnnots( List toMatchList, Annotation newAnnot,
607 String annotString) {
608
609 if (toMatchList.isEmpty())
611 return true;
612
613 boolean matchedAll = true;
614 int i = 0;
615
616 while (matchedAll && i < toMatchList.size()) {
617 Annotation prevAnnot = nameAllAnnots.get((Integer) toMatchList.get(i));
618
619 String prevAnnotString = (String) processedAnnots.get(prevAnnot.getId());
622 if (prevAnnotString == null)
623 try {
624 prevAnnotString = document.getContent().getContent(
625 prevAnnot.getStartNode().getOffset(),
626 prevAnnot.getEndNode().getOffset()
627 ).toString();
628 } catch (InvalidOffsetException ioe) {
629 return false;
630 }
632
633 String longName = prevAnnotString;
634 String shortName = annotString;
635 longAnnot = prevAnnot;
636 shortAnnot = newAnnot;
637
638 if (shortName.length()>=longName.length()) {
639 String temp = longName;
640 longName = shortName;
641 shortName = temp;
642 Annotation tempAnn = longAnnot;
643 longAnnot = shortAnnot;
644 shortAnnot = tempAnn;
645 }
647 tokensLongAnnot = (ArrayList) tokensMap.get(longAnnot.getId());
648 tokensShortAnnot = (ArrayList) tokensMap.get(shortAnnot.getId());
649
650 matchedAll = apply_rules_namematch(prevAnnot.getType(), shortName,longName);
651
654 i++;
655 } return matchedAll;
657 }
658
659
660 protected boolean matchedAlready(Annotation annot1, Annotation annot2) {
661 List matchesList = (List) annot1.getFeatures().
664 get(ANNOTATION_COREF_FEATURE_NAME);
665 if ((matchesList == null) || matchesList.isEmpty())
666 return false;
667 else if (matchesList.contains(annot2.getId()))
668 return true;
669 return false;
670 }
671
672 protected Annotation updateMatches(Annotation newAnnot, String annotString) {
673 Annotation matchedAnnot = null;
674 Integer id;
675
676 Iterator iter = processedAnnots.keySet().iterator();
678 while (iter.hasNext()) {
679 id = (Integer) iter.next();
680 String oldString = (String) processedAnnots.get(id);
681 if (annotString.equals(oldString)) {
682 matchedAnnot = nameAllAnnots.get(id);
683 break;
684 } }
687 if (matchedAnnot == null) return null;
688 if (! matchedAnnot.getType().equals(newAnnot.getType())
691 && !newAnnot.getType().equals(unknownType) )
692 return matchedAnnot;
693
694 List matchesList = (List) matchedAnnot.getFeatures().
695 get(ANNOTATION_COREF_FEATURE_NAME);
696 if ((matchesList == null) || matchesList.isEmpty()) {
697 if (matchesList == null) {
699 matchesList = new ArrayList();
700 matchedAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME,
701 matchesList);
702 matchesDocFeature.add(matchesList);
703 } matchesList.add(matchedAnnot.getId());
705 matchesList.add(newAnnot.getId());
706 } else {
707 matchesList.add(newAnnot.getId());
709 } newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
712 return matchedAnnot;
713 }
714
715 protected void updateMatches(Annotation newAnnot, Annotation prevAnnot) {
716
717 List matchesList = (List) prevAnnot.getFeatures().
718 get(ANNOTATION_COREF_FEATURE_NAME);
719 if ((matchesList == null) || matchesList.isEmpty()) {
720 if (matchesList == null) {
722 matchesList = new ArrayList();
723 prevAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
724 matchesDocFeature.add(matchesList);
725 } matchesList.add(prevAnnot.getId());
727 matchesList.add(newAnnot.getId());
728 } else {
729 matchesList.add(newAnnot.getId());
731 } newAnnot.getFeatures().put(ANNOTATION_COREF_FEATURE_NAME, matchesList);
734 if (prevAnnot.getType().equals(personType)) {
736 String prevGender =
737 (String) prevAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
738 String newGender =
739 (String) newAnnot.getFeatures().get(PERSON_GENDER_FEATURE_NAME);
740 boolean unknownPrevGender = isUnknownGender(prevGender);
741 boolean unknownNewGender = isUnknownGender(newGender);
742 if (unknownPrevGender && !unknownNewGender)
743 prevAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, newGender);
744 else if (unknownNewGender && !unknownPrevGender)
745 newAnnot.getFeatures().put(PERSON_GENDER_FEATURE_NAME, prevGender);
746 } }
748
749
750 protected void docCleanup() {
751 Object matchesValue = document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME);
752 if (matchesValue != null && (matchesValue instanceof Map))
753 ((Map)matchesValue).remove(nameAllAnnots.getName());
754 else if (matchesValue != null) {
755 document.getFeatures().put(DOCUMENT_COREF_FEATURE_NAME, new HashMap());
756 }
757
758 HashSet fNames = new HashSet();
760 fNames.add(ANNOTATION_COREF_FEATURE_NAME);
761 AnnotationSet annots =
762 nameAllAnnots.get(null, fNames);
763
764
766 if (annots == null || annots.isEmpty())
767 return;
768
769 Iterator iter = annots.iterator();
770 while (iter.hasNext()) {
771 while (iter.hasNext())
772 ((Annotation) iter.next()).getFeatures().
773 remove(ANNOTATION_COREF_FEATURE_NAME);
774 } }
777
778 protected String containTitle (String annotString, Annotation annot)
779 throws ExecutionException {
780 Long startAnnot = annot.getStartNode().getOffset();
782 Long endAnnot = annot.getEndNode().getOffset();
783
784 queryFM.clear();
786 queryFM.put("majorType", "title");
787 AnnotationSet as1 = nameAllAnnots.get(startAnnot,endAnnot);
788 if (as1 == null || as1.isEmpty())
789 return annotString;
790 AnnotationSet as =
791 as1.get("Lookup", queryFM);
792 if (as !=null && ! as.isEmpty()) {
793 List titles = new ArrayList((Set)as);
794 Collections.sort(titles, new gate.util.OffsetComparator());
795
796 Iterator iter = titles.iterator();
797 while (iter.hasNext()) {
798 Annotation titleAnn = (Annotation)(iter.next());
799
800 if (titleAnn.getStartNode().getOffset().compareTo(startAnnot) != 0)
804 return annotString;
805
806 try {
807 String annotTitle =
809 document.getContent().getContent(
810 titleAnn.getStartNode().getOffset(),
811 titleAnn.getEndNode().getOffset()
812 ).toString();
813
814 if (annotTitle.length()<annotString.length()) {
816 ((ArrayList) tokensMap.get(annot.getId())).remove(0);
822 return annotString.substring(
823 annotTitle.length()+1,annotString.length());
824 }
825 } catch (InvalidOffsetException ioe) {
826 throw new ExecutionException
827 ("Invalid offset of the annotation");
828 } } } return annotString;
832
833 }
834
835
836 protected String stripCDG (String annotString, Annotation annot){
837
838 ArrayList tokens = (ArrayList) tokensMap.get(annot.getId());
839
840 if ( ((String) ((Annotation) tokens.get(0)
842 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME))
843 .equalsIgnoreCase(THE_VALUE))
844 tokens.remove(0);
845
846 if (tokens.size()>1 && cdg.contains(((Annotation) tokens.get(tokens.size()-1)
848 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
849 tokens.remove(tokens.size()-1);
850
851 StringBuffer newString = new StringBuffer(50);
852 for (int i = 0; i < tokens.size(); i++){
853 newString.append((String) ((Annotation) tokens.get(i)
854 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) );
855 if (i != tokens.size()-1)
856 newString.append(" ");
857 }
858
860 if (caseSensitive)
861 return newString.toString();
862
863 return newString.toString().toLowerCase();
864 }
865
866
875
876
879
901
902 protected void createAnnotList(String nameFile,String nameList)
903 throws IOException{
904
905
911 URL fileURL = new URL(definitionFileURL, nameFile);
913 BufferedReader bufferedReader =
914 new BufferedReader(new InputStreamReader(fileURL.openStream(),
915 encoding));
916
917 String lineRead = null;
918 while ((lineRead = bufferedReader.readLine()) != null){
919 if (nameList.compareTo(CDGLISTNAME)==0){
920 if (caseSensitive)
921 cdg.add(lineRead);
922 else
923 cdg.add(lineRead.toLowerCase());
924 } else {
926 int index = lineRead.indexOf("£");
927 if (index != -1){
928 String expr = lineRead.substring(0,index);
929 if (!caseSensitive)
931 expr = expr.toLowerCase();
932 String code = lineRead.substring(index+1,lineRead.length());
933 if (nameList.equals(ALIASLISTNAME))
934 alias.put(expr, code);
935 else
936 if (nameList.equals(ARTLISTNAME))
937 def_art.put(expr, code);
938 else
939 if (nameList.equals(PREPLISTNAME))
940 prepos.put(expr, code);
941 else
942 if (nameList.equals(CONNECTORLISTNAME))
943 connector.put(expr, code);
944 else
945 if (nameList.equals(SPURLISTNAME))
946 spur_match.put(expr, code);
947
948 } }
951 } }
954
955
956 private boolean apply_rules_namematch(String annotationType, String shortName,
957 String longName) {
958 if (matchRule0(longName, shortName))
960 return false;
961 if (
962 ( matchRule2(longName, shortName)
966 ||
967 matchRule3(longName, shortName)
968 ) ||
970 ( ( annotationType.equals(organizationType)
972 || annotationType.equals("Facility"))
974 &&
975 ( matchRule4(longName, shortName)
976 ||
977 matchRule5(longName, shortName)
978 ||
979 matchRule6(longName, shortName)
980 ||
981 matchRule7(longName, shortName)
982 ||
983 matchRule9(longName, shortName)
986 ||
987 matchRule10(longName, shortName)
988 ||
989 matchRule11(longName, shortName)
990 ||
991 matchRule12(longName, shortName)
992 ||
993 matchRule13(shortName, longName)
994 )
995 ) ||
997 ( ( annotationType.equals(personType))
999 &&
1000 ( matchRule4(longName, shortName)
1001 ||
1002 matchRule5(longName, shortName)
1003 ||
1004 matchRule14(longName, shortName)
1005 || matchRule15(longName, shortName)
1008 )
1009 ) ) return true;
1012 return false;
1013 }
1015
1016
1017 public void setExtLists(Boolean newExtLists) {
1018 extLists = newExtLists.booleanValue();
1019 }
1021
1022 public void setCaseSensitive(Boolean newCase) {
1023 caseSensitive = newCase.booleanValue();
1024 }
1026
1027 public void setAnnotationSetName(String newAnnotationSetName) {
1028 annotationSetName = newAnnotationSetName;
1029 }
1031
1032 public void setAnnotationTypes(List newType) {
1033 annotationTypes = newType;
1034 }
1036
1037 public void setProcessUnknown(Boolean processOrNot) {
1038 this.matchingUnknowns = processOrNot.booleanValue();
1039 }
1041 public void setOrganizationType(String newOrganizationType) {
1042 organizationType = newOrganizationType;
1043 }
1045 public void setPersonType(String newPersonType) {
1046 personType = newPersonType;
1047 }
1049
1050 public String getAnnotationSetName() {
1051 return annotationSetName;
1052 }
1054
1055 public List getAnnotationTypes() {
1056 return annotationTypes;
1057 }
1059 public String getOrganizationType() {
1060 return organizationType;
1061 }
1062
1063 public String getPersonType() {
1064 return personType;
1065 }
1066
1067 public Boolean getExtLists() {
1068 return new Boolean(extLists);
1069 }
1070
1071
1072 public Boolean getCaseSensitive() {
1073 return new Boolean(caseSensitive);
1074 }
1075
1076
1077 public Boolean getProcessUnknown() {
1078 return new Boolean(matchingUnknowns);
1079 }
1080
1081
1086
1087 protected boolean isUnknownGender(String gender) {
1088 if (gender == null)
1089 return true;
1090 if (gender.equalsIgnoreCase("male") || gender.equalsIgnoreCase("female"))
1091 return false;
1092 return true;
1093
1094 }
1096
1101 public boolean matchRule0(String s1,
1102 String s2) {
1103 if (spur_match.containsKey(s1)
1104 && spur_match.containsKey(s2) )
1105 return
1106 spur_match.get(s1).toString().equals(spur_match.get(s2).toString());
1107
1108 return false;
1109 }
1111
1117 public boolean matchRule1(String s1,
1118 String s2,
1119 boolean matchCase) {
1120
1122 boolean matched = false;
1123 if (!matchCase)
1124 matched = s1.equalsIgnoreCase(s2);
1125 else matched = s1.equals(s2) ;
1126 return matched;
1130 }
1132
1133
1139 public boolean matchRule2(String s1,
1140 String s2) {
1141
1142 if (alias.containsKey(s1) && alias.containsKey(s2))
1143 return (alias.get(s1).toString().equals(alias.get(s2).toString()));
1144
1145 return false;
1146 }
1148
1156 public boolean matchRule3(String s1, String s2) {
1159 if (s2.endsWith("'s") || s2.endsWith("'")
1160 ||(s1.endsWith("'s")|| s1.endsWith("'"))) {
1161
1162
1163 String s2_poss = null;
1164
1165 if (!s2.endsWith("'s")) s2_poss = s2.concat("'s");
1166 else s2_poss = s2.concat("'");
1167
1168 if (s2_poss != null && matchRule1(s1, s2_poss,caseSensitive)) return true;
1169
1170 String token = (String)
1172 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1173
1174 if (!token.endsWith("'s")) s2_poss = token.concat("'s");
1175 else s2_poss = token.concat("'");
1176
1177 if (s2_poss != null && matchRule1(s2_poss,s2,caseSensitive)) return true;
1178
1179 } return false;
1181 }
1183
1190 public boolean matchRule4(String s1,
1191 String s2) {
1192
1193 boolean allTokensMatch = true;
1194
1195 Iterator tokensLongAnnotIter = tokensLongAnnot.iterator();
1196 Iterator tokensShortAnnotIter = tokensShortAnnot.iterator();
1197 while (tokensLongAnnotIter.hasNext() && tokensShortAnnotIter.hasNext()) {
1198 Annotation token = (Annotation) tokensLongAnnotIter.next();
1199 if (((String)token.getFeatures().get(TOKEN_KIND_FEATURE_NAME)).equals(PUNCTUATION_VALUE))
1200 continue;
1201 if (! token.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1203 ((Annotation) tokensShortAnnotIter.next()).getFeatures().get(TOKEN_STRING_FEATURE_NAME))) {
1204 allTokensMatch = false;
1205 break;
1206 } } return allTokensMatch;
1211 }
1213
1220 public boolean matchRule5(String s1,
1221 String s2) {
1222
1223 if (tokensLongAnnot.size()> 1 &&
1225 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get("kind").equals("number"))
1226 return false;
1227
1228
1232 if ( (shortAnnot.getType().equals(personType)
1236 || longAnnot.getType().equals(personType)
1237 )
1238 &&
1239 tokensShortAnnot.size()>1
1240 )
1241 return false;
1242
1243 if (tokensLongAnnot.size()<=1)
1244 return false;
1245 boolean result = matchRule1((String)
1246 ((Annotation) tokensLongAnnot.get(0)
1247 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME),
1248 s2,
1249 caseSensitive);
1250
1251 return result;
1254
1255 }
1257
1262 public boolean matchRule6(String s1,
1263 String s2) {
1264
1265 int i = 0;
1266
1267 if (s2.indexOf(" ") > 0)
1270 return false;
1271
1272 StringBuffer acronym_s1 = new StringBuffer("");
1274 StringBuffer acronymDot_s1 = new StringBuffer("");
1275
1276 for ( ;i < tokensLongAnnot.size(); i++ ) {
1277 String toAppend = ( (String) ((Annotation) tokensLongAnnot.get(i)
1278 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME)).substring(0,1);
1279 acronym_s1.append(toAppend);
1280 acronymDot_s1.append(toAppend);
1281 acronymDot_s1.append(".");
1282 }
1283
1284
1287 if (matchRule1(acronym_s1.toString(),s2,caseSensitive) ||
1288 matchRule1(acronymDot_s1.toString(),s2,caseSensitive) )
1289 return true;
1290
1291 return false;
1292 }
1294
1303 public boolean matchRule7(String s1,
1304 String s2) {
1305
1306 if (tokensShortAnnot.size() != 1)
1308 return false;
1309
1310 String previous_token = null;
1311
1312 for (int i = 0; i < tokensLongAnnot.size(); i++ ) {
1313 if (connector.containsKey( ((Annotation) tokensLongAnnot.get(i)
1314 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME) )) {
1315 previous_token = (String) ((Annotation) tokensLongAnnot.get(i-1)
1316 ).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1317
1318 break;
1319 }
1320 }
1321
1322 if (previous_token != null) {
1324 return matchRule1(previous_token,s2,caseSensitive);
1327
1328 }
1329 return false;
1330 }
1332
1343 public boolean matchRule8(String s1,
1344 String s2) {
1345 Out.prln("OrthoMatcher warning: This rule has been discontinued!");
1346
1387 return false;
1388
1389 }
1391
1400 public boolean matchRule9(String s1,
1401 String s2) {
1402
1403 String s1_short = (String)
1406 ((Annotation) tokensLongAnnot.get(
1407 tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1408 if (tokensLongAnnot.size()>1) {
1410 boolean matched = matchRule1(s1_short, s2, caseSensitive);
1411 if (matched)
1416 allMatchingNeeded = true;
1417 return matched;
1418 }
1420 return false;
1421 }
1423
1430 public boolean matchRule10(String s1,
1431 String s2) {
1432
1433 String token = null;
1434 String previous_token = null;
1435 String next_token = null;
1436 boolean invoke_rule=false;
1437
1438 if (tokensLongAnnot.size() >= 3
1439 && tokensShortAnnot.size() >= 2) {
1440
1441 int i = 0;
1443 for (; i< tokensLongAnnot.size(); i++) {
1444 token = (String)
1445 ((Annotation) tokensLongAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1446 if (prepos.containsKey(token)) {
1447 invoke_rule=true;
1448 break;
1449 } previous_token = token;
1451 }
1453 if (! invoke_rule)
1454 return false;
1455
1456 if (i < tokensLongAnnot.size()
1457 && previous_token != null)
1458 next_token= (String)
1459 ((Annotation) tokensLongAnnot.get(i++)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1460 else return false;
1461
1462 String s21 = (String)
1463 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1464 String s22 = (String)
1465 ((Annotation) tokensShortAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1466 if (matchRule1(next_token,(String) s21,caseSensitive)
1468 && matchRule1(previous_token, s22,caseSensitive))
1469 return true ;
1470 } return false;
1472 }
1474
1482 public boolean matchRule11(String s1,
1483 String s2) {
1484
1485
1486
1488 String token11 = null;
1489 String token12 = null;
1490 String token21 = null;
1491 String token22 = null;
1492
1493 if (tokensLongAnnot.size() < 2)
1494 return false;
1495
1496 token11 = (String)
1498 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1499 token12 = (String)
1500 ((Annotation) tokensLongAnnot.get(1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1501
1502 if (tokensShortAnnot.size() == 2) {
1504
1505 token21 = (String)
1506 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1507 token22 = (String)
1508 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1509
1510 if (token11.startsWith(token21)
1511 && token12.startsWith(token22))
1512 return true;
1513
1514 }
1516 else if (tokensShortAnnot.size()==1 && s2.length()>=3) {
1518
1519 for (int i=2;i<s2.length();i++) {
1522 token21=s2.substring(0,i+1);
1523 token22=s2.substring(i+1);
1524
1525 if (token11.startsWith(token21)
1526 && token12.startsWith(token22))
1527 return true;
1528 } }
1531 return false;
1532 }
1534
1540 public boolean matchRule12(String s1,
1541 String s2) {
1542
1543
1545 if (tokensLongAnnot.size()>1 && tokensShortAnnot.size()>1) {
1546
1548 String s1_first = (String)
1550 ((Annotation) tokensLongAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1551 String s2_first = (String)
1552 ((Annotation) tokensShortAnnot.get(0)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1553
1554 if (!matchRule1(s1_first,s2_first,caseSensitive))
1555 return false;
1556
1557 String s1_last = (String)
1558 ((Annotation) tokensLongAnnot.get(tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1559 String s2_last = (String)
1560 ((Annotation) tokensShortAnnot.get(tokensShortAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1561
1562 return matchRule1(s1_last,s2_last,caseSensitive);
1563 } return false;
1565 }
1567
1580 public boolean matchRule13(String s1,
1581 String s2) {
1582
1583
1584 String token1 = null;
1585 String token2 = null;
1586
1587 int matched_tokens = 0, mismatches = 0;;
1588
1589 if (tokensLongAnnot.size() < 3 || tokensShortAnnot.size() < 2) return false;
1591
1592
1597 for (int i=0,j= 0; i < tokensShortAnnot.size() && mismatches < 2; i++) {
1599
1600 if ( ((Annotation) tokensLongAnnot.get(j)).getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1603 ((Annotation) tokensShortAnnot.get(i)).getFeatures().get(TOKEN_STRING_FEATURE_NAME)) ) {
1604 matched_tokens++;
1605 j++;
1606 } else
1607 mismatches++;
1608 }
1610 if (matched_tokens >= tokensLongAnnot.size()-1)
1611 return true;
1612
1613 return false;
1614 }
1616
1623 public boolean matchRule14(String s1,
1624 String s2) {
1625
1626 String s1_short = (String)
1629 ((Annotation) tokensLongAnnot.get(
1630 tokensLongAnnot.size()-1)).getFeatures().get(TOKEN_STRING_FEATURE_NAME);
1631 if (tokensLongAnnot.size()>1)
1633 return matchRule1(s1_short,
1634 s2,
1635 caseSensitive);
1636
1637 return false;
1638
1639 }
1641
1646 public boolean matchRule15(String s1,
1647 String s2) {
1648
1649 int matched_tokens = 0;
1650
1651
1653
1658 Annotation token1, token2;
1660 for (int i=0; i < tokensShortAnnot.size() && matched_tokens == 0; i++) {
1661 token1 = (Annotation) tokensShortAnnot.get(i);
1662 if (token1.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1664 continue;
1665
1666 for (int j=0; j<tokensLongAnnot.size() && matched_tokens ==0; j++) {
1667 token2 = (Annotation) tokensLongAnnot.get(j);
1669 if (token2.getFeatures().get(TOKEN_KIND_FEATURE_NAME).equals(PUNCTUATION_VALUE))
1670 continue;
1671 if ( token1.getFeatures().get(TOKEN_STRING_FEATURE_NAME).equals(
1672 token2.getFeatures().get(TOKEN_STRING_FEATURE_NAME)) )
1673 matched_tokens++;
1674 } }
1677 if (matched_tokens == tokensShortAnnot.size())
1683 return true;
1684
1685 return false;
1686 }
1688
1689
1692 private void buildTables(AnnotationSet nameAllAnnots) {
1693
1694 cdg.clear();
1696
1697 if (! extLists) {
1698 tempMap.clear();
1701 tempMap.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, "cdg");
1702 AnnotationSet nameAnnots =
1704 nameAllAnnots.get(LOOKUP_ANNOTATION_TYPE, tempMap);
1705
1706 if ((nameAnnots ==null) || nameAnnots.isEmpty())
1707 return;
1708
1709 Iterator iter = nameAnnots.iterator();
1710 while (iter.hasNext()) {
1711 Annotation annot = (Annotation)iter.next();
1712 Long offsetStartAnnot = annot.getStartNode().getOffset();
1714 Long offsetEndAnnot = annot.getEndNode().getOffset();
1715 try {
1716 gate.Document doc = nameAllAnnots.getDocument();
1717 String annotString =
1718 doc.getContent().getContent(
1719 offsetStartAnnot,offsetEndAnnot
1720 ).toString();
1721 cdg.add(annotString);
1722 } catch (InvalidOffsetException ioe) {
1723 ioe.printStackTrace(Err.getPrintWriter());
1724 }
1725 } } }
1729
1730 public void setDefinitionFileURL(java.net.URL definitionFileURL) {
1731 this.definitionFileURL = definitionFileURL;
1732 }
1733
1734 public java.net.URL getDefinitionFileURL() {
1735 return definitionFileURL;
1736 }
1737 public void setEncoding(String encoding) {
1738 this.encoding = encoding;
1739 }
1740 public String getEncoding() {
1741 return encoding;
1742 }
1743
1744
1745 private static class Class1 {
1746 }
1747}
1749