GATE
Version 3.1-2270

gate.creole.orthomatcher
Class OrthoMatcher

java.lang.Object
  extended by gate.util.AbstractFeatureBearer
      extended by gate.creole.AbstractResource
          extended by gate.creole.AbstractProcessingResource
              extended by gate.creole.AbstractLanguageAnalyser
                  extended by gate.creole.orthomatcher.OrthoMatcher
All Implemented Interfaces:
ANNIEConstants, Executable, LanguageAnalyser, ProcessingResource, Resource, FeatureBearer, NameBearer, Serializable

public class OrthoMatcher
extends AbstractLanguageAnalyser
implements ANNIEConstants

See Also:
Serialized Form

Nested Class Summary
 
Nested classes/interfaces inherited from class gate.creole.AbstractProcessingResource
AbstractProcessingResource.InternalStatusListener, AbstractProcessingResource.IntervalProgressListener
 
Field Summary
protected  HashMap alias
           
protected static String ALIASLISTNAME
           
protected  String annotationSetName
          the name of the annotation set
protected  List annotationTypes
          the types of the annotation
protected  HashMap annots2Remove
           
protected static String ARTLISTNAME
           
protected  boolean caseSensitive
           
protected  HashSet cdg
           
protected static String CDGLISTNAME
           
protected  HashMap connector
           
protected static String CONNECTORLISTNAME
           
protected  HashMap def_art
           
protected  boolean extLists
          internal or external list
protected  Annotation longAnnot
           
protected  List matchesDocFeature
           
protected  boolean matchingUnknowns
          matching unknowns or not
protected  AnnotationSet nameAllAnnots
           
static String OM_ANN_SET_PARAMETER_NAME
           
static String OM_ANN_TYPES_PARAMETER_NAME
           
static String OM_CASE_SENSITIVE_PARAMETER_NAME
           
static String OM_DOCUMENT_PARAMETER_NAME
           
static String OM_EXT_LISTS_PARAMETER_NAME
           
static String OM_ORG_TYPE_PARAMETER_NAME
           
static String OM_PERSON_TYPE_PARAMETER_NAME
           
protected  String organizationType
          the organization type
protected  String personType
          the person type
protected static String PREPLISTNAME
           
protected  HashMap prepos
           
protected  HashMap processedAnnots
           
protected static String PUNCTUATION_VALUE
           
protected  FeatureMap queryFM
           
protected  Annotation shortAnnot
           
protected  HashMap spur_match
           
protected static String SPURLISTNAME
           
protected  FeatureMap tempMap
          a feature map to be used when retrieving annotations; declared here so that it can be reused for efficiency. Call clear() before each use.
protected static String THE_VALUE
           
protected  ArrayList tokensLongAnnot
           
protected  HashMap tokensMap
           
protected  ArrayList tokensShortAnnot
           
protected  String unknownType
           
 
Fields inherited from class gate.creole.AbstractLanguageAnalyser
corpus, document
 
Fields inherited from class gate.creole.AbstractProcessingResource
interrupted
 
Fields inherited from class gate.creole.AbstractResource
name
 
Fields inherited from class gate.util.AbstractFeatureBearer
features
 
Fields inherited from interface gate.creole.ANNIEConstants
ANNOTATION_COREF_FEATURE_NAME, DATE_ANNOTATION_TYPE, DATE_POSTED_ANNOTATION_TYPE, DOCUMENT_COREF_FEATURE_NAME, JOB_ID_ANNOTATION_TYPE, LOCATION_ANNOTATION_TYPE, LOOKUP_ANNOTATION_TYPE, LOOKUP_CLASS_FEATURE_NAME, LOOKUP_MAJOR_TYPE_FEATURE_NAME, LOOKUP_MINOR_TYPE_FEATURE_NAME, LOOKUP_ONTOLOGY_FEATURE_NAME, MONEY_ANNOTATION_TYPE, ORGANIZATION_ANNOTATION_TYPE, PERSON_ANNOTATION_TYPE, PERSON_GENDER_FEATURE_NAME, PR_NAMES, SENTENCE_ANNOTATION_TYPE, SPACE_TOKEN_ANNOTATION_TYPE, TOKEN_ANNOTATION_TYPE, TOKEN_CATEGORY_FEATURE_NAME, TOKEN_KIND_FEATURE_NAME, TOKEN_LENGTH_FEATURE_NAME, TOKEN_ORTH_FEATURE_NAME, TOKEN_STRING_FEATURE_NAME
 
Constructor Summary
OrthoMatcher()
           
 
Method Summary
protected  String containTitle(String annotString, Annotation annot)
          return a person name without title
protected  void createAnnotList(String nameFile, String nameList)
          creates the lookup tables
protected  void docCleanup()
           
 void execute()
          Run the resource.
 String getAnnotationSetName()
          get the name of the annotation set
 List getAnnotationTypes()
          get the types of the annotation
 Boolean getCaseSensitive()
          Are we running in a case-sensitive mode?
 URL getDefinitionFileURL()
           
 String getEncoding()
           
 Boolean getExtLists()
           
 String getOrganizationType()
           
 String getPersonType()
           
 Boolean getProcessUnknown()
          Return whether or not we're processing the Unknown annots
 Resource init()
          Initialise this resource, and return it.
protected  boolean isUnknownGender(String gender)
           
protected  boolean matchAnnotations(Annotation newAnnot, String annotString, Annotation prevAnnot)
           
protected  boolean matchedAlready(Annotation annot1, Annotation annot2)
           
protected  void matchNameAnnotations()
           
protected  boolean matchOtherAnnots(List toMatchList, Annotation newAnnot, String annotString)
          This method checks whether the new annotation matches all annotations given in the toMatchList (it contains ids). The idea is that the new annotation needs to match all of those, because assuming transitivity does not always work when two different entities share a common token, e.g., BT Cellnet and BT and British Telecom.
 boolean matchRule0(String s1, String s2)
          RULE #0: If the two names are listed in the table of spurious matches then they do NOT match. Condition(s): - Applied to: all name annotations
 boolean matchRule1(String s1, String s2, boolean matchCase)
          RULE #1: If the two names are identical then they are the same. No longer used, because I do the check for same string via the hash table of previous annotations. Condition(s): depend on case. Applied to: all name annotations
 boolean matchRule10(String s1, String s2)
          RULE #10: is one name the reverse of the other reversing around prepositions only?
 boolean matchRule11(String s1, String s2)
          RULE #11: does one name consist of contractions of the first two tokens of the other name?
 boolean matchRule12(String s1, String s2)
          RULE #12: do the first and last tokens of one name match the first and last tokens of the other?
 boolean matchRule13(String s1, String s2)
          RULE #13: do multi-word names match except for one token e.g.
 boolean matchRule14(String s1, String s2)
          RULE #14: if the last token of one name matches the second name e.g.
 boolean matchRule15(String s1, String s2)
          RULE #15: does one token from a Person name appear as the other token Note that this rule has NOT been used in LaSIE's 1.5 namematcher; added for ACE by Di's request
 boolean matchRule2(String s1, String s2)
          RULE #2: if the two names are listed as equivalent in the lookup table (alias) then they match Condition(s): - Applied to: all name annotations
 boolean matchRule3(String s1, String s2)
          RULE #3: adding a possessive at the end of one name causes a match e.g.
 boolean matchRule4(String s1, String s2)
          RULE #4: Do all tokens other than the punctuation marks , and . match?
 boolean matchRule5(String s1, String s2)
          RULE #5: if the 1st token of one name matches the second name e.g.
 boolean matchRule6(String s1, String s2)
          RULE #6: if one name is the acronym of the other e.g.
 boolean matchRule7(String s1, String s2)
          RULE #7: if one of the tokens in one of the names is in the list of separators, e.g. "&", then check if the token before the separator matches the other name e.g.
 boolean matchRule8(String s1, String s2)
          This rule is now obsolete, as The and the trailing CDG are stripped before matching.
 boolean matchRule9(String s1, String s2)
          RULE #9: does one of the names match the token just before a trailing company designator in the other name?
protected  void matchUnknown()
           
protected  void matchWithPrevious(Annotation nameAnnot, String annotString)
           
 void setAnnotationSetName(String newAnnotationSetName)
          set the annotation set name
 void setAnnotationTypes(List newType)
          set the types of the annotations
 void setCaseSensitive(Boolean newCase)
          set the caseSensitive flag
 void setDefinitionFileURL(URL definitionFileURL)
           
 void setEncoding(String encoding)
           
 void setExtLists(Boolean newExtLists)
          set the extLists flag
 void setOrganizationType(String newOrganizationType)
           
 void setPersonType(String newPersonType)
           
 void setProcessUnknown(Boolean processOrNot)
          set whether to process the Unknown annotations
protected  String stripCDG(String annotString, Annotation annot)
          return an organization name without its trailing company designator (CDG) and without a leading "The"
protected  void updateMatches(Annotation newAnnot, Annotation prevAnnot)
           
protected  Annotation updateMatches(Annotation newAnnot, String annotString)
           
 
Methods inherited from class gate.creole.AbstractLanguageAnalyser
getCorpus, getDocument, setCorpus, setDocument
 
Methods inherited from class gate.creole.AbstractProcessingResource
addProgressListener, addStatusListener, cleanup, fireProcessFinished, fireProgressChanged, fireStatusChanged, interrupt, isInterrupted, reInit, removeProgressListener, removeStatusListener
 
Methods inherited from class gate.creole.AbstractResource
checkParameterValues, getBeanInfo, getName, getParameterValue, getParameterValue, removeResourceListeners, setName, setParameterValue, setParameterValue, setParameterValues, setParameterValues, setResourceListeners
 
Methods inherited from class gate.util.AbstractFeatureBearer
getFeatures, setFeatures
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 
Methods inherited from interface gate.ProcessingResource
reInit
 
Methods inherited from interface gate.Resource
cleanup, getParameterValue, setParameterValue, setParameterValues
 
Methods inherited from interface gate.util.FeatureBearer
getFeatures, setFeatures
 
Methods inherited from interface gate.util.NameBearer
getName, setName
 
Methods inherited from interface gate.Executable
interrupt, isInterrupted
 

Field Detail

OM_DOCUMENT_PARAMETER_NAME

public static final String OM_DOCUMENT_PARAMETER_NAME
See Also:
Constant Field Values

OM_ANN_SET_PARAMETER_NAME

public static final String OM_ANN_SET_PARAMETER_NAME
See Also:
Constant Field Values

OM_CASE_SENSITIVE_PARAMETER_NAME

public static final String OM_CASE_SENSITIVE_PARAMETER_NAME
See Also:
Constant Field Values

OM_ANN_TYPES_PARAMETER_NAME

public static final String OM_ANN_TYPES_PARAMETER_NAME
See Also:
Constant Field Values

OM_ORG_TYPE_PARAMETER_NAME

public static final String OM_ORG_TYPE_PARAMETER_NAME
See Also:
Constant Field Values

OM_PERSON_TYPE_PARAMETER_NAME

public static final String OM_PERSON_TYPE_PARAMETER_NAME
See Also:
Constant Field Values

OM_EXT_LISTS_PARAMETER_NAME

public static final String OM_EXT_LISTS_PARAMETER_NAME
See Also:
Constant Field Values

CDGLISTNAME

protected static final String CDGLISTNAME
See Also:
Constant Field Values

ALIASLISTNAME

protected static final String ALIASLISTNAME
See Also:
Constant Field Values

ARTLISTNAME

protected static final String ARTLISTNAME
See Also:
Constant Field Values

PREPLISTNAME

protected static final String PREPLISTNAME
See Also:
Constant Field Values

CONNECTORLISTNAME

protected static final String CONNECTORLISTNAME
See Also:
Constant Field Values

SPURLISTNAME

protected static final String SPURLISTNAME
See Also:
Constant Field Values

PUNCTUATION_VALUE

protected static final String PUNCTUATION_VALUE
See Also:
Constant Field Values

THE_VALUE

protected static final String THE_VALUE
See Also:
Constant Field Values

annotationSetName

protected String annotationSetName
the name of the annotation set


annotationTypes

protected List annotationTypes
the types of the annotation


organizationType

protected String organizationType
the organization type


personType

protected String personType
the person type


unknownType

protected String unknownType

extLists

protected boolean extLists
internal or external list


matchingUnknowns

protected boolean matchingUnknowns
matching unknowns or not


caseSensitive

protected boolean caseSensitive

queryFM

protected FeatureMap queryFM

alias

protected HashMap alias

cdg

protected HashSet cdg

spur_match

protected HashMap spur_match

def_art

protected HashMap def_art

connector

protected HashMap connector

prepos

protected HashMap prepos

nameAllAnnots

protected AnnotationSet nameAllAnnots

processedAnnots

protected HashMap processedAnnots

annots2Remove

protected HashMap annots2Remove

matchesDocFeature

protected List matchesDocFeature

tokensMap

protected HashMap tokensMap

shortAnnot

protected Annotation shortAnnot

longAnnot

protected Annotation longAnnot

tokensLongAnnot

protected ArrayList tokensLongAnnot

tokensShortAnnot

protected ArrayList tokensShortAnnot

tempMap

protected FeatureMap tempMap
a feature map to be used when retrieving annotations; declared here so that it can be reused for efficiency. Call clear() before each use.

Constructor Detail

OrthoMatcher

public OrthoMatcher()
Method Detail

init

public Resource init()
              throws ResourceInstantiationException
Initialise this resource, and return it.

Specified by:
init in interface Resource
Overrides:
init in class AbstractProcessingResource
Throws:
ResourceInstantiationException

execute

public void execute()
             throws ExecutionException
Run the resource. It doesn't make sense not to override this in subclasses so the default implementation signals an exception.

Specified by:
execute in interface Executable
Overrides:
execute in class AbstractProcessingResource
Throws:
ExecutionException

matchNameAnnotations

protected void matchNameAnnotations()
                             throws ExecutionException
Throws:
ExecutionException

matchUnknown

protected void matchUnknown()
                     throws ExecutionException
Throws:
ExecutionException

matchWithPrevious

protected void matchWithPrevious(Annotation nameAnnot,
                                 String annotString)

matchAnnotations

protected boolean matchAnnotations(Annotation newAnnot,
                                   String annotString,
                                   Annotation prevAnnot)

matchOtherAnnots

protected boolean matchOtherAnnots(List toMatchList,
                                   Annotation newAnnot,
                                   String annotString)
This method checks whether the new annotation matches all annotations given in the toMatchList (it contains ids). The idea is that the new annotation needs to match all of those, because assuming transitivity does not always work when two different entities share a common token, e.g., BT Cellnet and BT and British Telecom.


matchedAlready

protected boolean matchedAlready(Annotation annot1,
                                 Annotation annot2)

updateMatches

protected Annotation updateMatches(Annotation newAnnot,
                                   String annotString)

updateMatches

protected void updateMatches(Annotation newAnnot,
                             Annotation prevAnnot)

docCleanup

protected void docCleanup()

containTitle

protected String containTitle(String annotString,
                              Annotation annot)
                       throws ExecutionException
return a person name without title

Throws:
ExecutionException

stripCDG

protected String stripCDG(String annotString,
                          Annotation annot)
return an organization name without its trailing company designator (CDG) and without a leading "The"


createAnnotList

protected void createAnnotList(String nameFile,
                               String nameList)
                        throws IOException
creates the lookup tables

Throws:
IOException

setExtLists

public void setExtLists(Boolean newExtLists)
set the extLists flag


setCaseSensitive

public void setCaseSensitive(Boolean newCase)
set the caseSensitive flag


setAnnotationSetName

public void setAnnotationSetName(String newAnnotationSetName)
set the annotation set name


setAnnotationTypes

public void setAnnotationTypes(List newType)
set the types of the annotations


setProcessUnknown

public void setProcessUnknown(Boolean processOrNot)
set whether to process the Unknown annotations


setOrganizationType

public void setOrganizationType(String newOrganizationType)

setPersonType

public void setPersonType(String newPersonType)

getAnnotationSetName

public String getAnnotationSetName()
get the name of the annotation set


getAnnotationTypes

public List getAnnotationTypes()
get the types of the annotation


getOrganizationType

public String getOrganizationType()

getPersonType

public String getPersonType()

getExtLists

public Boolean getExtLists()

getCaseSensitive

public Boolean getCaseSensitive()
Are we running in a case-sensitive mode?


getProcessUnknown

public Boolean getProcessUnknown()
Return whether or not we're processing the Unknown annots


isUnknownGender

protected boolean isUnknownGender(String gender)

matchRule0

public boolean matchRule0(String s1,
                          String s2)
RULE #0: If the two names are listed in the table of spurious matches then they do NOT match. Condition(s): - Applied to: all name annotations


matchRule1

public boolean matchRule1(String s1,
                          String s2,
                          boolean matchCase)
RULE #1: If the two names are identical then they are the same. No longer used, because I do the check for same string via the hash table of previous annotations. Condition(s): depend on case. Applied to: all name annotations


matchRule2

public boolean matchRule2(String s1,
                          String s2)
RULE #2: if the two names are listed as equivalent in the lookup table (alias) then they match Condition(s): - Applied to: all name annotations


matchRule3

public boolean matchRule3(String s1,
                          String s2)
RULE #3: adding a possessive at the end of one name causes a match e.g. "Standard and Poor" == "Standard and Poor's" and also "Standard and Poor" == "Standard's" Condition(s): case-insensitive match Applied to: all name annotations


matchRule4

public boolean matchRule4(String s1,
                          String s2)
RULE #4: Do all tokens other than the punctuation marks , and . match? e.g. "Smith, Jones" == "Smith Jones" Condition(s): case-insensitive match Applied to: organisation and person annotations


matchRule5

public boolean matchRule5(String s1,
                          String s2)
RULE #5: if the 1st token of one name matches the second name e.g. "Pepsi Cola" == "Pepsi" Condition(s): case-insensitive match Applied to: all name annotations


matchRule6

public boolean matchRule6(String s1,
                          String s2)
RULE #6: if one name is the acronym of the other e.g. "Imperial Chemical Industries" == "ICI" Applied to: organisation annotations only


matchRule7

public boolean matchRule7(String s1,
                          String s2)
RULE #7: if one of the tokens in one of the names is in the list of separators, e.g. "&", then check if the token before the separator matches the other name, e.g. "R.H. Macy & Co." == "Macy". Condition(s): case-sensitive match. Applied to: organisation annotations only


matchRule8

public boolean matchRule8(String s1,
                          String s2)
This rule is now obsolete, as "The" and the trailing CDG are stripped before matching. DO NOT CALL!!! RULE #8: if the names match, ignoring "The" and the trailing company designator (which have already been stripped), e.g. "The Magic Tricks Co." == "Magic Tricks". Condition(s): case-sensitive match. Applied to: organisation annotations only


matchRule9

public boolean matchRule9(String s1,
                          String s2)
RULE #9: does one of the names match the token just before a trailing company designator in the other name? The company designator has already been chopped off, so the token before it, is in fact the last token e.g. "R.H. Macy Co." == "Macy" Applied to: organisation annotations only


matchRule10

public boolean matchRule10(String s1,
                           String s2)
RULE #10: is one name the reverse of the other reversing around prepositions only? e.g. "Department of Defence" == "Defence Department" Condition(s): case-sensitive match Applied to: organisation annotations only


matchRule11

public boolean matchRule11(String s1,
                           String s2)
RULE #11: does one name consist of contractions of the first two tokens of the other name? e.g. "Communications Satellite" == "ComSat" and "Pan American" == "Pan Am" Condition(s): case-sensitive match Applied to: organisation annotations only


matchRule12

public boolean matchRule12(String s1,
                           String s2)
RULE #12: do the first and last tokens of one name match the first and last tokens of the other? Condition(s): case-sensitive match Applied to: organisation annotations only


matchRule13

public boolean matchRule13(String s1,
                           String s2)
RULE #13: do multi-word names match except for one token e.g. "Second Force Recon Company" == "Force Recon Company". Note that this rule has NOT been used in LaSIE's 1.5 namematcher. Restrictions: - remove cdg first; - shortest name should be 2 words or more; - if N is the number of tokens of the longest name, then N-1 tokens should be matched. Condition(s): case-sensitive match. Applied to: organisation or person annotations only


matchRule14

public boolean matchRule14(String s1,
                           String s2)
RULE #14: if the last token of one name matches the second name e.g. "Hamish Cunningham" == "Cunningham" Condition(s): case-insensitive match Applied to: all person annotations


matchRule15

public boolean matchRule15(String s1,
                           String s2)
RULE #15: does one token from a Person name appear as the other token Note that this rule has NOT been used in LaSIE's 1.5 namematcher; added for ACE by Di's request


setDefinitionFileURL

public void setDefinitionFileURL(URL definitionFileURL)

getDefinitionFileURL

public URL getDefinitionFileURL()

setEncoding

public void setEncoding(String encoding)

getEncoding

public String getEncoding()

GATE
Version 3.1-2270