1   package gate.creole;
2   
3   import java.io.*;
4   import java.util.*;
5   
6   import gate.Annotation;
7   import gate.AnnotationSet;
8   import gate.creole.gazetteer.*;
9   import gate.util.*;
10  
11  public class GazetteerListsCollector extends AbstractLanguageAnalyser {
12    private static String PERSON_ANNOT_NAME = "PER";
13  
14    public void execute() throws gate.creole.ExecutionException {
15      //reinitialise the stats
16      statsPerType = new HashMap();
17  
18      //check the input
19      if(document == null) {
20        throw new ExecutionException(
21          "No document to process!"
22        );
23      }
24  
25      if (gazetteer == null) {
26        throw new ExecutionException(
27          "No gazetteer set!"
28        );
29      }
30  
31      //if no annotation types given, then exit
32      if ((this.annotationTypes == null) || annotationTypes.isEmpty()) {
33        Out.prln("Gazetteer Lists Collector Warning: No annotation types given for processing");
34        return;
35      }
36  
37      // get the annotations from document
38      if ((markupSetName == null)|| (markupSetName.equals("")))
39        allAnnots = document.getAnnotations();
40      else
41        allAnnots = document.getAnnotations(markupSetName);
42  
43      //if none found, print warning and exit
44      if ((allAnnots == null) || allAnnots.isEmpty()) {
45        Out.prln("Gazetteer Lists Collector Warning: No annotations found for processing");
46        return;
47      }
48  
49      //collect the stats for each annotation type
50      for (int i = 0; i < annotationTypes.size(); i++) {
51        AnnotationSet annots = allAnnots.get((String) annotationTypes.get(i));
52        if (annots == null || annots.isEmpty())
53          continue;
54        statsPerType.put(annotationTypes.get(i), new HashMap());
55        collectLists(annots, (String) annotationTypes.get(i));
56      }
57  
58      //print out the stats in log files
59      printStats();
60  
61      //save the updated gazetteer lists now
62      Map theLists = gazetteer.getLinearDefinition().getListsByNode();
63      Iterator iter1 = theLists.keySet().iterator();
64      while (iter1.hasNext()) {
65        GazetteerList theList = (GazetteerList) theLists.get(iter1.next());
66        try {
67          if (theList.isModified())
68            theList.store();
69        } catch (ResourceInstantiationException ex) {
70          throw new GateRuntimeException(ex.getMessage());
71        }
72      }
73  
74    }
75  
76    public void setMarkupASName(String newMarkupASName) {
77      markupSetName = newMarkupASName;
78    }
79  
80    public String  getMarkupASName() {
81      return markupSetName;
82    }
83  
84    /** get the types of the annotation*/
85    public List getAnnotationTypes() {
86      return annotationTypes;
87    }//getAnnotationTypes
88  
89    /** set the types of the annotations*/
90    public void setAnnotationTypes(List newType) {
91      annotationTypes = newType;
92    }//setAnnotationTypes
93  
94    public Gazetteer getGazetteer() {
95      return gazetteer;
96    }
97  
98    public void setGazetteer(Gazetteer theGaz) {
99      gazetteer = theGaz;
100   }
101 
102   public void setTheLanguage(String language) {
103     theLanguage = language;
104   }
105 
106   public String  getTheLanguage() {
107     return theLanguage;
108   }
109 
110   protected void collectLists(AnnotationSet annots, String annotType) {
111     Iterator iter = annots.iterator();
112     String listName = "";
113     GazetteerList theList = null;
114     Iterator theListsIter =
115       gazetteer.getLinearDefinition().getListsByNode().values().iterator();
116     while (theListsIter.hasNext() && listName.equals("")) {
117       theList = (GazetteerList) theListsIter.next();
118       if (theList.getURL().toExternalForm().endsWith(annotType + ".lst"))
119         listName = theList.getURL().toExternalForm();
120     }
121     while (iter.hasNext()) {
122       Annotation annot = (Annotation) iter.next();
123       String text = "";
124       List strings = new ArrayList();
125       try {
126         text = document.getContent().getContent(
127           annot.getStartNode().getOffset(),
128           annot.getEndNode().getOffset()
129         ).toString();
130         //tokenise the text and save for the future if we need it
131         StringTokenizer tok = new StringTokenizer(text, "\n\r.|();-?!\t", false);
132         while (tok.hasMoreTokens())
133           strings.add(tok.nextToken());
134         //then replace the line breaks with spaces for the gazetteer
135         text = text.replace('\r', ' ');
136         text = text.replace('\n', ' ');
137         text = text.replace('\t', ' ');
138 
139       } catch (InvalidOffsetException ex) {
140         throw new GateRuntimeException(ex.getMessage());
141       }
142 
143       //collect stats for the string
144       if (((HashMap) statsPerType.get(annotType)).containsKey(text))
145         ((HashMap) statsPerType.get(annotType)).put(text,
146             new Integer(((Integer)
147               ((HashMap) statsPerType.get(annotType)).get(text)).intValue()+1));
148       else
149         ((HashMap) statsPerType.get(annotType)).put(text, new Integer(1));
150 
151       //also collect stats for the individual tokens in the name to identify the most
152       //frequent tokens across names
153       if (strings.size() > 1) {
154         for (int i=0; i < strings.size(); i++) {
155           String theString = (String) strings.get(i);
156           //collect stats for the string
157           if ( ( (HashMap) statsPerType.get(annotType)).containsKey(theString))
158             ( (HashMap) statsPerType.get(annotType)).put(theString,
159                 new Integer( ( (Integer)
160                               ( (HashMap) statsPerType.get(annotType)).get(
161                 theString)).intValue() + 1));
162           else
163             ( (HashMap) statsPerType.get(annotType)).put(theString,
164                 new Integer(1));
165         }
166       }
167 
168       //first we check whether the text is already in the gazetteer
169       Set lookupResult = gazetteer.lookup(text);
170       if (lookupResult != null && lookupResult.size() > 0)
171         continue;
172       //if not, then we add it
173       gazetteer.add(text,
174         new Lookup(listName, annotType, "inferred", theLanguage));
175 //      theList.add(text + document.getSourceUrl().toString());
176       theList.add(text);
177 
178 
179       //for persons we want also to add their individual names to the list
180       if (annotType.equals(PERSON_ANNOT_NAME) && strings.size() > 1) {
181         for (int i=0; i < strings.size(); i++) {
182           String theString = (String) strings.get(i);
183           Set lookupResult1 = gazetteer.lookup(theString);
184           if (lookupResult1 != null && lookupResult1.size() > 0)
185             continue;
186           if (theString.length() < 3)
187             continue;
188           gazetteer.add(theString,
189             new Lookup(listName, annotType, "inferred", theLanguage));
190           theList.add(theString);
191         }
192       }
193     }
194   }
195 
196   protected void printStats() {
197     try {
198       for (int i=0; i < annotationTypes.size(); i++) {
199         if (! statsPerType.containsKey(annotationTypes.get(i)))
200           continue;
201         BufferedWriter writer = new BufferedWriter(
202           new OutputStreamWriter(new FileOutputStream(
203            annotationTypes.get(i) + ".stats.lst"),
204           "UTF-8"));
205         HashMap stats = (HashMap) statsPerType.get(annotationTypes.get(i));
206         Iterator stringsIter = stats.keySet().iterator();
207         while (stringsIter.hasNext()) {
208           String string = (String) stringsIter.next();
209           writer.write(string);
210           writer.write("$");
211           writer.write( ((Integer)stats.get(string)).toString());
212           writer.newLine();
213         }
214         writer.close();
215       }
216   } catch(IOException ioe){
217       throw new RuntimeException(ioe.getMessage());
218   }//try
219 
220   }
221 
222   /**
223    * The idea is to have this method check if an item
224    * is already present in the gazetteer under this type,
225    * and if so, not to add it. It is not implemented for now.
226    */
227   protected boolean alreadyPresentInGazetteer(String token) {
228     return false;
229   }
230 
231   private String markupSetName = "";
232   private AnnotationSet allAnnots;
233   private List annotationTypes;
234   private Gazetteer gazetteer;
235   private String theLanguage = "";
236   private HashMap statsPerType = new HashMap();
237 }