1 package gate.creole;
2
3 import java.io.*;
4 import java.util.*;
5
6 import gate.Annotation;
7 import gate.AnnotationSet;
8 import gate.creole.gazetteer.*;
9 import gate.util.*;
10
11 public class GazetteerListsCollector extends AbstractLanguageAnalyser {
12 private static String PERSON_ANNOT_NAME = "PER";
13
14 public void execute() throws gate.creole.ExecutionException {
15 statsPerType = new HashMap();
17
18 if(document == null) {
20 throw new ExecutionException(
21 "No document to process!"
22 );
23 }
24
25 if (gazetteer == null) {
26 throw new ExecutionException(
27 "No gazetteer set!"
28 );
29 }
30
31 if ((this.annotationTypes == null) || annotationTypes.isEmpty()) {
33 Out.prln("Gazetteer Lists Collector Warning: No annotation types given for processing");
34 return;
35 }
36
37 if ((markupSetName == null)|| (markupSetName.equals("")))
39 allAnnots = document.getAnnotations();
40 else
41 allAnnots = document.getAnnotations(markupSetName);
42
43 if ((allAnnots == null) || allAnnots.isEmpty()) {
45 Out.prln("Gazetteer Lists Collector Warning: No annotations found for processing");
46 return;
47 }
48
49 for (int i = 0; i < annotationTypes.size(); i++) {
51 AnnotationSet annots = allAnnots.get((String) annotationTypes.get(i));
52 if (annots == null || annots.isEmpty())
53 continue;
54 statsPerType.put(annotationTypes.get(i), new HashMap());
55 collectLists(annots, (String) annotationTypes.get(i));
56 }
57
58 printStats();
60
61 Map theLists = gazetteer.getLinearDefinition().getListsByNode();
63 Iterator iter1 = theLists.keySet().iterator();
64 while (iter1.hasNext()) {
65 GazetteerList theList = (GazetteerList) theLists.get(iter1.next());
66 try {
67 if (theList.isModified())
68 theList.store();
69 } catch (ResourceInstantiationException ex) {
70 throw new GateRuntimeException(ex.getMessage());
71 }
72 }
73
74 }
75
76 public void setMarkupASName(String newMarkupASName) {
77 markupSetName = newMarkupASName;
78 }
79
80 public String getMarkupASName() {
81 return markupSetName;
82 }
83
84
85 public List getAnnotationTypes() {
86 return annotationTypes;
87 }
89
90 public void setAnnotationTypes(List newType) {
91 annotationTypes = newType;
92 }
94 public Gazetteer getGazetteer() {
95 return gazetteer;
96 }
97
98 public void setGazetteer(Gazetteer theGaz) {
99 gazetteer = theGaz;
100 }
101
102 public void setTheLanguage(String language) {
103 theLanguage = language;
104 }
105
106 public String getTheLanguage() {
107 return theLanguage;
108 }
109
110 protected void collectLists(AnnotationSet annots, String annotType) {
111 Iterator iter = annots.iterator();
112 String listName = "";
113 GazetteerList theList = null;
114 Iterator theListsIter =
115 gazetteer.getLinearDefinition().getListsByNode().values().iterator();
116 while (theListsIter.hasNext() && listName.equals("")) {
117 theList = (GazetteerList) theListsIter.next();
118 if (theList.getURL().toExternalForm().endsWith(annotType + ".lst"))
119 listName = theList.getURL().toExternalForm();
120 }
121 while (iter.hasNext()) {
122 Annotation annot = (Annotation) iter.next();
123 String text = "";
124 List strings = new ArrayList();
125 try {
126 text = document.getContent().getContent(
127 annot.getStartNode().getOffset(),
128 annot.getEndNode().getOffset()
129 ).toString();
130 StringTokenizer tok = new StringTokenizer(text, "\n\r.|();-?!\t", false);
132 while (tok.hasMoreTokens())
133 strings.add(tok.nextToken());
134 text = text.replace('\r', ' ');
136 text = text.replace('\n', ' ');
137 text = text.replace('\t', ' ');
138
139 } catch (InvalidOffsetException ex) {
140 throw new GateRuntimeException(ex.getMessage());
141 }
142
143 if (((HashMap) statsPerType.get(annotType)).containsKey(text))
145 ((HashMap) statsPerType.get(annotType)).put(text,
146 new Integer(((Integer)
147 ((HashMap) statsPerType.get(annotType)).get(text)).intValue()+1));
148 else
149 ((HashMap) statsPerType.get(annotType)).put(text, new Integer(1));
150
151 if (strings.size() > 1) {
154 for (int i=0; i < strings.size(); i++) {
155 String theString = (String) strings.get(i);
156 if ( ( (HashMap) statsPerType.get(annotType)).containsKey(theString))
158 ( (HashMap) statsPerType.get(annotType)).put(theString,
159 new Integer( ( (Integer)
160 ( (HashMap) statsPerType.get(annotType)).get(
161 theString)).intValue() + 1));
162 else
163 ( (HashMap) statsPerType.get(annotType)).put(theString,
164 new Integer(1));
165 }
166 }
167
168 Set lookupResult = gazetteer.lookup(text);
170 if (lookupResult != null && lookupResult.size() > 0)
171 continue;
172 gazetteer.add(text,
174 new Lookup(listName, annotType, "inferred", theLanguage));
175 theList.add(text);
177
178
179 if (annotType.equals(PERSON_ANNOT_NAME) && strings.size() > 1) {
181 for (int i=0; i < strings.size(); i++) {
182 String theString = (String) strings.get(i);
183 Set lookupResult1 = gazetteer.lookup(theString);
184 if (lookupResult1 != null && lookupResult1.size() > 0)
185 continue;
186 if (theString.length() < 3)
187 continue;
188 gazetteer.add(theString,
189 new Lookup(listName, annotType, "inferred", theLanguage));
190 theList.add(theString);
191 }
192 }
193 }
194 }
195
196 protected void printStats() {
197 try {
198 for (int i=0; i < annotationTypes.size(); i++) {
199 if (! statsPerType.containsKey(annotationTypes.get(i)))
200 continue;
201 BufferedWriter writer = new BufferedWriter(
202 new OutputStreamWriter(new FileOutputStream(
203 annotationTypes.get(i) + ".stats.lst"),
204 "UTF-8"));
205 HashMap stats = (HashMap) statsPerType.get(annotationTypes.get(i));
206 Iterator stringsIter = stats.keySet().iterator();
207 while (stringsIter.hasNext()) {
208 String string = (String) stringsIter.next();
209 writer.write(string);
210 writer.write("$");
211 writer.write( ((Integer)stats.get(string)).toString());
212 writer.newLine();
213 }
214 writer.close();
215 }
216 } catch(IOException ioe){
217 throw new RuntimeException(ioe.getMessage());
218 }
220 }
221
222
227 protected boolean alreadyPresentInGazetteer(String token) {
228 return false;
229 }
230
231 private String markupSetName = "";
232 private AnnotationSet allAnnots;
233 private List annotationTypes;
234 private Gazetteer gazetteer;
235 private String theLanguage = "";
236 private HashMap statsPerType = new HashMap();
237 }