1
15
16 package gate.creole;
17
18 import java.net.URL;
19 import java.util.*;
20
21 import junit.framework.*;
22
23 import gate.*;
24 import gate.corpora.TestDocument;
25 import gate.creole.gazetteer.DefaultGazetteer;
26 import gate.creole.orthomatcher.OrthoMatcher;
27 import gate.creole.splitter.SentenceSplitter;
28 import gate.creole.tokeniser.DefaultTokeniser;
29 import gate.util.AnnotationDiffer;
30
31
32 public class TestPR extends TestCase
33 {
34
35 private static final boolean DEBUG = false;
36
37 protected static Document doc1;
38 protected static Document doc2;
39 protected static Document doc3;
40
41 protected static List annotationTypes = new ArrayList(10);
42
43 static{
44 annotationTypes.add(ANNIEConstants.SENTENCE_ANNOTATION_TYPE);
45 annotationTypes.add(ANNIEConstants.ORGANIZATION_ANNOTATION_TYPE);
46 annotationTypes.add(ANNIEConstants.LOCATION_ANNOTATION_TYPE);
47 annotationTypes.add(ANNIEConstants.PERSON_ANNOTATION_TYPE);
48 annotationTypes.add(ANNIEConstants.DATE_ANNOTATION_TYPE);
49 annotationTypes.add(ANNIEConstants.MONEY_ANNOTATION_TYPE);
50 annotationTypes.add(ANNIEConstants.LOOKUP_ANNOTATION_TYPE);
51 annotationTypes.add(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
52 try{
53 if (doc1 == null)
55 doc1 = Factory.newDocument(
56 new URL(TestDocument.getTestServerName() +
57 "tests/ft-bt-03-aug-2001.html"),
58 "ISO-8859-1"
59 );
60
61 if (doc2 == null)
62 doc2 = Factory.newDocument(
63 new URL(TestDocument.getTestServerName() +
64 "tests/gu-Am-Brit-4-aug-2001.html"),
65 "ISO-8859-1"
66 );
67
68 if (doc3 == null)
69 doc3 = Factory.newDocument(
70 new URL(TestDocument.getTestServerName() +
71 "tests/in-outlook-09-aug-2001.html"),
72 "ISO-8859-1"
73 );
74 }catch(Exception e){
75 e.printStackTrace();
76 }
77 }
78
79
80 public TestPR(String name) { super(name); }
81
82
83 public void setUp() throws Exception {
84 }
86
88 public void tearDown() throws Exception {
89 }
91 public void testTokenizer() throws Exception {
92 FeatureMap params = Factory.newFeatureMap();
93 DefaultTokeniser tokeniser = (DefaultTokeniser) Factory.createResource(
94 "gate.creole.tokeniser.DefaultTokeniser", params);
95
96
97 tokeniser.setDocument(doc1);
99 tokeniser.execute();
100 assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
101 doc1.getAnnotations().size() +
102 " Token annotations, instead of the expected 1284.",
103 doc1.getAnnotations().size()== 1284);
104
105
106 tokeniser.setDocument(doc2);
108 tokeniser.execute();
109 assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+
110 doc2.getAnnotations().size() +
111 " Token annotations, instead of the expected 2138.",
112 doc2.getAnnotations().size()== 2138);
113
114
115 tokeniser.setDocument(doc3);
117 tokeniser.execute();
118 assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+
119 doc3.getAnnotations().size() +
120 " Token annotations, instead of the expected 2806.",
121 doc3.getAnnotations().size()== 2806);
122
123 Factory.deleteResource(tokeniser);
124 }
126 public void testGazetteer() throws Exception {
127 FeatureMap params = Factory.newFeatureMap();
128 DefaultGazetteer gaz = (DefaultGazetteer) Factory.createResource(
129 "gate.creole.gazetteer.DefaultGazetteer", params);
130
131 gaz.setDocument(doc1);
133 gaz.execute();
134 assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+
135 doc1.getAnnotations().get(ANNIEConstants.LOOKUP_ANNOTATION_TYPE).size() +
136 " Lookup annotations, instead of the expected 63.",
137 doc1.getAnnotations().get(ANNIEConstants.LOOKUP_ANNOTATION_TYPE).size()== 63);
138
139 gaz.setDocument(doc2);
141 gaz.execute();
142 assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+
143 doc2.getAnnotations().get(ANNIEConstants.LOOKUP_ANNOTATION_TYPE).size() +
144 " Lookup annotations, instead of the expected 109.",
145 doc2.getAnnotations().get(ANNIEConstants.LOOKUP_ANNOTATION_TYPE).size()== 109);
146
147 gaz.setDocument(doc3);
149 gaz.execute();
150 assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+
151 doc3.getAnnotations().get(ANNIEConstants.LOOKUP_ANNOTATION_TYPE).size() +
152 " Lookup annotations, instead of the expected 136.",
153 doc3.getAnnotations().get(ANNIEConstants.LOOKUP_ANNOTATION_TYPE).size()== 136);
154 Factory.deleteResource(gaz);
155 }
157 public void testSplitter() throws Exception {
158 FeatureMap params = Factory.newFeatureMap();
159 SentenceSplitter splitter = (SentenceSplitter) Factory.createResource(
160 "gate.creole.splitter.SentenceSplitter", params);
161
162 splitter.setDocument(doc1);
164 splitter.execute();
165 assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+
166 doc1.getAnnotations().get(ANNIEConstants.SENTENCE_ANNOTATION_TYPE).size() +
167 " Sentence annotations, instead of the expected 22.",
168 doc1.getAnnotations().get(ANNIEConstants.SENTENCE_ANNOTATION_TYPE).size()== 22);
169
170 assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+
171 doc1.getAnnotations().get("Split").size() +
172 " Split annotations, instead of the expected 37.",
173 doc1.getAnnotations().get("Split").size()== 37);
174
175
176 splitter.setDocument(doc2);
178 splitter.execute();
179 assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+
180 doc2.getAnnotations().get(ANNIEConstants.SENTENCE_ANNOTATION_TYPE).size() +
181 " Sentence annotations, instead of the expected 52.",
182 doc2.getAnnotations().get(ANNIEConstants.SENTENCE_ANNOTATION_TYPE).size()== 52);
183
184 assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+
185 doc2.getAnnotations().get("Split").size() +
186 " Split annotations, instead of the expected 72.",
187 doc2.getAnnotations().get("Split").size()== 72);
188
189 splitter.setDocument(doc3);
191 splitter.execute();
192
193 assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+
194 doc3.getAnnotations().get(ANNIEConstants.SENTENCE_ANNOTATION_TYPE).size() +
195 " Sentence annotations, instead of the expected 66.",
196 doc3.getAnnotations().get(ANNIEConstants.SENTENCE_ANNOTATION_TYPE).size()== 66);
197
198 assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+
199 doc3.getAnnotations().get("Split").size() +
200 " Split annotations, instead of the expected 84.",
201 doc3.getAnnotations().get("Split").size()== 84);
202 Factory.deleteResource(splitter);
203 }
205 public void testTagger() throws Exception {
206 FeatureMap params = Factory.newFeatureMap();
207 POSTagger tagger = (POSTagger) Factory.createResource(
208 "gate.creole.POSTagger", params);
209
210
211 tagger.setDocument(doc1);
213 tagger.execute();
214
215 HashSet fType = new HashSet();
216 fType.add(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
217 AnnotationSet annots =
218 doc1.getAnnotations().get(ANNIEConstants.TOKEN_ANNOTATION_TYPE, fType);
219
220 assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+ annots.size() +
221 " Token annotations with category feature, instead of the expected 675.",
222 annots.size() == 675);
223
224 tagger.setDocument(doc2);
226 tagger.execute();
227 annots = doc2.getAnnotations().get(ANNIEConstants.TOKEN_ANNOTATION_TYPE, fType);
228 assertTrue("Found in "+ doc2.getSourceUrl().getFile()+ " "+annots.size() +
229 " Token annotations with category feature, instead of the expected 1131.",
230 annots.size() == 1131);
231
232 tagger.setDocument(doc3);
234 tagger.execute();
235 annots = doc3.getAnnotations().get(ANNIEConstants.TOKEN_ANNOTATION_TYPE, fType);
236 assertTrue("Found in "+ doc3.getSourceUrl().getFile()+ " "+ annots.size() +
237 " Token annotations with category feature, instead of the expected 1446.",
238 annots.size() == 1446);
239 Factory.deleteResource(tagger);
240 }
242 public void testTransducer() throws Exception {
243 FeatureMap params = Factory.newFeatureMap();
244 ANNIETransducer transducer = (ANNIETransducer) Factory.createResource(
245 "gate.creole.ANNIETransducer", params);
246
247 transducer.setDocument(doc1);
249 transducer.execute();
250 assertTrue("Found in "+ doc1.getSourceUrl().getFile()+ " "+
251 doc1.getAnnotations().get(ANNIEConstants.ORGANIZATION_ANNOTATION_TYPE).size() +
252 " Organization annotations, instead of the expected 17",
253 doc1.getAnnotations().get(ANNIEConstants.ORGANIZATION_ANNOTATION_TYPE).size()== 17);
254 assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
255 doc1.getAnnotations().get(ANNIEConstants.LOCATION_ANNOTATION_TYPE).size() +
256 " Location annotations, instead of the expected 3",
257 doc1.getAnnotations().get(ANNIEConstants.LOCATION_ANNOTATION_TYPE).size()== 3);
258 assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
259 doc1.getAnnotations().get(ANNIEConstants.PERSON_ANNOTATION_TYPE).size() +
260 " Person annotations, instead of the expected 3",
261 doc1.getAnnotations().get(ANNIEConstants.PERSON_ANNOTATION_TYPE).size()== 3);
262 assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
263 doc1.getAnnotations().get(ANNIEConstants.DATE_ANNOTATION_TYPE).size() +
264 " Date annotations, instead of the expected 6",
265 doc1.getAnnotations().get(ANNIEConstants.DATE_ANNOTATION_TYPE).size()== 6);
266 assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+
267 doc1.getAnnotations().get(ANNIEConstants.MONEY_ANNOTATION_TYPE).size() +
268 " Money annotations, instead of the expected 1",
269 doc1.getAnnotations().get(ANNIEConstants.MONEY_ANNOTATION_TYPE).size()== 1);
270
271 transducer.setDocument(doc2);
273 transducer.execute();
274 assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
275 doc2.getAnnotations().get(ANNIEConstants.ORGANIZATION_ANNOTATION_TYPE).size() +
276 " Organization annotations, instead of the expected 18",
277 doc2.getAnnotations().get(ANNIEConstants.ORGANIZATION_ANNOTATION_TYPE).size()== 18);
278 assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
279 doc2.getAnnotations().get(ANNIEConstants.LOCATION_ANNOTATION_TYPE).size() +
280 " Location annotations, instead of the expected 9",
281 doc2.getAnnotations().get(ANNIEConstants.LOCATION_ANNOTATION_TYPE).size()== 9);
282 assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
283 doc2.getAnnotations().get(ANNIEConstants.PERSON_ANNOTATION_TYPE).size() +
284 " Person annotations, instead of the expected 1",
285 doc2.getAnnotations().get(ANNIEConstants.PERSON_ANNOTATION_TYPE).size()== 1);
286 assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
287 doc2.getAnnotations().get(ANNIEConstants.DATE_ANNOTATION_TYPE).size() +
288 " Date annotations, instead of the expected 6",
289 doc2.getAnnotations().get(ANNIEConstants.DATE_ANNOTATION_TYPE).size()== 6);
290 assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+
291 doc2.getAnnotations().get(ANNIEConstants.MONEY_ANNOTATION_TYPE).size() +
292 " Money annotations, instead of the expected 3",
293 doc2.getAnnotations().get(ANNIEConstants.MONEY_ANNOTATION_TYPE).size()== 3);
294
295 transducer.setDocument(doc3);
297 transducer.execute();
298 assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
299 doc3.getAnnotations().get(ANNIEConstants.ORGANIZATION_ANNOTATION_TYPE).size() +
300 " Organization annotations, instead of the expected 9",
301 doc3.getAnnotations().get(ANNIEConstants.ORGANIZATION_ANNOTATION_TYPE).size()== 9);
302 assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
303 doc3.getAnnotations().get(ANNIEConstants.LOCATION_ANNOTATION_TYPE).size() +
304 " Location annotations, instead of the expected 12",
305 doc3.getAnnotations().get(ANNIEConstants.LOCATION_ANNOTATION_TYPE).size()== 12);
306 assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
307 doc3.getAnnotations().get(ANNIEConstants.PERSON_ANNOTATION_TYPE).size() +
308 " Person annotations, instead of the expected 8",
309 doc3.getAnnotations().get(ANNIEConstants.PERSON_ANNOTATION_TYPE).size()== 8);
310 assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
311 doc3.getAnnotations().get(ANNIEConstants.DATE_ANNOTATION_TYPE).size() +
312 " Date annotations, instead of the expected 7",
313 doc3.getAnnotations().get(ANNIEConstants.DATE_ANNOTATION_TYPE).size()== 7);
314 assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+
315 doc3.getAnnotations().get(ANNIEConstants.MONEY_ANNOTATION_TYPE).size() +
316 " Money annotations, instead of the expected 4",
317 doc3.getAnnotations().get(ANNIEConstants.MONEY_ANNOTATION_TYPE).size()== 4);
318
319 Factory.deleteResource(transducer);
320 }
322 public void testOrthomatcher() throws Exception {
323 FeatureMap params = Factory.newFeatureMap();
324
325 OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
326 "gate.creole.orthomatcher.OrthoMatcher", params);
327
328
329 orthomatcher.setDocument(doc1);
331 orthomatcher.execute();
332
333 HashSet fType = new HashSet();
334 fType.add(ANNIEConstants.ANNOTATION_COREF_FEATURE_NAME);
335 AnnotationSet annots =
336 doc1.getAnnotations().get(null,fType);
337
338 assertTrue("Found in "+doc1.getSourceUrl().getFile()+ " "+ annots.size() +
339 " annotations with matches feature, instead of the expected 30.",
340 annots.size() == 30);
341
342 orthomatcher.setDocument(doc2);
344 orthomatcher.execute();
345 annots = doc2.getAnnotations().get(null,fType);
346 assertTrue("Found in "+doc2.getSourceUrl().getFile()+ " "+ annots.size() +
347 " annotations with matches feature, instead of the expected 35.",
348 annots.size() == 33);
349
350 orthomatcher.setDocument(doc3);
352 orthomatcher.execute();
353
354 annots = doc3.getAnnotations().get(null,fType);
355 assertTrue("Found in "+doc3.getSourceUrl().getFile()+ " "+ annots.size() +
356 " annotations with matches feature, instead of the expected 24.",
357 annots.size() == 24);
358 Factory.deleteResource(orthomatcher);
359 }
361
362 public void testAllPR() throws Exception {
363
364 String urlBaseName = Gate.locateGateFiles();
367
372 if (urlBaseName.endsWith("/bin/gate.jar!/")) {
373 StringBuffer buff = new StringBuffer(
374 urlBaseName.substring(
375 0,
376 urlBaseName.lastIndexOf("bin/gate.jar!/"))
377 );
378 buff.append("classes/");
379 buff.delete(0, "jar:file:".length());
380 buff.insert(0, "file://");
381 urlBaseName = buff.toString();
382 }
383
384 URL urlBase = new URL(urlBaseName + "gate/resources/gate.ac.uk/");
385
386 URL storageDir = null;
387 storageDir = new URL(urlBase, "tests/ft");
388
389 DataStore ds = Factory.openDataStore
391 ("gate.persist.SerialDataStore",
392 storageDir.toExternalForm());
393
394 String lrId = (String)ds.getLrIds
396 ("gate.corpora.DocumentImpl").get(0);
397
398
399 FeatureMap features = Factory.newFeatureMap();
401 features.put(DataStore.DATASTORE_FEATURE_NAME, ds);
402 features.put(DataStore.LR_ID_FEATURE_NAME, lrId);
403 Document document = (Document) Factory.createResource(
404 "gate.corpora.DocumentImpl",
405 features);
406 compareAnnots(document, doc1);
407
408 storageDir = null;
410 storageDir = new URL(urlBase, "tests/gu");
411
412 ds = Factory.openDataStore("gate.persist.SerialDataStore",
414 storageDir.toExternalForm());
415 lrId = (String)ds.getLrIds("gate.corpora.DocumentImpl").get(0);
417 features = Factory.newFeatureMap();
419 features.put(DataStore.DATASTORE_FEATURE_NAME, ds);
420 features.put(DataStore.LR_ID_FEATURE_NAME, lrId);
421 document = (Document) Factory.createResource(
422 "gate.corpora.DocumentImpl",
423 features);
424 compareAnnots(document,doc2);
425
426 storageDir = null;
428 storageDir = new URL(urlBase, "tests/in");
429
430 ds = Factory.openDataStore("gate.persist.SerialDataStore",
432 storageDir.toExternalForm());
433 lrId = (String)ds.getLrIds("gate.corpora.DocumentImpl").get(0);
435 features = Factory.newFeatureMap();
437 features.put(DataStore.DATASTORE_FEATURE_NAME, ds);
438 features.put(DataStore.LR_ID_FEATURE_NAME, lrId);
439 document = (Document) Factory.createResource(
440 "gate.corpora.DocumentImpl",
441 features);
442 compareAnnots(document,doc3);
443 }
445 public void compareAnnots(Document keyDocument, Document responseDocument)
504 throws Exception{
505 Iterator iteratorTypes = annotationTypes.iterator();
507 while (iteratorTypes.hasNext()){
508 String annotType = (String)iteratorTypes.next();
510
511 AnnotationDiffer annotDiffer = new AnnotationDiffer();
513 Set significantFeatures = new HashSet(Arrays.asList(
514 new String[]{"NMRule", "kind", "orgType", "rule",
515 "rule1", "rule2", "locType", "gender",
516 "majorType", "minorType", "category",
517 "length", "orth", "string", "subkind",
518 "symbolkind"}));
519 annotDiffer.setSignificantFeaturesSet(significantFeatures);
520 annotDiffer.calculateDiff(keyDocument.getAnnotations().get(annotType),
521 responseDocument.getAnnotations().get(annotType));
522 if(DEBUG) annotDiffer.printMissmatches();
523
524 assertTrue(annotType+ " precision strict in "+
525 responseDocument.getSourceUrl().getFile()+
526 " is "+ annotDiffer.getPrecisionStrict()+ " instead of 1.0 ",
527 annotDiffer.getPrecisionStrict()== 1.0);
528
529 assertTrue(annotType+" recall strict in "
530 +responseDocument.getSourceUrl().getFile()+
531 " is " + annotDiffer.getRecallStrict()+ " instead of 1.0 ",
532 annotDiffer.getRecallStrict()== 1.0);
533
534 assertTrue(annotType+" f-measure strict in "
535 +responseDocument.getSourceUrl().getFile()+
536 " is "+ annotDiffer.getFMeasureStrict(0.5)+ " instead of 1.0 ",
537 annotDiffer.getFMeasureStrict(0.5)== 1.0);
538 } }
541
542
543 public static Test suite() {
544 return new TestSuite(TestPR.class);
545 }
547 public static void main(String[] args) {
548 try{
549
550 Gate.init();
551 TestPR testPR = new TestPR("");
552 testPR.setUp();
553 testPR.testTokenizer();
554 testPR.testGazetteer();
555 testPR.testSplitter();
556 testPR.testTagger();
557 testPR.testTransducer();
558 testPR.testOrthomatcher();
559 testPR.testAllPR();
560 testPR.tearDown();
561 } catch(Exception e) {
562 e.printStackTrace();
563 }
564 } }