1
15
16 package gate.xml;
17
18 import java.io.File;
19 import java.io.FileOutputStream;
20 import java.io.OutputStreamWriter;
21 import java.io.Writer;
22 import java.net.URL;
23 import java.util.*;
24 import java.text.NumberFormat;
25
26 import junit.framework.*;
27
28 import gate.*;
29 import gate.corpora.DocumentImpl;
30 import gate.creole.SerialAnalyserController;
31 import gate.util.Files;
32 import gate.util.Err;
33 import gate.creole.ANNIEConstants;
34
35
37
38
41 public class TestXml extends TestCase
42 {
43
/** Debug flag for this test class; switched off. */
private static final boolean DEBUG = false;

/**
 * Character encoding handed to GATE when temporary XML files are written
 * and when documents are (re)loaded by the tests in this class.
 */
private static String workingEncoding = "UTF-8";

/**
 * Creates the test case.
 *
 * @param name the name passed on to the JUnit {@code TestCase} constructor
 */
public TestXml(String name) {
  super(name);
}
51
52
53 public void setUp() {
54 }
56 public void testGateDocumentToAndFromXmlWithDifferentKindOfFormats()
57 throws Exception{
58 List urlList = new LinkedList();
59 List urlDescription = new LinkedList();
60 URL url = null;
61
62 url = Gate.getUrl("tests/xml/xces.xml");
63 assertTrue("Coudn't create a URL object for tests/xml/xces.xml ", url != null);
64 urlList.add(url);
65 urlDescription.add(" an XML document ");
66
67 url = Gate.getUrl("tests/xml/Sentence.xml");
68 assertTrue("Coudn't create a URL object for tests/xml/Sentence.xml",
69 url != null);
70 urlList.add(url);
71 urlDescription.add(" an XML document ");
72
73 url = Gate.getUrl("tests/html/test1.htm");
74 assertTrue("Coudn't create a URL object for tests/html/test.htm",url != null);
75 urlList.add(url);
76 urlDescription.add(" an HTML document ");
77
78 url = Gate.getUrl("tests/rtf/Sample.rtf");
79 assertTrue("Coudn't create a URL object for defg ",url != null);
80 urlList.add(url);
81 urlDescription.add(" a RTF document ");
82
83
84 url = Gate.getUrl("tests/email/test2.eml");
85 assertTrue("Coudn't create a URL object for defg ",url != null);
86 urlList.add(url);
87 urlDescription.add(" an EMAIL document ");
88
89 Iterator iter = urlList.iterator();
90 Iterator descrIter = urlDescription.iterator();
91 while(iter.hasNext()){
92 runCompleteTestWithAFormat((URL) iter.next(),(String)descrIter.next());
93 }
95
96 }
98 private void runCompleteTestWithAFormat(URL url, String urlDescription)
99 throws Exception{
100 gate.Document keyDocument = null;
102
103 FeatureMap params = Factory.newFeatureMap();
104 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, url);
105 params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME, "false");
106 keyDocument = (Document)Factory.createResource("gate.corpora.DocumentImpl",
107 params);
108
109 assertTrue("Coudn't create a GATE document instance for " +
110 url.toString() +
111 " Can't continue." , keyDocument != null);
112
113 gate.DocumentFormat keyDocFormat = null;
114 keyDocFormat = gate.DocumentFormat.getDocumentFormat(
115 keyDocument, keyDocument.getSourceUrl()
116 );
117
118 assertTrue("Fail to recognize " +
119 url.toString() +
120 " as being " + urlDescription + " !", keyDocFormat != null);
121
122 keyDocFormat.unpackMarkup(keyDocument);
124 gate.corpora.TestDocument.verifyNodeIdConsistency(keyDocument);
126
127 verifyAnnotationIDGenerator(keyDocument);
130
131 long keyDocumentSize = keyDocument.getContent().size().longValue();
133 int keyDocumentAnnotationSetSize = keyDocument.getAnnotations().size();
134
135
136 File xmlFile = null;
139 xmlFile = Files.writeTempFile(keyDocument.toXml(), workingEncoding );
140 assertTrue("The temp GATE XML file is null. Can't continue.",xmlFile != null);
141
142 gate.Document gateDoc = null;
144 gateDoc = gate.Factory.newDocument(xmlFile.toURL(), workingEncoding);
145
146 assertTrue("Coudn't create a GATE document instance for " +
147 xmlFile.toURL().toString() +
148 " Can't continue." , gateDoc != null);
149
150 gate.DocumentFormat gateDocFormat = null;
151 gateDocFormat =
152 DocumentFormat.getDocumentFormat(gateDoc,gateDoc.getSourceUrl());
153
154 assertTrue("Fail to recognize " +
155 xmlFile.toURL().toString() +
156 " as being a GATE XML document !", gateDocFormat != null);
157
158 gateDocFormat.unpackMarkup(gateDoc);
159 gate.corpora.TestDocument.verifyNodeIdConsistency(gateDoc);
161
162 long gateDocSize = keyDocument.getContent().size().longValue();
164 int gateDocAnnotationSetSize = keyDocument.getAnnotations().size();
165
166 assertTrue("Exporting as GATE XML resulted in document content size lost." +
167 " Something went wrong.", keyDocumentSize == gateDocSize);
168
169 assertTrue("Exporting as GATE XML resulted in annotation lost." +
170 " No. of annotations missing = " +
171 Math.abs(keyDocumentAnnotationSetSize - gateDocAnnotationSetSize),
172 keyDocumentAnnotationSetSize == gateDocAnnotationSetSize);
173
174 verifyAnnotationIDGenerator(gateDoc);
177
178 xmlFile.delete();
180 }
182
183 public void testUnpackMarkup() throws Exception{
184 Map markupElementsMap = null;
186 gate.Document doc = null;
187
193 Map anElement2StringMap = null;
195 anElement2StringMap = new HashMap();
196 anElement2StringMap.put("S","\n");
198 anElement2StringMap.put("s","\n");
199
200 doc = gate.Factory.newDocument(Gate.getUrl("tests/xml/xces.xml"), workingEncoding);
201
202 AnnotationSet annotSet = doc.getAnnotations(
203 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
204 assertEquals("For "+doc.getSourceUrl()+" the number of annotations"+
205 " should be:758",758,annotSet.size());
206
207 gate.corpora.TestDocument.verifyNodeIdConsistency(doc);
208
209 verifyAnnotationIDGenerator(doc);
212
213 }
215
222 public void testAnnotationConsistencyForSaveAsXml()throws Exception{
223 String testDoc = gate.util.Files.getGateResourceAsString("gate.ac.uk/tests/xml/gateTestSaveAsXML.xml");
226 Document origDoc = gate.Factory.newDocument(testDoc);
227
228 verifyAnnotationIDGenerator(origDoc);
231
232 SerialAnalyserController annie = loadANNIEWithDefaults();
234 assertTrue("ANNIE not loaded!", annie != null);
235 Corpus c = Factory.newCorpus("test");
236 c.add(origDoc);
237 annie.setCorpus(c);
238 annie.execute();
239
240 File xmlFile = Files.writeTempFile(origDoc.toXml(),workingEncoding);
244 System.out.println("Saved to temp file :" + xmlFile.toURL());
245
246 Document reloadedDoc = gate.Factory.newDocument(xmlFile.toURL(), workingEncoding);
247 verifyAnnotationIDGenerator(reloadedDoc);
250
251 Map origAnnotMap = buildID2AnnotMap(origDoc);
253 Map reloadedAnnMap = buildID2AnnotMap(reloadedDoc);
254
255 verifyIDConsistency(origAnnotMap, reloadedAnnMap);
257
258 Map origMatchesMap = buildMatchesMap(origDoc);
261 for(Iterator it = origMatchesMap.keySet().iterator(); it.hasNext();){
265 Integer id = (Integer)it.next();
266 Annotation origAnnot = (Annotation) origAnnotMap.get(id);
267 assertTrue("Couldn't find an original annot with ID=" + id, origAnnot != null);
268 Annotation reloadedAnnot = (Annotation) reloadedAnnMap.get(id);
269 assertTrue("Couldn't find a reloaded annot with ID=" + id, reloadedAnnot != null);
270 compareAnnot(origAnnot,reloadedAnnot);
271 List matchesList = (List) origMatchesMap.get(id);
273 for (Iterator itList = matchesList.iterator(); itList.hasNext();){
274 Integer matchId = (Integer) itList.next();
275 Annotation origA = (Annotation) origAnnotMap.get(matchId);
276 assertTrue("Couldn't find an original annot with ID=" + matchId, origA != null);
277 Annotation reloadedA = (Annotation) reloadedAnnMap.get(matchId);
278 assertTrue("Couldn't find a reloaded annot with ID=" + matchId, reloadedA != null);
279 compareAnnot(origA, reloadedA);
280 } } xmlFile.delete();
284 }
286
293 private Map buildMatchesMap(Document doc){
294 Map matchesMap = new HashMap();
295 AnnotationSet annotSet = doc.getAnnotations();
297
298 helperBuildMatchesMap(annotSet, matchesMap);
299 if (doc.getNamedAnnotationSets() != null){
301 for ( Iterator namedAnnotSetsIter = doc.getNamedAnnotationSets().values().iterator();
302 namedAnnotSetsIter.hasNext(); ){
303 helperBuildMatchesMap((gate.AnnotationSet) namedAnnotSetsIter.next(), matchesMap);
304 } } return matchesMap;
307 }
309
315 private void helperBuildMatchesMap(AnnotationSet sourceAnnotSet, Map aMap ){
316
317 for (Iterator it = sourceAnnotSet.iterator(); it.hasNext();){
318 Annotation a = (Annotation) it.next();
319 FeatureMap aFeatMap = a.getFeatures();
320 if (aFeatMap == null) continue;
322 List matchesVal = (List) aFeatMap.get("matches");
324 if (matchesVal == null) continue;
325 Integer id = a.getId();
326 aMap.put(id,matchesVal);
327 }
329 }
331
338 protected void verifyAnnotationIDGenerator(gate.Document aDoc){
339 Map id2AnnotationMap = buildID2AnnotMap(aDoc);
342
343 if (id2AnnotationMap == null || id2AnnotationMap.isEmpty()){
344 return;
346 }
347
348 Set keysSet = id2AnnotationMap.keySet();
350 TreeSet sortedSet = new TreeSet(keysSet);
351 Integer maxAnnotId = (Integer) sortedSet.last();
353 Integer generatorId = ((DocumentImpl)aDoc).getNextAnnotationId();
355
356
358 assertTrue("Annotation ID generator["+generatorId+"] on document [" + aDoc.getSourceUrl() +
359 "] was equal or less than the MAX Annotation ID["+maxAnnotId+"] on the document."+
360 " This may lead to Annotation ID conflicts.", generatorId.intValue() > maxAnnotId.intValue());
361
362
363 }
365
371 private void verifyIDConsistency(Map origAnnotMap, Map reloadedAnnMap) {
372 assertEquals("Found a different number of annot in both documents.",
373 origAnnotMap.keySet().size(), reloadedAnnMap.keySet().size());
374
375
384 for (Iterator it = origAnnotMap.keySet().iterator(); it.hasNext();){
385 Integer id = (Integer) it.next();
386 Annotation origAnn = (Annotation) origAnnotMap.get(id);
387 Annotation reloadedAnnot = (Annotation) reloadedAnnMap.get(id);
388
389 assertTrue("Annotation with ID="+ id +" was not found in the reloaded document.", reloadedAnnot != null);
390 compareAnnot(origAnn, reloadedAnnot);
391
392 } }
395
400 private void compareAnnot(Annotation origAnn, Annotation reloadedAnnot) {
401 assertTrue("Found original and reloaded annot without the same ID!",
402 origAnn.getId().equals(reloadedAnnot.getId()));
403 assertTrue("Found original and reloaded annot without the same TYPE!\n"+
404 "Original was ["+origAnn.getType()+"] and reloaded was ["+reloadedAnnot.getType()+"].",
405 origAnn.getType().equals(reloadedAnnot.getType()));
406 assertTrue("Found original and reloaded annot without the same START offset!",
407 origAnn.getStartNode().getOffset().equals(reloadedAnnot.getStartNode().getOffset()));
408 assertTrue("Found original and reloaded annot without the same END offset!",
409 origAnn.getEndNode().getOffset().equals(reloadedAnnot.getEndNode().getOffset()));
410 }
412
413 private Map addAnnotSet2Map(AnnotationSet annotSet, Map id2AnnMap){
414 for (Iterator it = annotSet.iterator(); it.hasNext();){
415 Annotation a = (Annotation) it.next();
416 Integer id = a.getId();
417
418 assertTrue("Found two annotations(one with type = " + a.getType() +
419 ")with the same ID=" + id, !id2AnnMap.keySet().contains(id));
420
421 id2AnnMap.put(id, a);
422 } return id2AnnMap;
424 }
425
426
432 private Map buildID2AnnotMap(Document aDoc){
433 Map id2AnnMap = new HashMap();
434 AnnotationSet annotSet = aDoc.getAnnotations();
436 addAnnotSet2Map(annotSet, id2AnnMap);
437 if (aDoc.getNamedAnnotationSets() != null){
439 for ( Iterator namedAnnotSetsIter = aDoc.getNamedAnnotationSets().values().iterator();
440 namedAnnotSetsIter.hasNext(); ){
441
442 addAnnotSet2Map((gate.AnnotationSet) namedAnnotSetsIter.next(), id2AnnMap);
443 } } return id2AnnMap;
446 }
448
452 private SerialAnalyserController loadANNIEWithDefaults(){
453 FeatureMap params = Factory.newFeatureMap();
454 SerialAnalyserController sac = null;
455 try{
456 sac = (SerialAnalyserController)
458 Factory.createResource("gate.creole.SerialAnalyserController",
459 Factory.newFeatureMap(),
460 Factory.newFeatureMap(),
461 "ANNIE_" + Gate.genSym());
462 for(int i = 0; i < ANNIEConstants.PR_NAMES.length; i++){
464 ProcessingResource pr = (ProcessingResource)
465 Factory.createResource(ANNIEConstants.PR_NAMES[i], params);
466 sac.add(pr);
468 }
470 }catch(gate.creole.ResourceInstantiationException ex){
471 ex.printStackTrace(Err.getPrintWriter());
472 }
473 return sac;
474 }
476
477
478 public static Test suite() {
479 return new TestSuite(TestXml.class);
480 }
482 }