1   /*
2    *  TestXml.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  8/May/2000
12   *
13   *  $Id: TestXml.java,v 1.61 2006/03/02 13:14:20 cursu Exp $
14   */
15  
16  package gate.xml;
17  
18  import java.io.File;
19  import java.io.FileOutputStream;
20  import java.io.OutputStreamWriter;
21  import java.io.Writer;
22  import java.net.URL;
23  import java.util.*;
24  import java.text.NumberFormat;
25  
26  import junit.framework.*;
27  
28  import gate.*;
29  import gate.corpora.DocumentImpl;
30  import gate.creole.SerialAnalyserController;
31  import gate.util.Files;
32  import gate.util.Err;
33  import gate.creole.ANNIEConstants;
34  
35  //import org.w3c.www.mime.*;
36  
37  
38  /** Test class for XML facilities
39    *
40    */
41  public class TestXml extends TestCase
42  {
43    /** Debug flag */
44    private static final boolean DEBUG = false;
45  
46    /** The encoding used in our tests*/
47    private static String workingEncoding="UTF-8";
48  
49    /** Construction */
50    public TestXml(String name) { super(name); }
51  
52    /** Fixture set up */
53    public void setUp() {
54    } // setUp
55  
56    public void testGateDocumentToAndFromXmlWithDifferentKindOfFormats()
57                                                                 throws Exception{
58      List urlList = new LinkedList();
59      List urlDescription = new LinkedList();
60      URL url = null;
61  
62      url = Gate.getUrl("tests/xml/xces.xml");
63      assertTrue("Coudn't create a URL object for tests/xml/xces.xml ", url != null);
64      urlList.add(url);
65      urlDescription.add(" an XML document ");
66  
67      url = Gate.getUrl("tests/xml/Sentence.xml");
68      assertTrue("Coudn't create a URL object for tests/xml/Sentence.xml",
69                                                           url != null);
70      urlList.add(url);
71      urlDescription.add(" an XML document ");
72  
73      url = Gate.getUrl("tests/html/test1.htm");
74      assertTrue("Coudn't create a URL object for tests/html/test.htm",url != null);
75      urlList.add(url);
76      urlDescription.add(" an HTML document ");
77  
78      url = Gate.getUrl("tests/rtf/Sample.rtf");
79      assertTrue("Coudn't create a URL object for defg ",url != null);
80      urlList.add(url);
81      urlDescription.add(" a RTF document ");
82  
83  
84      url = Gate.getUrl("tests/email/test2.eml");
85      assertTrue("Coudn't create a URL object for defg ",url != null);
86      urlList.add(url);
87      urlDescription.add(" an EMAIL document ");
88  
89      Iterator iter = urlList.iterator();
90      Iterator descrIter = urlDescription.iterator();
91      while(iter.hasNext()){
92        runCompleteTestWithAFormat((URL) iter.next(),(String)descrIter.next());
93      }// End While
94  
95  
96    }// testGateDocumentToAndFromXmlWithDifferentKindOfFormats
97  
98    private void runCompleteTestWithAFormat(URL url, String urlDescription)
99                                                               throws Exception{
100     // Load the xml Key Document and unpack it
101     gate.Document keyDocument = null;
102 
103     FeatureMap params = Factory.newFeatureMap();
104     params.put(Document.DOCUMENT_URL_PARAMETER_NAME, url);
105     params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME, "false");
106     keyDocument = (Document)Factory.createResource("gate.corpora.DocumentImpl",
107                                                     params);
108 
109     assertTrue("Coudn't create a GATE document instance for " +
110             url.toString() +
111             " Can't continue." , keyDocument != null);
112 
113     gate.DocumentFormat keyDocFormat = null;
114     keyDocFormat = gate.DocumentFormat.getDocumentFormat(
115       keyDocument, keyDocument.getSourceUrl()
116     );
117 
118     assertTrue("Fail to recognize " +
119             url.toString() +
120             " as being " + urlDescription + " !", keyDocFormat != null);
121 
122     // Unpack the markup
123     keyDocFormat.unpackMarkup(keyDocument);
124     // Verfy if all annotations from the default annotation set are consistent
125     gate.corpora.TestDocument.verifyNodeIdConsistency(keyDocument);
126 
127     // Verifies if the maximum annotation ID on the GATE doc is less than the
128     // Annotation ID generator of the document.
129     verifyAnnotationIDGenerator(keyDocument);
130 
131     // Save the size of the document and the number of annotations
132     long keyDocumentSize = keyDocument.getContent().size().longValue();
133     int keyDocumentAnnotationSetSize = keyDocument.getAnnotations().size();
134 
135 
136     // Export the Gate document called keyDocument as  XML, into a temp file,
137     // using the working encoding
138     File xmlFile = null;
139     xmlFile = Files.writeTempFile(keyDocument.toXml(), workingEncoding );
140     assertTrue("The temp GATE XML file is null. Can't continue.",xmlFile != null);
141 
142     // Load the XML Gate document form the tmp file into memory
143     gate.Document gateDoc = null;
144     gateDoc = gate.Factory.newDocument(xmlFile.toURL(), workingEncoding);
145 
146     assertTrue("Coudn't create a GATE document instance for " +
147                 xmlFile.toURL().toString() +
148                 " Can't continue." , gateDoc != null);
149 
150     gate.DocumentFormat gateDocFormat = null;
151     gateDocFormat =
152             DocumentFormat.getDocumentFormat(gateDoc,gateDoc.getSourceUrl());
153 
154     assertTrue("Fail to recognize " +
155       xmlFile.toURL().toString() +
156       " as being a GATE XML document !", gateDocFormat != null);
157 
158     gateDocFormat.unpackMarkup(gateDoc);
159     // Verfy if all annotations from the default annotation set are consistent
160     gate.corpora.TestDocument.verifyNodeIdConsistency(gateDoc);
161 
162     // Save the size of the document snd the number of annotations
163     long gateDocSize = keyDocument.getContent().size().longValue();
164     int gateDocAnnotationSetSize = keyDocument.getAnnotations().size();
165 
166     assertTrue("Exporting as GATE XML resulted in document content size lost." +
167       " Something went wrong.", keyDocumentSize == gateDocSize);
168 
169     assertTrue("Exporting as GATE XML resulted in annotation lost." +
170       " No. of annotations missing =  " +
171       Math.abs(keyDocumentAnnotationSetSize - gateDocAnnotationSetSize),
172       keyDocumentAnnotationSetSize == gateDocAnnotationSetSize);
173 
174     // Verifies if the maximum annotation ID on the GATE doc is less than the
175     // Annotation ID generator of the document.
176     verifyAnnotationIDGenerator(gateDoc);
177 
178     //Don't need tmp Gate XML file.
179     xmlFile.delete();
180   }//runCompleteTestWithAFormat
181 
182   /** A test */
183   public void testUnpackMarkup() throws Exception{
184     // create the markupElementsMap map
185     Map markupElementsMap = null;
186     gate.Document doc = null;
187     /*
188     markupElementsMap = new HashMap();
189     // populate it
190     markupElementsMap.put ("S","Sentence");
191     markupElementsMap.put ("s","Sentence");
192     */
193     // Create the element2String map
194     Map anElement2StringMap = null;
195     anElement2StringMap = new HashMap();
196     // Populate it
197     anElement2StringMap.put("S","\n");
198     anElement2StringMap.put("s","\n");
199 
200     doc = gate.Factory.newDocument(Gate.getUrl("tests/xml/xces.xml"), workingEncoding);
201 
202     AnnotationSet annotSet = doc.getAnnotations(
203                         GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
204     assertEquals("For "+doc.getSourceUrl()+" the number of annotations"+
205     " should be:758",758,annotSet.size());
206 
207     gate.corpora.TestDocument.verifyNodeIdConsistency(doc);
208 
209     // Verifies if the maximum annotation ID on the GATE doc is less than the
210     // Annotation ID generator of the document.
211     verifyAnnotationIDGenerator(doc);
212 
213   } // testUnpackMarkup()
214 
215   /*
216    * This method runs ANNIE with defaults on a document, then saves
217    * it as a GATE XML document and loads it back. All the annotations on the
218    * loaded document should be the same as the original ones.
219    *
220    * It also verifies if the matches feature still holds after an export/import to XML
221    */
222   public void testAnnotationConsistencyForSaveAsXml()throws Exception{
223     // Load a document from the test repository
224     //Document origDoc = gate.Factory.newDocument(Gate.getUrl("tests/xml/gateTestSaveAsXML.xml"));
225     String testDoc = gate.util.Files.getGateResourceAsString("gate.ac.uk/tests/xml/gateTestSaveAsXML.xml");
226     Document origDoc = gate.Factory.newDocument(testDoc);
227 
228     // Verifies if the maximum annotation ID on the origDoc is less than the
229     // Annotation ID generator of the document.
230     verifyAnnotationIDGenerator(origDoc);
231 
232     // Load ANNIE with defaults and run it on the document
233     SerialAnalyserController annie = loadANNIEWithDefaults();
234     assertTrue("ANNIE not loaded!", annie != null);
235     Corpus c = Factory.newCorpus("test");
236     c.add(origDoc);
237     annie.setCorpus(c);
238     annie.execute();
239 
240     // SaveAS XML and reload the document into another GATE doc
241     // Export the Gate document called origDoc as XML, into a temp file,
242     // using the working encoding
243     File xmlFile = Files.writeTempFile(origDoc.toXml(),workingEncoding);
244     System.out.println("Saved to temp file :" + xmlFile.toURL());
245 
246     Document reloadedDoc = gate.Factory.newDocument(xmlFile.toURL(), workingEncoding);
247     // Verifies if the maximum annotation ID on the origDoc is less than the
248     // Annotation ID generator of the document.
249     verifyAnnotationIDGenerator(reloadedDoc);
250 
251     // Verify if the annotations are identical in the two docs.
252     Map origAnnotMap = buildID2AnnotMap(origDoc);
253     Map reloadedAnnMap = buildID2AnnotMap(reloadedDoc);
254 
255     //Verifies if the reloaded annotations are the same as the original ones
256     verifyIDConsistency(origAnnotMap, reloadedAnnMap);
257 
258     // Build the original Matches map
259     // ID  -> List of IDs
260     Map origMatchesMap = buildMatchesMap(origDoc);
261     // Verify the consistency of matches
262     // Compare every orig annotation pointed by the MatchesMap with the reloadedAnnot
263     // extracted from the reloadedMAp
264     for(Iterator it = origMatchesMap.keySet().iterator(); it.hasNext();){
265       Integer id = (Integer)it.next();
266       Annotation origAnnot = (Annotation) origAnnotMap.get(id);
267       assertTrue("Couldn't find an original annot with ID=" + id, origAnnot != null);
268       Annotation reloadedAnnot = (Annotation) reloadedAnnMap.get(id);
269       assertTrue("Couldn't find a reloaded annot with ID=" + id, reloadedAnnot != null);
270       compareAnnot(origAnnot,reloadedAnnot);
271       // Iterate through the matches list and repeat the comparison
272       List matchesList = (List) origMatchesMap.get(id);
273       for (Iterator itList = matchesList.iterator(); itList.hasNext();){
274         Integer matchId = (Integer) itList.next();
275         Annotation origA = (Annotation) origAnnotMap.get(matchId);
276         assertTrue("Couldn't find an original annot with ID=" + matchId, origA != null);
277         Annotation reloadedA = (Annotation) reloadedAnnMap.get(matchId);
278         assertTrue("Couldn't find a reloaded annot with ID=" + matchId, reloadedA != null);
279         compareAnnot(origA, reloadedA);
280       }// End for
281     }// End for
282     // Clean up the XMl file
283     xmlFile.delete();
284   }// End testAnnotationIDConsistencyForSaveAsXml
285 
286   /**
287    * Builds a Map based on the matches feature of some annotations. The goal is to
288    * use this map to validate the annotations from the reloaded document.
289    * In case no Annot has the matches feat, will return an Empty MAP
290    * @param doc The document of which annotations will be used to construct the map
291    * @return A Map from Annot ID -> Lists of Annot IDs
292    */
293   private Map buildMatchesMap(Document doc){
294     Map matchesMap = new HashMap();
295     // Scan the default annotation set
296     AnnotationSet annotSet = doc.getAnnotations();
297 
298     helperBuildMatchesMap(annotSet, matchesMap);
299     // Scan all named annotation sets
300     if (doc.getNamedAnnotationSets() != null){
301       for ( Iterator namedAnnotSetsIter = doc.getNamedAnnotationSets().values().iterator();
302                                                                 namedAnnotSetsIter.hasNext(); ){
303         helperBuildMatchesMap((gate.AnnotationSet) namedAnnotSetsIter.next(), matchesMap);
304       }// End while
305     }// End if
306     return matchesMap;
307   }// End of buildMatchesMap()
308 
309   /**
310    * This is a helper metod. It scans an annotation set and adds the ID of the annotations
311    * which have the matches feature to the map.
312    * @param sourceAnnotSet  The annotation set investigated
313    * @param aMap
314    */
315   private void helperBuildMatchesMap(AnnotationSet sourceAnnotSet, Map aMap ){
316 
317     for (Iterator it = sourceAnnotSet.iterator(); it.hasNext();){
318       Annotation a = (Annotation) it.next();
319       FeatureMap aFeatMap = a.getFeatures();
320       // Skip those annotations who don't have features
321       if (aFeatMap == null) continue;
322       // Extract the matches feat
323       List matchesVal = (List) aFeatMap.get("matches");
324       if (matchesVal == null) continue;
325       Integer id = a.getId();
326       aMap.put(id,matchesVal);
327     }//End for
328 
329   }// End of helperBuildMatchesMap()
330 
331   /**
332    * This method tests if the generator for new Annotation IDs is greather than the
333    * maximum Annotation ID present in the GATE document. In oter words, it ensures that
334    * new Annotations will receive an UNIQUE ID.
335    *
336    * @param aDoc The GATE document being tested
337    */
338   protected void verifyAnnotationIDGenerator(gate.Document aDoc){
339     // Creates a MAP containing all the annotations of the document.
340     // In doing so, it also tests if there are annotations with the same ID.
341     Map id2AnnotationMap = buildID2AnnotMap(aDoc);
342 
343     if (id2AnnotationMap == null || id2AnnotationMap.isEmpty()){
344       //System.out.println("No annotations found on the document! Nothing to test.");
345       return;
346     }
347 
348     // Get the key set of the Map and sort them
349     Set keysSet = id2AnnotationMap.keySet();
350     TreeSet sortedSet = new TreeSet(keysSet);
351     // Get the highest Annotation ID
352     Integer maxAnnotId =  (Integer) sortedSet.last();
353     // Compare its value to the one hold by the document's ID generator
354     Integer generatorId = ((DocumentImpl)aDoc).getNextAnnotationId();
355 
356 //    System.out.println("maxAnnotid = " + maxAnnotId + " generatorID = " + generatorId);
357 
358     assertTrue("Annotation ID generator["+generatorId+"] on document [" + aDoc.getSourceUrl() +
359             "] was equal or less than the MAX Annotation ID["+maxAnnotId+"] on the document."+
360             " This may lead to Annotation ID conflicts.", generatorId.intValue() > maxAnnotId.intValue());
361 
362 
363   }// End of verifyAnnotationIDGenerator()
364 
365   /**
366    * Verifies if the two maps hold annotations with the same ID. The only thing not checked
367    * are the features, as some of them could be lost in the serialization/deserialization process
368    * @param origAnnotMap A map by ID, containing the original annotations
369    * @param reloadedAnnMap A map by ID, containing the recreated annotations
370    */
371   private void verifyIDConsistency(Map origAnnotMap, Map reloadedAnnMap) {
372     assertEquals("Found a different number of annot in both documents.",
373             origAnnotMap.keySet().size(), reloadedAnnMap.keySet().size());
374 
375 //    List orig = new ArrayList(origAnnotMap.keySet());
376 //    Collections.sort(orig);
377 //    System.out.println("ORIG SET =" + orig);
378 //
379 //    List rel = new ArrayList(reloadedAnnMap.keySet());
380 //    Collections.sort(rel);
381 //    System.out.println("REL  SET =" + rel);
382 //
383 
384     for (Iterator it = origAnnotMap.keySet().iterator(); it.hasNext();){
385       Integer id = (Integer) it.next();
386       Annotation origAnn = (Annotation) origAnnotMap.get(id);
387       Annotation reloadedAnnot = (Annotation) reloadedAnnMap.get(id);
388 
389       assertTrue("Annotation with ID="+ id +" was not found in the reloaded document.", reloadedAnnot != null);
390       compareAnnot(origAnn, reloadedAnnot);
391 
392     }// End for
393   }// End of verifyIDConsistency()
394 
395   /**
396    * Thes if two annotatiosn are the same, except their features.
397    * @param origAnn
398    * @param reloadedAnnot
399    */
400   private void compareAnnot(Annotation origAnn, Annotation reloadedAnnot) {
401     assertTrue("Found original and reloaded annot without the same ID!",
402             origAnn.getId().equals(reloadedAnnot.getId()));
403     assertTrue("Found original and reloaded annot without the same TYPE!\n"+
404                "Original was ["+origAnn.getType()+"] and reloaded was ["+reloadedAnnot.getType()+"].",
405             origAnn.getType().equals(reloadedAnnot.getType()));
406     assertTrue("Found original and reloaded annot without the same START offset!",
407             origAnn.getStartNode().getOffset().equals(reloadedAnnot.getStartNode().getOffset()));
408     assertTrue("Found original and reloaded annot without the same END offset!",
409             origAnn.getEndNode().getOffset().equals(reloadedAnnot.getEndNode().getOffset()));
410   }// End of compareAnnot()
411 
412 
413   private Map addAnnotSet2Map(AnnotationSet annotSet, Map id2AnnMap){
414     for (Iterator it = annotSet.iterator(); it.hasNext();){
415       Annotation a = (Annotation) it.next();
416       Integer id = a.getId();
417 
418       assertTrue("Found two annotations(one with type = " + a.getType() +
419               ")with the same ID=" + id, !id2AnnMap.keySet().contains(id));
420 
421       id2AnnMap.put(id, a);
422     }// End for
423     return id2AnnMap;
424   }
425 
426   /**
427    * Scans a target Doc for all Annotations and builds a map (from anot ID to annot) in the process
428    * I also checks to see if there are two annotations with the same ID.
429    * @param aDoc The GATE doc to be scaned
430    * @return a Map ID2Annot
431    */
432   private Map buildID2AnnotMap(Document aDoc){
433     Map id2AnnMap = new HashMap();
434     // Scan the default annotation set
435     AnnotationSet annotSet = aDoc.getAnnotations();
436     addAnnotSet2Map(annotSet, id2AnnMap);
437     // Scan all named annotation sets
438     if (aDoc.getNamedAnnotationSets() != null){
439       for ( Iterator namedAnnotSetsIter = aDoc.getNamedAnnotationSets().values().iterator();
440                                                                 namedAnnotSetsIter.hasNext(); ){
441 
442         addAnnotSet2Map((gate.AnnotationSet) namedAnnotSetsIter.next(), id2AnnMap);
443       }// End while
444     }// End if
445     return id2AnnMap;
446   }// End of buildID2AnnotMap()
447 
448   /**
449    * Load ANNIE with defaults
450    * @return
451    */
452   private SerialAnalyserController loadANNIEWithDefaults(){
453     FeatureMap params = Factory.newFeatureMap();
454     SerialAnalyserController sac = null;
455     try{
456       // Create a serial analyser
457       sac = (SerialAnalyserController)
458           Factory.createResource("gate.creole.SerialAnalyserController",
459                                  Factory.newFeatureMap(),
460                                  Factory.newFeatureMap(),
461                                  "ANNIE_" + Gate.genSym());
462       // Load each PR as defined in gate.creole.ANNIEConstants.PR_NAMES
463       for(int i = 0; i < ANNIEConstants.PR_NAMES.length; i++){
464       ProcessingResource pr = (ProcessingResource)
465           Factory.createResource(ANNIEConstants.PR_NAMES[i], params);
466         // Add the PR to the sac
467         sac.add(pr);
468       }// End for
469 
470     }catch(gate.creole.ResourceInstantiationException ex){
471       ex.printStackTrace(Err.getPrintWriter());
472     }
473     return sac;
474   }// End of LoadANNIEWithDefaults()
475 
476 
477   /** Test suite routine for the test runner */
478   public static Test suite() {
479     return new TestSuite(TestXml.class);
480   } // suite
481 
482 } // class TestXml
483