1   /*
2    *  XmlDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/May/2000
12   *
13   *  $Id: XmlDocumentFormat.java,v 1.59 2006/04/05 13:29:48 ian_roberts Exp $
14   */
15  
16  package gate.corpora;
17  
18  //import com.sun.xml.parser.* ;
19  import java.io.*;
20  import java.net.URLConnection;
21  
22  import javax.xml.parsers.*;
23  
24  import org.xml.sax.InputSource;
25  import org.xml.sax.SAXException;
26  
27  import gate.*;
28  import gate.creole.ResourceInstantiationException;
29  import gate.event.StatusListener;
30  import gate.util.DocumentFormatException;
31  import gate.util.Out;
32  import gate.xml.*;
33  //import org.w3c.www.mime.*;
34  
35  /** The format of Documents. Subclasses of DocumentFormat know about
36    * particular MIME types and how to unpack the information in any
37    * markup or formatting they contain into GATE annotations. Each MIME
38    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
39    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
40    * with a static index residing here when they are constructed. Static
41    * getDocumentFormat methods can then be used to get the appropriate
42    * format class for a particular document.
43    */
44  public class XmlDocumentFormat extends TextualDocumentFormat
45  {
46    /** Debug flag */
47    private static final boolean DEBUG = false;
48  
49    /** Default construction */
50    public XmlDocumentFormat() { super(); }
51  
52    /** We could collect repositioning information during XML parsing */
53    public Boolean supportsRepositioning() {
54      return new Boolean(true);
55    } // supportsRepositioning
56  
57    /** Old style of unpackMarkup (without collecting of RepositioningInfo) */
58    public void unpackMarkup(Document doc) throws DocumentFormatException {
59      unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null);
60    } // unpackMarkup
61  
62  
63    /** Unpack the markup in the document. This converts markup from the
64      * native format (e.g. XML) into annotations in GATE format.
65      * Uses the markupElementsMap to determine which elements to convert, and
66      * what annotation type names to use. If the document was created from a
67      * String, then is recomandable to set the doc's sourceUrl to <b>null</b>.
68      * So, if the document has a valid URL, then the parser will try to
69      * parse the XML document pointed by the URL.If the URL is not valid, or
70      * is null, then the doc's content will be parsed. If the doc's content is
71      * not a valid XML then the parser might crash.
72      *
73      * @param doc The gate document you want to parse. If
74      * <code>doc.getSourceUrl()</code> returns <b>null</b> then the content of
75      * doc will be parsed. Using a URL is recomended because the parser will
76      * report errors corectlly if the XML document is not well formed.
77      */
78    public void unpackMarkup(Document doc, RepositioningInfo repInfo,
79                RepositioningInfo ampCodingInfo) throws DocumentFormatException {
80      if( (doc == null) ||
81          (doc.getSourceUrl() == null && doc.getContent() == null)){
82  
83        throw new DocumentFormatException(
84                 "GATE document is null or no content found. Nothing to parse!");
85      }// End if
86  
87      boolean docHasContentButNoValidURL = false;
88      // This is a test to see if the GATE document has a valid URL or a valid
89      // content. If doesn't has a valid URL then try to parse its content as XML
90      try{
91        if (doc.getSourceUrl() == null && doc.getContent() != null){
92          // The doc's url is null but there is a content.
93          docHasContentButNoValidURL = true;
94        }else {URLConnection conn = doc.getSourceUrl().openConnection();}
95      }catch (IOException ex1){
96        // The URL is not null but is not valid.
97        if(doc.getContent() == null)
98          // The document content is also null. There is nothing we can do.
99          throw new DocumentFormatException("The document doesn't have a" +
100         " valid URL and also no content");
101       docHasContentButNoValidURL = true;
102     }// End try
103 
104     // Create a status listener
105     StatusListener statusListener = new StatusListener(){
106           public void statusChanged(String text){
107             // This is implemented in DocumentFormat.java and inherited here
108             fireStatusChanged(text);
109           }
110     };
111     GateFormatXmlDocumentHandler gateXmlHandler = null;
112     XmlDocumentHandler xmlDocHandler = null;
113     if (docHasContentButNoValidURL)
114       parseDocumentWithoutURL(doc, repInfo, ampCodingInfo);
115     else try {
116       // use Excerces XML parser with JAXP
117       // System.setProperty("javax.xml.parsers.SAXParserFactory",
118       //                         "org.apache.xerces.jaxp.SAXParserFactoryImpl");
119       // Get a parser factory.
120       SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
121       // Set up the factory to create the appropriate type of parser
122       // non validating one
123       saxParserFactory.setValidating(false);
124       // non namesapace aware one
125       saxParserFactory.setNamespaceAware(true);
126       // create it
127       SAXParser xmlParser = saxParserFactory.newSAXParser();
128       if (isGateXmlDocument){
129         // Construct the appropiate xml handler for the job.
130         gateXmlHandler = new GateFormatXmlDocumentHandler(doc);
131         // Register a status listener
132         gateXmlHandler.addStatusListener(statusListener);
133         InputSource is;
134         // Parse the Gate Document with the appropriate encoding
135         if(doc instanceof TextualDocument){
136           String docEncoding = ((TextualDocument)doc).getEncoding();
137           Reader docReader = new InputStreamReader(
138               doc.getSourceUrl().openStream(), docEncoding);
139           is = new InputSource(docReader);
140           is.setSystemId(doc.getSourceUrl().toString());
141         }
142         else {
143           is = new InputSource(doc.getSourceUrl().toString());
144         }
145         
146         xmlParser.parse(is, gateXmlHandler);
147         gateXmlHandler.removeStatusListener(statusListener);
148       }else{
149         // Create a new Xml document handler
150         xmlDocHandler =  new XmlDocumentHandler( doc,
151                                                  this.markupElementsMap,
152                                                  this.element2StringMap);
153         // Register a status listener with it
154         xmlDocHandler.addStatusListener(statusListener);
155         // set repositioning object
156         xmlDocHandler.setRepositioningInfo(repInfo);
157         // set the object with ampersand coding positions
158         xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
159 
160         // Parse the document handler
161 /* Angel
162         xmlParser.parse(doc.getSourceUrl().toString(), xmlDocHandler );
163 Angel */
164       // try to choose concret parser (Xerces)
165 // Angel - start
166 
167       org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
168       //Niraj org.apache.xerces.parsers.SAXParser newxmlParser =
169         //  Niraj new org.apache.xerces.parsers.SAXParser();
170       // Set up the factory to create the appropriate type of parser
171       // non validating one
172       // http://xml.org/sax/features/validation set to false
173       newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
174       // namesapace aware one
175       // http://xml.org/sax/features/namespaces set to true
176       newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
177       newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
178       newxmlParser.setContentHandler(xmlDocHandler);
179       newxmlParser.setErrorHandler(xmlDocHandler);
180       newxmlParser.setDTDHandler(xmlDocHandler);
181       newxmlParser.setEntityResolver(xmlDocHandler);
182       // Parse the XML Document with the appropriate encoding
183       InputSource is;
184       if(doc instanceof TextualDocument){
185         String docEncoding = ((TextualDocument)doc).getEncoding();
186         Reader docReader = new InputStreamReader(
187             doc.getSourceUrl().openStream(), docEncoding);
188         is = new InputSource(docReader);
189         // must set system ID to allow relative URLs (e.g. to a DTD) to work
190         is.setSystemId(doc.getSourceUrl().toString());
191       }
192       else {
193         is = new InputSource(doc.getSourceUrl().toString());
194       }
195       newxmlParser.parse(is);
196 // Angel - end
197         ((DocumentImpl) doc).setNextAnnotationId(
198                                           xmlDocHandler.getCustomObjectsId());
199         xmlDocHandler.removeStatusListener(statusListener);
200       }// End if
201     } catch (ParserConfigurationException e){
202         throw
203         new DocumentFormatException("XML parser configuration exception ", e);
204     } catch (SAXException e){
205       doc.getFeatures().put("parsingError", new Boolean(true));
206 
207       Boolean bThrow = (Boolean)
208         doc.getFeatures().get(GateConstants.THROWEX_FORMAT_PROPERTY_NAME);
209 
210       if(bThrow != null && bThrow.booleanValue()) {
211         // the next line is commented to avoid Document creation fail on error
212         throw new DocumentFormatException(e);
213       }
214       else {
215         Out.println("Warning: Document remains unparsed. \n"
216               +"\n  Stack Dump: ");
217         e.printStackTrace(Out.getPrintWriter());
218       } // if
219 
220     } catch (IOException e){
221         throw new DocumentFormatException("I/O exception for " +
222                                       doc.getSourceUrl().toString(), e);
223     }finally{
224       if(gateXmlHandler != null)
225         gateXmlHandler.removeStatusListener(statusListener);
226       if (xmlDocHandler != null)
227         xmlDocHandler.removeStatusListener(statusListener);
228     }// End if else try
229   }// unpackMarkup
230 
231   /** Called from unpackMarkup() if the document have been created from a
232    *  string
233    */
234   private void parseDocumentWithoutURL(gate.Document aDocument,
235                                         RepositioningInfo repInfo,
236                                         RepositioningInfo ampCodingInfo)
237                                               throws DocumentFormatException {
238     GateFormatXmlDocumentHandler gateXmlHandler = null;
239     XmlDocumentHandler xmlDocHandler = null;
240     // Create a status listener
241     StatusListener statusList = new StatusListener(){
242         public void statusChanged(String text){
243           // this is implemented in DocumentFormat.java and inherited here
244           fireStatusChanged(text);
245         }
246     };
247     try{
248       Reader reader = new StringReader(aDocument.getContent().toString());
249 //
250 //
251 //      new InputStreamReader(
252 //        new ByteArrayInputStream(aDocument.getContent().toString().getBytes("UTF-8")),
253 //        "UTF-8");
254       InputSource is = new InputSource(reader);
255 
256 
257       // use Excerces XML parser with JAXP
258       // System.setProperty("javax.xml.parsers.SAXParserFactory",
259       //                         "org.apache.xerces.jaxp.SAXParserFactoryImpl");
260       // Get a parser factory.
261       SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
262       // Set up the factory to create the appropriate type of parser
263       // non validating one
264       saxParserFactory.setValidating(false);
265       // non namesapace aware one
266       saxParserFactory.setNamespaceAware(true);
267       // create it
268       SAXParser xmlParser = saxParserFactory.newSAXParser();
269       
270       // Andrey Shafirin
271       // added support for GateXML format for documents without URL
272       if (isGateXmlDocument){
273         // Construct the appropiate xml handler for the job.
274         gateXmlHandler = new GateFormatXmlDocumentHandler(aDocument);
275         // Register a status listener
276         gateXmlHandler.addStatusListener(statusList);
277         // Parse the Gate Document with the appropriate encoding
278         // InputSource is = new InputSource(aDocument.getSourceUrl().toString());
279         // if(doc instanceof TextualDocument){
280         // is.setEncoding(((TextualDocument)doc).getEncoding());
281         // }
282         xmlParser.parse(is, gateXmlHandler);
283         gateXmlHandler.removeStatusListener(statusList);
284       }else{
285         // create a new Xml document handler
286         xmlDocHandler =  new XmlDocumentHandler(aDocument,
287                                                 this.markupElementsMap,
288                                                 this.element2StringMap);
289         // Regsiter the statusListener with xmlDocHandler
290         xmlDocHandler.addStatusListener(statusList);
291         // set repositioning object
292         xmlDocHandler.setRepositioningInfo(repInfo);
293         // set the object with ampersand coding positions
294         xmlDocHandler.setAmpCodingInfo(ampCodingInfo);
295         // Parse the document handler
296         // try to choose concret parser
297         org.xml.sax.XMLReader newxmlParser = xmlParser.getXMLReader();
298         // Niraj org.apache.xerces.parsers.SAXParser newxmlParser =
299             // Niraj new org.apache.xerces.parsers.SAXParser();
300         // Set up the factory to create the appropriate type of parser
301         // non validating one
302         // http://xml.org/sax/features/validation set to false
303         newxmlParser.setFeature("http://xml.org/sax/features/validation", false);
304         // namesapace aware one
305         // http://xml.org/sax/features/namespaces set to true
306         newxmlParser.setFeature("http://xml.org/sax/features/namespaces", true);
307         newxmlParser.setFeature("http://xml.org/sax/features/namespace-prefixes", true);
308         newxmlParser.setContentHandler(xmlDocHandler);
309         newxmlParser.setErrorHandler(xmlDocHandler);
310         newxmlParser.setDTDHandler(xmlDocHandler);
311         newxmlParser.setEntityResolver(xmlDocHandler);
312         newxmlParser.parse(is);
313         ((DocumentImpl) aDocument).setNextAnnotationId(
314                                             xmlDocHandler.getCustomObjectsId());
315         }
316     } catch (ParserConfigurationException e){
317         throw new DocumentFormatException(
318                         "XML parser configuration exception ", e);
319     } catch (SAXException e){
320         throw new DocumentFormatException(e);
321     } catch (IOException e){
322         throw new DocumentFormatException(e);
323     }finally{
324       // Remove the statusListener with xmlDocHandler
325       // Andrey xmlDocHandler.removeStatusListener(statusList);
326       if(gateXmlHandler != null) gateXmlHandler.removeStatusListener(statusList);
327       if (xmlDocHandler != null) xmlDocHandler.removeStatusListener(statusList);
328 
329     }// End try
330   }// End parseDocumentWithoutURL()
331 
332   /** Initialise this resource, and return it. */
333   public Resource init() throws ResourceInstantiationException{
334     // Register XML mime type
335     MimeType mime = new MimeType("text","xml");
336     // Register the class handler for this mime type
337     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
338                                                                           this);
339     // Register the mime type with mine string
340     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
341     //sometimes XML file appear as application/xml
342     mimeString2mimeTypeMap.put("application/xml", mime);
343     // Register file sufixes for this mime type
344     suffixes2mimeTypeMap.put("xml",mime);
345     suffixes2mimeTypeMap.put("xhtm",mime);
346     suffixes2mimeTypeMap.put("xhtml",mime);
347     // Register magic numbers for this mime type
348     magic2mimeTypeMap.put("<?xml",mime);
349     // Set the mimeType for this language resource
350     setMimeType(mime);
351     return this;
352   }// init()
353 
354 }//class XmlDocumentFormat
355