1   /*
2    *  SgmlDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 4/July/2000
12   *
13   *  $Id: SgmlDocumentFormat.java,v 1.32 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.IOException;
19  
20  import javax.xml.parsers.*;
21  
22  import org.xml.sax.SAXException;
23  
24  import gate.Document;
25  import gate.Resource;
26  import gate.creole.ResourceInstantiationException;
27  import gate.event.StatusListener;
28  import gate.sgml.Sgml2Xml;
29  import gate.util.DocumentFormatException;
30  import gate.xml.XmlDocumentHandler;
31  
32  /** The format of Documents. Subclasses of DocumentFormat know about
33    * particular MIME types and how to unpack the information in any
34    * markup or formatting they contain into GATE annotations. Each MIME
35    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
36    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
37    * with a static index residing here when they are constructed. Static
38    * getDocumentFormat methods can then be used to get the appropriate
39    * format class for a particular document.
40    */
41  public class SgmlDocumentFormat extends TextualDocumentFormat
42  {
43    /** Debug flag */
44    private static final boolean DEBUG = false;
45  
46    /** Default construction */
47    public SgmlDocumentFormat() { super(); }
48  
49    /** Unpack the markup in the document. This converts markup from the
50      * native format (e.g. SGML) into annotations in GATE format.
51      * Uses the markupElementsMap to determine which elements to convert, and
52      * what annotation type names to use.
53      * The doc's content is first converted to a wel formed XML.
54      * If this succeddes then the document is saved into a temp file and parsed
55      * as an XML document.
56      *
57      * @param doc The gate document you want to parse.
58      *
59      */
60    public void unpackMarkup(Document doc) throws DocumentFormatException{
61      if ( (doc == null) ||
62           (doc.getSourceUrl() == null && doc.getContent() == null)){
63  
64        throw new DocumentFormatException(
65                 "GATE document is null or no content found. Nothing to parse!");
66      }// End if
67      // Create a status listener
68      StatusListener statusListener = new StatusListener(){
69              public void statusChanged(String text){
70                fireStatusChanged(text);
71              }
72      };
73      XmlDocumentHandler xmlDocHandler = null;
74      try {
75        Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
76  
77        fireStatusChanged("Performing SGML to XML...");
78  
79        // convert the SGML document
80        String xmlUri = sgml2Xml.convert();
81  
82        fireStatusChanged("DONE !");
83  
84        //Out.println("Conversion done..." + xmlUri);
85        //Out.println(sgml2Xml.convert());
86        // Get a parser factory.
87        SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
88        // Set up the factory to create the appropriate type of parser
89  
90        // Set up the factory to create the appropriate type of parser
91        // non validating one
92        saxParserFactory.setValidating(false);
93        // non namesapace aware one
94        saxParserFactory.setNamespaceAware(true);
95  
96        // Create a SAX parser
97        SAXParser parser = saxParserFactory.newSAXParser();
98  
99        // use it
100       if (null != doc){
101         // create a new Xml document handler
102         xmlDocHandler = new XmlDocumentHandler(doc,
103                                                this.markupElementsMap,
104                                                this.element2StringMap);
105 
106         // register a status listener with it
107         xmlDocHandler.addStatusListener(statusListener);
108 
109         parser.parse(xmlUri, xmlDocHandler);
110         ((DocumentImpl) doc).setNextAnnotationId(
111                                           xmlDocHandler.getCustomObjectsId());
112      }// end if
113     } catch (ParserConfigurationException e){
114         throw
115         new DocumentFormatException("XML parser configuration exception ", e);
116     } catch (SAXException e){
117         throw new DocumentFormatException(e);
118     } catch (IOException e){
119         throw new DocumentFormatException("I/O exception for " +
120                                       doc.getSourceUrl().toString());
121     }finally{
122       if (xmlDocHandler != null)
123         xmlDocHandler.removeStatusListener(statusListener);
124     }// End try
125 
126   }// unpackMarkup
127 
128   /** This method converts the document's content from SGML 2 XML.*/
129   private String sgml2Xml(Document doc) {
130     String xmlUri = doc.getSourceUrl().toString ();
131 
132     return xmlUri;
133   }// sgml2Xml()
134 
135   /** Initialise this resource, and return it. */
136   public Resource init() throws ResourceInstantiationException{
137     // Register SGML mime type
138     MimeType mime = new MimeType("text","sgml");
139     // Register the class handler for this mime type
140     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
141                                                                           this);
142     // Register the mime type with mine string
143     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
144     // Register file sufixes for this mime type
145     suffixes2mimeTypeMap.put("sgm",mime);
146     suffixes2mimeTypeMap.put("sgml",mime);
147     setMimeType(mime);
148     return this;
149   }// init
150 
151 }//class SgmlDocumentFormat
152