1
15
16 package gate.corpora;
17
18 import java.io.IOException;
19
20 import javax.xml.parsers.*;
21
22 import org.xml.sax.SAXException;
23
24 import gate.Document;
25 import gate.Resource;
26 import gate.creole.ResourceInstantiationException;
27 import gate.event.StatusListener;
28 import gate.sgml.Sgml2Xml;
29 import gate.util.DocumentFormatException;
30 import gate.xml.XmlDocumentHandler;
31
32
41 public class SgmlDocumentFormat extends TextualDocumentFormat
42 {
43
44 private static final boolean DEBUG = false;
45
46
47 public SgmlDocumentFormat() { super(); }
48
49
60 public void unpackMarkup(Document doc) throws DocumentFormatException{
61 if ( (doc == null) ||
62 (doc.getSourceUrl() == null && doc.getContent() == null)){
63
64 throw new DocumentFormatException(
65 "GATE document is null or no content found. Nothing to parse!");
66 } StatusListener statusListener = new StatusListener(){
69 public void statusChanged(String text){
70 fireStatusChanged(text);
71 }
72 };
73 XmlDocumentHandler xmlDocHandler = null;
74 try {
75 Sgml2Xml sgml2Xml = new Sgml2Xml(doc);
76
77 fireStatusChanged("Performing SGML to XML...");
78
79 String xmlUri = sgml2Xml.convert();
81
82 fireStatusChanged("DONE !");
83
84 SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
88
90 saxParserFactory.setValidating(false);
93 saxParserFactory.setNamespaceAware(true);
95
96 SAXParser parser = saxParserFactory.newSAXParser();
98
99 if (null != doc){
101 xmlDocHandler = new XmlDocumentHandler(doc,
103 this.markupElementsMap,
104 this.element2StringMap);
105
106 xmlDocHandler.addStatusListener(statusListener);
108
109 parser.parse(xmlUri, xmlDocHandler);
110 ((DocumentImpl) doc).setNextAnnotationId(
111 xmlDocHandler.getCustomObjectsId());
112 } } catch (ParserConfigurationException e){
114 throw
115 new DocumentFormatException("XML parser configuration exception ", e);
116 } catch (SAXException e){
117 throw new DocumentFormatException(e);
118 } catch (IOException e){
119 throw new DocumentFormatException("I/O exception for " +
120 doc.getSourceUrl().toString());
121 }finally{
122 if (xmlDocHandler != null)
123 xmlDocHandler.removeStatusListener(statusListener);
124 }
126 }
128
129 private String sgml2Xml(Document doc) {
130 String xmlUri = doc.getSourceUrl().toString ();
131
132 return xmlUri;
133 }
135
136 public Resource init() throws ResourceInstantiationException{
137 MimeType mime = new MimeType("text","sgml");
139 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
141 this);
142 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
144 suffixes2mimeTypeMap.put("sgm",mime);
146 suffixes2mimeTypeMap.put("sgml",mime);
147 setMimeType(mime);
148 return this;
149 }
151 }