1
15
16 package gate.corpora;
17
18 import java.io.*;
19 import java.net.URLConnection;
20
21 import javax.swing.text.html.HTMLEditorKit;
22 import javax.swing.text.html.parser.ParserDelegator;
23
24 import gate.Document;
25 import gate.Resource;
26 import gate.creole.ResourceInstantiationException;
27 import gate.event.StatusListener;
28 import gate.html.HtmlDocumentHandler;
29 import gate.util.DocumentFormatException;
30
31
33
42 public class HtmlDocumentFormat extends TextualDocumentFormat
43 {
44
45
46 private static final boolean DEBUG = false;
47
48
49 public HtmlDocumentFormat() { super(); }
50
51
52 public Boolean supportsRepositioning() {
53 return new Boolean(true);
54 }
56
57 public void unpackMarkup(Document doc) throws DocumentFormatException {
58 unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null);
59 }
61
71 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
72 RepositioningInfo ampCodingInfo) throws DocumentFormatException{
73 Reader reader = null;
74 URLConnection conn = null;
75 PrintWriter out = null;
76 HTMLEditorKit.Parser parser = new ParserDelegator();
77
78 if ( doc == null || doc.getContent() == null ){
79 throw new DocumentFormatException(
80 "GATE document is null or no content found. Nothing to parse!");
81 }
83 reader = new InputStreamReader(
84 new ByteArrayInputStream(doc.getContent().toString().getBytes()));
85
86 HtmlDocumentHandler htmlDocHandler = new
88 HtmlDocumentHandler(doc, this.markupElementsMap);
89 StatusListener statusListener = new StatusListener(){
91 public void statusChanged(String text){
92 fireStatusChanged(text);
93 }
94 };
95 htmlDocHandler.addStatusListener(statusListener);
97 htmlDocHandler.setRepositioningInfo(repInfo);
99 htmlDocHandler.setAmpCodingInfo(ampCodingInfo);
101
102 try{
103 parser.parse(reader, htmlDocHandler, true);
105 } catch (IOException e){
106 throw new DocumentFormatException(e);
107 }finally{
108 if (htmlDocHandler != null)
109 htmlDocHandler.removeStatusListener(statusListener);
110 } }
113
114 public Resource init() throws ResourceInstantiationException{
115 MimeType mime = new MimeType("text","html");
117 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
119 this);
120 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
122 suffixes2mimeTypeMap.put("html",mime);
124 suffixes2mimeTypeMap.put("htm",mime);
125 magic2mimeTypeMap.put("<html",mime);
127 setMimeType(mime);
129 return this;
130 }}