gate.corpora.HtmlDocumentFormat (Java2HTML)

1   /*
2    *  HtmlDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/May/2000
12   *
13   *  $Id: HtmlDocumentFormat.java,v 1.34 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.*;
19  import java.net.URLConnection;
20  
21  import javax.swing.text.html.HTMLEditorKit;
22  import javax.swing.text.html.parser.ParserDelegator;
23  
24  import gate.Document;
25  import gate.Resource;
26  import gate.creole.ResourceInstantiationException;
27  import gate.event.StatusListener;
28  import gate.html.HtmlDocumentHandler;
29  import gate.util.DocumentFormatException;
30  
31  //import org.w3c.www.mime.*;
32  
33  /** The format of Documents. Subclasses of DocumentFormat know about
34    * particular MIME types and how to unpack the information in any
35    * markup or formatting they contain into GATE annotations. Each MIME
36    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
37    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
38    * with a static index residing here when they are constructed. Static
39    * getDocumentFormat methods can then be used to get the appropriate
40    * format class for a particular document.
41    */
42  public class HtmlDocumentFormat extends TextualDocumentFormat
43  {
44  
45    /** Debug flag */
46    private static final boolean DEBUG = false;
47  
48    /** Default construction */
49    public HtmlDocumentFormat() { super(); }
50  
51    /** We could collect repositioning information during XML parsing */
52    public Boolean supportsRepositioning() {
53      return new Boolean(true);
54    } // supportsRepositioning
55  
56    /** Old style of unpackMarkup (without collecting of RepositioningInfo) */
57    public void unpackMarkup(Document doc) throws DocumentFormatException {
58      unpackMarkup(doc, (RepositioningInfo) null, (RepositioningInfo) null);
59    } // unpackMarkup
60  
61    /** Unpack the markup in the document. This converts markup from the
62      * native format (e.g. HTML) into annotations in GATE format.
63      * Uses the markupElementsMap to determine which elements to convert, and
64      * what annotation type names to use.
65      * It always tryes to parse te doc's content. It doesn't matter if the
66      * sourceUrl is null or not.
67      *
68      * @param doc The gate document you want to parse.
69      *
70      */
71    public void unpackMarkup(Document doc, RepositioningInfo repInfo,
72                RepositioningInfo ampCodingInfo) throws DocumentFormatException{
73      Reader                reader = null;
74      URLConnection         conn = null;
75      PrintWriter           out = null;
76      HTMLEditorKit.Parser  parser = new ParserDelegator();
77  
78      if ( doc == null || doc.getContent() == null ){
79        throw new DocumentFormatException(
80                 "GATE document is null or no content found. Nothing to parse!");
81      }// End if
82  
83      reader = new InputStreamReader(
84               new ByteArrayInputStream(doc.getContent().toString().getBytes()));
85  
86      // create a new Htmldocument handler
87      HtmlDocumentHandler htmlDocHandler = new
88                             HtmlDocumentHandler(doc, this.markupElementsMap);
89      // Create a Status Listener
90      StatusListener statusListener = new StatusListener(){
91        public void statusChanged(String text){
92          fireStatusChanged(text);
93        }
94      };
95      // Register the listener with htmlDocHandler
96      htmlDocHandler.addStatusListener(statusListener);
97      // set repositioning object
98      htmlDocHandler.setRepositioningInfo(repInfo);
99      // set the object with ampersand coding positions
100     htmlDocHandler.setAmpCodingInfo(ampCodingInfo);
101 
102     try{
103       // parse the HTML document
104       parser.parse(reader, htmlDocHandler, true);
105     } catch (IOException e){
106       throw new DocumentFormatException(e);
107     }finally{
108       if (htmlDocHandler != null)
109         htmlDocHandler.removeStatusListener(statusListener);
110     }// End try
111   }//unpackMarkup(doc)
112 
113   /** Initialise this resource, and return it. */
114   public Resource init() throws ResourceInstantiationException{
115     // Register HTML mime type
116     MimeType mime = new MimeType("text","html");
117     // Register the class handler for this mime type
118     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
119                                                                           this);
120     // Register the mime type with mine string
121     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
122     // Register file sufixes for this mime type
123     suffixes2mimeTypeMap.put("html",mime);
124     suffixes2mimeTypeMap.put("htm",mime);
125     // Register magic numbers for this mime type
126     magic2mimeTypeMap.put("<html",mime);
127     // Set the mimeType for this language resource
128     setMimeType(mime);
129     return this;
130   }// init()
131 }// class HtmlDocumentFormat
132