| RtfDocumentFormat.java |
1 /*
2 * RtfDocumentFormat.java
3 *
4 * Copyright (c) 1998-2005, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Cristian URSU, 26/July/2000
12 *
13 * $Id: RtfDocumentFormat.java,v 1.21 2005/02/14 16:32:30 valyt Exp $
14 */
15
16 package gate.corpora;
17
18 import java.io.*;
19
20 import javax.swing.text.*;
21 import javax.swing.text.rtf.RTFEditorKit;
22
23 import gate.Resource;
24 import gate.creole.ResourceInstantiationException;
25 import gate.util.DocumentFormatException;
26 //import org.w3c.www.mime.*;
27
/** The document format for RTF (MIME type text/rtf). It reads the RTF
 * source with the Swing RTFEditorKit and replaces the content of the GATE
 * document with the plain text that the kit extracts, i.e. the text without
 * the RTF markup. When initialised, an instance registers itself as the
 * handler for the "text/rtf" MIME type, for the "rtf" file suffix and for
 * the "{\rtf1" magic number, so that the generic DocumentFormat lookup
 * can select it for RTF documents.
 */
37 public class RtfDocumentFormat extends TextualDocumentFormat{
38
39 /** Debug flag */
40 private static final boolean DEBUG = false;
41
42 /** Default construction */
43 public RtfDocumentFormat() { super(); }
44
45 /** Unpack the markup in the document. This converts markup from the
46 * native format (e.g.RTF) into annotations in GATE format.
47 * Uses the markupElementsMap to determine which elements to convert, and
48 * what annotation type names to use.
49 * It always tryes to parse te doc's content. It doesn't matter if the
50 * sourceUrl is null or not.
51 *
52 * @param doc The gate document you want to parse.
53 *
54 */
55 public void unpackMarkup(gate.Document doc) throws DocumentFormatException {
56
57 if ( (doc == null) ||
58 (doc.getSourceUrl() == null && doc.getContent() == null)){
59
60 throw new DocumentFormatException(
61 "GATE document is null or no content found. Nothing to parse!");
62 }// End if
63
64 // create a RTF editor kit
65 RTFEditorKit aRtfEditorkit = new RTFEditorKit();
66
67 // create a Styled Document
68 // NOTE that RTF Kit works only with Systled Document interface
69 StyledDocument styledDoc = new DefaultStyledDocument();
70
71 try {
72 // get an Input stream from the gate document
73 InputStream in = (doc.getSourceUrl() == null) ?
74 (new ByteArrayInputStream(doc.getContent().toString().getBytes())) :
75 doc.getSourceUrl().openStream();
76 aRtfEditorkit.read(in, styledDoc, 0);
77 // replace the document content with the one without markups
78 doc.setContent(new DocumentContentImpl(
79 styledDoc.getText(0,styledDoc.getLength())
80 )
81 );
82 } catch (BadLocationException e) {
83 throw new DocumentFormatException(e);
84 } catch (IOException e){
85 throw new DocumentFormatException("I/O exception for " +
86 doc.getSourceUrl().toExternalForm(),e);
87 }
88 } // unpackMarkup(doc)
89
90 /** Initialise this resource, and return it. */
91 public Resource init() throws ResourceInstantiationException{
92 // Register RTF mime type
93 MimeType mime = new MimeType("text","rtf");
94 // Register the class handler for this mime type
95 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
96 this);
97 // Register the mime type with mine string
98 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
99 // Register file sufixes for this mime type
100 suffixes2mimeTypeMap.put("rtf",mime);
101 // Register magic numbers for this mime type
102 magic2mimeTypeMap.put("{\\rtf1",mime);
103 // Set the mimeType for this language resource
104 setMimeType(mime);
105 return this;
106 }// init()
107 }// class RtfDocumentFormat
108