RtfDocumentFormat.java |
1 /* 2 * RtfDocumentFormat.java 3 * 4 * Copyright (c) 1998-2005, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/July/2000 12 * 13 * $Id: RtfDocumentFormat.java,v 1.21 2005/02/14 16:32:30 valyt Exp $ 14 */ 15 16 package gate.corpora; 17 18 import java.io.*; 19 20 import javax.swing.text.*; 21 import javax.swing.text.rtf.RTFEditorKit; 22 23 import gate.Resource; 24 import gate.creole.ResourceInstantiationException; 25 import gate.util.DocumentFormatException; 26 //import org.w3c.www.mime.*; 27 28 /** The format of Documents. Subclasses of DocumentFormat know about 29 * particular MIME types and how to unpack the information in any 30 * markup or formatting they contain into GATE annotations. Each MIME 31 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 32 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 33 * with a static index residing here when they are constructed. Static 34 * getDocumentFormat methods can then be used to get the appropriate 35 * format class for a particular document. 36 */ 37 public class RtfDocumentFormat extends TextualDocumentFormat{ 38 39 /** Debug flag */ 40 private static final boolean DEBUG = false; 41 42 /** Default construction */ 43 public RtfDocumentFormat() { super(); } 44 45 /** Unpack the markup in the document. This converts markup from the 46 * native format (e.g.RTF) into annotations in GATE format. 47 * Uses the markupElementsMap to determine which elements to convert, and 48 * what annotation type names to use. 49 * It always tryes to parse te doc's content. It doesn't matter if the 50 * sourceUrl is null or not. 51 * 52 * @param doc The gate document you want to parse. 53 * 54 */ 55 public void unpackMarkup(gate.Document doc) throws DocumentFormatException { 56 57 if ( (doc == null) || 58 (doc.getSourceUrl() == null && doc.getContent() == null)){ 59 60 throw new DocumentFormatException( 61 "GATE document is null or no content found. Nothing to parse!"); 62 }// End if 63 64 // create a RTF editor kit 65 RTFEditorKit aRtfEditorkit = new RTFEditorKit(); 66 67 // create a Styled Document 68 // NOTE that RTF Kit works only with Systled Document interface 69 StyledDocument styledDoc = new DefaultStyledDocument(); 70 71 try { 72 // get an Input stream from the gate document 73 InputStream in = (doc.getSourceUrl() == null) ? 74 (new ByteArrayInputStream(doc.getContent().toString().getBytes())) : 75 doc.getSourceUrl().openStream(); 76 aRtfEditorkit.read(in, styledDoc, 0); 77 // replace the document content with the one without markups 78 doc.setContent(new DocumentContentImpl( 79 styledDoc.getText(0,styledDoc.getLength()) 80 ) 81 ); 82 } catch (BadLocationException e) { 83 throw new DocumentFormatException(e); 84 } catch (IOException e){ 85 throw new DocumentFormatException("I/O exception for " + 86 doc.getSourceUrl().toExternalForm(),e); 87 } 88 } // unpackMarkup(doc) 89 90 /** Initialise this resource, and return it. */ 91 public Resource init() throws ResourceInstantiationException{ 92 // Register RTF mime type 93 MimeType mime = new MimeType("text","rtf"); 94 // Register the class handler for this mime type 95 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 96 this); 97 // Register the mime type with mine string 98 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 99 // Register file sufixes for this mime type 100 suffixes2mimeTypeMap.put("rtf",mime); 101 // Register magic numbers for this mime type 102 magic2mimeTypeMap.put("{\\rtf1",mime); 103 // Set the mimeType for this language resource 104 setMimeType(mime); 105 return this; 106 }// init() 107 }// class RtfDocumentFormat 108