| MSWordDocumentFormat.java |
1 /*
2 * Copyright (c) 1998-2005, The University of Sheffield.
3 *
4 * This file is part of GATE (see http://gate.ac.uk/), and is free
5 * software, licenced under the GNU Library General Public License,
6 * Version 2, June 1991 (in the distribution as file licence.html,
7 * and also available at http://gate.ac.uk/gate/licence.html).
8 *
9 * MSWordDocumentFormat.java
10 *
11 * Ting Wang, Valentin Tablan, 14-Feb-2005,
12 *
13 * $Id: MSWordDocumentFormat.java,v 1.2 2005/02/14 16:32:30 valyt Exp $
14 */
15
16 package gate.corpora;
17
18 import java.net.URL;
19 import gate.*;
20 import gate.Document;
21 import gate.DocumentFormat;
22 import gate.creole.ResourceInstantiationException;
23 import gate.util.DocumentFormatException;
24
25 // Addition by Ting Wang
26 import org.textmining.text.extraction.WordExtractor;
27 import java.io.*;
28
29 /**
30 */
31 public class MSWordDocumentFormat extends DocumentFormat{
32
33
34 /**
35 * Initialise this resource, and return it.
36 * Registers this format unpacker with the system.
37 */
38 public Resource init() throws ResourceInstantiationException{
39 // Register plain text mime type
40 MimeType mime = new MimeType("application"," msword");
41 // Register the class handler for this mime type
42 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
43 this);
44 // Register the mime type with mine string
45 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
46 // Register file sufixes for this mime type
47 suffixes2mimeTypeMap.put("doc", mime);
48 // Set the mimeType for this language resource
49 setMimeType(mime);
50 return this;
51 } // init()
52
53
54 /**
55 * The MSWord Document Format does not support repositioning info.
56 * @return false.
57 */
58 public Boolean supportsRepositioning() {
59 return new Boolean(false);
60 } // supportsRepositioning
61
62
63 /** Unpack the markup in the document. This converts markup from the
64 * native format (e.g. XML, RTF) into annotations in GATE format.
65 * Uses the markupElementsMap to determine which elements to convert, and
66 * what annotation type names to use.
67 */
68 public void unpackMarkup(Document doc)
69 throws DocumentFormatException{
70 //get the original file
71 URL fileURL = doc.getSourceUrl();
72 if(fileURL == null) throw new DocumentFormatException(
73 "Unpacking MS Word files requires an URL to the original content!");
74
75 //parse the original file into a text
76 String extractedContent = null;
77
78 //Implement the MSWord unpacking.
79 try {
80 // get an Input stream from the gate document
81 InputStream in = fileURL.openStream();
82
83 // create a MSWord Text Extractor
84 WordExtractor extractor = new WordExtractor();
85 // extract the text from the stream
86 extractedContent = extractor.extractText(in);
87 in.close();
88 } catch (Exception e){
89 throw new DocumentFormatException("Exception for " +
90 doc.getSourceUrl().toExternalForm(),e);
91 }
92 //set the content on the document
93 doc.setContent(new DocumentContentImpl(extractedContent));
94 }
95
96
97 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
98 RepositioningInfo ampCodingInfo)
99 throws DocumentFormatException{
100 unpackMarkup(doc);
101 }
102
103
104 }
105