1   /*
2    *  Copyright (c) 1998-2005, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
 *  MSWordDocumentFormat.java
10   *
11   *  Ting Wang, Valentin Tablan, 14-Feb-2005, 
12   *
13   *  $Id: MSWordDocumentFormat.java,v 1.2 2005/02/14 16:32:30 valyt Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.net.URL;
19  import gate.*;
20  import gate.Document;
21  import gate.DocumentFormat;
22  import gate.creole.ResourceInstantiationException;
23  import gate.util.DocumentFormatException;
24  
25  // Addition by Ting Wang
26  import org.textmining.text.extraction.WordExtractor;
27  import java.io.*;
28  
29  /**
30   */
31  public class MSWordDocumentFormat extends DocumentFormat{
32  
33  
34    /**
35     * Initialise this resource, and return it.
36     * Registers this format unpacker with the system.
37     */
38    public Resource init() throws ResourceInstantiationException{
39      // Register plain text mime type
40      MimeType mime = new MimeType("application"," msword"); 
41      // Register the class handler for this mime type
42      mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
43                                                                            this);
44      // Register the mime type with mine string
45      mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
46      // Register file sufixes for this mime type
47      suffixes2mimeTypeMap.put("doc", mime);
48      // Set the mimeType for this language resource
49      setMimeType(mime);
50      return this;
51    } // init()
52  
53  
54    /**
55     * The MSWord Document Format does not support repositioning info.
56     * @return false.
57     */
58    public Boolean supportsRepositioning() {
59      return new Boolean(false);
60    } // supportsRepositioning
61  
62  
63    /** Unpack the markup in the document. This converts markup from the
64     * native format (e.g. XML, RTF) into annotations in GATE format.
65     * Uses the markupElementsMap to determine which elements to convert, and
66     * what annotation type names to use.
67     */
68    public void unpackMarkup(Document doc)
69                                       throws DocumentFormatException{
70      //get the original file
71      URL fileURL = doc.getSourceUrl();
72      if(fileURL == null) throw new DocumentFormatException(
73      "Unpacking MS Word files requires an URL to the original content!");
74      
75      //parse the original file into a text
76      String extractedContent = null;
77  
78      //Implement the MSWord unpacking.
79      try {
80        // get an Input stream from the gate document
81        InputStream in = fileURL.openStream();
82  
83        // create a MSWord Text Extractor
84        WordExtractor extractor = new WordExtractor();
85        // extract the text from the stream
86        extractedContent = extractor.extractText(in);
87        in.close();
88      } catch (Exception e){
89        throw new DocumentFormatException("Exception for " +
90                                          doc.getSourceUrl().toExternalForm(),e);
91      }
92      //set the content on the document
93      doc.setContent(new DocumentContentImpl(extractedContent));
94    }
95  
96  
97    public void unpackMarkup(Document doc, RepositioningInfo repInfo,
98                                         RepositioningInfo ampCodingInfo)
99                                       throws DocumentFormatException{
100     unpackMarkup(doc);
101   }
102 
103 
104 }
105