1
15
16 package gate.corpora;
17
18 import java.net.URL;
19 import gate.*;
20 import gate.Document;
21 import gate.DocumentFormat;
22 import gate.creole.ResourceInstantiationException;
23 import gate.util.DocumentFormatException;
24
25 import org.textmining.text.extraction.WordExtractor;
27 import java.io.*;
28
29
31 public class MSWordDocumentFormat extends DocumentFormat{
32
33
34
38 public Resource init() throws ResourceInstantiationException{
39 MimeType mime = new MimeType("application"," msword");
41 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
43 this);
44 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
46 suffixes2mimeTypeMap.put("doc", mime);
48 setMimeType(mime);
50 return this;
51 }
53
54
58 public Boolean supportsRepositioning() {
59 return new Boolean(false);
60 }
62
63
68 public void unpackMarkup(Document doc)
69 throws DocumentFormatException{
70 URL fileURL = doc.getSourceUrl();
72 if(fileURL == null) throw new DocumentFormatException(
73 "Unpacking MS Word files requires an URL to the original content!");
74
75 String extractedContent = null;
77
78 try {
80 InputStream in = fileURL.openStream();
82
83 WordExtractor extractor = new WordExtractor();
85 extractedContent = extractor.extractText(in);
87 in.close();
88 } catch (Exception e){
89 throw new DocumentFormatException("Exception for " +
90 doc.getSourceUrl().toExternalForm(),e);
91 }
92 doc.setContent(new DocumentContentImpl(extractedContent));
94 }
95
96
97 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
98 RepositioningInfo ampCodingInfo)
99 throws DocumentFormatException{
100 unpackMarkup(doc);
101 }
102
103
104 }
105