1
15
16 package gate.corpora;
17
18 import java.net.URL;
19 import gate.*;
20 import gate.Document;
21 import gate.DocumentFormat;
22 import gate.creole.ResourceInstantiationException;
23 import gate.util.DocumentFormatException;
24
25 import org.pdfbox.pdfparser.*;
26 import org.pdfbox.pdmodel.PDDocument;
27 import org.pdfbox.util.*;
28 import java.io.*;
29
30
32 public class PdfDocumentFormat extends DocumentFormat{
33
34
35
39 public Resource init() throws ResourceInstantiationException{
40 MimeType mime = new MimeType("application","pdf");
42 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
44 this);
45 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
47 suffixes2mimeTypeMap.put("pdf",mime);
49 setMimeType(mime);
51 return this;
52 }
54
55
59 public Boolean supportsRepositioning() {
60 return new Boolean(false);
61 }
63
64
69 public void unpackMarkup(Document doc) throws DocumentFormatException{
70 URL fileURL = doc.getSourceUrl();
72 if(fileURL == null) throw new DocumentFormatException(
73 "Unpacking PDF files requires an URL to the original content!");
74
75 InputStream in = null;
76 PDDocument document = null;
77 try {
79 in = fileURL.openStream();
81 PDFTextStripper pdfStripper = new PDFTextStripper();
83
84 document = PDDocument.load(in);
85
86 String extractedContent = pdfStripper.getText(document);
87 doc.setContent(new DocumentContentImpl(extractedContent));
89 } catch (IOException e){
90 throw new DocumentFormatException("I/O exception for " +
91 doc.getSourceUrl().toExternalForm(),
92 e);
93 }finally{
94 try{
95 if(document != null) document.close();
96 }catch(IOException ioe){
97 }
99 try{
100 if(in != null ) in.close();
101 }catch(IOException ioe){
102 }
104
105 }
106 }
107
108
109 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
110 RepositioningInfo ampCodingInfo)
111 throws DocumentFormatException{
112 unpackMarkup(doc);
113 }
114
115
116 }
117