| PdfDocumentFormat.java |
1 /*
2 * Copyright (c) 1998-2005, The University of Sheffield.
3 *
4 * This file is part of GATE (see http://gate.ac.uk/), and is free
5 * software, licenced under the GNU Library General Public License,
6 * Version 2, June 1991 (in the distribution as file licence.html,
7 * and also available at http://gate.ac.uk/gate/licence.html).
8 *
9 * PdfDocumentFormat.java
10 *
11 * Ting Wang, Valentin Tablan, 14-Feb-2005,
12 *
13 * $Id: PdfDocumentFormat.java,v 1.5 2005/02/14 16:44:15 valyt Exp $
14 */
15
16 package gate.corpora;
17
18 import java.net.URL;
19 import gate.*;
20 import gate.Document;
21 import gate.DocumentFormat;
22 import gate.creole.ResourceInstantiationException;
23 import gate.util.DocumentFormatException;
24
25 import org.pdfbox.pdfparser.*;
26 import org.pdfbox.pdmodel.PDDocument;
27 import org.pdfbox.util.*;
28 import java.io.*;
29
30 /**
31 */
32 public class PdfDocumentFormat extends DocumentFormat{
33
34
35 /**
36 * Initialise this resource, and return it.
37 * Registers this format unpacker with the system.
38 */
39 public Resource init() throws ResourceInstantiationException{
40 // Register plain text mime type
41 MimeType mime = new MimeType("application","pdf");
42 // Register the class handler for this mime type
43 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
44 this);
45 // Register the mime type with mine string
46 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
47 // Register file sufixes for this mime type
48 suffixes2mimeTypeMap.put("pdf",mime);
49 // Set the mimeType for this language resource
50 setMimeType(mime);
51 return this;
52 } // init()
53
54
55 /**
56 * The PDF Document Format does not support repositioning info.
57 * @return false.
58 */
59 public Boolean supportsRepositioning() {
60 return new Boolean(false);
61 } // supportsRepositioning
62
63
64 /** Unpack the markup in the document. This converts markup from the
65 * native format (e.g. XML, RTF) into annotations in GATE format.
66 * Uses the markupElementsMap to determine which elements to convert, and
67 * what annotation type names to use.
68 */
69 public void unpackMarkup(Document doc) throws DocumentFormatException{
70 //get the original file
71 URL fileURL = doc.getSourceUrl();
72 if(fileURL == null) throw new DocumentFormatException(
73 "Unpacking PDF files requires an URL to the original content!");
74
75 InputStream in = null;
76 PDDocument document = null;
77 //Implement the PDF unpacking.
78 try {
79 // get an Input stream from the gate document
80 in = fileURL.openStream();
81 // create a PDF Text Stripper
82 PDFTextStripper pdfStripper = new PDFTextStripper();
83
84 document = PDDocument.load(in);
85
86 String extractedContent = pdfStripper.getText(document);
87 //set the content on the document
88 doc.setContent(new DocumentContentImpl(extractedContent));
89 } catch (IOException e){
90 throw new DocumentFormatException("I/O exception for " +
91 doc.getSourceUrl().toExternalForm(),
92 e);
93 }finally{
94 try{
95 if(document != null) document.close();
96 }catch(IOException ioe){
97 //give up
98 }
99 try{
100 if(in != null ) in.close();
101 }catch(IOException ioe){
102 //give up
103 }
104
105 }
106 }
107
108
109 public void unpackMarkup(Document doc, RepositioningInfo repInfo,
110 RepositioningInfo ampCodingInfo)
111 throws DocumentFormatException{
112 unpackMarkup(doc);
113 }
114
115
116 }
117