1   /*
2    *  Copyright (c) 1998-2005, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  PdfDocumentFormat.java
10   *
11   *  Ting Wang, Valentin Tablan, 14-Feb-2005, 
12   *
13   *  $Id: PdfDocumentFormat.java,v 1.5 2005/02/14 16:44:15 valyt Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.net.URL;
19  import gate.*;
20  import gate.Document;
21  import gate.DocumentFormat;
22  import gate.creole.ResourceInstantiationException;
23  import gate.util.DocumentFormatException;
24  
25  import org.pdfbox.pdfparser.*;
26  import org.pdfbox.pdmodel.PDDocument;
27  import org.pdfbox.util.*;
28  import java.io.*;
29  
30  /**
31   */
32  public class PdfDocumentFormat extends DocumentFormat{
33  
34  
35    /**
36     * Initialise this resource, and return it.
37     * Registers this format unpacker with the system.
38     */
39    public Resource init() throws ResourceInstantiationException{
40      // Register plain text mime type
41      MimeType mime = new MimeType("application","pdf");
42      // Register the class handler for this mime type
43      mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
44                                                                            this);
45      // Register the mime type with mine string
46      mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
47      // Register file sufixes for this mime type
48      suffixes2mimeTypeMap.put("pdf",mime);
49      // Set the mimeType for this language resource
50      setMimeType(mime);
51      return this;
52    } // init()
53  
54  
55    /**
56     * The PDF Document Format does not support repositioning info.
57     * @return false.
58     */
59    public Boolean supportsRepositioning() {
60      return new Boolean(false);
61    } // supportsRepositioning
62  
63  
64    /** Unpack the markup in the document. This converts markup from the
65     * native format (e.g. XML, RTF) into annotations in GATE format.
66     * Uses the markupElementsMap to determine which elements to convert, and
67     * what annotation type names to use.
68     */
69    public void unpackMarkup(Document doc) throws DocumentFormatException{
70      //get the original file
71      URL fileURL = doc.getSourceUrl();
72      if(fileURL == null) throw new DocumentFormatException(
73              "Unpacking PDF files requires an URL to the original content!");
74      
75      InputStream in = null;
76      PDDocument document = null;
77      //Implement the PDF unpacking.
78      try {
79        // get an Input stream from the gate document
80        in = fileURL.openStream();
81        // create a PDF Text Stripper
82        PDFTextStripper pdfStripper = new PDFTextStripper();
83        
84        document = PDDocument.load(in);
85        
86        String extractedContent = pdfStripper.getText(document);
87        //set the content on the document
88        doc.setContent(new DocumentContentImpl(extractedContent));
89      } catch (IOException e){
90        throw new DocumentFormatException("I/O exception for " +
91                                          doc.getSourceUrl().toExternalForm(), 
92                                          e);
93      }finally{
94        try{
95          if(document != null) document.close();
96        }catch(IOException ioe){
97          //give up
98        }
99        try{
100         if(in != null ) in.close();
101       }catch(IOException ioe){
102         //give up
103       }
104       
105     }
106   }
107 
108 
109   public void unpackMarkup(Document doc, RepositioningInfo repInfo,
110                                        RepositioningInfo ampCodingInfo)
111                                      throws DocumentFormatException{
112     unpackMarkup(doc);
113   }
114 
115 
116 }
117