1   /*
2    *  EmailDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 3/Aug/2000
12   *
13   *  $Id: EmailDocumentFormat.java,v 1.29 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.IOException;
19  import java.util.Iterator;
20  
21  import gate.*;
22  import gate.creole.ResourceInstantiationException;
23  import gate.email.EmailDocumentHandler;
24  import gate.event.StatusListener;
25  import gate.util.DocumentFormatException;
26  import gate.util.InvalidOffsetException;
27  
28  //import org.w3c.www.mime.*;
29  
/** The document format for e-mail messages. Subclasses of DocumentFormat
  * know about particular MIME types and how to unpack the information in
  * any markup or formatting they contain into GATE annotations. Each MIME
  * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
  * RtfDocumentFormat, MpegDocumentFormat; this one handles the text/email
  * MIME type. It registers itself (MIME type, file suffixes and magic
  * number) in the inherited lookup maps when init() is called, so that the
  * static getDocumentFormat methods can then be used to get the appropriate
  * format class for a particular document.
  */
39  public class EmailDocumentFormat extends TextualDocumentFormat
40  {
41    /** Debug flag */
42    private static final boolean DEBUG = false;
43  
44    /** Default construction */
45    public EmailDocumentFormat() { super();}
46  
47    /** Unpack the markup in the document. This converts markup from the
48      * native format (e.g. EMAIL) into annotations in GATE format.
49      * Uses the markupElementsMap to determine which elements to convert, and
50      * what annotation type names to use.
51      * It always tryes to parse te doc's content. It doesn't matter if the
52      * sourceUrl is null or not.
53      *
54      * @param doc The gate document you want to parse.
55      *
56      */
57  
58    public void unpackMarkup(gate.Document doc) throws DocumentFormatException{
59      if ( (doc == null) ||
60           (doc.getSourceUrl() == null && doc.getContent() == null)){
61  
62        throw new DocumentFormatException(
63                 "GATE document is null or no content found. Nothing to parse!");
64      }// End if
65  
66      setNewLineProperty(doc);
67  
68      // create an EmailDocumentHandler
69      EmailDocumentHandler emailDocHandler = null;
70      emailDocHandler = new  gate.email.EmailDocumentHandler(
71                                                         doc,
72                                                         this.markupElementsMap,
73                                                         this.element2StringMap);
74      StatusListener statusListener = new StatusListener(){
75          public void statusChanged(String text) {
76            // this is implemented in DocumentFormat.java and inherited here
77            fireStatusChanged(text);
78          }//statusChanged(String text)
79      };
80      // Register a status listener with it
81      emailDocHandler.addStatusListener(statusListener);
82      try{
83        // Call the method that creates annotations on the gate document
84        emailDocHandler.annotateMessages();
85        // Process the body annotations and search for paragraphs
86        AnnotationSet bodyAnnotations = doc.getAnnotations(
87                      GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("body");
88        if (bodyAnnotations != null && !bodyAnnotations.isEmpty()){
89          Iterator iter = bodyAnnotations.iterator();
90          while(iter.hasNext()){
91            Annotation a = (Annotation)iter.next();
92            annotateParagraphs(doc,a.getStartNode().getOffset().intValue(),
93                                   a.getEndNode().getOffset().intValue(),
94                                   GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
95          }// End while
96        }// End if
97      } catch (IOException e){
98        throw new DocumentFormatException("Couldn't create a buffered reader ",e);
99      } catch (InvalidOffsetException e){
100       throw new DocumentFormatException(e);
101     }finally{
102       emailDocHandler.removeStatusListener(statusListener);
103     }// End try
104   }//unpackMarkup(doc)
105 
106   /** Initialise this resource, and return it. */
107   public Resource init() throws ResourceInstantiationException{
108     // Register EMAIL mime type
109     MimeType mime = new MimeType("text","email");
110     // Register the class handler for this mime type
111     mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
112                                                                           this);
113     // Register the mime type with mine string
114     mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
115     // Register file sufixes for this mime type
116     suffixes2mimeTypeMap.put("eml",mime);
117     suffixes2mimeTypeMap.put("email",mime);
118     suffixes2mimeTypeMap.put("mail",mime);
119     // Register magic numbers for this mime type
120     magic2mimeTypeMap.put("Subject:",mime);
121     // Set the mimeType for this language resource
122     setMimeType(mime);
123     return this;
124   }// init()
125 }// class EmailDocumentFormat
126 
127