1   /*
2    *  DocumentFormat.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 25/May/2000
12   *
13   *  $Id: DocumentFormat.java,v 1.52 2005/01/11 13:51:30 ian Exp $
14   */
15  
16  package gate;
17  
18  import java.io.*;
19  import java.net.URL;
20  import java.util.*;
21  
22  import gate.corpora.MimeType;
23  import gate.corpora.RepositioningInfo;
24  import gate.creole.AbstractLanguageResource;
25  import gate.event.StatusListener;
26  import gate.util.DocumentFormatException;
27  
28  /** The format of Documents. Subclasses of DocumentFormat know about
29    * particular MIME types and how to unpack the information in any
30    * markup or formatting they contain into GATE annotations. Each MIME
31    * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat,
32    * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves
33    * with a static index residing here when they are constructed. Static
34    * getDocumentFormat methods can then be used to get the appropriate
35    * format class for a particular document.
36    */
37  public abstract class DocumentFormat
38  extends AbstractLanguageResource implements LanguageResource{
39    /** Debug flag */
40    private static final boolean DEBUG = false;
41  
42    /** This fields indicates whether the document being processed is in a
43      * Gate XML custom format.
44      * Detection is done in runMagicNumbers().
45      */
46    protected static boolean isGateXmlDocument = false;
47  
48    /** The MIME type of this format. */
49    private MimeType mimeType = null;
50  
51    /** Map of MimeTypeString to ClassHandler class. This is used to find the
52      * language resource that deals with the specific Document format
53      */
54    protected static Map mimeString2ClassHandlerMap = new HashMap();
55    /** Map of MimeType to DocumentFormat Class. This is used to find the
56      * DocumentFormat subclass that deals with a particular MIME type.
57      */
58    protected static Map mimeString2mimeTypeMap = new HashMap();
59  
60    /** Map of Set of file suffixes to MimeType. This is used to figure
61      * out what MIME type a document is from its file name.
62      */
63    protected static Map suffixes2mimeTypeMap = new HashMap();
64  
65    /** Map of Set of magic numbers to MimeType. This is used to guess the
66      * MIME type of a document, when we don't have any other clues.
67      */
68    protected static Map magic2mimeTypeMap = new HashMap();
69  
70    /** Map of markup elements to annotation types. If it is null, the
71      * unpackMarkup() method will convert all markup, using the element names
72      * for annotation types. If it is non-null, only those elements specified
73      * here will be converted.
74      */
75    protected Map markupElementsMap = null;
76  
77    /** This map is used inside uppackMarkup() method...
78      * When an element from the map is encounted, The corresponding string
79      * element is added to the document content
80      */
81    protected Map element2StringMap = null;
82  
83    /** The features of this resource */
84    private FeatureMap features = null;
85  
86    /** Default construction */
87    public DocumentFormat() {}
88  
89    /** listeners for status report */
90    private transient Vector statusListeners;
91  
92    /** Flag for enable/disable collecting of repositioning information */
93    private Boolean shouldCollectRepositioning = new Boolean(false);
94  
95    /** If the document format could collect repositioning information
96     *  during the unpack phase this method will return <B>true</B>.
97     *  <BR>
98     *  You should override this method in the child class of the defined
99     *  document format if it could collect the repositioning information.
100    */
101   public Boolean supportsRepositioning() {
102     return new Boolean(false);
103   } // supportsRepositioning
104 
105   public void setShouldCollectRepositioning(Boolean b) {
106     if(supportsRepositioning().booleanValue() && b.booleanValue()) {
107       shouldCollectRepositioning = b;
108     }
109     else {
110       shouldCollectRepositioning = new Boolean(false);
111     } // if
112   } // setShouldCollectRepositioning
113 
114   public Boolean getShouldCollectRepositioning() {
115     return shouldCollectRepositioning;
116   } //
117 
118   /** Unpack the markup in the document. This converts markup from the
119     * native format (e.g. XML, RTF) into annotations in GATE format.
120     * Uses the markupElementsMap to determine which elements to convert, and
121     * what annotation type names to use.
122     */
123   abstract public void unpackMarkup(Document doc)
124                                       throws DocumentFormatException;
125 
126   abstract public void unpackMarkup(Document doc, RepositioningInfo repInfo,
127                                         RepositioningInfo ampCodingInfo)
128                                       throws DocumentFormatException;
129   /** Unpack the markup in the document. This method calls unpackMarkup on the
130     * GATE document, but after it saves its content as a feature atached to
131     * the document. This method is usefull if one wants to save the content
132     * of the document being unpacked. After the markups have been unpacked,
133     * the content of the document will be replaced with a new one containing
134     * the text between markups.
135     *
136     * @param doc the document that will be upacked
137     * @param originalContentFeatureType the name of the feature that will hold
138     * the document's content.
139     */
140   public void unpackMarkup( Document doc,
141                             String  originalContentFeatureType )
142                                               throws DocumentFormatException{
143      FeatureMap fm = doc.getFeatures();
144      if (fm == null) fm = Factory.newFeatureMap();
145      fm.put(originalContentFeatureType, doc.getContent().toString());
146      doc.setFeatures(fm);
147      unpackMarkup(doc);
148   }// unpackMarkup();
149 
150   /**
151     * Returns a MimeType having as input a fileSufix.
152     * If the file sufix is <b>null</b> or not recognised then,
153     * <b>null</b> will be returned.
154     * @param fileSufix The file sufix associated with a recognisabe mime type.
155     * @return The MimeType associated with this file suffix.
156     */
157   static private MimeType  getMimeType(String fileSufix){
158     // Get a mimeType string associated with this fileSuffix
159     // Eg: for html returns  MimeType("text/html"), for xml returns
160     // MimeType("text/xml")
161     if(fileSufix == null) return null;
162     return  (MimeType) suffixes2mimeTypeMap.get(fileSufix.toLowerCase());
163   }//getMimeType
164 
165   /**
166     * Returns a MymeType having as input a URL object. If the MimeType wasn't
167     * recognized it returns <b>null</b>.
168     * @param url The URL object from which the MimeType will be extracted
169     * @return A MimeType object for that URL, or <b>null</b> if the Mime Type is
170     * unknown.
171     */
172   static private MimeType  getMimeType(URL url) {
173     String mimeTypeString = null;
174     String charsetFromWebServer = null;
175     String contentType = null;
176     InputStream is = null;
177     MimeType mimeTypeFromWebServer = null;
178     MimeType mimeTypeFromFileSuffix = null;
179     MimeType mimeTypeFromMagicNumbers = null;
180     String fileSufix = null;
181 
182     if (url == null)
183       return null;
184     // Ask the web server for the content type
185     // We expect to get contentType something like this:
186     // "text/html; charset=iso-8859-1"
187     // Charset is optional
188     try{
189       is = url.openConnection().getInputStream();
190       contentType = url.openConnection().getContentType();
191     } catch (IOException e){
192       // Failed to get the content type with te Web server.
193       // Let's try some other methods like FileSuffix or magic numbers.
194     }
195     // If a content Type was returned by the server, try to get the mime Type
196     // string
197     // If contentType is something like this:"text/html; charset=iso-8859-1"
198     // try to get content Type string (text/html)
199     if (contentType != null){
200       StringTokenizer st = new StringTokenizer(contentType, ";");
201       // We assume that the first token is the mime type string...
202       // If this doesn't happen then BAD LUCK :(( ...
203       if (st.hasMoreTokens())
204         mimeTypeString     = st.nextToken().toLowerCase();
205       // The next token it should be the CharSet
206       if (st.hasMoreTokens())
207         charsetFromWebServer = st.nextToken().toLowerCase();
208       if (charsetFromWebServer != null){
209         //We have something like : "charset=iso-8859-1" and let's extract the
210         // encoding.
211         st = new StringTokenizer(charsetFromWebServer, "=");
212         // Don't need this anymore
213         charsetFromWebServer = null;
214         // Discarding the first token which is : "charset"
215         if (st.hasMoreTokens())
216           st.nextToken().toUpperCase();
217         // Get the encoding : "ISO-8859-1"
218         if (st.hasMoreTokens())
219           charsetFromWebServer = st.nextToken().toUpperCase();
220       } // End if
221     }// end if
222     // Return the corresponding MimeType with WebServer from the associated MAP
223     mimeTypeFromWebServer = (MimeType)
224                                 mimeString2mimeTypeMap.get(mimeTypeString);
225     // Let's try a file suffix detection
226     // Get the file sufix from the URL.See method definition for more details
227     fileSufix = getFileSufix(url);
228     // Get the mime type based on the on file sufix
229     mimeTypeFromFileSuffix = getMimeType(fileSufix);
230 
231     // Let's perform a magic numbers guess..
232     mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is,
233                                                     charsetFromWebServer);
234     //All those types enter into a deciding system
235     return decideBetweenThreeMimeTypes( mimeTypeFromWebServer,
236                                         mimeTypeFromFileSuffix,
237                                         mimeTypeFromMagicNumbers);
238   }//getMimeType
239 
240   /**
241     * This method decides what mimeType is in majority
242     * @param aMimeTypeFromWebServer a MimeType
243     * @param aMimeTypeFromFileSuffix a MimeType
244     * @param aMimeTypeFromMagicNumbers a MimeType
245     * @return the MimeType which occurs most. If all are null, then returns
246     * <b>null</b>
247     */
248   protected static MimeType decideBetweenThreeMimeTypes(
249                                     MimeType aMimeTypeFromWebServer,
250                                     MimeType aMimeTypeFromFileSuffix,
251                                     MimeType aMimeTypeFromMagicNumbers){
252 
253     // First a voting system
254     if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix))
255       return aMimeTypeFromFileSuffix;
256     if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers))
257       return aMimeTypeFromFileSuffix;
258     if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers))
259       return aMimeTypeFromWebServer;
260 
261     // 1 is the highest priority
262     if (aMimeTypeFromFileSuffix != null)
263       aMimeTypeFromFileSuffix.addParameter("Priority","1");
264     // 2 is the second priority
265     if (aMimeTypeFromWebServer != null)
266       aMimeTypeFromWebServer.addParameter("Priority","2");
267     // 3 is the third priority
268     if (aMimeTypeFromMagicNumbers != null)
269       aMimeTypeFromMagicNumbers.addParameter("Priority","3");
270 
271     return decideBetweenTwoMimeTypes(
272                              decideBetweenTwoMimeTypes(aMimeTypeFromWebServer,
273                                                        aMimeTypeFromFileSuffix),
274                              aMimeTypeFromMagicNumbers);
275 
276   }// decideBetweenThreeMimeTypes
277 
278   /** Decide between two mimeTypes. The decistion is made on "Priority"
279     * parameter set into decideBetweenThreeMimeTypes method. If both mimeTypes
280     * doesn't have "Priority" paramether set, it will return one on them.
281     * @param aMimeType a MimeType object with "Prority" parameter set
282     * @param anotherMimeType a MimeType object with "Prority" parameter set
283     * @return One of the two mime types.
284     */
285   protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType,
286                                                 MimeType anotherMimeType){
287     if (aMimeType == null) return anotherMimeType;
288     if (anotherMimeType == null) return aMimeType;
289 
290     int priority1 = 0;
291     int priority2 = 0;
292     // Both of them are not null
293     if (aMimeType.hasParameter("Priority"))
294       try{
295         priority1 =
296               new Integer(aMimeType.getParameterValue("Priority")).intValue();
297       }catch (NumberFormatException e){
298         return anotherMimeType;
299       }
300     if (anotherMimeType.hasParameter("Priority"))
301       try{
302         priority2 =
303           new Integer(anotherMimeType.getParameterValue("Priority")).intValue();
304       }catch (NumberFormatException e){
305         return aMimeType;
306       }
307 
308     // The lower the number, the highest the priority
309     if (priority1 <= priority2)
310       return aMimeType;
311     else
312       return anotherMimeType;
313   }// decideBetweenTwoMimeTypes
314 
315   /**
316     * Tests if two MimeType objects are equal.
317     * @return true only if boths MimeType objects are different than <b>null</b>
318     * and their Types and Subtypes are equals. The method is case sensitive.
319     */
320   protected static boolean areEqual( MimeType aMimeType,
321                                      MimeType anotherMimeType){
322     if (aMimeType == null || anotherMimeType == null)
323       return false;
324 
325     if ( aMimeType.getType().equals(anotherMimeType.getType()) &&
326          aMimeType.getSubtype().equals(anotherMimeType.getSubtype())
327        ) return true;
328     else
329       return false;
330   }// are Equal
331 
332   /**
333     * This method tries to guess the mime Type using some magic numbers.
334     * @param aInputStream a InputStream which has to be transformed into a
335     *        InputStreamReader
336     * @param anEncoding the encoding. If is null or unknown then a
337     * InputStreamReader with default encodings will be created.
338     * @return the mime type associated with magic numbers
339     */
340   protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream,
341                                                             String anEncoding){
342 
343     if (aInputStream == null) return null;
344     InputStreamReader reader = null;
345     if (anEncoding != null)
346       try{
347         reader = new InputStreamReader(aInputStream, anEncoding);
348       } catch (UnsupportedEncodingException e){
349         reader = null;
350       }
351     if (reader == null)
352       // Create a reader with the default encoding system
353       reader = new InputStreamReader(aInputStream);
354 
355     // We have a input stream reader
356     return runMagicNumbers(reader);
357   }//guessTypeUsingMagicNumbers
358 
359   /** Performs magic over Gate Document */
360   protected static MimeType runMagicNumbers(InputStreamReader aReader){
361     // No reader, nothing to detect
362     if( aReader == null) return null;
363 
364     // Prepare to run the magic stuff
365     String strBuffer = null;
366     int bufferSize = 2048;
367     int charReads = 0;
368     char[] cbuf = new char[bufferSize];
369 
370     try {
371       charReads = aReader.read(cbuf,0,bufferSize);
372     } catch (IOException e){
373       return null;
374     }// End try
375 
376     if (charReads == -1)
377       // the document is empty
378       return null;
379 
380     // Create a string form the buffer and perform some search on it.
381     strBuffer = new String(cbuf,0,charReads);
382 
383     // If this fails then surrender
384     return getTypeFromContent(strBuffer);
385   }// runMagicNumbers
386 
387   private static MimeType getTypeFromContent(String aContent){
388     MimeType detectedMimeType = null;
389     // Detect whether or not is a GateXmlDocument
390     if (  aContent.indexOf("<GateDocument") != -1  ||
391           aContent.indexOf(" GateDocument") != -1)
392       isGateXmlDocument = true;
393     else
394       isGateXmlDocument = false;
395 
396     // Run the magic numbers test
397     Set magicSet = magic2mimeTypeMap.keySet();
398     Iterator iterator=magicSet.iterator();
399     String magic;
400     // change case to cover more variants
401     aContent = aContent.toLowerCase();
402     while (iterator.hasNext()){
403       magic = ((String) iterator.next()).toLowerCase();
404       if (aContent.indexOf(magic) != -1)
405         detectedMimeType = (MimeType) magic2mimeTypeMap.get(magic);
406     }// End while
407 
408     // If this fails then surrender
409     return detectedMimeType;
410   }// getTypeFromContent
411 
412   /**
413     * Return the fileSuffix or null if the url doesn't have a file suffix
414     * If the url is null then the file suffix will be null also
415     */
416   private static String getFileSufix(URL url){
417     String fileName = null;
418     String fileSuffix = null;
419 
420     // GIGO test  (garbage in garbage out)
421     if (url != null){
422       // get the file name from the URL
423       fileName = url.getFile();
424 
425       // tokenize this file name with "." as separator...
426       // the last token will be the file suffix
427       StringTokenizer st = new StringTokenizer(fileName,".");
428 
429       // fileSuffix is the last token
430       while (st.hasMoreTokens())
431         fileSuffix = st.nextToken();
432       // here fileSuffix is the last token
433     } // End if
434     return fileSuffix;
435   }//getFileSufix
436 
437   /**
438     * Find a DocumentFormat implementation that deals with a particular
439     * MIME type, given that type.
440     * @param  aGateDocument this document will receive as a feature
441     *                      the associated Mime Type. The name of the feature is
442     *                      MimeType and its value is in the format type/subtype
443     * @param  mimeType the mime type that is given as input
444     */
445   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
446                                                             MimeType mimeType){
447     FeatureMap      aFeatureMap    = null;
448     if(mimeType == null) {
449       String content = aGateDocument.getContent().toString();
450       // reduce size for better performance
451       if(content.length() > 2048) content = content.substring(0, 2048);
452       mimeType = getTypeFromContent( content );
453     }
454 
455     if (mimeType != null){
456       // If the Gate Document doesn't have a feature map atached then
457       // We will create and set one.
458       if(aGateDocument.getFeatures() == null){
459             aFeatureMap = Factory.newFeatureMap();
460             aGateDocument.setFeatures(aFeatureMap);
461       }// end if
462       aGateDocument.getFeatures().put("MimeType",mimeType.getType() + "/" +
463                                           mimeType.getSubtype());
464 
465       return (DocumentFormat) mimeString2ClassHandlerMap.get(mimeType.getType()
466                                                + "/" + mimeType.getSubtype());
467     }// end If
468     return null;
469   } // getDocumentFormat(aGateDocument, MimeType)
470 
471   /**
472     * Find a DocumentFormat implementation that deals with a particular
473     * MIME type, given the file suffix (e.g. ".txt") that the document came
474     * from.
475     * @param  aGateDocument this document will receive as a feature
476     *                     the associated Mime Type. The name of the feature is
477     *                     MimeType and its value is in the format type/subtype
478     * @param  fileSuffix the file suffix that is given as input
479     */
480   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
481                                                             String fileSuffix) {
482     return getDocumentFormat(aGateDocument, getMimeType(fileSuffix));
483   } // getDocumentFormat(String)
484 
485   /**
486     * Find a DocumentFormat implementation that deals with a particular
487     * MIME type, given the URL of the Document. If it is an HTTP URL, we
488     * can ask the web server. If it has a recognised file extension, we
489     * can use that. Otherwise we need to use a map of magic numbers
490     * to MIME types to guess the type, and then look up the format using the
491     * type.
492     * @param  aGateDocument this document will receive as a feature
493     *                      the associated Mime Type. The name of the feature is
494     *                      MimeType and its value is in the format type/subtype
495     * @param  url  the URL that is given as input
496     */
497   static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
498                                                                       URL url) {
499     return getDocumentFormat(aGateDocument, getMimeType(url));
500   } // getDocumentFormat(URL)
501 
502   /** Get the feature set */
503   public FeatureMap getFeatures() { return features; }
504 
505    /** Get the markup elements map */
506   public Map getMarkupElementsMap() { return markupElementsMap; }
507 
508    /** Get the element 2 string map */
509   public Map getElement2StringMap() { return element2StringMap; }
510 
511   /** Set the markup elements map */
512   public void setMarkupElementsMap(Map markupElementsMap) {
513    this.markupElementsMap = markupElementsMap;
514   }
515 
516   /** Set the element 2 string map */
517   public void setElement2StringMap(Map anElement2StringMap) {
518    element2StringMap = anElement2StringMap;
519   }
520 
521   /** Set the features map*/
522   public void setFeatures(FeatureMap features){this.features = features;}
523 
524   /** Set the mime type*/
525 
526   public void setMimeType(MimeType aMimeType){mimeType = aMimeType;}
527   /** Gets the mime Type*/
528   public MimeType getMimeType(){return mimeType;}
529 
530   //StatusReporter Implementation
531 
532 
533   public synchronized void removeStatusListener(StatusListener l) {
534     if (statusListeners != null && statusListeners.contains(l)) {
535       Vector v = (Vector) statusListeners.clone();
536       v.removeElement(l);
537       statusListeners = v;
538     }
539   }
540   public synchronized void addStatusListener(StatusListener l) {
541     Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
542     if (!v.contains(l)) {
543       v.addElement(l);
544       statusListeners = v;
545     }
546   }
547   protected void fireStatusChanged(String e) {
548     if (statusListeners != null) {
549       Vector listeners = statusListeners;
550       int count = listeners.size();
551       for (int i = 0; i < count; i++) {
552         ((StatusListener) listeners.elementAt(i)).statusChanged(e);
553       }
554     }
555   }
556 
557 } // class DocumentFormat
558