1
15
16 package gate;
17
18 import java.io.*;
19 import java.net.URL;
20 import java.util.*;
21
22 import gate.corpora.MimeType;
23 import gate.corpora.RepositioningInfo;
24 import gate.creole.AbstractLanguageResource;
25 import gate.event.StatusListener;
26 import gate.util.DocumentFormatException;
27
28
37 public abstract class DocumentFormat
38 extends AbstractLanguageResource implements LanguageResource{
39
40 private static final boolean DEBUG = false;
41
42
46 protected static boolean isGateXmlDocument = false;
47
48
49 private MimeType mimeType = null;
50
51
54 protected static Map mimeString2ClassHandlerMap = new HashMap();
55
58 protected static Map mimeString2mimeTypeMap = new HashMap();
59
60
63 protected static Map suffixes2mimeTypeMap = new HashMap();
64
65
68 protected static Map magic2mimeTypeMap = new HashMap();
69
70
75 protected Map markupElementsMap = null;
76
77
81 protected Map element2StringMap = null;
82
83
84 private FeatureMap features = null;
85
86
87 public DocumentFormat() {}
88
89
90 private transient Vector statusListeners;
91
92
93 private Boolean shouldCollectRepositioning = new Boolean(false);
94
95
101 public Boolean supportsRepositioning() {
102 return new Boolean(false);
103 }
105 public void setShouldCollectRepositioning(Boolean b) {
106 if(supportsRepositioning().booleanValue() && b.booleanValue()) {
107 shouldCollectRepositioning = b;
108 }
109 else {
110 shouldCollectRepositioning = new Boolean(false);
111 } }
114 public Boolean getShouldCollectRepositioning() {
115 return shouldCollectRepositioning;
116 }
118
123 abstract public void unpackMarkup(Document doc)
124 throws DocumentFormatException;
125
126 abstract public void unpackMarkup(Document doc, RepositioningInfo repInfo,
127 RepositioningInfo ampCodingInfo)
128 throws DocumentFormatException;
129
140 public void unpackMarkup( Document doc,
141 String originalContentFeatureType )
142 throws DocumentFormatException{
143 FeatureMap fm = doc.getFeatures();
144 if (fm == null) fm = Factory.newFeatureMap();
145 fm.put(originalContentFeatureType, doc.getContent().toString());
146 doc.setFeatures(fm);
147 unpackMarkup(doc);
148 }
150
157 static private MimeType getMimeType(String fileSufix){
158 if(fileSufix == null) return null;
162 return (MimeType) suffixes2mimeTypeMap.get(fileSufix.toLowerCase());
163 }
165
172 static private MimeType getMimeType(URL url) {
173 String mimeTypeString = null;
174 String charsetFromWebServer = null;
175 String contentType = null;
176 InputStream is = null;
177 MimeType mimeTypeFromWebServer = null;
178 MimeType mimeTypeFromFileSuffix = null;
179 MimeType mimeTypeFromMagicNumbers = null;
180 String fileSufix = null;
181
182 if (url == null)
183 return null;
184 try{
189 is = url.openConnection().getInputStream();
190 contentType = url.openConnection().getContentType();
191 } catch (IOException e){
192 }
195 if (contentType != null){
200 StringTokenizer st = new StringTokenizer(contentType, ";");
201 if (st.hasMoreTokens())
204 mimeTypeString = st.nextToken().toLowerCase();
205 if (st.hasMoreTokens())
207 charsetFromWebServer = st.nextToken().toLowerCase();
208 if (charsetFromWebServer != null){
209 st = new StringTokenizer(charsetFromWebServer, "=");
212 charsetFromWebServer = null;
214 if (st.hasMoreTokens())
216 st.nextToken().toUpperCase();
217 if (st.hasMoreTokens())
219 charsetFromWebServer = st.nextToken().toUpperCase();
220 } } mimeTypeFromWebServer = (MimeType)
224 mimeString2mimeTypeMap.get(mimeTypeString);
225 fileSufix = getFileSufix(url);
228 mimeTypeFromFileSuffix = getMimeType(fileSufix);
230
231 mimeTypeFromMagicNumbers = guessTypeUsingMagicNumbers(is,
233 charsetFromWebServer);
234 return decideBetweenThreeMimeTypes( mimeTypeFromWebServer,
236 mimeTypeFromFileSuffix,
237 mimeTypeFromMagicNumbers);
238 }
240
248 protected static MimeType decideBetweenThreeMimeTypes(
249 MimeType aMimeTypeFromWebServer,
250 MimeType aMimeTypeFromFileSuffix,
251 MimeType aMimeTypeFromMagicNumbers){
252
253 if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromFileSuffix))
255 return aMimeTypeFromFileSuffix;
256 if (areEqual(aMimeTypeFromFileSuffix,aMimeTypeFromMagicNumbers))
257 return aMimeTypeFromFileSuffix;
258 if (areEqual(aMimeTypeFromWebServer,aMimeTypeFromMagicNumbers))
259 return aMimeTypeFromWebServer;
260
261 if (aMimeTypeFromFileSuffix != null)
263 aMimeTypeFromFileSuffix.addParameter("Priority","1");
264 if (aMimeTypeFromWebServer != null)
266 aMimeTypeFromWebServer.addParameter("Priority","2");
267 if (aMimeTypeFromMagicNumbers != null)
269 aMimeTypeFromMagicNumbers.addParameter("Priority","3");
270
271 return decideBetweenTwoMimeTypes(
272 decideBetweenTwoMimeTypes(aMimeTypeFromWebServer,
273 aMimeTypeFromFileSuffix),
274 aMimeTypeFromMagicNumbers);
275
276 }
278
285 protected static MimeType decideBetweenTwoMimeTypes( MimeType aMimeType,
286 MimeType anotherMimeType){
287 if (aMimeType == null) return anotherMimeType;
288 if (anotherMimeType == null) return aMimeType;
289
290 int priority1 = 0;
291 int priority2 = 0;
292 if (aMimeType.hasParameter("Priority"))
294 try{
295 priority1 =
296 new Integer(aMimeType.getParameterValue("Priority")).intValue();
297 }catch (NumberFormatException e){
298 return anotherMimeType;
299 }
300 if (anotherMimeType.hasParameter("Priority"))
301 try{
302 priority2 =
303 new Integer(anotherMimeType.getParameterValue("Priority")).intValue();
304 }catch (NumberFormatException e){
305 return aMimeType;
306 }
307
308 if (priority1 <= priority2)
310 return aMimeType;
311 else
312 return anotherMimeType;
313 }
315
320 protected static boolean areEqual( MimeType aMimeType,
321 MimeType anotherMimeType){
322 if (aMimeType == null || anotherMimeType == null)
323 return false;
324
325 if ( aMimeType.getType().equals(anotherMimeType.getType()) &&
326 aMimeType.getSubtype().equals(anotherMimeType.getSubtype())
327 ) return true;
328 else
329 return false;
330 }
332
340 protected static MimeType guessTypeUsingMagicNumbers(InputStream aInputStream,
341 String anEncoding){
342
343 if (aInputStream == null) return null;
344 InputStreamReader reader = null;
345 if (anEncoding != null)
346 try{
347 reader = new InputStreamReader(aInputStream, anEncoding);
348 } catch (UnsupportedEncodingException e){
349 reader = null;
350 }
351 if (reader == null)
352 reader = new InputStreamReader(aInputStream);
354
355 return runMagicNumbers(reader);
357 }
359
360 protected static MimeType runMagicNumbers(InputStreamReader aReader){
361 if( aReader == null) return null;
363
364 String strBuffer = null;
366 int bufferSize = 2048;
367 int charReads = 0;
368 char[] cbuf = new char[bufferSize];
369
370 try {
371 charReads = aReader.read(cbuf,0,bufferSize);
372 } catch (IOException e){
373 return null;
374 }
376 if (charReads == -1)
377 return null;
379
380 strBuffer = new String(cbuf,0,charReads);
382
383 return getTypeFromContent(strBuffer);
385 }
387 private static MimeType getTypeFromContent(String aContent){
388 MimeType detectedMimeType = null;
389 if ( aContent.indexOf("<GateDocument") != -1 ||
391 aContent.indexOf(" GateDocument") != -1)
392 isGateXmlDocument = true;
393 else
394 isGateXmlDocument = false;
395
396 Set magicSet = magic2mimeTypeMap.keySet();
398 Iterator iterator=magicSet.iterator();
399 String magic;
400 aContent = aContent.toLowerCase();
402 while (iterator.hasNext()){
403 magic = ((String) iterator.next()).toLowerCase();
404 if (aContent.indexOf(magic) != -1)
405 detectedMimeType = (MimeType) magic2mimeTypeMap.get(magic);
406 }
408 return detectedMimeType;
410 }
412
416 private static String getFileSufix(URL url){
417 String fileName = null;
418 String fileSuffix = null;
419
420 if (url != null){
422 fileName = url.getFile();
424
425 StringTokenizer st = new StringTokenizer(fileName,".");
428
429 while (st.hasMoreTokens())
431 fileSuffix = st.nextToken();
432 } return fileSuffix;
435 }
437
445 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
446 MimeType mimeType){
447 FeatureMap aFeatureMap = null;
448 if(mimeType == null) {
449 String content = aGateDocument.getContent().toString();
450 if(content.length() > 2048) content = content.substring(0, 2048);
452 mimeType = getTypeFromContent( content );
453 }
454
455 if (mimeType != null){
456 if(aGateDocument.getFeatures() == null){
459 aFeatureMap = Factory.newFeatureMap();
460 aGateDocument.setFeatures(aFeatureMap);
461 } aGateDocument.getFeatures().put("MimeType",mimeType.getType() + "/" +
463 mimeType.getSubtype());
464
465 return (DocumentFormat) mimeString2ClassHandlerMap.get(mimeType.getType()
466 + "/" + mimeType.getSubtype());
467 } return null;
469 }
471
480 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
481 String fileSuffix) {
482 return getDocumentFormat(aGateDocument, getMimeType(fileSuffix));
483 }
485
497 static public DocumentFormat getDocumentFormat(gate.Document aGateDocument,
498 URL url) {
499 return getDocumentFormat(aGateDocument, getMimeType(url));
500 }
502
503 public FeatureMap getFeatures() { return features; }
504
505
506 public Map getMarkupElementsMap() { return markupElementsMap; }
507
508
509 public Map getElement2StringMap() { return element2StringMap; }
510
511
512 public void setMarkupElementsMap(Map markupElementsMap) {
513 this.markupElementsMap = markupElementsMap;
514 }
515
516
517 public void setElement2StringMap(Map anElement2StringMap) {
518 element2StringMap = anElement2StringMap;
519 }
520
521
522 public void setFeatures(FeatureMap features){this.features = features;}
523
524
525
526 public void setMimeType(MimeType aMimeType){mimeType = aMimeType;}
527
528 public MimeType getMimeType(){return mimeType;}
529
530
532
533 public synchronized void removeStatusListener(StatusListener l) {
534 if (statusListeners != null && statusListeners.contains(l)) {
535 Vector v = (Vector) statusListeners.clone();
536 v.removeElement(l);
537 statusListeners = v;
538 }
539 }
540 public synchronized void addStatusListener(StatusListener l) {
541 Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
542 if (!v.contains(l)) {
543 v.addElement(l);
544 statusListeners = v;
545 }
546 }
547 protected void fireStatusChanged(String e) {
548 if (statusListeners != null) {
549 Vector listeners = statusListeners;
550 int count = listeners.size();
551 for (int i = 0; i < count; i++) {
552 ((StatusListener) listeners.elementAt(i)).statusChanged(e);
553 }
554 }
555 }
556
557 }