1   /*
2    *  HtmlDocumentHandler.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  12/June/2000
12   *
13   *  $Id: HtmlDocumentHandler.java,v 1.37 2005/01/11 13:51:35 ian Exp $
14   */
15  
16  package gate.html;
17  
18  import java.util.*;
19  
20  import javax.swing.text.BadLocationException;
21  import javax.swing.text.MutableAttributeSet;
22  import javax.swing.text.html.HTML;
23  import javax.swing.text.html.HTMLEditorKit.ParserCallback;
24  
25  import gate.*;
26  import gate.corpora.DocumentContentImpl;
27  import gate.corpora.RepositioningInfo;
28  import gate.event.StatusListener;
29  import gate.util.Err;
30  import gate.util.InvalidOffsetException;
31  
32  
/** Implements the behaviour of the HTML reader.
  * Methods of an object of this class are called by the HTML parser when
  * parsing events occur.
  * The idea is to parse the HTML document and construct GATE annotation
  * objects.
  * This class also replaces the content of the GATE document with a
  * new one containing only the text from the HTML document.
  */
41  public class HtmlDocumentHandler extends ParserCallback {
42  
43    /** Debug flag */
44    private static final boolean DEBUG = false;
45  
46    /** Constructor initialises all the private memeber data.
47      * This will use the default annotation set taken from the gate document.
48      * @param aDocument The gate document that will be processed
49      * @param aMarkupElementsMap The map containing the elements that will
50      * transform into annotations
51      */
52    public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) {
53      this(aDocument,aMarkupElementsMap,null);
54    }
55  
56    /** Constructor initialises all the private memeber data
57      * @param aDocument The gate document that will be processed
58      * @param aMarkupElementsMap The map containing the elements that will
59      * transform into annotations
60      * @param anAnnotationSet The annotation set that will contain annotations
61      * resulted from the processing of the gate document
62      */
63    public HtmlDocumentHandler(gate.Document       aDocument,
64                               Map                 aMarkupElementsMap,
65                               gate.AnnotationSet  anAnnotationSet) {
66      // init stack
67      stack = new java.util.Stack();
68  
69      // this string contains the plain text (the text without markup)
70      tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
71  
72      // colector is used later to transform all custom objects into
73      // annotation objects
74      colector = new LinkedList();
75  
76      // the Gate document
77      doc = aDocument;
78  
79      // this map contains the elements name that we want to create
80      // if it's null all the elements from the XML documents will be transformed
81      // into Gate annotation objects
82      markupElementsMap = aMarkupElementsMap;
83  
84      // init an annotation set for this gate document
85      basicAS = anAnnotationSet;
86  
87      customObjectsId = 0;
88    }//HtmlDocumentHandler
89  
90    /** Keep the refference to this structure */
91    private RepositioningInfo reposInfo = null;
92  
93    /** Keep the refference to this structure */
94    private RepositioningInfo ampCodingInfo = null;
95  
96    /** Set repositioning information structure refference. If you set this
97     *  refference to <B>null</B> information wouldn't be collected.
98     */
99    public void setRepositioningInfo(RepositioningInfo info) {
100     reposInfo = info;
101   } // setRepositioningInfo
102 
103   /** Return current RepositioningInfo object */
104   public RepositioningInfo getRepositioningInfo() {
105     return reposInfo;
106   } // getRepositioningInfo
107 
108   /** Set repositioning information structure refference for ampersand coding.
109    *  If you set this refference to <B>null</B> information wouldn't be used.
110    */
111   public void setAmpCodingInfo(RepositioningInfo info) {
112     ampCodingInfo = info;
113   } // setRepositioningInfo
114 
115   /** Return current RepositioningInfo object for ampersand coding. */
116   public RepositioningInfo getAmpCodingInfo() {
117     return ampCodingInfo;
118   } // getRepositioningInfo
119 
120   /** The text inside the STYLE tag is processed with <code>handleText()</code>.
121    *  We should skip inserting of this text in the document. */
122   private boolean isInsideStyleTag = false;
123 
124   /** This method is called when the HTML parser encounts the beginning
125     * of a tag that means that the tag is paired by an end tag and it's
126     * not an empty one.
127     */
128   public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
129     // Fire the status listener if the elements processed exceded the rate
130     if (0 == (++elements % ELEMENTS_RATE))
131       fireStatusChangedEvent("Processed elements : " + elements);
132 
133     // Start of STYLE tag
134     if(HTML.Tag.STYLE.equals(t)) {
135       isInsideStyleTag = true;
136     } // if
137 
138     // Construct a feature map from the attributes list
139     FeatureMap fm = Factory.newFeatureMap();
140 
141     // Take all the attributes an put them into the feature map
142     if (0 != a.getAttributeCount()){
143       Enumeration enumeration = a.getAttributeNames();
144       while (enumeration.hasMoreElements()){
145         Object attribute = enumeration.nextElement();
146         fm.put(attribute.toString(),(a.getAttribute(attribute)).toString());
147       }// while
148     }// if
149 
150     // Just analize the tag t and add some\n chars and spaces to the
151     // tmpDocContent.The reason behind is that we need to have a readable form
152     // for the final document.
153     customizeAppearanceOfDocumentWithStartTag(t);
154 
155     // If until here the "tmpDocContent" ends with a NON whitespace char,
156     // then we add a space char before calculating the START index of this
157     // tag.
158     // This is done in order not to concatenate the content of two separate tags
159     // and obtain a different NEW word.
160     int tmpDocContentSize = tmpDocContent.length();
161     if ( tmpDocContentSize != 0 &&
162          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))
163        ) tmpDocContent.append(" ");
164 
165     // create the start index of the annotation
166     Long startIndex = new Long(tmpDocContent.length());
167 
168     // initialy the start index is equal with the End index
169     CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
170 
171     // put it into the stack
172     stack.push (obj);
173 
174   }//handleStartTag
175 
176    /** This method is called when the HTML parser encounts the end of a tag
177      * that means that the tag is paired by a beginning tag
178      */
179   public void handleEndTag(HTML.Tag t, int pos){
180     // obj is for internal use
181     CustomObject obj = null;
182 
183     // end of STYLE tag
184     if(HTML.Tag.STYLE.equals(t)) {
185       isInsideStyleTag = false;
186     } // if
187 
188     // If the stack is not empty then we get the object from the stack
189     if (!stack.isEmpty()){
190       obj = (CustomObject) stack.pop();
191       // Before adding it to the colector, we need to check if is an
192       // emptyAndSpan one. See CustomObject's isEmptyAndSpan field.
193       if (obj.getStart().equals(obj.getEnd())){
194         // The element had an end tag and its start was equal to its end. Hence
195         // it is anEmptyAndSpan one.
196         obj.getFM().put("isEmptyAndSpan","true");
197       }// End iff
198       // we add it to the colector
199       colector.add(obj);
200     }// End if
201 
202     // If element has text between, then customize its apearance
203     if ( obj != null &&
204          obj.getStart().longValue() != obj.getEnd().longValue()
205        )
206       // Customize the appearance of the document
207       customizeAppearanceOfDocumentWithEndTag(t);
208 
209     // if t is the </HTML> tag then we reached the end of theHTMLdocument
210     if (t == HTML.Tag.HTML){
211       // replace the old content with the new one
212       doc.setContent (new DocumentContentImpl(tmpDocContent.toString()));
213 
214       // If basicAs is null then get the default annotation
215       // set from this gate document
216       if (basicAS == null)
217         basicAS = doc.getAnnotations(
218                                 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
219 
220       // sort colector ascending on its id
221       Collections.sort(colector);
222       // iterate through colector and construct annotations
223       while (!colector.isEmpty()){
224         obj = (CustomObject) colector.getFirst();
225         colector.remove(obj);
226           // Construct an annotation from this obj
227           try{
228             if (markupElementsMap == null){
229                basicAS.add( obj.getStart(),
230                             obj.getEnd(),
231                             obj.getElemName(),
232                             obj.getFM()
233                            );
234             }else{
235               String annotationType =
236                      (String) markupElementsMap.get(obj.getElemName());
237               if (annotationType != null)
238                  basicAS.add( obj.getStart(),
239                               obj.getEnd(),
240                               annotationType,
241                               obj.getFM()
242                              );
243             }
244           }catch (InvalidOffsetException e){
245               Err.prln("Error creating an annot :" + obj + " Discarded...");
246           }// end try
247 //        }// end if
248       }//while
249 
250       // notify the listener about the total amount of elements that
251       // has been processed
252       fireStatusChangedEvent("Total elements : " + elements);
253 
254     }//else
255 
256   }//handleEndTag
257 
258   /** This method is called when the HTML parser encounts an empty tag
259     */
260   public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
261     // fire the status listener if the elements processed exceded the rate
262     if ((++elements % ELEMENTS_RATE) == 0)
263       fireStatusChangedEvent("Processed elements : " + elements);
264 
265     // construct a feature map from the attributes list
266     // these are empty elements
267     FeatureMap fm = Factory.newFeatureMap();
268 
269     // take all the attributes an put them into the feature map
270     if (0 != a.getAttributeCount ()){
271 
272        // Out.println("HAS  attributes = " + a.getAttributeCount ());
273         Enumeration enumeration = a.getAttributeNames ();
274         while (enumeration.hasMoreElements ()){
275           Object attribute = enumeration.nextElement ();
276           fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString());
277 
278         }//while
279 
280     }//if
281 
282     // create the start index of the annotation
283     Long startIndex = new Long(tmpDocContent.length());
284 
285     // initialy the start index is equal with the End index
286     CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
287 
288     // we add the object directly into the colector
289     // we don't add it to the stack because this is an empty tag
290     colector.add(obj);
291 
292     // Just analize the tag t and add some\n chars and spaces to the
293     // tmpDocContent.The reason behind is that we need to have a readable form
294     // for the final document.
295     customizeAppearanceOfDocumentWithSimpleTag(t);
296 
297   } // handleSimpleTag
298 
299   /** This method is called when the HTML parser encounts text (PCDATA)
300     */
301   public void handleText(char[] text, int pos){
302 
303     // Skip the STYLE tag content
304     if(isInsideStyleTag) return;
305 
306     // create a string object based on the reported text
307     String content = new String(text);
308 
309     // remove the difference between JDK 1.3 and JDK 1.4
310     String trimContent = content.trim();
311     if(trimContent.length() == 0) {
312       return;
313     } // if
314 
315     int trimCorrection = content.indexOf(trimContent.charAt(0));
316     content = trimContent;
317 
318     StringBuffer contentBuffer = new StringBuffer("");
319     int tmpDocContentSize = tmpDocContent.length();
320     boolean incrementStartIndex = false;
321     // If the first char of the text just read "text[0]" is NOT whitespace AND
322     // the last char of the tmpDocContent[SIZE-1] is NOT whitespace then
323     // concatenation "tmpDocContent + content" will result into a new different
324     // word... and we want to avoid that...
325     if ( tmpDocContentSize != 0 &&
326          content.length() != 0 &&
327          !Character.isWhitespace(content.charAt(0)) &&
328          !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
329 
330             contentBuffer.append(" ");
331             incrementStartIndex = true;
332     }// End if
333     // update the document content
334 
335     // put the repositioning information
336     if(reposInfo != null) {
337       int extractedPos = tmpDocContent.length() + contentBuffer.length();
338       addRepositioningInfo(content, pos + trimCorrection, extractedPos);
339     } // if
340 
341     contentBuffer.append(content);
342     // calculate the End index for all the elements of the stack
343     // the expression is : End index = Current doc length + text length
344     Long end = new Long(tmpDocContent.length() + contentBuffer.length());
345 
346     CustomObject obj = null;
347     // Iterate through stack to modify the End index of the existing elements
348 
349     java.util.Iterator anIterator = stack.iterator();
350     while (anIterator.hasNext ()){
351       // get the object and move to the next one
352       obj = (CustomObject) anIterator.next ();
353       if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
354         obj.setStart(new Long(obj.getStart().longValue() + 1));
355       }// End if
356       // sets its End index
357       obj.setEnd(end);
358     }// End while
359 
360     tmpDocContent.append(contentBuffer.toString());
361   }// end handleText();
362 
363   /** For given content the list with shrink position information is searched
364    *  and on the corresponding positions the correct repositioning information
365    *  is calculated and generated.
366    */
367   public void addRepositioningInfo(String content, int pos, int extractedPos) {
368     int contentLength = content.length();
369 
370     // wrong way (without correction and analysing)
371    //reposInfo.addPositionInfo(pos, contentLength, extractedPos, contentLength);
372 
373     RepositioningInfo.PositionInfo pi = null;
374     long startPos = pos;
375     long correction = 0;
376     long substituteStart;
377     long remainingLen;
378     long offsetInExtracted;
379 
380     for(int i = 0; i < ampCodingInfo.size(); ++i) {
381       pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
382       substituteStart = pi.getOriginalPosition();
383 
384       if(substituteStart >= startPos) {
385         if(substituteStart > pos + contentLength + correction) {
386           break; // outside the current text
387         } // if
388 
389         // should create two repositioning information records
390         remainingLen = substituteStart - (startPos + correction);
391         offsetInExtracted = startPos - pos;
392         if(remainingLen > 0) {
393           reposInfo.addPositionInfo(startPos + correction, remainingLen,
394                             extractedPos + offsetInExtracted, remainingLen);
395         } // if
396         // record for shrank text
397         reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(),
398                           extractedPos + offsetInExtracted + remainingLen,
399                           pi.getCurrentLength());
400         startPos = startPos + remainingLen + pi.getCurrentLength();
401         correction += pi.getOriginalLength() - pi.getCurrentLength();
402       } // if
403     } // for
404 
405     // there is some text remaining for repositioning
406     offsetInExtracted = startPos - pos;
407     remainingLen = contentLength - offsetInExtracted;
408     if(remainingLen > 0) {
409       reposInfo.addPositionInfo(startPos + correction, remainingLen,
410                         extractedPos + offsetInExtracted, remainingLen);
411     } // if
412   } // addRepositioningInfo
413 
414   /** This method analizes the tag t and adds some \n chars and spaces to the
415     * tmpDocContent.The reason behind is that we need to have a readable form
416     * for the final document. This method modifies the content of tmpDocContent.
417     * @param t the Html tag encounted by the HTML parser
418     */
419   protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){
420     boolean modification = false;
421     // if the HTML tag is BR then we add a new line character to the document
422     if (HTML.Tag.BR == t){
423       tmpDocContent.append("\n");
424       modification = true;
425     }// End if
426     if (modification == true){
427       Long end = new Long (tmpDocContent.length());
428       java.util.Iterator anIterator = stack.iterator();
429       while (anIterator.hasNext ()){
430         // get the object and move to the next one
431         CustomObject obj = (CustomObject) anIterator.next();
432         // sets its End index
433         obj.setEnd(end);
434       }// End while
435     }//End if
436   }// customizeAppearanceOfDocumentWithSimpleTag
437 
438   /** This method analizes the tag t and adds some \n chars and spaces to the
439     * tmpDocContent.The reason behind is that we need to have a readable form
440     * for the final document. This method modifies the content of tmpDocContent.
441     * @param t the Html tag encounted by the HTML parser
442     */
443   protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){
444     boolean modification = false;
445     if (HTML.Tag.P == t){
446       int tmpDocContentSize = tmpDocContent.length();
447       if ( tmpDocContentSize >= 2 &&
448            '\n' != tmpDocContent.charAt(tmpDocContentSize - 2)
449          ) { tmpDocContent.append("\n"); modification = true;}
450     }// End if
451     if (modification == true){
452       Long end = new Long (tmpDocContent.length());
453       java.util.Iterator anIterator = stack.iterator();
454       while (anIterator.hasNext ()){
455         // get the object and move to the next one
456         CustomObject obj = (CustomObject) anIterator.next();
457         // sets its End index
458         obj.setEnd(end);
459       }// End while
460     }//End if
461   }// customizeAppearanceOfDocumentWithStartTag
462 
463   /** This method analizes the tag t and adds some \n chars and spaces to the
464     * tmpDocContent.The reason behind is that we need to have a readable form
465     * for the final document. This method modifies the content of tmpDocContent.
466     * @param t the Html tag encounted by the HTML parser
467     */
468   protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){
469     boolean modification = false;
470     // if the HTML tag is BR then we add a new line character to the document
471     if ( (HTML.Tag.P == t) ||
472 
473          (HTML.Tag.H1 == t) ||
474          (HTML.Tag.H2 == t) ||
475          (HTML.Tag.H3 == t) ||
476          (HTML.Tag.H4 == t) ||
477          (HTML.Tag.H5 == t) ||
478          (HTML.Tag.H6 == t) ||
479          (HTML.Tag.TR == t) ||
480          (HTML.Tag.CENTER == t) ||
481          (HTML.Tag.LI == t)
482        ){ tmpDocContent.append("\n"); modification = true;}
483 
484     if (HTML.Tag.TITLE == t){
485       tmpDocContent.append("\n\n");
486       modification = true;
487     }// End if
488 
489     if (modification == true){
490       Long end = new Long (tmpDocContent.length());
491       java.util.Iterator anIterator = stack.iterator();
492       while (anIterator.hasNext ()){
493         // get the object and move to the next one
494         CustomObject obj = (CustomObject) anIterator.next();
495         // sets its End index
496         obj.setEnd(end);
497       }// End while
498     }//End if
499   }// customizeAppearanceOfDocumentWithEndTag
500 
501   /**
502     * This method is called when the HTML parser encounts an error
503     * it depends on the programmer if he wants to deal with that error
504     */
505   public void handleError(String errorMsg, int pos) {
506     //Out.println ("ERROR CALLED : " + errorMsg);
507   }
508 
509   /** This method is called once, when the HTML parser reaches the end
510     * of its input streamin order to notify the parserCallback that there
511     * is nothing more to parse.
512     */
513   public void flush() throws BadLocationException{
514   }// flush
515 
516   /** This method is called when the HTML parser encounts a comment
517     */
518   public void handleComment(char[] text, int pos) {
519   }
520 
521   //StatusReporter Implementation
522 
523   public void addStatusListener(StatusListener listener) {
524     myStatusListeners.add(listener);
525   }
526 
527   public void removeStatusListener(StatusListener listener) {
528     myStatusListeners.remove(listener);
529   }
530 
531   protected void fireStatusChangedEvent(String text) {
532     Iterator listenersIter = myStatusListeners.iterator();
533     while(listenersIter.hasNext())
534       ((StatusListener)listenersIter.next()).statusChanged(text);
535   }
536 
537   /**
538     * This method verifies if data contained by the CustomObject can be used
539     * to create a GATE annotation.
540     */
541 /*  private boolean canCreateAnnotation(CustomObject aCustomObject){
542     long start            = aCustomObject.getStart().longValue();
543     long end              = aCustomObject.getEnd().longValue();
544     long gateDocumentSize = doc.getContent().size().longValue();
545 
546     if (start < 0 || end < 0 ) return false;
547     if (start > end ) return false;
548     if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false;
549     return true;
550   }// canCreateAnnotation
551 */
552 
553   // HtmlDocumentHandler member data
554 
555   // this constant indicates when to fire the status listener
556   // this listener will add an overhead and we don't want a big overhead
557   // this listener will be callled from ELEMENTS_RATE to ELEMENTS_RATE
558   final static  int ELEMENTS_RATE = 128;
559 
560   // this map contains the elements name that we want to create
561   // if it's null all the elements from the HTML documents will be transformed
562   // into Gate annotation objects otherwise only the elements it contains will
563   // be transformed
564   private Map markupElementsMap = null;
565 
566   // the content of the HTML document, without any tag
567   // for internal use
568   private StringBuffer tmpDocContent = null;
569 
570   // a stack used to remember elements and to keep the order
571   private java.util.Stack stack = null;
572 
573   // a gate document
574   private gate.Document doc = null;
575 
576   // an annotation set used for creating annotation reffering the doc
577   private gate.AnnotationSet basicAS;
578 
579   // listeners for status report
580   protected List myStatusListeners = new LinkedList();
581 
582   // this reports the the number of elements that have beed processed so far
583   private int elements = 0;
584 
585   protected  long customObjectsId = 0;
586   // we need a colection to retain all the CustomObjects that will be
587   // transformed into annotation over the gate document...
588   // the transformation will take place inside onDocumentEnd() method
589   private LinkedList colector = null;
590 
591   // Inner class
592   /**
593     * The objects belonging to this class are used inside the stack.
594     * This class is for internal needs
595     */
596   class  CustomObject implements Comparable {
597 
598     // constructor
599     public CustomObject(String anElemName, FeatureMap aFm,
600                            Long aStart, Long anEnd) {
601       elemName = anElemName;
602       fm = aFm;
603       start = aStart;
604       end = anEnd;
605       id = new Long(customObjectsId ++);
606     }// End CustomObject()
607 
608     // Methos implemented as required by Comparable interface
609     public int compareTo(Object o){
610       CustomObject obj = (CustomObject) o;
611       return this.id.compareTo(obj.getId());
612     }// compareTo();
613 
614     // accesor
615     public String getElemName() {
616       return elemName;
617     }// getElemName()
618 
619     public FeatureMap getFM() {
620       return fm;
621     }// getFM()
622 
623     public Long getStart() {
624       return start;
625     }// getStart()
626 
627     public Long getEnd() {
628       return end;
629     }// getEnd()
630 
631     public Long getId(){ return id;}
632 
633     // mutator
634     public void setElemName(String anElemName) {
635       elemName = anElemName;
636     }// getElemName()
637 
638     public void setFM(FeatureMap aFm) {
639       fm = aFm;
640     }// setFM();
641 
642     public void setStart(Long aStart) {
643       start = aStart;
644     }// setStart();
645 
646     public void setEnd(Long anEnd) {
647       end = anEnd;
648     }// setEnd();
649 
650     // data fields
651     private String elemName = null;
652     private FeatureMap fm = null;
653     private Long start = null;
654     private Long end  = null;
655     private Long id = null;
656 
657   } // End inner class CustomObject
658 
659 }//End class HtmlDocumentHandler
660 
661 
662 
663