1
15
16 package gate.html;
17
18 import java.util.*;
19
20 import javax.swing.text.BadLocationException;
21 import javax.swing.text.MutableAttributeSet;
22 import javax.swing.text.html.HTML;
23 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
24
25 import gate.*;
26 import gate.corpora.DocumentContentImpl;
27 import gate.corpora.RepositioningInfo;
28 import gate.event.StatusListener;
29 import gate.util.Err;
30 import gate.util.InvalidOffsetException;
31
32
33
41 public class HtmlDocumentHandler extends ParserCallback {
42
43
44 private static final boolean DEBUG = false;
45
46
52 public HtmlDocumentHandler(gate.Document aDocument, Map aMarkupElementsMap) {
53 this(aDocument,aMarkupElementsMap,null);
54 }
55
56
63 public HtmlDocumentHandler(gate.Document aDocument,
64 Map aMarkupElementsMap,
65 gate.AnnotationSet anAnnotationSet) {
66 stack = new java.util.Stack();
68
69 tmpDocContent = new StringBuffer(aDocument.getContent().size().intValue());
71
72 colector = new LinkedList();
75
76 doc = aDocument;
78
79 markupElementsMap = aMarkupElementsMap;
83
84 basicAS = anAnnotationSet;
86
87 customObjectsId = 0;
88 }
90
91 private RepositioningInfo reposInfo = null;
92
93
94 private RepositioningInfo ampCodingInfo = null;
95
96
99 public void setRepositioningInfo(RepositioningInfo info) {
100 reposInfo = info;
101 }
103
104 public RepositioningInfo getRepositioningInfo() {
105 return reposInfo;
106 }
108
111 public void setAmpCodingInfo(RepositioningInfo info) {
112 ampCodingInfo = info;
113 }
115
116 public RepositioningInfo getAmpCodingInfo() {
117 return ampCodingInfo;
118 }
120
122 private boolean isInsideStyleTag = false;
123
124
128 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
129 if (0 == (++elements % ELEMENTS_RATE))
131 fireStatusChangedEvent("Processed elements : " + elements);
132
133 if(HTML.Tag.STYLE.equals(t)) {
135 isInsideStyleTag = true;
136 }
138 FeatureMap fm = Factory.newFeatureMap();
140
141 if (0 != a.getAttributeCount()){
143 Enumeration enumeration = a.getAttributeNames();
144 while (enumeration.hasMoreElements()){
145 Object attribute = enumeration.nextElement();
146 fm.put(attribute.toString(),(a.getAttribute(attribute)).toString());
147 } }
150 customizeAppearanceOfDocumentWithStartTag(t);
154
155 int tmpDocContentSize = tmpDocContent.length();
161 if ( tmpDocContentSize != 0 &&
162 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))
163 ) tmpDocContent.append(" ");
164
165 Long startIndex = new Long(tmpDocContent.length());
167
168 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
170
171 stack.push (obj);
173
174 }
176
179 public void handleEndTag(HTML.Tag t, int pos){
180 CustomObject obj = null;
182
183 if(HTML.Tag.STYLE.equals(t)) {
185 isInsideStyleTag = false;
186 }
188 if (!stack.isEmpty()){
190 obj = (CustomObject) stack.pop();
191 if (obj.getStart().equals(obj.getEnd())){
194 obj.getFM().put("isEmptyAndSpan","true");
197 } colector.add(obj);
200 }
202 if ( obj != null &&
204 obj.getStart().longValue() != obj.getEnd().longValue()
205 )
206 customizeAppearanceOfDocumentWithEndTag(t);
208
209 if (t == HTML.Tag.HTML){
211 doc.setContent (new DocumentContentImpl(tmpDocContent.toString()));
213
214 if (basicAS == null)
217 basicAS = doc.getAnnotations(
218 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
219
220 Collections.sort(colector);
222 while (!colector.isEmpty()){
224 obj = (CustomObject) colector.getFirst();
225 colector.remove(obj);
226 try{
228 if (markupElementsMap == null){
229 basicAS.add( obj.getStart(),
230 obj.getEnd(),
231 obj.getElemName(),
232 obj.getFM()
233 );
234 }else{
235 String annotationType =
236 (String) markupElementsMap.get(obj.getElemName());
237 if (annotationType != null)
238 basicAS.add( obj.getStart(),
239 obj.getEnd(),
240 annotationType,
241 obj.getFM()
242 );
243 }
244 }catch (InvalidOffsetException e){
245 Err.prln("Error creating an annot :" + obj + " Discarded...");
246 } }
250 fireStatusChangedEvent("Total elements : " + elements);
253
254 }
256 }
258
260 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
261 if ((++elements % ELEMENTS_RATE) == 0)
263 fireStatusChangedEvent("Processed elements : " + elements);
264
265 FeatureMap fm = Factory.newFeatureMap();
268
269 if (0 != a.getAttributeCount ()){
271
272 Enumeration enumeration = a.getAttributeNames ();
274 while (enumeration.hasMoreElements ()){
275 Object attribute = enumeration.nextElement ();
276 fm.put ( attribute.toString(),(a.getAttribute(attribute)).toString());
277
278 }
280 }
282 Long startIndex = new Long(tmpDocContent.length());
284
285 CustomObject obj = new CustomObject(t.toString(),fm,startIndex,startIndex);
287
288 colector.add(obj);
291
292 customizeAppearanceOfDocumentWithSimpleTag(t);
296
297 }
299
301 public void handleText(char[] text, int pos){
302
303 if(isInsideStyleTag) return;
305
306 String content = new String(text);
308
309 String trimContent = content.trim();
311 if(trimContent.length() == 0) {
312 return;
313 }
315 int trimCorrection = content.indexOf(trimContent.charAt(0));
316 content = trimContent;
317
318 StringBuffer contentBuffer = new StringBuffer("");
319 int tmpDocContentSize = tmpDocContent.length();
320 boolean incrementStartIndex = false;
321 if ( tmpDocContentSize != 0 &&
326 content.length() != 0 &&
327 !Character.isWhitespace(content.charAt(0)) &&
328 !Character.isWhitespace(tmpDocContent.charAt(tmpDocContentSize - 1))){
329
330 contentBuffer.append(" ");
331 incrementStartIndex = true;
332 }
335 if(reposInfo != null) {
337 int extractedPos = tmpDocContent.length() + contentBuffer.length();
338 addRepositioningInfo(content, pos + trimCorrection, extractedPos);
339 }
341 contentBuffer.append(content);
342 Long end = new Long(tmpDocContent.length() + contentBuffer.length());
345
346 CustomObject obj = null;
347
349 java.util.Iterator anIterator = stack.iterator();
350 while (anIterator.hasNext ()){
351 obj = (CustomObject) anIterator.next ();
353 if (incrementStartIndex && obj.getStart().equals(obj.getEnd())){
354 obj.setStart(new Long(obj.getStart().longValue() + 1));
355 } obj.setEnd(end);
358 }
360 tmpDocContent.append(contentBuffer.toString());
361 }
363
367 public void addRepositioningInfo(String content, int pos, int extractedPos) {
368 int contentLength = content.length();
369
370
373 RepositioningInfo.PositionInfo pi = null;
374 long startPos = pos;
375 long correction = 0;
376 long substituteStart;
377 long remainingLen;
378 long offsetInExtracted;
379
380 for(int i = 0; i < ampCodingInfo.size(); ++i) {
381 pi = (RepositioningInfo.PositionInfo) ampCodingInfo.get(i);
382 substituteStart = pi.getOriginalPosition();
383
384 if(substituteStart >= startPos) {
385 if(substituteStart > pos + contentLength + correction) {
386 break; }
389 remainingLen = substituteStart - (startPos + correction);
391 offsetInExtracted = startPos - pos;
392 if(remainingLen > 0) {
393 reposInfo.addPositionInfo(startPos + correction, remainingLen,
394 extractedPos + offsetInExtracted, remainingLen);
395 } reposInfo.addPositionInfo(substituteStart, pi.getOriginalLength(),
398 extractedPos + offsetInExtracted + remainingLen,
399 pi.getCurrentLength());
400 startPos = startPos + remainingLen + pi.getCurrentLength();
401 correction += pi.getOriginalLength() - pi.getCurrentLength();
402 } }
405 offsetInExtracted = startPos - pos;
407 remainingLen = contentLength - offsetInExtracted;
408 if(remainingLen > 0) {
409 reposInfo.addPositionInfo(startPos + correction, remainingLen,
410 extractedPos + offsetInExtracted, remainingLen);
411 } }
414
419 protected void customizeAppearanceOfDocumentWithSimpleTag(HTML.Tag t){
420 boolean modification = false;
421 if (HTML.Tag.BR == t){
423 tmpDocContent.append("\n");
424 modification = true;
425 } if (modification == true){
427 Long end = new Long (tmpDocContent.length());
428 java.util.Iterator anIterator = stack.iterator();
429 while (anIterator.hasNext ()){
430 CustomObject obj = (CustomObject) anIterator.next();
432 obj.setEnd(end);
434 } } }
438
443 protected void customizeAppearanceOfDocumentWithStartTag(HTML.Tag t){
444 boolean modification = false;
445 if (HTML.Tag.P == t){
446 int tmpDocContentSize = tmpDocContent.length();
447 if ( tmpDocContentSize >= 2 &&
448 '\n' != tmpDocContent.charAt(tmpDocContentSize - 2)
449 ) { tmpDocContent.append("\n"); modification = true;}
450 } if (modification == true){
452 Long end = new Long (tmpDocContent.length());
453 java.util.Iterator anIterator = stack.iterator();
454 while (anIterator.hasNext ()){
455 CustomObject obj = (CustomObject) anIterator.next();
457 obj.setEnd(end);
459 } } }
463
468 protected void customizeAppearanceOfDocumentWithEndTag(HTML.Tag t){
469 boolean modification = false;
470 if ( (HTML.Tag.P == t) ||
472
473 (HTML.Tag.H1 == t) ||
474 (HTML.Tag.H2 == t) ||
475 (HTML.Tag.H3 == t) ||
476 (HTML.Tag.H4 == t) ||
477 (HTML.Tag.H5 == t) ||
478 (HTML.Tag.H6 == t) ||
479 (HTML.Tag.TR == t) ||
480 (HTML.Tag.CENTER == t) ||
481 (HTML.Tag.LI == t)
482 ){ tmpDocContent.append("\n"); modification = true;}
483
484 if (HTML.Tag.TITLE == t){
485 tmpDocContent.append("\n\n");
486 modification = true;
487 }
489 if (modification == true){
490 Long end = new Long (tmpDocContent.length());
491 java.util.Iterator anIterator = stack.iterator();
492 while (anIterator.hasNext ()){
493 CustomObject obj = (CustomObject) anIterator.next();
495 obj.setEnd(end);
497 } } }
501
505 public void handleError(String errorMsg, int pos) {
506 }
508
509
513 public void flush() throws BadLocationException{
514 }
516
518 public void handleComment(char[] text, int pos) {
519 }
520
521
523 public void addStatusListener(StatusListener listener) {
524 myStatusListeners.add(listener);
525 }
526
527 public void removeStatusListener(StatusListener listener) {
528 myStatusListeners.remove(listener);
529 }
530
531 protected void fireStatusChangedEvent(String text) {
532 Iterator listenersIter = myStatusListeners.iterator();
533 while(listenersIter.hasNext())
534 ((StatusListener)listenersIter.next()).statusChanged(text);
535 }
536
537
541
552
553
555 final static int ELEMENTS_RATE = 128;
559
560 private Map markupElementsMap = null;
565
566 private StringBuffer tmpDocContent = null;
569
570 private java.util.Stack stack = null;
572
573 private gate.Document doc = null;
575
576 private gate.AnnotationSet basicAS;
578
579 protected List myStatusListeners = new LinkedList();
581
582 private int elements = 0;
584
585 protected long customObjectsId = 0;
586 private LinkedList colector = null;
590
591
596 class CustomObject implements Comparable {
597
598 public CustomObject(String anElemName, FeatureMap aFm,
600 Long aStart, Long anEnd) {
601 elemName = anElemName;
602 fm = aFm;
603 start = aStart;
604 end = anEnd;
605 id = new Long(customObjectsId ++);
606 }
608 public int compareTo(Object o){
610 CustomObject obj = (CustomObject) o;
611 return this.id.compareTo(obj.getId());
612 }
614 public String getElemName() {
616 return elemName;
617 }
619 public FeatureMap getFM() {
620 return fm;
621 }
623 public Long getStart() {
624 return start;
625 }
627 public Long getEnd() {
628 return end;
629 }
631 public Long getId(){ return id;}
632
633 public void setElemName(String anElemName) {
635 elemName = anElemName;
636 }
638 public void setFM(FeatureMap aFm) {
639 fm = aFm;
640 }
642 public void setStart(Long aStart) {
643 start = aStart;
644 }
646 public void setEnd(Long anEnd) {
647 end = anEnd;
648 }
650 private String elemName = null;
652 private FeatureMap fm = null;
653 private Long start = null;
654 private Long end = null;
655 private Long id = null;
656
657 }
659 }
661
662
663