1
15
16 package gate.xml;
17
18 import java.lang.reflect.Constructor;
19 import java.util.*;
20
21 import org.xml.sax.*;
22 import org.xml.sax.helpers.DefaultHandler;
23
24 import gate.*;
25 import gate.corpora.DocumentContentImpl;
26 import gate.corpora.DocumentImpl;
27 import gate.event.StatusListener;
28 import gate.util.*;
29
30
31
35 public class GateFormatXmlDocumentHandler extends DefaultHandler{
36
37 private static final boolean DEBUG = false;
38
39
40 private StringBuffer contentBuffer = new StringBuffer("");
41
42
43 private boolean readCharacterStatus = false;
44
45
46
47 private static final int OLD = 1;
48
49 private static final int NEW = 2;
50
51 private static final int UNDEFINED = 0;
52
53
56 private int gateXmlFormatType = UNDEFINED;
57
58
63 private TreeSet annotationIdSet = new TreeSet();
64
65
/**
 * Builds a handler that populates the given document while the XML is parsed.
 *
 * @param aDocument the document to fill; its current content size is used as
 *        the initial capacity of the text buffer and its default annotation
 *        set becomes the initial target for annotations
 */
public GateFormatXmlDocumentHandler(gate.Document aDocument){
  int initialCapacity = aDocument.getContent().size().intValue();

  doc = aDocument;
  tmpDocContent = new StringBuffer(initialCapacity);
  colector = new LinkedList();
  currentAnnotationSet = aDocument.getAnnotations();
}
80
84 public void startDocument() throws org.xml.sax.SAXException {
85 }
87
95 public void endDocument() throws org.xml.sax.SAXException {
96
97 doc.setContent(new DocumentContentImpl(tmpDocContent.toString()));
99
101 if (gateXmlFormatType == NEW && !annotationIdSet.isEmpty()){
104 Integer maxAnnotID = (Integer) annotationIdSet.last();
107 ((DocumentImpl)doc).setNextAnnotationId(maxAnnotID.intValue() + 1);
109 annotationIdSet = null;
111 }
113 fireStatusChangedEvent("Total elements: " + elements);
115
116 }
118
122 public void startElement (String uri, String qName, String elemName,
123 Attributes atts) throws SAXException {
124
125 if(readCharacterStatus) {
127 readCharacterStatus = false;
128 charactersAction(new String(contentBuffer).toCharArray(),0,contentBuffer.length());
129 }
130
131 if ((++elements % ELEMENTS_RATE) == 0 )
134 fireStatusChangedEvent("Processed elements : " + elements);
135
136 currentElementStack.add(elemName);
138
139 if("AnnotationSet".equals(elemName))
140 processAnnotationSetElement(atts);
141
142 if("Annotation".equals(elemName))
143 processAnnotationElement(atts);
144
145 if("Feature".equals(elemName))
146 processFeatureElement(atts);
147
148 if("Name".equals(elemName))
149 processNameElement(atts);
150
151 if("Value".equals(elemName))
152 processValueElement(atts);
153
154 if("Node".equals(elemName))
155 processNodeElement(atts);
156 }
158
162 public void endElement (String uri, String qName, String elemName )
163 throws SAXException{
164
165 if(readCharacterStatus) {
167 readCharacterStatus = false;
168 charactersAction(new String(contentBuffer).toCharArray(),0,contentBuffer.length());
169 }
170
171 currentElementStack.pop();
172 if ("Annotation".equals(elemName)){
174 if (currentFeatureMap == null)
175 currentFeatureMap = Factory.newFeatureMap();
176 currentAnnot.setFM(currentFeatureMap);
177 colector.add(currentAnnot);
178 currentAnnot = null;
180 currentFeatureMap = null;
181 return;
182 } if ("Value".equals(elemName) && "Feature".equals(
185 (String)currentElementStack.peek())){
186 if (currentFeatureValue == null) currentFeatureValue = "";
188 } if ("Feature".equals(elemName)){
191 if(currentFeatureName == null){
192 throw new GateSaxException("A feature name was empty." +
195 "The annotation that cause it is " +
196 currentAnnot +
197 ".Please check the document with a text editor before trying again.");
198 }else {
199 if (currentFeatureMap == null){
200 throw new GateSaxException("Document not consistent. A start"+
202 " feature element is missing. " +
203 "The annotation that cause it is " +
204 currentAnnot +
205 "Please check the document with a text editor before trying again.");
206 } currentFeatureMap.put(createFeatKey(),createFeatValue());
211 currentFeatureKeyClassName = null;
214 currentFeatureKeyItemClassName = null;
215 currentFeatureName = null;
216 currentFeatureValueClassName = null;
218 currentFeatureValueItemClassName = null;
219 currentFeatureValue = null;
220 } currentFeatureName = null;
223 currentFeatureValue = null;
224 return;
225 } if ("GateDocumentFeatures".equals(elemName)){
228 if (currentFeatureMap == null)
229 currentFeatureMap = Factory.newFeatureMap();
230 doc.setFeatures(currentFeatureMap);
231 currentFeatureMap = null;
232 return;
233 }
235 if ("AnnotationSet".equals(elemName)){
237 Iterator iterator = colector.iterator();
239 while (iterator.hasNext()){
240 AnnotationObject annot = (AnnotationObject) iterator.next();
241 iterator.remove();
243
244 try{
246
247
253 if (annot.getId() == null){
255
257 if (gateXmlFormatType == NEW)
259 throw new GateSaxException("Found an annotation without ID while " +
261 "previous annotations had one." + "The NEW GATE XML document format requires" +
262 " all annotations to have an UNIQUE ID." +
263 " The offending annotation was of [type=" + annot.getElemName() +
264 ", startOffset=" + annot.getStart() +
265 ", endOffset=" + annot.getEnd() + "]");
266
267 gateXmlFormatType = OLD;
269 currentAnnotationSet.add( annot.getStart(),
270 annot.getEnd(),
271 annot.getElemName(),
272 annot.getFM());
273 }else{
274
276 if (gateXmlFormatType == OLD)
278 throw new GateSaxException("Found an annotation with ID while " +
280 "previous annotations didn't have one." + "The OLD GATE XML" +
281 "document format requires all annotations NOT to have an ID." +
282 " The offending annotation was of [Id=" + annot.getId() +
283 ", type=" + annot.getElemName() +
284 ", startOffset=" + annot.getStart() +
285 ", endOffset=" + annot.getEnd() + "]");
286
287 gateXmlFormatType = NEW;
288 testAnnotationIdUnicity(annot.getId());
291
292 currentAnnotationSet.add( annot.getId(),
294 annot.getStart(),
295 annot.getEnd(),
296 annot.getElemName(),
297 annot.getFM());
298 }
299 }catch (gate.util.InvalidOffsetException e){
300 throw new GateSaxException(e);
301 } } return;
305 }
307
308 }
310
315 public void characters(char [] text,int start,int length) throws SAXException {
316 if(!readCharacterStatus) {
317 contentBuffer = new StringBuffer(new String(text,start,length));
318 } else {
319 contentBuffer.append(new String(text,start,length));
320 }
321 readCharacterStatus = true;
322 }
323
324
327 public void charactersAction( char[] text,int start,int length) throws SAXException{
328 String content = new String(text, start, length);
330 if ("TextWithNodes".equals((String)currentElementStack.peek())){
331 processTextOfTextWithNodesElement(content);
332 return;
333 } if ("Name".equals((String)currentElementStack.peek())){
335 processTextOfNameElement(content);
336 return;
337 } if ("Value".equals((String)currentElementStack.peek())){
339 processTextOfValueElement(content);
344 return;
345 } }
348
351 public void ignorableWhitespace(char ch[],int start,int length) throws
352 SAXException{
353 }
355
358 public void error(SAXParseException ex) throws SAXException {
359 _seh.error(ex);
362 }
364
367 public void fatalError(SAXParseException ex) throws SAXException {
368 _seh.fatalError(ex);
371 }
373
376 public void warning(SAXParseException ex) throws SAXException {
377 _seh.warning(ex);
380 }
382
384
385
386 private void processAnnotationSetElement(Attributes atts){
387 if (atts != null){
388 for (int i = 0; i < atts.getLength(); i++) {
389 String attName = atts.getLocalName(i);
391 String attValue = atts.getValue(i);
392 if ("Name".equals(attName))
393 currentAnnotationSet = doc.getAnnotations(attValue);
394 } } }
398
399 private void processNameElement(Attributes atts){
400 if (atts == null) return;
401 currentFeatureKeyClassName = atts.getValue("className");
402 currentFeatureKeyItemClassName = atts.getValue("itemClassName");
403 }
405
406 private void processValueElement(Attributes atts){
407 if (atts == null) return;
408 currentFeatureValueClassName = atts.getValue("className");
409 currentFeatureValueItemClassName = atts.getValue("itemClassName");
410 }
412
413 private void processAnnotationElement(Attributes atts){
414 if (atts != null){
415 currentAnnot = new AnnotationObject();
416 for (int i = 0; i < atts.getLength(); i++) {
417 String attName = atts.getLocalName(i);
419 String attValue = atts.getValue(i);
420
421 if ("Id".equals(attName))
422 currentAnnot.setId(new Integer(attValue));
423
424 if ("Type".equals(attName))
425 currentAnnot.setElemName(attValue);
426
427 try{
428 if ("StartNode".equals(attName)){
429 Integer id = new Integer(attValue);
430 Long offset = (Long)id2Offset.get(id);
431 if (offset == null){
432 throw new GateRuntimeException("Couldn't found Node with id = " +
433 id +
434 ".It was specified in annot " +
435 currentAnnot+
436 " as a start node!" +
437 "Check the document with a text editor or something"+
438 " before trying again.");
439
440 }else
441 currentAnnot.setStart(offset);
442 } if ("EndNode".equals(attName)){
444 Integer id = new Integer(attValue);
445 Long offset = (Long) id2Offset.get(id);
446 if (offset == null){
447 throw new GateRuntimeException("Couldn't found Node with id = " +
448 id+
449 ".It was specified in annot " +
450 currentAnnot+
451 " as a end node!" +
452 "Check the document with a text editor or something"+
453 " before trying again.");
454 }else
455 currentAnnot.setEnd(offset);
456 } } catch (NumberFormatException e){
458 throw new GateRuntimeException("Offsets problems.Couldn't create"+
459 " Integers from" + " id[" +
460 attValue + "]) in annot " +
461 currentAnnot+
462 "Check the document with a text editor or something,"+
463 " before trying again");
464 } } } }
469
470 private void processFeatureElement(Attributes atts){
471 if (currentFeatureMap == null)
473 currentFeatureMap = Factory.newFeatureMap();
474 }
476
477 private void processNodeElement(Attributes atts){
478 if (atts != null){
479 for (int i = 0; i < atts.getLength(); i++) {
480 String attName = atts.getLocalName(i);
482 String attValue = atts.getValue(i);
483 if ("id".equals(attName)){
485 try{
486 Integer id = new Integer(attValue);
487 id2Offset.put(id,new Long(tmpDocContent.length()));
488 }catch(NumberFormatException e){
489 throw new GateRuntimeException("Coudn't create a node from " +
490 attValue + " Expected an integer.");
491 } } } } }
497
498 private void processTextOfTextWithNodesElement(String text){
499 text = recoverNewLineSequence(text);
500 tmpDocContent.append(text);
501 }
503
504 private String recoverNewLineSequence(String text) {
505 String result = text;
506
507 if(text.indexOf('\n') != -1) {
509 String newLineType =
510 (String) doc.getFeatures().get(GateConstants.DOCUMENT_NEW_LINE_TYPE);
511
512 if("LF".equalsIgnoreCase(newLineType)) {
513 newLineType = null;
514 }
515
516 if(newLineType == null) return result;
518
519 String newLine = "\n";
520 if("CRLF".equalsIgnoreCase(newLineType)) {
521 newLine = "\r\n";
522 }
523 if("CR".equalsIgnoreCase(newLineType)) {
524 newLine = "\r";
525 }
526 if("LFCR".equalsIgnoreCase(newLineType)) {
527 newLine = "\n\r";
528 }
529
530 StringBuffer buff = new StringBuffer(text);
531 int index = text.lastIndexOf('\n');
532 while(index != -1) {
533 buff.replace(index, index+1, newLine);
534 index = text.lastIndexOf('\n', index-1);
535 } result = buff.toString();
537 }
539 return result;
540 }
542
543 private void processTextOfNameElement(String text) throws GateSaxException{
544 if (currentFeatureMap == null)
545 throw new GateSaxException("GATE xml format processing error:" +
546 " Found a Name element that is not enclosed into a Feature one while" +
547 " analyzing the annotation " +
548 currentAnnot +
549 "Please check the document with a text editor or something before" +
550 " trying again.");
551 else{
552 if (currentFeatureName == null)
555 currentFeatureName = text;
556 else
557 currentFeatureName = currentFeatureName + text;
558 } }
561
562 private void processTextOfValueElement(String text) throws GateSaxException{
563 if (currentFeatureMap == null)
564 throw new GateSaxException("GATE xml format processing error:" +
565 " Found a Value element that is not enclosed into a Feature one while" +
566 " analyzing the annotation " +
567 currentAnnot+
568 "Please check the document with a text editor or something before" +
569 " trying again.");
570 else{
571 if (currentFeatureValue == null)
574 currentFeatureValue = text;
575 else
576 currentFeatureValue = currentFeatureValue + text;
577 } }
580
584 private Object createFeatKey(){
585 return createFeatObject(currentFeatureKeyClassName,
586 currentFeatureKeyItemClassName,
587 currentFeatureName);
588 }
590
594 private Object createFeatValue(){
595 return createFeatObject(currentFeatureValueClassName,
596 currentFeatureValueItemClassName,
597 currentFeatureValue);
598 }
600
618 private Object createFeatObject( String aFeatClassName,
619 String aFeatItemClassName,
620 String aFeatStringRepresentation){
621 if (aFeatStringRepresentation == null) return null;
623 if (aFeatClassName == null) aFeatClassName = "java.lang.String";
624 if (aFeatItemClassName == null) aFeatItemClassName = "java.lang.String";
625 Class currentFeatClass = null;
626 try{
627 currentFeatClass = Gate.getClassLoader().loadClass(aFeatClassName);
628 }catch (ClassNotFoundException cnfex){
629 return aFeatStringRepresentation;
630 } if (java.util.Collection.class.isAssignableFrom(currentFeatClass)){
632 Class itemClass = null;
633 Collection featObject = null;
634 try{
635 featObject = (Collection) currentFeatClass.newInstance();
636 try{
637 itemClass = Gate.getClassLoader().loadClass(aFeatItemClassName);
638 }catch(ClassNotFoundException cnfex){
639 Out.prln("Warning: Item class "+ aFeatItemClassName + " not found."+
640 "Adding items as Strings to the feature called \"" + currentFeatureName
641 + "\" in the annotation " + currentAnnot);
642 itemClass = java.lang.String.class;
643 } Class[] paramsArray = new Class[1];
646 paramsArray[0] = java.lang.String.class;
647 Constructor itemConstructor = null;
648 boolean addItemAsString = false;
649 try{
650 itemConstructor = itemClass.getConstructor(paramsArray);
651 }catch (NoSuchMethodException nsme){
652 addItemAsString = true;
653 }catch (SecurityException se){
654 addItemAsString = true;
655 } StringTokenizer strTok = new StringTokenizer(
657 aFeatStringRepresentation,";");
658 Object[] params = new Object[1];
659 Object itemObj = null;
660 while (strTok.hasMoreTokens()){
661 String itemStrRep = strTok.nextToken();
662 if (addItemAsString) featObject.add(itemStrRep);
663 else{
664 params[0] = itemStrRep;
665 try{
666 itemObj = itemConstructor.newInstance(params);
667 }catch (Exception e){
668 throw new GateRuntimeException("An item("+
669 itemStrRep +
670 ") does not comply with its class" +
671 " definition("+aFeatItemClassName+").Happened while tried to"+
672 " add feature: " +
673 aFeatStringRepresentation + " to the annotation " + currentAnnot);
674 } featObject.add(itemObj);
676 } } }catch(InstantiationException instex ){
679 return aFeatStringRepresentation;
680 }catch (IllegalAccessException iae){
681 return aFeatStringRepresentation;
682 } return featObject;
684 } Class[] params = new Class[1];
688 params[0] = java.lang.String.class;
689 try{
690 Constructor featConstr = currentFeatClass.getConstructor(params);
691 Object[] featConstrParams = new Object[1];
692 featConstrParams[0] = aFeatStringRepresentation;
693 Object featObject = featConstr.newInstance(featConstrParams);
694 return featObject;
695 } catch(Exception e){
696 return aFeatStringRepresentation;
697 } }
700
706 private void testAnnotationIdUnicity(Integer anAnnotId) throws GateSaxException{
707
708 if (annotationIdSet.contains(anAnnotId))
709 throw new GateSaxException("Found two or possibly more annotations with" +
710 " the same ID! The offending ID was " + anAnnotId );
711 else annotationIdSet.add(anAnnotId);
712 }
714
715
720 public void comment(String text) throws SAXException {
721 }
723
729 public void startCDATA()throws SAXException {
730 }
732
738 public void endCDATA() throws SAXException {
739 }
741
746 public void startParsedEntity(String name) throws SAXException {
747 }
749
755 public void endParsedEntity(String name, boolean included)throws SAXException{
756 }
758
760
763 public void addStatusListener(StatusListener listener){
764 myStatusListeners.add(listener);
765 }
769 public void removeStatusListener(StatusListener listener){
770 myStatusListeners.remove(listener);
771 }
776 protected void fireStatusChangedEvent(String text){
777 Iterator listenersIter = myStatusListeners.iterator();
778 while(listenersIter.hasNext())
779 ((StatusListener)listenersIter.next()).statusChanged(text);
780 }
782
784
788 final static int ELEMENTS_RATE = 128;
789
790
791 private SimpleErrorHandler _seh = new SimpleErrorHandler();
792
793
794 private StringBuffer tmpDocContent = new StringBuffer("");
795
796
797 private gate.Document doc = null;
798
799
800 protected List myStatusListeners = new LinkedList();
801
802
803 private int elements = 0;
804
805
810 private List colector = null;
811
814 private Map id2Offset = new TreeMap();
815
816 private Stack currentElementStack = new Stack();
817
820 private AnnotationObject currentAnnot = null;
821
822 private FeatureMap currentFeatureMap = null;
823
824 private String currentFeatureName = null;
825
826 private String currentFeatureValue = null;
827
828 private String currentFeatureKeyClassName = null;
829
832 private String currentFeatureKeyItemClassName = null;
833
834 private String currentFeatureValueClassName = null;
835
838 private String currentFeatureValueItemClassName = null;
839
842 private AnnotationSet currentAnnotationSet = null;
843
844
845 class AnnotationObject {
846
847 public AnnotationObject(){}
849
850 public String getElemName(){
851 return elemName;
852 }
854 public FeatureMap getFM(){
855 return fm;
856 }
858 public Long getStart(){
859 return start;
860 }
862 public Long getEnd(){
863 return end;
864 }
866 public void setElemName(String anElemName){
867 elemName = anElemName;
868 }
870 public void setFM(FeatureMap aFm){
871 fm = aFm;
872 }
874 public void setStart(Long aStart){
875 start = aStart;
876 }
878 public void setEnd(Long anEnd){
879 end = anEnd;
880 }
882 public Integer getId() {
883 return id;
884 }
886 public void setId(Integer anId) {
887 id = anId;
888 }
890 public String toString(){
891 return " [id =" + id +
892 " type=" + elemName +
893 " startNode=" + start+
894 " endNode=" + end+
895 " features="+ fm +"] ";
896 }
897
898 private String elemName = null;
900 private FeatureMap fm = null;
901 private Long start = null;
902 private Long end = null;
903 private Integer id = null;
904 } }
907