gate.corpora.DocumentImpl (Java2HTML)

1   /*
2    *  DocumentImpl.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentImpl.java,v 1.137 2006/03/08 15:56:44 cursu Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.IOException;
19  import java.net.URL;
20  import java.util.*;
21  
22  import gate.*;
23  import gate.annotation.AnnotationSetImpl;
24  import gate.creole.AbstractLanguageResource;
25  import gate.creole.ResourceInstantiationException;
26  import gate.event.*;
27  import gate.util.*;
28  
29  /** Represents the commonalities between all sorts of documents.
30    *
31    * <H2>Editing</H2>
32    *
33    * <P>
34    * The DocumentImpl class implements the Document interface.
35    * The DocumentContentImpl class models the textual or audio-visual
36    * materials which are the source and content of Documents.
37    * The AnnotationSetImpl class supplies annotations on Documents.
38    *
39    * <P>
40    * Abbreviations:
41    *
42    * <UL>
43    * <LI>
44    * DC = DocumentContent
45    * <LI>
46    * D = Document
47    * <LI>
48    * AS = AnnotationSet
49    * </UL>
50    *
51    * <P>
52    * We add an edit method to each of these classes; for DC and AS
53    * the methods are package private; D has the public method.
54    *
55    * <PRE>
56    *   void edit(Long start, Long end, DocumentContent replacement)
57    *   throws InvalidOffsetException;
58    * </PRE>
59    *
60    * <P>
61    * D receives edit requests and forwards them to DC and AS.
62    * On DC, this method makes a change to the content - e.g. replacing
63    * a String range from start to end with replacement. (Deletions
64    * are catered for by having replacement = null.) D then calls
65    * AS.edit on each of its annotation sets.
66    *
67    * <P>
68    * On AS, edit calls replacement.size() (i.e. DC.size()) to
69    * figure out how long the replacement is (0 for null). It then
70    * considers annotations that terminate (start or end) in
71    * the altered or deleted range as invalid; annotations that
72    * terminate after the range have their offsets adjusted.
73    * I.e.:
74    * <UL>
75    * <LI>
76    * the nodes that pointed inside the old modified area are invalid now and
77    * will be deleted along with the connected annotations;
78    * <LI>
79    * the nodes that are before the start of the modified area remain
80    * untouched;
81    * <LI>
82    * the nodes that are after the end of the affected area will have the
83    * offset changed according to the formula below.
84    * </UL>
85    *
86    * <P>
87    * A note re. AS and annotations: annotations no longer have
88    * offsets as in the old model, they now have nodes, and nodes
89    * have offsets.
90    *
91    * <P>
92    * To implement AS.edit, we have several indices:
93    * <PRE>
94    *   HashMap annotsByStartNode, annotsByEndNode;
95    * </PRE>
96    * which map node ids to annotations;
97    * <PRE>
98    *   RBTreeMap nodesByOffset;
99    * </PRE>
100   * which maps offset to Nodes.
101   *
102   * <P>
103   * When we get an edit request, we traverse that part of the
104   * nodesByOffset tree representing the altered or deleted
105   * range of the DC. For each node found, we delete any annotations
106   * that terminate on the node, and then delete the node itself.
107   * We then traverse the rest of the tree, changing the offset
108   * on all remaining nodes by:
109   * <PRE>
110   *   newOffset =
111   *     oldOffset -
112   *     (
113   *       (end - start) -                                     // size of mod
114   *       ( (replacement == null) ? 0 : replacement.size() )  // size of repl
115   *     );
116   * </PRE>
117   * Note that we use the same convention as e.g. java.lang.String: start
118   * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd"
119   * range 1-3 = "bc". Examples, for a node with offset 4:
120   * <PRE>
121   * edit(1, 3, "BC");
122   * newOffset = 4 - ( (3 - 1) - 2 ) = 4
123   *
124   * edit(1, 3, null);
125   * newOffset = 4 - ( (3 - 1) - 0 ) = 2
126   *
127   * edit(1, 3, "BBCC");
128   * newOffset = 4 - ( (3 - 1) - 4 ) = 6
129   * </PRE>
130   */
131 public class DocumentImpl
132 extends AbstractLanguageResource implements TextualDocument, CreoleListener,
133                                             DatastoreListener {
134   /** Debug flag */
135   private static final boolean DEBUG = false;
136 
137   /** If you set this flag to true the original content of the document will
138    *  be kept in the document feature. <br>
139    *  Default value is false to avoid the unnecessary waste of memory */
140   private Boolean preserveOriginalContent = new Boolean(false);
141 
142   /** If you set this flag to true the repositioning information for
143    *  the document will be kept in the document feature. <br>
144    *  Default value is false to avoid the unnecessary waste of time and memory
145    */
146   private Boolean collectRepositioningInfo = new Boolean(false);
147 
148   /**
149    * This is a variable which contains the latest crossed over annotation
150    * found during export with preserving format, i.e., toXml(annotations)
151    * method.
152    */
153   private Annotation crossedOverAnnotation = null;
154 
155   /** Default construction. Content left empty. */
156   public DocumentImpl() {
157     content = new DocumentContentImpl();
158     stringContent = "";
159   } // default construction
160 
161   /** Cover unpredictable Features creation */
162   public FeatureMap getFeatures() {
163     if (features == null) {
164       features = new SimpleFeatureMapImpl();
165     }
166     return features;
167   }
168 
169   /** Initialise this resource, and return it. */
170   public Resource init() throws ResourceInstantiationException {
171     // set up the source URL and create the content
172     if(sourceUrl == null) {
173       if(stringContent == null) {
174         throw new ResourceInstantiationException(
175           "The sourceURL and document's content were null."
176         );
177       }
178 
179       content = new DocumentContentImpl(stringContent);
180       getFeatures().put("gate.SourceURL", "created from String");
181     } else {
182       try {
183         content = new DocumentContentImpl(
184           sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
185         getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
186       } catch(IOException e) {
187         throw new ResourceInstantiationException("DocumentImpl.init: " + e);
188       }
189     }
190 
191     if(preserveOriginalContent.booleanValue() && content != null) {
192       String originalContent = new String(
193         ((DocumentContentImpl) content).getOriginalContent());
194       getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
195                     originalContent);
196     } // if
197     
198     // set up a DocumentFormat if markup unpacking required
199     if(getMarkupAware().booleanValue()) {
200       DocumentFormat docFormat =
201         DocumentFormat.getDocumentFormat(this, sourceUrl);
202       try {
203         if(docFormat != null){
204           StatusListener sListener = (StatusListener)
205                                       gate.gui.MainFrame.getListeners().
206                                       get("gate.event.StatusListener");
207           if(sListener != null) docFormat.addStatusListener(sListener);
208 
209           // set the flag if true and if the document format support collecting
210           docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
211 
212           if(docFormat.getShouldCollectRepositioning().booleanValue()) {
213             // unpack with collectiong of repositioning information
214             RepositioningInfo info = new RepositioningInfo();
215 
216             String origContent = (String) getFeatures().get(
217                 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
218 
219             RepositioningInfo ampCodingInfo = new RepositioningInfo();
220             if(origContent != null) {
221               boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
222               collectInformationForAmpCodding(origContent, ampCodingInfo,
223                                               shouldCorrectCR);
224               if(docFormat instanceof HtmlDocumentFormat) {
225                 collectInformationForWS(origContent, ampCodingInfo);
226               } // if
227             } // if
228 
229             docFormat.unpackMarkup(this, info, ampCodingInfo);
230 
231             if(origContent != null
232                 && docFormat instanceof XmlDocumentFormat) {
233               // CRLF correction of RepositioningInfo
234               correctRepositioningForCRLFInXML(origContent, info);
235             } // if
236 
237             getFeatures().put(
238                 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
239           }
240           else {
241             // normal old fashioned unpack
242             docFormat.unpackMarkup(this);
243           }
244           docFormat.removeStatusListener(sListener);
245        } //if format != null
246       } catch(DocumentFormatException e) {
247         throw new ResourceInstantiationException(
248           "Couldn't unpack markup in document " + (sourceUrl != null ? sourceUrl.toExternalForm() : "") +
249           "!", e);
250       }
251     } // if markup aware
252 
253 //try{
254 //  FileWriter fw = new FileWriter("d:/temp/doccontent.txt");
255 //  fw.write(getContent().toString());
256 //  fw.flush();
257 //  fw.close();
258 //}catch(IOException ioe){
259 //  ioe.printStackTrace();
260 //}
261 
262     return this;
263   } // init()
264 
265   /**
266    * Correct repositioning information for substitution of "\r\n" with "\n"
267    */
268   private void correctRepositioningForCRLFInXML(String content,
269                                             RepositioningInfo info) {
270     int index = -1;
271 
272     do {
273       index = content.indexOf("\r\n", index+1);
274       if(index != -1) {
275         info.correctInformationOriginalMove(index, 1);
276       } // if
277     } while(index != -1);
278   } // correctRepositioningForCRLF
279 
280   /**
281    * Collect information for substitution of "&xxx;" with "y"
282    *
283    * It couldn't be collected a position information about
284    * some unicode and &-coded symbols during parsing. The parser "hide" the
285    * information about the position of such kind of parsed text.
286    * So, there is minimal chance to have &-coded symbol inside the covered by
287    * repositioning records area. The new record should be created for every
288    * coded symbol outside the existing records.
289    * <BR>
290    * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction
291    * for CRLF substitution is performed.
292    */
293   private void collectInformationForAmpCodding(String content,
294                                             RepositioningInfo info,
295                                             boolean shouldCorrectCR) {
296 
297     if(content == null || info == null) return;
298 
299     int ampIndex = -1;
300     int semiIndex;
301 
302     do {
303       ampIndex = content.indexOf('&', ampIndex+1);
304       if(ampIndex != -1) {
305         semiIndex = content.indexOf(';', ampIndex+1);
306         // have semicolon and it is near enough for amp codding
307         if(semiIndex != -1 && (semiIndex-ampIndex) < 8) {
308           info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1);
309         }
310         else {
311           // no semicolon or it is too far
312           // analyse for amp codding without semicolon
313           int maxEnd = Math.min(ampIndex+8, content.length());
314           String ampCandidate = content.substring(ampIndex, maxEnd);
315           int ampCodingSize = analyseAmpCodding(ampCandidate);
316 
317           if(ampCodingSize != -1) {
318             info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
319           } // if
320 
321         } // if - semicolon found
322       } // if - ampersand found
323     } while (ampIndex != -1);
324 
325     // correct the collected information to adjust it's positions
326     // with reported by the parser
327     int index = -1;
328 
329     if(shouldCorrectCR) {
330       do {
331         index = content.indexOf("\r\n", index+1);
332         if(index != -1) {
333           info.correctInformationOriginalMove(index, -1);
334         } // if
335       } while(index != -1);
336     } // if
337   } // collectInformationForAmpCodding
338 
339   /**
340    * This function compute size of the ampersand codded sequence when
341    * semicolin is not present.
342    */
343   private int analyseAmpCodding(String content) {
344     int result = -1;
345 
346     try {
347       char ch = content.charAt(1);
348 
349       switch(ch) {
350         case 'l' : // &lt
351         case 'L' : // &lt
352           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
353             result = 3;
354           } // if
355           break;
356         case 'g' : // &gt
357         case 'G' : // &gt
358           if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
359             result = 3;
360           } // if
361           break;
362         case 'a' : // &amp
363         case 'A' : // &amp
364           if(content.substring(2, 4).equalsIgnoreCase("mp")) {
365             result = 4;
366           } // if
367           break;
368         case 'q' : // &quot
369         case 'Q' : // &quot
370           if(content.substring(2, 5).equalsIgnoreCase("uot")) {
371             result = 5;
372           } // if
373           break;
374         case '#' : // #number (example &#145, &#x4C38)
375           int endIndex = 2;
376           boolean hexCoded = false;
377           if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
378             // Hex codding
379             ++endIndex;
380             hexCoded = true;
381           } // if
382 
383           while (endIndex < 8
384                   && isNumber(content.charAt(endIndex), hexCoded) ) {
385             ++endIndex;
386           } // while
387           result = endIndex;
388           break;
389       } // switch
390     } catch (StringIndexOutOfBoundsException ex) {
391       // do nothing
392     } // catch
393 
394     return result;
395   } // analyseAmpCodding
396 
397   /** Check for numeric range. If hex is true the A..F range is included */
398   private boolean isNumber(char ch, boolean hex) {
399     if(ch >= '0' && ch <= '9') return true;
400 
401     if(hex) {
402       if(ch >= 'A' && ch <= 'F') return true;
403       if(ch >= 'a' && ch <= 'f') return true;
404     } // if
405 
406     return false;
407   } // isNumber
408 
409   /** HTML parser perform substitution of multiple whitespaces (WS) with
410    *  a single WS. To create correct repositioning information structure we
411    *  should keep the information for such multiple WS.
412    *  <BR>
413    *  The criteria for WS is <code>(ch <= ' ')</code>.
414    */
415   private void collectInformationForWS(String content, RepositioningInfo info) {
416 
417     if(content == null || info == null) return;
418 
419     // analyse the content and correct the repositioning information
420     char ch;
421     int startWS, endWS;
422 
423     startWS = endWS = -1;
424     int contentLength = content.length();
425 
426     for(int i=0; i<contentLength; ++i) {
427       ch = content.charAt(i);
428 
429       // is whitespace
430       if(ch <= ' ') {
431         if(startWS == -1) {
432           startWS = i;
433         } // if
434         endWS = i;
435       }
436       else {
437         if(endWS - startWS > 0) {
438           // put the repositioning information about the WS substitution
439           info.addPositionInfo(
440             (long)startWS, (long)(endWS - startWS + 1), 0, 1);
441         } // if
442         // clear positions
443         startWS = endWS = -1;
444       }// if
445     } // for
446   } // collectInformationForWS
447 
448   /** Clear all the data members of the object. */
449   public void cleanup() {
450 
451     defaultAnnots = null;
452     if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
453         namedAnnotSets.clear();
454     if (DEBUG) Out.prln("Document cleanup called");
455     if (this.lrPersistentId != null)
456       Gate.getCreoleRegister().removeCreoleListener(this);
457     if(this.getDataStore() != null)
458       this.getDataStore().removeDatastoreListener(this);
459   } // cleanup()
460 
461 
462   /** Documents are identified by URLs */
463   public URL getSourceUrl() { return sourceUrl; }
464 
465   /** Set method for the document's URL */
466   public void setSourceUrl(URL sourceUrl) {
467     this.sourceUrl = sourceUrl;
468   } // setSourceUrl
469 
470   /** Documents may be packed within files; in this case an optional pair of
471     * offsets refer to the location of the document.
472     */
473   public Long[] getSourceUrlOffsets() {
474     Long[] sourceUrlOffsets = new Long[2];
475     sourceUrlOffsets[0] = sourceUrlStartOffset;
476     sourceUrlOffsets[1] = sourceUrlEndOffset;
477     return sourceUrlOffsets;
478   } // getSourceUrlOffsets
479 
480   /**
481    * Allow/disallow preserving of the original document content.
482    * If is <B>true</B> the original content will be retrieved from
483    * the DocumentContent object and preserved as document feature.
484    */
485   public void setPreserveOriginalContent(Boolean b) {
486     preserveOriginalContent = b;
487   } // setPreserveOriginalContent
488 
489   /** Get the preserving of content status of the Document.
490    *
491    *  @return whether the Document should preserve it's original content.
492    */
493   public Boolean getPreserveOriginalContent() {
494     return preserveOriginalContent;
495   } // getPreserveOriginalContent
496 
497   /**
498    *  Allow/disallow collecting of repositioning information.
499    *  If is <B>true</B> information will be retrieved and preserved
500    *  as document feature.<BR>
501    *  Preserving of repositioning information give the possibilities
502    *  for converting of coordinates between the original document content and
503    *  extracted from the document text.
504    */
505   public void setCollectRepositioningInfo(Boolean b) {
506     collectRepositioningInfo = b;
507   } // setCollectRepositioningInfo
508 
509   /** Get the collectiong and preserving of repositioning information
510    *  for the Document. <BR>
511    *  Preserving of repositioning information give the possibilities
512    *  for converting of coordinates between the original document content and
513    *  extracted from the document text.
514    *
515    *  @return whether the Document should collect and preserve information.
516    */
517   public Boolean getCollectRepositioningInfo() {
518     return collectRepositioningInfo;
519   } // getCollectRepositioningInfo
520 
521   /** Documents may be packed within files; in this case an optional pair of
522     * offsets refer to the location of the document. This method gets the
523     * start offset.
524     */
525   public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
526 
527   /** Documents may be packed within files; in this case an optional pair of
528     * offsets refer to the location of the document. This method sets the
529     * start offset.
530     */
531   public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
532     this.sourceUrlStartOffset = sourceUrlStartOffset;
533   } // setSourceUrlStartOffset
534 
535   /** Documents may be packed within files; in this case an optional pair of
536     * offsets refer to the location of the document. This method gets the
537     * end offset.
538     */
539   public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
540 
541   /** Documents may be packed within files; in this case an optional pair of
542     * offsets refer to the location of the document. This method sets the
543     * end offset.
544     */
545   public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
546     this.sourceUrlEndOffset = sourceUrlEndOffset;
547   } // setSourceUrlStartOffset
548 
549   /** The content of the document: a String for text; MPEG for video; etc. */
550   public DocumentContent getContent() { return content; }
551 
552   /** Set method for the document content */
553   public void setContent(DocumentContent content) {
554     this.content = content;
555     this.stringContent = content.toString();
556   }
557 
558   /** Get the encoding of the document content source */
559   public String getEncoding() {
560     //we need to make sure we ALWAYS have an encoding
561     if(encoding == null || encoding.trim().length() == 0){
562       //no encoding definded: use the platform default
563       encoding = java.nio.charset.Charset.forName(
564           System.getProperty("file.encoding")).name();
565     }
566     return encoding;
567   }
568 
569   /** Set the encoding of the document content source */
570   public void setEncoding(String encoding) { this.encoding = encoding; }
571 
572   /** Get the default set of annotations. The set is created if it
573     * doesn't exist yet.
574     */
575   public AnnotationSet getAnnotations() {
576     if(defaultAnnots == null){
577       defaultAnnots = new AnnotationSetImpl(this);
578       fireAnnotationSetAdded(new DocumentEvent(
579            this, DocumentEvent.ANNOTATION_SET_ADDED, null));
580     }//if
581     return defaultAnnots;
582   } // getAnnotations()
583 
584   /** Get a named set of annotations. Creates a new set if one with this
585     * name doesn't exist yet.
586     * If the provided name is null then it returns the default annotation set.
587     */
588   public AnnotationSet getAnnotations(String name) {
589     if(name == null) return getAnnotations();
590     if(namedAnnotSets == null)
591       namedAnnotSets = new HashMap();
592     AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
593 
594     if(namedSet == null) {
595       namedSet = new AnnotationSetImpl(this, name);
596       namedAnnotSets.put(name, namedSet);
597 
598       DocumentEvent evt = new DocumentEvent(
599         this, DocumentEvent.ANNOTATION_SET_ADDED, name
600       );
601       fireAnnotationSetAdded(evt);
602     }
603     return namedSet;
604   } // getAnnotations(name)
605 
606   /** Make the document markup-aware. This will trigger the creation
607    *  of a DocumentFormat object at Document initialisation time; the
608    *  DocumentFormat object will unpack the markup in the Document and
609    *  add it as annotations. Documents are <B>not</B> markup-aware by default.
610    *
611    *  @param newMarkupAware markup awareness status.
612    */
613   public void setMarkupAware(Boolean newMarkupAware) {
614       this.markupAware = newMarkupAware;
615   }
616 
617   /** Get the markup awareness status of the Document.
618    *  <B>Documents are markup-aware by default.</B>
619    *  @return whether the Document is markup aware.
620    */
621   public Boolean getMarkupAware() { return markupAware; }
622 
623   /** Returns an XML document aming to preserve the original markups(
624     * the original markup will be in the same place and format as it was
625     * before processing the document) and include (if possible)
626     * the annotations specified in the aSourceAnnotationSet.
627     * It is equivalent to toXml(aSourceAnnotationSet, true).
628     */
629   public String toXml(Set aSourceAnnotationSet){
630     return toXml(aSourceAnnotationSet, true);
631   }
632 
633   /** Returns an XML document aming to preserve the original markups(
634     * the original markup will be in the same place and format as it was
635     * before processing the document) and include (if possible)
636     * the annotations specified in the aSourceAnnotationSet.
637     * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
638     * if they will cause a crosed over situation.
639     * @param aSourceAnnotationSet is an annotation set containing all the
640     * annotations that will be combined with the original marup set. If the
641     * param is <code>null</code> it will only dump the original markups.
642     * @param includeFeatures is a boolean that controls whether the annotation
643     * features should be included or not. If false, only the annotation type
644     * is included in the tag.
645     * @return a string representing an XML document containing the original
646     * markup + dumped annotations form the aSourceAnnotationSet
647     */
648   public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){
649 
650     if(hasOriginalContentFeatures()) {
651       return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
652     } // if
653 
654     AnnotationSet originalMarkupsAnnotSet =
655             this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
656 
657     // Create a dumping annotation set on the document. It will be used for
658     // dumping annotations...
659 //    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
660     List dumpingList = new ArrayList(originalMarkupsAnnotSet.size());
661 
662     // This set will be constructed inside this method. If is not empty, the
663     // annotation contained will be lost.
664 /*    if (!dumpingSet.isEmpty()){
665       Out.prln("WARNING: The dumping annotation set was not empty."+
666       "All annotation it contained were lost.");
667       dumpingSet.clear();
668     }// End if
669 */
670     StatusListener sListener = (StatusListener)
671                                gate.gui.MainFrame.getListeners().
672                                get("gate.event.StatusListener");
673     // Construct the dumping set in that way that all annotations will verify
674     // the condition that there are not annotations which are crossed.
675     // First add all annotation from the original markups
676     if(sListener != null)
677       sListener.statusChanged("Constructing the dumping annotation set.");
678 //    dumpingSet.addAll(originalMarkupsAnnotSet);
679     dumpingList.addAll(originalMarkupsAnnotSet);
680     // Then take all the annotations from aSourceAnnotationSet and verify if
681     // they can be inserted safely into the dumpingSet. Where not possible,
682     // report.
683     if (aSourceAnnotationSet != null){
684       Iterator iter = aSourceAnnotationSet.iterator();
685       while (iter.hasNext()){
686         Annotation currentAnnot = (Annotation) iter.next();
687         if(insertsSafety(dumpingList,currentAnnot)){
688 //          dumpingSet.add(currentAnnot);
689           dumpingList.add(currentAnnot);
690         }else if (crossedOverAnnotation != null && DEBUG){
691           try {
692             Out.prln("Warning: Annotations were found to violate the " +
693             "crossed over condition: \n" +
694             "1. [" +
695             getContent().getContent(
696                            crossedOverAnnotation.getStartNode().getOffset(),
697                            crossedOverAnnotation.getEndNode().getOffset()) +
698             " (" + crossedOverAnnotation.getType() + ": " +
699             crossedOverAnnotation.getStartNode().getOffset() +
700             ";" + crossedOverAnnotation.getEndNode().getOffset() +
701             ")]\n" +
702             "2. [" +
703             getContent().getContent(
704                            currentAnnot.getStartNode().getOffset(),
705                            currentAnnot.getEndNode().getOffset()) +
706             " (" + currentAnnot.getType() + ": " +
707             currentAnnot.getStartNode().getOffset() +
708             ";" + currentAnnot.getEndNode().getOffset() +
709             ")]\nThe second one will be discarded.\n"  );
710           } catch (gate.util.InvalidOffsetException ex) {
711             throw new GateRuntimeException(ex.getMessage());
712           }
713         }// End if
714       }// End while
715     }// End if
716 
717     //kalina: order the dumping list by start offset
718     Collections.sort(dumpingList, new gate.util.OffsetComparator());
719 
720     // The dumpingSet is ready to be exported as XML
721     // Here we go.
722     if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
723     StringBuffer xmlDoc = new StringBuffer(
724           DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
725 
726     // Add xml header if original format was xml
727     String mimeType = getFeatures() == null ?
728                       null :
729                       (String)getFeatures().get("MimeType");
730     boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
731 
732     if(wasXML){
733       xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
734       xmlDoc.append(getEncoding());
735       xmlDoc.append("\" ?>");
736       xmlDoc.append(Strings.getNl());
737     }// ENd if
738     // Identify and extract the root annotation from the dumpingSet.
739     theRootAnnotation = identifyTheRootAnnotation(dumpingList);
740     // If a root annotation has been identified then add it eplicitley at the
741     // beginning of the document
742     if (theRootAnnotation != null){
743       dumpingList.remove(theRootAnnotation);
744       xmlDoc.append(writeStartTag(theRootAnnotation,includeFeatures));
745     }// End if
746     // Construct and append the rest of the document
747     xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
748     // If a root annotation has been identified then add it eplicitley at the
749     // end of the document
750     if (theRootAnnotation != null){
751       xmlDoc.append(writeEndTag(theRootAnnotation));
752     }// End if
753 
754     if(sListener != null) sListener.statusChanged("Done.");
755     return xmlDoc.toString();
756   }//End toXml()
757 
758   /** This method verifies if aSourceAnnotation can ve inserted safety into the
759     * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
760     * contition with any annotation from the aTargetAnnotSet.
761     * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
762     * @param aSourceAnnotation the annotation to be inserted into the
763     * aTargetAnnotSet
764     * @return true if the annotation inserts safety, or false otherwise.
765     */
766   private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
767                                                 Annotation aSourceAnnotation){
768 
769     if (aTargetAnnotSet == null || aSourceAnnotation == null) {
770       this.crossedOverAnnotation = null;
771       return false;
772     }
773     if (aSourceAnnotation.getStartNode() == null ||
774         aSourceAnnotation.getStartNode().getOffset()== null) {
775       this.crossedOverAnnotation = null;
776       return false;
777     }
778     if (aSourceAnnotation.getEndNode() == null ||
779         aSourceAnnotation.getEndNode().getOffset()== null) {
780       this.crossedOverAnnotation = null;
781       return false;
782     }
783 
784     // Get the start and end offsets
785     Long start = aSourceAnnotation.getStartNode().getOffset();
786     Long end =   aSourceAnnotation.getEndNode().getOffset();
787     // Read aSourceAnnotation offsets long
788     long s2 = start.longValue();
789     long e2 = end.longValue();
790 
791     // Obtain a set with all annotations annotations that overlap
792     // totaly or partially with the interval defined by the two provided offsets
793     AnnotationSet as = aTargetAnnotSet.get(start,end);
794 
795     // Investigate all the annotations from as to see if there is one that
796     // comes in conflict with aSourceAnnotation
797     Iterator it = as.iterator();
798     while(it.hasNext()){
799       Annotation ann = (Annotation) it.next();
800       // Read ann offsets
801       long s1 = ann.getStartNode().getOffset().longValue();
802       long e1 = ann.getEndNode().getOffset().longValue();
803 
804       if (s1<s2 && s2<e1 && e1<e2) {
805         this.crossedOverAnnotation = ann;
806         return false;
807       }
808       if (s2<s1 && s1<e2 && e2<e1) {
809         this.crossedOverAnnotation = ann;
810         return false;
811       }
812     }// End while
813     return true;
814   }// insertsSafety()
815 
816   private boolean insertsSafety(List aTargetAnnotList,
817                                                 Annotation aSourceAnnotation){
818 
819     if (aTargetAnnotList == null || aSourceAnnotation == null) {
820       this.crossedOverAnnotation = null;
821       return false;
822     }
823     if (aSourceAnnotation.getStartNode() == null ||
824         aSourceAnnotation.getStartNode().getOffset()== null) {
825       this.crossedOverAnnotation = null;
826       return false;
827     }
828     if (aSourceAnnotation.getEndNode() == null ||
829         aSourceAnnotation.getEndNode().getOffset()== null) {
830       this.crossedOverAnnotation = null;
831       return false;
832     }
833 
834     // Get the start and end offsets
835     Long start = aSourceAnnotation.getStartNode().getOffset();
836     Long end =   aSourceAnnotation.getEndNode().getOffset();
837     // Read aSourceAnnotation offsets long
838     long s2 = start.longValue();
839     long e2 = end.longValue();
840 
841     // Obtain a set with all annotations annotations that overlap
842     // totaly or partially with the interval defined by the two provided offsets
843     List as = new ArrayList();
844     for (int i=0; i < aTargetAnnotList.size(); i++) {
845       Annotation annot = (Annotation) aTargetAnnotList.get(i);
846       if (annot.getStartNode().getOffset().longValue() >= s2
847           &&
848           annot.getStartNode().getOffset().longValue() <= e2)
849         as.add(annot);
850       else if (annot.getEndNode().getOffset().longValue() >= s2
851           &&
852           annot.getEndNode().getOffset().longValue() <= e2)
853         as.add(annot);
854     }
855 
856     // Investigate all the annotations from as to see if there is one that
857     // comes in conflict with aSourceAnnotation
858     Iterator it = as.iterator();
859     while(it.hasNext()){
860       Annotation ann = (Annotation) it.next();
861       // Read ann offsets
862       long s1 = ann.getStartNode().getOffset().longValue();
863       long e1 = ann.getEndNode().getOffset().longValue();
864 
865       if (s1<s2 && s2<e1 && e1<e2) {
866         this.crossedOverAnnotation = ann;
867         return false;
868       }
869       if (s2<s1 && s1<e2 && e2<e1) {
870         this.crossedOverAnnotation = ann;
871         return false;
872       }
873     }// End while
874     return true;
875   }// insertsSafety()
876 
877   /** This method saves all the annotations from aDumpAnnotSet and combines
878     * them with the document content.
879     * @param aDumpAnnotSet is a GATE annotation set prepared to be used
880     * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
881     * then an empty string will be returned.
882     * @param includeFeatures is a boolean, which controls whether the annotation
883     * features and gate ID are included or not.
884     * @return The XML document obtained from raw text + the information from
885     * the dump annotation set.
886     */
887   private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
888                                         boolean includeFeatures){
889     String content = null;
890     if (this.getContent()== null)
891       content = new String("");
892     else
893       content = this.getContent().toString();
894     StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
895     if (aDumpAnnotSet == null)   return docContStrBuff.toString();
896 
897     TreeMap offsets2CharsMap = new TreeMap();
898     if (this.getContent().size().longValue() != 0){
899       // Fill the offsets2CharsMap with all the indices where
900       // special chars appear
901       buildEntityMapFromString(content,offsets2CharsMap);
902     }//End if
903     // The saving alghorithm is as follows:
904     ///////////////////////////////////////////
905     // Construct a set of annot with all IDs in asc order.
906     // All annotations that end at that offset swap their place in descending
907     // order. For each node write all the tags from left to right.
908 
909     // Construct the node set
910     TreeSet offsets = new TreeSet();
911     Iterator iter = aDumpAnnotSet.iterator();
912     while (iter.hasNext()){
913       Annotation annot = (Annotation) iter.next();
914       offsets.add(annot.getStartNode().getOffset());
915       offsets.add(annot.getEndNode().getOffset());
916     }// End while
917 
918     // ofsets is sorted in ascending order.
919     // Iterate this set in descending order and remove an offset at each
920     // iteration
921     while (!offsets.isEmpty()){
922       Long offset = (Long)offsets.last();
923       // Remove the offset from the set
924       offsets.remove(offset);
925       // Now, use it.
926       // Returns a list with annotations that needs to be serialized in that
927       // offset.
928       List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
929       // Attention: the annotation are serialized from left to right
930 //      StringBuffer tmpBuff = new StringBuffer("");
931       StringBuffer tmpBuff = new StringBuffer(
932           DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
933       Stack stack = new Stack();
934       // Iterate through all these annotations and serialize them
935       Iterator it = annotations.iterator();
936       while(it.hasNext()){
937         Annotation a = (Annotation) it.next();
938         it.remove();
939         // Test if a Ends at offset
940         if ( offset.equals(a.getEndNode().getOffset()) ){
941           // Test if a Starts at offset
942           if ( offset.equals(a.getStartNode().getOffset()) ){
943             // Here, the annotation a Starts and Ends at the offset
944             if ( null != a.getFeatures().get("isEmptyAndSpan") &&
945                  "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
946 
947               // Assert: annotation a with start == end and isEmptyAndSpan
948               tmpBuff.append(writeStartTag(a, includeFeatures));
949               stack.push(a);
950             }else{
951               // Assert annotation a with start == end and an empty tag
952               tmpBuff.append(writeEmptyTag(a));
953               // The annotation is removed from dumped set
954               aDumpAnnotSet.remove(a);
955             }// End if
956           }else{
957             // Here the annotation a Ends at the offset.
958             // In this case empty the stack and write the end tag
959             if (!stack.isEmpty()){
960               while(!stack.isEmpty()){
961                 Annotation a1 = (Annotation)stack.pop();
962                 tmpBuff.append(writeEndTag(a1));
963               }// End while
964             }// End if
965             tmpBuff.append(writeEndTag(a));
966           }// End if
967         }else{
968           // The annotation a does NOT end at the offset. Let's see if it starts
969           // at the offset
970           if ( offset.equals(a.getStartNode().getOffset()) ){
971             // The annotation a starts at the offset.
972             // In this case empty the stack and write the end tag
973             if (!stack.isEmpty()){
974               while(!stack.isEmpty()){
975                 Annotation a1 = (Annotation)stack.pop();
976                 tmpBuff.append(writeEndTag(a1));
977               }// End while
978             }// End if
979             tmpBuff.append(writeStartTag(a, includeFeatures));
980             // The annotation is removed from dumped set
981             aDumpAnnotSet.remove(a);
982           }// End if ( offset.equals(a.getStartNode().getOffset()) )
983         }// End if ( offset.equals(a.getEndNode().getOffset()) )
984       }// End while(it.hasNext()){
985 
986       // In this case empty the stack and write the end tag
987       if (!stack.isEmpty()){
988         while(!stack.isEmpty()){
989           Annotation a1 = (Annotation)stack.pop();
990           tmpBuff.append(writeEndTag(a1));
991         }// End while
992       }// End if
993 
994       // Before inserting tmpBuff into docContStrBuff we need to check
995       // if there are chars to be replaced and if there are, they would be
996       // replaced.
997       if (!offsets2CharsMap.isEmpty()){
998         Long offsChar = (Long) offsets2CharsMap.lastKey();
999         while( !offsets2CharsMap.isEmpty() &&
1000                       offsChar.intValue() >= offset.intValue()){
1001          // Replace the char at offsChar with its corresponding entity form
1002          // the entitiesMap.
1003          docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1004          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1005          // Discard the offsChar after it was used.
1006          offsets2CharsMap.remove(offsChar);
1007          // Investigate next offsChar
1008          if (!offsets2CharsMap.isEmpty())
1009            offsChar = (Long) offsets2CharsMap.lastKey();
1010        }// End while
1011      }// End if
1012      // Insert tmpBuff to the location where it belongs in docContStrBuff
1013      docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1014    }// End while(!offsets.isEmpty())
1015    // Need to replace the entities in the remaining text, if there is any text
1016    // So, if there are any more items in offsets2CharsMap they need to be
1017    // replaced
1018    while (!offsets2CharsMap.isEmpty()){
1019      Long offsChar = (Long) offsets2CharsMap.lastKey();
1020      // Replace the char with its entity
1021      docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1022      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1023      // remove the offset from the map
1024      offsets2CharsMap.remove(offsChar);
1025    }// End while
1026    return docContStrBuff.toString();
1027  }// saveAnnotationSetAsXml()
1028
1029  private String saveAnnotationSetAsXml(List aDumpAnnotList,
1030                                        boolean includeFeatures){
1031    String content = null;
1032    if (this.getContent()== null)
1033      content = new String("");
1034    else
1035      content = this.getContent().toString();
1036    StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1037    if (aDumpAnnotList == null)   return docContStrBuff.toString();
1038
1039    StringBuffer resultStrBuff = new StringBuffer(
1040        DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1041    // last offset position used to extract portions of text
1042    Long lastOffset = new Long(0);
1043
1044    TreeMap offsets2CharsMap = new TreeMap();
1045    HashMap annotsForOffset = new HashMap(100);
1046    if (this.getContent().size().longValue() != 0){
1047      // Fill the offsets2CharsMap with all the indices where
1048      // special chars appear
1049      buildEntityMapFromString(content,offsets2CharsMap);
1050    }//End if
1051    // The saving alghorithm is as follows:
1052    ///////////////////////////////////////////
1053    // Construct a set of annot with all IDs in asc order.
1054    // All annotations that end at that offset swap their place in descending
1055    // order. For each node write all the tags from left to right.
1056
1057    // Construct the node set
1058    TreeSet offsets = new TreeSet();
1059    Iterator iter = aDumpAnnotList.iterator();
1060    Annotation annot;
1061    Long start;
1062    Long end;
1063    while (iter.hasNext()){
1064      annot = (Annotation) iter.next();
1065      start = annot.getStartNode().getOffset();
1066      end = annot.getEndNode().getOffset();
1067      offsets.add(start);
1068      offsets.add(end);
1069      if (annotsForOffset.containsKey(start)) {
1070        ((List) annotsForOffset.get(start)).add(annot);
1071      } else {
1072        List newList = new ArrayList(10);
1073        newList.add(annot);
1074        annotsForOffset.put(start, newList);
1075      }
1076      if (annotsForOffset.containsKey(end)) {
1077        ((List) annotsForOffset.get(end)).add(annot);
1078      } else {
1079        List newList = new ArrayList(10);
1080        newList.add(annot);
1081        annotsForOffset.put(end, newList);
1082      }
1083    }// End while
1084
1085    // ofsets is sorted in ascending order.
1086    // Iterate this set in descending order and remove an offset at each
1087    // iteration
1088    Iterator offsetIt = offsets.iterator();
1089    Long offset;
1090    List annotations;
1091    // This don't have to be a large buffer - just for tags
1092    StringBuffer tmpBuff = new StringBuffer(255);
1093    Stack stack = new Stack();
1094    while (offsetIt.hasNext()){
1095      offset = (Long)offsetIt.next();
1096      // Now, use it.
1097      // Returns a list with annotations that needs to be serialized in that
1098      // offset.
1099      annotations = (List) annotsForOffset.get(offset);
1100      // order annotations in list for offset to print tags in correct order
1101      annotations = getAnnotationsForOffset(annotations, offset);
1102      // clear structures
1103      tmpBuff.setLength(0);
1104      stack.clear();
1105
1106      // Iterate through all these annotations and serialize them
1107      Iterator it = annotations.iterator();
1108      Annotation a;
1109      Annotation annStack;
1110      while(it.hasNext()){
1111        a = (Annotation) it.next();
1112        // Test if a Ends at offset
1113        if ( offset.equals(a.getEndNode().getOffset()) ){
1114          // Test if a Starts at offset
1115          if ( offset.equals(a.getStartNode().getOffset()) ){
1116            // Here, the annotation a Starts and Ends at the offset
1117            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1118                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1119
1120              // Assert: annotation a with start == end and isEmptyAndSpan
1121              tmpBuff.append(writeStartTag(a, includeFeatures));
1122              stack.push(a);
1123            }else{
1124              // Assert annotation a with start == end and an empty tag
1125              tmpBuff.append(writeEmptyTag(a));
1126              // The annotation is removed from dumped set
1127              aDumpAnnotList.remove(a);
1128            }// End if
1129          }else{
1130            // Here the annotation a Ends at the offset.
1131            // In this case empty the stack and write the end tag
1132            if (!stack.isEmpty()){
1133              while(!stack.isEmpty()){
1134                annStack = (Annotation)stack.pop();
1135                tmpBuff.append(writeEndTag(annStack));
1136              }// End while
1137            }// End if
1138            tmpBuff.append(writeEndTag(a));
1139          }// End if
1140        }else{
1141          // The annotation a does NOT end at the offset. Let's see if it starts
1142          // at the offset
1143          if ( offset.equals(a.getStartNode().getOffset()) ){
1144            // The annotation a starts at the offset.
1145            // In this case empty the stack and write the end tag
1146            if (!stack.isEmpty()){
1147              while(!stack.isEmpty()){
1148                annStack = (Annotation)stack.pop();
1149                tmpBuff.append(writeEndTag(annStack));
1150              }// End while
1151            }// End if
1152            tmpBuff.append(writeStartTag(a, includeFeatures));
1153            // The annotation is removed from dumped set
1154          }// End if ( offset.equals(a.getStartNode().getOffset()) )
1155        }// End if ( offset.equals(a.getEndNode().getOffset()) )
1156      }// End while(it.hasNext()){
1157
1158      // In this case empty the stack and write the end tag
1159      if (!stack.isEmpty()){
1160        while(!stack.isEmpty()){
1161          annStack = (Annotation)stack.pop();
1162          tmpBuff.append(writeEndTag(annStack));
1163        }// End while
1164      }// End if
1165
1166      // extract text from content and replace spec chars
1167      StringBuffer partText = new StringBuffer();
1168      SortedMap offsetsInRange =
1169          offsets2CharsMap.subMap(lastOffset, offset);
1170      Long tmpOffset;
1171      Long tmpLastOffset = lastOffset;
1172      String replacement;
1173
1174      // Before inserting tmpBuff into the buffer we need to check
1175      // if there are chars to be replaced in range
1176      if(!offsetsInRange.isEmpty()) {
1177        tmpOffset = (Long) offsetsInRange.firstKey();
1178        replacement =
1179            (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1180        partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1181                                               tmpOffset.intValue()));
1182        partText.append(replacement);
1183        tmpLastOffset = new Long(tmpOffset.longValue()+1);
1184      }
1185      partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1186                                               offset.intValue()));
1187      resultStrBuff.append(partText);
1188      // Insert tmpBuff to the result string
1189      resultStrBuff.append(tmpBuff.toString());
1190      lastOffset = offset;
1191    }// End while(!offsets.isEmpty())
1192
1193    // get text to the end of content
1194    // extract text from content and replace spec chars
1195    StringBuffer partText = new StringBuffer();
1196    SortedMap offsetsInRange =
1197        offsets2CharsMap.subMap(lastOffset, new Long(docContStrBuff.length()));
1198    Long tmpOffset;
1199    Long tmpLastOffset = lastOffset;
1200    String replacement;
1201
1202    // Need to replace the entities in the remaining text, if there is any text
1203    // So, if there are any more items in offsets2CharsMap for remaining text
1204    // they need to be replaced
1205    if(!offsetsInRange.isEmpty()) {
1206      tmpOffset = (Long) offsetsInRange.firstKey();
1207      replacement =
1208          (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1209      partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1210                                             tmpOffset.intValue()));
1211      partText.append(replacement);
1212      tmpLastOffset = new Long(tmpOffset.longValue()+1);
1213    }
1214    partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1215                                             docContStrBuff.length()));
1216    resultStrBuff.append(partText);
1217
1218    return resultStrBuff.toString();
1219  }// saveAnnotationSetAsXml()
1220
1221/* Old method created by Cristian. Create content backward.
1222
1223    private String saveAnnotationSetAsXml(List aDumpAnnotList,
1224                                          boolean includeFeatures){
1225      String content = null;
1226      if (this.getContent()== null)
1227        content = new String("");
1228      else
1229        content = this.getContent().toString();
1230      StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1231      if (aDumpAnnotList == null)   return docContStrBuff.toString();
1232
1233      TreeMap offsets2CharsMap = new TreeMap();
1234      HashMap annotsForOffset = new HashMap(100);
1235      if (this.getContent().size().longValue() != 0){
1236        // Fill the offsets2CharsMap with all the indices where
1237        // special chars appear
1238        buildEntityMapFromString(content,offsets2CharsMap);
1239      }//End if
1240      // The saving alghorithm is as follows:
1241      ///////////////////////////////////////////
1242      // Construct a set of annot with all IDs in asc order.
1243      // All annotations that end at that offset swap their place in descending
1244      // order. For each node write all the tags from left to right.
1245
1246      // Construct the node set
1247      TreeSet offsets = new TreeSet();
1248      Iterator iter = aDumpAnnotList.iterator();
1249      while (iter.hasNext()){
1250        Annotation annot = (Annotation) iter.next();
1251        offsets.add(annot.getStartNode().getOffset());
1252        offsets.add(annot.getEndNode().getOffset());
1253        if (annotsForOffset.containsKey(annot.getStartNode().getOffset())) {
1254          ((List) annotsForOffset.get(annot.getStartNode().getOffset())).add(annot);
1255        } else {
1256          List newList = new ArrayList(10);
1257          newList.add(annot);
1258          annotsForOffset.put(annot.getStartNode().getOffset(), newList);
1259        }
1260        if (annotsForOffset.containsKey(annot.getEndNode().getOffset())) {
1261          ((List) annotsForOffset.get(annot.getEndNode().getOffset())).add(annot);
1262        } else {
1263          List newList = new ArrayList(10);
1264          newList.add(annot);
1265          annotsForOffset.put(annot.getEndNode().getOffset(), newList);
1266        }
1267      }// End while
1268
1269      // ofsets is sorted in ascending order.
1270      // Iterate this set in descending order and remove an offset at each
1271      // iteration
1272      while (!offsets.isEmpty()){
1273        Long offset = (Long)offsets.last();
1274        // Remove the offset from the set
1275        offsets.remove(offset);
1276        // Now, use it.
1277        // Returns a list with annotations that needs to be serialized in that
1278        // offset.
1279//      List annotations = getAnnotationsForOffset(aDumpAnnotList,offset);
1280        List annotations = (List) annotsForOffset.get(offset);
1281        annotations = getAnnotationsForOffset(annotations,offset);
1282        // Attention: the annotation are serialized from left to right
1283//      StringBuffer tmpBuff = new StringBuffer("");
1284        StringBuffer tmpBuff = new StringBuffer(
1285            DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1286        Stack stack = new Stack();
1287        // Iterate through all these annotations and serialize them
1288        Iterator it = annotations.iterator();
1289        while(it.hasNext()){
1290          Annotation a = (Annotation) it.next();
1291          it.remove();
1292          // Test if a Ends at offset
1293          if ( offset.equals(a.getEndNode().getOffset()) ){
1294            // Test if a Starts at offset
1295            if ( offset.equals(a.getStartNode().getOffset()) ){
1296              // Here, the annotation a Starts and Ends at the offset
1297              if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1298                   "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1299
1300                // Assert: annotation a with start == end and isEmptyAndSpan
1301                tmpBuff.append(writeStartTag(a, includeFeatures));
1302                stack.push(a);
1303              }else{
1304                // Assert annotation a with start == end and an empty tag
1305                tmpBuff.append(writeEmptyTag(a));
1306                // The annotation is removed from dumped set
1307                aDumpAnnotList.remove(a);
1308              }// End if
1309            }else{
1310              // Here the annotation a Ends at the offset.
1311              // In this case empty the stack and write the end tag
1312              if (!stack.isEmpty()){
1313                while(!stack.isEmpty()){
1314                  Annotation a1 = (Annotation)stack.pop();
1315                  tmpBuff.append(writeEndTag(a1));
1316                }// End while
1317              }// End if
1318              tmpBuff.append(writeEndTag(a));
1319            }// End if
1320          }else{
1321            // The annotation a does NOT end at the offset. Let's see if it starts
1322            // at the offset
1323            if ( offset.equals(a.getStartNode().getOffset()) ){
1324              // The annotation a starts at the offset.
1325              // In this case empty the stack and write the end tag
1326              if (!stack.isEmpty()){
1327                while(!stack.isEmpty()){
1328                  Annotation a1 = (Annotation)stack.pop();
1329                  tmpBuff.append(writeEndTag(a1));
1330                }// End while
1331              }// End if
1332              tmpBuff.append(writeStartTag(a, includeFeatures));
1333              // The annotation is removed from dumped set
1334              aDumpAnnotList.remove(a);
1335            }// End if ( offset.equals(a.getStartNode().getOffset()) )
1336          }// End if ( offset.equals(a.getEndNode().getOffset()) )
1337        }// End while(it.hasNext()){
1338
1339        // In this case empty the stack and write the end tag
1340        if (!stack.isEmpty()){
1341          while(!stack.isEmpty()){
1342            Annotation a1 = (Annotation)stack.pop();
1343            tmpBuff.append(writeEndTag(a1));
1344          }// End while
1345        }// End if
1346
1347        // Before inserting tmpBuff into docContStrBuff we need to check
1348        // if there are chars to be replaced and if there are, they would be
1349        // replaced.
1350        if (!offsets2CharsMap.isEmpty()){
1351          Long offsChar = (Long) offsets2CharsMap.lastKey();
1352          while( !offsets2CharsMap.isEmpty() &&
1353                         offsChar.intValue() >= offset.intValue()){
1354            // Replace the char at offsChar with its corresponding entity form
1355            // the entitiesMap.
1356            docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1357            (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1358            // Discard the offsChar after it was used.
1359            offsets2CharsMap.remove(offsChar);
1360            // Investigate next offsChar
1361            if (!offsets2CharsMap.isEmpty())
1362              offsChar = (Long) offsets2CharsMap.lastKey();
1363          }// End while
1364        }// End if
1365        // Insert tmpBuff to the location where it belongs in docContStrBuff
1366        docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1367      }// End while(!offsets.isEmpty())
1368      // Need to replace the entities in the remaining text, if there is any text
1369      // So, if there are any more items in offsets2CharsMap they need to be
1370      // replaced
1371      while (!offsets2CharsMap.isEmpty()){
1372        Long offsChar = (Long) offsets2CharsMap.lastKey();
1373        // Replace the char with its entity
1374        docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1375        (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1376        // remove the offset from the map
1377        offsets2CharsMap.remove(offsChar);
1378      }// End while
1379      return docContStrBuff.toString();
1380    }// saveAnnotationSetAsXml()
1381*/
1382
1383  /**
1384   *  Return true only if the document has features for original content and
1385   *  repositioning information.
1386   */
1387  private boolean hasOriginalContentFeatures() {
1388    FeatureMap features = getFeatures();
1389    boolean result = false;
1390
1391    result =
1392    (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
1393      &&
1394    (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
1395      != null);
1396
1397    return result;
1398  } // hasOriginalContentFeatures
1399
1400  /** This method saves all the annotations from aDumpAnnotSet and combines
1401    * them with the original document content, if preserved as feature.
1402    * @param aSourceAnnotationSet is a GATE annotation set prepared to be used
1403    * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
1404    * then an empty string will be returned.
1405    * @param includeFeatures is a boolean, which controls whether the annotation
1406    * features and gate ID are included or not.
1407    * @return The XML document obtained from raw text + the information from
1408    * the dump annotation set.
1409    */
1410  private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
1411                                        boolean includeFeatures){
1412    StringBuffer docContStrBuff;
1413
1414    String origContent;
1415
1416    origContent =
1417     (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
1418    if(origContent == null) {
1419      origContent = "";
1420    } // if
1421
1422    long originalContentSize = origContent.length();
1423
1424    RepositioningInfo repositioning = (RepositioningInfo)
1425      getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
1426
1427    docContStrBuff = new StringBuffer(origContent);
1428    if (aSourceAnnotationSet == null) return docContStrBuff.toString();
1429
1430    StatusListener sListener = (StatusListener)
1431                               gate.gui.MainFrame.getListeners().
1432                               get("gate.event.StatusListener");
1433
1434    AnnotationSet originalMarkupsAnnotSet =
1435            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1436    // Create a dumping annotation set on the document. It will be used for
1437    // dumping annotations...
1438    AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
1439    if(sListener != null)
1440      sListener.statusChanged("Constructing the dumping annotation set.");
1441    // Then take all the annotations from aSourceAnnotationSet and verify if
1442    // they can be inserted safely into the dumpingSet. Where not possible,
1443    // report.
1444    if (aSourceAnnotationSet != null){
1445      Iterator iter = aSourceAnnotationSet.iterator();
1446      Annotation currentAnnot;
1447      while (iter.hasNext()){
1448        currentAnnot = (Annotation) iter.next();
1449        if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1450            && insertsSafety(dumpingSet, currentAnnot)){
1451          dumpingSet.add(currentAnnot);
1452        }else{
1453          Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
1454          ", startOffset=" + currentAnnot.getStartNode().getOffset() +
1455          ", endOffset=" + currentAnnot.getEndNode().getOffset() +
1456          ", type=" + currentAnnot.getType()+ " was found to violate the" +
1457          " crossed over condition. It will be discarded");
1458        }// End if
1459      }// End while
1460    }// End if
1461
1462    // The dumpingSet is ready to be exported as XML
1463    // Here we go.
1464    if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
1465
1466    ///////////////////////////////////////////
1467    // Construct a set of annot with all IDs in asc order.
1468    // All annotations that end at that offset swap their place in descending
1469    // order. For each node write all the tags from left to right.
1470
1471    // Construct the node set
1472    TreeSet offsets = new TreeSet();
1473    Iterator iter = aSourceAnnotationSet.iterator();
1474    while (iter.hasNext()){
1475      Annotation annot = (Annotation) iter.next();
1476      offsets.add(annot.getStartNode().getOffset());
1477      offsets.add(annot.getEndNode().getOffset());
1478    }// End while
1479
1480    // ofsets is sorted in ascending order.
1481    // Iterate this set in descending order and remove an offset at each
1482    // iteration
1483    while (!offsets.isEmpty()){
1484      Long offset = (Long)offsets.last();
1485      // Remove the offset from the set
1486      offsets.remove(offset);
1487      // Now, use it.
1488      // Returns a list with annotations that needs to be serialized in that
1489      // offset.
1490      List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
1491      // Attention: the annotation are serialized from left to right
1492      StringBuffer tmpBuff = new StringBuffer("");
1493      Stack stack = new Stack();
1494      // Iterate through all these annotations and serialize them
1495      Iterator it = annotations.iterator();
1496      Annotation a = null;
1497      while(it.hasNext()) {
1498        a = (Annotation) it.next();
1499        it.remove();
1500        // Test if a Ends at offset
1501        if ( offset.equals(a.getEndNode().getOffset()) ){
1502          // Test if a Starts at offset
1503          if ( offset.equals(a.getStartNode().getOffset()) ){
1504            // Here, the annotation a Starts and Ends at the offset
1505            if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1506                 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1507
1508              // Assert: annotation a with start == end and isEmptyAndSpan
1509              tmpBuff.append(writeStartTag(a, includeFeatures, false));
1510              stack.push(a);
1511            }else{
1512              // Assert annotation a with start == end and an empty tag
1513              tmpBuff.append(writeEmptyTag(a, false));
1514              // The annotation is removed from dumped set
1515              aSourceAnnotationSet.remove(a);
1516            }// End if
1517          }else{
1518            // Here the annotation a Ends at the offset.
1519            // In this case empty the stack and write the end tag
1520            while(!stack.isEmpty()){
1521              Annotation a1 = (Annotation)stack.pop();
1522              tmpBuff.append(writeEndTag(a1));
1523            }// End while
1524            tmpBuff.append(writeEndTag(a));
1525          }// End if
1526        }else{
1527          // The annotation a does NOT end at the offset. Let's see if it starts
1528          // at the offset
1529          if ( offset.equals(a.getStartNode().getOffset()) ){
1530            // The annotation a starts at the offset.
1531            // In this case empty the stack and write the end tag
1532            while(!stack.isEmpty()){
1533              Annotation a1 = (Annotation)stack.pop();
1534              tmpBuff.append(writeEndTag(a1));
1535            }// End while
1536
1537            tmpBuff.append(writeStartTag(a, includeFeatures, false));
1538            // The annotation is removed from dumped set
1539            aSourceAnnotationSet.remove(a);
1540          }// End if ( offset.equals(a.getStartNode().getOffset()) )
1541        }// End if ( offset.equals(a.getEndNode().getOffset()) )
1542      }// End while(it.hasNext()){
1543
1544      // In this case empty the stack and write the end tag
1545      while(!stack.isEmpty()){
1546        Annotation a1 = (Annotation)stack.pop();
1547        tmpBuff.append(writeEndTag(a1));
1548      }// End while
1549
1550      long originalPosition = -1;
1551      boolean backPositioning =
1552        a != null && offset.equals(a.getEndNode().getOffset());
1553      if ( backPositioning ) {
1554        // end of the annotation correction
1555        originalPosition =
1556          repositioning.getOriginalPos(offset.intValue(), true);
1557      } // if
1558
1559      if(originalPosition == -1) {
1560        originalPosition = repositioning.getOriginalPos(offset.intValue());
1561      } // if
1562
1563      // Insert tmpBuff to the location where it belongs in docContStrBuff
1564      if(originalPosition != -1 && originalPosition <= originalContentSize ) {
1565        docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
1566      }
1567      else {
1568        Out.prln("Error in the repositioning. The offset ("+offset.intValue()
1569        +") could not be positioned in the original document. \n"
1570        +"Calculated position is: "+originalPosition
1571        +" placed back: "+backPositioning);
1572      } // if
1573
1574    }// End while(!offsets.isEmpty())
1575    if (theRootAnnotation != null)
1576      docContStrBuff.append(writeEndTag(theRootAnnotation));
1577    return docContStrBuff.toString();
1578  } // saveAnnotationSetAsXmlInOrig()
1579
1580  /** This method returns a list with annotations ordered that way that
1581    * they can be serialized from left to right, at the offset. If one of the
1582    * params is null then an empty list will be returned.
1583    * @param aDumpAnnotSet is a set containing all annotations that will be
1584    * dumped.
1585    * @param offset represent the offset at witch the annotation must start
1586    * AND/OR end.
1587    * @return a list with those annotations that need to be serialized.
1588    */
1589  private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){
1590    List annotationList = new LinkedList();
1591    if (aDumpAnnotSet == null || offset == null) return annotationList;
1592    Set annotThatStartAtOffset = new TreeSet(
1593                          new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
1594    Set annotThatEndAtOffset = new TreeSet(
1595                          new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
1596    Set annotThatStartAndEndAtOffset = new TreeSet(
1597                          new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
1598
1599    // Fill these tree lists with annotation tat start, end or start and
1600    // end at the offset.
1601    Iterator iter = aDumpAnnotSet.iterator();
1602    while(iter.hasNext()){
1603      Annotation ann = (Annotation) iter.next();
1604      if (offset.equals(ann.getStartNode().getOffset())){
1605        if (offset.equals(ann.getEndNode().getOffset()))
1606          annotThatStartAndEndAtOffset.add(ann);
1607        else
1608          annotThatStartAtOffset.add(ann);
1609      }else{
1610        if (offset.equals(ann.getEndNode().getOffset()))
1611          annotThatEndAtOffset.add(ann);
1612      }// End if
1613    }// End while
1614    annotationList.addAll(annotThatEndAtOffset);
1615    annotThatEndAtOffset = null;
1616    annotationList.addAll(annotThatStartAtOffset);
1617    annotThatStartAtOffset = null;
1618    iter = annotThatStartAndEndAtOffset.iterator();
1619    while(iter.hasNext()){
1620      Annotation ann = (Annotation) iter.next();
1621      Iterator it = annotationList.iterator();
1622      boolean breaked = false;
1623      while (it.hasNext()){
1624        Annotation annFromList = (Annotation) it.next();
1625        if (annFromList.getId().intValue() > ann.getId().intValue()){
1626          annotationList.add(annotationList.indexOf(annFromList),ann);
1627          breaked = true;
1628          break;
1629        }// End if
1630      }// End while
1631      if (!breaked)
1632        annotationList.add(ann);
1633      iter.remove();
1634    }// End while
1635    return annotationList;
1636  }// getAnnotationsForOffset()
1637
1638  private List getAnnotationsForOffset(List aDumpAnnotList, Long offset){
1639    List annotationList = new ArrayList();
1640    if (aDumpAnnotList == null || offset == null) return annotationList;
1641    Set annotThatStartAtOffset;
1642    Set annotThatEndAtOffset;
1643    Set annotThatStartAndEndAtOffset;
1644    annotThatStartAtOffset = new TreeSet(
1645        new AnnotationComparator(ORDER_ON_END_OFFSET, DESC));
1646    annotThatEndAtOffset = new TreeSet(
1647        new AnnotationComparator(ORDER_ON_START_OFFSET, DESC));
1648    annotThatStartAndEndAtOffset = new TreeSet(
1649        new AnnotationComparator(ORDER_ON_ANNOT_ID, ASC));
1650
1651    // Fill these tree lists with annotation tat start, end or start and
1652    // end at the offset.
1653    Iterator iter = aDumpAnnotList.iterator();
1654    while(iter.hasNext()){
1655      Annotation ann = (Annotation) iter.next();
1656      if (offset.equals(ann.getStartNode().getOffset())){
1657        if (offset.equals(ann.getEndNode().getOffset()))
1658          annotThatStartAndEndAtOffset.add(ann);
1659        else
1660          annotThatStartAtOffset.add(ann);
1661      }else{
1662        if (offset.equals(ann.getEndNode().getOffset()))
1663          annotThatEndAtOffset.add(ann);
1664      }// End if
1665    }// End while
1666
1667    annotationList.addAll(annotThatEndAtOffset);
1668    annotationList.addAll(annotThatStartAtOffset);
1669    annotThatEndAtOffset = null;
1670    annotThatStartAtOffset = null;
1671
1672    iter = annotThatStartAndEndAtOffset.iterator();
1673    while(iter.hasNext()){
1674      Annotation ann = (Annotation) iter.next();
1675      Iterator it = annotationList.iterator();
1676      boolean breaked = false;
1677      while (it.hasNext()){
1678        Annotation annFromList = (Annotation) it.next();
1679        if (annFromList.getId().intValue() > ann.getId().intValue()){
1680          annotationList.add(annotationList.indexOf(annFromList),ann);
1681          breaked = true;
1682          break;
1683        }// End if
1684      }// End while
1685      if (!breaked)
1686        annotationList.add(ann);
1687      iter.remove();
1688    }// End while
1689    return annotationList;
1690  }// getAnnotationsForOffset()
1691
1692  private String writeStartTag(Annotation annot, boolean includeFeatures){
1693    return writeStartTag(annot, includeFeatures, true);
1694  } // writeStartTag
1695
1696  /** Returns a string representing a start tag based on the input annot*/
1697  private String writeStartTag(Annotation annot, boolean includeFeatures,
1698                                boolean includeNamespace){
1699    AnnotationSet originalMarkupsAnnotSet =
1700            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1701
1702    StringBuffer strBuff = new StringBuffer("");
1703    if (annot == null) return strBuff.toString();
1704//    if (!addGatePreserveFormatTag && isRootTag){
1705      if (theRootAnnotation != null && annot.getId().equals(theRootAnnotation.getId())){
1706      //the features are included either if desired or if that's an annotation
1707      //from the original markup of the document. We don't want for example to
1708      //spoil all links in an HTML file!
1709      if (includeFeatures) {
1710        strBuff.append("<");
1711        strBuff.append(annot.getType());
1712        strBuff.append(" ");
1713        if(includeNamespace) {
1714          strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\"");
1715          strBuff.append(" gate:");
1716        }
1717        strBuff.append("gateId=\"");
1718        strBuff.append(annot.getId());
1719        strBuff.append("\"");
1720        strBuff.append(" ");
1721        if(includeNamespace) {
1722          strBuff.append("gate:");
1723        }
1724        strBuff.append("annotMaxId=\"");
1725        strBuff.append(nextAnnotationId);
1726        strBuff.append("\"");
1727        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1728        strBuff.append(">");
1729      }
1730      else if (originalMarkupsAnnotSet.contains(annot)) {
1731          strBuff.append("<");
1732          strBuff.append(annot.getType());
1733          strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1734          strBuff.append(">");
1735        }
1736      else {
1737        strBuff.append("<");
1738        strBuff.append(annot.getType());
1739        strBuff.append(">");
1740      }
1741
1742    }else{
1743      //the features are included either if desired or if that's an annotation
1744      //from the original markup of the document. We don't want for example to
1745      //spoil all links in an HTML file!
1746      if (includeFeatures) {
1747        strBuff.append("<");
1748        strBuff.append(annot.getType());
1749        strBuff.append(" ");
1750        if(includeNamespace) {
1751          strBuff.append("gate:");
1752        } // if includeNamespaces
1753        strBuff.append("gateId=\"");
1754        strBuff.append(annot.getId());
1755        strBuff.append("\"");
1756        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1757        strBuff.append(">");
1758      }
1759      else if (originalMarkupsAnnotSet.contains(annot)) {
1760        strBuff.append("<");
1761        strBuff.append(annot.getType());
1762        strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1763        strBuff.append(">");
1764      }
1765      else {
1766        strBuff.append("<");
1767        strBuff.append(annot.getType());
1768        strBuff.append(">");
1769      }
1770    }// End if
1771    return strBuff.toString();
1772  }// writeStartTag()
1773
1774  /**
1775   * Identifies the root annotations inside an annotation set.
1776   * The root annotation is the one that starts at offset 0, and has the
1777   * greatest span. If there are more than one with this function, then the
1778   * annotation with the smalled ID wil be selected as root.
1779   * If none is identified it will return null.
1780   * @param anAnnotationSet The annotation set possibly containing
1781   *  the root annotation.
1782   * @return The root annotation or null is it fails
1783   */
1784  private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet){
1785    if (anAnnotationSet == null) return null;
1786    // If the starting node of this annotation is not null, then the annotation
1787    // set will not have a root annotation.
1788    Node startNode = anAnnotationSet.firstNode();
1789    Node endNode = anAnnotationSet.lastNode();
1790    // This is placed here just to speed things up. The alghorithm bellow can
1791    // can identity the annotation that span over the entire set and with the
1792    // smallest ID. However the root annotation will have to have the start
1793    // offset equal to 0.
1794    if (startNode.getOffset().longValue() != 0) return null;
1795    // Go anf find the annotation.
1796    Annotation theRootAnnotation = null;
1797    // Check if there are annotations starting at offset 0. If there are, then
1798    // check all of them to see which one has the greatest span. Basically its
1799    // END offset should be the bigest offset from the input annotation set.
1800    long start = startNode.getOffset().longValue();
1801    long end = endNode.getOffset().longValue();
1802    for(Iterator it = anAnnotationSet.iterator(); it.hasNext();){
1803      Annotation currentAnnot = (Annotation) it.next();
1804      // If the currentAnnot has both its Start and End equals to the Start and
1805      // end of the AnnotationSet then check to see if its ID is the smallest.
1806      if (
1807          (start == currentAnnot.getStartNode().getOffset().longValue()) &&
1808          (end   == currentAnnot.getEndNode().getOffset().longValue())
1809         ){
1810          // The currentAnnotation has is a potencial root one.
1811          if (theRootAnnotation == null)
1812            theRootAnnotation = currentAnnot;
1813          else{
1814            // If its ID is greater that the currentAnnot then update the root
1815            if ( theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1816              theRootAnnotation = currentAnnot;
1817          }// End if
1818      }// End if
1819    }// End for
1820    return theRootAnnotation;
1821  }// End identifyTheRootAnnotation()
1822
1823  private Annotation identifyTheRootAnnotation(List anAnnotationList){
1824    if (anAnnotationList == null || anAnnotationList.isEmpty()) return null;
1825    // If the first annotation in the list (which is sorted by start offset)
1826    //does not have an offset = 0, then there's no root tag.
1827    if(((Annotation)anAnnotationList.get(0)).
1828       getStartNode().getOffset().longValue() > 0) return null;
1829
1830    // If there's a single annotation and it starts at the start (which we
1831  // already know it does), make sure it ends at the end.
1832  if (anAnnotationList.size() == 1){
1833    Annotation onlyAnn = (Annotation) anAnnotationList.get(0);
1834    if ( onlyAnn.getEndNode().getOffset().equals( content.size() ) ) return onlyAnn;
1835      return null;
1836  }
1837    
1838    //find the limits
1839    long start = 0; //we know this already
1840    long end = 0; //end = 0  will be improved by the next loop
1841    for(int i = 0; i < anAnnotationList.size(); i++){
1842      Annotation anAnnotation = (Annotation)anAnnotationList.get(i);
1843      long localEnd = anAnnotation.getEndNode().getOffset().longValue();
1844      if(localEnd > end) end = localEnd;
1845    }
1846
1847    // Go and find the annotation.
1848    //look at all annotations that start at 0 and end at end
1849    //if there are several, choose the one with the smallest ID
1850    Annotation theRootAnnotation = null;
1851    for(int i = 0; i < anAnnotationList.size(); i++){
1852      Annotation currentAnnot = (Annotation) anAnnotationList.get(i);
1853      long localStart = currentAnnot.getStartNode().getOffset().longValue();
1854      long localEnd = currentAnnot.getEndNode().getOffset().longValue();
1855      // If the currentAnnot has both its Start and End equals to the Start and
1856      // end of the AnnotationSet then check to see if its ID is the smallest.
1857      if (
1858          (start == localStart) && (end == localEnd)){
1859          // The currentAnnotation has is a potential root one.
1860          if (theRootAnnotation == null) theRootAnnotation = currentAnnot;
1861          else{
1862            // If root's ID is greater that the currentAnnot then update the root
1863            if (theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1864              theRootAnnotation = currentAnnot;
1865          }// End if
1866      }// End if
1867    }// End for
1868    return theRootAnnotation;
1869  }// End identifyTheRootAnnotation()
1870
1871
1872  /** This method takes aScanString and searches for those chars from
1873    * entitiesMap that appear in the string. A tree map(offset2Char) is filled
1874    * using as key the offsets where those Chars appear and the Char.
1875    * If one of the params is null the method simply returns.
1876    */
1877  private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
1878    if (aScanString == null || aMapToFill == null) return;
1879    if (entitiesMap == null || entitiesMap.isEmpty()){
1880      Err.prln("WARNING: Entities map was not initialised !");
1881      return;
1882    }// End if
1883    // Fill the Map with the offsets of the special chars
1884    Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
1885    Character c;
1886    int fromIndex;
1887    while(entitiesMapIterator.hasNext()){
1888      c = (Character) entitiesMapIterator.next();
1889      fromIndex = 0;
1890      while (-1 != fromIndex){
1891        fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
1892        if (-1 != fromIndex){
1893          aMapToFill.put(new Long(fromIndex),c);
1894          fromIndex ++;
1895        }// End if
1896      }// End while
1897    }// End while
1898  }//buildEntityMapFromString();
1899
1900  private String writeEmptyTag(Annotation annot){
1901    return writeEmptyTag(annot, true);
1902  } // writeEmptyTag
1903
1904  /** Returns a string representing an empty tag based on the input annot*/
1905  private String writeEmptyTag(Annotation annot, boolean includeNamespace){
1906    StringBuffer strBuff = new StringBuffer("");
1907    if (annot == null) return strBuff.toString();
1908
1909    strBuff.append("<");
1910    strBuff.append(annot.getType());
1911
1912    AnnotationSet originalMarkupsAnnotSet =
1913            this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1914    if (! originalMarkupsAnnotSet.contains(annot)) {
1915      strBuff.append(" gateId=\"");
1916      strBuff.append(annot.getId());
1917      strBuff.append("\"");
1918    }
1919    strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace));
1920    strBuff.append("/>");
1921
1922    return strBuff.toString();
1923  }// writeEmptyTag()
1924
1925  /** Returns a string representing an end tag based on the input annot*/
1926  private String writeEndTag(Annotation annot){
1927    StringBuffer strBuff = new StringBuffer("");
1928    if (annot == null) return strBuff.toString();
1929/*
1930    if (annot.getType().indexOf(" ") != -1)
1931      Out.prln("Warning: Truncating end tag to first word for annot type \""
1932      +annot.getType()+ "\". ");
1933*/
1934    strBuff.append("</"+annot.getType()+">");
1935
1936    return strBuff.toString();
1937  }// writeEndTag()
1938
1939  /** Returns a string representing a FeatureMap serialized as XML attributes*/
1940  private String writeFeatures(FeatureMap feat, boolean includeNamespace){
1941    StringBuffer strBuff = new StringBuffer("");
1942    if (feat == null) return strBuff.toString();
1943    Iterator it = feat.keySet().iterator();
1944    while (it.hasNext()){
1945      Object key = it.next();
1946      Object value = feat.get(key);
1947      if ( (key != null) && (value != null) ){
1948        // Eliminate a feature inserted at reading time and which help to
1949        // take some decissions at saving time
1950        if ("isEmptyAndSpan".equals(key.toString()))
1951          continue;
1952        if( !(String.class.isAssignableFrom(key.getClass()) ||
1953              Number.class.isAssignableFrom(key.getClass()))){
1954
1955            Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
1956                             " from String or Number.(feature discarded)");
1957            continue;
1958        }// End if
1959        if ( !(String.class.isAssignableFrom(value.getClass()) ||
1960               Number.class.isAssignableFrom(value.getClass()) ||
1961               java.util.Collection.class.isAssignableFrom(value.getClass()))){
1962
1963            Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
1964                       " from String, Number or Collection.(feature discarded)");
1965            continue;
1966        }// End if
1967        if ("matches".equals(key)) {
1968          strBuff.append(" ");
1969          if(includeNamespace) {
1970            strBuff.append("gate:");
1971          }
1972//          strBuff.append(key);
1973          // replace non XML chars in attribute name
1974          strBuff.append(
1975            filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1976          strBuff.append("=\"");
1977        }
1978        else {
1979          strBuff.append(" ");
1980//          strBuff.append(key);
1981          // replace non XML chars in attribute name
1982          strBuff.append(
1983            filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1984          strBuff.append("=\"");
1985        }
1986        if (java.util.Collection.class.isAssignableFrom(value.getClass())){
1987          Iterator valueIter = ((Collection)value).iterator();
1988          while(valueIter.hasNext()){
1989            Object item = valueIter.next();
1990            if (!(String.class.isAssignableFrom(item.getClass()) ||
1991                  Number.class.isAssignableFrom(item.getClass())))
1992                  continue;
1993//            strBuff.append(item);
1994            // replace non XML chars in collection item
1995            strBuff.append(
1996              filterNonXmlChars(replaceCharsWithEntities(item.toString())));
1997            strBuff.append(";");
1998          }// End while
1999          if (strBuff.charAt(strBuff.length()-1) == ';')
2000            strBuff.deleteCharAt(strBuff.length()-1);
2001        }else{
2002//          strBuff.append(value);
2003          // replace non XML chars in attribute value
2004          strBuff.append(
2005            filterNonXmlChars(replaceCharsWithEntities(value.toString())));
2006        }// End if
2007        strBuff.append("\"");
2008      }// End if
2009    }// End while
2010    return strBuff.toString();
2011  }// writeFeatures()
2012
2013  /** Returns a GateXml document that is a custom XML format for wich there is
2014    * a reader inside GATE called gate.xml.GateFormatXmlHandler.
2015    * What it does is to serialize a GATE document in an XML format.
2016    * @return a string representing a Gate Xml document.
2017    */
2018  public String toXml(){
2019    // Initialize the xmlContent with 3 time the size of the current document.
2020    // This is because of the tags size. This measure is made to increase the
2021    // performance of StringBuffer.
2022    StringBuffer xmlContent = new StringBuffer(
2023         DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
2024    // Add xml header
2025    xmlContent.append("<?xml version=\"1.0\" encoding=\"");
2026    xmlContent.append(getEncoding());
2027    xmlContent.append("\" ?>");
2028    xmlContent.append(Strings.getNl());
2029
2030    // Add the root element
2031    xmlContent.append("<GateDocument>\n");
2032    xmlContent.append("<!-- The document's features-->\n\n");
2033    xmlContent.append("<GateDocumentFeatures>\n");
2034
2035    xmlContent.append(featuresToXml(this.getFeatures()));
2036    xmlContent.append("</GateDocumentFeatures>\n");
2037    xmlContent.append("<!-- The document content area with serialized"+
2038                      " nodes -->\n\n");
2039    // Add plain text element
2040    xmlContent.append("<TextWithNodes>");
2041    xmlContent.append(textWithNodes(this.getContent().toString()));
2042    xmlContent.append("</TextWithNodes>\n");
2043    // Serialize as XML all document's annotation sets
2044    // Serialize the default AnnotationSet
2045    StatusListener sListener = (StatusListener)
2046                               gate.gui.MainFrame.getListeners().
2047                               get("gate.event.StatusListener");
2048    if(sListener != null)
2049      sListener.statusChanged("Saving the default annotation set ");
2050    xmlContent.append("<!-- The default annotation set -->\n\n");
2051    xmlContent.append(annotationSetToXml(this.getAnnotations()));
2052    // Serialize all others AnnotationSets
2053    // namedAnnotSets is a Map containing all other named Annotation Sets.
2054    if (namedAnnotSets != null){
2055      Iterator iter = namedAnnotSets.values().iterator();
2056      while(iter.hasNext()){
2057        AnnotationSet annotSet = (AnnotationSet) iter.next();
2058        xmlContent.append("<!-- Named annotation set -->\n\n");
2059        // Serialize it as XML
2060        if(sListener != null) sListener.statusChanged("Saving " +
2061                                                      annotSet.getName()+
2062                                                      " annotation set ");
2063        xmlContent.append(annotationSetToXml(annotSet));
2064      }// End while
2065    }// End if
2066    // Add the end of GateDocument
2067    xmlContent.append("</GateDocument>");
2068    if(sListener != null) sListener.statusChanged("Done !");
2069    // return the XmlGateDocument
2070    return xmlContent.toString();
2071  }// toXml
2072
2073  /** This method filters any non XML char
2074    * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets
2075    * All non XML chars will be replaced with 0x20 (space char) This assures
2076    * that the next time the document is loaded there won't be any problems.
2077    * @param aStrBuffer represents the input String that is filtred. If the
2078    * aStrBuffer is null then an empty string will be returend
2079    * @return the "purified" StringBuffer version of the aStrBuffer
2080    */
2081  private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
2082    if (aStrBuffer == null) return new StringBuffer("");
2083//    String space = new String(" ");
2084    char space = ' ';
2085    for (int i=aStrBuffer.length()-1;i>=0; i--){
2086      if (!isXmlChar(aStrBuffer.charAt(i)))
2087        aStrBuffer.setCharAt(i, space);
2088    }// End for
2089    return aStrBuffer;
2090  }// filterNonXmlChars()
2091
2092  /** This method decide if a char is a valid XML one or not
2093    * @param ch the char to be tested
2094    * @return true if is a valid XML char and fals if is not.
2095    */
2096  public static boolean isXmlChar(char ch){
2097    if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
2098    if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
2099    if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
2100    if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
2101    return false;
2102  }// End isXmlChar()
2103
2104  /** This method saves a FeatureMap as XML elements.
2105    * @param aFeatureMap the feature map that has to be saved as XML.
2106    * @return a String like this: <Feature><Name>...</Name>
2107    * <Value>...</Value></Feature><Feature>...</Feature>
2108    */
2109  private String featuresToXml(FeatureMap aFeatureMap){
2110    StringBuffer str = new StringBuffer("");
2111
2112    if (aFeatureMap == null) return str.toString();
2113
2114    Set keySet = aFeatureMap.keySet();
2115    Iterator keyIterator = keySet.iterator();
2116    while(keyIterator.hasNext()){
2117      Object key = keyIterator.next();
2118      Object value = aFeatureMap.get(key);
2119      if ((key != null) && (value != null)){
2120        String keyClassName = null;
2121        String keyItemClassName = null;
2122        String valueClassName = null;
2123        String valueItemClassName = null;
2124        String key2String = key.toString();
2125        String value2String = value.toString();
2126
2127        Object item = null;
2128        // Test key if it is String, Number or Collection
2129        if (key instanceof java.lang.String ||
2130            key instanceof java.lang.Number ||
2131            key instanceof java.util.Collection)
2132          keyClassName = key.getClass().getName();
2133
2134        // Test value if it is String, Number or Collection
2135        if (value instanceof java.lang.String ||
2136            value instanceof java.lang.Number ||
2137            value instanceof java.util.Collection)
2138          valueClassName = value.getClass().getName();
2139
2140        // Features and values that are not Strings, Numbers or collections
2141        // will be discarded.
2142        if (keyClassName == null || valueClassName == null) continue;
2143
2144        // If key is collection serialize the colection in a specific format
2145        if (key instanceof java.util.Collection){
2146          StringBuffer keyStrBuff = new StringBuffer("");
2147          Iterator iter = ((Collection) key).iterator();
2148          if (iter.hasNext()){
2149            item = iter.next();
2150            if (item instanceof java.lang.Number)
2151              keyItemClassName = item.getClass().getName();
2152            else
2153              keyItemClassName = String.class.getName();
2154            keyStrBuff.append(item.toString());
2155          }// End if
2156          while (iter.hasNext()){
2157            item = iter.next();
2158            keyStrBuff.append(";" + item.toString());
2159          }// End while
2160          key2String = keyStrBuff.toString();
2161        }// End if
2162        // If key is collection serialize the colection in a specific format
2163        if (value instanceof java.util.Collection){
2164          StringBuffer valueStrBuff = new StringBuffer("");
2165          Iterator iter = ((Collection) value).iterator();
2166          if (iter.hasNext()){
2167            item = iter.next();
2168            if (item instanceof java.lang.Number)
2169              valueItemClassName = item.getClass().getName();
2170            else
2171              valueItemClassName = String.class.getName();
2172            valueStrBuff.append(item.toString());
2173          }// End if
2174          while (iter.hasNext()){
2175            item = iter.next();
2176            valueStrBuff.append(";" + item.toString());
2177          }// End while
2178          value2String = valueStrBuff.toString();
2179        }// End if
2180        str.append("<Feature>\n  <Name");
2181        if (keyClassName != null)
2182          str.append(" className=\""+keyClassName+"\"");
2183        if (keyItemClassName != null)
2184          str.append(" itemClassName=\""+keyItemClassName+"\"");
2185        str.append(">");
2186        str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
2187        str.append("</Name>\n  <Value");
2188        if (valueClassName != null)
2189          str.append(" className=\"" + valueClassName + "\"");
2190        if (valueItemClassName != null)
2191          str.append(" itemClassName=\"" + valueItemClassName + "\"");
2192        str.append(">");
2193        str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
2194        str.append("</Value>\n</Feature>\n");
2195      }// End if
2196    }// end While
2197    return str.toString();
2198  }//featuresToXml
2199
2200  /** This method replace all chars that appears in the anInputString and also
2201    * that are in the entitiesMap with their corresponding entity
2202    * @param anInputString the string analyzed. If it is null then returns the
2203    *  empty string
2204    * @return a string representing the input string with chars replaced with
2205    *  entities
2206    */
2207  private StringBuffer replaceCharsWithEntities(String anInputString){
2208    if (anInputString == null) return new StringBuffer("");
2209    StringBuffer strBuff = new StringBuffer(anInputString);
2210    for (int i=strBuff.length()-1; i>=0; i--){
2211      Character ch = new Character(strBuff.charAt(i));
2212      if (entitiesMap.keySet().contains(ch)){
2213        strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
2214      }// End if
2215    }// End for
2216    return strBuff;
2217  }//replaceCharsWithEntities()
2218
2219  /** This method creates Node XML elements and inserts them at the
2220    * corresponding offset inside the text. Nodes are created from the default
2221    * annotation set, as well as from all existing named annotation sets.
2222    * @param aText The text representing the document's plain text.
2223    * @return The text with empty <Node id="NodeId"/> elements.
2224    */
2225  private String textWithNodes(String aText){
2226    if (aText == null) return new String("");
2227    StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
2228
2229    // Construct a map from offsets to Chars
2230    TreeMap offsets2CharsMap = new TreeMap();
2231    if (aText.length()!= 0){
2232      // Fill the offsets2CharsMap with all the indices where special chars appear
2233      buildEntityMapFromString(aText,offsets2CharsMap);
2234    }//End if
2235    // Construct the offsetsSet for all nodes belonging to this document
2236    TreeSet offsetsSet = new TreeSet();
2237    Iterator annotSetIter = this.getAnnotations().iterator();
2238    while (annotSetIter.hasNext()){
2239      Annotation annot = (Annotation) annotSetIter.next();
2240      offsetsSet.add(annot.getStartNode().getOffset());
2241      offsetsSet.add(annot.getEndNode().getOffset());
2242    }// end While
2243    // Get the nodes from all other named annotation sets.
2244    if (namedAnnotSets != null){
2245      Iterator iter = namedAnnotSets.values().iterator();
2246      while(iter.hasNext()){
2247        AnnotationSet annotSet = (AnnotationSet) iter.next();
2248        Iterator iter2 = annotSet.iterator();
2249        while(iter2.hasNext()){
2250          Annotation annotTmp = (Annotation) iter2.next();
2251          offsetsSet.add(annotTmp.getStartNode().getOffset());
2252          offsetsSet.add(annotTmp.getEndNode().getOffset());
2253        }// End while
2254      }// End while
2255    }// End if
2256    // offsetsSet is ordered in ascending order because the structure
2257    // is a TreeSet
2258
2259    if (offsetsSet.isEmpty()){
2260      return replaceCharsWithEntities(aText).toString();
2261    }// End if
2262    // Iterate through all nodes from anAnnotSet and transform them to
2263    // XML elements. Then insert those elements at the node's offset into the
2264    // textWithNodes .
2265    while (!offsetsSet.isEmpty()){
2266      Long offset = (Long) offsetsSet.last();
2267      // Eliminate the offset from the list in order to create more memory space
2268      offsetsSet.remove(offset);
2269      // Use offset
2270      int offsetValue = offset.intValue();
2271      String strNode = "<Node id=\"" + offsetValue + "\"/>";
2272      // Before inserting this string into the textWithNodes, check to see if
2273      // there are any chars to be replaced with their corresponding entities
2274      if (!offsets2CharsMap.isEmpty()){
2275        Long offsChar = (Long) offsets2CharsMap.lastKey();
2276        while( !offsets2CharsMap.isEmpty() &&
2277                       offsChar.intValue() >= offset.intValue()){
2278          // Replace the char at offsChar with its corresponding entity form
2279          // the entitiesMap.
2280          textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2281          (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2282          // Discard the offsChar after it was used because this offset will
2283          // never appear again
2284          offsets2CharsMap.remove(offsChar);
2285          // Investigate next offsChar
2286          if (!offsets2CharsMap.isEmpty())
2287            offsChar = (Long) offsets2CharsMap.lastKey();
2288        }// End while
2289      }// End if
2290      // Now it is safe to insert the node
2291      textWithNodes.insert(offsetValue,strNode);
2292    }// end while
2293    // Need to replace the entities in the remaining text, if there is any text
2294    // So, if there are any more items in offsets2CharsMap they need to be
2295    // replaced
2296    while (!offsets2CharsMap.isEmpty()){
2297      Long offsChar = (Long) offsets2CharsMap.lastKey();
2298      // Replace the char with its entity
2299      textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2300      (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2301      // remove the offset from the map
2302      offsets2CharsMap.remove(offsChar);
2303    }// End while
2304    return textWithNodes.toString();
2305  }//textWithNodes()
2306
2307  /** This method saves an AnnotationSet as XML.
2308    * @param anAnnotationSet The annotation set that has to be saved as XML.
2309    * @return a String like this: <AnnotationSet> <Annotation>....
2310    * </AnnotationSet>
2311    */
2312  private String annotationSetToXml(AnnotationSet anAnnotationSet){
2313    StringBuffer str = new StringBuffer("");
2314
2315    if (anAnnotationSet == null){
2316      str.append("<AnnotationSet>\n");
2317      str.append("</AnnotationSet>\n");
2318      return str.toString();
2319    }// End if
2320    if (anAnnotationSet.getName() == null)
2321      str.append("<AnnotationSet>\n");
2322    else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
2323                                                                    "\" >\n");
2324    // Iterate through AnnotationSet and save each Annotation as XML
2325    Iterator iterator = anAnnotationSet.iterator();
2326    while (iterator.hasNext()){
2327      Annotation annot = (Annotation) iterator.next();
2328      str.append("<Annotation " + "Id=\"" + annot.getId() +
2329                  "\" Type=\"" + annot.getType() +
2330                  "\" StartNode=\"" + annot.getStartNode().getOffset() +
2331                   "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
2332      str.append(featuresToXml(annot.getFeatures()));
2333      str.append("</Annotation>\n");
2334    }// End while
2335
2336    str.append("</AnnotationSet>\n");
2337    return str.toString();
2338  }// annotationSetToXml
2339
2340  /** Returns a map with the named annotation sets. It returns <code>null</code>
2341   *  if no named annotaton set exists. */
2342  public Map getNamedAnnotationSets() {
2343    return namedAnnotSets;
2344  } // getNamedAnnotationSets
2345
2346  /** Returns a set of all named annotation sets in existence
2347  */
2348  public Set getAnnotationSetNames(){
2349    return namedAnnotSets.keySet();
2350  }
2351
2352
2353  /**
2354   * Removes one of the named annotation sets.
2355   * Note that the default annotation set cannot be removed.
2356   * @param name the name of the annotation set to be removed
2357   */
2358  public void removeAnnotationSet(String name){
2359    Object removed = namedAnnotSets.remove(name);
2360    if(removed != null){
2361      fireAnnotationSetRemoved(
2362        new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
2363    }
2364  }
2365
2366  /** Propagate edit changes to the document content and annotations. */
2367  public void edit(Long start, Long end, DocumentContent replacement)
2368    throws InvalidOffsetException
2369  {
2370    if(! isValidOffsetRange(start, end))
2371      throw new InvalidOffsetException();
2372
2373    if(content != null)
2374      ((DocumentContentImpl) content).edit(start, end, replacement);
2375
2376    if(defaultAnnots != null)
2377      ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
2378
2379    if(namedAnnotSets != null) {
2380      Iterator iter = namedAnnotSets.values().iterator();
2381      while(iter.hasNext())
2382        ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
2383    }
2384    //let the listeners know
2385    fireContentEdited(new DocumentEvent(this, DocumentEvent.CONTENT_EDITED,
2386            start, end));
2387  } // edit(start,end,replacement)
2388
2389  /** Check that an offset is valid, i.e. it is non-null, greater than
2390    * or equal to 0 and less than the size of the document content.
2391    */
2392  public boolean isValidOffset(Long offset) {
2393    if(offset == null)
2394      return false;
2395
2396    long o = offset.longValue();
2397    if(o > getContent().size().longValue() || o < 0)
2398      return false;
2399
2400    return true;
2401  } // isValidOffset
2402
2403  /** Check that both start and end are valid offsets and that
2404    * they constitute a valid offset range, i.e. start is greater
2405    * than or equal to long.
2406    */
2407  public boolean isValidOffsetRange(Long start, Long end) {
2408    return
2409      isValidOffset(start) && isValidOffset(end) &&
2410      start.longValue() <= end.longValue();
2411  } // isValidOffsetRange(start,end)
2412
2413  /** Sets the nextAnnotationId */
2414  public void setNextAnnotationId(int aNextAnnotationId){
2415    nextAnnotationId = aNextAnnotationId;
2416  }// setNextAnnotationId();
2417
2418  /** Generate and return the next annotation ID */
2419  public Integer getNextAnnotationId() {
2420    return new Integer(nextAnnotationId++);
2421  } // getNextAnnotationId
2422
2423  /** Generate and return the next node ID */
2424  public Integer getNextNodeId() { return new Integer(nextNodeId++); }
2425
2426  /** Ordering based on URL.toString() and the URL offsets (if any) */
2427  public int compareTo(Object o) throws ClassCastException {
2428    DocumentImpl other = (DocumentImpl) o;
2429    return getOrderingString().compareTo(other.getOrderingString());
2430  } // compareTo
2431
2432  /** Utility method to produce a string for comparison in ordering.
2433    * String is based on the source URL and offsets.
2434    */
2435  protected String getOrderingString() {
2436    if(sourceUrl == null) return toString();
2437
2438    StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
2439    if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
2440      orderingString.append(sourceUrlStartOffset.toString());
2441      orderingString.append(sourceUrlEndOffset.toString());
2442    }
2443
2444    return orderingString.toString();
2445  } // getOrderingString()
2446
  /** The id of the next new annotation; returned and then incremented by
    * getNextAnnotationId(), overwritten by setNextAnnotationId(int).
    */
  protected int nextAnnotationId = 0;

  /** The id of the next new node; returned and then incremented by
    * getNextNodeId().
    */
  protected int nextNodeId = 0;
  /** The source URL; getOrderingString() falls back to toString() when it
    * is null.
    */
  protected URL sourceUrl;

  // NOTE(review): a Javadoc comment reading "The document's URL name." stood
  // here without any declaration attached to it.

  /** The content of the document; edit() leaves it alone when it is null */
  protected DocumentContent content;

  /** The encoding of the source of the document content */
  protected String encoding = null;
2462
  // Data needed in toXml(AnnotationSet) methods
2464
2465  /** This field indicates whether or not to add the tag
2466    * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
2467    * have this tag added
2468    */
2469//  private boolean addGatePreserveFormatTag = false;
2470
  /**
   * Used by the XML dump preserving format method
   */
  private Annotation theRootAnnotation = null;

  /** This field is used when creating StringBuffers for toXml() methods.
    * The size of the StringBuffer will be docContent.size() multiplied by this
    * value. It is aimed to improve the performance of StringBuffer
    */
  private final int DOC_SIZE_MULTIPLICATION_FACTOR = 2;

  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their start offset. Annotations with equal start
    * offsets are ordered on their ID, in the same direction.
    */
  private final int ORDER_ON_START_OFFSET = 0;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their end offset. Annotations with equal end offsets
    * are ordered on their ID, in the opposite direction.
    */
  private final int ORDER_ON_END_OFFSET = 1;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations on their ID
    */
  private final int ORDER_ON_ANNOT_ID = 2;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations ascending
    */
  private final int ASC = 3;
  /** Constant used in the inner class AnnotationComparator to order
    * annotations descending. Note that AnnotationComparator treats every
    * order type other than ASC as descending.
    */
  private final int DESC = -3;
2502
2503  /** A map initialized in init() containing entities that needs to be
2504    * replaced in strings
2505    */
2506  private static Map entitiesMap = null;
2507  // Initialize the entities map use when saving as xml
2508  static{
2509    entitiesMap = new HashMap();
2510    entitiesMap.put(new Character('<'),"&lt;");
2511    entitiesMap.put(new Character('>'),"&gt;");
2512    entitiesMap.put(new Character('&'),"&amp;");
2513    entitiesMap.put(new Character('\''),"&apos;");
2514    entitiesMap.put(new Character('"'),"&quot;");
2515    entitiesMap.put(new Character((char)160),"&#160;");
2516    entitiesMap.put(new Character((char)169),"&#169;");
2517  }//static
2518
2519  /** The range that the content comes from at the source URL
2520    * (or null if none).
2521    */
2522  //protected Long[] sourceUrlOffsets;
2523
  /** The start of the range that the content comes from at the source URL
    * (or null if none). getOrderingString() uses it only when the end
    * offset is set as well.
    */
  protected Long sourceUrlStartOffset;

  /** The end of the range that the content comes from at the source URL
    * (or null if none). getOrderingString() uses it only when the start
    * offset is set as well.
    */
  protected Long sourceUrlEndOffset;

  /** The default annotation set; edit() leaves it alone when it is null */
  protected AnnotationSet defaultAnnots;

  /** Named sets of annotations: the keys are the set names and the values
    * are the annotation sets. It is null if no named annotation set exists.
    */
  protected Map namedAnnotSets;
2539
  /**
   * A property of the document that will be set when the user
   * wants to create the document from a string, as opposed to from
   * a URL. It is read and written through getStringContent() and
   * setStringContent(String).
   */
  private String stringContent;
2546
2547  /**
2548   * The stringContent of a document is
2549   * a property of the document that will be set when the user
2550   * wants to create the document from a string, as opposed to from
2551   * a URL.
2552   * <B>Use the <TT>getContent</TT> method instead to get the actual document
2553   * content.</B>
2554   */
2555  public String getStringContent() { return stringContent; }
2556
  /**
   * Sets the stringContent property, i.e. the string to be used when the
   * user wants to create the document from a string, as opposed to from
   * a URL. Only the property is stored here.
   * <B>Use the <TT>setContent</TT> method instead to update the actual
   * document content.</B>
   * @param stringContent the new value of the stringContent property
   */
  public void setStringContent(String stringContent) {
    this.stringContent = stringContent;
  } // set StringContent
2568
  /** Is the document markup-aware? The initial value is false. */
  protected Boolean markupAware = new Boolean(false);
2571//  /** Hash code */
2572//  public int hashCode() {
2573//    int code = getContent().hashCode();
2574//    int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
2575//    code += memberCode;
2576//    memberCode = (encoding == null) ? 0 : encoding.hashCode();
2577//    code += memberCode;
2578//    memberCode = (features == null) ? 0 : features.hashCode();
2579//    code += memberCode;
2580//    code += (markupAware.booleanValue()) ? 0 : 1;
2581//    memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
2582//    code += memberCode;
2583//    code += nextAnnotationId;
2584//    code += nextNodeId;
2585//    memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
2586//    code += memberCode;
2587//    memberCode =
2588//      (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
2589//    code += memberCode;
2590//    memberCode =
2591//      (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
2592//    code += memberCode;
2593//    return code;
2594//  } // hashcode
2595
2596  /** String respresentation */
2597  public String toString() {
2598    String n = Strings.getNl();
2599    StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2600    s.append("  content:" + content + n);
2601    s.append("  defaultAnnots:" + defaultAnnots + n);
2602    s.append("  encoding:" + encoding + n);
2603    s.append("  features:" + features + n);
2604    s.append("  markupAware:" + markupAware + n);
2605    s.append("  namedAnnotSets:" + namedAnnotSets + n);
2606    s.append("  nextAnnotationId:" + nextAnnotationId + n);
2607    s.append("  nextNodeId:" + nextNodeId + n);
2608    s.append("  sourceUrl:" + sourceUrl + n);
2609    s.append("  sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2610    s.append("  sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2611    s.append(n);
2612
2613    return s.toString();
2614  } // toString
2615
  /** Freeze the serialization UID, by declaring it explicitly. */
  static final long serialVersionUID = -8456893608311510260L;
2618
2619  /** Inner class needed to compare annotations*/
2620  class AnnotationComparator implements java.util.Comparator {
2621    int orderOn = -1;
2622    int orderType = ASC;
2623    /** Constructs a comparator according to one of three sorter types:
2624      * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
2625      */
2626      public AnnotationComparator(int anOrderOn, int anOrderType){
2627        orderOn = anOrderOn;
2628        orderType = anOrderType;
2629      }// AnnotationComparator()
2630
2631      /**This method must be implemented according to Comparator interface */
2632      public int compare(Object o1, Object o2){
2633        Annotation a1 = (Annotation) o1;
2634        Annotation a2 = (Annotation) o2;
2635        // ORDER_ON_START_OFFSET ?
2636        if (orderOn == ORDER_ON_START_OFFSET){
2637          int result = a1.getStartNode().getOffset().compareTo(
2638                                                a2.getStartNode().getOffset());
2639          if (orderType == ASC){
2640            // ASC
2641            // If they are equal then their ID will decide.
2642            if (result == 0)
2643              return a1.getId().compareTo(a2.getId());
2644            return result;
2645          }else{
2646            // DESC
2647            if (result == 0)
2648              return - (a1.getId().compareTo(a2.getId()));
2649            return -result;
2650          }// End if (orderType == ASC)
2651        }// End if (orderOn == ORDER_ON_START_OFFSET)
2652
2653        // ORDER_ON_END_OFFSET ?
2654        if (orderOn == ORDER_ON_END_OFFSET){
2655          int result = a1.getEndNode().getOffset().compareTo(
2656                                                a2.getEndNode().getOffset());
2657          if (orderType == ASC){
2658            // ASC
2659            // If they are equal then their ID will decide.
2660            if (result == 0)
2661              return - (a1.getId().compareTo(a2.getId()));
2662            return result;
2663          }else{
2664            // DESC
2665            // If they are equal then their ID will decide.
2666            if (result == 0)
2667              return a1.getId().compareTo(a2.getId());
2668            return - result;
2669          }// End if (orderType == ASC)
2670        }// End if (orderOn == ORDER_ON_END_OFFSET)
2671
2672        // ORDER_ON_ANNOT_ID ?
2673        if (orderOn == ORDER_ON_ANNOT_ID){
2674          if (orderType == ASC)
2675            return a1.getId().compareTo(a2.getId());
2676          else
2677            return -(a1.getId().compareTo(a2.getId()));
2678        }// End if
2679        return 0;
2680      }//compare()
2681  } // End inner class AnnotationComparator
2682
2683
  /** The registered document listeners, or null if none has been added.
    * addDocumentListener and removeDocumentListener never change the vector
    * in place; they swap in a modified copy.
    */
  private transient Vector documentListeners;
  // NOTE(review): not used anywhere in the part of the file reviewed here
  private transient Vector gateListeners;
2686
2687  public synchronized void removeDocumentListener(DocumentListener l) {
2688    if (documentListeners != null && documentListeners.contains(l)) {
2689      Vector v = (Vector) documentListeners.clone();
2690      v.removeElement(l);
2691      documentListeners = v;
2692    }
2693  }
2694  public synchronized void addDocumentListener(DocumentListener l) {
2695    Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
2696    if (!v.contains(l)) {
2697      v.addElement(l);
2698      documentListeners = v;
2699    }
2700  }
2701
2702  protected void fireAnnotationSetAdded(DocumentEvent e) {
2703    if (documentListeners != null) {
2704      Vector listeners = documentListeners;
2705      int count = listeners.size();
2706      for (int i = 0; i < count; i++) {
2707        ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
2708      }
2709    }
2710  }
2711
2712  protected void fireAnnotationSetRemoved(DocumentEvent e) {
2713    if (documentListeners != null) {
2714      Vector listeners = documentListeners;
2715      int count = listeners.size();
2716      for (int i = 0; i < count; i++) {
2717        ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
2718      }
2719    }
2720  }
2721
2722  protected void fireContentEdited(DocumentEvent e) {
2723    if (documentListeners != null) {
2724      Vector listeners = documentListeners;
2725      int count = listeners.size();
2726      for (int i = 0; i < count; i++) {
2727        ((DocumentListener) listeners.elementAt(i)).contentEdited(e);
2728      }
2729    }
2730  }
2731
  /** Creole event callback; no action is taken. */
  public void resourceLoaded(CreoleEvent e) {
  }
  /** Creole event callback; no action is taken. */
  public void resourceUnloaded(CreoleEvent e) {
  }
  /** Creole event callback; no action is taken. */
  public void datastoreOpened(CreoleEvent e) {
  }
  /** Creole event callback; no action is taken. */
  public void datastoreCreated(CreoleEvent e) {
  }
  /** Callback for the renaming of a resource; no action is taken. */
  public void resourceRenamed(Resource resource, String oldName,
                              String newName){
  }
2743  public void datastoreClosed(CreoleEvent e) {
2744    if (! e.getDatastore().equals(this.getDataStore()))
2745      return;
2746    //close this lr, since it cannot stay open when the DS it comes from
2747    //is closed
2748    Factory.deleteResource(this);
2749  }
  /** Sets the persistence ID through the superclass and then registers this
    * document as a listener with the creole register.
    * @param lrID the persistence ID handed to the superclass
    */
  public void setLRPersistenceId(Object lrID) {
    super.setLRPersistenceId( lrID);
    //make persistent documents listen to the creole register
    //for events about their DS
    Gate.getCreoleRegister().addCreoleListener(this);
  }
  /** Datastore event callback; no action is taken. */
  public void resourceAdopted(DatastoreEvent evt) {
  }
2758  public void resourceDeleted(DatastoreEvent evt) {
2759    if(! evt.getSource().equals(this.getDataStore()))
2760      return;
2761    //if an open document is deleted from a DS, then
2762    //it must close itself immediately, as is no longer valid
2763    if(evt.getResourceID().equals(this.getLRPersistenceId()))
2764      Factory.deleteResource(this);
2765  }
  /** Datastore event callback; no action is taken. */
  public void resourceWritten(DatastoreEvent evt) {
  }
2768  public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException {
2769    super.setDataStore( dataStore);
2770    if (this.dataStore != null)
2771      this.dataStore.addDatastoreListener(this);
2772  }
2773
2774  /**
2775   * This method added by Shafirin Andrey, to allow access to
2776   * protected member {@link #defaultAnnots}
2777   * Required for JAPE-Debugger.
2778   * */
2779  public void setDefaultAnnotations(AnnotationSet defaultAnnotations) {
2780      defaultAnnots = defaultAnnotations;
2781  }
2782
2783} // class DocumentImpl
2784