| DocumentImpl.java |
1 /*
2 * DocumentImpl.java
3 *
4 * Copyright (c) 1998-2005, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Hamish Cunningham, 11/Feb/2000
12 *
13 * $Id: DocumentImpl.java,v 1.137 2006/03/08 15:56:44 cursu Exp $
14 */
15
16 package gate.corpora;
17
18 import java.io.IOException;
19 import java.net.URL;
20 import java.util.*;
21
22 import gate.*;
23 import gate.annotation.AnnotationSetImpl;
24 import gate.creole.AbstractLanguageResource;
25 import gate.creole.ResourceInstantiationException;
26 import gate.event.*;
27 import gate.util.*;
28
29 /** Represents the commonalities between all sorts of documents.
30 *
31 * <H2>Editing</H2>
32 *
33 * <P>
34 * The DocumentImpl class implements the Document interface.
35 * The DocumentContentImpl class models the textual or audio-visual
36 * materials which are the source and content of Documents.
37 * The AnnotationSetImpl class supplies annotations on Documents.
38 *
39 * <P>
40 * Abbreviations:
41 *
42 * <UL>
43 * <LI>
44 * DC = DocumentContent
45 * <LI>
46 * D = Document
47 * <LI>
48 * AS = AnnotationSet
49 * </UL>
50 *
51 * <P>
52 * We add an edit method to each of these classes; for DC and AS
53 * the methods are package private; D has the public method.
54 *
55 * <PRE>
56 * void edit(Long start, Long end, DocumentContent replacement)
57 * throws InvalidOffsetException;
58 * </PRE>
59 *
60 * <P>
61 * D receives edit requests and forwards them to DC and AS.
62 * On DC, this method makes a change to the content - e.g. replacing
63 * a String range from start to end with replacement. (Deletions
64 * are catered for by having replacement = null.) D then calls
65 * AS.edit on each of its annotation sets.
66 *
67 * <P>
68 * On AS, edit calls replacement.size() (i.e. DC.size()) to
69 * figure out how long the replacement is (0 for null). It then
70 * considers annotations that terminate (start or end) in
71 * the altered or deleted range as invalid; annotations that
72 * terminate after the range have their offsets adjusted.
73 * I.e.:
74 * <UL>
75 * <LI>
76 * the nodes that pointed inside the old modified area are invalid now and
77 * will be deleted along with the connected annotations;
78 * <LI>
79 * the nodes that are before the start of the modified area remain
80 * untouched;
81 * <LI>
82 * the nodes that are after the end of the affected area will have the
83 * offset changed according to the formula below.
84 * </UL>
85 *
86 * <P>
87 * A note re. AS and annotations: annotations no longer have
88 * offsets as in the old model, they now have nodes, and nodes
89 * have offsets.
90 *
91 * <P>
92 * To implement AS.edit, we have several indices:
93 * <PRE>
94 * HashMap annotsByStartNode, annotsByEndNode;
95 * </PRE>
96 * which map node ids to annotations;
97 * <PRE>
98 * RBTreeMap nodesByOffset;
99 * </PRE>
100 * which maps offset to Nodes.
101 *
102 * <P>
103 * When we get an edit request, we traverse that part of the
104 * nodesByOffset tree representing the altered or deleted
105 * range of the DC. For each node found, we delete any annotations
106 * that terminate on the node, and then delete the node itself.
107 * We then traverse the rest of the tree, changing the offset
108 * on all remaining nodes by:
109 * <PRE>
110 * newOffset =
111 * oldOffset -
112 * (
113 * (end - start) - // size of mod
114 * ( (replacement == null) ? 0 : replacement.size() ) // size of repl
115 * );
116 * </PRE>
117 * Note that we use the same convention as e.g. java.lang.String: start
118 * offsets are inclusive; end offsets are exclusive. I.e. for string "abcd"
119 * range 1-3 = "bc". Examples, for a node with offset 4:
120 * <PRE>
121 * edit(1, 3, "BC");
122 * newOffset = 4 - ( (3 - 1) - 2 ) = 4
123 *
124 * edit(1, 3, null);
125 * newOffset = 4 - ( (3 - 1) - 0 ) = 2
126 *
127 * edit(1, 3, "BBCC");
128 * newOffset = 4 - ( (3 - 1) - 4 ) = 6
129 * </PRE>
130 */
131 public class DocumentImpl
132 extends AbstractLanguageResource implements TextualDocument, CreoleListener,
133 DatastoreListener {
134 /** Debug flag */
135 private static final boolean DEBUG = false;
136
137 /** If you set this flag to true the original content of the document will
138 * be kept in the document feature. <br>
139 * Default value is false to avoid the unnecessary waste of memory */
140 private Boolean preserveOriginalContent = new Boolean(false);
141
142 /** If you set this flag to true the repositioning information for
143 * the document will be kept in the document feature. <br>
144 * Default value is false to avoid the unnecessary waste of time and memory
145 */
146 private Boolean collectRepositioningInfo = new Boolean(false);
147
148 /**
149 * This is a variable which contains the latest crossed over annotation
150 * found during export with preserving format, i.e., toXml(annotations)
151 * method.
152 */
153 private Annotation crossedOverAnnotation = null;
154
/**
 * Default construction: the string content starts as the empty string and
 * the document content is a default-constructed DocumentContentImpl.
 */
public DocumentImpl() {
  stringContent = "";
  content = new DocumentContentImpl();
} // DocumentImpl()
160
161 /** Cover unpredictable Features creation */
162 public FeatureMap getFeatures() {
163 if (features == null) {
164 features = new SimpleFeatureMapImpl();
165 }
166 return features;
167 }
168
169 /** Initialise this resource, and return it. */
170 public Resource init() throws ResourceInstantiationException {
171 // set up the source URL and create the content
172 if(sourceUrl == null) {
173 if(stringContent == null) {
174 throw new ResourceInstantiationException(
175 "The sourceURL and document's content were null."
176 );
177 }
178
179 content = new DocumentContentImpl(stringContent);
180 getFeatures().put("gate.SourceURL", "created from String");
181 } else {
182 try {
183 content = new DocumentContentImpl(
184 sourceUrl, getEncoding(), sourceUrlStartOffset, sourceUrlEndOffset);
185 getFeatures().put("gate.SourceURL", sourceUrl.toExternalForm());
186 } catch(IOException e) {
187 throw new ResourceInstantiationException("DocumentImpl.init: " + e);
188 }
189 }
190
191 if(preserveOriginalContent.booleanValue() && content != null) {
192 String originalContent = new String(
193 ((DocumentContentImpl) content).getOriginalContent());
194 getFeatures().put(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME,
195 originalContent);
196 } // if
197
198 // set up a DocumentFormat if markup unpacking required
199 if(getMarkupAware().booleanValue()) {
200 DocumentFormat docFormat =
201 DocumentFormat.getDocumentFormat(this, sourceUrl);
202 try {
203 if(docFormat != null){
204 StatusListener sListener = (StatusListener)
205 gate.gui.MainFrame.getListeners().
206 get("gate.event.StatusListener");
207 if(sListener != null) docFormat.addStatusListener(sListener);
208
209 // set the flag if true and if the document format support collecting
210 docFormat.setShouldCollectRepositioning(collectRepositioningInfo);
211
212 if(docFormat.getShouldCollectRepositioning().booleanValue()) {
213 // unpack with collectiong of repositioning information
214 RepositioningInfo info = new RepositioningInfo();
215
216 String origContent = (String) getFeatures().get(
217 GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
218
219 RepositioningInfo ampCodingInfo = new RepositioningInfo();
220 if(origContent != null) {
221 boolean shouldCorrectCR = docFormat instanceof XmlDocumentFormat;
222 collectInformationForAmpCodding(origContent, ampCodingInfo,
223 shouldCorrectCR);
224 if(docFormat instanceof HtmlDocumentFormat) {
225 collectInformationForWS(origContent, ampCodingInfo);
226 } // if
227 } // if
228
229 docFormat.unpackMarkup(this, info, ampCodingInfo);
230
231 if(origContent != null
232 && docFormat instanceof XmlDocumentFormat) {
233 // CRLF correction of RepositioningInfo
234 correctRepositioningForCRLFInXML(origContent, info);
235 } // if
236
237 getFeatures().put(
238 GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME, info);
239 }
240 else {
241 // normal old fashioned unpack
242 docFormat.unpackMarkup(this);
243 }
244 docFormat.removeStatusListener(sListener);
245 } //if format != null
246 } catch(DocumentFormatException e) {
247 throw new ResourceInstantiationException(
248 "Couldn't unpack markup in document " + (sourceUrl != null ? sourceUrl.toExternalForm() : "") +
249 "!", e);
250 }
251 } // if markup aware
252
253 //try{
254 // FileWriter fw = new FileWriter("d:/temp/doccontent.txt");
255 // fw.write(getContent().toString());
256 // fw.flush();
257 // fw.close();
258 //}catch(IOException ioe){
259 // ioe.printStackTrace();
260 //}
261
262 return this;
263 } // init()
264
265 /**
266 * Correct repositioning information for substitution of "\r\n" with "\n"
267 */
268 private void correctRepositioningForCRLFInXML(String content,
269 RepositioningInfo info) {
270 int index = -1;
271
272 do {
273 index = content.indexOf("\r\n", index+1);
274 if(index != -1) {
275 info.correctInformationOriginalMove(index, 1);
276 } // if
277 } while(index != -1);
278 } // correctRepositioningForCRLF
279
280 /**
281 * Collect information for substitution of "&xxx;" with "y"
282 *
283 * It couldn't be collected a position information about
284 * some unicode and &-coded symbols during parsing. The parser "hide" the
285 * information about the position of such kind of parsed text.
286 * So, there is minimal chance to have &-coded symbol inside the covered by
287 * repositioning records area. The new record should be created for every
288 * coded symbol outside the existing records.
289 * <BR>
290 * If <code>shouldCorrectCR</code> flag is <code>true</code> the correction
291 * for CRLF substitution is performed.
292 */
293 private void collectInformationForAmpCodding(String content,
294 RepositioningInfo info,
295 boolean shouldCorrectCR) {
296
297 if(content == null || info == null) return;
298
299 int ampIndex = -1;
300 int semiIndex;
301
302 do {
303 ampIndex = content.indexOf('&', ampIndex+1);
304 if(ampIndex != -1) {
305 semiIndex = content.indexOf(';', ampIndex+1);
306 // have semicolon and it is near enough for amp codding
307 if(semiIndex != -1 && (semiIndex-ampIndex) < 8) {
308 info.addPositionInfo(ampIndex, semiIndex-ampIndex+1, 0, 1);
309 }
310 else {
311 // no semicolon or it is too far
312 // analyse for amp codding without semicolon
313 int maxEnd = Math.min(ampIndex+8, content.length());
314 String ampCandidate = content.substring(ampIndex, maxEnd);
315 int ampCodingSize = analyseAmpCodding(ampCandidate);
316
317 if(ampCodingSize != -1) {
318 info.addPositionInfo(ampIndex, ampCodingSize, 0, 1);
319 } // if
320
321 } // if - semicolon found
322 } // if - ampersand found
323 } while (ampIndex != -1);
324
325 // correct the collected information to adjust it's positions
326 // with reported by the parser
327 int index = -1;
328
329 if(shouldCorrectCR) {
330 do {
331 index = content.indexOf("\r\n", index+1);
332 if(index != -1) {
333 info.correctInformationOriginalMove(index, -1);
334 } // if
335 } while(index != -1);
336 } // if
337 } // collectInformationForAmpCodding
338
339 /**
340 * This function compute size of the ampersand codded sequence when
341 * semicolin is not present.
342 */
343 private int analyseAmpCodding(String content) {
344 int result = -1;
345
346 try {
347 char ch = content.charAt(1);
348
349 switch(ch) {
350 case 'l' : // <
351 case 'L' : // <
352 if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
353 result = 3;
354 } // if
355 break;
356 case 'g' : // >
357 case 'G' : // >
358 if(content.charAt(2) == 't' || content.charAt(2) == 'T') {
359 result = 3;
360 } // if
361 break;
362 case 'a' : // &
363 case 'A' : // &
364 if(content.substring(2, 4).equalsIgnoreCase("mp")) {
365 result = 4;
366 } // if
367 break;
368 case 'q' : // "
369 case 'Q' : // "
370 if(content.substring(2, 5).equalsIgnoreCase("uot")) {
371 result = 5;
372 } // if
373 break;
374 case '#' : // #number (example ‘, 䰸)
375 int endIndex = 2;
376 boolean hexCoded = false;
377 if(content.charAt(2) == 'x' || content.charAt(2) == 'X') {
378 // Hex codding
379 ++endIndex;
380 hexCoded = true;
381 } // if
382
383 while (endIndex < 8
384 && isNumber(content.charAt(endIndex), hexCoded) ) {
385 ++endIndex;
386 } // while
387 result = endIndex;
388 break;
389 } // switch
390 } catch (StringIndexOutOfBoundsException ex) {
391 // do nothing
392 } // catch
393
394 return result;
395 } // analyseAmpCodding
396
397 /** Check for numeric range. If hex is true the A..F range is included */
398 private boolean isNumber(char ch, boolean hex) {
399 if(ch >= '0' && ch <= '9') return true;
400
401 if(hex) {
402 if(ch >= 'A' && ch <= 'F') return true;
403 if(ch >= 'a' && ch <= 'f') return true;
404 } // if
405
406 return false;
407 } // isNumber
408
409 /** HTML parser perform substitution of multiple whitespaces (WS) with
410 * a single WS. To create correct repositioning information structure we
411 * should keep the information for such multiple WS.
412 * <BR>
413 * The criteria for WS is <code>(ch <= ' ')</code>.
414 */
415 private void collectInformationForWS(String content, RepositioningInfo info) {
416
417 if(content == null || info == null) return;
418
419 // analyse the content and correct the repositioning information
420 char ch;
421 int startWS, endWS;
422
423 startWS = endWS = -1;
424 int contentLength = content.length();
425
426 for(int i=0; i<contentLength; ++i) {
427 ch = content.charAt(i);
428
429 // is whitespace
430 if(ch <= ' ') {
431 if(startWS == -1) {
432 startWS = i;
433 } // if
434 endWS = i;
435 }
436 else {
437 if(endWS - startWS > 0) {
438 // put the repositioning information about the WS substitution
439 info.addPositionInfo(
440 (long)startWS, (long)(endWS - startWS + 1), 0, 1);
441 } // if
442 // clear positions
443 startWS = endWS = -1;
444 }// if
445 } // for
446 } // collectInformationForWS
447
448 /** Clear all the data members of the object. */
449 public void cleanup() {
450
451 defaultAnnots = null;
452 if ( (namedAnnotSets != null) && (!namedAnnotSets.isEmpty()))
453 namedAnnotSets.clear();
454 if (DEBUG) Out.prln("Document cleanup called");
455 if (this.lrPersistentId != null)
456 Gate.getCreoleRegister().removeCreoleListener(this);
457 if(this.getDataStore() != null)
458 this.getDataStore().removeDatastoreListener(this);
459 } // cleanup()
460
461
462 /** Documents are identified by URLs */
463 public URL getSourceUrl() { return sourceUrl; }
464
465 /** Set method for the document's URL */
466 public void setSourceUrl(URL sourceUrl) {
467 this.sourceUrl = sourceUrl;
468 } // setSourceUrl
469
470 /** Documents may be packed within files; in this case an optional pair of
471 * offsets refer to the location of the document.
472 */
473 public Long[] getSourceUrlOffsets() {
474 Long[] sourceUrlOffsets = new Long[2];
475 sourceUrlOffsets[0] = sourceUrlStartOffset;
476 sourceUrlOffsets[1] = sourceUrlEndOffset;
477 return sourceUrlOffsets;
478 } // getSourceUrlOffsets
479
480 /**
481 * Allow/disallow preserving of the original document content.
482 * If is <B>true</B> the original content will be retrieved from
483 * the DocumentContent object and preserved as document feature.
484 */
485 public void setPreserveOriginalContent(Boolean b) {
486 preserveOriginalContent = b;
487 } // setPreserveOriginalContent
488
489 /** Get the preserving of content status of the Document.
490 *
491 * @return whether the Document should preserve it's original content.
492 */
493 public Boolean getPreserveOriginalContent() {
494 return preserveOriginalContent;
495 } // getPreserveOriginalContent
496
497 /**
498 * Allow/disallow collecting of repositioning information.
499 * If is <B>true</B> information will be retrieved and preserved
500 * as document feature.<BR>
501 * Preserving of repositioning information give the possibilities
502 * for converting of coordinates between the original document content and
503 * extracted from the document text.
504 */
505 public void setCollectRepositioningInfo(Boolean b) {
506 collectRepositioningInfo = b;
507 } // setCollectRepositioningInfo
508
509 /** Get the collectiong and preserving of repositioning information
510 * for the Document. <BR>
511 * Preserving of repositioning information give the possibilities
512 * for converting of coordinates between the original document content and
513 * extracted from the document text.
514 *
515 * @return whether the Document should collect and preserve information.
516 */
517 public Boolean getCollectRepositioningInfo() {
518 return collectRepositioningInfo;
519 } // getCollectRepositioningInfo
520
521 /** Documents may be packed within files; in this case an optional pair of
522 * offsets refer to the location of the document. This method gets the
523 * start offset.
524 */
525 public Long getSourceUrlStartOffset() { return sourceUrlStartOffset; }
526
527 /** Documents may be packed within files; in this case an optional pair of
528 * offsets refer to the location of the document. This method sets the
529 * start offset.
530 */
531 public void setSourceUrlStartOffset(Long sourceUrlStartOffset) {
532 this.sourceUrlStartOffset = sourceUrlStartOffset;
533 } // setSourceUrlStartOffset
534
535 /** Documents may be packed within files; in this case an optional pair of
536 * offsets refer to the location of the document. This method gets the
537 * end offset.
538 */
539 public Long getSourceUrlEndOffset() { return sourceUrlEndOffset; }
540
541 /** Documents may be packed within files; in this case an optional pair of
542 * offsets refer to the location of the document. This method sets the
543 * end offset.
544 */
545 public void setSourceUrlEndOffset(Long sourceUrlEndOffset) {
546 this.sourceUrlEndOffset = sourceUrlEndOffset;
547 } // setSourceUrlStartOffset
548
549 /** The content of the document: a String for text; MPEG for video; etc. */
550 public DocumentContent getContent() { return content; }
551
552 /** Set method for the document content */
553 public void setContent(DocumentContent content) {
554 this.content = content;
555 this.stringContent = content.toString();
556 }
557
558 /** Get the encoding of the document content source */
559 public String getEncoding() {
560 //we need to make sure we ALWAYS have an encoding
561 if(encoding == null || encoding.trim().length() == 0){
562 //no encoding definded: use the platform default
563 encoding = java.nio.charset.Charset.forName(
564 System.getProperty("file.encoding")).name();
565 }
566 return encoding;
567 }
568
569 /** Set the encoding of the document content source */
570 public void setEncoding(String encoding) { this.encoding = encoding; }
571
572 /** Get the default set of annotations. The set is created if it
573 * doesn't exist yet.
574 */
575 public AnnotationSet getAnnotations() {
576 if(defaultAnnots == null){
577 defaultAnnots = new AnnotationSetImpl(this);
578 fireAnnotationSetAdded(new DocumentEvent(
579 this, DocumentEvent.ANNOTATION_SET_ADDED, null));
580 }//if
581 return defaultAnnots;
582 } // getAnnotations()
583
584 /** Get a named set of annotations. Creates a new set if one with this
585 * name doesn't exist yet.
586 * If the provided name is null then it returns the default annotation set.
587 */
588 public AnnotationSet getAnnotations(String name) {
589 if(name == null) return getAnnotations();
590 if(namedAnnotSets == null)
591 namedAnnotSets = new HashMap();
592 AnnotationSet namedSet = (AnnotationSet) namedAnnotSets.get(name);
593
594 if(namedSet == null) {
595 namedSet = new AnnotationSetImpl(this, name);
596 namedAnnotSets.put(name, namedSet);
597
598 DocumentEvent evt = new DocumentEvent(
599 this, DocumentEvent.ANNOTATION_SET_ADDED, name
600 );
601 fireAnnotationSetAdded(evt);
602 }
603 return namedSet;
604 } // getAnnotations(name)
605
606 /** Make the document markup-aware. This will trigger the creation
607 * of a DocumentFormat object at Document initialisation time; the
608 * DocumentFormat object will unpack the markup in the Document and
609 * add it as annotations. Documents are <B>not</B> markup-aware by default.
610 *
611 * @param newMarkupAware markup awareness status.
612 */
613 public void setMarkupAware(Boolean newMarkupAware) {
614 this.markupAware = newMarkupAware;
615 }
616
617 /** Get the markup awareness status of the Document.
618 * <B>Documents are markup-aware by default.</B>
619 * @return whether the Document is markup aware.
620 */
621 public Boolean getMarkupAware() { return markupAware; }
622
623 /** Returns an XML document aming to preserve the original markups(
624 * the original markup will be in the same place and format as it was
625 * before processing the document) and include (if possible)
626 * the annotations specified in the aSourceAnnotationSet.
627 * It is equivalent to toXml(aSourceAnnotationSet, true).
628 */
629 public String toXml(Set aSourceAnnotationSet){
630 return toXml(aSourceAnnotationSet, true);
631 }
632
633 /** Returns an XML document aming to preserve the original markups(
634 * the original markup will be in the same place and format as it was
635 * before processing the document) and include (if possible)
636 * the annotations specified in the aSourceAnnotationSet.
637 * <b>Warning:</b> Annotations from the aSourceAnnotationSet will be lost
638 * if they will cause a crosed over situation.
639 * @param aSourceAnnotationSet is an annotation set containing all the
640 * annotations that will be combined with the original marup set. If the
641 * param is <code>null</code> it will only dump the original markups.
642 * @param includeFeatures is a boolean that controls whether the annotation
643 * features should be included or not. If false, only the annotation type
644 * is included in the tag.
645 * @return a string representing an XML document containing the original
646 * markup + dumped annotations form the aSourceAnnotationSet
647 */
648 public String toXml(Set aSourceAnnotationSet, boolean includeFeatures){
649
650 if(hasOriginalContentFeatures()) {
651 return saveAnnotationSetAsXmlInOrig(aSourceAnnotationSet,includeFeatures);
652 } // if
653
654 AnnotationSet originalMarkupsAnnotSet =
655 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
656
657 // Create a dumping annotation set on the document. It will be used for
658 // dumping annotations...
659 // AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
660 List dumpingList = new ArrayList(originalMarkupsAnnotSet.size());
661
662 // This set will be constructed inside this method. If is not empty, the
663 // annotation contained will be lost.
664 /* if (!dumpingSet.isEmpty()){
665 Out.prln("WARNING: The dumping annotation set was not empty."+
666 "All annotation it contained were lost.");
667 dumpingSet.clear();
668 }// End if
669 */
670 StatusListener sListener = (StatusListener)
671 gate.gui.MainFrame.getListeners().
672 get("gate.event.StatusListener");
673 // Construct the dumping set in that way that all annotations will verify
674 // the condition that there are not annotations which are crossed.
675 // First add all annotation from the original markups
676 if(sListener != null)
677 sListener.statusChanged("Constructing the dumping annotation set.");
678 // dumpingSet.addAll(originalMarkupsAnnotSet);
679 dumpingList.addAll(originalMarkupsAnnotSet);
680 // Then take all the annotations from aSourceAnnotationSet and verify if
681 // they can be inserted safely into the dumpingSet. Where not possible,
682 // report.
683 if (aSourceAnnotationSet != null){
684 Iterator iter = aSourceAnnotationSet.iterator();
685 while (iter.hasNext()){
686 Annotation currentAnnot = (Annotation) iter.next();
687 if(insertsSafety(dumpingList,currentAnnot)){
688 // dumpingSet.add(currentAnnot);
689 dumpingList.add(currentAnnot);
690 }else if (crossedOverAnnotation != null && DEBUG){
691 try {
692 Out.prln("Warning: Annotations were found to violate the " +
693 "crossed over condition: \n" +
694 "1. [" +
695 getContent().getContent(
696 crossedOverAnnotation.getStartNode().getOffset(),
697 crossedOverAnnotation.getEndNode().getOffset()) +
698 " (" + crossedOverAnnotation.getType() + ": " +
699 crossedOverAnnotation.getStartNode().getOffset() +
700 ";" + crossedOverAnnotation.getEndNode().getOffset() +
701 ")]\n" +
702 "2. [" +
703 getContent().getContent(
704 currentAnnot.getStartNode().getOffset(),
705 currentAnnot.getEndNode().getOffset()) +
706 " (" + currentAnnot.getType() + ": " +
707 currentAnnot.getStartNode().getOffset() +
708 ";" + currentAnnot.getEndNode().getOffset() +
709 ")]\nThe second one will be discarded.\n" );
710 } catch (gate.util.InvalidOffsetException ex) {
711 throw new GateRuntimeException(ex.getMessage());
712 }
713 }// End if
714 }// End while
715 }// End if
716
717 //kalina: order the dumping list by start offset
718 Collections.sort(dumpingList, new gate.util.OffsetComparator());
719
720 // The dumpingSet is ready to be exported as XML
721 // Here we go.
722 if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
723 StringBuffer xmlDoc = new StringBuffer(
724 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
725
726 // Add xml header if original format was xml
727 String mimeType = getFeatures() == null ?
728 null :
729 (String)getFeatures().get("MimeType");
730 boolean wasXML = mimeType != null && mimeType.equalsIgnoreCase("text/xml");
731
732 if(wasXML){
733 xmlDoc.append("<?xml version=\"1.0\" encoding=\"");
734 xmlDoc.append(getEncoding());
735 xmlDoc.append("\" ?>");
736 xmlDoc.append(Strings.getNl());
737 }// ENd if
738 // Identify and extract the root annotation from the dumpingSet.
739 theRootAnnotation = identifyTheRootAnnotation(dumpingList);
740 // If a root annotation has been identified then add it eplicitley at the
741 // beginning of the document
742 if (theRootAnnotation != null){
743 dumpingList.remove(theRootAnnotation);
744 xmlDoc.append(writeStartTag(theRootAnnotation,includeFeatures));
745 }// End if
746 // Construct and append the rest of the document
747 xmlDoc.append(saveAnnotationSetAsXml(dumpingList, includeFeatures));
748 // If a root annotation has been identified then add it eplicitley at the
749 // end of the document
750 if (theRootAnnotation != null){
751 xmlDoc.append(writeEndTag(theRootAnnotation));
752 }// End if
753
754 if(sListener != null) sListener.statusChanged("Done.");
755 return xmlDoc.toString();
756 }//End toXml()
757
758 /** This method verifies if aSourceAnnotation can ve inserted safety into the
759 * aTargetAnnotSet. Safety means that it doesn't violate the crossed over
760 * contition with any annotation from the aTargetAnnotSet.
761 * @param aTargetAnnotSet the annotation set to include the aSourceAnnotation
762 * @param aSourceAnnotation the annotation to be inserted into the
763 * aTargetAnnotSet
764 * @return true if the annotation inserts safety, or false otherwise.
765 */
766 private boolean insertsSafety(AnnotationSet aTargetAnnotSet,
767 Annotation aSourceAnnotation){
768
769 if (aTargetAnnotSet == null || aSourceAnnotation == null) {
770 this.crossedOverAnnotation = null;
771 return false;
772 }
773 if (aSourceAnnotation.getStartNode() == null ||
774 aSourceAnnotation.getStartNode().getOffset()== null) {
775 this.crossedOverAnnotation = null;
776 return false;
777 }
778 if (aSourceAnnotation.getEndNode() == null ||
779 aSourceAnnotation.getEndNode().getOffset()== null) {
780 this.crossedOverAnnotation = null;
781 return false;
782 }
783
784 // Get the start and end offsets
785 Long start = aSourceAnnotation.getStartNode().getOffset();
786 Long end = aSourceAnnotation.getEndNode().getOffset();
787 // Read aSourceAnnotation offsets long
788 long s2 = start.longValue();
789 long e2 = end.longValue();
790
791 // Obtain a set with all annotations annotations that overlap
792 // totaly or partially with the interval defined by the two provided offsets
793 AnnotationSet as = aTargetAnnotSet.get(start,end);
794
795 // Investigate all the annotations from as to see if there is one that
796 // comes in conflict with aSourceAnnotation
797 Iterator it = as.iterator();
798 while(it.hasNext()){
799 Annotation ann = (Annotation) it.next();
800 // Read ann offsets
801 long s1 = ann.getStartNode().getOffset().longValue();
802 long e1 = ann.getEndNode().getOffset().longValue();
803
804 if (s1<s2 && s2<e1 && e1<e2) {
805 this.crossedOverAnnotation = ann;
806 return false;
807 }
808 if (s2<s1 && s1<e2 && e2<e1) {
809 this.crossedOverAnnotation = ann;
810 return false;
811 }
812 }// End while
813 return true;
814 }// insertsSafety()
815
816 private boolean insertsSafety(List aTargetAnnotList,
817 Annotation aSourceAnnotation){
818
819 if (aTargetAnnotList == null || aSourceAnnotation == null) {
820 this.crossedOverAnnotation = null;
821 return false;
822 }
823 if (aSourceAnnotation.getStartNode() == null ||
824 aSourceAnnotation.getStartNode().getOffset()== null) {
825 this.crossedOverAnnotation = null;
826 return false;
827 }
828 if (aSourceAnnotation.getEndNode() == null ||
829 aSourceAnnotation.getEndNode().getOffset()== null) {
830 this.crossedOverAnnotation = null;
831 return false;
832 }
833
834 // Get the start and end offsets
835 Long start = aSourceAnnotation.getStartNode().getOffset();
836 Long end = aSourceAnnotation.getEndNode().getOffset();
837 // Read aSourceAnnotation offsets long
838 long s2 = start.longValue();
839 long e2 = end.longValue();
840
841 // Obtain a set with all annotations annotations that overlap
842 // totaly or partially with the interval defined by the two provided offsets
843 List as = new ArrayList();
844 for (int i=0; i < aTargetAnnotList.size(); i++) {
845 Annotation annot = (Annotation) aTargetAnnotList.get(i);
846 if (annot.getStartNode().getOffset().longValue() >= s2
847 &&
848 annot.getStartNode().getOffset().longValue() <= e2)
849 as.add(annot);
850 else if (annot.getEndNode().getOffset().longValue() >= s2
851 &&
852 annot.getEndNode().getOffset().longValue() <= e2)
853 as.add(annot);
854 }
855
856 // Investigate all the annotations from as to see if there is one that
857 // comes in conflict with aSourceAnnotation
858 Iterator it = as.iterator();
859 while(it.hasNext()){
860 Annotation ann = (Annotation) it.next();
861 // Read ann offsets
862 long s1 = ann.getStartNode().getOffset().longValue();
863 long e1 = ann.getEndNode().getOffset().longValue();
864
865 if (s1<s2 && s2<e1 && e1<e2) {
866 this.crossedOverAnnotation = ann;
867 return false;
868 }
869 if (s2<s1 && s1<e2 && e2<e1) {
870 this.crossedOverAnnotation = ann;
871 return false;
872 }
873 }// End while
874 return true;
875 }// insertsSafety()
876
877 /** This method saves all the annotations from aDumpAnnotSet and combines
878 * them with the document content.
879 * @param aDumpAnnotSet is a GATE annotation set prepared to be used
880 * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
881 * then an empty string will be returned.
882 * @param includeFeatures is a boolean, which controls whether the annotation
883 * features and gate ID are included or not.
884 * @return The XML document obtained from raw text + the information from
885 * the dump annotation set.
886 */
887 private String saveAnnotationSetAsXml(AnnotationSet aDumpAnnotSet,
888 boolean includeFeatures){
889 String content = null;
890 if (this.getContent()== null)
891 content = new String("");
892 else
893 content = this.getContent().toString();
894 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
895 if (aDumpAnnotSet == null) return docContStrBuff.toString();
896
897 TreeMap offsets2CharsMap = new TreeMap();
898 if (this.getContent().size().longValue() != 0){
899 // Fill the offsets2CharsMap with all the indices where
900 // special chars appear
901 buildEntityMapFromString(content,offsets2CharsMap);
902 }//End if
903 // The saving alghorithm is as follows:
904 ///////////////////////////////////////////
905 // Construct a set of annot with all IDs in asc order.
906 // All annotations that end at that offset swap their place in descending
907 // order. For each node write all the tags from left to right.
908
909 // Construct the node set
910 TreeSet offsets = new TreeSet();
911 Iterator iter = aDumpAnnotSet.iterator();
912 while (iter.hasNext()){
913 Annotation annot = (Annotation) iter.next();
914 offsets.add(annot.getStartNode().getOffset());
915 offsets.add(annot.getEndNode().getOffset());
916 }// End while
917
918 // ofsets is sorted in ascending order.
919 // Iterate this set in descending order and remove an offset at each
920 // iteration
921 while (!offsets.isEmpty()){
922 Long offset = (Long)offsets.last();
923 // Remove the offset from the set
924 offsets.remove(offset);
925 // Now, use it.
926 // Returns a list with annotations that needs to be serialized in that
927 // offset.
928 List annotations = getAnnotationsForOffset(aDumpAnnotSet,offset);
929 // Attention: the annotation are serialized from left to right
930 // StringBuffer tmpBuff = new StringBuffer("");
931 StringBuffer tmpBuff = new StringBuffer(
932 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
933 Stack stack = new Stack();
934 // Iterate through all these annotations and serialize them
935 Iterator it = annotations.iterator();
936 while(it.hasNext()){
937 Annotation a = (Annotation) it.next();
938 it.remove();
939 // Test if a Ends at offset
940 if ( offset.equals(a.getEndNode().getOffset()) ){
941 // Test if a Starts at offset
942 if ( offset.equals(a.getStartNode().getOffset()) ){
943 // Here, the annotation a Starts and Ends at the offset
944 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
945 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
946
947 // Assert: annotation a with start == end and isEmptyAndSpan
948 tmpBuff.append(writeStartTag(a, includeFeatures));
949 stack.push(a);
950 }else{
951 // Assert annotation a with start == end and an empty tag
952 tmpBuff.append(writeEmptyTag(a));
953 // The annotation is removed from dumped set
954 aDumpAnnotSet.remove(a);
955 }// End if
956 }else{
957 // Here the annotation a Ends at the offset.
958 // In this case empty the stack and write the end tag
959 if (!stack.isEmpty()){
960 while(!stack.isEmpty()){
961 Annotation a1 = (Annotation)stack.pop();
962 tmpBuff.append(writeEndTag(a1));
963 }// End while
964 }// End if
965 tmpBuff.append(writeEndTag(a));
966 }// End if
967 }else{
968 // The annotation a does NOT end at the offset. Let's see if it starts
969 // at the offset
970 if ( offset.equals(a.getStartNode().getOffset()) ){
971 // The annotation a starts at the offset.
972 // In this case empty the stack and write the end tag
973 if (!stack.isEmpty()){
974 while(!stack.isEmpty()){
975 Annotation a1 = (Annotation)stack.pop();
976 tmpBuff.append(writeEndTag(a1));
977 }// End while
978 }// End if
979 tmpBuff.append(writeStartTag(a, includeFeatures));
980 // The annotation is removed from dumped set
981 aDumpAnnotSet.remove(a);
982 }// End if ( offset.equals(a.getStartNode().getOffset()) )
983 }// End if ( offset.equals(a.getEndNode().getOffset()) )
984 }// End while(it.hasNext()){
985
986 // In this case empty the stack and write the end tag
987 if (!stack.isEmpty()){
988 while(!stack.isEmpty()){
989 Annotation a1 = (Annotation)stack.pop();
990 tmpBuff.append(writeEndTag(a1));
991 }// End while
992 }// End if
993
994 // Before inserting tmpBuff into docContStrBuff we need to check
995 // if there are chars to be replaced and if there are, they would be
996 // replaced.
997 if (!offsets2CharsMap.isEmpty()){
998 Long offsChar = (Long) offsets2CharsMap.lastKey();
999 while( !offsets2CharsMap.isEmpty() &&
1000 offsChar.intValue() >= offset.intValue()){
1001 // Replace the char at offsChar with its corresponding entity form
1002 // the entitiesMap.
1003 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1004 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1005 // Discard the offsChar after it was used.
1006 offsets2CharsMap.remove(offsChar);
1007 // Investigate next offsChar
1008 if (!offsets2CharsMap.isEmpty())
1009 offsChar = (Long) offsets2CharsMap.lastKey();
1010 }// End while
1011 }// End if
1012 // Insert tmpBuff to the location where it belongs in docContStrBuff
1013 docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1014 }// End while(!offsets.isEmpty())
1015 // Need to replace the entities in the remaining text, if there is any text
1016 // So, if there are any more items in offsets2CharsMap they need to be
1017 // replaced
1018 while (!offsets2CharsMap.isEmpty()){
1019 Long offsChar = (Long) offsets2CharsMap.lastKey();
1020 // Replace the char with its entity
1021 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1022 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1023 // remove the offset from the map
1024 offsets2CharsMap.remove(offsChar);
1025 }// End while
1026 return docContStrBuff.toString();
1027 }// saveAnnotationSetAsXml()
1028
1029 private String saveAnnotationSetAsXml(List aDumpAnnotList,
1030 boolean includeFeatures){
1031 String content = null;
1032 if (this.getContent()== null)
1033 content = new String("");
1034 else
1035 content = this.getContent().toString();
1036 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1037 if (aDumpAnnotList == null) return docContStrBuff.toString();
1038
1039 StringBuffer resultStrBuff = new StringBuffer(
1040 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1041 // last offset position used to extract portions of text
1042 Long lastOffset = new Long(0);
1043
1044 TreeMap offsets2CharsMap = new TreeMap();
1045 HashMap annotsForOffset = new HashMap(100);
1046 if (this.getContent().size().longValue() != 0){
1047 // Fill the offsets2CharsMap with all the indices where
1048 // special chars appear
1049 buildEntityMapFromString(content,offsets2CharsMap);
1050 }//End if
1051 // The saving alghorithm is as follows:
1052 ///////////////////////////////////////////
1053 // Construct a set of annot with all IDs in asc order.
1054 // All annotations that end at that offset swap their place in descending
1055 // order. For each node write all the tags from left to right.
1056
1057 // Construct the node set
1058 TreeSet offsets = new TreeSet();
1059 Iterator iter = aDumpAnnotList.iterator();
1060 Annotation annot;
1061 Long start;
1062 Long end;
1063 while (iter.hasNext()){
1064 annot = (Annotation) iter.next();
1065 start = annot.getStartNode().getOffset();
1066 end = annot.getEndNode().getOffset();
1067 offsets.add(start);
1068 offsets.add(end);
1069 if (annotsForOffset.containsKey(start)) {
1070 ((List) annotsForOffset.get(start)).add(annot);
1071 } else {
1072 List newList = new ArrayList(10);
1073 newList.add(annot);
1074 annotsForOffset.put(start, newList);
1075 }
1076 if (annotsForOffset.containsKey(end)) {
1077 ((List) annotsForOffset.get(end)).add(annot);
1078 } else {
1079 List newList = new ArrayList(10);
1080 newList.add(annot);
1081 annotsForOffset.put(end, newList);
1082 }
1083 }// End while
1084
1085 // ofsets is sorted in ascending order.
1086 // Iterate this set in descending order and remove an offset at each
1087 // iteration
1088 Iterator offsetIt = offsets.iterator();
1089 Long offset;
1090 List annotations;
1091 // This don't have to be a large buffer - just for tags
1092 StringBuffer tmpBuff = new StringBuffer(255);
1093 Stack stack = new Stack();
1094 while (offsetIt.hasNext()){
1095 offset = (Long)offsetIt.next();
1096 // Now, use it.
1097 // Returns a list with annotations that needs to be serialized in that
1098 // offset.
1099 annotations = (List) annotsForOffset.get(offset);
1100 // order annotations in list for offset to print tags in correct order
1101 annotations = getAnnotationsForOffset(annotations, offset);
1102 // clear structures
1103 tmpBuff.setLength(0);
1104 stack.clear();
1105
1106 // Iterate through all these annotations and serialize them
1107 Iterator it = annotations.iterator();
1108 Annotation a;
1109 Annotation annStack;
1110 while(it.hasNext()){
1111 a = (Annotation) it.next();
1112 // Test if a Ends at offset
1113 if ( offset.equals(a.getEndNode().getOffset()) ){
1114 // Test if a Starts at offset
1115 if ( offset.equals(a.getStartNode().getOffset()) ){
1116 // Here, the annotation a Starts and Ends at the offset
1117 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1118 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1119
1120 // Assert: annotation a with start == end and isEmptyAndSpan
1121 tmpBuff.append(writeStartTag(a, includeFeatures));
1122 stack.push(a);
1123 }else{
1124 // Assert annotation a with start == end and an empty tag
1125 tmpBuff.append(writeEmptyTag(a));
1126 // The annotation is removed from dumped set
1127 aDumpAnnotList.remove(a);
1128 }// End if
1129 }else{
1130 // Here the annotation a Ends at the offset.
1131 // In this case empty the stack and write the end tag
1132 if (!stack.isEmpty()){
1133 while(!stack.isEmpty()){
1134 annStack = (Annotation)stack.pop();
1135 tmpBuff.append(writeEndTag(annStack));
1136 }// End while
1137 }// End if
1138 tmpBuff.append(writeEndTag(a));
1139 }// End if
1140 }else{
1141 // The annotation a does NOT end at the offset. Let's see if it starts
1142 // at the offset
1143 if ( offset.equals(a.getStartNode().getOffset()) ){
1144 // The annotation a starts at the offset.
1145 // In this case empty the stack and write the end tag
1146 if (!stack.isEmpty()){
1147 while(!stack.isEmpty()){
1148 annStack = (Annotation)stack.pop();
1149 tmpBuff.append(writeEndTag(annStack));
1150 }// End while
1151 }// End if
1152 tmpBuff.append(writeStartTag(a, includeFeatures));
1153 // The annotation is removed from dumped set
1154 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1155 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1156 }// End while(it.hasNext()){
1157
1158 // In this case empty the stack and write the end tag
1159 if (!stack.isEmpty()){
1160 while(!stack.isEmpty()){
1161 annStack = (Annotation)stack.pop();
1162 tmpBuff.append(writeEndTag(annStack));
1163 }// End while
1164 }// End if
1165
1166 // extract text from content and replace spec chars
1167 StringBuffer partText = new StringBuffer();
1168 SortedMap offsetsInRange =
1169 offsets2CharsMap.subMap(lastOffset, offset);
1170 Long tmpOffset;
1171 Long tmpLastOffset = lastOffset;
1172 String replacement;
1173
1174 // Before inserting tmpBuff into the buffer we need to check
1175 // if there are chars to be replaced in range
1176 if(!offsetsInRange.isEmpty()) {
1177 tmpOffset = (Long) offsetsInRange.firstKey();
1178 replacement =
1179 (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1180 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1181 tmpOffset.intValue()));
1182 partText.append(replacement);
1183 tmpLastOffset = new Long(tmpOffset.longValue()+1);
1184 }
1185 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1186 offset.intValue()));
1187 resultStrBuff.append(partText);
1188 // Insert tmpBuff to the result string
1189 resultStrBuff.append(tmpBuff.toString());
1190 lastOffset = offset;
1191 }// End while(!offsets.isEmpty())
1192
1193 // get text to the end of content
1194 // extract text from content and replace spec chars
1195 StringBuffer partText = new StringBuffer();
1196 SortedMap offsetsInRange =
1197 offsets2CharsMap.subMap(lastOffset, new Long(docContStrBuff.length()));
1198 Long tmpOffset;
1199 Long tmpLastOffset = lastOffset;
1200 String replacement;
1201
1202 // Need to replace the entities in the remaining text, if there is any text
1203 // So, if there are any more items in offsets2CharsMap for remaining text
1204 // they need to be replaced
1205 if(!offsetsInRange.isEmpty()) {
1206 tmpOffset = (Long) offsetsInRange.firstKey();
1207 replacement =
1208 (String)entitiesMap.get((Character)offsets2CharsMap.get(tmpOffset));
1209 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1210 tmpOffset.intValue()));
1211 partText.append(replacement);
1212 tmpLastOffset = new Long(tmpOffset.longValue()+1);
1213 }
1214 partText.append(docContStrBuff.substring(tmpLastOffset.intValue(),
1215 docContStrBuff.length()));
1216 resultStrBuff.append(partText);
1217
1218 return resultStrBuff.toString();
1219 }// saveAnnotationSetAsXml()
1220
1221/* Old method created by Cristian. Create content backward.
1222
1223 private String saveAnnotationSetAsXml(List aDumpAnnotList,
1224 boolean includeFeatures){
1225 String content = null;
1226 if (this.getContent()== null)
1227 content = new String("");
1228 else
1229 content = this.getContent().toString();
1230 StringBuffer docContStrBuff = filterNonXmlChars(new StringBuffer(content));
1231 if (aDumpAnnotList == null) return docContStrBuff.toString();
1232
1233 TreeMap offsets2CharsMap = new TreeMap();
1234 HashMap annotsForOffset = new HashMap(100);
1235 if (this.getContent().size().longValue() != 0){
1236 // Fill the offsets2CharsMap with all the indices where
1237 // special chars appear
1238 buildEntityMapFromString(content,offsets2CharsMap);
1239 }//End if
1240 // The saving alghorithm is as follows:
1241 ///////////////////////////////////////////
1242 // Construct a set of annot with all IDs in asc order.
1243 // All annotations that end at that offset swap their place in descending
1244 // order. For each node write all the tags from left to right.
1245
1246 // Construct the node set
1247 TreeSet offsets = new TreeSet();
1248 Iterator iter = aDumpAnnotList.iterator();
1249 while (iter.hasNext()){
1250 Annotation annot = (Annotation) iter.next();
1251 offsets.add(annot.getStartNode().getOffset());
1252 offsets.add(annot.getEndNode().getOffset());
1253 if (annotsForOffset.containsKey(annot.getStartNode().getOffset())) {
1254 ((List) annotsForOffset.get(annot.getStartNode().getOffset())).add(annot);
1255 } else {
1256 List newList = new ArrayList(10);
1257 newList.add(annot);
1258 annotsForOffset.put(annot.getStartNode().getOffset(), newList);
1259 }
1260 if (annotsForOffset.containsKey(annot.getEndNode().getOffset())) {
1261 ((List) annotsForOffset.get(annot.getEndNode().getOffset())).add(annot);
1262 } else {
1263 List newList = new ArrayList(10);
1264 newList.add(annot);
1265 annotsForOffset.put(annot.getEndNode().getOffset(), newList);
1266 }
1267 }// End while
1268
1269 // ofsets is sorted in ascending order.
1270 // Iterate this set in descending order and remove an offset at each
1271 // iteration
1272 while (!offsets.isEmpty()){
1273 Long offset = (Long)offsets.last();
1274 // Remove the offset from the set
1275 offsets.remove(offset);
1276 // Now, use it.
1277 // Returns a list with annotations that needs to be serialized in that
1278 // offset.
1279// List annotations = getAnnotationsForOffset(aDumpAnnotList,offset);
1280 List annotations = (List) annotsForOffset.get(offset);
1281 annotations = getAnnotationsForOffset(annotations,offset);
1282 // Attention: the annotation are serialized from left to right
1283// StringBuffer tmpBuff = new StringBuffer("");
1284 StringBuffer tmpBuff = new StringBuffer(
1285 DOC_SIZE_MULTIPLICATION_FACTOR*(this.getContent().size().intValue()));
1286 Stack stack = new Stack();
1287 // Iterate through all these annotations and serialize them
1288 Iterator it = annotations.iterator();
1289 while(it.hasNext()){
1290 Annotation a = (Annotation) it.next();
1291 it.remove();
1292 // Test if a Ends at offset
1293 if ( offset.equals(a.getEndNode().getOffset()) ){
1294 // Test if a Starts at offset
1295 if ( offset.equals(a.getStartNode().getOffset()) ){
1296 // Here, the annotation a Starts and Ends at the offset
1297 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1298 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1299
1300 // Assert: annotation a with start == end and isEmptyAndSpan
1301 tmpBuff.append(writeStartTag(a, includeFeatures));
1302 stack.push(a);
1303 }else{
1304 // Assert annotation a with start == end and an empty tag
1305 tmpBuff.append(writeEmptyTag(a));
1306 // The annotation is removed from dumped set
1307 aDumpAnnotList.remove(a);
1308 }// End if
1309 }else{
1310 // Here the annotation a Ends at the offset.
1311 // In this case empty the stack and write the end tag
1312 if (!stack.isEmpty()){
1313 while(!stack.isEmpty()){
1314 Annotation a1 = (Annotation)stack.pop();
1315 tmpBuff.append(writeEndTag(a1));
1316 }// End while
1317 }// End if
1318 tmpBuff.append(writeEndTag(a));
1319 }// End if
1320 }else{
1321 // The annotation a does NOT end at the offset. Let's see if it starts
1322 // at the offset
1323 if ( offset.equals(a.getStartNode().getOffset()) ){
1324 // The annotation a starts at the offset.
1325 // In this case empty the stack and write the end tag
1326 if (!stack.isEmpty()){
1327 while(!stack.isEmpty()){
1328 Annotation a1 = (Annotation)stack.pop();
1329 tmpBuff.append(writeEndTag(a1));
1330 }// End while
1331 }// End if
1332 tmpBuff.append(writeStartTag(a, includeFeatures));
1333 // The annotation is removed from dumped set
1334 aDumpAnnotList.remove(a);
1335 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1336 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1337 }// End while(it.hasNext()){
1338
1339 // In this case empty the stack and write the end tag
1340 if (!stack.isEmpty()){
1341 while(!stack.isEmpty()){
1342 Annotation a1 = (Annotation)stack.pop();
1343 tmpBuff.append(writeEndTag(a1));
1344 }// End while
1345 }// End if
1346
1347 // Before inserting tmpBuff into docContStrBuff we need to check
1348 // if there are chars to be replaced and if there are, they would be
1349 // replaced.
1350 if (!offsets2CharsMap.isEmpty()){
1351 Long offsChar = (Long) offsets2CharsMap.lastKey();
1352 while( !offsets2CharsMap.isEmpty() &&
1353 offsChar.intValue() >= offset.intValue()){
1354 // Replace the char at offsChar with its corresponding entity form
1355 // the entitiesMap.
1356 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1357 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1358 // Discard the offsChar after it was used.
1359 offsets2CharsMap.remove(offsChar);
1360 // Investigate next offsChar
1361 if (!offsets2CharsMap.isEmpty())
1362 offsChar = (Long) offsets2CharsMap.lastKey();
1363 }// End while
1364 }// End if
1365 // Insert tmpBuff to the location where it belongs in docContStrBuff
1366 docContStrBuff.insert(offset.intValue(),tmpBuff.toString());
1367 }// End while(!offsets.isEmpty())
1368 // Need to replace the entities in the remaining text, if there is any text
1369 // So, if there are any more items in offsets2CharsMap they need to be
1370 // replaced
1371 while (!offsets2CharsMap.isEmpty()){
1372 Long offsChar = (Long) offsets2CharsMap.lastKey();
1373 // Replace the char with its entity
1374 docContStrBuff.replace(offsChar.intValue(),offsChar.intValue()+1,
1375 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
1376 // remove the offset from the map
1377 offsets2CharsMap.remove(offsChar);
1378 }// End while
1379 return docContStrBuff.toString();
1380 }// saveAnnotationSetAsXml()
1381*/
1382
1383 /**
1384 * Return true only if the document has features for original content and
1385 * repositioning information.
1386 */
1387 private boolean hasOriginalContentFeatures() {
1388 FeatureMap features = getFeatures();
1389 boolean result = false;
1390
1391 result =
1392 (features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME) != null)
1393 &&
1394 (features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME)
1395 != null);
1396
1397 return result;
1398 } // hasOriginalContentFeatures
1399
1400 /** This method saves all the annotations from aDumpAnnotSet and combines
1401 * them with the original document content, if preserved as feature.
1402 * @param aSourceAnnotationSet is a GATE annotation set prepared to be used
1403 * on the raw text from document content. If aDumpAnnotSet is <b>null<b>
1404 * then an empty string will be returned.
1405 * @param includeFeatures is a boolean, which controls whether the annotation
1406 * features and gate ID are included or not.
1407 * @return The XML document obtained from raw text + the information from
1408 * the dump annotation set.
1409 */
1410 private String saveAnnotationSetAsXmlInOrig(Set aSourceAnnotationSet,
1411 boolean includeFeatures){
1412 StringBuffer docContStrBuff;
1413
1414 String origContent;
1415
1416 origContent =
1417 (String)features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
1418 if(origContent == null) {
1419 origContent = "";
1420 } // if
1421
1422 long originalContentSize = origContent.length();
1423
1424 RepositioningInfo repositioning = (RepositioningInfo)
1425 getFeatures().get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);
1426
1427 docContStrBuff = new StringBuffer(origContent);
1428 if (aSourceAnnotationSet == null) return docContStrBuff.toString();
1429
1430 StatusListener sListener = (StatusListener)
1431 gate.gui.MainFrame.getListeners().
1432 get("gate.event.StatusListener");
1433
1434 AnnotationSet originalMarkupsAnnotSet =
1435 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1436 // Create a dumping annotation set on the document. It will be used for
1437 // dumping annotations...
1438 AnnotationSet dumpingSet = new AnnotationSetImpl((Document) this);
1439 if(sListener != null)
1440 sListener.statusChanged("Constructing the dumping annotation set.");
1441 // Then take all the annotations from aSourceAnnotationSet and verify if
1442 // they can be inserted safely into the dumpingSet. Where not possible,
1443 // report.
1444 if (aSourceAnnotationSet != null){
1445 Iterator iter = aSourceAnnotationSet.iterator();
1446 Annotation currentAnnot;
1447 while (iter.hasNext()){
1448 currentAnnot = (Annotation) iter.next();
1449 if(insertsSafety(originalMarkupsAnnotSet, currentAnnot)
1450 && insertsSafety(dumpingSet, currentAnnot)){
1451 dumpingSet.add(currentAnnot);
1452 }else{
1453 Out.prln("Warning: Annotation with ID=" + currentAnnot.getId() +
1454 ", startOffset=" + currentAnnot.getStartNode().getOffset() +
1455 ", endOffset=" + currentAnnot.getEndNode().getOffset() +
1456 ", type=" + currentAnnot.getType()+ " was found to violate the" +
1457 " crossed over condition. It will be discarded");
1458 }// End if
1459 }// End while
1460 }// End if
1461
1462 // The dumpingSet is ready to be exported as XML
1463 // Here we go.
1464 if(sListener != null) sListener.statusChanged("Dumping annotations as XML");
1465
1466 ///////////////////////////////////////////
1467 // Construct a set of annot with all IDs in asc order.
1468 // All annotations that end at that offset swap their place in descending
1469 // order. For each node write all the tags from left to right.
1470
1471 // Construct the node set
1472 TreeSet offsets = new TreeSet();
1473 Iterator iter = aSourceAnnotationSet.iterator();
1474 while (iter.hasNext()){
1475 Annotation annot = (Annotation) iter.next();
1476 offsets.add(annot.getStartNode().getOffset());
1477 offsets.add(annot.getEndNode().getOffset());
1478 }// End while
1479
1480 // ofsets is sorted in ascending order.
1481 // Iterate this set in descending order and remove an offset at each
1482 // iteration
1483 while (!offsets.isEmpty()){
1484 Long offset = (Long)offsets.last();
1485 // Remove the offset from the set
1486 offsets.remove(offset);
1487 // Now, use it.
1488 // Returns a list with annotations that needs to be serialized in that
1489 // offset.
1490 List annotations = getAnnotationsForOffset(aSourceAnnotationSet,offset);
1491 // Attention: the annotation are serialized from left to right
1492 StringBuffer tmpBuff = new StringBuffer("");
1493 Stack stack = new Stack();
1494 // Iterate through all these annotations and serialize them
1495 Iterator it = annotations.iterator();
1496 Annotation a = null;
1497 while(it.hasNext()) {
1498 a = (Annotation) it.next();
1499 it.remove();
1500 // Test if a Ends at offset
1501 if ( offset.equals(a.getEndNode().getOffset()) ){
1502 // Test if a Starts at offset
1503 if ( offset.equals(a.getStartNode().getOffset()) ){
1504 // Here, the annotation a Starts and Ends at the offset
1505 if ( null != a.getFeatures().get("isEmptyAndSpan") &&
1506 "true".equals((String)a.getFeatures().get("isEmptyAndSpan"))){
1507
1508 // Assert: annotation a with start == end and isEmptyAndSpan
1509 tmpBuff.append(writeStartTag(a, includeFeatures, false));
1510 stack.push(a);
1511 }else{
1512 // Assert annotation a with start == end and an empty tag
1513 tmpBuff.append(writeEmptyTag(a, false));
1514 // The annotation is removed from dumped set
1515 aSourceAnnotationSet.remove(a);
1516 }// End if
1517 }else{
1518 // Here the annotation a Ends at the offset.
1519 // In this case empty the stack and write the end tag
1520 while(!stack.isEmpty()){
1521 Annotation a1 = (Annotation)stack.pop();
1522 tmpBuff.append(writeEndTag(a1));
1523 }// End while
1524 tmpBuff.append(writeEndTag(a));
1525 }// End if
1526 }else{
1527 // The annotation a does NOT end at the offset. Let's see if it starts
1528 // at the offset
1529 if ( offset.equals(a.getStartNode().getOffset()) ){
1530 // The annotation a starts at the offset.
1531 // In this case empty the stack and write the end tag
1532 while(!stack.isEmpty()){
1533 Annotation a1 = (Annotation)stack.pop();
1534 tmpBuff.append(writeEndTag(a1));
1535 }// End while
1536
1537 tmpBuff.append(writeStartTag(a, includeFeatures, false));
1538 // The annotation is removed from dumped set
1539 aSourceAnnotationSet.remove(a);
1540 }// End if ( offset.equals(a.getStartNode().getOffset()) )
1541 }// End if ( offset.equals(a.getEndNode().getOffset()) )
1542 }// End while(it.hasNext()){
1543
1544 // In this case empty the stack and write the end tag
1545 while(!stack.isEmpty()){
1546 Annotation a1 = (Annotation)stack.pop();
1547 tmpBuff.append(writeEndTag(a1));
1548 }// End while
1549
1550 long originalPosition = -1;
1551 boolean backPositioning =
1552 a != null && offset.equals(a.getEndNode().getOffset());
1553 if ( backPositioning ) {
1554 // end of the annotation correction
1555 originalPosition =
1556 repositioning.getOriginalPos(offset.intValue(), true);
1557 } // if
1558
1559 if(originalPosition == -1) {
1560 originalPosition = repositioning.getOriginalPos(offset.intValue());
1561 } // if
1562
1563 // Insert tmpBuff to the location where it belongs in docContStrBuff
1564 if(originalPosition != -1 && originalPosition <= originalContentSize ) {
1565 docContStrBuff.insert((int) originalPosition, tmpBuff.toString());
1566 }
1567 else {
1568 Out.prln("Error in the repositioning. The offset ("+offset.intValue()
1569 +") could not be positioned in the original document. \n"
1570 +"Calculated position is: "+originalPosition
1571 +" placed back: "+backPositioning);
1572 } // if
1573
1574 }// End while(!offsets.isEmpty())
1575 if (theRootAnnotation != null)
1576 docContStrBuff.append(writeEndTag(theRootAnnotation));
1577 return docContStrBuff.toString();
1578 } // saveAnnotationSetAsXmlInOrig()
1579
1580 /** This method returns a list with annotations ordered that way that
1581 * they can be serialized from left to right, at the offset. If one of the
1582 * params is null then an empty list will be returned.
1583 * @param aDumpAnnotSet is a set containing all annotations that will be
1584 * dumped.
1585 * @param offset represent the offset at witch the annotation must start
1586 * AND/OR end.
1587 * @return a list with those annotations that need to be serialized.
1588 */
1589 private List getAnnotationsForOffset(Set aDumpAnnotSet, Long offset){
1590 List annotationList = new LinkedList();
1591 if (aDumpAnnotSet == null || offset == null) return annotationList;
1592 Set annotThatStartAtOffset = new TreeSet(
1593 new AnnotationComparator(ORDER_ON_END_OFFSET,DESC));
1594 Set annotThatEndAtOffset = new TreeSet(
1595 new AnnotationComparator(ORDER_ON_START_OFFSET,DESC));
1596 Set annotThatStartAndEndAtOffset = new TreeSet(
1597 new AnnotationComparator(ORDER_ON_ANNOT_ID,ASC));
1598
1599 // Fill these tree lists with annotation tat start, end or start and
1600 // end at the offset.
1601 Iterator iter = aDumpAnnotSet.iterator();
1602 while(iter.hasNext()){
1603 Annotation ann = (Annotation) iter.next();
1604 if (offset.equals(ann.getStartNode().getOffset())){
1605 if (offset.equals(ann.getEndNode().getOffset()))
1606 annotThatStartAndEndAtOffset.add(ann);
1607 else
1608 annotThatStartAtOffset.add(ann);
1609 }else{
1610 if (offset.equals(ann.getEndNode().getOffset()))
1611 annotThatEndAtOffset.add(ann);
1612 }// End if
1613 }// End while
1614 annotationList.addAll(annotThatEndAtOffset);
1615 annotThatEndAtOffset = null;
1616 annotationList.addAll(annotThatStartAtOffset);
1617 annotThatStartAtOffset = null;
1618 iter = annotThatStartAndEndAtOffset.iterator();
1619 while(iter.hasNext()){
1620 Annotation ann = (Annotation) iter.next();
1621 Iterator it = annotationList.iterator();
1622 boolean breaked = false;
1623 while (it.hasNext()){
1624 Annotation annFromList = (Annotation) it.next();
1625 if (annFromList.getId().intValue() > ann.getId().intValue()){
1626 annotationList.add(annotationList.indexOf(annFromList),ann);
1627 breaked = true;
1628 break;
1629 }// End if
1630 }// End while
1631 if (!breaked)
1632 annotationList.add(ann);
1633 iter.remove();
1634 }// End while
1635 return annotationList;
1636 }// getAnnotationsForOffset()
1637
1638 private List getAnnotationsForOffset(List aDumpAnnotList, Long offset){
1639 List annotationList = new ArrayList();
1640 if (aDumpAnnotList == null || offset == null) return annotationList;
1641 Set annotThatStartAtOffset;
1642 Set annotThatEndAtOffset;
1643 Set annotThatStartAndEndAtOffset;
1644 annotThatStartAtOffset = new TreeSet(
1645 new AnnotationComparator(ORDER_ON_END_OFFSET, DESC));
1646 annotThatEndAtOffset = new TreeSet(
1647 new AnnotationComparator(ORDER_ON_START_OFFSET, DESC));
1648 annotThatStartAndEndAtOffset = new TreeSet(
1649 new AnnotationComparator(ORDER_ON_ANNOT_ID, ASC));
1650
1651 // Fill these tree lists with annotation tat start, end or start and
1652 // end at the offset.
1653 Iterator iter = aDumpAnnotList.iterator();
1654 while(iter.hasNext()){
1655 Annotation ann = (Annotation) iter.next();
1656 if (offset.equals(ann.getStartNode().getOffset())){
1657 if (offset.equals(ann.getEndNode().getOffset()))
1658 annotThatStartAndEndAtOffset.add(ann);
1659 else
1660 annotThatStartAtOffset.add(ann);
1661 }else{
1662 if (offset.equals(ann.getEndNode().getOffset()))
1663 annotThatEndAtOffset.add(ann);
1664 }// End if
1665 }// End while
1666
1667 annotationList.addAll(annotThatEndAtOffset);
1668 annotationList.addAll(annotThatStartAtOffset);
1669 annotThatEndAtOffset = null;
1670 annotThatStartAtOffset = null;
1671
1672 iter = annotThatStartAndEndAtOffset.iterator();
1673 while(iter.hasNext()){
1674 Annotation ann = (Annotation) iter.next();
1675 Iterator it = annotationList.iterator();
1676 boolean breaked = false;
1677 while (it.hasNext()){
1678 Annotation annFromList = (Annotation) it.next();
1679 if (annFromList.getId().intValue() > ann.getId().intValue()){
1680 annotationList.add(annotationList.indexOf(annFromList),ann);
1681 breaked = true;
1682 break;
1683 }// End if
1684 }// End while
1685 if (!breaked)
1686 annotationList.add(ann);
1687 iter.remove();
1688 }// End while
1689 return annotationList;
1690 }// getAnnotationsForOffset()
1691
1692 private String writeStartTag(Annotation annot, boolean includeFeatures){
1693 return writeStartTag(annot, includeFeatures, true);
1694 } // writeStartTag
1695
1696 /** Returns a string representing a start tag based on the input annot*/
1697 private String writeStartTag(Annotation annot, boolean includeFeatures,
1698 boolean includeNamespace){
1699 AnnotationSet originalMarkupsAnnotSet =
1700 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1701
1702 StringBuffer strBuff = new StringBuffer("");
1703 if (annot == null) return strBuff.toString();
1704// if (!addGatePreserveFormatTag && isRootTag){
1705 if (theRootAnnotation != null && annot.getId().equals(theRootAnnotation.getId())){
1706 //the features are included either if desired or if that's an annotation
1707 //from the original markup of the document. We don't want for example to
1708 //spoil all links in an HTML file!
1709 if (includeFeatures) {
1710 strBuff.append("<");
1711 strBuff.append(annot.getType());
1712 strBuff.append(" ");
1713 if(includeNamespace) {
1714 strBuff.append(" xmlns:gate=\"http://www.gate.ac.uk\"");
1715 strBuff.append(" gate:");
1716 }
1717 strBuff.append("gateId=\"");
1718 strBuff.append(annot.getId());
1719 strBuff.append("\"");
1720 strBuff.append(" ");
1721 if(includeNamespace) {
1722 strBuff.append("gate:");
1723 }
1724 strBuff.append("annotMaxId=\"");
1725 strBuff.append(nextAnnotationId);
1726 strBuff.append("\"");
1727 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1728 strBuff.append(">");
1729 }
1730 else if (originalMarkupsAnnotSet.contains(annot)) {
1731 strBuff.append("<");
1732 strBuff.append(annot.getType());
1733 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1734 strBuff.append(">");
1735 }
1736 else {
1737 strBuff.append("<");
1738 strBuff.append(annot.getType());
1739 strBuff.append(">");
1740 }
1741
1742 }else{
1743 //the features are included either if desired or if that's an annotation
1744 //from the original markup of the document. We don't want for example to
1745 //spoil all links in an HTML file!
1746 if (includeFeatures) {
1747 strBuff.append("<");
1748 strBuff.append(annot.getType());
1749 strBuff.append(" ");
1750 if(includeNamespace) {
1751 strBuff.append("gate:");
1752 } // if includeNamespaces
1753 strBuff.append("gateId=\"");
1754 strBuff.append(annot.getId());
1755 strBuff.append("\"");
1756 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1757 strBuff.append(">");
1758 }
1759 else if (originalMarkupsAnnotSet.contains(annot)) {
1760 strBuff.append("<");
1761 strBuff.append(annot.getType());
1762 strBuff.append(writeFeatures(annot.getFeatures(), includeNamespace));
1763 strBuff.append(">");
1764 }
1765 else {
1766 strBuff.append("<");
1767 strBuff.append(annot.getType());
1768 strBuff.append(">");
1769 }
1770 }// End if
1771 return strBuff.toString();
1772 }// writeStartTag()
1773
1774 /**
1775 * Identifies the root annotations inside an annotation set.
1776 * The root annotation is the one that starts at offset 0, and has the
1777 * greatest span. If there are more than one with this function, then the
1778 * annotation with the smalled ID wil be selected as root.
1779 * If none is identified it will return null.
1780 * @param anAnnotationSet The annotation set possibly containing
1781 * the root annotation.
1782 * @return The root annotation or null is it fails
1783 */
1784 private Annotation identifyTheRootAnnotation(AnnotationSet anAnnotationSet){
1785 if (anAnnotationSet == null) return null;
1786 // If the starting node of this annotation is not null, then the annotation
1787 // set will not have a root annotation.
1788 Node startNode = anAnnotationSet.firstNode();
1789 Node endNode = anAnnotationSet.lastNode();
1790 // This is placed here just to speed things up. The alghorithm bellow can
1791 // can identity the annotation that span over the entire set and with the
1792 // smallest ID. However the root annotation will have to have the start
1793 // offset equal to 0.
1794 if (startNode.getOffset().longValue() != 0) return null;
1795 // Go anf find the annotation.
1796 Annotation theRootAnnotation = null;
1797 // Check if there are annotations starting at offset 0. If there are, then
1798 // check all of them to see which one has the greatest span. Basically its
1799 // END offset should be the bigest offset from the input annotation set.
1800 long start = startNode.getOffset().longValue();
1801 long end = endNode.getOffset().longValue();
1802 for(Iterator it = anAnnotationSet.iterator(); it.hasNext();){
1803 Annotation currentAnnot = (Annotation) it.next();
1804 // If the currentAnnot has both its Start and End equals to the Start and
1805 // end of the AnnotationSet then check to see if its ID is the smallest.
1806 if (
1807 (start == currentAnnot.getStartNode().getOffset().longValue()) &&
1808 (end == currentAnnot.getEndNode().getOffset().longValue())
1809 ){
1810 // The currentAnnotation has is a potencial root one.
1811 if (theRootAnnotation == null)
1812 theRootAnnotation = currentAnnot;
1813 else{
1814 // If its ID is greater that the currentAnnot then update the root
1815 if ( theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1816 theRootAnnotation = currentAnnot;
1817 }// End if
1818 }// End if
1819 }// End for
1820 return theRootAnnotation;
1821 }// End identifyTheRootAnnotation()
1822
1823 private Annotation identifyTheRootAnnotation(List anAnnotationList){
1824 if (anAnnotationList == null || anAnnotationList.isEmpty()) return null;
1825 // If the first annotation in the list (which is sorted by start offset)
1826 //does not have an offset = 0, then there's no root tag.
1827 if(((Annotation)anAnnotationList.get(0)).
1828 getStartNode().getOffset().longValue() > 0) return null;
1829
1830 // If there's a single annotation and it starts at the start (which we
1831 // already know it does), make sure it ends at the end.
1832 if (anAnnotationList.size() == 1){
1833 Annotation onlyAnn = (Annotation) anAnnotationList.get(0);
1834 if ( onlyAnn.getEndNode().getOffset().equals( content.size() ) ) return onlyAnn;
1835 return null;
1836 }
1837
1838 //find the limits
1839 long start = 0; //we know this already
1840 long end = 0; //end = 0 will be improved by the next loop
1841 for(int i = 0; i < anAnnotationList.size(); i++){
1842 Annotation anAnnotation = (Annotation)anAnnotationList.get(i);
1843 long localEnd = anAnnotation.getEndNode().getOffset().longValue();
1844 if(localEnd > end) end = localEnd;
1845 }
1846
1847 // Go and find the annotation.
1848 //look at all annotations that start at 0 and end at end
1849 //if there are several, choose the one with the smallest ID
1850 Annotation theRootAnnotation = null;
1851 for(int i = 0; i < anAnnotationList.size(); i++){
1852 Annotation currentAnnot = (Annotation) anAnnotationList.get(i);
1853 long localStart = currentAnnot.getStartNode().getOffset().longValue();
1854 long localEnd = currentAnnot.getEndNode().getOffset().longValue();
1855 // If the currentAnnot has both its Start and End equals to the Start and
1856 // end of the AnnotationSet then check to see if its ID is the smallest.
1857 if (
1858 (start == localStart) && (end == localEnd)){
1859 // The currentAnnotation has is a potential root one.
1860 if (theRootAnnotation == null) theRootAnnotation = currentAnnot;
1861 else{
1862 // If root's ID is greater that the currentAnnot then update the root
1863 if (theRootAnnotation.getId().intValue() > currentAnnot.getId().intValue())
1864 theRootAnnotation = currentAnnot;
1865 }// End if
1866 }// End if
1867 }// End for
1868 return theRootAnnotation;
1869 }// End identifyTheRootAnnotation()
1870
1871
1872 /** This method takes aScanString and searches for those chars from
1873 * entitiesMap that appear in the string. A tree map(offset2Char) is filled
1874 * using as key the offsets where those Chars appear and the Char.
1875 * If one of the params is null the method simply returns.
1876 */
1877 private void buildEntityMapFromString(String aScanString, TreeMap aMapToFill){
1878 if (aScanString == null || aMapToFill == null) return;
1879 if (entitiesMap == null || entitiesMap.isEmpty()){
1880 Err.prln("WARNING: Entities map was not initialised !");
1881 return;
1882 }// End if
1883 // Fill the Map with the offsets of the special chars
1884 Iterator entitiesMapIterator = entitiesMap.keySet().iterator();
1885 Character c;
1886 int fromIndex;
1887 while(entitiesMapIterator.hasNext()){
1888 c = (Character) entitiesMapIterator.next();
1889 fromIndex = 0;
1890 while (-1 != fromIndex){
1891 fromIndex = aScanString.indexOf(c.charValue(),fromIndex);
1892 if (-1 != fromIndex){
1893 aMapToFill.put(new Long(fromIndex),c);
1894 fromIndex ++;
1895 }// End if
1896 }// End while
1897 }// End while
1898 }//buildEntityMapFromString();
1899
1900 private String writeEmptyTag(Annotation annot){
1901 return writeEmptyTag(annot, true);
1902 } // writeEmptyTag
1903
1904 /** Returns a string representing an empty tag based on the input annot*/
1905 private String writeEmptyTag(Annotation annot, boolean includeNamespace){
1906 StringBuffer strBuff = new StringBuffer("");
1907 if (annot == null) return strBuff.toString();
1908
1909 strBuff.append("<");
1910 strBuff.append(annot.getType());
1911
1912 AnnotationSet originalMarkupsAnnotSet =
1913 this.getAnnotations(GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
1914 if (! originalMarkupsAnnotSet.contains(annot)) {
1915 strBuff.append(" gateId=\"");
1916 strBuff.append(annot.getId());
1917 strBuff.append("\"");
1918 }
1919 strBuff.append(writeFeatures(annot.getFeatures(),includeNamespace));
1920 strBuff.append("/>");
1921
1922 return strBuff.toString();
1923 }// writeEmptyTag()
1924
1925 /** Returns a string representing an end tag based on the input annot*/
1926 private String writeEndTag(Annotation annot){
1927 StringBuffer strBuff = new StringBuffer("");
1928 if (annot == null) return strBuff.toString();
1929/*
1930 if (annot.getType().indexOf(" ") != -1)
1931 Out.prln("Warning: Truncating end tag to first word for annot type \""
1932 +annot.getType()+ "\". ");
1933*/
1934 strBuff.append("</"+annot.getType()+">");
1935
1936 return strBuff.toString();
1937 }// writeEndTag()
1938
1939 /** Returns a string representing a FeatureMap serialized as XML attributes*/
1940 private String writeFeatures(FeatureMap feat, boolean includeNamespace){
1941 StringBuffer strBuff = new StringBuffer("");
1942 if (feat == null) return strBuff.toString();
1943 Iterator it = feat.keySet().iterator();
1944 while (it.hasNext()){
1945 Object key = it.next();
1946 Object value = feat.get(key);
1947 if ( (key != null) && (value != null) ){
1948 // Eliminate a feature inserted at reading time and which help to
1949 // take some decissions at saving time
1950 if ("isEmptyAndSpan".equals(key.toString()))
1951 continue;
1952 if( !(String.class.isAssignableFrom(key.getClass()) ||
1953 Number.class.isAssignableFrom(key.getClass()))){
1954
1955 Out.prln("Warning:Found a feature NAME("+key+") that doesn't came"+
1956 " from String or Number.(feature discarded)");
1957 continue;
1958 }// End if
1959 if ( !(String.class.isAssignableFrom(value.getClass()) ||
1960 Number.class.isAssignableFrom(value.getClass()) ||
1961 java.util.Collection.class.isAssignableFrom(value.getClass()))){
1962
1963 Out.prln("Warning:Found a feature VALUE("+value+") that doesn't came"+
1964 " from String, Number or Collection.(feature discarded)");
1965 continue;
1966 }// End if
1967 if ("matches".equals(key)) {
1968 strBuff.append(" ");
1969 if(includeNamespace) {
1970 strBuff.append("gate:");
1971 }
1972// strBuff.append(key);
1973 // replace non XML chars in attribute name
1974 strBuff.append(
1975 filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1976 strBuff.append("=\"");
1977 }
1978 else {
1979 strBuff.append(" ");
1980// strBuff.append(key);
1981 // replace non XML chars in attribute name
1982 strBuff.append(
1983 filterNonXmlChars(replaceCharsWithEntities(key.toString())));
1984 strBuff.append("=\"");
1985 }
1986 if (java.util.Collection.class.isAssignableFrom(value.getClass())){
1987 Iterator valueIter = ((Collection)value).iterator();
1988 while(valueIter.hasNext()){
1989 Object item = valueIter.next();
1990 if (!(String.class.isAssignableFrom(item.getClass()) ||
1991 Number.class.isAssignableFrom(item.getClass())))
1992 continue;
1993// strBuff.append(item);
1994 // replace non XML chars in collection item
1995 strBuff.append(
1996 filterNonXmlChars(replaceCharsWithEntities(item.toString())));
1997 strBuff.append(";");
1998 }// End while
1999 if (strBuff.charAt(strBuff.length()-1) == ';')
2000 strBuff.deleteCharAt(strBuff.length()-1);
2001 }else{
2002// strBuff.append(value);
2003 // replace non XML chars in attribute value
2004 strBuff.append(
2005 filterNonXmlChars(replaceCharsWithEntities(value.toString())));
2006 }// End if
2007 strBuff.append("\"");
2008 }// End if
2009 }// End while
2010 return strBuff.toString();
2011 }// writeFeatures()
2012
2013 /** Returns a GateXml document that is a custom XML format for wich there is
2014 * a reader inside GATE called gate.xml.GateFormatXmlHandler.
2015 * What it does is to serialize a GATE document in an XML format.
2016 * @return a string representing a Gate Xml document.
2017 */
2018 public String toXml(){
2019 // Initialize the xmlContent with 3 time the size of the current document.
2020 // This is because of the tags size. This measure is made to increase the
2021 // performance of StringBuffer.
2022 StringBuffer xmlContent = new StringBuffer(
2023 DOC_SIZE_MULTIPLICATION_FACTOR*(getContent().size().intValue()));
2024 // Add xml header
2025 xmlContent.append("<?xml version=\"1.0\" encoding=\"");
2026 xmlContent.append(getEncoding());
2027 xmlContent.append("\" ?>");
2028 xmlContent.append(Strings.getNl());
2029
2030 // Add the root element
2031 xmlContent.append("<GateDocument>\n");
2032 xmlContent.append("<!-- The document's features-->\n\n");
2033 xmlContent.append("<GateDocumentFeatures>\n");
2034
2035 xmlContent.append(featuresToXml(this.getFeatures()));
2036 xmlContent.append("</GateDocumentFeatures>\n");
2037 xmlContent.append("<!-- The document content area with serialized"+
2038 " nodes -->\n\n");
2039 // Add plain text element
2040 xmlContent.append("<TextWithNodes>");
2041 xmlContent.append(textWithNodes(this.getContent().toString()));
2042 xmlContent.append("</TextWithNodes>\n");
2043 // Serialize as XML all document's annotation sets
2044 // Serialize the default AnnotationSet
2045 StatusListener sListener = (StatusListener)
2046 gate.gui.MainFrame.getListeners().
2047 get("gate.event.StatusListener");
2048 if(sListener != null)
2049 sListener.statusChanged("Saving the default annotation set ");
2050 xmlContent.append("<!-- The default annotation set -->\n\n");
2051 xmlContent.append(annotationSetToXml(this.getAnnotations()));
2052 // Serialize all others AnnotationSets
2053 // namedAnnotSets is a Map containing all other named Annotation Sets.
2054 if (namedAnnotSets != null){
2055 Iterator iter = namedAnnotSets.values().iterator();
2056 while(iter.hasNext()){
2057 AnnotationSet annotSet = (AnnotationSet) iter.next();
2058 xmlContent.append("<!-- Named annotation set -->\n\n");
2059 // Serialize it as XML
2060 if(sListener != null) sListener.statusChanged("Saving " +
2061 annotSet.getName()+
2062 " annotation set ");
2063 xmlContent.append(annotationSetToXml(annotSet));
2064 }// End while
2065 }// End if
2066 // Add the end of GateDocument
2067 xmlContent.append("</GateDocument>");
2068 if(sListener != null) sListener.statusChanged("Done !");
2069 // return the XmlGateDocument
2070 return xmlContent.toString();
2071 }// toXml
2072
2073 /** This method filters any non XML char
2074 * see: http://www.w3c.org/TR/2000/REC-xml-20001006#charsets
2075 * All non XML chars will be replaced with 0x20 (space char) This assures
2076 * that the next time the document is loaded there won't be any problems.
2077 * @param aStrBuffer represents the input String that is filtred. If the
2078 * aStrBuffer is null then an empty string will be returend
2079 * @return the "purified" StringBuffer version of the aStrBuffer
2080 */
2081 private StringBuffer filterNonXmlChars(StringBuffer aStrBuffer){
2082 if (aStrBuffer == null) return new StringBuffer("");
2083// String space = new String(" ");
2084 char space = ' ';
2085 for (int i=aStrBuffer.length()-1;i>=0; i--){
2086 if (!isXmlChar(aStrBuffer.charAt(i)))
2087 aStrBuffer.setCharAt(i, space);
2088 }// End for
2089 return aStrBuffer;
2090 }// filterNonXmlChars()
2091
2092 /** This method decide if a char is a valid XML one or not
2093 * @param ch the char to be tested
2094 * @return true if is a valid XML char and fals if is not.
2095 */
2096 public static boolean isXmlChar(char ch){
2097 if (ch == 0x9 || ch == 0xA || ch ==0xD) return true;
2098 if ((0x20 <= ch) && (ch <= 0xD7FF)) return true;
2099 if ((0xE000 <= ch) && (ch <= 0xFFFD)) return true;
2100 if ((0x10000 <= ch) && (ch <= 0x10FFFF)) return true;
2101 return false;
2102 }// End isXmlChar()
2103
2104 /** This method saves a FeatureMap as XML elements.
2105 * @param aFeatureMap the feature map that has to be saved as XML.
2106 * @return a String like this: <Feature><Name>...</Name>
2107 * <Value>...</Value></Feature><Feature>...</Feature>
2108 */
2109 private String featuresToXml(FeatureMap aFeatureMap){
2110 StringBuffer str = new StringBuffer("");
2111
2112 if (aFeatureMap == null) return str.toString();
2113
2114 Set keySet = aFeatureMap.keySet();
2115 Iterator keyIterator = keySet.iterator();
2116 while(keyIterator.hasNext()){
2117 Object key = keyIterator.next();
2118 Object value = aFeatureMap.get(key);
2119 if ((key != null) && (value != null)){
2120 String keyClassName = null;
2121 String keyItemClassName = null;
2122 String valueClassName = null;
2123 String valueItemClassName = null;
2124 String key2String = key.toString();
2125 String value2String = value.toString();
2126
2127 Object item = null;
2128 // Test key if it is String, Number or Collection
2129 if (key instanceof java.lang.String ||
2130 key instanceof java.lang.Number ||
2131 key instanceof java.util.Collection)
2132 keyClassName = key.getClass().getName();
2133
2134 // Test value if it is String, Number or Collection
2135 if (value instanceof java.lang.String ||
2136 value instanceof java.lang.Number ||
2137 value instanceof java.util.Collection)
2138 valueClassName = value.getClass().getName();
2139
2140 // Features and values that are not Strings, Numbers or collections
2141 // will be discarded.
2142 if (keyClassName == null || valueClassName == null) continue;
2143
2144 // If key is collection serialize the colection in a specific format
2145 if (key instanceof java.util.Collection){
2146 StringBuffer keyStrBuff = new StringBuffer("");
2147 Iterator iter = ((Collection) key).iterator();
2148 if (iter.hasNext()){
2149 item = iter.next();
2150 if (item instanceof java.lang.Number)
2151 keyItemClassName = item.getClass().getName();
2152 else
2153 keyItemClassName = String.class.getName();
2154 keyStrBuff.append(item.toString());
2155 }// End if
2156 while (iter.hasNext()){
2157 item = iter.next();
2158 keyStrBuff.append(";" + item.toString());
2159 }// End while
2160 key2String = keyStrBuff.toString();
2161 }// End if
2162 // If key is collection serialize the colection in a specific format
2163 if (value instanceof java.util.Collection){
2164 StringBuffer valueStrBuff = new StringBuffer("");
2165 Iterator iter = ((Collection) value).iterator();
2166 if (iter.hasNext()){
2167 item = iter.next();
2168 if (item instanceof java.lang.Number)
2169 valueItemClassName = item.getClass().getName();
2170 else
2171 valueItemClassName = String.class.getName();
2172 valueStrBuff.append(item.toString());
2173 }// End if
2174 while (iter.hasNext()){
2175 item = iter.next();
2176 valueStrBuff.append(";" + item.toString());
2177 }// End while
2178 value2String = valueStrBuff.toString();
2179 }// End if
2180 str.append("<Feature>\n <Name");
2181 if (keyClassName != null)
2182 str.append(" className=\""+keyClassName+"\"");
2183 if (keyItemClassName != null)
2184 str.append(" itemClassName=\""+keyItemClassName+"\"");
2185 str.append(">");
2186 str.append(filterNonXmlChars(replaceCharsWithEntities(key2String)));
2187 str.append("</Name>\n <Value");
2188 if (valueClassName != null)
2189 str.append(" className=\"" + valueClassName + "\"");
2190 if (valueItemClassName != null)
2191 str.append(" itemClassName=\"" + valueItemClassName + "\"");
2192 str.append(">");
2193 str.append(filterNonXmlChars(replaceCharsWithEntities(value2String)));
2194 str.append("</Value>\n</Feature>\n");
2195 }// End if
2196 }// end While
2197 return str.toString();
2198 }//featuresToXml
2199
2200 /** This method replace all chars that appears in the anInputString and also
2201 * that are in the entitiesMap with their corresponding entity
2202 * @param anInputString the string analyzed. If it is null then returns the
2203 * empty string
2204 * @return a string representing the input string with chars replaced with
2205 * entities
2206 */
2207 private StringBuffer replaceCharsWithEntities(String anInputString){
2208 if (anInputString == null) return new StringBuffer("");
2209 StringBuffer strBuff = new StringBuffer(anInputString);
2210 for (int i=strBuff.length()-1; i>=0; i--){
2211 Character ch = new Character(strBuff.charAt(i));
2212 if (entitiesMap.keySet().contains(ch)){
2213 strBuff.replace(i,i+1,(String) entitiesMap.get(ch));
2214 }// End if
2215 }// End for
2216 return strBuff;
2217 }//replaceCharsWithEntities()
2218
2219 /** This method creates Node XML elements and inserts them at the
2220 * corresponding offset inside the text. Nodes are created from the default
2221 * annotation set, as well as from all existing named annotation sets.
2222 * @param aText The text representing the document's plain text.
2223 * @return The text with empty <Node id="NodeId"/> elements.
2224 */
2225 private String textWithNodes(String aText){
2226 if (aText == null) return new String("");
2227 StringBuffer textWithNodes = filterNonXmlChars(new StringBuffer(aText));
2228
2229 // Construct a map from offsets to Chars
2230 TreeMap offsets2CharsMap = new TreeMap();
2231 if (aText.length()!= 0){
2232 // Fill the offsets2CharsMap with all the indices where special chars appear
2233 buildEntityMapFromString(aText,offsets2CharsMap);
2234 }//End if
2235 // Construct the offsetsSet for all nodes belonging to this document
2236 TreeSet offsetsSet = new TreeSet();
2237 Iterator annotSetIter = this.getAnnotations().iterator();
2238 while (annotSetIter.hasNext()){
2239 Annotation annot = (Annotation) annotSetIter.next();
2240 offsetsSet.add(annot.getStartNode().getOffset());
2241 offsetsSet.add(annot.getEndNode().getOffset());
2242 }// end While
2243 // Get the nodes from all other named annotation sets.
2244 if (namedAnnotSets != null){
2245 Iterator iter = namedAnnotSets.values().iterator();
2246 while(iter.hasNext()){
2247 AnnotationSet annotSet = (AnnotationSet) iter.next();
2248 Iterator iter2 = annotSet.iterator();
2249 while(iter2.hasNext()){
2250 Annotation annotTmp = (Annotation) iter2.next();
2251 offsetsSet.add(annotTmp.getStartNode().getOffset());
2252 offsetsSet.add(annotTmp.getEndNode().getOffset());
2253 }// End while
2254 }// End while
2255 }// End if
2256 // offsetsSet is ordered in ascending order because the structure
2257 // is a TreeSet
2258
2259 if (offsetsSet.isEmpty()){
2260 return replaceCharsWithEntities(aText).toString();
2261 }// End if
2262 // Iterate through all nodes from anAnnotSet and transform them to
2263 // XML elements. Then insert those elements at the node's offset into the
2264 // textWithNodes .
2265 while (!offsetsSet.isEmpty()){
2266 Long offset = (Long) offsetsSet.last();
2267 // Eliminate the offset from the list in order to create more memory space
2268 offsetsSet.remove(offset);
2269 // Use offset
2270 int offsetValue = offset.intValue();
2271 String strNode = "<Node id=\"" + offsetValue + "\"/>";
2272 // Before inserting this string into the textWithNodes, check to see if
2273 // there are any chars to be replaced with their corresponding entities
2274 if (!offsets2CharsMap.isEmpty()){
2275 Long offsChar = (Long) offsets2CharsMap.lastKey();
2276 while( !offsets2CharsMap.isEmpty() &&
2277 offsChar.intValue() >= offset.intValue()){
2278 // Replace the char at offsChar with its corresponding entity form
2279 // the entitiesMap.
2280 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2281 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2282 // Discard the offsChar after it was used because this offset will
2283 // never appear again
2284 offsets2CharsMap.remove(offsChar);
2285 // Investigate next offsChar
2286 if (!offsets2CharsMap.isEmpty())
2287 offsChar = (Long) offsets2CharsMap.lastKey();
2288 }// End while
2289 }// End if
2290 // Now it is safe to insert the node
2291 textWithNodes.insert(offsetValue,strNode);
2292 }// end while
2293 // Need to replace the entities in the remaining text, if there is any text
2294 // So, if there are any more items in offsets2CharsMap they need to be
2295 // replaced
2296 while (!offsets2CharsMap.isEmpty()){
2297 Long offsChar = (Long) offsets2CharsMap.lastKey();
2298 // Replace the char with its entity
2299 textWithNodes.replace(offsChar.intValue(),offsChar.intValue()+1,
2300 (String)entitiesMap.get((Character)offsets2CharsMap.get(offsChar)));
2301 // remove the offset from the map
2302 offsets2CharsMap.remove(offsChar);
2303 }// End while
2304 return textWithNodes.toString();
2305 }//textWithNodes()
2306
2307 /** This method saves an AnnotationSet as XML.
2308 * @param anAnnotationSet The annotation set that has to be saved as XML.
2309 * @return a String like this: <AnnotationSet> <Annotation>....
2310 * </AnnotationSet>
2311 */
2312 private String annotationSetToXml(AnnotationSet anAnnotationSet){
2313 StringBuffer str = new StringBuffer("");
2314
2315 if (anAnnotationSet == null){
2316 str.append("<AnnotationSet>\n");
2317 str.append("</AnnotationSet>\n");
2318 return str.toString();
2319 }// End if
2320 if (anAnnotationSet.getName() == null)
2321 str.append("<AnnotationSet>\n");
2322 else str.append("<AnnotationSet Name=\"" + anAnnotationSet.getName()+
2323 "\" >\n");
2324 // Iterate through AnnotationSet and save each Annotation as XML
2325 Iterator iterator = anAnnotationSet.iterator();
2326 while (iterator.hasNext()){
2327 Annotation annot = (Annotation) iterator.next();
2328 str.append("<Annotation " + "Id=\"" + annot.getId() +
2329 "\" Type=\"" + annot.getType() +
2330 "\" StartNode=\"" + annot.getStartNode().getOffset() +
2331 "\" EndNode=\"" + annot.getEndNode().getOffset() + "\">\n");
2332 str.append(featuresToXml(annot.getFeatures()));
2333 str.append("</Annotation>\n");
2334 }// End while
2335
2336 str.append("</AnnotationSet>\n");
2337 return str.toString();
2338 }// annotationSetToXml
2339
2340 /** Returns a map with the named annotation sets. It returns <code>null</code>
2341 * if no named annotaton set exists. */
2342 public Map getNamedAnnotationSets() {
2343 return namedAnnotSets;
2344 } // getNamedAnnotationSets
2345
2346 /** Returns a set of all named annotation sets in existence
2347 */
2348 public Set getAnnotationSetNames(){
2349 return namedAnnotSets.keySet();
2350 }
2351
2352
2353 /**
2354 * Removes one of the named annotation sets.
2355 * Note that the default annotation set cannot be removed.
2356 * @param name the name of the annotation set to be removed
2357 */
2358 public void removeAnnotationSet(String name){
2359 Object removed = namedAnnotSets.remove(name);
2360 if(removed != null){
2361 fireAnnotationSetRemoved(
2362 new DocumentEvent(this, DocumentEvent.ANNOTATION_SET_REMOVED, name));
2363 }
2364 }
2365
2366 /** Propagate edit changes to the document content and annotations. */
2367 public void edit(Long start, Long end, DocumentContent replacement)
2368 throws InvalidOffsetException
2369 {
2370 if(! isValidOffsetRange(start, end))
2371 throw new InvalidOffsetException();
2372
2373 if(content != null)
2374 ((DocumentContentImpl) content).edit(start, end, replacement);
2375
2376 if(defaultAnnots != null)
2377 ((AnnotationSetImpl) defaultAnnots).edit(start, end, replacement);
2378
2379 if(namedAnnotSets != null) {
2380 Iterator iter = namedAnnotSets.values().iterator();
2381 while(iter.hasNext())
2382 ((AnnotationSetImpl) iter.next()).edit(start, end, replacement);
2383 }
2384 //let the listeners know
2385 fireContentEdited(new DocumentEvent(this, DocumentEvent.CONTENT_EDITED,
2386 start, end));
2387 } // edit(start,end,replacement)
2388
2389 /** Check that an offset is valid, i.e. it is non-null, greater than
2390 * or equal to 0 and less than the size of the document content.
2391 */
2392 public boolean isValidOffset(Long offset) {
2393 if(offset == null)
2394 return false;
2395
2396 long o = offset.longValue();
2397 if(o > getContent().size().longValue() || o < 0)
2398 return false;
2399
2400 return true;
2401 } // isValidOffset
2402
2403 /** Check that both start and end are valid offsets and that
2404 * they constitute a valid offset range, i.e. start is greater
2405 * than or equal to long.
2406 */
2407 public boolean isValidOffsetRange(Long start, Long end) {
2408 return
2409 isValidOffset(start) && isValidOffset(end) &&
2410 start.longValue() <= end.longValue();
2411 } // isValidOffsetRange(start,end)
2412
2413 /** Sets the nextAnnotationId */
2414 public void setNextAnnotationId(int aNextAnnotationId){
2415 nextAnnotationId = aNextAnnotationId;
2416 }// setNextAnnotationId();
2417
2418 /** Generate and return the next annotation ID */
2419 public Integer getNextAnnotationId() {
2420 return new Integer(nextAnnotationId++);
2421 } // getNextAnnotationId
2422
2423 /** Generate and return the next node ID */
2424 public Integer getNextNodeId() { return new Integer(nextNodeId++); }
2425
2426 /** Ordering based on URL.toString() and the URL offsets (if any) */
2427 public int compareTo(Object o) throws ClassCastException {
2428 DocumentImpl other = (DocumentImpl) o;
2429 return getOrderingString().compareTo(other.getOrderingString());
2430 } // compareTo
2431
2432 /** Utility method to produce a string for comparison in ordering.
2433 * String is based on the source URL and offsets.
2434 */
2435 protected String getOrderingString() {
2436 if(sourceUrl == null) return toString();
2437
2438 StringBuffer orderingString = new StringBuffer(sourceUrl.toString());
2439 if(sourceUrlStartOffset != null && sourceUrlEndOffset != null) {
2440 orderingString.append(sourceUrlStartOffset.toString());
2441 orderingString.append(sourceUrlEndOffset.toString());
2442 }
2443
2444 return orderingString.toString();
2445 } // getOrderingString()
2446
2447 /** The id of the next new annotation */
2448 protected int nextAnnotationId = 0;
2449
2450 /** The id of the next new node */
2451 protected int nextNodeId = 0;
2452 /** The source URL */
2453 protected URL sourceUrl;
2454
2455 /** The document's URL name. */
2456
2457 /** The content of the document */
2458 protected DocumentContent content;
2459
2460 /** The encoding of the source of the document content */
2461 protected String encoding = null;
2462
// Data needed in the toXml(AnnotationSet) methods
2464
2465 /** This field indicates whether or not to add the tag
2466 * called GatePreserveFormat to the document. HTML, XML, SGML docs won't
2467 * have this tag added
2468 */
2469// private boolean addGatePreserveFormatTag = false;
2470
2471 /**
2472 * Used by the XML dump preserving format method
2473 */
2474 private Annotation theRootAnnotation = null;
2475
/** This field is used when creating StringBuffers for toXml() methods.
 * The initial size of the StringBuffer will be the size of the document
 * content multiplied by this value. It is aimed at improving the
 * performance of StringBuffer.
 */
2480 private final int DOC_SIZE_MULTIPLICATION_FACTOR = 2;
2481
2482 /** Constant used in the inner class AnnotationComparator to order
2483 * annotations on their start offset
2484 */
2485 private final int ORDER_ON_START_OFFSET = 0;
2486 /** Constant used in the inner class AnnotationComparator to order
2487 * annotations on their end offset
2488 */
2489 private final int ORDER_ON_END_OFFSET = 1;
2490 /** Constant used in the inner class AnnotationComparator to order
2491 * annotations on their ID
2492 */
2493 private final int ORDER_ON_ANNOT_ID = 2;
2494 /** Constant used in the inner class AnnotationComparator to order
2495 * annotations ascending
2496 */
2497 private final int ASC = 3;
2498 /** Constant used in the inner class AnnotationComparator to order
2499 * annotations descending
2500 */
2501 private final int DESC = -3;
2502
2503 /** A map initialized in init() containing entities that needs to be
2504 * replaced in strings
2505 */
2506 private static Map entitiesMap = null;
2507 // Initialize the entities map use when saving as xml
2508 static{
2509 entitiesMap = new HashMap();
2510 entitiesMap.put(new Character('<'),"<");
2511 entitiesMap.put(new Character('>'),">");
2512 entitiesMap.put(new Character('&'),"&");
2513 entitiesMap.put(new Character('\''),"'");
2514 entitiesMap.put(new Character('"'),""");
2515 entitiesMap.put(new Character((char)160)," ");
2516 entitiesMap.put(new Character((char)169),"©");
2517 }//static
2518
/** The range that the content comes from at the source URL
 * (or null if none).
 * (Commented out; the start and end offsets are kept in the two separate
 * fields below.)
 */
//protected Long[] sourceUrlOffsets;

/** The start of the range that the content comes from at the source URL
 * (or null if none).
 */
protected Long sourceUrlStartOffset;

/** The end of the range that the content comes from at the source URL
 * (or null if none).
 */
protected Long sourceUrlEndOffset;

/** The default annotation set */
protected AnnotationSet defaultAnnots;

/** Named sets of annotations */
protected Map namedAnnotSets;

/**
 * A property of the document that will be set when the user
 * wants to create the document from a string, as opposed to from
 * a URL. Read and written through getStringContent() and
 * setStringContent(String).
 */
private String stringContent;
2546
2547 /**
2548 * The stringContent of a document is
2549 * a property of the document that will be set when the user
2550 * wants to create the document from a string, as opposed to from
2551 * a URL.
2552 * <B>Use the <TT>getContent</TT> method instead to get the actual document
2553 * content.</B>
2554 */
2555 public String getStringContent() { return stringContent; }
2556
2557 /**
2558 * The stringContent of a document is
2559 * a property of the document that will be set when the user
2560 * wants to create the document from a string, as opposed to from
2561 * a URL.
2562 * <B>Use the <TT>setContent</TT> method instead to update the actual
2563 * document content.</B>
2564 */
2565 public void setStringContent(String stringContent) {
2566 this.stringContent = stringContent;
2567 } // set StringContent
2568
2569 /** Is the document markup-aware? */
2570 protected Boolean markupAware = new Boolean(false);
2571// /** Hash code */
2572// public int hashCode() {
2573// int code = getContent().hashCode();
2574// int memberCode = (defaultAnnots == null) ? 0 : defaultAnnots.hashCode();
2575// code += memberCode;
2576// memberCode = (encoding == null) ? 0 : encoding.hashCode();
2577// code += memberCode;
2578// memberCode = (features == null) ? 0 : features.hashCode();
2579// code += memberCode;
2580// code += (markupAware.booleanValue()) ? 0 : 1;
2581// memberCode = (namedAnnotSets == null) ? 0 : namedAnnotSets.hashCode();
2582// code += memberCode;
2583// code += nextAnnotationId;
2584// code += nextNodeId;
2585// memberCode = (sourceUrl == null) ? 0 : sourceUrl.hashCode();
2586// code += memberCode;
2587// memberCode =
2588// (sourceUrlStartOffset == null) ? 0 : sourceUrlStartOffset.hashCode();
2589// code += memberCode;
2590// memberCode =
2591// (sourceUrlEndOffset == null) ? 0 : sourceUrlEndOffset.hashCode();
2592// code += memberCode;
2593// return code;
2594// } // hashcode
2595
2596 /** String respresentation */
2597 public String toString() {
2598 String n = Strings.getNl();
2599 StringBuffer s = new StringBuffer("DocumentImpl: " + n);
2600 s.append(" content:" + content + n);
2601 s.append(" defaultAnnots:" + defaultAnnots + n);
2602 s.append(" encoding:" + encoding + n);
2603 s.append(" features:" + features + n);
2604 s.append(" markupAware:" + markupAware + n);
2605 s.append(" namedAnnotSets:" + namedAnnotSets + n);
2606 s.append(" nextAnnotationId:" + nextAnnotationId + n);
2607 s.append(" nextNodeId:" + nextNodeId + n);
2608 s.append(" sourceUrl:" + sourceUrl + n);
2609 s.append(" sourceUrlStartOffset:" + sourceUrlStartOffset + n);
2610 s.append(" sourceUrlEndOffset:" + sourceUrlEndOffset + n);
2611 s.append(n);
2612
2613 return s.toString();
2614 } // toString
2615
/** Freeze the serialization UID: an explicit value is declared so that it
 * is not recomputed from the class definition.
 */
static final long serialVersionUID = -8456893608311510260L;
2618
2619 /** Inner class needed to compare annotations*/
2620 class AnnotationComparator implements java.util.Comparator {
2621 int orderOn = -1;
2622 int orderType = ASC;
2623 /** Constructs a comparator according to one of three sorter types:
2624 * ORDER_ON_ANNOT_TYPE, ORDER_ON_END_OFFSET, ORDER_ON_START_OFFSET
2625 */
2626 public AnnotationComparator(int anOrderOn, int anOrderType){
2627 orderOn = anOrderOn;
2628 orderType = anOrderType;
2629 }// AnnotationComparator()
2630
2631 /**This method must be implemented according to Comparator interface */
2632 public int compare(Object o1, Object o2){
2633 Annotation a1 = (Annotation) o1;
2634 Annotation a2 = (Annotation) o2;
2635 // ORDER_ON_START_OFFSET ?
2636 if (orderOn == ORDER_ON_START_OFFSET){
2637 int result = a1.getStartNode().getOffset().compareTo(
2638 a2.getStartNode().getOffset());
2639 if (orderType == ASC){
2640 // ASC
2641 // If they are equal then their ID will decide.
2642 if (result == 0)
2643 return a1.getId().compareTo(a2.getId());
2644 return result;
2645 }else{
2646 // DESC
2647 if (result == 0)
2648 return - (a1.getId().compareTo(a2.getId()));
2649 return -result;
2650 }// End if (orderType == ASC)
2651 }// End if (orderOn == ORDER_ON_START_OFFSET)
2652
2653 // ORDER_ON_END_OFFSET ?
2654 if (orderOn == ORDER_ON_END_OFFSET){
2655 int result = a1.getEndNode().getOffset().compareTo(
2656 a2.getEndNode().getOffset());
2657 if (orderType == ASC){
2658 // ASC
2659 // If they are equal then their ID will decide.
2660 if (result == 0)
2661 return - (a1.getId().compareTo(a2.getId()));
2662 return result;
2663 }else{
2664 // DESC
2665 // If they are equal then their ID will decide.
2666 if (result == 0)
2667 return a1.getId().compareTo(a2.getId());
2668 return - result;
2669 }// End if (orderType == ASC)
2670 }// End if (orderOn == ORDER_ON_END_OFFSET)
2671
2672 // ORDER_ON_ANNOT_ID ?
2673 if (orderOn == ORDER_ON_ANNOT_ID){
2674 if (orderType == ASC)
2675 return a1.getId().compareTo(a2.getId());
2676 else
2677 return -(a1.getId().compareTo(a2.getId()));
2678 }// End if
2679 return 0;
2680 }//compare()
2681 } // End inner class AnnotationComparator
2682
2683
/** The registered DocumentListeners; null until the first listener is added.
 * add/removeDocumentListener replace this Vector with a modified clone
 * rather than changing it in place.
 */
private transient Vector documentListeners;
/** NOTE(review): not referenced in the visible part of this file -
 * confirm whether it is still used.
 */
private transient Vector gateListeners;
2686
2687 public synchronized void removeDocumentListener(DocumentListener l) {
2688 if (documentListeners != null && documentListeners.contains(l)) {
2689 Vector v = (Vector) documentListeners.clone();
2690 v.removeElement(l);
2691 documentListeners = v;
2692 }
2693 }
2694 public synchronized void addDocumentListener(DocumentListener l) {
2695 Vector v = documentListeners == null ? new Vector(2) : (Vector) documentListeners.clone();
2696 if (!v.contains(l)) {
2697 v.addElement(l);
2698 documentListeners = v;
2699 }
2700 }
2701
2702 protected void fireAnnotationSetAdded(DocumentEvent e) {
2703 if (documentListeners != null) {
2704 Vector listeners = documentListeners;
2705 int count = listeners.size();
2706 for (int i = 0; i < count; i++) {
2707 ((DocumentListener) listeners.elementAt(i)).annotationSetAdded(e);
2708 }
2709 }
2710 }
2711
2712 protected void fireAnnotationSetRemoved(DocumentEvent e) {
2713 if (documentListeners != null) {
2714 Vector listeners = documentListeners;
2715 int count = listeners.size();
2716 for (int i = 0; i < count; i++) {
2717 ((DocumentListener) listeners.elementAt(i)).annotationSetRemoved(e);
2718 }
2719 }
2720 }
2721
2722 protected void fireContentEdited(DocumentEvent e) {
2723 if (documentListeners != null) {
2724 Vector listeners = documentListeners;
2725 int count = listeners.size();
2726 for (int i = 0; i < count; i++) {
2727 ((DocumentListener) listeners.elementAt(i)).contentEdited(e);
2728 }
2729 }
2730 }
2731
2732 public void resourceLoaded(CreoleEvent e) {
2733 }
2734 public void resourceUnloaded(CreoleEvent e) {
2735 }
2736 public void datastoreOpened(CreoleEvent e) {
2737 }
2738 public void datastoreCreated(CreoleEvent e) {
2739 }
2740 public void resourceRenamed(Resource resource, String oldName,
2741 String newName){
2742 }
2743 public void datastoreClosed(CreoleEvent e) {
2744 if (! e.getDatastore().equals(this.getDataStore()))
2745 return;
2746 //close this lr, since it cannot stay open when the DS it comes from
2747 //is closed
2748 Factory.deleteResource(this);
2749 }
2750 public void setLRPersistenceId(Object lrID) {
2751 super.setLRPersistenceId( lrID);
2752 //make persistent documents listen to the creole register
2753 //for events about their DS
2754 Gate.getCreoleRegister().addCreoleListener(this);
2755 }
2756 public void resourceAdopted(DatastoreEvent evt) {
2757 }
2758 public void resourceDeleted(DatastoreEvent evt) {
2759 if(! evt.getSource().equals(this.getDataStore()))
2760 return;
2761 //if an open document is deleted from a DS, then
2762 //it must close itself immediately, as is no longer valid
2763 if(evt.getResourceID().equals(this.getLRPersistenceId()))
2764 Factory.deleteResource(this);
2765 }
2766 public void resourceWritten(DatastoreEvent evt) {
2767 }
2768 public void setDataStore(DataStore dataStore) throws gate.persist.PersistenceException {
2769 super.setDataStore( dataStore);
2770 if (this.dataStore != null)
2771 this.dataStore.addDatastoreListener(this);
2772 }
2773
2774 /**
2775 * This method added by Shafirin Andrey, to allow access to
2776 * protected member {@link #defaultAnnots}
2777 * Required for JAPE-Debugger.
2778 * */
2779 public void setDefaultAnnotations(AnnotationSet defaultAnnotations) {
2780 defaultAnnots = defaultAnnotations;
2781 }
2782
2783} // class DocumentImpl
2784