1   /*
2    *  TextualDocumentFormat.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/May/2000
12   *
13   *  $Id: TextualDocumentFormat.java,v 1.25 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.corpora;
17  
18  import gate.*;
19  import gate.creole.ResourceInstantiationException;
20  import gate.util.DocumentFormatException;
21  
22  //import org.w3c.www.mime.*;
23  
/** The document format for plain text: MIME type <code>text/plain</code>,
  * file suffixes <code>txt</code> and <code>text</code>.
  * <P>
  * Subclasses of DocumentFormat know about particular MIME types and how to
  * unpack the information in any markup or formatting they contain into GATE
  * annotations. Each MIME type has its own subclass of DocumentFormat, e.g.
  * XmlDocumentFormat, RtfDocumentFormat, MpegDocumentFormat. These classes
  * register themselves with an index inherited from DocumentFormat when they
  * are initialised (see <code>init()</code>). Static getDocumentFormat
  * methods can then be used to get the appropriate format class for a
  * particular document.
  */
33  public class TextualDocumentFormat extends DocumentFormat
34  {
35  
36    /** Debug flag */
37    private static final boolean DEBUG = false;
38  
39    /** Default construction */
40    public TextualDocumentFormat() { super(); }
41  
42    /** Initialise this resource, and return it. */
43    public Resource init() throws ResourceInstantiationException{
44      // Register plain text mime type
45      MimeType mime = new MimeType("text","plain");
46      // Register the class handler for this mime type
47      mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(),
48                                                                            this);
49      // Register the mime type with mine string
50      mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime);
51      // Register file sufixes for this mime type
52      suffixes2mimeTypeMap.put("txt",mime);
53      suffixes2mimeTypeMap.put("text",mime);
54      // Set the mimeType for this language resource
55      setMimeType(mime);
56      return this;
57    } // init()
58  
59    /** Unpack the markup in the document. This converts markup from the
60      * native format (e.g. XML, RTF) into annotations in GATE format.
61      * Uses the markupElementsMap to determine which elements to convert, and
62      * what annotation type names to use.
63      */
64    public void unpackMarkup(Document doc) throws DocumentFormatException{
65      if (doc == null || doc.getContent() == null) return;
66      setNewLineProperty(doc);
67      // Create paragraph annotations in the specified annotation set
68      int endOffset = doc.getContent().toString().length();
69      int startOffset = 0;
70      annotateParagraphs(doc,startOffset,endOffset,
71                                  GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
72    }//unpackMarkup
73  
74    public void unpackMarkup(Document doc, RepositioningInfo repInfo,
75                              RepositioningInfo ampCodingInfo)
76                                        throws DocumentFormatException {
77      unpackMarkup(doc);
78    } // unpackMarkup
79  
80  
81    /**
82     * Check the new line sequence and set document property.
83     * <BR>
84     * Possible values are CRLF, LFCR, CR, LF
85     */
86    protected void setNewLineProperty(Document doc) {
87      String content = doc.getContent().toString();
88      String newLineType = "";
89  
90      char ch = ' ';
91      char lastch = ' ';
92      for(int i=0; i < content.length(); ++i) {
93        ch = content.charAt(i);
94        if(lastch == '\r') {
95          if(ch == '\n') {
96            newLineType = "CRLF";
97            break;
98          }
99          else {
100           newLineType = "CR";
101           break;
102         }
103       }
104       if(lastch == '\n') {
105         if(ch == '\r') {
106           newLineType = "LFCR";
107           break;
108         }
109         else {
110           newLineType = "LF";
111           break;
112         }
113       }
114       lastch = ch;
115     } // for
116 
117     doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType);
118   } // setNewLineProperty()
119 
120   /** Delete '\r' in combination CRLF or LFCR in document content */
121   private void removeExtraNewLine(Document doc) {
122     String content = doc.getContent().toString();
123     StringBuffer buff = new StringBuffer(content);
124 
125     char ch = ' ';
126     char lastch = ' ';
127     for(int i=content.length()-1; i > -1; --i) {
128       ch = content.charAt(i);
129       if(ch == '\n' && lastch == '\r') {
130         buff.deleteCharAt(i+1);
131       }
132       if(ch == '\r' && lastch == '\n') {
133         buff.deleteCharAt(i);
134         ch = lastch;
135       }
136       lastch = ch;
137     } // for
138 
139     doc.setContent(new DocumentContentImpl(buff.toString()));
140   } // removeExtraNewLine(Document doc)
141 
142   /** This method annotates paragraphs in a GATE document. The investigated text
143     * spans beetween start and end offsets and the paragraph annotations are
144     * created in the annotSetName. If annotSetName is null then they are creted
145     * in the default annotation set.
146     * @param aDoc is the gate document on which the paragraph detection would
147     *  be performed.If it is null or its content it's null then the method woul
148     *  simply return doing nothing.
149     * @param startOffset is the index  form the document content from which the
150     * paragraph detection will start
151     * @param endOffset is the offset where the detection will end.
152     * @param annotSetName is the name of the set in which paragraph annotation
153     * would be created.The annotation type created will be "paragraph"
154     */
155   public void annotateParagraphs(Document aDoc,int startOffset,int endOffset,
156                             String annotSetName)throws DocumentFormatException{
157     // Simply return if the document is null or its content
158     if (aDoc == null || aDoc.getContent() == null) return;
159     // Simply return if the start is > than the end
160     if (startOffset > endOffset) return;
161     // Decide where to put the newly detected annotations
162     AnnotationSet annotSet = null;
163     if (annotSetName == null)
164       annotSet = aDoc.getAnnotations();
165     else
166       annotSet = aDoc.getAnnotations(annotSetName);
167     // Extract the document content
168     String content = aDoc.getContent().toString();
169     // This is the offset marking the start of a para
170     int startOffsetPara = startOffset;
171     // This marks the ned of a para
172     int endOffsetPara = endOffset;
173     // The initial sate of the FSA
174     int state = 1;
175     // This field marks that a BR entity was read
176     // A BR entity can be NL or NL CR, depending on the operating system (UNIX
177     // or DOS)
178     boolean readBR = false;
179     int index = startOffset;
180     while (index < endOffset){
181       // Read the current char
182       char ch = content.charAt(index);
183       // Test if a BR entity was read
184       if (ch =='\n'){
185         readBR = true;
186         // If \n is followed by a \r then advance the index in order to read a
187         // BR entity
188         while ((index+1 < endOffset) && (content.charAt(index+1) == '\r'))
189           index ++;
190       }// End if
191       switch(state){
192         // It is the initial and also a final state
193         // Stay in state 1 while it reads whitespaces
194         case 1:{
195           // If reads a non whitespace char then move to state 2 and record
196           // the beggining of a paragraph
197           if (!Character.isWhitespace(ch)){
198             state = 2;
199             startOffsetPara = index;
200           }// End if
201         }break;
202         // It can be also a final state.
203         case 2:{
204           // Stay in state 2 while reading chars != BR entities
205           if (readBR){
206             // If you find a BR char go to state 3. The possible end of the para
207             // can be index. This will be confirmed by state 3. So, this is why
208             // the end of a para is recorded here.
209             readBR = false;
210             endOffsetPara = index;
211             state = 3;
212           }// End if
213         }break;
214         // It can be also a final state
215         // From state 3 there are only 2 possible ways: (state 2 or state1)
216         // In state 1 it needs to read a BR
217         // For state 2 it nead to read something different then a BR
218         case 3:{
219           if (readBR){
220             // A BR was read. Go to state 1
221             readBR = false;
222             state = 1;
223             // Create an annotation type paragraph
224             try{
225               annotSet.add( new Long(startOffsetPara),
226                             new Long(endOffsetPara),
227                             "paragraph",
228                             Factory.newFeatureMap());
229             } catch (gate.util.InvalidOffsetException ioe){
230               throw new DocumentFormatException("Coudn't create a paragraph"+
231               " annotation",ioe);
232             }// End try
233           }else{
234             // Go to state 2 an keep reading chars
235             state = 2;
236           }// End if
237         }break;
238       }// End switch
239       // Prepare to read the next char.
240       index ++;
241     }// End while
242     endOffsetPara = index;
243     // Investigate where the finite automata has stoped
244     if ( state==2 || state==3 ){
245       // Create an annotation type paragraph
246       try{
247         annotSet.add( new Long(startOffsetPara),
248                       // Create the final annotation using the endOffset
249                       new Long(endOffsetPara),
250                       "paragraph",
251                       Factory.newFeatureMap());
252       } catch (gate.util.InvalidOffsetException ioe){
253               throw new DocumentFormatException("Coudn't create a paragraph"+
254               " annotation",ioe);
255       }// End try
256     }// End if
257   }// End annotateParagraphs();
258 
259   public DataStore getDataStore(){ return null;}
260 
261 } // class TextualDocumentFormat
262