// TextualDocumentFormat.java
1 /* 2 * TextualDocumentFormat.java 3 * 4 * Copyright (c) 1998-2005, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 26/May/2000 12 * 13 * $Id: TextualDocumentFormat.java,v 1.25 2005/01/11 13:51:31 ian Exp $ 14 */ 15 16 package gate.corpora; 17 18 import gate.*; 19 import gate.creole.ResourceInstantiationException; 20 import gate.util.DocumentFormatException; 21 22 //import org.w3c.www.mime.*; 23 24 /** The format of Documents. Subclasses of DocumentFormat know about 25 * particular MIME types and how to unpack the information in any 26 * markup or formatting they contain into GATE annotations. Each MIME 27 * type has its own subclass of DocumentFormat, e.g. XmlDocumentFormat, 28 * RtfDocumentFormat, MpegDocumentFormat. These classes register themselves 29 * with a static index residing here when they are constructed. Static 30 * getDocumentFormat methods can then be used to get the appropriate 31 * format class for a particular document. 32 */ 33 public class TextualDocumentFormat extends DocumentFormat 34 { 35 36 /** Debug flag */ 37 private static final boolean DEBUG = false; 38 39 /** Default construction */ 40 public TextualDocumentFormat() { super(); } 41 42 /** Initialise this resource, and return it. 
*/ 43 public Resource init() throws ResourceInstantiationException{ 44 // Register plain text mime type 45 MimeType mime = new MimeType("text","plain"); 46 // Register the class handler for this mime type 47 mimeString2ClassHandlerMap.put(mime.getType()+ "/" + mime.getSubtype(), 48 this); 49 // Register the mime type with mine string 50 mimeString2mimeTypeMap.put(mime.getType() + "/" + mime.getSubtype(), mime); 51 // Register file sufixes for this mime type 52 suffixes2mimeTypeMap.put("txt",mime); 53 suffixes2mimeTypeMap.put("text",mime); 54 // Set the mimeType for this language resource 55 setMimeType(mime); 56 return this; 57 } // init() 58 59 /** Unpack the markup in the document. This converts markup from the 60 * native format (e.g. XML, RTF) into annotations in GATE format. 61 * Uses the markupElementsMap to determine which elements to convert, and 62 * what annotation type names to use. 63 */ 64 public void unpackMarkup(Document doc) throws DocumentFormatException{ 65 if (doc == null || doc.getContent() == null) return; 66 setNewLineProperty(doc); 67 // Create paragraph annotations in the specified annotation set 68 int endOffset = doc.getContent().toString().length(); 69 int startOffset = 0; 70 annotateParagraphs(doc,startOffset,endOffset, 71 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME); 72 }//unpackMarkup 73 74 public void unpackMarkup(Document doc, RepositioningInfo repInfo, 75 RepositioningInfo ampCodingInfo) 76 throws DocumentFormatException { 77 unpackMarkup(doc); 78 } // unpackMarkup 79 80 81 /** 82 * Check the new line sequence and set document property. 
83 * <BR> 84 * Possible values are CRLF, LFCR, CR, LF 85 */ 86 protected void setNewLineProperty(Document doc) { 87 String content = doc.getContent().toString(); 88 String newLineType = ""; 89 90 char ch = ' '; 91 char lastch = ' '; 92 for(int i=0; i < content.length(); ++i) { 93 ch = content.charAt(i); 94 if(lastch == '\r') { 95 if(ch == '\n') { 96 newLineType = "CRLF"; 97 break; 98 } 99 else { 100 newLineType = "CR"; 101 break; 102 } 103 } 104 if(lastch == '\n') { 105 if(ch == '\r') { 106 newLineType = "LFCR"; 107 break; 108 } 109 else { 110 newLineType = "LF"; 111 break; 112 } 113 } 114 lastch = ch; 115 } // for 116 117 doc.getFeatures().put(GateConstants.DOCUMENT_NEW_LINE_TYPE, newLineType); 118 } // setNewLineProperty() 119 120 /** Delete '\r' in combination CRLF or LFCR in document content */ 121 private void removeExtraNewLine(Document doc) { 122 String content = doc.getContent().toString(); 123 StringBuffer buff = new StringBuffer(content); 124 125 char ch = ' '; 126 char lastch = ' '; 127 for(int i=content.length()-1; i > -1; --i) { 128 ch = content.charAt(i); 129 if(ch == '\n' && lastch == '\r') { 130 buff.deleteCharAt(i+1); 131 } 132 if(ch == '\r' && lastch == '\n') { 133 buff.deleteCharAt(i); 134 ch = lastch; 135 } 136 lastch = ch; 137 } // for 138 139 doc.setContent(new DocumentContentImpl(buff.toString())); 140 } // removeExtraNewLine(Document doc) 141 142 /** This method annotates paragraphs in a GATE document. The investigated text 143 * spans beetween start and end offsets and the paragraph annotations are 144 * created in the annotSetName. If annotSetName is null then they are creted 145 * in the default annotation set. 146 * @param aDoc is the gate document on which the paragraph detection would 147 * be performed.If it is null or its content it's null then the method woul 148 * simply return doing nothing. 
149 * @param startOffset is the index form the document content from which the 150 * paragraph detection will start 151 * @param endOffset is the offset where the detection will end. 152 * @param annotSetName is the name of the set in which paragraph annotation 153 * would be created.The annotation type created will be "paragraph" 154 */ 155 public void annotateParagraphs(Document aDoc,int startOffset,int endOffset, 156 String annotSetName)throws DocumentFormatException{ 157 // Simply return if the document is null or its content 158 if (aDoc == null || aDoc.getContent() == null) return; 159 // Simply return if the start is > than the end 160 if (startOffset > endOffset) return; 161 // Decide where to put the newly detected annotations 162 AnnotationSet annotSet = null; 163 if (annotSetName == null) 164 annotSet = aDoc.getAnnotations(); 165 else 166 annotSet = aDoc.getAnnotations(annotSetName); 167 // Extract the document content 168 String content = aDoc.getContent().toString(); 169 // This is the offset marking the start of a para 170 int startOffsetPara = startOffset; 171 // This marks the ned of a para 172 int endOffsetPara = endOffset; 173 // The initial sate of the FSA 174 int state = 1; 175 // This field marks that a BR entity was read 176 // A BR entity can be NL or NL CR, depending on the operating system (UNIX 177 // or DOS) 178 boolean readBR = false; 179 int index = startOffset; 180 while (index < endOffset){ 181 // Read the current char 182 char ch = content.charAt(index); 183 // Test if a BR entity was read 184 if (ch =='\n'){ 185 readBR = true; 186 // If \n is followed by a \r then advance the index in order to read a 187 // BR entity 188 while ((index+1 < endOffset) && (content.charAt(index+1) == '\r')) 189 index ++; 190 }// End if 191 switch(state){ 192 // It is the initial and also a final state 193 // Stay in state 1 while it reads whitespaces 194 case 1:{ 195 // If reads a non whitespace char then move to state 2 and record 196 // the beggining 
of a paragraph 197 if (!Character.isWhitespace(ch)){ 198 state = 2; 199 startOffsetPara = index; 200 }// End if 201 }break; 202 // It can be also a final state. 203 case 2:{ 204 // Stay in state 2 while reading chars != BR entities 205 if (readBR){ 206 // If you find a BR char go to state 3. The possible end of the para 207 // can be index. This will be confirmed by state 3. So, this is why 208 // the end of a para is recorded here. 209 readBR = false; 210 endOffsetPara = index; 211 state = 3; 212 }// End if 213 }break; 214 // It can be also a final state 215 // From state 3 there are only 2 possible ways: (state 2 or state1) 216 // In state 1 it needs to read a BR 217 // For state 2 it nead to read something different then a BR 218 case 3:{ 219 if (readBR){ 220 // A BR was read. Go to state 1 221 readBR = false; 222 state = 1; 223 // Create an annotation type paragraph 224 try{ 225 annotSet.add( new Long(startOffsetPara), 226 new Long(endOffsetPara), 227 "paragraph", 228 Factory.newFeatureMap()); 229 } catch (gate.util.InvalidOffsetException ioe){ 230 throw new DocumentFormatException("Coudn't create a paragraph"+ 231 " annotation",ioe); 232 }// End try 233 }else{ 234 // Go to state 2 an keep reading chars 235 state = 2; 236 }// End if 237 }break; 238 }// End switch 239 // Prepare to read the next char. 
240 index ++; 241 }// End while 242 endOffsetPara = index; 243 // Investigate where the finite automata has stoped 244 if ( state==2 || state==3 ){ 245 // Create an annotation type paragraph 246 try{ 247 annotSet.add( new Long(startOffsetPara), 248 // Create the final annotation using the endOffset 249 new Long(endOffsetPara), 250 "paragraph", 251 Factory.newFeatureMap()); 252 } catch (gate.util.InvalidOffsetException ioe){ 253 throw new DocumentFormatException("Coudn't create a paragraph"+ 254 " annotation",ioe); 255 }// End try 256 }// End if 257 }// End annotateParagraphs(); 258 259 public DataStore getDataStore(){ return null;} 260 261 } // class TextualDocumentFormat 262