gate.creole.APFormatExporter (Java2HTML)

1   /*
2    *  APFormatExporter.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU, 26/Oct/2001
12   *
13   *  $Id: APFormatExporter.java,v 1.27 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.creole;
17  
18  import java.io.*;
19  import java.net.URL;
20  import java.util.*;
21  
22  import gate.*;
23  import gate.util.*;
24  
25  /** This class implements a APF xml exporter. It works on documents or corpora
26    * to export them in the APF format.
27    */
28  public class APFormatExporter extends AbstractLanguageAnalyser
29                                implements ANNIEConstants{
30    public static final String
31      APF_EXP_DOCUMENT_PARAMETER_NAME = "document";
32  
33    public static final String
34      APF_EXP_SOURCE_PARAMETER_NAME = "source";
35  
36    public static final String
37      APF_EXP_DTD_PARAMETER_NAME = "dtdFileName";
38  
39    public static final String
40      APF_EXP_PATH_PARAMETER_NAME = "exportFilePath";
41  
42    public static final String
43      APF_EXP_TYPES_PARAMETER_NAME = "exportedTypes";
44  
45    public static final String
46      APF_EXP_WRITE_SOURCE_PARAMETER_NAME = "isSourceWritten";
47  
48    /** Debug flag */
49    private static final boolean DEBUG = false;
50    /** Constructor does nothing. This PR is bean like initialized*/
51    public APFormatExporter() {}
52  
53    /** Run the resource and does the entire export process*/
54    public void execute() throws ExecutionException{
55      // Check if the thing can be run
56      if(document == null)
57        throw new ExecutionException("No document found to export in APF format!");
58      if (exportedTypes == null)
59        throw new ExecutionException("No export types found.");
60      xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
61      initDocId();
62      if (docId == null)
63        throw new ExecutionException("Couldn't detect the document's ID");
64      if (DEBUG)
65        Out.prln("Document id = "+ docId);
66  
67      String exportFilePathStr = null;
68      if (exportFilePath == null)
69        exportFilePathStr = new String(document.getSourceUrl().getFile() +
70                                                                    ".apf.xml");
71      else
72        exportFilePathStr = exportFilePath.getPath()+ "/"
73            + gate.util.Files.getLastPathComponent(
74                document.getSourceUrl().getFile()) + ".apf.xml";
75  
76      if (DEBUG)
77        Out.prln("Export file path = "+ exportFilePathStr);
78  //*
79      // Prepare to write into the xmlFile
80      OutputStreamWriter writer = null;
81      try{
82        writer = new OutputStreamWriter(
83                new FileOutputStream(new File(exportFilePathStr)));
84  
85        // Write (test the toXml() method)
86        // This Action is added only when a gate.Document is created.
87        // So, is Bor sure that the resource is a gate.Document
88        serializeDocumentToAPF();
89        writer.write(xmlDoc.toString());
90        writer.flush();
91        writer.close();
92      }catch (Exception e){
93        throw new ExecutionException(e);
94      }// End try
95  //*/
96    } // execute()
97  
98  
99    /** Initialise this resource, and returns it. */
100   public Resource init() throws ResourceInstantiationException {
101     return this;
102   } // init()
103 
104   /** Java bean style mutator for exportedTypes */
105   public void setExportedTypes(List anExportedTypesList){
106     exportedTypes = anExportedTypesList;
107   }// setExportedTypes();
108 
109   /** Java bean style accesor for exportedTypes */
110   public List getExportedTypes(){
111     return exportedTypes;
112   }// getExportedTypes()
113 
114   /** Java bean style mutator for dtdFileName */
115   public void setDtdFileName(String aDtdFileName){
116     dtdFileName = aDtdFileName;
117   }// setDtdFileName();
118 
119   /** Java bean style accesor for DtdFileName */
120   public String getDtdFileName(){
121     return dtdFileName;
122   }// getDtdFileName()
123 
124   /** Java bean style mutator for exportFilePath */
125   public void setExportFilePath(URL anExportFilePath){
126     exportFilePath = anExportFilePath;
127   }// setExportFilePath();
128 
129   /** Java bean style accesor for exportFilePath */
130   public URL getExportFilePath(){
131     return exportFilePath;
132   }// getDtdFileName()
133 
134   /** Java bean style mutator for source */
135   public void setSource(String aSource){
136     source = aSource;
137   }// setSource();
138 
139   /** Java bean style accesor for source */
140   public String getSource(){
141     return source;
142   }// getSource()
143 
144   /** Java bean style accesor for isSourceWritten */
145   public Boolean getIsSourceWritten() {
146     return new Boolean(isSourceWritten);
147   }
148 
149   /** Java bean style mutator for isSourceWritten */
150   public void setIsSourceWritten(Boolean aIsSourceWritten){
151     isSourceWritten = aIsSourceWritten.booleanValue();
152   }// setIsSourceWritten();
153 
154 
155 
156   /** Initialises the docId with documents' file name without the complete path*/
157   private void initDocId(){
158     String fileName = "";
159     fileName = gate.util.Files.getLastPathComponent(
160                                             document.getSourceUrl().getFile());
161     // File name contains now the last token
162     if (DEBUG)
163       Out.prln("From initDocId, fileName ="+ fileName);
164     StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
165     StringBuffer tmpDocId = new StringBuffer("");
166     while(fileNameTokenizer.hasMoreTokens()){
167       String token = (String)fileNameTokenizer.nextToken();
168       // We don't want to append the last token
169       if (fileNameTokenizer.hasMoreTokens())
170         tmpDocId.append(token + ".");
171     }// End while
172     // if tokenization had place
173     if (!"".equals(tmpDocId)){
174       // Remove the last dot
175       tmpDocId.replace(tmpDocId.length()-1,tmpDocId.length(),"");
176       docId = tmpDocId.toString();
177     }// End if
178   }// initDocId()
179 
180   /** Returns the xml document conforming to APF dtd.*/
181   protected void serializeDocumentToAPF(){
182     xmlDoc.append("<?xml version=\"1.0\" ?>\n");
183     xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
184        if (dtdFileName == null)
185       xmlDoc.append("\"ace-rdc.v2.0.1.dtd\"");
186          else
187            xmlDoc.append("\""+dtdFileName+"\"");
188     xmlDoc.append(">\n");
189     xmlDoc.append("<source_file TYPE=\"text\"");
190     if (isSourceWritten) {
191       AnnotationSet docTypeAnns = document.getAnnotations(
192         GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("DOCTYPE");
193       if (docTypeAnns == null || docTypeAnns.isEmpty())
194         xmlDoc.append(" SOURCE=\""+ source+ "\" ");
195       else {
196         Annotation docTypeAnn = (Annotation) docTypeAnns.iterator().next();
197         if (docTypeAnn.getFeatures().get("SOURCE") == null)
198           xmlDoc.append(" SOURCE=\""+ source+ "\" ");
199         else
200           xmlDoc.append(" SOURCE=\""+ docTypeAnn.getFeatures().get("SOURCE")+ "\" ");
201       }//if no doc type annotations
202     }
203     xmlDoc.append("VERSION=\"2.0\" URI=\"");
204     xmlDoc.append(docId);
205     xmlDoc.append("-lf\">\n");
206     xmlDoc.append("  <document DOCID=\"");
207     xmlDoc.append(docId + "\">\n");
208     serializeEntities();
209     xmlDoc.append("  </document>\n");
210     xmlDoc.append("</source_file>");
211   }// serializeDocumentToAPF()
212 
213   /** Transforms all the entities from exportedTypes found in the GATE document
214     * into their xml representation
215     */
216   protected void serializeEntities(){
217     // If no types founded then simply return
218     if (exportedTypes == null || exportedTypes.isEmpty()) return;
219 
220     Map entitiesMap = null;
221     if ( document.getFeatures() == null ||
222          document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME)== null)
223       entitiesMap = new HashMap();
224     else
225       entitiesMap = (Map)document.getFeatures().
226                                         get(DOCUMENT_COREF_FEATURE_NAME);
227     Map namedAnnotSetMap = null;
228     if (document.getNamedAnnotationSets() == null)
229       namedAnnotSetMap = new HashMap();
230     else
231       namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
232     // Add the default annoattion set
233     namedAnnotSetMap.put(null,document.getAnnotations());
234     // The entities map is a map from annotation sets names to list of lists
235     // Each list element is composed from annotations refering the same entity
236     // All the entities that are in the exportedTypes need to be serialized.
237     Iterator exportedTypesIter = exportedTypes.iterator();
238     while(exportedTypesIter.hasNext()){
239       String entityType = (String)exportedTypesIter.next();
240       // Serialize all entities of type
241       // The keys in the entitesMap are annotation sets names. The null key
242       // designates the default annotation.
243       Set annotationSetNames = namedAnnotSetMap.keySet();
244       Iterator annotationSetNamesIter = annotationSetNames.iterator();
245       while (annotationSetNamesIter.hasNext()){
246         Object annotSetName = annotationSetNamesIter.next();
247         // This list contains entities found in the annotSetName
248         List entitiesList = (List) entitiesMap.get(annotSetName);
249         if (entitiesList == null) entitiesList = new ArrayList();
250         // This annotation set will contain all annotations of "entityType"
251         AnnotationSet annotSet = null;
252         Set serializationAnnotSet = null;
253         annotSet = (AnnotationSet)namedAnnotSetMap.get(annotSetName);
254         if (annotSet == null || annotSet.get(entityType) == null) continue;
255         serializationAnnotSet = new HashSet(annotSet.get(entityType));
256         // All annotations from annotSet will be serialized as entities unless
257         // some of them are present in the entities map
258         // Now we are searching for the entityType in the entitiesMap and
259         // serialize it from there. After that, remove all annotations
260         // entityType present in entitiesMap from annotSet and serialize the
261         // remaining entities.
262         //Iterate through the entitiesList in searching for entityType
263         Iterator entitiesListIter = entitiesList.iterator();
264         while (entitiesListIter.hasNext()){
265           List entity = (List)entitiesListIter.next();
266           // We want now to accesate an annotation from the entity list to get
267           // its type and compare it with entityType
268           String theEntityType = new String("");
269           if (entity != null && !entity.isEmpty()){
270             Integer annotId = (Integer)entity.get(0);
271             Annotation a = (Annotation)annotSet.get(annotId);
272             if (a != null) theEntityType = a.getType();
273           }// End if
274           // The the types are equal then serialize the entities
275           if (theEntityType.equals(entityType)){
276             List ent = new ArrayList();
277             Iterator entityIter = entity.iterator();
278             while(entityIter.hasNext()){
279               Integer id = (Integer)entityIter.next();
280               ent.add(annotSet.get(id));
281             }// End while
282             serializeAnEntity(ent);
283             // Remove all annotation from entity that apear in annotSet
284             serializationAnnotSet.removeAll(ent);
285           }// End if
286         }// End while(entitiesListIter.hasNext())
287         // Serialize the remaining entities in annotSet
288         Iterator serializationAnnotSetIter = serializationAnnotSet.iterator();
289         while(serializationAnnotSetIter.hasNext()){
290           Annotation annotEntity = (Annotation) serializationAnnotSetIter.next();
291           List ent = new ArrayList();
292           ent.add(annotEntity);
293           serializeAnEntity(ent);
294         }// End while(annotSetIter.hasNext())
295       }// End while(entitiesKeysIter.hasNext())
296     }// End while(exportedTypesIter.hasNext())
297   }// serializeEntities()
298 
299   /** Writes an entity in the xmlDoc conforming to APF standards.
300     * @param anEntity represents a list with annotations that refer the same
301     * entity. Those annotations were detected and constructed by the
302     * orthomatcher.
303     */
304   private void serializeAnEntity(List anEntity){
305     if (anEntity == null || anEntity.isEmpty()) return;
306     // Write the entities tags
307     xmlDoc.append("  <entity ID=\"" + docId + "-" + getNextEntityId() + "\">\n");
308     // We know for sure that the list is not empty (see above)
309     Annotation a = (Annotation) anEntity.get(0);
310     xmlDoc.append("    <entity_type GENERIC=\"FALSE\">" + a.getType().toUpperCase() +
311      "</entity_type>\n");
312     // Write the entities mentions
313     Iterator anEntityIter = anEntity.iterator();
314     while(anEntityIter.hasNext()){
315       Annotation ann = (Annotation)anEntityIter.next();
316       serializeAnEntityMention(ann);
317     }// End while(anEntityIter.hasNext())
318     // Write the entities attributes
319     xmlDoc.append("      <entity_attributes>\n");
320     anEntityIter = anEntity.iterator();
321     while(anEntityIter.hasNext()){
322       Annotation ann = (Annotation)anEntityIter.next();
323       serializeAnEntityAttributes(ann);
324     }// End while(anEntityIter.hasNext())
325     xmlDoc.append("      </entity_attributes>\n");
326     xmlDoc.append("  </entity>\n");
327   }// End serializeAnEntity();
328 
329   /** This method serializes an entity mention from an Annotation*/
330   private void serializeAnEntityMention(Annotation ann){
331     if (ann == null) return;
332     String entityMentionType = "NAME";
333     String entityMentionRole = null;
334     String entityMentionReference = null;
335     String entityMentionGeneric = null;
336 
337     FeatureMap fm = ann.getFeatures();
338     if (fm != null){
339       if( null != fm.get("ENTITY_MENTION_TYPE"))
340         entityMentionType = (String) fm.get("ENTITY_MENTION_TYPE");
341 
342       entityMentionRole = (String) fm.get("ROLE");
343       entityMentionReference = (String) fm.get("REFERENCE");
344       entityMentionGeneric = (String) fm.get("GENERIC");
345     }// End if
346     String str1 = (entityMentionRole == null)? "" :
347                              ("ROLE=\"" + entityMentionRole + "\"");
348     String str2 = (entityMentionReference == null)? "" :
349                              ("REFERENCE=\"" + entityMentionReference + "\"");
350     String str3 = (entityMentionGeneric == null)? "" :
351                              ("GENERIC=\"" + entityMentionGeneric + "\"");
352 
353 /* modified by Di - the new scorer needs a unique ID for each mention as well */
354 
355     xmlDoc.append("      <entity_mention TYPE=\"" + entityMentionType+"\"" +
356      str1 + " " + str2 + " " + str3 + "ID=\""  + "M" + getNextMentionId() +"\">\n"
357     );
358 
359     // extent
360     xmlDoc.append("        <extent>\n");
361     xmlDoc.append("          <charseq>\n");
362     try{
363       xmlDoc.append("          <!-- string = \"" +
364             document.getContent().getContent(ann.getStartNode().getOffset(),
365                                       ann.getEndNode().getOffset())+"\" -->\n");
366     }catch (InvalidOffsetException ioe){
367       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
368       " offsets:" + ann.getStartNode().getOffset() + " and "+
369       ann.getEndNode().getOffset());
370     }// End try
371     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
372         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
373     xmlDoc.append("          </charseq>\n");
374     xmlDoc.append("        </extent>\n");
375     // head
376     xmlDoc.append("        <head>\n");
377     xmlDoc.append("          <charseq>\n");
378     try{
379       xmlDoc.append("          <!-- string = \"" +
380             document.getContent().getContent(ann.getStartNode().getOffset(),
381                                       ann.getEndNode().getOffset())+"\" -->\n");
382     }catch (InvalidOffsetException ioe){
383       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
384       " offsets:" + ann.getStartNode().getOffset() + " and "+
385       ann.getEndNode().getOffset());
386     }// End try
387     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
388         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
389     xmlDoc.append("          </charseq>\n");
390     xmlDoc.append("        </head>\n");
391     xmlDoc.append("      </entity_mention>\n");
392   }//serializeAnEntityMention();
393 
394   /** This method serializes an entity attribute from an Annotation*/
395   private void serializeAnEntityAttributes(Annotation ann){
396     if (ann == null) return;
397     boolean isAttribute = false;
398     if ("NAME".equals(ann.getFeatures().get("ENTITY_MENTION_TYPE"))
399         ||
400        null == ann.getFeatures().get("ENTITY_MENTION_TYPE"))
401       isAttribute = true;
402     if (! isAttribute)
403       return;
404 
405     // name
406     xmlDoc.append("        <name>\n");
407     xmlDoc.append("          <charseq>\n");
408     try{
409       xmlDoc.append("          <!-- string = \"" +
410             document.getContent().getContent(ann.getStartNode().getOffset(),
411                                       ann.getEndNode().getOffset())+"\" -->\n");
412     }catch (InvalidOffsetException ioe){
413       Err.prln("APFormatExporter:Warning: Couldn't access text between"+
414       " offsets:" + ann.getStartNode().getOffset() + " and "+
415       ann.getEndNode().getOffset());
416     }// End try
417     xmlDoc.append("          <start>"+ann.getStartNode().getOffset()+
418         "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
419     xmlDoc.append("          </charseq>\n");
420     xmlDoc.append("        </name>\n");
421   }//serializeAnEntityMention();
422 
423   /** Returns the next safe ID for an entity*/
424   private int getNextEntityId(){
425     return entityId ++;
426   }// getNextEntityId()
427 
428   /** added by  Di - returns the next safe ID for an entity mention */
429  private int getNextMentionId(){
430     return mentionId ++;
431   }
432 
433 
434   /** This list of strings represents the entities type that will be exported*/
435   private List exportedTypes = null;
436   /** This is the name of the dtd file. If it's not present no dtd would be
437     * written in the APF file.
438     */
439   private String dtdFileName = null;
440   /** This field represent the document id and it is used in generating the
441     * entities IDs. It is the file name of the document, without the extension
442     */
443   private String docId = null;
444 
445   /** This field represent an unique entity ID generator*/
446   private int entityId = 1;
447 
448     /** added by Di - this field represents a unique entity ID generator */
449     private int mentionId = 1;
450 
451   /** This is the xmlDoc that will be created*/
452   private StringBuffer xmlDoc = null;
453 
454   private URL exportFilePath = null;
455 
456   /** The source attribute for source*/
457   private String source = null;
458 
459   /** The source attribute for source*/
460   private boolean isSourceWritten = true;
461 
462 
463 }// APFormatExporter
464