1
15
16 package gate.creole;
17
18 import java.io.*;
19 import java.net.URL;
20 import java.util.*;
21
22 import gate.*;
23 import gate.util.*;
24
25
28 public class APFormatExporter extends AbstractLanguageAnalyser
29 implements ANNIEConstants{
30 public static final String
31 APF_EXP_DOCUMENT_PARAMETER_NAME = "document";
32
33 public static final String
34 APF_EXP_SOURCE_PARAMETER_NAME = "source";
35
36 public static final String
37 APF_EXP_DTD_PARAMETER_NAME = "dtdFileName";
38
39 public static final String
40 APF_EXP_PATH_PARAMETER_NAME = "exportFilePath";
41
42 public static final String
43 APF_EXP_TYPES_PARAMETER_NAME = "exportedTypes";
44
45 public static final String
46 APF_EXP_WRITE_SOURCE_PARAMETER_NAME = "isSourceWritten";
47
48
49 private static final boolean DEBUG = false;
50
51 public APFormatExporter() {}
52
53
54 public void execute() throws ExecutionException{
55 if(document == null)
57 throw new ExecutionException("No document found to export in APF format!");
58 if (exportedTypes == null)
59 throw new ExecutionException("No export types found.");
60 xmlDoc = new StringBuffer(10*(document.getContent().size().intValue()));
61 initDocId();
62 if (docId == null)
63 throw new ExecutionException("Couldn't detect the document's ID");
64 if (DEBUG)
65 Out.prln("Document id = "+ docId);
66
67 String exportFilePathStr = null;
68 if (exportFilePath == null)
69 exportFilePathStr = new String(document.getSourceUrl().getFile() +
70 ".apf.xml");
71 else
72 exportFilePathStr = exportFilePath.getPath()+ "/"
73 + gate.util.Files.getLastPathComponent(
74 document.getSourceUrl().getFile()) + ".apf.xml";
75
76 if (DEBUG)
77 Out.prln("Export file path = "+ exportFilePathStr);
78 OutputStreamWriter writer = null;
81 try{
82 writer = new OutputStreamWriter(
83 new FileOutputStream(new File(exportFilePathStr)));
84
85 serializeDocumentToAPF();
89 writer.write(xmlDoc.toString());
90 writer.flush();
91 writer.close();
92 }catch (Exception e){
93 throw new ExecutionException(e);
94 } }
98
99
100 public Resource init() throws ResourceInstantiationException {
101 return this;
102 }
104
105 public void setExportedTypes(List anExportedTypesList){
106 exportedTypes = anExportedTypesList;
107 }
109
110 public List getExportedTypes(){
111 return exportedTypes;
112 }
114
115 public void setDtdFileName(String aDtdFileName){
116 dtdFileName = aDtdFileName;
117 }
119
120 public String getDtdFileName(){
121 return dtdFileName;
122 }
124
125 public void setExportFilePath(URL anExportFilePath){
126 exportFilePath = anExportFilePath;
127 }
129
130 public URL getExportFilePath(){
131 return exportFilePath;
132 }
134
135 public void setSource(String aSource){
136 source = aSource;
137 }
139
140 public String getSource(){
141 return source;
142 }
144
145 public Boolean getIsSourceWritten() {
146 return new Boolean(isSourceWritten);
147 }
148
149
150 public void setIsSourceWritten(Boolean aIsSourceWritten){
151 isSourceWritten = aIsSourceWritten.booleanValue();
152 }
154
155
156
157 private void initDocId(){
158 String fileName = "";
159 fileName = gate.util.Files.getLastPathComponent(
160 document.getSourceUrl().getFile());
161 if (DEBUG)
163 Out.prln("From initDocId, fileName ="+ fileName);
164 StringTokenizer fileNameTokenizer = new StringTokenizer(fileName,".");
165 StringBuffer tmpDocId = new StringBuffer("");
166 while(fileNameTokenizer.hasMoreTokens()){
167 String token = (String)fileNameTokenizer.nextToken();
168 if (fileNameTokenizer.hasMoreTokens())
170 tmpDocId.append(token + ".");
171 } if (!"".equals(tmpDocId)){
174 tmpDocId.replace(tmpDocId.length()-1,tmpDocId.length(),"");
176 docId = tmpDocId.toString();
177 } }
180
181 protected void serializeDocumentToAPF(){
182 xmlDoc.append("<?xml version=\"1.0\" ?>\n");
183 xmlDoc.append("<!DOCTYPE source_file SYSTEM ");
184 if (dtdFileName == null)
185 xmlDoc.append("\"ace-rdc.v2.0.1.dtd\"");
186 else
187 xmlDoc.append("\""+dtdFileName+"\"");
188 xmlDoc.append(">\n");
189 xmlDoc.append("<source_file TYPE=\"text\"");
190 if (isSourceWritten) {
191 AnnotationSet docTypeAnns = document.getAnnotations(
192 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME).get("DOCTYPE");
193 if (docTypeAnns == null || docTypeAnns.isEmpty())
194 xmlDoc.append(" SOURCE=\""+ source+ "\" ");
195 else {
196 Annotation docTypeAnn = (Annotation) docTypeAnns.iterator().next();
197 if (docTypeAnn.getFeatures().get("SOURCE") == null)
198 xmlDoc.append(" SOURCE=\""+ source+ "\" ");
199 else
200 xmlDoc.append(" SOURCE=\""+ docTypeAnn.getFeatures().get("SOURCE")+ "\" ");
201 } }
203 xmlDoc.append("VERSION=\"2.0\" URI=\"");
204 xmlDoc.append(docId);
205 xmlDoc.append("-lf\">\n");
206 xmlDoc.append(" <document DOCID=\"");
207 xmlDoc.append(docId + "\">\n");
208 serializeEntities();
209 xmlDoc.append(" </document>\n");
210 xmlDoc.append("</source_file>");
211 }
213
216 protected void serializeEntities(){
217 if (exportedTypes == null || exportedTypes.isEmpty()) return;
219
220 Map entitiesMap = null;
221 if ( document.getFeatures() == null ||
222 document.getFeatures().get(DOCUMENT_COREF_FEATURE_NAME)== null)
223 entitiesMap = new HashMap();
224 else
225 entitiesMap = (Map)document.getFeatures().
226 get(DOCUMENT_COREF_FEATURE_NAME);
227 Map namedAnnotSetMap = null;
228 if (document.getNamedAnnotationSets() == null)
229 namedAnnotSetMap = new HashMap();
230 else
231 namedAnnotSetMap = new HashMap(document.getNamedAnnotationSets());
232 namedAnnotSetMap.put(null,document.getAnnotations());
234 Iterator exportedTypesIter = exportedTypes.iterator();
238 while(exportedTypesIter.hasNext()){
239 String entityType = (String)exportedTypesIter.next();
240 Set annotationSetNames = namedAnnotSetMap.keySet();
244 Iterator annotationSetNamesIter = annotationSetNames.iterator();
245 while (annotationSetNamesIter.hasNext()){
246 Object annotSetName = annotationSetNamesIter.next();
247 List entitiesList = (List) entitiesMap.get(annotSetName);
249 if (entitiesList == null) entitiesList = new ArrayList();
250 AnnotationSet annotSet = null;
252 Set serializationAnnotSet = null;
253 annotSet = (AnnotationSet)namedAnnotSetMap.get(annotSetName);
254 if (annotSet == null || annotSet.get(entityType) == null) continue;
255 serializationAnnotSet = new HashSet(annotSet.get(entityType));
256 Iterator entitiesListIter = entitiesList.iterator();
264 while (entitiesListIter.hasNext()){
265 List entity = (List)entitiesListIter.next();
266 String theEntityType = new String("");
269 if (entity != null && !entity.isEmpty()){
270 Integer annotId = (Integer)entity.get(0);
271 Annotation a = (Annotation)annotSet.get(annotId);
272 if (a != null) theEntityType = a.getType();
273 } if (theEntityType.equals(entityType)){
276 List ent = new ArrayList();
277 Iterator entityIter = entity.iterator();
278 while(entityIter.hasNext()){
279 Integer id = (Integer)entityIter.next();
280 ent.add(annotSet.get(id));
281 } serializeAnEntity(ent);
283 serializationAnnotSet.removeAll(ent);
285 } } Iterator serializationAnnotSetIter = serializationAnnotSet.iterator();
289 while(serializationAnnotSetIter.hasNext()){
290 Annotation annotEntity = (Annotation) serializationAnnotSetIter.next();
291 List ent = new ArrayList();
292 ent.add(annotEntity);
293 serializeAnEntity(ent);
294 } } } }
299
304 private void serializeAnEntity(List anEntity){
305 if (anEntity == null || anEntity.isEmpty()) return;
306 xmlDoc.append(" <entity ID=\"" + docId + "-" + getNextEntityId() + "\">\n");
308 Annotation a = (Annotation) anEntity.get(0);
310 xmlDoc.append(" <entity_type GENERIC=\"FALSE\">" + a.getType().toUpperCase() +
311 "</entity_type>\n");
312 Iterator anEntityIter = anEntity.iterator();
314 while(anEntityIter.hasNext()){
315 Annotation ann = (Annotation)anEntityIter.next();
316 serializeAnEntityMention(ann);
317 } xmlDoc.append(" <entity_attributes>\n");
320 anEntityIter = anEntity.iterator();
321 while(anEntityIter.hasNext()){
322 Annotation ann = (Annotation)anEntityIter.next();
323 serializeAnEntityAttributes(ann);
324 } xmlDoc.append(" </entity_attributes>\n");
326 xmlDoc.append(" </entity>\n");
327 }
329
330 private void serializeAnEntityMention(Annotation ann){
331 if (ann == null) return;
332 String entityMentionType = "NAME";
333 String entityMentionRole = null;
334 String entityMentionReference = null;
335 String entityMentionGeneric = null;
336
337 FeatureMap fm = ann.getFeatures();
338 if (fm != null){
339 if( null != fm.get("ENTITY_MENTION_TYPE"))
340 entityMentionType = (String) fm.get("ENTITY_MENTION_TYPE");
341
342 entityMentionRole = (String) fm.get("ROLE");
343 entityMentionReference = (String) fm.get("REFERENCE");
344 entityMentionGeneric = (String) fm.get("GENERIC");
345 } String str1 = (entityMentionRole == null)? "" :
347 ("ROLE=\"" + entityMentionRole + "\"");
348 String str2 = (entityMentionReference == null)? "" :
349 ("REFERENCE=\"" + entityMentionReference + "\"");
350 String str3 = (entityMentionGeneric == null)? "" :
351 ("GENERIC=\"" + entityMentionGeneric + "\"");
352
353
354
355 xmlDoc.append(" <entity_mention TYPE=\"" + entityMentionType+"\"" +
356 str1 + " " + str2 + " " + str3 + "ID=\"" + "M" + getNextMentionId() +"\">\n"
357 );
358
359 xmlDoc.append(" <extent>\n");
361 xmlDoc.append(" <charseq>\n");
362 try{
363 xmlDoc.append(" <!-- string = \"" +
364 document.getContent().getContent(ann.getStartNode().getOffset(),
365 ann.getEndNode().getOffset())+"\" -->\n");
366 }catch (InvalidOffsetException ioe){
367 Err.prln("APFormatExporter:Warning: Couldn't access text between"+
368 " offsets:" + ann.getStartNode().getOffset() + " and "+
369 ann.getEndNode().getOffset());
370 } xmlDoc.append(" <start>"+ann.getStartNode().getOffset()+
372 "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
373 xmlDoc.append(" </charseq>\n");
374 xmlDoc.append(" </extent>\n");
375 xmlDoc.append(" <head>\n");
377 xmlDoc.append(" <charseq>\n");
378 try{
379 xmlDoc.append(" <!-- string = \"" +
380 document.getContent().getContent(ann.getStartNode().getOffset(),
381 ann.getEndNode().getOffset())+"\" -->\n");
382 }catch (InvalidOffsetException ioe){
383 Err.prln("APFormatExporter:Warning: Couldn't access text between"+
384 " offsets:" + ann.getStartNode().getOffset() + " and "+
385 ann.getEndNode().getOffset());
386 } xmlDoc.append(" <start>"+ann.getStartNode().getOffset()+
388 "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
389 xmlDoc.append(" </charseq>\n");
390 xmlDoc.append(" </head>\n");
391 xmlDoc.append(" </entity_mention>\n");
392 }
394
395 private void serializeAnEntityAttributes(Annotation ann){
396 if (ann == null) return;
397 boolean isAttribute = false;
398 if ("NAME".equals(ann.getFeatures().get("ENTITY_MENTION_TYPE"))
399 ||
400 null == ann.getFeatures().get("ENTITY_MENTION_TYPE"))
401 isAttribute = true;
402 if (! isAttribute)
403 return;
404
405 xmlDoc.append(" <name>\n");
407 xmlDoc.append(" <charseq>\n");
408 try{
409 xmlDoc.append(" <!-- string = \"" +
410 document.getContent().getContent(ann.getStartNode().getOffset(),
411 ann.getEndNode().getOffset())+"\" -->\n");
412 }catch (InvalidOffsetException ioe){
413 Err.prln("APFormatExporter:Warning: Couldn't access text between"+
414 " offsets:" + ann.getStartNode().getOffset() + " and "+
415 ann.getEndNode().getOffset());
416 } xmlDoc.append(" <start>"+ann.getStartNode().getOffset()+
418 "</start><end>"+(ann.getEndNode().getOffset().longValue() - 1)+"</end>\n");
419 xmlDoc.append(" </charseq>\n");
420 xmlDoc.append(" </name>\n");
421 }
423
424 private int getNextEntityId(){
425 return entityId ++;
426 }
428
429 private int getNextMentionId(){
430 return mentionId ++;
431 }
432
433
434
435 private List exportedTypes = null;
436
439 private String dtdFileName = null;
440
443 private String docId = null;
444
445
446 private int entityId = 1;
447
448
449 private int mentionId = 1;
450
451
452 private StringBuffer xmlDoc = null;
453
454 private URL exportFilePath = null;
455
456
457 private String source = null;
458
459
460 private boolean isSourceWritten = true;
461
462
463 }