1   /*
2    *  CorpusBenchmarkTool.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 24/Oct/2001
12   *
13   *  $Id: CorpusBenchmarkTool.java,v 1.59 2006/04/05 14:57:47 ian_roberts Exp $
14   */
15  
16  package gate.util;
17  
18  import java.io.*;
19  import java.util.*;
20  
21  import gate.*;
22  import gate.util.AnnotationDiffer;
23  import gate.creole.*;
24  import gate.persist.PersistenceException;
25  import gate.persist.SerialDataStore;
26  
27  public class CorpusBenchmarkTool {
28    private static final String MARKED_DIR_NAME = "marked";
29    private static final String CLEAN_DIR_NAME = "clean";
30    private static final String CVS_DIR_NAME = "Cvs";
31    private static final String PROCESSED_DIR_NAME = "processed";
32    private static final String ERROR_DIR_NAME = "err";
33  
34    private static final boolean DEBUG = true;
35  
36    public CorpusBenchmarkTool() {}
37  
38    public void initPRs() {
39      try {
40        if (applicationFile == null)
41          Out.prln("Application not set!");
42        Out.prln("App file is: " + applicationFile.getAbsolutePath());
43        application = (Controller) gate.util.persistence.PersistenceManager
44                      .loadObjectFromFile(applicationFile);
45      }
46      catch (Exception ex) {
47        throw (GateRuntimeException)
48          new GateRuntimeException("Corpus Benchmark Tool:" + ex.getMessage())
49          .initCause(ex);
50      }
51    } //initPRs
52  
53    public void unloadPRs() {
54      //we have nothing to unload if no PRs are loaded
55      if (isMarkedStored)
56        return;
57  
58    }
59  
60    public void execute() {
61      execute(startDir);
62      if (application != null) {
63        javax.swing.SwingUtilities.invokeLater(new Runnable() {
64          public void run() {
65  
66            Iterator iter = new ArrayList(application.getPRs()).iterator();
67            while (iter.hasNext())
68              Factory.deleteResource( (Resource) iter.next());
69  
70            Factory.deleteResource(application);
71          }
72        });
73      }
74    }
75  
76    public void init() {
77      //first read the corpus_tool.properties file
78      File propFile = new File("corpus_tool.properties");
79      Out.prln(propFile.getAbsolutePath());
80      if (propFile.exists()) {
81        try {
82          InputStream inputStream = new FileInputStream(propFile);
83          this.configs.load(inputStream);
84          String thresholdString = this.configs.getProperty("threshold");
85          if (thresholdString != null && !thresholdString.equals("")) {
86            thresholdString=thresholdString.trim();
87            this.threshold = (new Double(thresholdString)).doubleValue();
88            Out.prln("New threshold is: " + this.threshold + "<P>\n");
89          }
90          String setName = this.configs.getProperty("annotSetName");
91          if (setName != null && !setName.equals("")) {
92            setName=setName.trim();
93            Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
94            this.annotSetName = setName;
95          }
96          setName = this.configs.getProperty("outputSetName");
97          if (setName != null && !setName.equals("")) {
98            setName=setName.trim();
99            Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
100           this.outputSetName = setName;
101         }
102         String encodingString = this.configs.getProperty("encoding");
103         if (encodingString != null && !encodingString.equals("")) {
104           encodingString=encodingString.trim();
105           this.documentEncoding = encodingString;
106           Out.prln("New encoding is: " + this.documentEncoding + "<P>\n");
107         }
108         String types = this.configs.getProperty("annotTypes");
109         if (types != null && !types.equals("")) {
110           types=types.trim();
111           Out.prln("Using annotation types from the properties file. <P>\n");
112           StringTokenizer strTok = new StringTokenizer(types, ";");
113           annotTypes = new ArrayList();
114           while (strTok.hasMoreTokens())
115             annotTypes.add(strTok.nextToken());
116         }
117         else {
118           annotTypes = new ArrayList();
119           annotTypes.add("Organization");
120           annotTypes.add("Person");
121           annotTypes.add("Date");
122           annotTypes.add("Location");
123           annotTypes.add("Address");
124           annotTypes.add("Money");
125           annotTypes.add("Percent");
126           annotTypes.add("GPE");
127           annotTypes.add("Facility");
128         }
129         String features = this.configs.getProperty("annotFeatures");
130         HashSet result = new HashSet();
131         if (features != null && !features.equals("")) {
132           features=features.trim();
133           Out.pr("Using annotation features from the properties file. \n");
134           java.util.StringTokenizer tok =
135               new java.util.StringTokenizer(features, ";");
136           String current;
137           while (tok.hasMoreTokens()) {
138             current = tok.nextToken();
139             result.add(current);
140           } // while
141         }
142         diffFeaturesSet = result;
143         Out.prln("Features: " + diffFeaturesSet + " <P>\n");
144 
145       }
146       catch (IOException ex) {
147         //just ignore the file and go on with the defaults
148         this.configs = new Properties();
149       }
150     }
151     else
152       this.configs = new Properties();
153 
154     //we only initialise the PRs if they are going to be used
155     //for processing unprocessed documents
156     if (!this.isMarkedStored)
157       initPRs();
158 
159   }
160 
161   public void execute(File dir) {
162     if (dir == null)
163       return;
164     //first set the current directory to be the given one
165     currDir = dir;
166 
167     File processedDir = null;
168     File cleanDir = null;
169     File markedDir = null;
170     File errorDir = null;
171 
172     ArrayList subDirs = new ArrayList();
173     File[] dirArray = currDir.listFiles();
174     if (dirArray == null)return;
175     for (int i = 0; i < dirArray.length; i++) {
176       if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
177         continue;
178       if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
179         cleanDir = dirArray[i];
180       else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
181         markedDir = dirArray[i];
182       else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
183         processedDir = dirArray[i];
184       else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
185         errorDir = dirArray[i];
186       else
187         subDirs.add(dirArray[i]);
188     }
189 
190     if (cleanDir == null)return;
191     Out.prln("Processing directory: " + currDir + "<P>");
192 
193     if (this.isGenerateMode)
194       generateCorpus(cleanDir, processedDir);
195     else
196       evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
197 
198       //if no more subdirs left, return
199     if (subDirs.isEmpty())
200       return;
201 
202     //there are more subdirectories to traverse, so iterate through
203     for (int j = 0; j < subDirs.size(); j++)
204       execute( (File) subDirs.get(j));
205 
206   } //execute(dir)
207 
208   public static void main(String[] args) throws GateException {
209     Out.prln("<HTML>");
210     Out.prln("<HEAD>");
211     Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
212     for (int argC = 0; argC < args.length; ++argC)
213       Out.pr(args[argC] + " ");
214     Out.pr(" on " + new Date() + "</TITLE> </HEAD>");
215     Out.prln("<BODY>");
216     Out.prln("Please wait while GATE tools are initialised. <P>");
217     // initialise GATE
218     Gate.init();
219 
220     CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
221 
222     List inputFiles = null;
223     if (args.length < 1)throw new GateException(usage);
224     int i = 0;
225     while (i < args.length && args[i].startsWith("-")) {
226       if (args[i].equals("-generate")) {
227         Out.prln("Generating the corpus... <P>");
228         corpusTool.setGenerateMode(true);
229       }
230       else if (args[i].equals("-marked_clean")) {
231         Out.prln("Evaluating current grammars against human-annotated...<P>");
232         corpusTool.setMarkedClean(true);
233       }
234       else if (args[i].equals("-marked_stored")) {
235         Out.prln("Evaluating stored documents against human-annotated...<P>");
236         corpusTool.setMarkedStored(true);
237       }
238       else if (args[i].equals("-marked_ds")) {
239         Out.prln("Looking for marked docs in a datastore...<P>");
240         corpusTool.setMarkedDS(true);
241       }
242       else if (args[i].equals("-verbose")) {
243         Out.prln("Running in verbose mode. Will generate annotation " +
244                  "information when precision/recall are lower than " +
245                  corpusTool.getThreshold() + "<P>");
246         corpusTool.setVerboseMode(true);
247       }
248       else if (args[i].equals("-moreinfo")) {
249         Out.prln("Show more details in document table...<P>");
250         corpusTool.setMoreInfo(true);
251       }
252       i++; //just ignore the option, which we do not recognise
253     } //while
254 
255     String dirName = args[i];
256     File dir = new File(dirName);
257     if (!dir.isDirectory())
258       throw new GateException(usage);
259 
260     //get the last argument which is the application
261     i++;
262     String appName = args[i];
263     File appFile = new File(appName);
264     if (!appFile.isFile())
265       throw new GateException(usage);
266     else
267       corpusTool.setApplicationFile(appFile);
268 
269     corpusTool.init();
270     corpusWordCount = 0;
271 
272     Out.prln("Measuring annotaitions of types: " +
273              CorpusBenchmarkTool.annotTypes + "<P>");
274 
275     corpusTool.setStartDirectory(dir);
276     corpusTool.execute();
277     //if we're not generating the corpus, then print the precision and recall
278     //statistics for the processed corpus
279     if (!corpusTool.getGenerateMode())
280       corpusTool.printStatistics();
281 
282     Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
283     Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
284     Out.prln("<BR>Overall average fMeasure: " + corpusTool.getFMeasureAverage());
285     if (corpusWordCount == 0)
286       Out.prln("<BR>No Token annotations to count words in the corpus.");
287     else
288       Out.prln("<BR>Overall word count: " + corpusWordCount);
289 
290     if (hasProcessed) {
291       Out.prln("<P>Old Processed: ");
292       Out.prln("<BR>Overall average precision: "
293                + corpusTool.getPrecisionAverageProc());
294       Out.prln("<BR>Overall average recall: "
295                + corpusTool.getRecallAverageProc());
296       Out.prln("<BR>Overall average fMeasure: "
297                + corpusTool.getFMeasureAverageProc());
298     }
299     Out.prln("<BR>Finished! <P>");
300     Out.prln("</BODY>");
301     Out.prln("</HTML>");
302 
303     System.exit(0);
304 
305   } //main
306 
307   public void setGenerateMode(boolean mode) {
308     isGenerateMode = mode;
309   } //setGenerateMode
310 
311   public boolean getGenerateMode() {
312     return isGenerateMode;
313   } //getGenerateMode
314 
315   public boolean getVerboseMode() {
316     return isVerboseMode;
317   } //getVerboseMode
318 
319   public void setVerboseMode(boolean mode) {
320     isVerboseMode = mode;
321   } //setVerboseMode
322 
323   public void setMoreInfo(boolean mode) {
324     isMoreInfoMode = mode;
325   } // setMoreInfo
326 
327   public boolean getMoreInfo() {
328     return isMoreInfoMode;
329   } // getMoreInfo
330 
331   public void setDiffFeaturesList(Set features) {
332     diffFeaturesSet = features;
333   } // setDiffFeaturesList
334 
335   public Set getDiffFeaturesList() {
336     return diffFeaturesSet;
337   } // getDiffFeaturesList
338 
339   public void setMarkedStored(boolean mode) {
340     isMarkedStored = mode;
341   } // setMarkedStored
342 
343   public boolean getMarkedStored() {
344     return isMarkedStored;
345   } // getMarkedStored
346 
347   public void setMarkedClean(boolean mode) {
348     isMarkedClean = mode;
349   } //
350 
351   public boolean getMarkedClean() {
352     return isMarkedClean;
353   } //
354 
355   public void setMarkedDS(boolean mode) {
356     isMarkedDS = mode;
357   } //
358 
359   public boolean getMarkedDS() {
360     return isMarkedDS;
361   } //
362 
363   public void setApplicationFile(File newAppFile) {
364     applicationFile = newAppFile;
365   }
366 
367   /**
368    * Returns the average precision over the entire set of processed documents.
369    * <P>
370    * If the tool has been evaluating the original documents against the
371    * previously-stored automatically annotated ones, then the precision
372    * will be the average precision on those two sets. <P>
373    * If the tool was run in -marked mode, i.e., was evaluating the stored
374    * automatically processed ones against the human-annotated ones, then
375    * the precision will be the average precision on those two sets of documents.
376    */
377   public double getPrecisionAverage() {
378     return (double) precisionSum / docNumber;
379   }
380 
381   /**
382    * Returns the average recall over the entire set of processed documents.
383    * <P>
384    * If the tool has been evaluating the original documents against the
385    * previously-stored automatically annotated ones, then the recall
386    * will be the average recall on those two sets. <P>
387    * If the tool was run in -marked mode, i.e., was evaluating the stored
388    * automatically processed ones against the human-annotated ones, then
389    * the recall will be the average recall on those two sets of documents.
390    */
391   public double getRecallAverage() {
392     return (double) recallSum / docNumber;
393   }
394 
395   public double getFMeasureAverage() {
396     return (double) fMeasureSum / docNumber;
397   }
398 
399   /** For processed documents */
400   public double getPrecisionAverageProc() {
401     return (double) proc_precisionSum / docNumber;
402   }
403 
404   public double getRecallAverageProc() {
405     return (double) proc_recallSum / docNumber;
406   }
407 
408   public double getFMeasureAverageProc() {
409     return (double) proc_fMeasureSum / docNumber;
410   }
411 
412   public boolean isGenerateMode() {
413     return isGenerateMode == true;
414   } //isGenerateMode
415 
416   public double getThreshold() {
417     return threshold;
418   }
419 
420   public void setThreshold(double newValue) {
421     threshold = newValue;
422   }
423 
424   public File getStartDirectory() {
425     return startDir;
426   } //getStartDirectory
427 
428   public void setStartDirectory(File dir) {
429     startDir = dir;
430   } //setStartDirectory
431 
432   protected void generateCorpus(File fileDir, File outputDir) {
433     //1. check if we have input files
434     if (fileDir == null)
435       return;
436     //2. create the output directory or clean it up if needed
437     File outDir = outputDir;
438     if (outputDir == null) {
439       outDir = new File(currDir, PROCESSED_DIR_NAME);
440     }
441     else {
442       // get rid of the directory, coz datastore wants it clean
443       if (!Files.rmdir(outDir))
444         Out.prln("cannot delete old output directory: " + outDir);
445     }
446     outDir.mkdir();
447 
448     //create the datastore and process each document
449     try {
450       SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
451       sds.create();
452       sds.open();
453 
454       File[] files = fileDir.listFiles();
455       for (int i = 0; i < files.length; i++) {
456         if (!files[i].isFile())
457           continue;
458         // create a document
459         Out.prln("Processing and storing document: " + files[i].toURL() + "<P>");
460 
461         FeatureMap params = Factory.newFeatureMap();
462         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
463         params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
464 
465         FeatureMap features = Factory.newFeatureMap();
466 //        Gate.setHiddenAttribute(features, true);
467 
468         // create the document
469         final Document doc = (Document) Factory.createResource(
470             "gate.corpora.DocumentImpl", params, features
471             );
472 
473         doc.setName(files[i].getName());
474         if (doc == null)
475           continue;
476         processDocument(doc);
477         final LanguageResource lr = sds.adopt(doc, null);
478         sds.sync(lr);
479         javax.swing.SwingUtilities.invokeLater(new Runnable() {
480           public void run() {
481             Factory.deleteResource(doc);
482             Factory.deleteResource(lr);
483           }
484         });
485       } //for
486       sds.close();
487     }
488     catch (java.net.MalformedURLException ex) {
489       throw (GateRuntimeException)
490         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
491         .initCause(ex);
492     }
493     catch (PersistenceException ex1) {
494       throw (GateRuntimeException)
495         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
496         .initCause(ex1);
497     }
498     catch (ResourceInstantiationException ex2) {
499       throw (GateRuntimeException)
500         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
501         .initCause(ex2);
502     }
503     catch (gate.security.SecurityException ex3) {
504       throw (GateRuntimeException)
505         new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage())
506         .initCause(ex3);
507     }
508   } //generateCorpus
509 
510   protected void evaluateCorpus(File fileDir,
511                                 File processedDir, File markedDir,
512                                 File errorDir) {
513     //1. check if we have input files and the processed Dir
514     if (fileDir == null || !fileDir.exists())
515       return;
516     if (processedDir == null || !processedDir.exists())
517 
518       //if the user wants evaluation of marked and stored that's not possible
519       if (isMarkedStored) {
520         Out.prln("Cannot evaluate because no processed documents exist.");
521         return;
522       }
523       else
524         isMarkedClean = true;
525 
526         // create the error directory or clean it up if needed
527     File errDir = null;
528     if (isMoreInfoMode) {
529       errDir = errorDir;
530       if (errDir == null) {
531         errDir = new File(currDir, ERROR_DIR_NAME);
532       }
533       else {
534         // get rid of the directory, coz we wants it clean
535         if (!Files.rmdir(errDir))
536           Out.prln("cannot delete old error directory: " + errDir);
537       }
538       Out.prln("Create error directory: " + errDir + "<BR><BR>");
539       errDir.mkdir();
540     }
541 
542     //looked for marked texts only if the directory exists
543     boolean processMarked = markedDir != null && markedDir.exists();
544     if (!processMarked && (isMarkedStored || isMarkedClean)) {
545       Out.prln("Cannot evaluate because no human-annotated documents exist.");
546       return;
547     }
548 
549     if (isMarkedStored) {
550       evaluateMarkedStored(markedDir, processedDir, errDir);
551       return;
552     }
553     else if (isMarkedClean) {
554       evaluateMarkedClean(markedDir, fileDir, errDir);
555       return;
556     }
557 
558     Document persDoc = null;
559     Document cleanDoc = null;
560     Document markedDoc = null;
561 
562     //open the datastore and process each document
563     try {
564       //open the data store
565       DataStore sds = Factory.openDataStore
566                       ("gate.persist.SerialDataStore",
567                        processedDir.toURL().toExternalForm());
568 
569       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
570       for (int i = 0; i < lrIDs.size(); i++) {
571         String docID = (String) lrIDs.get(i);
572 
573         //read the stored document
574         FeatureMap features = Factory.newFeatureMap();
575         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
576         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
577         FeatureMap hparams = Factory.newFeatureMap();
578 //        Gate.setHiddenAttribute(hparams, true);
579 
580         persDoc = (Document) Factory.createResource(
581             "gate.corpora.DocumentImpl",
582             features, hparams);
583 
584         if (isMoreInfoMode) {
585           StringBuffer errName = new StringBuffer(persDoc.getName());
586           errName.replace(
587               persDoc.getName().lastIndexOf("."),
588               persDoc.getName().length(),
589               ".err");
590           Out.prln("<H2>" +
591                    "<a href=\"err/" + errName.toString() + "\">"
592                    + persDoc.getName() + "</a>" + "</H2>");
593         }
594         else
595           Out.prln("<H2>" + persDoc.getName() + "</H2>");
596 
597         File cleanDocFile = new File(fileDir, persDoc.getName());
598         //try reading the original document from clean
599         if (!cleanDocFile.exists()) {
600           Out.prln("Warning: Cannot find original document " +
601                    persDoc.getName() + " in " + fileDir);
602         }
603         else {
604           FeatureMap params = Factory.newFeatureMap();
605           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
606           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
607                      documentEncoding);
608 
609           // create the document
610           cleanDoc = (Document) Factory.createResource(
611               "gate.corpora.DocumentImpl", params, hparams);
612           cleanDoc.setName(persDoc.getName());
613         }
614 
615         //try finding the marked document
616         StringBuffer docName = new StringBuffer(persDoc.getName());
617         if (!isMarkedDS) {
618           docName.replace(
619               persDoc.getName().lastIndexOf("."),
620               docName.length(),
621               ".xml");
622           File markedDocFile = new File(markedDir, docName.toString());
623           if (!processMarked || !markedDocFile.exists()) {
624             Out.prln("Warning: Cannot find human-annotated document " +
625                      markedDocFile + " in " + markedDir);
626           }
627           else {
628             FeatureMap params = Factory.newFeatureMap();
629             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
630                        markedDocFile.toURL());
631             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
632                        documentEncoding);
633 
634             // create the document
635             markedDoc = (Document) Factory.createResource(
636                 "gate.corpora.DocumentImpl", params, hparams);
637             markedDoc.setName(persDoc.getName());
638           }
639         }
640         else {
641           //open marked from a DS
642           //open the data store
643           DataStore sds1 = Factory.openDataStore
644                            ("gate.persist.SerialDataStore",
645                             markedDir.toURL().toExternalForm());
646 
647           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
648           boolean found = false;
649           int k = 0;
650           //search for the marked doc with the same name
651           while (k < lrIDs1.size() && !found) {
652             String docID1 = (String) lrIDs1.get(k);
653 
654             //read the stored document
655             FeatureMap features1 = Factory.newFeatureMap();
656             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
657             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
658             Document tempDoc = (Document) Factory.createResource(
659                 "gate.corpora.DocumentImpl",
660                 features1, hparams);
661             //check whether this is our doc
662             if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
663                 endsWith(persDoc.getName())) {
664               found = true;
665               markedDoc = tempDoc;
666             }
667             else k++;
668           }
669         }
670 
671         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
672 
673         if (persDoc != null) {
674           final gate.Document pd = persDoc;
675           javax.swing.SwingUtilities.invokeLater(new Runnable() {
676             public void run() {
677               Factory.deleteResource(pd);
678             }
679           });
680         }
681         if (cleanDoc != null) {
682           final gate.Document cd = cleanDoc;
683           javax.swing.SwingUtilities.invokeLater(new Runnable() {
684             public void run() {
685               Factory.deleteResource(cd);
686             }
687           });
688         }
689         if (markedDoc != null) {
690           final gate.Document md = markedDoc;
691           javax.swing.SwingUtilities.invokeLater(new Runnable() {
692             public void run() {
693               Factory.deleteResource(md);
694             }
695           });
696         }
697 
698       } //for loop through saved docs
699       sds.close();
700     }
701     catch (java.net.MalformedURLException ex) {
702       throw (GateRuntimeException)
703         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
704         .initCause(ex);
705     }
706     catch (PersistenceException ex1) {
707       throw (GateRuntimeException)
708         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
709         .initCause(ex1);
710     }
711     catch (ResourceInstantiationException ex2) {
712       throw (GateRuntimeException)
713         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
714         .initCause(ex2);
715     }
716 
717   } //evaluateCorpus
718 
719   protected void evaluateMarkedStored(File markedDir, File storedDir,
720                                       File errDir) {
721     Document persDoc = null;
722     Document cleanDoc = null;
723     Document markedDoc = null;
724 
725     //open the datastore and process each document
726     try {
727       //open the data store
728       DataStore sds = Factory.openDataStore
729                       ("gate.persist.SerialDataStore",
730                        storedDir.toURL().toExternalForm());
731 
732       List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
733       for (int i = 0; i < lrIDs.size(); i++) {
734         String docID = (String) lrIDs.get(i);
735 
736         //read the stored document
737         FeatureMap features = Factory.newFeatureMap();
738         features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
739         features.put(DataStore.LR_ID_FEATURE_NAME, docID);
740 
741         FeatureMap hparams = Factory.newFeatureMap();
742 //        Gate.setHiddenAttribute(hparams, true);
743 
744         persDoc = (Document) Factory.createResource(
745             "gate.corpora.DocumentImpl",
746             features, hparams);
747 
748         if (isMoreInfoMode) {
749           StringBuffer errName = new StringBuffer(persDoc.getName());
750           errName.replace(
751               persDoc.getName().lastIndexOf("."),
752               persDoc.getName().length(),
753               ".err");
754           Out.prln("<H2>" +
755                    "<a href=\"err/" + errName.toString() + "\">"
756                    + persDoc.getName() + "</a>" + "</H2>");
757         }
758         else
759           Out.prln("<H2>" + persDoc.getName() + "</H2>");
760 
761         if (!this.isMarkedDS) { //try finding the marked document as file
762           StringBuffer docName = new StringBuffer(persDoc.getName());
763           docName.replace(
764               persDoc.getName().lastIndexOf("."),
765               docName.length(),
766               ".xml");
767           File markedDocFile = new File(markedDir, docName.toString());
768           if (!markedDocFile.exists()) {
769             Out.prln("Warning: Cannot find human-annotated document " +
770                      markedDocFile + " in " + markedDir);
771           }
772           else {
773             FeatureMap params = Factory.newFeatureMap();
774             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
775                        markedDocFile.toURL());
776             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
777                        documentEncoding);
778 
779             // create the document
780             markedDoc = (Document) Factory.createResource(
781                 "gate.corpora.DocumentImpl", params, hparams);
782             markedDoc.setName(persDoc.getName());
783           } //find marked as file
784         }
785         else {
786           try {
787             //open marked from a DS
788             //open the data store
789             DataStore sds1 = Factory.openDataStore
790                              ("gate.persist.SerialDataStore",
791                               markedDir.toURL().toExternalForm());
792 
793             List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
794             boolean found = false;
795             int k = 0;
796             //search for the marked doc with the same name
797             while (k < lrIDs1.size() && !found) {
798               String docID1 = (String) lrIDs1.get(k);
799 
800               //read the stored document
801               FeatureMap features1 = Factory.newFeatureMap();
802               features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
803               features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
804               Document tempDoc = (Document) Factory.createResource(
805                   "gate.corpora.DocumentImpl",
806                   features1, hparams);
807               //check whether this is our doc
808               if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
809                   endsWith(persDoc.getName())) {
810                 found = true;
811                 markedDoc = tempDoc;
812               }
813               else k++;
814             }
815           }
816           catch (java.net.MalformedURLException ex) {
817             Out.prln("Error finding marked directory " +
818                      markedDir.getAbsolutePath());
819           }
820           catch (gate.persist.PersistenceException ex1) {
821             Out.prln(
822                 "Error opening marked as a datastore (-marked_ds specified)");
823           }
824           catch (gate.creole.ResourceInstantiationException ex2) {
825             Out.prln(
826                 "Error opening marked as a datastore (-marked_ds specified)");
827           }
828         }
829 
830         evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
831         if (persDoc != null) {
832           final gate.Document pd = persDoc;
833           javax.swing.SwingUtilities.invokeLater(new Runnable() {
834             public void run() {
835               Factory.deleteResource(pd);
836             }
837           });
838         }
839         if (markedDoc != null) {
840           final gate.Document md = markedDoc;
841           javax.swing.SwingUtilities.invokeLater(new Runnable() {
842             public void run() {
843               Factory.deleteResource(md);
844             }
845           });
846         }
847 
848       } //for loop through saved docs
849       sds.close();
850 
851     }
852     catch (java.net.MalformedURLException ex) {
853       throw (GateRuntimeException)
854         new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
855         .initCause(ex);
856     }
857     catch (PersistenceException ex1) {
858       throw (GateRuntimeException)
859         new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
860         .initCause(ex1);
861     }
862     catch (ResourceInstantiationException ex2) {
863       throw (GateRuntimeException)
864         new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
865         .initCause(ex2);
866     }
867 
868   } //evaluateMarkedStored
869 
870   protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
871     Document persDoc = null;
872     Document cleanDoc = null;
873     Document markedDoc = null;
874 
875     File[] cleanDocs = cleanDir.listFiles();
876     for (int i = 0; i < cleanDocs.length; i++) {
877       if (!cleanDocs[i].isFile())
878         continue;
879 
880       //try reading the original document from clean
881       FeatureMap params = Factory.newFeatureMap();
882       try {
883         params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
884       }
885       catch (java.net.MalformedURLException ex) {
886         Out.prln("Cannot create document from file: " +
887                  cleanDocs[i].getAbsolutePath());
888         continue;
889       }
890       //params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
891       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
892 
893       FeatureMap hparams = Factory.newFeatureMap();
894 //      Gate.setHiddenAttribute(hparams, true);
895 
896       // create the document
897       try {
898         cleanDoc = (Document) Factory.createResource(
899             "gate.corpora.DocumentImpl", params, hparams, cleanDocs[i].getName());
900       }
901       catch (gate.creole.ResourceInstantiationException ex) {
902         Out.prln("Cannot create document from file: " +
903                  cleanDocs[i].getAbsolutePath());
904         continue;
905       }
906 
907       if (isMoreInfoMode) {
908         StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
909         errName.replace(
910             cleanDocs[i].getName().lastIndexOf("."),
911             cleanDocs[i].getName().length(),
912             ".err");
913         Out.prln("<H2>" +
914                  "<a href=\"err/" + errName.toString() + "\">"
915                  + cleanDocs[i].getName() + "</a>" + "</H2>");
916       }
917       else
918         Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>");
919 
920         //try finding the marked document
921       if (!isMarkedDS) {
922         StringBuffer docName = new StringBuffer(cleanDoc.getName());
923         docName.replace(
924             cleanDoc.getName().lastIndexOf("."),
925             docName.length(),
926             ".xml");
927         File markedDocFile = new File(markedDir, docName.toString());
928         if (!markedDocFile.exists()) {
929           Out.prln("Warning: Cannot find human-annotated document " +
930                    markedDocFile + " in " + markedDir);
931           continue;
932         }
933         else {
934           params = Factory.newFeatureMap();
935           try {
936             params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
937                        markedDocFile.toURL());
938           }
939           catch (java.net.MalformedURLException ex) {
940             Out.prln("Cannot create document from file: " +
941                      markedDocFile.getAbsolutePath());
942             continue;
943           }
944           //params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
945           params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
946 
947           // create the document
948           try {
949             markedDoc = (Document) Factory.createResource(
950                 "gate.corpora.DocumentImpl", params,
951                 hparams, cleanDoc.getName());
952           }
953           catch (gate.creole.ResourceInstantiationException ex) {
954             Out.prln("Cannot create document from file: " +
955                      markedDocFile.getAbsolutePath());
956             continue;
957           }
958 
959         } //if markedDoc exists
960       }
961       else {
962         try {
963           //open marked from a DS
964           //open the data store
965           DataStore sds1 = Factory.openDataStore
966                            ("gate.persist.SerialDataStore",
967                             markedDir.toURL().toExternalForm());
968 
969           List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
970           boolean found = false;
971           int k = 0;
972           //search for the marked doc with the same name
973           while (k < lrIDs1.size() && !found) {
974             String docID1 = (String) lrIDs1.get(k);
975 
976             //read the stored document
977             FeatureMap features1 = Factory.newFeatureMap();
978             features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
979             features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
980             Document tempDoc = (Document) Factory.createResource(
981                 "gate.corpora.DocumentImpl",
982                 features1, hparams);
983             //check whether this is our doc
984             if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
985                 endsWith(cleanDoc.getName())) {
986               found = true;
987               markedDoc = tempDoc;
988             }
989             else k++;
990           }
991         }
992         catch (java.net.MalformedURLException ex) {
993           Out.prln("Error finding marked directory " +
994                    markedDir.getAbsolutePath());
995         }
996         catch (gate.persist.PersistenceException ex1) {
997           Out.prln("Error opening marked as a datastore (-marked_ds specified)");
998         }
999         catch (gate.creole.ResourceInstantiationException ex2) {
1000          Out.prln("Error opening marked as a datastore (-marked_ds specified)");
1001        }
1002      } //if using a DS for marked
1003
1004      try {
1005        evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
1006      }
1007      catch (gate.creole.ResourceInstantiationException ex) {
1008        ex.printStackTrace();
1009        Out.prln("Evaluate failed on document: " + cleanDoc.getName());
1010      }
1011      if (persDoc != null) {
1012        final gate.Document pd = persDoc;
1013        javax.swing.SwingUtilities.invokeLater(new Runnable() {
1014          public void run() {
1015            Factory.deleteResource(pd);
1016          }
1017        });
1018      }
1019      if (cleanDoc != null) {
1020        final gate.Document cd = cleanDoc;
1021        javax.swing.SwingUtilities.invokeLater(new Runnable() {
1022          public void run() {
1023            Factory.deleteResource(cd);
1024          }
1025        });
1026      }
1027      if (markedDoc != null) {
1028        final gate.Document md = markedDoc;
1029        javax.swing.SwingUtilities.invokeLater(new Runnable() {
1030          public void run() {
1031            Factory.deleteResource(md);
1032          }
1033        });
1034      }
1035
1036    } //for loop through clean docs
1037
1038  } //evaluateMarkedClean
1039
1040  protected void processDocument(Document doc) {
1041    try {
1042      if (application instanceof CorpusController) {
1043        Corpus tempCorpus = Factory.newCorpus("temp");
1044        tempCorpus.add(doc);
1045        ( (CorpusController) application).setCorpus(tempCorpus);
1046        application.execute();
1047        Factory.deleteResource(tempCorpus);
1048        tempCorpus = null;
1049      }
1050      else {
1051        Iterator iter = application.getPRs().iterator();
1052        while (iter.hasNext())
1053          ( (ProcessingResource) iter.next()).setParameterValue("document", doc);
1054        application.execute();
1055      }
1056    }
1057    catch (ResourceInstantiationException ex) {
1058      throw (RuntimeException)
1059        new RuntimeException("Error executing application: "
1060                                 + ex.getMessage())
1061        .initCause(ex);
1062    }
1063    catch (ExecutionException ex) {
1064      throw (RuntimeException)
1065        new RuntimeException("Error executing application: "
1066                                 + ex.getMessage())
1067        .initCause(ex);
1068    }
1069  }
1070
1071  protected void evaluateDocuments(Document persDoc,
1072                                   Document cleanDoc, Document markedDoc,
1073                                   File errDir) throws
1074      ResourceInstantiationException {
1075    if (cleanDoc == null && markedDoc == null)
1076      return;
1077
1078    //we've got no types to compare
1079    if (annotTypes == null || annotTypes.isEmpty())
1080      return;
1081
1082    if (cleanDoc != null && !isMarkedStored) {
1083
1084      processDocument(cleanDoc);
1085
1086      int wordCount = countWords(cleanDoc);
1087      if (wordCount == 0)
1088        Out.prln("<BR>No Token annotations to count words in the document.");
1089      else
1090        Out.prln("<BR>Word count: " + wordCount);
1091      corpusWordCount += wordCount;
1092
1093      if (!isMarkedClean)
1094        evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
1095      else
1096        evaluateTwoDocs(markedDoc, cleanDoc, errDir);
1097
1098    }
1099    else
1100      evaluateTwoDocs(markedDoc, persDoc, errDir);
1101
1102  }
1103
1104  /**
1105   * Count all Token.kind=word annotations in the document
1106   */
1107  protected int countWords(Document annotDoc) {
1108    int count = 0;
1109
1110    if (annotDoc == null)return 0;
1111    // check for Token in outputSetName
1112    AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
1113    if (tokens == null)return 0;
1114
1115    Iterator it = tokens.iterator();
1116    Annotation currAnnotation;
1117    while (it.hasNext()) {
1118      currAnnotation = (Annotation) it.next();
1119      Object feature = currAnnotation.getFeatures().get("kind");
1120      if (feature != null && "word".equalsIgnoreCase( (String) feature))++count;
1121    } // while
1122
1123    return count;
1124  }
1125
1126  protected void evaluateAllThree(Document persDoc,
1127                                  Document cleanDoc, Document markedDoc,
1128                                  File errDir) throws
1129      ResourceInstantiationException {
1130    //first start the table and its header
1131    printTableHeader();
1132
1133    // store annotation diff in .err file
1134    Writer errWriter = null;
1135    if (isMoreInfoMode && errDir != null) {
1136      StringBuffer docName = new StringBuffer(cleanDoc.getName());
1137      docName.replace(
1138          cleanDoc.getName().lastIndexOf("."),
1139          docName.length(),
1140          ".err");
1141      File errFile = new File(errDir, docName.toString());
1142      String encoding = ( (gate.corpora.DocumentImpl) cleanDoc).getEncoding();
1143      try {
1144        errWriter = new FileWriter(errFile, false);
1145        /*
1146                 if(encoding == null) {
1147          errWriter = new OutputStreamWriter(
1148              new FileOutputStream(errFile, false));
1149                 } else {
1150          errWriter = new OutputStreamWriter(
1151              new FileOutputStream(errFile, false), encoding);
1152                 }*/
1153      }
1154      catch (Exception ex) {
1155        Out.prln("Exception when creating the error file " + errFile + ": "
1156                 + ex.getMessage());
1157        errWriter = null;
1158      }
1159    }
1160
1161    for (int jj = 0; jj < annotTypes.size(); jj++) {
1162      String annotType = (String) annotTypes.get(jj);
1163
1164      AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1165      //we don't have this annotation type in this document
1166      if (annotDiffer == null)
1167        continue;
1168
1169      //increase the number of processed documents
1170      docNumber++;
1171      //add precison and recall to the sums
1172      updateStatistics(annotDiffer, annotType);
1173
1174      AnnotationDiffer annotDiffer1 =
1175          measureDocs(markedDoc, persDoc, annotType);
1176
1177      Out.prln("<TR>");
1178
1179      if (isMoreInfoMode && annotDiffer1 != null
1180          &&
1181          (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1182           || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1183          )
1184        Out.prln("<TD> " + annotType + "_new" + "</TD>");
1185      else
1186        Out.prln("<TD> " + annotType + "</TD>");
1187
1188      if (isMoreInfoMode) {
1189        if (annotDiffer1 != null) updateStatisticsProc(annotDiffer1, annotType);
1190
1191        Out.prln("<TD>" + annotDiffer.getCorrectMatches() + "</TD>");
1192        Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() + "</TD>");
1193        Out.prln("<TD>" + annotDiffer.getMissing() + "</TD>");
1194        Out.prln("<TD>" + annotDiffer.getSpurious() + "</TD>");
1195      }
1196
1197      Out.prln("<TD>");
1198
1199      //check the precision first
1200      if (annotDiffer1 != null) {
1201
1202        if (annotDiffer1.getPrecisionAverage()
1203            < annotDiffer.getPrecisionAverage()) {
1204          Out.prln("<P><Font color=blue> ");
1205          Out.prln(annotDiffer.getPrecisionAverage());
1206
1207          if (!isMoreInfoMode) {
1208            Out.pr("<BR>Precision increase on human-marked from ");
1209            Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1210            Out.prln(annotDiffer.getPrecisionAverage());
1211          }
1212          Out.prln(" </Font></P>");
1213        }
1214        else if (annotDiffer1.getPrecisionAverage()
1215                 > annotDiffer.getPrecisionAverage()) {
1216          Out.prln("<P><Font color=red> ");
1217          Out.prln(annotDiffer.getPrecisionAverage());
1218
1219          if (!isMoreInfoMode) {
1220            Out.pr("<BR>Precision decrease on human-marked from ");
1221            Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1222            Out.prln(annotDiffer.getPrecisionAverage());
1223          }
1224          Out.prln(" </Font></P>");
1225        }
1226        else
1227          Out.prln("<P> " + (double) annotDiffer.getPrecisionAverage() +
1228                   " </P>");
1229      }
1230      else
1231        Out.prln("<P> " + annotDiffer.getPrecisionAverage() + " </P>");
1232
1233      Out.prln("</TD>");
1234
1235      Out.prln("<TD>");
1236
1237      //check the recall now
1238      if (annotDiffer1 != null) {
1239
1240        if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1241          Out.prln("<P><Font color=blue> ");
1242          Out.prln(annotDiffer.getRecallAverage());
1243
1244          if (!isMoreInfoMode) {
1245            Out.pr("<BR>Recall increase on human-marked from ");
1246            Out.pr(annotDiffer1.getRecallAverage() + " to ");
1247            Out.prln(annotDiffer.getRecallAverage());
1248          }
1249          Out.prln(" </Font></P>");
1250        }
1251        else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1252          Out.prln("<P><Font color=red> ");
1253          Out.prln(annotDiffer.getRecallAverage());
1254
1255          if (!isMoreInfoMode) {
1256            Out.pr("<BR>Recall decrease on human-marked from ");
1257            Out.pr(annotDiffer1.getRecallAverage() + " to ");
1258            Out.prln(annotDiffer.getRecallAverage());
1259          }
1260          Out.prln(" </Font></P>");
1261        }
1262        else
1263          Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1264      }
1265      else
1266        Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1267
1268      Out.prln("</TD>");
1269
1270      //check the recall now
1271      if (isVerboseMode) {
1272        Out.prln("<TD>");
1273        if (annotDiffer.getRecallAverage() < threshold) {
1274          printAnnotations(annotDiffer, markedDoc, cleanDoc);
1275        }
1276        else {
1277          Out.prln("&nbsp;");
1278        }
1279        Out.prln("</TD>");
1280      }
1281
1282      Out.prln("</TR>");
1283
1284      // show one more table line for processed document
1285      if (isMoreInfoMode && annotDiffer1 != null
1286          &&
1287          (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1288           || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1289          ) {
1290
1291        Out.prln("<TR>");
1292        Out.prln("<TD> " + annotType + "_old" + "</TD>");
1293
1294        Out.prln("<TD>" + annotDiffer1.getCorrectMatches() + "</TD>");
1295        Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() + "</TD>");
1296        Out.prln("<TD>" + annotDiffer1.getMissing() + "</TD>");
1297        Out.prln("<TD>" + annotDiffer1.getSpurious() + "</TD>");
1298
1299        Out.prln("<TD>");
1300        if (annotDiffer1.getPrecisionAverage() <
1301            annotDiffer.getPrecisionAverage())
1302
1303          Out.prln("<P><Font color=blue> " + annotDiffer1.getPrecisionAverage()
1304                   + "</Font></P>");
1305        else if (annotDiffer1.getPrecisionAverage() >
1306                 annotDiffer.getPrecisionAverage())
1307          Out.prln(
1308              "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1309              + " </Font></P>");
1310        else
1311          Out.prln(annotDiffer1.getPrecisionAverage());
1312
1313        Out.prln("</TD>");
1314
1315        Out.prln("<TD>");
1316        if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1317          Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1318                   + " </Font></P>");
1319        else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1320          Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1321                   + " </Font></P>");
1322        else
1323          Out.prln(annotDiffer1.getRecallAverage());
1324
1325        Out.prln("</TD>");
1326
1327        //check the recall now
1328        if (isVerboseMode) {
1329          // create error file and start writing
1330
1331          Out.prln("<TD>");
1332          if (annotDiffer.getRecallAverage() < threshold) {
1333            printAnnotations(annotDiffer, markedDoc, cleanDoc);
1334          }
1335          else {
1336            Out.prln("&nbsp;");
1337          }
1338          Out.prln("</TD>");
1339        }
1340        Out.prln("</TR>");
1341      } // if(isMoreInfoMode && annotDiff1 != null)
1342
1343      if (isMoreInfoMode && errDir != null)
1344        storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1345    } //for loop through annotation types
1346    Out.prln("</TABLE>");
1347
1348    try {
1349      if (errWriter != null)
1350        errWriter.close();
1351    }
1352    catch (Exception ex) {
1353      Out.prln("Exception on close of error file " + errWriter + ": "
1354               + ex.getMessage());
1355    }
1356  } //evaluateAllThree
1357
1358  protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1359                                 File errDir) throws
1360      ResourceInstantiationException {
1361
1362    //first start the table and its header
1363    printTableHeader();
1364
1365    // store annotation diff in .err file
1366    Writer errWriter = null;
1367    if (isMoreInfoMode && errDir != null) {
1368      StringBuffer docName = new StringBuffer(keyDoc.getName());
1369      docName.replace(
1370          keyDoc.getName().lastIndexOf("."),
1371          docName.length(),
1372          ".err");
1373      File errFile = new File(errDir, docName.toString());
1374      String encoding = ( (gate.corpora.DocumentImpl) keyDoc).getEncoding();
1375      try {
1376        errWriter = new FileWriter(errFile, false);
1377        /*
1378                 if(encoding == null) {
1379          errWriter = new OutputStreamWriter(
1380              new FileOutputStream(errFile, false));
1381                 } else {
1382          errWriter = new OutputStreamWriter(
1383              new FileOutputStream(errFile, false), encoding);
1384                 }*/
1385      }
1386      catch (Exception ex) {
1387        Out.prln("Exception when creating the error file " + errFile + ": "
1388                 + ex.getMessage());
1389        errWriter = null;
1390      }
1391    }
1392
1393    for (int jj = 0; jj < annotTypes.size(); jj++) {
1394      String annotType = (String) annotTypes.get(jj);
1395
1396      AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1397      //we don't have this annotation type in this document
1398      if (annotDiff == null)
1399        continue;
1400
1401      //increase the number of processed documents
1402      docNumber++;
1403      //add precison and recall to the sums
1404      updateStatistics(annotDiff, annotType);
1405
1406      Out.prln("<TR>");
1407      Out.prln("<TD>" + annotType + "</TD>");
1408
1409      if (isMoreInfoMode) {
1410        Out.prln("<TD>" + annotDiff.getCorrectMatches() + "</TD>");
1411        Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() + "</TD>");
1412        Out.prln("<TD>" + annotDiff.getMissing() + "</TD>");
1413        Out.prln("<TD>" + annotDiff.getSpurious() + "</TD>");
1414      }
1415
1416      Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
1417      Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
1418      //check the recall now
1419      if (isVerboseMode) {
1420        Out.prln("<TD>");
1421        if (annotDiff.getRecallAverage() < threshold) {
1422          printAnnotations(annotDiff, keyDoc, respDoc);
1423        }
1424        else {
1425          Out.prln("&nbsp;");
1426        }
1427        Out.prln("</TD>");
1428      }
1429      Out.prln("</TR>");
1430
1431      if (isMoreInfoMode && errDir != null)
1432        storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1433    } //for loop through annotation types
1434    Out.prln("</TABLE>");
1435
1436    try {
1437      if (errWriter != null)
1438        errWriter.close();
1439    }
1440    catch (Exception ex) {
1441      Out.prln("Exception on close of error file " + errWriter + ": "
1442               + ex.getMessage());
1443    }
1444  } //evaluateTwoDocs
1445
1446  protected void printTableHeader() {
1447    Out.prln("<TABLE BORDER=1");
1448    Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1449
1450    if (isMoreInfoMode)
1451      Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1452             + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1453
1454    Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1455
1456    if (isVerboseMode)
1457      Out.pr("<TD><B>Annotations</B></TD>");
1458
1459    Out.prln("</TR>");
1460  }
1461
1462  protected void updateStatistics(AnnotationDiffer annotDiffer,
1463                                  String annotType) {
1464    double precisionAverage = ( (double) ( (double) annotDiffer.
1465                                          getPrecisionLenient() +
1466                                          annotDiffer.getPrecisionStrict()) /
1467                               (double) (2.0));
1468    if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1469    precisionSum += precisionAverage;
1470
1471    double recallAverage = ( (double) (annotDiffer.getRecallLenient() +
1472                                       annotDiffer.getRecallStrict()) /
1473                            (double) (2.0));
1474    if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1475    recallSum += recallAverage;
1476
1477    double fMeasureAverage = ( (double) (annotDiffer.getFMeasureLenient(1.0) +
1478                                         annotDiffer.getFMeasureStrict(1.0)) /
1479                              (double) (2.0));
1480    if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1481    fMeasureSum += fMeasureAverage;
1482
1483    Double oldPrecision = (Double) precisionByType.get(annotType);
1484    if (oldPrecision == null)
1485      precisionByType.put(annotType, new Double(precisionAverage));
1486    else
1487      precisionByType.put(annotType,
1488                          new Double(oldPrecision.doubleValue() + precisionAverage));
1489
1490    Integer precCount = (Integer) prCountByType.get(annotType);
1491    if (precCount == null)
1492      prCountByType.put(annotType, new Integer(1));
1493    else
1494      prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1495
1496    Double oldFMeasure = (Double) fMeasureByType.get(annotType);
1497    if (oldFMeasure == null)
1498      fMeasureByType.put(annotType, new Double(fMeasureAverage));
1499    else
1500      fMeasureByType.put(annotType,
1501                         new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1502
1503    Integer fCount = (Integer) fMeasureCountByType.get(annotType);
1504    if (fCount == null)
1505      fMeasureCountByType.put(annotType, new Integer(1));
1506    else
1507      fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1508
1509    Double oldRecall = (Double) recallByType.get(annotType);
1510    if (oldRecall == null)
1511      recallByType.put(annotType, new Double(recallAverage));
1512    else
1513      recallByType.put(annotType,
1514                       new Double(oldRecall.doubleValue() + recallAverage));
1515
1516    Integer recCount = (Integer) recCountByType.get(annotType);
1517    if (recCount == null)
1518      recCountByType.put(annotType, new Integer(1));
1519    else
1520      recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1521
1522      //Update the missing, spurious, correct, and partial counts
1523    Long oldMissingNo = (Long) missingByType.get(annotType);
1524    if (oldMissingNo == null)
1525      missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1526    else
1527      missingByType.put(annotType,
1528                        new Long(oldMissingNo.longValue() +
1529                                 annotDiffer.getMissing()));
1530
1531    Long oldCorrectNo = (Long) correctByType.get(annotType);
1532    if (oldCorrectNo == null)
1533      correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1534    else
1535      correctByType.put(annotType,
1536                        new Long(oldCorrectNo.longValue() +
1537                                 annotDiffer.getCorrectMatches()));
1538
1539    Long oldPartialNo = (Long) partialByType.get(annotType);
1540    if (oldPartialNo == null)
1541      partialByType.put(annotType,
1542                        new Long(annotDiffer.getPartiallyCorrectMatches()));
1543    else
1544      partialByType.put(annotType,
1545                        new Long(oldPartialNo.longValue() +
1546                                 annotDiffer.getPartiallyCorrectMatches()));
1547
1548    Long oldSpuriousNo = (Long) spurByType.get(annotType);
1549    if (oldSpuriousNo == null)
1550      spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1551    else
1552      spurByType.put(annotType,
1553                     new Long(oldSpuriousNo.longValue() +
1554                              annotDiffer.getSpurious()));
1555  }
1556
1557  /**
1558   * Update statistics for processed documents
1559   * The same procedure as updateStatistics with different hashTables
1560   */
1561  protected void updateStatisticsProc(AnnotationDiffer annotDiffer,
1562                                      String annotType) {
1563    hasProcessed = true;
1564    double precisionAverage = ( (double) (annotDiffer.getPrecisionLenient() +
1565                                          annotDiffer.getPrecisionStrict()) /
1566                               (double) (2.0));
1567    if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1568    proc_precisionSum += precisionAverage;
1569
1570    double recallAverage = ( (double) (annotDiffer.getRecallLenient() +
1571                                       annotDiffer.getRecallStrict()) /
1572                            (double) (2.0));
1573    if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1574    proc_recallSum += recallAverage;
1575
1576    double fMeasureAverage = ( (double) (annotDiffer.getFMeasureLenient(1.0) +
1577                                         annotDiffer.getFMeasureStrict(1.0)) /
1578                              (double) (2.0));
1579    if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1580    proc_fMeasureSum += fMeasureAverage;
1581
1582    Double oldPrecision = (Double) proc_precisionByType.get(annotType);
1583    if (oldPrecision == null)
1584      proc_precisionByType.put(annotType, new Double(precisionAverage));
1585    else
1586      proc_precisionByType.put(annotType,
1587                               new Double(oldPrecision.doubleValue() +
1588                                          precisionAverage));
1589    Integer precCount = (Integer) proc_prCountByType.get(annotType);
1590    if (precCount == null)
1591      proc_prCountByType.put(annotType, new Integer(1));
1592    else
1593      proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1594
1595    Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType);
1596    if (oldFMeasure == null)
1597      proc_fMeasureByType.put(annotType,
1598                              new Double(fMeasureAverage));
1599    else
1600      proc_fMeasureByType.put(annotType,
1601                              new Double(oldFMeasure.doubleValue() +
1602                                         fMeasureAverage));
1603    Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType);
1604    if (fCount == null)
1605      proc_fMeasureCountByType.put(annotType, new Integer(1));
1606    else
1607      proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1608
1609    Double oldRecall = (Double) proc_recallByType.get(annotType);
1610    if (oldRecall == null)
1611      proc_recallByType.put(annotType,
1612                            new Double(recallAverage));
1613    else
1614      proc_recallByType.put(annotType,
1615                            new Double(oldRecall.doubleValue() +
1616                                       recallAverage));
1617    Integer recCount = (Integer) proc_recCountByType.get(annotType);
1618    if (recCount == null)
1619      proc_recCountByType.put(annotType, new Integer(1));
1620    else
1621      proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1622
1623      //Update the missing, spurious, correct, and partial counts
1624    Long oldMissingNo = (Long) proc_missingByType.get(annotType);
1625    if (oldMissingNo == null)
1626      proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1627    else
1628      proc_missingByType.put(annotType,
1629                             new Long(oldMissingNo.longValue() +
1630                                      annotDiffer.getMissing()));
1631
1632    Long oldCorrectNo = (Long) proc_correctByType.get(annotType);
1633    if (oldCorrectNo == null)
1634      proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1635    else
1636      proc_correctByType.put(annotType,
1637                             new Long(oldCorrectNo.longValue() +
1638                                      annotDiffer.getCorrectMatches()));
1639
1640    Long oldPartialNo = (Long) proc_partialByType.get(annotType);
1641    if (oldPartialNo == null)
1642      proc_partialByType.put(annotType,
1643                             new Long(annotDiffer.getPartiallyCorrectMatches()));
1644    else
1645      proc_partialByType.put(annotType,
1646                             new Long(oldPartialNo.longValue() +
1647                                      annotDiffer.getPartiallyCorrectMatches()));
1648
1649    Long oldSpuriousNo = (Long) proc_spurByType.get(annotType);
1650    if (oldSpuriousNo == null)
1651      proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1652    else
1653      proc_spurByType.put(annotType,
1654                          new Long(oldSpuriousNo.longValue() +
1655                                   annotDiffer.getSpurious()));
1656  }
1657
1658  public void printStatistics() {
1659
1660    Out.prln("<H2> Statistics </H2>");
1661
1662    /*
1663        Out.prln("<H3> Precision </H3>");
1664        if (precisionByType != null && !precisionByType.isEmpty()) {
1665          Iterator iter = precisionByType.keySet().iterator();
1666          while (iter.hasNext()) {
1667            String annotType = (String) iter.next();
1668            Out.prln(annotType + ": "
1669              + ((Double)precisionByType.get(annotType)).doubleValue()
1670                  /
1671                  ((Integer)prCountByType.get(annotType)).intValue()
1672              + "<P>");
1673          }//while
1674        }
1675        Out.prln("Overall precision: " + getPrecisionAverage() + "<P>");
1676
1677        Out.prln("<H3> Recall </H3>");
1678        if (recallByType != null && !recallByType.isEmpty()) {
1679          Iterator iter = recallByType.keySet().iterator();
1680          while (iter.hasNext()) {
1681            String annotType = (String) iter.next();
1682            Out.prln(annotType + ": "
1683              + ((Double)recallByType.get(annotType)).doubleValue()
1684                  /
1685                  ((Integer)recCountByType.get(annotType)).intValue()
1686              + "<P>");
1687          }//while
1688        }
1689
1690        Out.prln("Overall recall: " + getRecallAverage()
1691                 + "<P>");
1692     */
1693    if (annotTypes == null) {
1694      Out.prln("No types given for evaluation, cannot obtain precision/recall");
1695      return;
1696    }
1697    Out.prln("<table border=1>");
1698    Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1699             "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1700             "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1701             "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1702    String annotType;
1703    for (int i = 0; i < annotTypes.size(); i++) {
1704      annotType = (String) annotTypes.get(i);
1705      printStatsForType(annotType);
1706    } //for
1707    Out.prln("</table>");
1708  } // updateStatisticsProc
1709
1710  protected void printStatsForType(String annotType) {
1711    long correct = (correctByType.get(annotType) == null) ? 0 :
1712                   ( (Long) correctByType.get(annotType)).longValue();
1713    long partial = (partialByType.get(annotType) == null) ? 0 :
1714                   ( (Long) partialByType.get(annotType)).longValue();
1715    long spurious = (spurByType.get(annotType) == null) ? 0 :
1716                    ( (Long) spurByType.get(annotType)).longValue();
1717    long missing = (missingByType.get(annotType) == null) ? 0 :
1718                   ( (Long) missingByType.get(annotType)).longValue();
1719    long actual = correct + partial + spurious;
1720    long possible = correct + partial + missing;
1721    //precision strict is correct/actual
1722    //precision is (correct + 0.5 * partially correct)/actual
1723    double precision = 0d;
1724    if (actual!=0)
1725      precision = (correct + 0.5 * partial) / actual;
1726    
1727    //recall strict is correct/possible
1728    double recall = 0d;
1729    if (possible!=0)
1730      recall = (correct + 0.5 * partial) / possible;
1731    
1732    //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1733    double fmeasure = 0d;
1734    if ((beta * beta * precision) + recall !=0){
1735      fmeasure =
1736        ( (beta * beta + 1) * precision * recall)
1737        /
1738        ( (beta * beta * precision) + recall);
1739    }
1740
1741    long proc_correct = 0;
1742    long proc_partial = 0;
1743    long proc_spurious = 0;
1744    long proc_missing = 0;
1745    long proc_actual = 0;
1746    long proc_possible = 0;
1747    double proc_precision = 0;
1748    double proc_recall = 0;
1749    double proc_fmeasure = 0;
1750
1751    if (hasProcessed) {
1752      // calculate values for processed
1753      proc_correct = (proc_correctByType.get(annotType) == null) ? 0 :
1754                     ( (Long) proc_correctByType.get(annotType)).longValue();
1755      proc_partial = (proc_partialByType.get(annotType) == null) ? 0 :
1756                     ( (Long) proc_partialByType.get(annotType)).longValue();
1757      proc_spurious = (proc_spurByType.get(annotType) == null) ? 0 :
1758                      ( (Long) proc_spurByType.get(annotType)).longValue();
1759      proc_missing = (proc_missingByType.get(annotType) == null) ? 0 :
1760                     ( (Long) proc_missingByType.get(annotType)).longValue();
1761      proc_actual = proc_correct + proc_partial + proc_spurious;
1762      proc_possible = proc_correct + proc_partial + proc_missing;
1763      //precision strict is correct/actual
1764      //precision is (correct + 0.5 * partially correct)/actual
1765      proc_precision = (proc_correct + 0.5 * proc_partial) / proc_actual;
1766      //recall strict is correct/possible
1767      proc_recall = (proc_correct + 0.5 * proc_partial) / proc_possible;
1768      //F-measure = ( (beta*beta + 1)*P*R ) / ((beta*beta*P) + R)
1769      proc_fmeasure =
1770          ( (beta * beta + 1) * proc_precision * proc_recall)
1771          /
1772          ( (beta * beta * proc_precision) + proc_recall);
1773
1774    }
1775
1776    // output data
1777    Out.prln("<TR>");
1778    if (hasProcessed)
1779      Out.prln("<TD>" + annotType + "_new" + "</TD>");
1780    else
1781      Out.prln("<TD>" + annotType + "</TD>");
1782
1783    Out.prln("<TD>" + correct + "</TD>");
1784    Out.prln("<TD>" + partial + "</TD>");
1785    Out.prln("<TD>" + missing + "</TD>");
1786    Out.prln("<TD>" + spurious + "</TD>");
1787
1788    String strPrec = (isMoreInfoMode) ?
1789                     avgPrint(precision, 4)
1790                     : Double.toString(precision);
1791    String strRec = (isMoreInfoMode) ?
1792                    avgPrint(recall, 4)
1793                    : Double.toString(recall);
1794    String strFmes = (isMoreInfoMode) ?
1795                     avgPrint(fmeasure, 4)
1796                     : Double.toString(fmeasure);
1797
1798    if (hasProcessed && (precision < proc_precision))
1799      Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1800    else if (hasProcessed && (precision > proc_precision))
1801      Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1802    else
1803      Out.prln("<TD>" + strPrec + "</TD>");
1804    if (hasProcessed && (recall < proc_recall))
1805      Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1806    else if (hasProcessed && (recall > proc_recall))
1807      Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1808    else
1809      Out.prln("<TD>" + strRec + "</TD>");
1810    Out.prln("<TD>" + strFmes + "</TD>");
1811    Out.prln("</TR>");
1812
1813    if (hasProcessed) {
1814      // output data
1815      Out.prln("<TR>");
1816      Out.prln("<TD>" + annotType + "_old" + "</TD>");
1817
1818      Out.prln("<TD>" + proc_correct + "</TD>");
1819      Out.prln("<TD>" + proc_partial + "</TD>");
1820      Out.prln("<TD>" + proc_missing + "</TD>");
1821      Out.prln("<TD>" + proc_spurious + "</TD>");
1822
1823      String strProcPrec = (isMoreInfoMode) ?
1824                           avgPrint(proc_precision, 4)
1825                           : Double.toString(proc_precision);
1826      String strProcRec = (isMoreInfoMode) ?
1827                          avgPrint(proc_recall, 4)
1828                          : Double.toString(proc_recall);
1829      String strProcFmes = (isMoreInfoMode) ?
1830                           avgPrint(proc_fmeasure, 4)
1831                           : Double.toString(proc_fmeasure);
1832
1833      if (precision < proc_precision)
1834        Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1835      else if (precision > proc_precision)
1836        Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1837      else
1838        Out.prln("<TD>" + strProcPrec + "</TD>");
1839      if (recall < proc_recall)
1840        Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1841      else if (recall > proc_recall)
1842        Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1843      else
1844        Out.prln("<TD>" + strProcRec + "</TD>");
1845      Out.prln("<TD>" + strProcFmes + "</TD>");
1846      Out.prln("</TR>");
1847    }
1848  } //printStatsForType
1849
1850  //** Print @param value with @param count digits after decimal point */
1851  protected String avgPrint(double value, int count) {
1852    double newvalue;
1853    double power = Math.pow(10, count);
1854    newvalue = Math.round(value * power) / power;
1855    return Double.toString(newvalue);
1856  }
1857
  // Corpus-wide scores assigned by calculateAvgTotal() from the counts
  // summed over all annotation types. Despite the "Sum" in the names each
  // field holds a single ratio, not a running total. All start at 0.
  private double precisionSumCalc = 0;
  private double recallSumCalc = 0;
  private double fMeasureSumCalc = 0;
1861
1862  public double getPrecisionAverageCalc() {
1863    return precisionSumCalc;
1864  }
1865
1866  public double getRecallAverageCalc() {
1867    return recallSumCalc;
1868  }
1869
1870  public double getFmeasureAverageCalc() {
1871    return fMeasureSumCalc;
1872  }
1873
1874  protected void calculateAvgTotal() {
1875    long correct, partial, spurious, missing;
1876    long correctSum, partialSum, spuriousSum, missingSum;
1877
1878    if (annotTypes == null) {
1879      return;
1880    }
1881    correctSum = partialSum = spuriousSum = missingSum = 0;
1882
1883    String annotType;
1884    for (int i = 0; i < annotTypes.size(); i++) {
1885      annotType = (String) annotTypes.get(i);
1886      correct = (correctByType.get(annotType) == null) ? 0 :
1887                ( (Long) correctByType.get(annotType)).longValue();
1888      partial = (partialByType.get(annotType) == null) ? 0 :
1889                ( (Long) partialByType.get(annotType)).longValue();
1890      spurious = (spurByType.get(annotType) == null) ? 0 :
1891                 ( (Long) spurByType.get(annotType)).longValue();
1892      missing = (missingByType.get(annotType) == null) ? 0 :
1893                ( (Long) missingByType.get(annotType)).longValue();
1894      correctSum += correct;
1895      partialSum += partial;
1896      spuriousSum += spurious;
1897      missingSum += missing;
1898    } //for
1899
1900    long actual = correctSum + partialSum + spuriousSum;
1901    long possible = correctSum + partialSum + missingSum;
1902
1903    if (actual == 0) {
1904      precisionSumCalc = 0;
1905    }
1906    else {
1907      precisionSumCalc = (correctSum + 0.5 * partialSum) / actual;
1908    }
1909
1910    if (possible == 0) {
1911      recallSumCalc = 0;
1912    }
1913    else {
1914      recallSumCalc = (correctSum + 0.5 * partialSum) / actual;
1915    }
1916
1917    if (precisionSumCalc == 0 && recallSumCalc == 0) {
1918      fMeasureSumCalc = 0;
1919    }
1920    else {
1921      fMeasureSumCalc =
1922          ( (beta * beta + 1) * precisionSumCalc * recallSumCalc)
1923          /
1924          ( (beta * beta * precisionSumCalc) + recallSumCalc);
1925
1926    }
1927  } // calculateAvgTotal
1928
1929  protected AnnotationDiffer measureDocs(
1930      Document keyDoc, Document respDoc, String annotType) throws
1931      ResourceInstantiationException {
1932
1933    if (keyDoc == null || respDoc == null)
1934      return null;
1935
1936    if (annotSetName != null
1937        && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1938      return null;
1939    else if ( (annotSetName == null || annotSetName.equals(""))
1940             && keyDoc.getAnnotations().get(annotType) == null)
1941      return null;
1942
1943    // create an annotation diff
1944    AnnotationDiffer annotDiffer = new AnnotationDiffer();
1945    // set the feature names set for annotation differ
1946    annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1947    // we need to find the sets
1948    AnnotationSet keys, responses;
1949    if (annotSetName == null || annotSetName.equals("")) {
1950      keys = keyDoc.getAnnotations().get(annotType);
1951      responses = respDoc.getAnnotations().get(annotType);
1952    }
1953    else {
1954      keys = keyDoc.getAnnotations(annotSetName).get(annotType);
1955      responses = respDoc.getAnnotations(outputSetName).get(annotType);
1956    }
1957
1958    // we have annotation sets so call the annotationDiffer
1959    List pairings = annotDiffer.calculateDiff(keys, responses);
1960    return annotDiffer;
1961  } // measureDocs
1962
1963  protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
1964                                  Document keyDoc, Document respDoc,
1965                                  Writer errFileWriter) {
1966    if (errFileWriter == null)return; // exit on "no file"
1967
1968    try {
1969      // extract and store annotations
1970      Comparator comp = new OffsetComparator();
1971      TreeSet sortedSet = new TreeSet(comp);
1972      Set missingSet =
1973          annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1974      sortedSet.clear();
1975      sortedSet.addAll(missingSet);
1976      storeAnnotations(type + ".miss", sortedSet, keyDoc, errFileWriter);
1977      Set spuriousSet =
1978          annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1979      sortedSet.clear();
1980      sortedSet.addAll(spuriousSet);
1981      storeAnnotations(type + ".spur", sortedSet, respDoc, errFileWriter);
1982      Set partialSet =
1983          annotDiffer.getAnnotationsOfType(AnnotationDiffer.
1984                                           PARTIALLY_CORRECT_TYPE);
1985      sortedSet.clear();
1986      sortedSet.addAll(partialSet);
1987      storeAnnotations(type + ".part", sortedSet, respDoc, errFileWriter);
1988    }
1989    catch (Exception ex) {
1990      Out.prln("Exception on close of error file " + errFileWriter + ": "
1991               + ex.getMessage());
1992    }
1993  } // storeAnnotations
1994
1995  protected void storeAnnotations(String type, Set set, Document doc,
1996                                  Writer file) throws IOException {
1997
1998    if (set == null || set.isEmpty())
1999      return;
2000
2001    Iterator iter = set.iterator();
2002    Annotation ann;
2003    while (iter.hasNext()) {
2004      ann = (Annotation) iter.next();
2005      file.write(type);
2006      file.write(".");
2007      file.write(doc.getContent().toString().substring(
2008          ann.getStartNode().getOffset().intValue(),
2009          ann.getEndNode().getOffset().intValue()));
2010      file.write(".");
2011      file.write(ann.getStartNode().getOffset().toString());
2012      file.write(".");
2013      file.write(ann.getEndNode().getOffset().toString());
2014      file.write("\n");
2015    } //while
2016  } // storeAnnotations
2017
2018  protected void printAnnotations(AnnotationDiffer annotDiff,
2019                                  Document keyDoc, Document respDoc) {
2020    Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
2021    Set missingSet =
2022        annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
2023    printAnnotations(missingSet, keyDoc);
2024    Out.prln("<BR>");
2025
2026    Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
2027    Set spuriousSet =
2028        annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
2029    printAnnotations(spuriousSet, respDoc);
2030    Out.prln("</BR>");
2031
2032    Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
2033    Set partialSet =
2034        annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
2035    printAnnotations(partialSet, respDoc);
2036  }
2037
2038  protected void printAnnotations(Set set, Document doc) {
2039    if (set == null || set.isEmpty())
2040      return;
2041
2042    Iterator iter = set.iterator();
2043    while (iter.hasNext()) {
2044      Annotation ann = (Annotation) iter.next();
2045      Out.prln(
2046          "<B>" +
2047          doc.getContent().toString().substring(
2048          ann.getStartNode().getOffset().intValue(),
2049          ann.getEndNode().getOffset().intValue()) +
2050          "</B>: <I>[" + ann.getStartNode().getOffset() +
2051          "," + ann.getEndNode().getOffset() + "]</I>"
2052//        + "; features" + ann.getFeatures()
2053          );
2054    } //while
2055  } //printAnnotations
2056
2057  /**
2058   * The directory from which we should generate/evaluate the corpus
2059   */
2060  private File startDir;
2061  private File currDir;
2062  private static List annotTypes;
2063
2064  private Controller application = null;
2065  private File applicationFile = null;
2066
2067  //collect the sum of all precisions and recalls of all docs
2068  //and the number of docs, so I can calculate the average for
2069  //the corpus at the end
2070  private double precisionSum = 0.0;
2071  private double recallSum = 0.0;
2072  private double fMeasureSum = 0.0;
2073  private HashMap precisionByType = new HashMap();
2074  private HashMap prCountByType = new HashMap();
2075  private HashMap recallByType = new HashMap();
2076  private HashMap recCountByType = new HashMap();
2077  private HashMap fMeasureByType = new HashMap();
2078  private HashMap fMeasureCountByType = new HashMap();
2079
2080  private HashMap missingByType = new HashMap();
2081  private HashMap spurByType = new HashMap();
2082  private HashMap correctByType = new HashMap();
2083  private HashMap partialByType = new HashMap();
2084
2085  // statistic for processed
2086  static boolean hasProcessed = false;
2087  private double proc_precisionSum = 0;
2088  private double proc_recallSum = 0;
2089  private double proc_fMeasureSum = 0;
2090  private HashMap proc_precisionByType = new HashMap();
2091  private HashMap proc_prCountByType = new HashMap();
2092  private HashMap proc_recallByType = new HashMap();
2093  private HashMap proc_recCountByType = new HashMap();
2094  private HashMap proc_fMeasureByType = new HashMap();
2095  private HashMap proc_fMeasureCountByType = new HashMap();
2096
2097  private HashMap proc_missingByType = new HashMap();
2098  private HashMap proc_spurByType = new HashMap();
2099  private HashMap proc_correctByType = new HashMap();
2100  private HashMap proc_partialByType = new HashMap();
2101
2102  double beta = 1;
2103
2104  private int docNumber = 0;
2105
2106  /**
2107   * If true, the corpus tool will generate the corpus, otherwise it'll
2108   * run in evaluate mode
2109   */
2110  private boolean isGenerateMode = false;
2111
2112  /**
2113   * If true - show annotations for docs below threshold
2114   */
2115  private boolean isVerboseMode = false;
2116
2117  /**
2118   * If true - show more info in document table
2119   */
2120  private boolean isMoreInfoMode = false;
2121
2122  /**
2123   * The list of features used in the AnnotationDiff separated by comma
2124   * Example: "class;inst"
2125   */
2126  private Set diffFeaturesSet;
2127
2128  /**
2129   * If true, the corpus tool will evaluate stored against the human-marked
2130   * documents
2131   */
2132  private boolean isMarkedStored = false;
2133  private boolean isMarkedClean = false;
2134
2135  //whether marked are in a DS, not xml
2136  private boolean isMarkedDS = false;
2137
2138  private String annotSetName = "Key";
2139  private String outputSetName = null;
2140
2141  private double threshold = 0.5;
2142  private Properties configs = new Properties();
2143  private static int corpusWordCount = 0;
2144
2145  private String documentEncoding = "";
2146
2147  /** String to print when wrong command-line args */
2148  private static String usage =
2149      "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
2150      + "[-verbose] [-moreinfo] directory-name application";
2151
2152}
2153