1
15
16 package gate.util;
17
18 import java.io.*;
19 import java.util.*;
20
21 import gate.*;
22 import gate.util.AnnotationDiffer;
23 import gate.creole.*;
24 import gate.persist.PersistenceException;
25 import gate.persist.SerialDataStore;
26
27 public class CorpusBenchmarkTool {
28 private static final String MARKED_DIR_NAME = "marked";
29 private static final String CLEAN_DIR_NAME = "clean";
30 private static final String CVS_DIR_NAME = "Cvs";
31 private static final String PROCESSED_DIR_NAME = "processed";
32 private static final String ERROR_DIR_NAME = "err";
33
34 private static final boolean DEBUG = true;
35
36 public CorpusBenchmarkTool() {}
37
38 public void initPRs() {
39 try {
40 if (applicationFile == null)
41 Out.prln("Application not set!");
42 Out.prln("App file is: " + applicationFile.getAbsolutePath());
43 application = (Controller) gate.util.persistence.PersistenceManager
44 .loadObjectFromFile(applicationFile);
45 }
46 catch (Exception ex) {
47 throw (GateRuntimeException)
48 new GateRuntimeException("Corpus Benchmark Tool:" + ex.getMessage())
49 .initCause(ex);
50 }
51 }
53 public void unloadPRs() {
54 if (isMarkedStored)
56 return;
57
58 }
59
60 public void execute() {
61 execute(startDir);
62 if (application != null) {
63 javax.swing.SwingUtilities.invokeLater(new Runnable() {
64 public void run() {
65
66 Iterator iter = new ArrayList(application.getPRs()).iterator();
67 while (iter.hasNext())
68 Factory.deleteResource( (Resource) iter.next());
69
70 Factory.deleteResource(application);
71 }
72 });
73 }
74 }
75
76 public void init() {
77 File propFile = new File("corpus_tool.properties");
79 Out.prln(propFile.getAbsolutePath());
80 if (propFile.exists()) {
81 try {
82 InputStream inputStream = new FileInputStream(propFile);
83 this.configs.load(inputStream);
84 String thresholdString = this.configs.getProperty("threshold");
85 if (thresholdString != null && !thresholdString.equals("")) {
86 thresholdString=thresholdString.trim();
87 this.threshold = (new Double(thresholdString)).doubleValue();
88 Out.prln("New threshold is: " + this.threshold + "<P>\n");
89 }
90 String setName = this.configs.getProperty("annotSetName");
91 if (setName != null && !setName.equals("")) {
92 setName=setName.trim();
93 Out.prln("Annotation set in marked docs is: " + setName + " <P>\n");
94 this.annotSetName = setName;
95 }
96 setName = this.configs.getProperty("outputSetName");
97 if (setName != null && !setName.equals("")) {
98 setName=setName.trim();
99 Out.prln("Annotation set in processed docs is: " + setName + " <P>\n");
100 this.outputSetName = setName;
101 }
102 String encodingString = this.configs.getProperty("encoding");
103 if (encodingString != null && !encodingString.equals("")) {
104 encodingString=encodingString.trim();
105 this.documentEncoding = encodingString;
106 Out.prln("New encoding is: " + this.documentEncoding + "<P>\n");
107 }
108 String types = this.configs.getProperty("annotTypes");
109 if (types != null && !types.equals("")) {
110 types=types.trim();
111 Out.prln("Using annotation types from the properties file. <P>\n");
112 StringTokenizer strTok = new StringTokenizer(types, ";");
113 annotTypes = new ArrayList();
114 while (strTok.hasMoreTokens())
115 annotTypes.add(strTok.nextToken());
116 }
117 else {
118 annotTypes = new ArrayList();
119 annotTypes.add("Organization");
120 annotTypes.add("Person");
121 annotTypes.add("Date");
122 annotTypes.add("Location");
123 annotTypes.add("Address");
124 annotTypes.add("Money");
125 annotTypes.add("Percent");
126 annotTypes.add("GPE");
127 annotTypes.add("Facility");
128 }
129 String features = this.configs.getProperty("annotFeatures");
130 HashSet result = new HashSet();
131 if (features != null && !features.equals("")) {
132 features=features.trim();
133 Out.pr("Using annotation features from the properties file. \n");
134 java.util.StringTokenizer tok =
135 new java.util.StringTokenizer(features, ";");
136 String current;
137 while (tok.hasMoreTokens()) {
138 current = tok.nextToken();
139 result.add(current);
140 } }
142 diffFeaturesSet = result;
143 Out.prln("Features: " + diffFeaturesSet + " <P>\n");
144
145 }
146 catch (IOException ex) {
147 this.configs = new Properties();
149 }
150 }
151 else
152 this.configs = new Properties();
153
154 if (!this.isMarkedStored)
157 initPRs();
158
159 }
160
161 public void execute(File dir) {
162 if (dir == null)
163 return;
164 currDir = dir;
166
167 File processedDir = null;
168 File cleanDir = null;
169 File markedDir = null;
170 File errorDir = null;
171
172 ArrayList subDirs = new ArrayList();
173 File[] dirArray = currDir.listFiles();
174 if (dirArray == null)return;
175 for (int i = 0; i < dirArray.length; i++) {
176 if (dirArray[i].isFile() || dirArray[i].getName().equals(CVS_DIR_NAME))
177 continue;
178 if (dirArray[i].getName().equals(CLEAN_DIR_NAME))
179 cleanDir = dirArray[i];
180 else if (dirArray[i].getName().equals(MARKED_DIR_NAME))
181 markedDir = dirArray[i];
182 else if (dirArray[i].getName().equals(PROCESSED_DIR_NAME))
183 processedDir = dirArray[i];
184 else if (dirArray[i].getName().equals(ERROR_DIR_NAME))
185 errorDir = dirArray[i];
186 else
187 subDirs.add(dirArray[i]);
188 }
189
190 if (cleanDir == null)return;
191 Out.prln("Processing directory: " + currDir + "<P>");
192
193 if (this.isGenerateMode)
194 generateCorpus(cleanDir, processedDir);
195 else
196 evaluateCorpus(cleanDir, processedDir, markedDir, errorDir);
197
198 if (subDirs.isEmpty())
200 return;
201
202 for (int j = 0; j < subDirs.size(); j++)
204 execute( (File) subDirs.get(j));
205
206 }
208 public static void main(String[] args) throws GateException {
209 Out.prln("<HTML>");
210 Out.prln("<HEAD>");
211 Out.prln("<TITLE> Corpus benchmark tool: ran with args ");
212 for (int argC = 0; argC < args.length; ++argC)
213 Out.pr(args[argC] + " ");
214 Out.pr(" on " + new Date() + "</TITLE> </HEAD>");
215 Out.prln("<BODY>");
216 Out.prln("Please wait while GATE tools are initialised. <P>");
217 Gate.init();
219
220 CorpusBenchmarkTool corpusTool = new CorpusBenchmarkTool();
221
222 List inputFiles = null;
223 if (args.length < 1)throw new GateException(usage);
224 int i = 0;
225 while (i < args.length && args[i].startsWith("-")) {
226 if (args[i].equals("-generate")) {
227 Out.prln("Generating the corpus... <P>");
228 corpusTool.setGenerateMode(true);
229 }
230 else if (args[i].equals("-marked_clean")) {
231 Out.prln("Evaluating current grammars against human-annotated...<P>");
232 corpusTool.setMarkedClean(true);
233 }
234 else if (args[i].equals("-marked_stored")) {
235 Out.prln("Evaluating stored documents against human-annotated...<P>");
236 corpusTool.setMarkedStored(true);
237 }
238 else if (args[i].equals("-marked_ds")) {
239 Out.prln("Looking for marked docs in a datastore...<P>");
240 corpusTool.setMarkedDS(true);
241 }
242 else if (args[i].equals("-verbose")) {
243 Out.prln("Running in verbose mode. Will generate annotation " +
244 "information when precision/recall are lower than " +
245 corpusTool.getThreshold() + "<P>");
246 corpusTool.setVerboseMode(true);
247 }
248 else if (args[i].equals("-moreinfo")) {
249 Out.prln("Show more details in document table...<P>");
250 corpusTool.setMoreInfo(true);
251 }
252 i++; }
255 String dirName = args[i];
256 File dir = new File(dirName);
257 if (!dir.isDirectory())
258 throw new GateException(usage);
259
260 i++;
262 String appName = args[i];
263 File appFile = new File(appName);
264 if (!appFile.isFile())
265 throw new GateException(usage);
266 else
267 corpusTool.setApplicationFile(appFile);
268
269 corpusTool.init();
270 corpusWordCount = 0;
271
272 Out.prln("Measuring annotaitions of types: " +
273 CorpusBenchmarkTool.annotTypes + "<P>");
274
275 corpusTool.setStartDirectory(dir);
276 corpusTool.execute();
277 if (!corpusTool.getGenerateMode())
280 corpusTool.printStatistics();
281
282 Out.prln("<BR>Overall average precision: " + corpusTool.getPrecisionAverage());
283 Out.prln("<BR>Overall average recall: " + corpusTool.getRecallAverage());
284 Out.prln("<BR>Overall average fMeasure: " + corpusTool.getFMeasureAverage());
285 if (corpusWordCount == 0)
286 Out.prln("<BR>No Token annotations to count words in the corpus.");
287 else
288 Out.prln("<BR>Overall word count: " + corpusWordCount);
289
290 if (hasProcessed) {
291 Out.prln("<P>Old Processed: ");
292 Out.prln("<BR>Overall average precision: "
293 + corpusTool.getPrecisionAverageProc());
294 Out.prln("<BR>Overall average recall: "
295 + corpusTool.getRecallAverageProc());
296 Out.prln("<BR>Overall average fMeasure: "
297 + corpusTool.getFMeasureAverageProc());
298 }
299 Out.prln("<BR>Finished! <P>");
300 Out.prln("</BODY>");
301 Out.prln("</HTML>");
302
303 System.exit(0);
304
305 }
307 public void setGenerateMode(boolean mode) {
308 isGenerateMode = mode;
309 }
311 public boolean getGenerateMode() {
312 return isGenerateMode;
313 }
315 public boolean getVerboseMode() {
316 return isVerboseMode;
317 }
319 public void setVerboseMode(boolean mode) {
320 isVerboseMode = mode;
321 }
323 public void setMoreInfo(boolean mode) {
324 isMoreInfoMode = mode;
325 }
327 public boolean getMoreInfo() {
328 return isMoreInfoMode;
329 }
331 public void setDiffFeaturesList(Set features) {
332 diffFeaturesSet = features;
333 }
335 public Set getDiffFeaturesList() {
336 return diffFeaturesSet;
337 }
339 public void setMarkedStored(boolean mode) {
340 isMarkedStored = mode;
341 }
343 public boolean getMarkedStored() {
344 return isMarkedStored;
345 }
347 public void setMarkedClean(boolean mode) {
348 isMarkedClean = mode;
349 }
351 public boolean getMarkedClean() {
352 return isMarkedClean;
353 }
355 public void setMarkedDS(boolean mode) {
356 isMarkedDS = mode;
357 }
359 public boolean getMarkedDS() {
360 return isMarkedDS;
361 }
363 public void setApplicationFile(File newAppFile) {
364 applicationFile = newAppFile;
365 }
366
367
377 public double getPrecisionAverage() {
378 return (double) precisionSum / docNumber;
379 }
380
381
391 public double getRecallAverage() {
392 return (double) recallSum / docNumber;
393 }
394
395 public double getFMeasureAverage() {
396 return (double) fMeasureSum / docNumber;
397 }
398
399
400 public double getPrecisionAverageProc() {
401 return (double) proc_precisionSum / docNumber;
402 }
403
404 public double getRecallAverageProc() {
405 return (double) proc_recallSum / docNumber;
406 }
407
408 public double getFMeasureAverageProc() {
409 return (double) proc_fMeasureSum / docNumber;
410 }
411
412 public boolean isGenerateMode() {
413 return isGenerateMode == true;
414 }
416 public double getThreshold() {
417 return threshold;
418 }
419
420 public void setThreshold(double newValue) {
421 threshold = newValue;
422 }
423
424 public File getStartDirectory() {
425 return startDir;
426 }
428 public void setStartDirectory(File dir) {
429 startDir = dir;
430 }
432 protected void generateCorpus(File fileDir, File outputDir) {
433 if (fileDir == null)
435 return;
436 File outDir = outputDir;
438 if (outputDir == null) {
439 outDir = new File(currDir, PROCESSED_DIR_NAME);
440 }
441 else {
442 if (!Files.rmdir(outDir))
444 Out.prln("cannot delete old output directory: " + outDir);
445 }
446 outDir.mkdir();
447
448 try {
450 SerialDataStore sds = new SerialDataStore(outDir.toURL().toString());
451 sds.create();
452 sds.open();
453
454 File[] files = fileDir.listFiles();
455 for (int i = 0; i < files.length; i++) {
456 if (!files[i].isFile())
457 continue;
458 Out.prln("Processing and storing document: " + files[i].toURL() + "<P>");
460
461 FeatureMap params = Factory.newFeatureMap();
462 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, files[i].toURL());
463 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
464
465 FeatureMap features = Factory.newFeatureMap();
466
468 final Document doc = (Document) Factory.createResource(
470 "gate.corpora.DocumentImpl", params, features
471 );
472
473 doc.setName(files[i].getName());
474 if (doc == null)
475 continue;
476 processDocument(doc);
477 final LanguageResource lr = sds.adopt(doc, null);
478 sds.sync(lr);
479 javax.swing.SwingUtilities.invokeLater(new Runnable() {
480 public void run() {
481 Factory.deleteResource(doc);
482 Factory.deleteResource(lr);
483 }
484 });
485 } sds.close();
487 }
488 catch (java.net.MalformedURLException ex) {
489 throw (GateRuntimeException)
490 new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
491 .initCause(ex);
492 }
493 catch (PersistenceException ex1) {
494 throw (GateRuntimeException)
495 new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
496 .initCause(ex1);
497 }
498 catch (ResourceInstantiationException ex2) {
499 throw (GateRuntimeException)
500 new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
501 .initCause(ex2);
502 }
503 catch (gate.security.SecurityException ex3) {
504 throw (GateRuntimeException)
505 new GateRuntimeException("CorpusBenchmark: " + ex3.getMessage())
506 .initCause(ex3);
507 }
508 }
510 protected void evaluateCorpus(File fileDir,
511 File processedDir, File markedDir,
512 File errorDir) {
513 if (fileDir == null || !fileDir.exists())
515 return;
516 if (processedDir == null || !processedDir.exists())
517
518 if (isMarkedStored) {
520 Out.prln("Cannot evaluate because no processed documents exist.");
521 return;
522 }
523 else
524 isMarkedClean = true;
525
526 File errDir = null;
528 if (isMoreInfoMode) {
529 errDir = errorDir;
530 if (errDir == null) {
531 errDir = new File(currDir, ERROR_DIR_NAME);
532 }
533 else {
534 if (!Files.rmdir(errDir))
536 Out.prln("cannot delete old error directory: " + errDir);
537 }
538 Out.prln("Create error directory: " + errDir + "<BR><BR>");
539 errDir.mkdir();
540 }
541
542 boolean processMarked = markedDir != null && markedDir.exists();
544 if (!processMarked && (isMarkedStored || isMarkedClean)) {
545 Out.prln("Cannot evaluate because no human-annotated documents exist.");
546 return;
547 }
548
549 if (isMarkedStored) {
550 evaluateMarkedStored(markedDir, processedDir, errDir);
551 return;
552 }
553 else if (isMarkedClean) {
554 evaluateMarkedClean(markedDir, fileDir, errDir);
555 return;
556 }
557
558 Document persDoc = null;
559 Document cleanDoc = null;
560 Document markedDoc = null;
561
562 try {
564 DataStore sds = Factory.openDataStore
566 ("gate.persist.SerialDataStore",
567 processedDir.toURL().toExternalForm());
568
569 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
570 for (int i = 0; i < lrIDs.size(); i++) {
571 String docID = (String) lrIDs.get(i);
572
573 FeatureMap features = Factory.newFeatureMap();
575 features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
576 features.put(DataStore.LR_ID_FEATURE_NAME, docID);
577 FeatureMap hparams = Factory.newFeatureMap();
578
580 persDoc = (Document) Factory.createResource(
581 "gate.corpora.DocumentImpl",
582 features, hparams);
583
584 if (isMoreInfoMode) {
585 StringBuffer errName = new StringBuffer(persDoc.getName());
586 errName.replace(
587 persDoc.getName().lastIndexOf("."),
588 persDoc.getName().length(),
589 ".err");
590 Out.prln("<H2>" +
591 "<a href=\"err/" + errName.toString() + "\">"
592 + persDoc.getName() + "</a>" + "</H2>");
593 }
594 else
595 Out.prln("<H2>" + persDoc.getName() + "</H2>");
596
597 File cleanDocFile = new File(fileDir, persDoc.getName());
598 if (!cleanDocFile.exists()) {
600 Out.prln("Warning: Cannot find original document " +
601 persDoc.getName() + " in " + fileDir);
602 }
603 else {
604 FeatureMap params = Factory.newFeatureMap();
605 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocFile.toURL());
606 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
607 documentEncoding);
608
609 cleanDoc = (Document) Factory.createResource(
611 "gate.corpora.DocumentImpl", params, hparams);
612 cleanDoc.setName(persDoc.getName());
613 }
614
615 StringBuffer docName = new StringBuffer(persDoc.getName());
617 if (!isMarkedDS) {
618 docName.replace(
619 persDoc.getName().lastIndexOf("."),
620 docName.length(),
621 ".xml");
622 File markedDocFile = new File(markedDir, docName.toString());
623 if (!processMarked || !markedDocFile.exists()) {
624 Out.prln("Warning: Cannot find human-annotated document " +
625 markedDocFile + " in " + markedDir);
626 }
627 else {
628 FeatureMap params = Factory.newFeatureMap();
629 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
630 markedDocFile.toURL());
631 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
632 documentEncoding);
633
634 markedDoc = (Document) Factory.createResource(
636 "gate.corpora.DocumentImpl", params, hparams);
637 markedDoc.setName(persDoc.getName());
638 }
639 }
640 else {
641 DataStore sds1 = Factory.openDataStore
644 ("gate.persist.SerialDataStore",
645 markedDir.toURL().toExternalForm());
646
647 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
648 boolean found = false;
649 int k = 0;
650 while (k < lrIDs1.size() && !found) {
652 String docID1 = (String) lrIDs1.get(k);
653
654 FeatureMap features1 = Factory.newFeatureMap();
656 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
657 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
658 Document tempDoc = (Document) Factory.createResource(
659 "gate.corpora.DocumentImpl",
660 features1, hparams);
661 if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
663 endsWith(persDoc.getName())) {
664 found = true;
665 markedDoc = tempDoc;
666 }
667 else k++;
668 }
669 }
670
671 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
672
673 if (persDoc != null) {
674 final gate.Document pd = persDoc;
675 javax.swing.SwingUtilities.invokeLater(new Runnable() {
676 public void run() {
677 Factory.deleteResource(pd);
678 }
679 });
680 }
681 if (cleanDoc != null) {
682 final gate.Document cd = cleanDoc;
683 javax.swing.SwingUtilities.invokeLater(new Runnable() {
684 public void run() {
685 Factory.deleteResource(cd);
686 }
687 });
688 }
689 if (markedDoc != null) {
690 final gate.Document md = markedDoc;
691 javax.swing.SwingUtilities.invokeLater(new Runnable() {
692 public void run() {
693 Factory.deleteResource(md);
694 }
695 });
696 }
697
698 } sds.close();
700 }
701 catch (java.net.MalformedURLException ex) {
702 throw (GateRuntimeException)
703 new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
704 .initCause(ex);
705 }
706 catch (PersistenceException ex1) {
707 throw (GateRuntimeException)
708 new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
709 .initCause(ex1);
710 }
711 catch (ResourceInstantiationException ex2) {
712 throw (GateRuntimeException)
713 new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
714 .initCause(ex2);
715 }
716
717 }
719 protected void evaluateMarkedStored(File markedDir, File storedDir,
720 File errDir) {
721 Document persDoc = null;
722 Document cleanDoc = null;
723 Document markedDoc = null;
724
725 try {
727 DataStore sds = Factory.openDataStore
729 ("gate.persist.SerialDataStore",
730 storedDir.toURL().toExternalForm());
731
732 List lrIDs = sds.getLrIds("gate.corpora.DocumentImpl");
733 for (int i = 0; i < lrIDs.size(); i++) {
734 String docID = (String) lrIDs.get(i);
735
736 FeatureMap features = Factory.newFeatureMap();
738 features.put(DataStore.DATASTORE_FEATURE_NAME, sds);
739 features.put(DataStore.LR_ID_FEATURE_NAME, docID);
740
741 FeatureMap hparams = Factory.newFeatureMap();
742
744 persDoc = (Document) Factory.createResource(
745 "gate.corpora.DocumentImpl",
746 features, hparams);
747
748 if (isMoreInfoMode) {
749 StringBuffer errName = new StringBuffer(persDoc.getName());
750 errName.replace(
751 persDoc.getName().lastIndexOf("."),
752 persDoc.getName().length(),
753 ".err");
754 Out.prln("<H2>" +
755 "<a href=\"err/" + errName.toString() + "\">"
756 + persDoc.getName() + "</a>" + "</H2>");
757 }
758 else
759 Out.prln("<H2>" + persDoc.getName() + "</H2>");
760
761 if (!this.isMarkedDS) { StringBuffer docName = new StringBuffer(persDoc.getName());
763 docName.replace(
764 persDoc.getName().lastIndexOf("."),
765 docName.length(),
766 ".xml");
767 File markedDocFile = new File(markedDir, docName.toString());
768 if (!markedDocFile.exists()) {
769 Out.prln("Warning: Cannot find human-annotated document " +
770 markedDocFile + " in " + markedDir);
771 }
772 else {
773 FeatureMap params = Factory.newFeatureMap();
774 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
775 markedDocFile.toURL());
776 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME,
777 documentEncoding);
778
779 markedDoc = (Document) Factory.createResource(
781 "gate.corpora.DocumentImpl", params, hparams);
782 markedDoc.setName(persDoc.getName());
783 } }
785 else {
786 try {
787 DataStore sds1 = Factory.openDataStore
790 ("gate.persist.SerialDataStore",
791 markedDir.toURL().toExternalForm());
792
793 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
794 boolean found = false;
795 int k = 0;
796 while (k < lrIDs1.size() && !found) {
798 String docID1 = (String) lrIDs1.get(k);
799
800 FeatureMap features1 = Factory.newFeatureMap();
802 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
803 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
804 Document tempDoc = (Document) Factory.createResource(
805 "gate.corpora.DocumentImpl",
806 features1, hparams);
807 if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
809 endsWith(persDoc.getName())) {
810 found = true;
811 markedDoc = tempDoc;
812 }
813 else k++;
814 }
815 }
816 catch (java.net.MalformedURLException ex) {
817 Out.prln("Error finding marked directory " +
818 markedDir.getAbsolutePath());
819 }
820 catch (gate.persist.PersistenceException ex1) {
821 Out.prln(
822 "Error opening marked as a datastore (-marked_ds specified)");
823 }
824 catch (gate.creole.ResourceInstantiationException ex2) {
825 Out.prln(
826 "Error opening marked as a datastore (-marked_ds specified)");
827 }
828 }
829
830 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
831 if (persDoc != null) {
832 final gate.Document pd = persDoc;
833 javax.swing.SwingUtilities.invokeLater(new Runnable() {
834 public void run() {
835 Factory.deleteResource(pd);
836 }
837 });
838 }
839 if (markedDoc != null) {
840 final gate.Document md = markedDoc;
841 javax.swing.SwingUtilities.invokeLater(new Runnable() {
842 public void run() {
843 Factory.deleteResource(md);
844 }
845 });
846 }
847
848 } sds.close();
850
851 }
852 catch (java.net.MalformedURLException ex) {
853 throw (GateRuntimeException)
854 new GateRuntimeException("CorpusBenchmark: " + ex.getMessage())
855 .initCause(ex);
856 }
857 catch (PersistenceException ex1) {
858 throw (GateRuntimeException)
859 new GateRuntimeException("CorpusBenchmark: " + ex1.getMessage())
860 .initCause(ex1);
861 }
862 catch (ResourceInstantiationException ex2) {
863 throw (GateRuntimeException)
864 new GateRuntimeException("CorpusBenchmark: " + ex2.getMessage())
865 .initCause(ex2);
866 }
867
868 }
870 protected void evaluateMarkedClean(File markedDir, File cleanDir, File errDir) {
871 Document persDoc = null;
872 Document cleanDoc = null;
873 Document markedDoc = null;
874
875 File[] cleanDocs = cleanDir.listFiles();
876 for (int i = 0; i < cleanDocs.length; i++) {
877 if (!cleanDocs[i].isFile())
878 continue;
879
880 FeatureMap params = Factory.newFeatureMap();
882 try {
883 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, cleanDocs[i].toURL());
884 }
885 catch (java.net.MalformedURLException ex) {
886 Out.prln("Cannot create document from file: " +
887 cleanDocs[i].getAbsolutePath());
888 continue;
889 }
890 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
892
893 FeatureMap hparams = Factory.newFeatureMap();
894
896 try {
898 cleanDoc = (Document) Factory.createResource(
899 "gate.corpora.DocumentImpl", params, hparams, cleanDocs[i].getName());
900 }
901 catch (gate.creole.ResourceInstantiationException ex) {
902 Out.prln("Cannot create document from file: " +
903 cleanDocs[i].getAbsolutePath());
904 continue;
905 }
906
907 if (isMoreInfoMode) {
908 StringBuffer errName = new StringBuffer(cleanDocs[i].getName());
909 errName.replace(
910 cleanDocs[i].getName().lastIndexOf("."),
911 cleanDocs[i].getName().length(),
912 ".err");
913 Out.prln("<H2>" +
914 "<a href=\"err/" + errName.toString() + "\">"
915 + cleanDocs[i].getName() + "</a>" + "</H2>");
916 }
917 else
918 Out.prln("<H2>" + cleanDocs[i].getName() + "</H2>");
919
920 if (!isMarkedDS) {
922 StringBuffer docName = new StringBuffer(cleanDoc.getName());
923 docName.replace(
924 cleanDoc.getName().lastIndexOf("."),
925 docName.length(),
926 ".xml");
927 File markedDocFile = new File(markedDir, docName.toString());
928 if (!markedDocFile.exists()) {
929 Out.prln("Warning: Cannot find human-annotated document " +
930 markedDocFile + " in " + markedDir);
931 continue;
932 }
933 else {
934 params = Factory.newFeatureMap();
935 try {
936 params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
937 markedDocFile.toURL());
938 }
939 catch (java.net.MalformedURLException ex) {
940 Out.prln("Cannot create document from file: " +
941 markedDocFile.getAbsolutePath());
942 continue;
943 }
944 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, documentEncoding);
946
947 try {
949 markedDoc = (Document) Factory.createResource(
950 "gate.corpora.DocumentImpl", params,
951 hparams, cleanDoc.getName());
952 }
953 catch (gate.creole.ResourceInstantiationException ex) {
954 Out.prln("Cannot create document from file: " +
955 markedDocFile.getAbsolutePath());
956 continue;
957 }
958
959 } }
961 else {
962 try {
963 DataStore sds1 = Factory.openDataStore
966 ("gate.persist.SerialDataStore",
967 markedDir.toURL().toExternalForm());
968
969 List lrIDs1 = sds1.getLrIds("gate.corpora.DocumentImpl");
970 boolean found = false;
971 int k = 0;
972 while (k < lrIDs1.size() && !found) {
974 String docID1 = (String) lrIDs1.get(k);
975
976 FeatureMap features1 = Factory.newFeatureMap();
978 features1.put(DataStore.DATASTORE_FEATURE_NAME, sds1);
979 features1.put(DataStore.LR_ID_FEATURE_NAME, docID1);
980 Document tempDoc = (Document) Factory.createResource(
981 "gate.corpora.DocumentImpl",
982 features1, hparams);
983 if ( ( (String) tempDoc.getFeatures().get("gate.SourceURL")).
985 endsWith(cleanDoc.getName())) {
986 found = true;
987 markedDoc = tempDoc;
988 }
989 else k++;
990 }
991 }
992 catch (java.net.MalformedURLException ex) {
993 Out.prln("Error finding marked directory " +
994 markedDir.getAbsolutePath());
995 }
996 catch (gate.persist.PersistenceException ex1) {
997 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
998 }
999 catch (gate.creole.ResourceInstantiationException ex2) {
1000 Out.prln("Error opening marked as a datastore (-marked_ds specified)");
1001 }
1002 }
1004 try {
1005 evaluateDocuments(persDoc, cleanDoc, markedDoc, errDir);
1006 }
1007 catch (gate.creole.ResourceInstantiationException ex) {
1008 ex.printStackTrace();
1009 Out.prln("Evaluate failed on document: " + cleanDoc.getName());
1010 }
1011 if (persDoc != null) {
1012 final gate.Document pd = persDoc;
1013 javax.swing.SwingUtilities.invokeLater(new Runnable() {
1014 public void run() {
1015 Factory.deleteResource(pd);
1016 }
1017 });
1018 }
1019 if (cleanDoc != null) {
1020 final gate.Document cd = cleanDoc;
1021 javax.swing.SwingUtilities.invokeLater(new Runnable() {
1022 public void run() {
1023 Factory.deleteResource(cd);
1024 }
1025 });
1026 }
1027 if (markedDoc != null) {
1028 final gate.Document md = markedDoc;
1029 javax.swing.SwingUtilities.invokeLater(new Runnable() {
1030 public void run() {
1031 Factory.deleteResource(md);
1032 }
1033 });
1034 }
1035
1036 }
1038 }
1040 protected void processDocument(Document doc) {
1041 try {
1042 if (application instanceof CorpusController) {
1043 Corpus tempCorpus = Factory.newCorpus("temp");
1044 tempCorpus.add(doc);
1045 ( (CorpusController) application).setCorpus(tempCorpus);
1046 application.execute();
1047 Factory.deleteResource(tempCorpus);
1048 tempCorpus = null;
1049 }
1050 else {
1051 Iterator iter = application.getPRs().iterator();
1052 while (iter.hasNext())
1053 ( (ProcessingResource) iter.next()).setParameterValue("document", doc);
1054 application.execute();
1055 }
1056 }
1057 catch (ResourceInstantiationException ex) {
1058 throw (RuntimeException)
1059 new RuntimeException("Error executing application: "
1060 + ex.getMessage())
1061 .initCause(ex);
1062 }
1063 catch (ExecutionException ex) {
1064 throw (RuntimeException)
1065 new RuntimeException("Error executing application: "
1066 + ex.getMessage())
1067 .initCause(ex);
1068 }
1069 }
1070
1071 protected void evaluateDocuments(Document persDoc,
1072 Document cleanDoc, Document markedDoc,
1073 File errDir) throws
1074 ResourceInstantiationException {
1075 if (cleanDoc == null && markedDoc == null)
1076 return;
1077
1078 if (annotTypes == null || annotTypes.isEmpty())
1080 return;
1081
1082 if (cleanDoc != null && !isMarkedStored) {
1083
1084 processDocument(cleanDoc);
1085
1086 int wordCount = countWords(cleanDoc);
1087 if (wordCount == 0)
1088 Out.prln("<BR>No Token annotations to count words in the document.");
1089 else
1090 Out.prln("<BR>Word count: " + wordCount);
1091 corpusWordCount += wordCount;
1092
1093 if (!isMarkedClean)
1094 evaluateAllThree(persDoc, cleanDoc, markedDoc, errDir);
1095 else
1096 evaluateTwoDocs(markedDoc, cleanDoc, errDir);
1097
1098 }
1099 else
1100 evaluateTwoDocs(markedDoc, persDoc, errDir);
1101
1102 }
1103
1104
1107 protected int countWords(Document annotDoc) {
1108 int count = 0;
1109
1110 if (annotDoc == null)return 0;
1111 AnnotationSet tokens = annotDoc.getAnnotations(outputSetName).get("Token");
1113 if (tokens == null)return 0;
1114
1115 Iterator it = tokens.iterator();
1116 Annotation currAnnotation;
1117 while (it.hasNext()) {
1118 currAnnotation = (Annotation) it.next();
1119 Object feature = currAnnotation.getFeatures().get("kind");
1120 if (feature != null && "word".equalsIgnoreCase( (String) feature))++count;
1121 }
1123 return count;
1124 }
1125
1126 protected void evaluateAllThree(Document persDoc,
1127 Document cleanDoc, Document markedDoc,
1128 File errDir) throws
1129 ResourceInstantiationException {
1130 printTableHeader();
1132
1133 Writer errWriter = null;
1135 if (isMoreInfoMode && errDir != null) {
1136 StringBuffer docName = new StringBuffer(cleanDoc.getName());
1137 docName.replace(
1138 cleanDoc.getName().lastIndexOf("."),
1139 docName.length(),
1140 ".err");
1141 File errFile = new File(errDir, docName.toString());
1142 String encoding = ( (gate.corpora.DocumentImpl) cleanDoc).getEncoding();
1143 try {
1144 errWriter = new FileWriter(errFile, false);
1145
1153 }
1154 catch (Exception ex) {
1155 Out.prln("Exception when creating the error file " + errFile + ": "
1156 + ex.getMessage());
1157 errWriter = null;
1158 }
1159 }
1160
1161 for (int jj = 0; jj < annotTypes.size(); jj++) {
1162 String annotType = (String) annotTypes.get(jj);
1163
1164 AnnotationDiffer annotDiffer = measureDocs(markedDoc, cleanDoc, annotType);
1165 if (annotDiffer == null)
1167 continue;
1168
1169 docNumber++;
1171 updateStatistics(annotDiffer, annotType);
1173
1174 AnnotationDiffer annotDiffer1 =
1175 measureDocs(markedDoc, persDoc, annotType);
1176
1177 Out.prln("<TR>");
1178
1179 if (isMoreInfoMode && annotDiffer1 != null
1180 &&
1181 (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1182 || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1183 )
1184 Out.prln("<TD> " + annotType + "_new" + "</TD>");
1185 else
1186 Out.prln("<TD> " + annotType + "</TD>");
1187
1188 if (isMoreInfoMode) {
1189 if (annotDiffer1 != null) updateStatisticsProc(annotDiffer1, annotType);
1190
1191 Out.prln("<TD>" + annotDiffer.getCorrectMatches() + "</TD>");
1192 Out.prln("<TD>" + annotDiffer.getPartiallyCorrectMatches() + "</TD>");
1193 Out.prln("<TD>" + annotDiffer.getMissing() + "</TD>");
1194 Out.prln("<TD>" + annotDiffer.getSpurious() + "</TD>");
1195 }
1196
1197 Out.prln("<TD>");
1198
1199 if (annotDiffer1 != null) {
1201
1202 if (annotDiffer1.getPrecisionAverage()
1203 < annotDiffer.getPrecisionAverage()) {
1204 Out.prln("<P><Font color=blue> ");
1205 Out.prln(annotDiffer.getPrecisionAverage());
1206
1207 if (!isMoreInfoMode) {
1208 Out.pr("<BR>Precision increase on human-marked from ");
1209 Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1210 Out.prln(annotDiffer.getPrecisionAverage());
1211 }
1212 Out.prln(" </Font></P>");
1213 }
1214 else if (annotDiffer1.getPrecisionAverage()
1215 > annotDiffer.getPrecisionAverage()) {
1216 Out.prln("<P><Font color=red> ");
1217 Out.prln(annotDiffer.getPrecisionAverage());
1218
1219 if (!isMoreInfoMode) {
1220 Out.pr("<BR>Precision decrease on human-marked from ");
1221 Out.pr(annotDiffer1.getPrecisionAverage() + " to ");
1222 Out.prln(annotDiffer.getPrecisionAverage());
1223 }
1224 Out.prln(" </Font></P>");
1225 }
1226 else
1227 Out.prln("<P> " + (double) annotDiffer.getPrecisionAverage() +
1228 " </P>");
1229 }
1230 else
1231 Out.prln("<P> " + annotDiffer.getPrecisionAverage() + " </P>");
1232
1233 Out.prln("</TD>");
1234
1235 Out.prln("<TD>");
1236
1237 if (annotDiffer1 != null) {
1239
1240 if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage()) {
1241 Out.prln("<P><Font color=blue> ");
1242 Out.prln(annotDiffer.getRecallAverage());
1243
1244 if (!isMoreInfoMode) {
1245 Out.pr("<BR>Recall increase on human-marked from ");
1246 Out.pr(annotDiffer1.getRecallAverage() + " to ");
1247 Out.prln(annotDiffer.getRecallAverage());
1248 }
1249 Out.prln(" </Font></P>");
1250 }
1251 else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage()) {
1252 Out.prln("<P><Font color=red> ");
1253 Out.prln(annotDiffer.getRecallAverage());
1254
1255 if (!isMoreInfoMode) {
1256 Out.pr("<BR>Recall decrease on human-marked from ");
1257 Out.pr(annotDiffer1.getRecallAverage() + " to ");
1258 Out.prln(annotDiffer.getRecallAverage());
1259 }
1260 Out.prln(" </Font></P>");
1261 }
1262 else
1263 Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1264 }
1265 else
1266 Out.prln("<P> " + annotDiffer.getRecallAverage() + " </P>");
1267
1268 Out.prln("</TD>");
1269
1270 if (isVerboseMode) {
1272 Out.prln("<TD>");
1273 if (annotDiffer.getRecallAverage() < threshold) {
1274 printAnnotations(annotDiffer, markedDoc, cleanDoc);
1275 }
1276 else {
1277 Out.prln(" ");
1278 }
1279 Out.prln("</TD>");
1280 }
1281
1282 Out.prln("</TR>");
1283
1284 if (isMoreInfoMode && annotDiffer1 != null
1286 &&
1287 (annotDiffer1.getPrecisionAverage() != annotDiffer.getPrecisionAverage()
1288 || annotDiffer1.getRecallAverage() != annotDiffer.getRecallAverage())
1289 ) {
1290
1291 Out.prln("<TR>");
1292 Out.prln("<TD> " + annotType + "_old" + "</TD>");
1293
1294 Out.prln("<TD>" + annotDiffer1.getCorrectMatches() + "</TD>");
1295 Out.prln("<TD>" + annotDiffer1.getPartiallyCorrectMatches() + "</TD>");
1296 Out.prln("<TD>" + annotDiffer1.getMissing() + "</TD>");
1297 Out.prln("<TD>" + annotDiffer1.getSpurious() + "</TD>");
1298
1299 Out.prln("<TD>");
1300 if (annotDiffer1.getPrecisionAverage() <
1301 annotDiffer.getPrecisionAverage())
1302
1303 Out.prln("<P><Font color=blue> " + annotDiffer1.getPrecisionAverage()
1304 + "</Font></P>");
1305 else if (annotDiffer1.getPrecisionAverage() >
1306 annotDiffer.getPrecisionAverage())
1307 Out.prln(
1308 "<P><Font color=red> " + annotDiffer1.getPrecisionAverage()
1309 + " </Font></P>");
1310 else
1311 Out.prln(annotDiffer1.getPrecisionAverage());
1312
1313 Out.prln("</TD>");
1314
1315 Out.prln("<TD>");
1316 if (annotDiffer1.getRecallAverage() < annotDiffer.getRecallAverage())
1317 Out.prln("<P><Font color=blue> " + annotDiffer1.getRecallAverage()
1318 + " </Font></P>");
1319 else if (annotDiffer1.getRecallAverage() > annotDiffer.getRecallAverage())
1320 Out.prln("<P><Font color=red> " + annotDiffer1.getRecallAverage()
1321 + " </Font></P>");
1322 else
1323 Out.prln(annotDiffer1.getRecallAverage());
1324
1325 Out.prln("</TD>");
1326
1327 if (isVerboseMode) {
1329
1331 Out.prln("<TD>");
1332 if (annotDiffer.getRecallAverage() < threshold) {
1333 printAnnotations(annotDiffer, markedDoc, cleanDoc);
1334 }
1335 else {
1336 Out.prln(" ");
1337 }
1338 Out.prln("</TD>");
1339 }
1340 Out.prln("</TR>");
1341 }
1343 if (isMoreInfoMode && errDir != null)
1344 storeAnnotations(annotType, annotDiffer, markedDoc, cleanDoc, errWriter);
1345 } Out.prln("</TABLE>");
1347
1348 try {
1349 if (errWriter != null)
1350 errWriter.close();
1351 }
1352 catch (Exception ex) {
1353 Out.prln("Exception on close of error file " + errWriter + ": "
1354 + ex.getMessage());
1355 }
1356 }
1358 protected void evaluateTwoDocs(Document keyDoc, Document respDoc,
1359 File errDir) throws
1360 ResourceInstantiationException {
1361
1362 printTableHeader();
1364
1365 Writer errWriter = null;
1367 if (isMoreInfoMode && errDir != null) {
1368 StringBuffer docName = new StringBuffer(keyDoc.getName());
1369 docName.replace(
1370 keyDoc.getName().lastIndexOf("."),
1371 docName.length(),
1372 ".err");
1373 File errFile = new File(errDir, docName.toString());
1374 String encoding = ( (gate.corpora.DocumentImpl) keyDoc).getEncoding();
1375 try {
1376 errWriter = new FileWriter(errFile, false);
1377
1385 }
1386 catch (Exception ex) {
1387 Out.prln("Exception when creating the error file " + errFile + ": "
1388 + ex.getMessage());
1389 errWriter = null;
1390 }
1391 }
1392
1393 for (int jj = 0; jj < annotTypes.size(); jj++) {
1394 String annotType = (String) annotTypes.get(jj);
1395
1396 AnnotationDiffer annotDiff = measureDocs(keyDoc, respDoc, annotType);
1397 if (annotDiff == null)
1399 continue;
1400
1401 docNumber++;
1403 updateStatistics(annotDiff, annotType);
1405
1406 Out.prln("<TR>");
1407 Out.prln("<TD>" + annotType + "</TD>");
1408
1409 if (isMoreInfoMode) {
1410 Out.prln("<TD>" + annotDiff.getCorrectMatches() + "</TD>");
1411 Out.prln("<TD>" + annotDiff.getPartiallyCorrectMatches() + "</TD>");
1412 Out.prln("<TD>" + annotDiff.getMissing() + "</TD>");
1413 Out.prln("<TD>" + annotDiff.getSpurious() + "</TD>");
1414 }
1415
1416 Out.prln("<TD>" + annotDiff.getPrecisionAverage() + "</TD>");
1417 Out.prln("<TD>" + annotDiff.getRecallAverage() + "</TD>");
1418 if (isVerboseMode) {
1420 Out.prln("<TD>");
1421 if (annotDiff.getRecallAverage() < threshold) {
1422 printAnnotations(annotDiff, keyDoc, respDoc);
1423 }
1424 else {
1425 Out.prln(" ");
1426 }
1427 Out.prln("</TD>");
1428 }
1429 Out.prln("</TR>");
1430
1431 if (isMoreInfoMode && errDir != null)
1432 storeAnnotations(annotType, annotDiff, keyDoc, respDoc, errWriter);
1433 } Out.prln("</TABLE>");
1435
1436 try {
1437 if (errWriter != null)
1438 errWriter.close();
1439 }
1440 catch (Exception ex) {
1441 Out.prln("Exception on close of error file " + errWriter + ": "
1442 + ex.getMessage());
1443 }
1444 }
1446 protected void printTableHeader() {
1447 Out.prln("<TABLE BORDER=1");
1448 Out.pr("<TR> <TD><B>Annotation Type</B></TD> ");
1449
1450 if (isMoreInfoMode)
1451 Out.pr("<TD><B>Correct</B></TD> <TD><B>Partially Correct</B></TD> "
1452 + "<TD><B>Missing</B></TD> <TD><B>Spurious<B></TD>");
1453
1454 Out.pr("<TD><B>Precision</B></TD> <TD><B>Recall</B></TD>");
1455
1456 if (isVerboseMode)
1457 Out.pr("<TD><B>Annotations</B></TD>");
1458
1459 Out.prln("</TR>");
1460 }
1461
1462 protected void updateStatistics(AnnotationDiffer annotDiffer,
1463 String annotType) {
1464 double precisionAverage = ( (double) ( (double) annotDiffer.
1465 getPrecisionLenient() +
1466 annotDiffer.getPrecisionStrict()) /
1467 (double) (2.0));
1468 if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1469 precisionSum += precisionAverage;
1470
1471 double recallAverage = ( (double) (annotDiffer.getRecallLenient() +
1472 annotDiffer.getRecallStrict()) /
1473 (double) (2.0));
1474 if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1475 recallSum += recallAverage;
1476
1477 double fMeasureAverage = ( (double) (annotDiffer.getFMeasureLenient(1.0) +
1478 annotDiffer.getFMeasureStrict(1.0)) /
1479 (double) (2.0));
1480 if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1481 fMeasureSum += fMeasureAverage;
1482
1483 Double oldPrecision = (Double) precisionByType.get(annotType);
1484 if (oldPrecision == null)
1485 precisionByType.put(annotType, new Double(precisionAverage));
1486 else
1487 precisionByType.put(annotType,
1488 new Double(oldPrecision.doubleValue() + precisionAverage));
1489
1490 Integer precCount = (Integer) prCountByType.get(annotType);
1491 if (precCount == null)
1492 prCountByType.put(annotType, new Integer(1));
1493 else
1494 prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1495
1496 Double oldFMeasure = (Double) fMeasureByType.get(annotType);
1497 if (oldFMeasure == null)
1498 fMeasureByType.put(annotType, new Double(fMeasureAverage));
1499 else
1500 fMeasureByType.put(annotType,
1501 new Double(oldFMeasure.doubleValue() + fMeasureAverage));
1502
1503 Integer fCount = (Integer) fMeasureCountByType.get(annotType);
1504 if (fCount == null)
1505 fMeasureCountByType.put(annotType, new Integer(1));
1506 else
1507 fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1508
1509 Double oldRecall = (Double) recallByType.get(annotType);
1510 if (oldRecall == null)
1511 recallByType.put(annotType, new Double(recallAverage));
1512 else
1513 recallByType.put(annotType,
1514 new Double(oldRecall.doubleValue() + recallAverage));
1515
1516 Integer recCount = (Integer) recCountByType.get(annotType);
1517 if (recCount == null)
1518 recCountByType.put(annotType, new Integer(1));
1519 else
1520 recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1521
1522 Long oldMissingNo = (Long) missingByType.get(annotType);
1524 if (oldMissingNo == null)
1525 missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1526 else
1527 missingByType.put(annotType,
1528 new Long(oldMissingNo.longValue() +
1529 annotDiffer.getMissing()));
1530
1531 Long oldCorrectNo = (Long) correctByType.get(annotType);
1532 if (oldCorrectNo == null)
1533 correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1534 else
1535 correctByType.put(annotType,
1536 new Long(oldCorrectNo.longValue() +
1537 annotDiffer.getCorrectMatches()));
1538
1539 Long oldPartialNo = (Long) partialByType.get(annotType);
1540 if (oldPartialNo == null)
1541 partialByType.put(annotType,
1542 new Long(annotDiffer.getPartiallyCorrectMatches()));
1543 else
1544 partialByType.put(annotType,
1545 new Long(oldPartialNo.longValue() +
1546 annotDiffer.getPartiallyCorrectMatches()));
1547
1548 Long oldSpuriousNo = (Long) spurByType.get(annotType);
1549 if (oldSpuriousNo == null)
1550 spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1551 else
1552 spurByType.put(annotType,
1553 new Long(oldSpuriousNo.longValue() +
1554 annotDiffer.getSpurious()));
1555 }
1556
1557
1561 protected void updateStatisticsProc(AnnotationDiffer annotDiffer,
1562 String annotType) {
1563 hasProcessed = true;
1564 double precisionAverage = ( (double) (annotDiffer.getPrecisionLenient() +
1565 annotDiffer.getPrecisionStrict()) /
1566 (double) (2.0));
1567 if (Double.isNaN(precisionAverage)) precisionAverage = 0.0;
1568 proc_precisionSum += precisionAverage;
1569
1570 double recallAverage = ( (double) (annotDiffer.getRecallLenient() +
1571 annotDiffer.getRecallStrict()) /
1572 (double) (2.0));
1573 if (Double.isNaN(recallAverage)) recallAverage = 0.0;
1574 proc_recallSum += recallAverage;
1575
1576 double fMeasureAverage = ( (double) (annotDiffer.getFMeasureLenient(1.0) +
1577 annotDiffer.getFMeasureStrict(1.0)) /
1578 (double) (2.0));
1579 if (Double.isNaN(fMeasureAverage)) fMeasureAverage = 0.0;
1580 proc_fMeasureSum += fMeasureAverage;
1581
1582 Double oldPrecision = (Double) proc_precisionByType.get(annotType);
1583 if (oldPrecision == null)
1584 proc_precisionByType.put(annotType, new Double(precisionAverage));
1585 else
1586 proc_precisionByType.put(annotType,
1587 new Double(oldPrecision.doubleValue() +
1588 precisionAverage));
1589 Integer precCount = (Integer) proc_prCountByType.get(annotType);
1590 if (precCount == null)
1591 proc_prCountByType.put(annotType, new Integer(1));
1592 else
1593 proc_prCountByType.put(annotType, new Integer(precCount.intValue() + 1));
1594
1595 Double oldFMeasure = (Double) proc_fMeasureByType.get(annotType);
1596 if (oldFMeasure == null)
1597 proc_fMeasureByType.put(annotType,
1598 new Double(fMeasureAverage));
1599 else
1600 proc_fMeasureByType.put(annotType,
1601 new Double(oldFMeasure.doubleValue() +
1602 fMeasureAverage));
1603 Integer fCount = (Integer) proc_fMeasureCountByType.get(annotType);
1604 if (fCount == null)
1605 proc_fMeasureCountByType.put(annotType, new Integer(1));
1606 else
1607 proc_fMeasureCountByType.put(annotType, new Integer(fCount.intValue() + 1));
1608
1609 Double oldRecall = (Double) proc_recallByType.get(annotType);
1610 if (oldRecall == null)
1611 proc_recallByType.put(annotType,
1612 new Double(recallAverage));
1613 else
1614 proc_recallByType.put(annotType,
1615 new Double(oldRecall.doubleValue() +
1616 recallAverage));
1617 Integer recCount = (Integer) proc_recCountByType.get(annotType);
1618 if (recCount == null)
1619 proc_recCountByType.put(annotType, new Integer(1));
1620 else
1621 proc_recCountByType.put(annotType, new Integer(recCount.intValue() + 1));
1622
1623 Long oldMissingNo = (Long) proc_missingByType.get(annotType);
1625 if (oldMissingNo == null)
1626 proc_missingByType.put(annotType, new Long(annotDiffer.getMissing()));
1627 else
1628 proc_missingByType.put(annotType,
1629 new Long(oldMissingNo.longValue() +
1630 annotDiffer.getMissing()));
1631
1632 Long oldCorrectNo = (Long) proc_correctByType.get(annotType);
1633 if (oldCorrectNo == null)
1634 proc_correctByType.put(annotType, new Long(annotDiffer.getCorrectMatches()));
1635 else
1636 proc_correctByType.put(annotType,
1637 new Long(oldCorrectNo.longValue() +
1638 annotDiffer.getCorrectMatches()));
1639
1640 Long oldPartialNo = (Long) proc_partialByType.get(annotType);
1641 if (oldPartialNo == null)
1642 proc_partialByType.put(annotType,
1643 new Long(annotDiffer.getPartiallyCorrectMatches()));
1644 else
1645 proc_partialByType.put(annotType,
1646 new Long(oldPartialNo.longValue() +
1647 annotDiffer.getPartiallyCorrectMatches()));
1648
1649 Long oldSpuriousNo = (Long) proc_spurByType.get(annotType);
1650 if (oldSpuriousNo == null)
1651 proc_spurByType.put(annotType, new Long(annotDiffer.getSpurious()));
1652 else
1653 proc_spurByType.put(annotType,
1654 new Long(oldSpuriousNo.longValue() +
1655 annotDiffer.getSpurious()));
1656 }
1657
1658 public void printStatistics() {
1659
1660 Out.prln("<H2> Statistics </H2>");
1661
1662
1693 if (annotTypes == null) {
1694 Out.prln("No types given for evaluation, cannot obtain precision/recall");
1695 return;
1696 }
1697 Out.prln("<table border=1>");
1698 Out.prln("<TR> <TD><B>Annotation Type</B></TD> <TD><B>Correct</B></TD>" +
1699 "<TD><B>Partially Correct</B></TD> <TD><B>Missing</B></TD>" +
1700 "<TD><B>Spurious</B></TD> <TD><B>Precision</B></TD>" +
1701 "<TD><B>Recall</B></TD> <TD><B>F-Measure</B></TD> </TR>");
1702 String annotType;
1703 for (int i = 0; i < annotTypes.size(); i++) {
1704 annotType = (String) annotTypes.get(i);
1705 printStatsForType(annotType);
1706 } Out.prln("</table>");
1708 }
1710 protected void printStatsForType(String annotType) {
1711 long correct = (correctByType.get(annotType) == null) ? 0 :
1712 ( (Long) correctByType.get(annotType)).longValue();
1713 long partial = (partialByType.get(annotType) == null) ? 0 :
1714 ( (Long) partialByType.get(annotType)).longValue();
1715 long spurious = (spurByType.get(annotType) == null) ? 0 :
1716 ( (Long) spurByType.get(annotType)).longValue();
1717 long missing = (missingByType.get(annotType) == null) ? 0 :
1718 ( (Long) missingByType.get(annotType)).longValue();
1719 long actual = correct + partial + spurious;
1720 long possible = correct + partial + missing;
1721 double precision = 0d;
1724 if (actual!=0)
1725 precision = (correct + 0.5 * partial) / actual;
1726
1727 double recall = 0d;
1729 if (possible!=0)
1730 recall = (correct + 0.5 * partial) / possible;
1731
1732 double fmeasure = 0d;
1734 if ((beta * beta * precision) + recall !=0){
1735 fmeasure =
1736 ( (beta * beta + 1) * precision * recall)
1737 /
1738 ( (beta * beta * precision) + recall);
1739 }
1740
1741 long proc_correct = 0;
1742 long proc_partial = 0;
1743 long proc_spurious = 0;
1744 long proc_missing = 0;
1745 long proc_actual = 0;
1746 long proc_possible = 0;
1747 double proc_precision = 0;
1748 double proc_recall = 0;
1749 double proc_fmeasure = 0;
1750
1751 if (hasProcessed) {
1752 proc_correct = (proc_correctByType.get(annotType) == null) ? 0 :
1754 ( (Long) proc_correctByType.get(annotType)).longValue();
1755 proc_partial = (proc_partialByType.get(annotType) == null) ? 0 :
1756 ( (Long) proc_partialByType.get(annotType)).longValue();
1757 proc_spurious = (proc_spurByType.get(annotType) == null) ? 0 :
1758 ( (Long) proc_spurByType.get(annotType)).longValue();
1759 proc_missing = (proc_missingByType.get(annotType) == null) ? 0 :
1760 ( (Long) proc_missingByType.get(annotType)).longValue();
1761 proc_actual = proc_correct + proc_partial + proc_spurious;
1762 proc_possible = proc_correct + proc_partial + proc_missing;
1763 proc_precision = (proc_correct + 0.5 * proc_partial) / proc_actual;
1766 proc_recall = (proc_correct + 0.5 * proc_partial) / proc_possible;
1768 proc_fmeasure =
1770 ( (beta * beta + 1) * proc_precision * proc_recall)
1771 /
1772 ( (beta * beta * proc_precision) + proc_recall);
1773
1774 }
1775
1776 Out.prln("<TR>");
1778 if (hasProcessed)
1779 Out.prln("<TD>" + annotType + "_new" + "</TD>");
1780 else
1781 Out.prln("<TD>" + annotType + "</TD>");
1782
1783 Out.prln("<TD>" + correct + "</TD>");
1784 Out.prln("<TD>" + partial + "</TD>");
1785 Out.prln("<TD>" + missing + "</TD>");
1786 Out.prln("<TD>" + spurious + "</TD>");
1787
1788 String strPrec = (isMoreInfoMode) ?
1789 avgPrint(precision, 4)
1790 : Double.toString(precision);
1791 String strRec = (isMoreInfoMode) ?
1792 avgPrint(recall, 4)
1793 : Double.toString(recall);
1794 String strFmes = (isMoreInfoMode) ?
1795 avgPrint(fmeasure, 4)
1796 : Double.toString(fmeasure);
1797
1798 if (hasProcessed && (precision < proc_precision))
1799 Out.prln("<TD><Font color=red>" + strPrec + "</TD>");
1800 else if (hasProcessed && (precision > proc_precision))
1801 Out.prln("<TD><Font color=blue>" + strPrec + "</TD>");
1802 else
1803 Out.prln("<TD>" + strPrec + "</TD>");
1804 if (hasProcessed && (recall < proc_recall))
1805 Out.prln("<TD><Font color=red>" + strRec + "</TD>");
1806 else if (hasProcessed && (recall > proc_recall))
1807 Out.prln("<TD><Font color=blue>" + strRec + "</TD>");
1808 else
1809 Out.prln("<TD>" + strRec + "</TD>");
1810 Out.prln("<TD>" + strFmes + "</TD>");
1811 Out.prln("</TR>");
1812
1813 if (hasProcessed) {
1814 Out.prln("<TR>");
1816 Out.prln("<TD>" + annotType + "_old" + "</TD>");
1817
1818 Out.prln("<TD>" + proc_correct + "</TD>");
1819 Out.prln("<TD>" + proc_partial + "</TD>");
1820 Out.prln("<TD>" + proc_missing + "</TD>");
1821 Out.prln("<TD>" + proc_spurious + "</TD>");
1822
1823 String strProcPrec = (isMoreInfoMode) ?
1824 avgPrint(proc_precision, 4)
1825 : Double.toString(proc_precision);
1826 String strProcRec = (isMoreInfoMode) ?
1827 avgPrint(proc_recall, 4)
1828 : Double.toString(proc_recall);
1829 String strProcFmes = (isMoreInfoMode) ?
1830 avgPrint(proc_fmeasure, 4)
1831 : Double.toString(proc_fmeasure);
1832
1833 if (precision < proc_precision)
1834 Out.prln("<TD><Font color=red>" + strProcPrec + "</TD>");
1835 else if (precision > proc_precision)
1836 Out.prln("<TD><Font color=blue>" + strProcPrec + "</TD>");
1837 else
1838 Out.prln("<TD>" + strProcPrec + "</TD>");
1839 if (recall < proc_recall)
1840 Out.prln("<TD><Font color=red>" + strProcRec + "</TD>");
1841 else if (recall > proc_recall)
1842 Out.prln("<TD><Font color=blue>" + strProcRec + "</TD>");
1843 else
1844 Out.prln("<TD>" + strProcRec + "</TD>");
1845 Out.prln("<TD>" + strProcFmes + "</TD>");
1846 Out.prln("</TR>");
1847 }
1848 }
1850 protected String avgPrint(double value, int count) {
1852 double newvalue;
1853 double power = Math.pow(10, count);
1854 newvalue = Math.round(value * power) / power;
1855 return Double.toString(newvalue);
1856 }
1857
1858 private double precisionSumCalc = 0;
1859 private double recallSumCalc = 0;
1860 private double fMeasureSumCalc = 0;
1861
1862 public double getPrecisionAverageCalc() {
1863 return precisionSumCalc;
1864 }
1865
1866 public double getRecallAverageCalc() {
1867 return recallSumCalc;
1868 }
1869
1870 public double getFmeasureAverageCalc() {
1871 return fMeasureSumCalc;
1872 }
1873
1874 protected void calculateAvgTotal() {
1875 long correct, partial, spurious, missing;
1876 long correctSum, partialSum, spuriousSum, missingSum;
1877
1878 if (annotTypes == null) {
1879 return;
1880 }
1881 correctSum = partialSum = spuriousSum = missingSum = 0;
1882
1883 String annotType;
1884 for (int i = 0; i < annotTypes.size(); i++) {
1885 annotType = (String) annotTypes.get(i);
1886 correct = (correctByType.get(annotType) == null) ? 0 :
1887 ( (Long) correctByType.get(annotType)).longValue();
1888 partial = (partialByType.get(annotType) == null) ? 0 :
1889 ( (Long) partialByType.get(annotType)).longValue();
1890 spurious = (spurByType.get(annotType) == null) ? 0 :
1891 ( (Long) spurByType.get(annotType)).longValue();
1892 missing = (missingByType.get(annotType) == null) ? 0 :
1893 ( (Long) missingByType.get(annotType)).longValue();
1894 correctSum += correct;
1895 partialSum += partial;
1896 spuriousSum += spurious;
1897 missingSum += missing;
1898 }
1900 long actual = correctSum + partialSum + spuriousSum;
1901 long possible = correctSum + partialSum + missingSum;
1902
1903 if (actual == 0) {
1904 precisionSumCalc = 0;
1905 }
1906 else {
1907 precisionSumCalc = (correctSum + 0.5 * partialSum) / actual;
1908 }
1909
1910 if (possible == 0) {
1911 recallSumCalc = 0;
1912 }
1913 else {
1914 recallSumCalc = (correctSum + 0.5 * partialSum) / actual;
1915 }
1916
1917 if (precisionSumCalc == 0 && recallSumCalc == 0) {
1918 fMeasureSumCalc = 0;
1919 }
1920 else {
1921 fMeasureSumCalc =
1922 ( (beta * beta + 1) * precisionSumCalc * recallSumCalc)
1923 /
1924 ( (beta * beta * precisionSumCalc) + recallSumCalc);
1925
1926 }
1927 }
1929 protected AnnotationDiffer measureDocs(
1930 Document keyDoc, Document respDoc, String annotType) throws
1931 ResourceInstantiationException {
1932
1933 if (keyDoc == null || respDoc == null)
1934 return null;
1935
1936 if (annotSetName != null
1937 && keyDoc.getAnnotations(annotSetName).get(annotType) == null)
1938 return null;
1939 else if ( (annotSetName == null || annotSetName.equals(""))
1940 && keyDoc.getAnnotations().get(annotType) == null)
1941 return null;
1942
1943 AnnotationDiffer annotDiffer = new AnnotationDiffer();
1945 annotDiffer.setSignificantFeaturesSet(diffFeaturesSet);
1947 AnnotationSet keys, responses;
1949 if (annotSetName == null || annotSetName.equals("")) {
1950 keys = keyDoc.getAnnotations().get(annotType);
1951 responses = respDoc.getAnnotations().get(annotType);
1952 }
1953 else {
1954 keys = keyDoc.getAnnotations(annotSetName).get(annotType);
1955 responses = respDoc.getAnnotations(outputSetName).get(annotType);
1956 }
1957
1958 List pairings = annotDiffer.calculateDiff(keys, responses);
1960 return annotDiffer;
1961 }
1963 protected void storeAnnotations(String type, AnnotationDiffer annotDiffer,
1964 Document keyDoc, Document respDoc,
1965 Writer errFileWriter) {
1966 if (errFileWriter == null)return;
1968 try {
1969 Comparator comp = new OffsetComparator();
1971 TreeSet sortedSet = new TreeSet(comp);
1972 Set missingSet =
1973 annotDiffer.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
1974 sortedSet.clear();
1975 sortedSet.addAll(missingSet);
1976 storeAnnotations(type + ".miss", sortedSet, keyDoc, errFileWriter);
1977 Set spuriousSet =
1978 annotDiffer.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
1979 sortedSet.clear();
1980 sortedSet.addAll(spuriousSet);
1981 storeAnnotations(type + ".spur", sortedSet, respDoc, errFileWriter);
1982 Set partialSet =
1983 annotDiffer.getAnnotationsOfType(AnnotationDiffer.
1984 PARTIALLY_CORRECT_TYPE);
1985 sortedSet.clear();
1986 sortedSet.addAll(partialSet);
1987 storeAnnotations(type + ".part", sortedSet, respDoc, errFileWriter);
1988 }
1989 catch (Exception ex) {
1990 Out.prln("Exception on close of error file " + errFileWriter + ": "
1991 + ex.getMessage());
1992 }
1993 }
1995 protected void storeAnnotations(String type, Set set, Document doc,
1996 Writer file) throws IOException {
1997
1998 if (set == null || set.isEmpty())
1999 return;
2000
2001 Iterator iter = set.iterator();
2002 Annotation ann;
2003 while (iter.hasNext()) {
2004 ann = (Annotation) iter.next();
2005 file.write(type);
2006 file.write(".");
2007 file.write(doc.getContent().toString().substring(
2008 ann.getStartNode().getOffset().intValue(),
2009 ann.getEndNode().getOffset().intValue()));
2010 file.write(".");
2011 file.write(ann.getStartNode().getOffset().toString());
2012 file.write(".");
2013 file.write(ann.getEndNode().getOffset().toString());
2014 file.write("\n");
2015 } }
2018 protected void printAnnotations(AnnotationDiffer annotDiff,
2019 Document keyDoc, Document respDoc) {
2020 Out.pr("MISSING ANNOTATIONS in the automatic texts: ");
2021 Set missingSet =
2022 annotDiff.getAnnotationsOfType(AnnotationDiffer.MISSING_TYPE);
2023 printAnnotations(missingSet, keyDoc);
2024 Out.prln("<BR>");
2025
2026 Out.pr("SPURIOUS ANNOTATIONS in the automatic texts: ");
2027 Set spuriousSet =
2028 annotDiff.getAnnotationsOfType(AnnotationDiffer.SPURIOUS_TYPE);
2029 printAnnotations(spuriousSet, respDoc);
2030 Out.prln("</BR>");
2031
2032 Out.pr("PARTIALLY CORRECT ANNOTATIONS in the automatic texts: ");
2033 Set partialSet =
2034 annotDiff.getAnnotationsOfType(AnnotationDiffer.PARTIALLY_CORRECT_TYPE);
2035 printAnnotations(partialSet, respDoc);
2036 }
2037
2038 protected void printAnnotations(Set set, Document doc) {
2039 if (set == null || set.isEmpty())
2040 return;
2041
2042 Iterator iter = set.iterator();
2043 while (iter.hasNext()) {
2044 Annotation ann = (Annotation) iter.next();
2045 Out.prln(
2046 "<B>" +
2047 doc.getContent().toString().substring(
2048 ann.getStartNode().getOffset().intValue(),
2049 ann.getEndNode().getOffset().intValue()) +
2050 "</B>: <I>[" + ann.getStartNode().getOffset() +
2051 "," + ann.getEndNode().getOffset() + "]</I>"
2052 );
2054 } }
2057
2060 private File startDir;
2061 private File currDir;
2062 private static List annotTypes;
2063
2064 private Controller application = null;
2065 private File applicationFile = null;
2066
2067 private double precisionSum = 0.0;
2071 private double recallSum = 0.0;
2072 private double fMeasureSum = 0.0;
2073 private HashMap precisionByType = new HashMap();
2074 private HashMap prCountByType = new HashMap();
2075 private HashMap recallByType = new HashMap();
2076 private HashMap recCountByType = new HashMap();
2077 private HashMap fMeasureByType = new HashMap();
2078 private HashMap fMeasureCountByType = new HashMap();
2079
2080 private HashMap missingByType = new HashMap();
2081 private HashMap spurByType = new HashMap();
2082 private HashMap correctByType = new HashMap();
2083 private HashMap partialByType = new HashMap();
2084
2085 static boolean hasProcessed = false;
2087 private double proc_precisionSum = 0;
2088 private double proc_recallSum = 0;
2089 private double proc_fMeasureSum = 0;
2090 private HashMap proc_precisionByType = new HashMap();
2091 private HashMap proc_prCountByType = new HashMap();
2092 private HashMap proc_recallByType = new HashMap();
2093 private HashMap proc_recCountByType = new HashMap();
2094 private HashMap proc_fMeasureByType = new HashMap();
2095 private HashMap proc_fMeasureCountByType = new HashMap();
2096
2097 private HashMap proc_missingByType = new HashMap();
2098 private HashMap proc_spurByType = new HashMap();
2099 private HashMap proc_correctByType = new HashMap();
2100 private HashMap proc_partialByType = new HashMap();
2101
2102 double beta = 1;
2103
2104 private int docNumber = 0;
2105
2106
2110 private boolean isGenerateMode = false;
2111
2112
2115 private boolean isVerboseMode = false;
2116
2117
2120 private boolean isMoreInfoMode = false;
2121
2122
2126 private Set diffFeaturesSet;
2127
2128
2132 private boolean isMarkedStored = false;
2133 private boolean isMarkedClean = false;
2134
2135 private boolean isMarkedDS = false;
2137
2138 private String annotSetName = "Key";
2139 private String outputSetName = null;
2140
2141 private double threshold = 0.5;
2142 private Properties configs = new Properties();
2143 private static int corpusWordCount = 0;
2144
2145 private String documentEncoding = "";
2146
2147
2148 private static String usage =
2149 "usage: CorpusBenchmarkTool [-generate|-marked_stored|-marked_clean] "
2150 + "[-verbose] [-moreinfo] directory-name application";
2151
2152}
2153