1
15
16 package gate.creole;
17
18 import java.io.File;
19 import java.util.*;
20
21 import gate.*;
22 import gate.creole.gazetteer.DefaultGazetteer;
23 import gate.creole.orthomatcher.OrthoMatcher;
24 import gate.creole.splitter.SentenceSplitter;
25 import gate.creole.tokeniser.DefaultTokeniser;
26 import gate.util.GateException;
27 import gate.util.Out;
28 import gate.util.profile.Profiler;
29
31
45 public class ProfilePRs {
46
47
48 private static String usage =
49 "usage: ProfilePRs [-dir directory-name | file(s)]";
50
51 private static double totalDocLength = 0;
52 private static int docs = 0;
53 private static Profiler prof = new Profiler();
54 private static double maxDocLength = 0;
55
56
57 public static void main(String[] args) throws Exception {
58 Out.prln("processing command line arguments");
60
61 List inputFiles = null;
63 if(args.length < 1) throw new GateException(usage);
64 if(args[0].equals("-dir")) { if(args.length < 2) throw new GateException(usage);
66 File dir = new File(args[1]);
67 File[] filesArray = dir.listFiles();
68 if(filesArray == null)
69 throw new GateException(
70 dir.getPath() + " is not a directory; " + usage
71 );
72 inputFiles = Arrays.asList(filesArray);
73 } else { inputFiles = new ArrayList();
75 for(int i = 0; i < args.length; i++)
76 inputFiles.add(new File(args[i]));
77 }
78
79 prof.initRun("Measuring performance on directory " + args[1]);
80
83 prof.checkPoint("Before GATE.init()");
85 Gate.init();
86
89
90 prof.checkPoint("Before creating the processing resources");
92
93 FeatureMap params = Factory.newFeatureMap();
95 DefaultTokeniser tokeniser = (DefaultTokeniser) Factory.createResource(
96 "gate.creole.tokeniser.DefaultTokeniser", params);
97 prof.checkPoint("Tokeniser initialised");
98
99 params = Factory.newFeatureMap();
101 DefaultGazetteer gaz = (DefaultGazetteer) Factory.createResource(
102 "gate.creole.gazetteer.DefaultGazetteer", params);
103 prof.checkPoint("Gazetteer initialised");
104
105 params = Factory.newFeatureMap();
107 SentenceSplitter splitter = (SentenceSplitter) Factory.createResource(
108 "gate.creole.splitter.SentenceSplitter", params);
109 prof.checkPoint("Sentence splitter initialised");
110
111 params = Factory.newFeatureMap();
113 POSTagger tagger = (POSTagger) Factory.createResource(
114 "gate.creole.POSTagger", params);
115 prof.checkPoint("POSTagger initialised");
116
117 params = Factory.newFeatureMap();
119 ANNIETransducer transducer = (ANNIETransducer) Factory.createResource(
120 "gate.creole.ANNIETransducer", params);
121 prof.checkPoint("Grammars initialised");
122
123 params = Factory.newFeatureMap();
125 OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
126 "gate.creole.orthomatcher.OrthoMatcher", params);
127 prof.checkPoint("Orthomatcher initialised");
128
129
130 Out.prln("\nLooping on input files list");
137 Iterator filesIter = inputFiles.iterator();
138 docs = inputFiles.size();
139 int fileNo=0;
140 while(filesIter.hasNext()) {
141 File inFile = (File) filesIter.next(); fileNo++;
143
144 params.clear();
146 params.put(Document.DOCUMENT_URL_PARAMETER_NAME, inFile.toURL().toExternalForm());
147 params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
148
149 Document doc = (Document) Factory.createResource(
151 "gate.corpora.DocumentImpl", params
152 );
153 totalDocLength += doc.getContent().size().longValue();
154
155 if (maxDocLength < doc.getContent().size().longValue())
156 maxDocLength = doc.getContent().size().longValue();
157
158 tokeniser.setDocument(doc);
160 prof.checkPoint("Processing file " + inFile.getPath() +
161 ", #" + fileNo + "/" + docs, new String[0], true, false, false);
162 tokeniser.execute();
163 prof.checkPoint("", new String[] {"Tokenizer", "Processing"}, false, false, false);
164
165 gaz.setDocument(doc);
167 gaz.execute();
168 prof.checkPoint("", new String[] {"Gazettier", "Processing"}, false, false, false);
169
170 splitter.setDocument(doc);
172 splitter.execute();
173 prof.checkPoint("", new String[] {"Splitter", "Processing"}, false, false, false);
174
175 tagger.setDocument(doc);
177 tagger.execute();
178 prof.checkPoint("", new String[] {"Tagger", "Processing"}, false, false, false);
179
180 transducer.setDocument(doc);
182 transducer.execute();
183 prof.checkPoint("", new String[] {"JAPE grammars", "Processing"}, false, false, false);
184
185 orthomatcher.setDocument(doc);
187 orthomatcher.execute();
188 prof.checkPoint("", new String[] {"Orthomatcher", "Processing"}, false, false, false);
189
190 Factory.deleteResource(doc);
192
193 }
195 prof.checkPoint("Done!");
196
197 totalDocLength = (double) totalDocLength/1024;
198 Out.prln("\nTotal KBytes processed: " + (long)totalDocLength);
199 Out.prln("\nMax document size in bytes: " + (long)maxDocLength +
200 " (" + (long) maxDocLength/1024 + " Kb)");
201
202
203 prof.printCategAvg("Processing", docs, totalDocLength, "kb");
204 prof.printCategAvg("Tokenizer", docs, totalDocLength, "kb");
205 prof.printCategAvg("Gazettier", docs, totalDocLength, "kb");
206 prof.printCategAvg("Splitter", docs, totalDocLength, "kb");
207 prof.printCategAvg("Tagger", docs, totalDocLength, "kb");
208 prof.printCategAvg("JAPE grammars", docs, totalDocLength, "kb");
209 prof.printCategAvg("Orthomatcher", docs, totalDocLength, "kb");
210 }
212
213 }