1   /*
2    *  ProfilePRs.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 04/10/2001
12   *
13   *  $Id: ProfilePRs.java,v 1.7 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.creole;
17  
18  import java.io.File;
19  import java.util.*;
20  
21  import gate.*;
22  import gate.creole.gazetteer.DefaultGazetteer;
23  import gate.creole.orthomatcher.OrthoMatcher;
24  import gate.creole.splitter.SentenceSplitter;
25  import gate.creole.tokeniser.DefaultTokeniser;
26  import gate.util.GateException;
27  import gate.util.Out;
28  import gate.util.profile.Profiler;
29  //import java.text.NumberFormat;
30  
31  /**
32   * This class provides a main function that:
33   * <UL>
34   * <LI>
35   * initialises the GATE library, and creates all PRs
36   * <LI>
37   * takes a directory name as argument
38   * <LI>
39   * for each .html file in that directory:
40   * <BR>  create a GATE document from the file
41   * <BR>  run the PRs on the document
42   * <BR>  dump some statistics in the end
43   * </UL>
44   */
45  public class ProfilePRs {
46  
47    /** String to print when wrong command-line args */
48    private static String usage =
49      "usage: ProfilePRs [-dir directory-name | file(s)]";
50  
51    private static double totalDocLength = 0;
52    private static int docs = 0;
53    private static Profiler prof = new Profiler();
54    private static double maxDocLength = 0;
55  
56    /** Main function */
57    public static void main(String[] args) throws Exception {
58      // say "hi"
59      Out.prln("processing command line arguments");
60  
61      // check we have a directory name or list of files
62      List inputFiles = null;
63      if(args.length < 1) throw new GateException(usage);
64      if(args[0].equals("-dir")) { // list all the files in the dir
65        if(args.length < 2) throw new GateException(usage);
66        File dir = new File(args[1]);
67        File[] filesArray = dir.listFiles();
68        if(filesArray == null)
69          throw new GateException(
70            dir.getPath() + " is not a directory; " + usage
71          );
72        inputFiles = Arrays.asList(filesArray);
73      } else { // all args should be file names
74        inputFiles = new ArrayList();
75        for(int i = 0; i < args.length; i++)
76          inputFiles.add(new File(args[i]));
77      }
78  
79      prof.initRun("Measuring performance on directory " + args[1]);
80  //    prof.enable(false);
81  //    prof.enableGCCalling(false);
82  
83      // initialise GATE
84      prof.checkPoint("Before GATE.init()");
85      Gate.init();
86      //tell GATE we're in batch mode
87  //    gate.Main.batchMode = true;
88  
89  
90      // create some processing resources
91      prof.checkPoint("Before creating the processing resources");
92  
93      //create a default tokeniser
94      FeatureMap params = Factory.newFeatureMap();
95      DefaultTokeniser tokeniser = (DefaultTokeniser) Factory.createResource(
96                      "gate.creole.tokeniser.DefaultTokeniser", params);
97      prof.checkPoint("Tokeniser initialised");
98  
99      //create a default gazetteer
100     params = Factory.newFeatureMap();
101     DefaultGazetteer gaz = (DefaultGazetteer) Factory.createResource(
102                           "gate.creole.gazetteer.DefaultGazetteer", params);
103     prof.checkPoint("Gazetteer initialised");
104 
105     //create a splitter
106     params = Factory.newFeatureMap();
107     SentenceSplitter splitter = (SentenceSplitter) Factory.createResource(
108                           "gate.creole.splitter.SentenceSplitter", params);
109     prof.checkPoint("Sentence splitter initialised");
110 
111     //create a tagger
112     params = Factory.newFeatureMap();
113     POSTagger tagger = (POSTagger) Factory.createResource(
114                           "gate.creole.POSTagger", params);
115     prof.checkPoint("POSTagger initialised");
116 
117     //create a grammar
118     params = Factory.newFeatureMap();
119     ANNIETransducer transducer = (ANNIETransducer) Factory.createResource(
120                           "gate.creole.ANNIETransducer", params);
121     prof.checkPoint("Grammars initialised");
122 
123     //create an orthomatcher
124     params = Factory.newFeatureMap();
125     OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
126                           "gate.creole.orthomatcher.OrthoMatcher", params);
127     prof.checkPoint("Orthomatcher initialised");
128 
129 
130     // for each document
131     //   create a gate doc
132     //   set as the document for hte PRs
133     //   run the PRs
134     //   dump output from the doc
135     //   delete the doc
136     Out.prln("\nLooping on input files list");
137     Iterator filesIter = inputFiles.iterator();
138     docs = inputFiles.size();
139     int fileNo=0;
140     while(filesIter.hasNext()) {
141       File inFile = (File) filesIter.next(); // the current file
142       fileNo++;
143 
144       // set the source URL parameter to a "file:..." URL string
145       params.clear();
146       params.put(Document.DOCUMENT_URL_PARAMETER_NAME, inFile.toURL().toExternalForm());
147       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
148 
149       // create the document
150       Document doc = (Document) Factory.createResource(
151         "gate.corpora.DocumentImpl", params
152       );
153       totalDocLength += doc.getContent().size().longValue();
154 
155       if (maxDocLength < doc.getContent().size().longValue())
156         maxDocLength = doc.getContent().size().longValue();
157 
158       // set the document param on the PRs
159       tokeniser.setDocument(doc);
160       prof.checkPoint("Processing file " + inFile.getPath() +
161           ", #" + fileNo + "/" + docs, new String[0], true, false, false);
162       tokeniser.execute();
163       prof.checkPoint("", new String[] {"Tokenizer", "Processing"}, false, false, false);
164 
165       //run gazetteer
166       gaz.setDocument(doc);
167       gaz.execute();
168       prof.checkPoint("", new String[] {"Gazettier", "Processing"}, false, false, false);
169 
170       //run splitter
171       splitter.setDocument(doc);
172       splitter.execute();
173       prof.checkPoint("", new String[] {"Splitter", "Processing"}, false, false, false);
174 
175       //run the tagger
176       tagger.setDocument(doc);
177       tagger.execute();
178       prof.checkPoint("", new String[] {"Tagger", "Processing"}, false, false, false);
179 
180       //run the transducer
181       transducer.setDocument(doc);
182       transducer.execute();
183       prof.checkPoint("", new String[] {"JAPE grammars", "Processing"}, false, false, false);
184 
185       // run the orthomatcher
186       orthomatcher.setDocument(doc);
187       orthomatcher.execute();
188       prof.checkPoint("", new String[] {"Orthomatcher", "Processing"}, false, false, false);
189 
190       // make the doc a candidate for garbage collection
191       Factory.deleteResource(doc);
192 
193     } // input files loop
194 
195     prof.checkPoint("Done!");
196 
197     totalDocLength = (double) totalDocLength/1024;
198     Out.prln("\nTotal KBytes processed: " + (long)totalDocLength);
199     Out.prln("\nMax document size in bytes: " + (long)maxDocLength +
200       " (" + (long) maxDocLength/1024 + " Kb)");
201 
202 
203     prof.printCategAvg("Processing", docs, totalDocLength, "kb");
204     prof.printCategAvg("Tokenizer", docs, totalDocLength, "kb");
205     prof.printCategAvg("Gazettier", docs, totalDocLength, "kb");
206     prof.printCategAvg("Splitter", docs, totalDocLength, "kb");
207     prof.printCategAvg("Tagger", docs, totalDocLength, "kb");
208     prof.printCategAvg("JAPE grammars", docs, totalDocLength, "kb");
209     prof.printCategAvg("Orthomatcher", docs, totalDocLength, "kb");
210   } // main
211 
212 
213 } // class ProfilePRs
214