1   /*
2    *  CookBook.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 16/Feb/2000
12   *
13   *  $Id: CookBook.java,v 1.35 2005/01/11 13:51:30 ian Exp $
14   */
15  
16  package gate;
17  
18  import java.io.*;
19  import java.util.*;
20  
21  import junit.framework.*;
22  
23  import gate.creole.*;
24  import gate.creole.gazetteer.DefaultGazetteer;
25  import gate.creole.orthomatcher.OrthoMatcher;
26  import gate.creole.splitter.SentenceSplitter;
27  import gate.creole.tokeniser.DefaultTokeniser;
28  import gate.util.*;
29  
30  
31  /**
32    * <P><B>NOTE: this class has been REPLACED by the GateExamples package;
33    * see
34    * <A HREF=http://gate.ac.uk/GateExamples/doc/>http://gate.ac.uk/GateExamples/doc/</A>.</B>
35    *
36    * <P>
37    * This class provides examples of using the GATE APIs.
38    * Read this documentation along with a copy of the
39    * <A HREF=http://gate.ac.uk/gate/doc/java2html/gate/CookBook.java.html>source
40    * code</A>.
41    *
42    * <P>
43    * The CookBook is set up as
44    * part of the GATE test suite (using the
45    * <A HREF="http://www.junit.org/">JUnit testing framework</A>), so there's
46    * an easy way to run the examples (viz.,
47    * <A HREF=../gate/TestGate.html>gate.TestGate</A>'s <TT>main</TT> method,
48    * which will invoke the
49    * JUnit test runner). Also, we can use JUnit's assert methods: e.g.
50    * <TT>assertTrue(corpus.isEmpty());</TT>
51    * tests that a corpus object is empty, and creates a test failure report if
52    * this is not the case. (To add a new test class to the suite, see the
53    * <A HREF=../gate/util/TestTemplate.html>gate.util.TestTemplate</A> class.)
54    *
55    * <P>
56    * Programming to the GATE Java API involves manipulating the classes and
57    * interfaces in the <A HREF=package-summary.html>gate package</A>
58    * (and to a lesser extent other packages). These are
59    * often interfaces; classes there are often to do with getting
60    * access to objects that implement the interfaces (without exposing those
61    * implementations). In other words, there's a lot of interface-based design
62    * around.
63    *
64    * <P>
65    * For more details and for a conceptual view, see
66    * <A HREF=http://gate.ac.uk/sale/tao/>Developing Language Processing
67    * Components with GATE</A> (for which this class provides some of the
68    * examples).
69    *
70    * <P>
71    * The rest of this documentation refers to methods in the code that
72    * provide examples of using the GATE API.
73    *
74    * <P>
75    * The <A HREF=#testResourceCreation()>testResourceCreation</A> method gives
76    * an example of creating a resource via
77    * <A HREF=../gate/Factory.html>gate.Factory</A>.
78    *
79    * <P>
80    * The <A HREF=Corpus.html>Corpus interface</A> represents collections of
81    * <A HREF=Document.html>Documents</A> (and takes the place of the old TIPSTER
82    * <TT>Collection</TT> class).
83    *
84    * <P>
85    * The <A HREF=#testCorpusConstruction()>testCorpusConstruction</A> method
86    * gives an example of how to create a new transient Corpus object.
87    *
88    * <P>
89    * The <A HREF=#testAddingDocuments()>testAddingDocuments</A> method gives
90    * examples of adding documents to corpora.
91    *
92    * <P>
93    * The <A HREF=#testAddingAnnotations()>testAddingAnnotations</A> method gives
94    * examples of adding annotations to documents.
95    *
96    *
97    * <P>
98    * The <A HREF=#testUsingFeatures()>testUsingFeatures</A> method gives
99    * examples of using features. <A HREF=FeatureMap.html>The FeatureMap
100   * interface</A> is a mechanism for associating arbitrary data with GATE
101   * entities. Corpora, documents and annotations all share this
102   * mechanism. Simple feature maps use Java's Map interface.
103   *
104   *
105   * <H3>Other sources of examples</H3>
106   *
107   * <P>
108   * See also the other test classes, although note that they also use methods
109   * that are not part of the public API. Test classes include:
110   * <A HREF=corpora/TestCreole.html>TestCreole</A>;
111   * <A HREF=corpora/TestCorpus.html>TestCorpus</A>;
112   * <A HREF=corpora/TestDocument.html>TestDocument</A>;
113   * <A HREF=corpora/TestAnnotation.html>TestAnnotation</A>; anything
114   * else starting "Test" - about 30 of them at the last count.
115   */
116 public class CookBook extends TestCase
117 {
118   /** Debug flag */
119   private static final boolean DEBUG = false;
120 
121   /** A corpus */
122   Corpus corpus = null;
123 
124   /** A document */
125   Document doc1 = null;
126 
127   /** Another document */
128   Document doc2 = null;
129 
130   /** Constructing a resource */
131   public void testResourceCreation() throws GateException {
132 
133     // before creating a resource we need a feature map to store
134     // parameter values
135     FeatureMap params = Factory.newFeatureMap();
136 
137     // to create a document we need a sourceUrlName parameter giving
138     // the location of the source for the document content
139     params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
140       Gate.getUrl("tests/doc0.html"));
141     params.put(Document.DOCUMENT_MARKUP_AWARE_PARAMETER_NAME,
142       new Boolean(true));
143     Resource res = Factory.createResource("gate.corpora.DocumentImpl", params);
144 
145     // now we have a document
146     assertTrue(
147       "should be document but the class is: " + res.getClass().getName(),
148       res instanceof gate.Document
149     );
150     Document doc = (Document) res;
151     AnnotationSet markupAnnotations = doc.getAnnotations(
152                         GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
153     //this is useless as doc.getAnnotations() will never return null!
154     assertNotNull("no markup annotations on doc " + doc, markupAnnotations);
155     int numMarkupAnnotations = markupAnnotations.size();
156     if(DEBUG)
157       Out.prln("annotations on doc after unpack= " + numMarkupAnnotations);
158     assertTrue(
159       "wrong number annots on doc: " + doc + numMarkupAnnotations,
160       numMarkupAnnotations == 27
161     );
162 
163   } // testResourceCreation
164 
165   /** Constructing a corpus */
166   public void testCorpusConstruction() throws GateException {
167 
168     // corpus constructors require a name
169     corpus = Factory.newCorpus("My example corpus");
170 
171     // the corpus interface inherits all the sorted set methods
172     assertTrue(corpus.isEmpty());
173 
174   } // testCorpusConstruction
175 
176   /** Adding documents to a corpus */
177   public void testAddingDocuments() throws GateException {
178 
179     corpus = Factory.newCorpus("My example corpus");
180 
181     // add a document or two....
182     corpus.add(doc1);
183     corpus.add(doc2);
184 
185     // iterate the corpus members and do some random tests
186     Iterator iter = corpus.iterator();
187     while(iter.hasNext()) {
188       Document doc = (Document) iter.next();
189       assertTrue(
190         "document url not as expected",
191         doc.getSourceUrl().toExternalForm().endsWith("doc0.html") ||
192           doc.getSourceUrl().toExternalForm().endsWith("test1.htm")
193       );
194     } // while
195 
196   } // testAddingDocuments
197 
198   /** Adding annotations to documents */
199   public void testAddingAnnotations() {
200     AnnotationSet as = doc1.getAnnotations();
201     FeatureMap fm = doc1.getFeatures();
202     Integer id;
203 
204     // during creation of annotations offsets are checked and an invalid
205     // offset exception thrown if they are invalid
206     try {
207       id = as.add(new Long(10), new Long(20), "T1", fm);
208     } catch (InvalidOffsetException e) {
209       fail(e.toString());
210     }
211   } // testAddingAnnotations
212 
213   /** Using the FeatureMap interface */
214   public void testUsingFeatures() {
215     AnnotationSet as = doc1.getAnnotations();
216     Integer id; // the id of new annotations
217 
218     // putting features on documents
219     FeatureMap fm = Factory.newFeatureMap();
220     doc1.setFeatures(fm);
221     assertTrue(fm.size() == 0);
222     fm.put("author", "segovia");
223     assertTrue(fm.get("author").equals("segovia"));
224     fm.put("author", "brendl"); // map puts overwrite existing values
225     assertTrue(fm.get("author").equals("brendl"));
226     assertTrue(fm.size() == 1);
227 
228   } // testUsingFeatures
229 
230   /** String to print when wrong command-line args */
231   private static String usage =
232     "usage: CookBook [-dir directory-name | file(s)]";
233 
234   /**
235    * Main function: an example of embedding GATE-based
236    * batch processing. The method:
237    * <UL>
238    * <LI>
239    * initialises the GATE library, and creates PRs for
240    * tokenisation, sentence splitting and part of speech tagging
241    * <LI>
242    * takes a directory name as argument (-dir option) or just a list
243    * of files
244    * <LI>
245    * creates a directory called "out" and an index.html file there
246    * <LI>
247    * for each .html file in that directory:
248    * <BR> create a GATE document from the file
249    * <BR> run the PRs on the document
250    * <BR> dump some output for the file to "out/gate__[file name].txt",
251    * and add a line to the index
252    * </UL>
253    */
254   public static void main(String[] args) throws Exception {
255     // say "hi"
256     Out.prln("CookBook.main");
257     Out.prln("processing command line arguments");
258 
259     // check we have a directory name or list of files
260     List inputFiles = null;
261     if(args.length < 1) throw new GateException(usage);
262 
263     // set up a list of all the files to process
264     if(args[0].equals("-dir")) { // list all the files in the dir
265       if(args.length < 2) throw new GateException(usage);
266       File dir = new File(args[1]);
267       File[] filesArray = dir.listFiles();
268       if(filesArray == null)
269         throw new GateException(
270           dir.getPath() + " is not a directory; " + usage
271         );
272       inputFiles = Arrays.asList(filesArray);
273 
274     } else { // all args should be file names
275       inputFiles = new ArrayList();
276       for(int i = 0; i < args.length; i++)
277         inputFiles.add(new File(args[i]));
278     }
279 
280     // did we get some file names?
281     if(inputFiles.isEmpty()) {
282       throw new GateException("No files to process!");
283     }
284 
285     // initialise GATE
286     Out.prln("initialising GATE");
287     Gate.init();
288 
289     // create some processing resources
290     Out.prln("creating PRs");
291     //create a tokeniser
292     DefaultTokeniser tokeniser = (DefaultTokeniser)Factory.createResource(
293                                       "gate.creole.tokeniser.DefaultTokeniser");
294     //create a sentence splitter
295     SentenceSplitter splitter = (SentenceSplitter)Factory.createResource(
296                                       "gate.creole.splitter.SentenceSplitter");
297     //create a POS tagger
298     POSTagger tagger = (POSTagger)Factory.createResource(
299                                       "gate.creole.POSTagger");
300 
301     //create  a gazetteer
302     DefaultGazetteer gazetteer = (DefaultGazetteer)Factory.createResource(
303                                       "gate.creole.gazetteer.DefaultGazetteer");
304 
305     //create a grammar
306     ANNIETransducer transducer = (ANNIETransducer)Factory.createResource(
307                                       "gate.creole.ANNIETransducer");
308 
309     //create an orthomatcher
310     OrthoMatcher orthomatcher = (OrthoMatcher) Factory.createResource(
311                                 "gate.creole.orthomatcher.OrthoMatcher");
312 
313     // make the "out" directory that will contain the results.
314     String outDirName =
315       ((File) inputFiles.get(0)).getParent() + Strings.getFileSep() + "out";
316     if(! new File(outDirName).mkdir()){
317       throw new GateException("Could not create the output directory");
318     }
319 
320     // construct a name for the output index file; open; dump header
321     String nl = Strings.getNl(); // shorthand for platform's newline
322     String fsep =
323       Strings.getFileSep(); // shorthand for platform's file separator
324     String indexName =
325       ( (File) inputFiles.get(0) ).getParent() + fsep + "index.html";
326     FileWriter indexWriter = new FileWriter(new File(indexName));
327     indexWriter.write("<HTML><HEAD><TITLE>Documents list</TITLE></HEAD>");
328     indexWriter.write(nl + "<BODY>" + nl + "<UL>" + nl);
329 
330     // main loop:
331     // for each document
332     //   create a gate doc
333     //   set as the document for the PRs
334     //   run the PRs
335     //   dump output from the doc to out/gate__.....txt
336     //   delete the doc
337 
338     // loop on files list
339     Iterator filesIter = inputFiles.iterator();
340     Out.prln("looping on input files list");
341     while(filesIter.hasNext()) {
342       File inFile = (File) filesIter.next(); // the current file
343       Out.prln("processing file " + inFile.getPath());
344       FeatureMap params = Factory.newFeatureMap(); // params list for new doc
345 
346       // set the source URL parameter to a "file:..." URL string
347       params.put(Document.DOCUMENT_URL_PARAMETER_NAME,
348         inFile.toURL().toExternalForm());
349 
350       // use the platform's default encoding rather than GATE's
351       params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, "");
352 
353       // create the document
354       Document doc = (Document) Factory.createResource(
355         "gate.corpora.DocumentImpl", params
356       );
357 
358       // set the document param on the PRs
359        tokeniser.setDocument(doc);
360        splitter.setDocument(doc);
361        tagger.setDocument(doc);
362        gazetteer.setDocument(doc);
363        transducer.setDocument(doc);
364        orthomatcher.setDocument(doc);
365 
366       // run each PR
367       tokeniser.execute();
368       splitter.execute();
369       tagger.execute();
370       gazetteer.execute();
371       transducer.execute();
372       orthomatcher.execute();
373 
374       // dump out results
375 
376       // construct a name for the output file and open a stream
377       StringBuffer outFileName = new StringBuffer(inFile.getParent());
378       outFileName.append(fsep);
379       outFileName.append("out");
380       outFileName.append(fsep);
381       outFileName.append("gate__");
382       outFileName.append(inFile.getName());
383       outFileName.append(".txt");
384       File outFile = new File(outFileName.toString());
385       FileWriter outFileWriter = new FileWriter(outFile);
386       Out.prln("dumping " + outFile.getPath());
387 
388       // iterate round the token annotations writing to the out file
389       // NOTE: to dump all to XML: outFileWriter.write(doc.toXml(tokens));
390       AnnotationSet tokens = doc.getAnnotations("nercAS").
391         get(ANNIEConstants.TOKEN_ANNOTATION_TYPE);
392       Iterator iter = tokens.iterator();
393       while(iter.hasNext()) {
394         Annotation token = (Annotation) iter.next();
395         FeatureMap tokFeats = token.getFeatures();
396         String tokStr = (String) tokFeats.
397           get(ANNIEConstants.TOKEN_STRING_FEATURE_NAME);
398         String tokPos = (String) tokFeats.
399           get(ANNIEConstants.TOKEN_CATEGORY_FEATURE_NAME);
400         outFileWriter.write(tokStr + "\t" + tokPos + nl);
401       }
402       outFileWriter.write(doc.getFeatures().get("entitySet").toString());
403 
404       // close the out file stream; add an index line
405       outFileWriter.close();
406       indexWriter.write(
407         "<LI><A href=\"" + inFile.getName() + "\">" + inFile.getName() +
408         "</a>" + " -> " + "<a href=\"" + "out" + fsep + outFile.getName() +
409         "\">" + "out" + fsep + outFile.getName() + "</a></LI>\n"
410       );
411 
412       // make the doc a candidate for garbage collection
413       Out.prln("deleting gate doc");
414 
415       Factory.deleteResource(doc);
416     } // input files loop
417 
418     // finish the index file
419     indexWriter.write(nl + "</UL>" + nl + "</BODY></HTML>" + nl);
420     indexWriter.close();
421 
422     Out.prln("The End (roll credits)");
423   } // main
424 
425   /** Fixture set up: initialise members before each test method */
426   public void setUp() throws GateException, IOException {
427     corpus = Factory.newCorpus("My example corpus");
428 
429     doc1 = Factory.newDocument(Gate.getUrl("tests/doc0.html"));
430     doc2 = Factory.newDocument(Gate.getUrl("tests/html/test1.htm"));
431   } // setUp
432 
433   /** Construction */
434   public CookBook(String name) { super(name); }
435 
436   /** Test suite routine for the test runner */
437   public static Test suite() {
438     return new TestSuite(CookBook.class);
439   } // suite
440 
441 } // class CookBook
442