| Batch.java |
1 /*
2 * Batch.java - transducer class
3 *
4 * Copyright (c) 1998-2005, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Hamish Cunningham, 10/08/98
12 *
13 * $Id: Batch.java,v 1.40 2005/10/10 10:39:51 valyt Exp $
14 *
15 * DEVELOPER NOTES:
16 *
17 * This is one that got away; the relation between constructors,
18 * initTransducer and parseTransducer are totally screwy and get worse
19 * every time I add something (e.g. support for resource loading).
20 * We should probably junk this whole thing and start again....
21 */
22
23 package gate.jape;
24
25 import java.io.IOException;
26 import java.net.URL;
27 import java.util.Iterator;
28 import java.util.Vector;
29
30 import gate.*;
31 import gate.creole.ExecutionException;
32 import gate.event.ProgressListener;
33 import gate.event.StatusListener;
34 import gate.util.Err;
35 import gate.util.Out;
36
37 /** Batch processing of JAPE transducers against documents or collections.
38 * Construction will parse or deserialise a transducer as required.
39 */
40 public class Batch implements JapeConstants {
41 /** Debug flag */
42 private static final boolean DEBUG = false;
43
44 /** The name of the transducer file, a .jape or .ser. */
45 // private String japeFileName;
46
47 /** The URL that points to a .jape file */
48 private URL japeURL;
49
50 /**The encoding used for reading the grammar file(s)*/
51 private String encoding;
52
53 /** The JAPE transducer. */
54 private Transducer transducer;
55
56 /** A stream connected to the JAPE file (often null). */
57 // private InputStream japeStream = null;
58
/**
 * Creates an instance without parsing any grammar: no URL, encoding or
 * transducer is set. Private; originally intended for use by main.
 */
private Batch() {
}
61
/**
 * Creates a fully initialised instance: the grammar at the given location is
 * parsed into a transducer, after which the transducer's status and progress
 * events are forwarded to this object's listeners.
 *
 * @param url location of the .jape grammar file
 * @param encoding character encoding used for reading the grammar file(s)
 * @throws JapeException if the grammar cannot be opened or parsed
 */
public Batch(URL url, String encoding) throws JapeException {
  this.encoding = encoding;
  japeURL = url;
  parseJape();
  linkListeners();
} // full init constructor
73
/**
 * Creates a fully initialised instance and registers a status listener
 * before parsing, so that status messages fired while the grammar is being
 * parsed already reach that listener.
 *
 * @param url location of the .jape grammar file
 * @param encoding character encoding used for reading the grammar file(s)
 * @param sListener listener to notify of status changes; may be null, in
 *        which case no listener is registered
 * @throws JapeException if the grammar cannot be opened or parsed
 */
public Batch(URL url, String encoding, StatusListener sListener)
    throws JapeException {

  // A null listener must not be stored: it would be dereferenced by
  // fireStatusChanged() as soon as the parser reports its first status.
  if(sListener != null) {
    this.addStatusListener(sListener);
  }
  this.japeURL = url;
  this.encoding = encoding;
  parseJape();
  linkListeners();
} // full init constructor
83
84 private void readObject(java.io.ObjectInputStream in)
85 throws IOException, ClassNotFoundException{
86 in.defaultReadObject();
87 //now recreate the listeners
88 linkListeners();
89 }
90
91 /**
92 * Creates inner listeners that forward events from the transducer object
93 * to our own listeners.
94 */
95 protected void linkListeners(){
96 if(transducer != null){
97 transducer.addStatusListener(new StatusListener(){
98 public void statusChanged(String text){
99 fireStatusChanged(text);
100 }
101 });
102
103 transducer.addProgressListener(new ProgressListener(){
104 public void progressChanged(int value){
105 fireProgressChanged(value);
106 }
107
108 public void processFinished(){
109 fireProcessFinished();
110 }
111 });
112 }
113 }
114
115 /**
116 * Notifies this PR that it should stop its execution as soon as possible.
117 */
118 public synchronized void interrupt(){
119 transducer.interrupt();
120 }
121 /** Create a fully initialised instance.
122 * <P><CODE>japeFileName</CODE>: the name of a .jape or .ser transducer
123 * file. This may be an absolute path, or may a .jar
124 * that lives somewhere on the classpath.
125 */
126 /*
127 public Batch(String japeFileName) throws JapeException {
128 this.japeFileName = japeFileName;
129 initTransducer();
130 } // full init constructor
131 */
132 /*
133 public Batch(String japeFileName, StatusListener sListener)
134 throws JapeException {
135 this.japeFileName = japeFileName;
136 this.addStatusListener(sListener);
137 initTransducer();
138 } // full init constructor
139 */
140
141 /** Create a fully initialised instance from an InputStream connected
142 * to the JAPE file.
143 */
144 /*
145 public Batch(InputStream japeStream) throws JapeException {
146 if(japeStream == null)
147 throw new JapeException(
148 "attempt to create a batch parser with null input stream"
149 );
150 this.japeFileName = "stream";
151 this.japeStream = japeStream;
152 initTransducer();
153 } // full init constructor
154 */
155 /** Create a fully initialised instance from a resource path and resource
156 * name.
157 */
158 /*
159 public Batch(String resPath, String resName) throws JapeException {
160 fromResource = true;
161 this.japeFileName = resName;
162 this.resPath = resPath;
163 initTransducer();
164 } // full init constructor
165 */
166
167 /** Get the transducer. */
168 public Transducer getTransducer() { return transducer; }
169
170 /** Instantiate transducer member as necessary. */
171 /*
172 private void initTransducer()
173 throws JapeException {
174 if(fromResource) {
175 parseJape(resPath, japeFileName);
176 } else if(japeFileName.endsWith(".ser") || japeFileName.endsWith(".SER"))
177 deserialiseJape(new File(japeFileName));
178 else if(japeFileName.endsWith(".jape") || japeFileName.endsWith(".JAPE"))
179 parseJape();
180 else if(japeFileName.endsWith(".jar") || japeFileName.endsWith(".JAR"))
181 deserialiseJape();
182 else if(japeFileName.equals("stream"))
183 parseJape(japeStream);
184 else
185 throw new JapeException(
186 "unknown file type (not .jape, .ser or .jar):" + japeFileName
187 );
188 if(transducer != null) transducer.addStatusListener(new StatusListener() {
189 public void statusChanged(String text){
190 fireStatusChangedEvent(text);
191 }
192 });
193 }
194 */
195 /** Parse a jape file from {@link #japeURL} and store the transducer. */
196 private void parseJape() throws JapeException {
197 try {
198 gate.jape.parser.ParseCpsl parser = Factory.newJapeParser(japeURL, encoding);
199
200 StatusListener listener = null;
201 listener = new StatusListener(){
202 public void statusChanged(String text){
203 fireStatusChanged(text);
204 }
205 };
206 parser.addStatusListener(listener);
207 transducer = parser.MultiPhaseTransducer();
208 parser.removeStatusListener(listener);
209 //the call to finish needs to be handled from here now as it
210 //was removed from the .jj file
211 transducer.addStatusListener(listener);
212 transducer.finish();
213 transducer.removeStatusListener(listener);
214
215 } catch (gate.jape.parser.ParseException e) {
216 throw new
217 JapeException("Batch: error parsing transducer: " + e.getMessage());
218 } catch (java.io.IOException e) {
219 throw new
220 JapeException("Batch: couldn't open JAPE file: " + e.getMessage());
221 }
222 } // parseJape
223
224 /** Parse a jape file from an InputStream and store the transducer. */
225 /*
226 private void parseJape(InputStream japeStream) throws JapeException {
227 try {
228 gate.jape.parser.ParseCpsl parser =
229 new gate.jape.parser.ParseCpsl(japeFileName, japeStream);
230 transducer = parser.MultiPhaseTransducer();
231 } catch (gate.jape.parser.ParseException e) {
232 throw new
233 JapeException("Batch: error parsing transducer: " + e.getMessage());
234 } catch (java.io.IOException e) {
235 throw new
236 JapeException("Batch: couldn't read JAPE stream: " + e.getMessage());
237 }
238 } // parseJape(InputStream)
239 */
240 /** Parse a jape file from a resource and store the transducer. */
241 /*
242 private void parseJape(String resPath, String resName) throws JapeException {
243 try {
244 gate.jape.parser.ParseCpsl parser =
245 new gate.jape.parser.ParseCpsl(resPath, resName);
246 transducer = parser.MultiPhaseTransducer();
247 } catch (gate.jape.parser.ParseException e) {
248 throw new
249 JapeException("Batch: error parsing transducer: " + e.getMessage());
250 } catch (java.io.IOException e) {
251 throw new
252 JapeException("Batch: couldn't read JAPE resource: " + e.getMessage());
253 }
254 } // parseJape(resPath, resName)
255 */
256
257 /** Deserialise from a .ser file. */
258 /*
259 private void deserialiseJape(File japeFile) throws JapeException {
260
261 // set up a file input stream
262 FileInputStream japeInputStream = null;
263 try {
264 japeInputStream = new FileInputStream(japeFile.getPath());
265 } catch (IOException e) {
266 throw new JapeException(
267 "Can't read from " + japeFile.getPath() + ": " + e.getMessage()
268 );
269 }
270
271 // call the input stream deserialise method
272 deserialiseJape(japeInputStream);
273 } // deserialiseJape(File)
274 */
275 /** Deserialise from a JAR file. */
276 /*
277 private void deserialiseJape() throws JapeException {
278 // find the jar from CLASSPATH
279 //SearchPath classPath =
280 // new SearchPath(System.getProperty("java.class.path"), ".");
281 File jarFile = new File(japeFileName); //classPath.getFile(japeFileName);
282 if(jarFile == null)
283 throw new JapeException("Batch: can't find " + japeFileName);
284
285 // get a byte array input stream with the .ser in out of the jar file
286 JarFile jar = null;
287 BufferedInputStream japeInputStream = null;
288 try {
289 jar = new JarFile(jarFile.getPath());
290 japeInputStream = new BufferedInputStream(
291 jar.getInputStream(jar.getJarEntry(jarNameToSerName(japeFileName)))
292 );
293 } catch(IOException e) {
294 throw new JapeException("couldn't read jar file " + japeFileName);
295 }
296
297
298 // call the input stream deserialise method
299 deserialiseJape(japeInputStream);
300 } // deserialiseJape()
301 */
302 /** Create a transducer from an object input stream (deserialisation). */
303 /*
304 private void deserialiseJape(InputStream japeInputStream)
305 throws JapeException {
306 try {
307 ObjectInputStream ois = new ObjectInputStream(japeInputStream);
308 transducer = (Transducer) ois.readObject();
309 ois.close();
310 japeInputStream.close(); // redundant?
311 } catch (IOException e) {
312 throw new JapeException(
313 "Batch: can't deserialise InputStream (1): " + e.getMessage()
314 );
315 } catch (ClassNotFoundException e) {
316 throw new JapeException(
317 "Batch: can't deserialise InputStream (2): " + e.getMessage()
318 );
319 }
320 } // deserialise(OIS)
321 */
322 /** Create a .ser name from a .jar name. */
323 /*
324 private String jarNameToSerName(String jarName) {
325 return jarName.substring(0, jarName.length() - 4) + ".ser";
326 } // jarNameToSerName
327 */
328
329 /** Process the given collection. */
330 public void transduce(Corpus coll) throws JapeException, ExecutionException {
331 // for each doc run the transducer
332 Iterator iter = coll.iterator();
333 while(iter.hasNext()) {
334 Document doc = (Document) iter.next();
335 // transducer.transduce(doc);
336 transduce(doc, doc.getAnnotations(), doc.getAnnotations());
337 }
338 } // transduce(coll)
339
340 /** Process a single document. */
341 public void transduce(Document doc) throws JapeException, ExecutionException {
342 transducer.transduce(doc, doc.getAnnotations(), doc.getAnnotations());
343 } // transduce(doc)
344
345 /** Process a single document. */
346 public void transduce(Document doc, AnnotationSet inputAS,
347 AnnotationSet outputAS) throws JapeException,
348 ExecutionException {
349 //no need to transduce empty document
350 if (inputAS == null || inputAS.isEmpty())
351 return;
352 transducer.transduce(doc, inputAS, outputAS);
353
354 } // transduce(doc)
355
356 /** Process a single text. */
357 /*
358 public Document transduce(String text) throws JapeException {
359 Document doc = null;
360 try {
361 doc = Factory.newDocument(text);
362 } catch (ResourceInstantiationException e) {
363 throw new JapeException(e.toString());
364 }
365 transducer.transduce(doc, doc.getAnnotations());
366 return doc;
367 } // transduce(text)
368 */
369 /** Process a single file. */
370 /*
371 public Document transduce(File textFile) throws JapeException {
372 String text = null;
373 try {
374 text = gate.util.Files.getString(textFile);
375 } catch(IOException e) { throw new JapeException(e.toString()); }
376 return transduce(text);
377 } // transduce(textFile)
378 */
379 /** Process a set of files. */
380 /*
381 public Corpus transduce(String[] textFileNames) throws JapeException {
382 Corpus coll = null;
383 try {
384 coll = Factory.newCorpus("JAPE batch corpus");
385 Document doc = null;
386 for(int i = 0; i < textFileNames.length; i++) {
387 doc = Factory.newDocument(textFileNames[i]);
388 doc.setFeatures(Factory.newFeatureMap());
389 /*coll.createDocument(
390 textFileNames[i],
391 null, // the text - should get read from disk
392 new AnnotationSetImpl(doc),
393 Factory.newFeatureMap(),
394 Document.COPIED
395 );*/
396 /*
397 transducer.transduce(doc, doc.getAnnotations());
398 }
399 } catch(ResourceInstantiationException e) {
400 throw new JapeException(e.toString());
401 }
402 return coll;
403 } // transduce(textFileNames)
404 */
405 /** This is where it all happens. This is <I>the</I> place to be. Take
406 * your summer holidays here. Visit on Saturday nights. Buy a season
407 * ticket from <CODE>www.programmer.gone.insane.com</CODE>.
408 * <P>
409 * Takes a .jape/.jar/.ser
410 * file name (-j option) which is assumed to hold a pattern
411 * grammar for a multi-phase transducer, and a collection
412 * name (-c option) or a list of files. As needed it then parses and
413 * compiles the transducer, then transduces all the documents in the
414 * collection and saves it to disk.
415 */
416 public static void main(String args[]) {
417 /*
418 // oh great bug in the sky give us this day our daily fuckup
419 //gate.util.Debug.setDebug(true);
420 //gate.util.Debug.setDebug(Rule.class, true);
421 //gate.util.Debug.setDebug(LeftHandSide.class, true);
422 //gate.util.Debug.setDebug(BasicPatternElement.class, true);
423 //gate.util.Debug.setDebug(AnnotationSet.class, true);
424
425 // The persistent name of the collection.
426 String persCollName = null;;
427
428 // The collection to process.
429 Corpus collection = null;
430
431 // create one of us
432 Batch batch = new Batch();
433
434 // process the options
435 int i = 0;
436 for( ; i<args.length; i++) {
437 if(args[i].equals("-c") && ++i < args.length) // -c = coll name
438 persCollName = args[i];
439 else if(args[i].equals("-j") && ++i < args.length)// -j = transducer name
440 batch.japeFileName = args[i];
441 else if(args[i].equals("-v")) // -v = verbose
442 batch.setVerbose(true);
443 else if(args[i].startsWith("-"))
444 batch.usage("unknown option " + args[i]);
445 else
446 break;
447 } // for each arg
448
449 // file name list
450 String[] fileNames = null;
451 if(args.length > i) {
452 fileNames = new String[args.length - i];
453 for(int j = 0; i<args.length; j++, i++)
454 fileNames[j] = args[i];
455 }
456
457 // did they give valid options?
458 if(batch.japeFileName == null)
459 batch.usage("you must supply a transducer name");
460 if(fileNames != null && persCollName != null)
461 batch.usage("can't read a collection AND process a file list");
462
463 // parse the transducer or bomb
464 batch.message("parsing the transducer");
465 try { batch.initTransducer(); }
466 catch(JapeException e) {
467 batch.usage("oops: " + e.toString());
468 }
469
470 Corpus coll = null;
471 if(persCollName != null) { // we got a collection name, not a list of files
472
473 // open the collection or bomb
474 coll = null;
475 batch.message("opening the collection");
476 try {
477 coll = Factory.newCorpus(persCollName);
478 } catch(ResourceInstantiationException e) {
479 batch.usage("oops (x): " + e);
480 }
481
482 // transduce
483 batch.message("calling transducer");
484 try { batch.transduce(coll); }
485 catch(JapeException e) {
486 batch.usage("oops (1): " + e.toString());
487 }
488
489 // save to disk
490 batch.message("saving the collection");
491 batch.usage("couldn't sync coll ");
492
493 // we got a list of files, not a collection
494 } else {
495 batch.message("transducing transient collection");
496 try {
497 coll = batch.transduce(fileNames);
498 } catch(JapeException e) {
499 batch.usage("oops (2): " + e.toString());
500 }
501 }
502
503 // we won! we won! we can smash up all the computers now!
504 batch.message("done");
505 //System.exit(0);
506 */
507 } // main
508
509
510 /** Whether to print progress messages or not. */
511 private boolean verbose = false;
512
513 /** Set verbosity. */
514 public void setVerbose(boolean turtleSoup) { verbose = turtleSoup; }
515
516 /** You got something wrong, dumbo. */
517 public void usage(String errorMessage) {
518 String usageMessage =
519 "usage: java gate.jape.Batch.main [-v] " +
520 "-j japefile(.ser|.jape|.jar) " +
521 "(-c CollectionName | filenames)";
522
523 Err.println(errorMessage);
524 Err.println(usageMessage);
525 // System.exit(1);
526
527 } // usage
528
529 /** Hello? Anybody there?? */
530 public void message(String mess) {
531 if(verbose) Out.println("Batch: " + mess);
532 } // message
533
534 public void setFeatures(gate.FeatureMap newFeatures) {
535 features = newFeatures;
536 }
537 public gate.FeatureMap getFeatures() {
538 return features;
539 }
540 public synchronized void removeProgressListener(ProgressListener l) {
541 if (progressListeners != null && progressListeners.contains(l)) {
542 Vector v = (Vector) progressListeners.clone();
543 v.removeElement(l);
544 progressListeners = v;
545 }
546 }
547 public synchronized void addProgressListener(ProgressListener l) {
548 Vector v = progressListeners == null ? new Vector(2) : (Vector) progressListeners.clone();
549 if (!v.contains(l)) {
550 v.addElement(l);
551 progressListeners = v;
552 }
553 }
554
555 //ProcessProgressReporter implementation ends here
556
557 /** Are we initialising from a resource? */
558 // private boolean fromResource = false;
559
560 /** Path to the resources tree */
561 // private String resPath = null;
562
563
564 private gate.FeatureMap features;
565 private transient Vector progressListeners;
566 private transient Vector statusListeners;
567 private boolean enableDebugging;
568
569 protected void fireProgressChanged(int e) {
570 if (progressListeners != null) {
571 Vector listeners = progressListeners;
572 int count = listeners.size();
573 for (int i = 0; i < count; i++) {
574 ((ProgressListener) listeners.elementAt(i)).progressChanged(e);
575 }
576 }
577 }
578 protected void fireProcessFinished() {
579 if (progressListeners != null) {
580 Vector listeners = progressListeners;
581 int count = listeners.size();
582 for (int i = 0; i < count; i++) {
583 ((ProgressListener) listeners.elementAt(i)).processFinished();
584 }
585 }
586 }
587 public synchronized void removeStatusListener(StatusListener l) {
588 if (statusListeners != null && statusListeners.contains(l)) {
589 Vector v = (Vector) statusListeners.clone();
590 v.removeElement(l);
591 statusListeners = v;
592 }
593 }
594 public synchronized void addStatusListener(StatusListener l) {
595 Vector v = statusListeners == null ? new Vector(2) : (Vector) statusListeners.clone();
596 if (!v.contains(l)) {
597 v.addElement(l);
598 statusListeners = v;
599 }
600 }
601 protected void fireStatusChanged(String e) {
602 if (statusListeners != null) {
603 Vector listeners = statusListeners;
604 int count = listeners.size();
605 for (int i = 0; i < count; i++) {
606 ((StatusListener) listeners.elementAt(i)).statusChanged(e);
607 }
608 }
609 }
610
611 /**
612 * Sets the ontology to be used by the transducers
613 * @param ontology
614 */
615 public void setOntology(gate.creole.ontology.Ontology ontology) {
616 transducer.setOntology(ontology);
617 }
618 public boolean isEnableDebugging() {
619 return enableDebugging;
620 }
621 public void setEnableDebugging(boolean enableDebugging) {
622 this.enableDebugging = enableDebugging;
623 //propagate
624 if(transducer != null) transducer.setEnableDebugging(enableDebugging);
625 }
626
627
628 /*
629 private void writeObject(ObjectOutputStream oos) throws IOException {
630 Out.prln("writing batch");
631 oos.defaultWriteObject();
632 Out.prln("finished writing batch");
633 } // writeObject
634 */
635
636 } // class Batch
637
638