Batch.java |
/*
 *  Batch.java - transducer class
 *
 *  Copyright (c) 1998-2005, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Hamish Cunningham, 10/08/98
 *
 *  $Id: Batch.java,v 1.40 2005/10/10 10:39:51 valyt Exp $
 *
 *  DEVELOPER NOTES:
 *
 *  This is one that got away; the relation between constructors,
 *  initTransducer and parseTransducer are totally screwy and get worse
 *  every time I add something (e.g. support for resource loading).
 *  We should probably junk this whole thing and start again....
 */

package gate.jape;

import java.io.IOException;
import java.net.URL;
import java.util.Iterator;
import java.util.Vector;

import gate.*;
import gate.creole.ExecutionException;
import gate.event.ProgressListener;
import gate.event.StatusListener;
import gate.util.Err;
import gate.util.Out;

/**
 * Batch processing of JAPE transducers against documents or corpora.
 * <P>
 * The public constructors parse the grammar found at a URL into a
 * {@link Transducer}. Status and progress events raised by that transducer
 * are forwarded to the listeners registered on this object.
 * <P>
 * Listener lists are replaced by a modified copy on every add/remove, so the
 * <CODE>fire...</CODE> methods iterate over a snapshot without locking.
 */
public class Batch implements JapeConstants {
  /** Debug flag */
  private static final boolean DEBUG = false;

  /** The URL that points to a .jape file */
  private URL japeURL;

  /** The encoding used for reading the grammar file(s) */
  private String encoding;

  /** The JAPE transducer; null only for instances built by the private constructor. */
  private Transducer transducer;

  /** Whether to print progress messages or not. */
  private boolean verbose = false;

  /** Features attached to this object; see {@link #setFeatures}. */
  private gate.FeatureMap features;

  /** Registered progress listeners; transient, so null after deserialisation. */
  private transient Vector progressListeners;

  /** Registered status listeners; transient, so null after deserialisation. */
  private transient Vector statusListeners;

  /** Last value passed to {@link #setEnableDebugging}. */
  private boolean enableDebugging;

  /** Create non-initialised instance (no grammar is parsed, no transducer is set). */
  private Batch() { }

  /**
   * Create a fully initialised instance by parsing a JAPE grammar.
   *
   * @param url the location of the .jape grammar to parse
   * @param encoding the character encoding used for reading the grammar
   * @throws JapeException if the grammar cannot be opened or parsed
   */
  public Batch(URL url, String encoding) throws JapeException {
    initialise(url, encoding);
  } // full init constructor

  /**
   * Create a fully initialised instance by parsing a JAPE grammar, reporting
   * status messages raised during parsing to the given listener.
   *
   * @param url the location of the .jape grammar to parse
   * @param encoding the character encoding used for reading the grammar
   * @param sListener listener registered before parsing starts; a null value
   *   is ignored
   * @throws JapeException if the grammar cannot be opened or parsed
   */
  public Batch(URL url, String encoding, StatusListener sListener)
         throws JapeException {
    // register first so that messages fired while parsing reach the listener
    this.addStatusListener(sListener);
    initialise(url, encoding);
  } // full init constructor

  /** Shared constructor body: store the settings, parse, hook up forwarding. */
  private void initialise(URL url, String encoding) throws JapeException {
    this.japeURL = url;
    this.encoding = encoding;
    parseJape();
    linkListeners();
  }

  /**
   * Deserialisation hook: restores the default state and then re-creates the
   * forwarding listeners on the transducer.
   */
  private void readObject(java.io.ObjectInputStream in)
          throws IOException, ClassNotFoundException {
    in.defaultReadObject();
    //now recreate the listeners
    linkListeners();
  }

  /**
   * Creates inner listeners that forward events from the transducer object
   * to our own listeners. Does nothing when there is no transducer.
   */
  protected void linkListeners() {
    if(transducer != null) {
      transducer.addStatusListener(new StatusListener() {
        public void statusChanged(String text) {
          fireStatusChanged(text);
        }
      });

      transducer.addProgressListener(new ProgressListener() {
        public void progressChanged(int value) {
          fireProgressChanged(value);
        }

        public void processFinished() {
          fireProcessFinished();
        }
      });
    }
  }

  /**
   * Notifies this PR that it should stop its execution as soon as possible.
   * The request is passed straight on to the transducer.
   */
  public synchronized void interrupt() {
    transducer.interrupt();
  }

  /** Get the transducer. */
  public Transducer getTransducer() { return transducer; }

  /**
   * Parse a jape file from {@link #japeURL} and store the transducer.
   * Status messages raised by the parser, and by the transducer while it is
   * being finished, are forwarded to our own status listeners.
   *
   * @throws JapeException if the grammar cannot be opened or parsed; the
   *   underlying exception is attached as the cause
   */
  private void parseJape() throws JapeException {
    try {
      gate.jape.parser.ParseCpsl parser =
        Factory.newJapeParser(japeURL, encoding);

      StatusListener listener = new StatusListener() {
        public void statusChanged(String text) {
          fireStatusChanged(text);
        }
      };
      parser.addStatusListener(listener);
      transducer = parser.MultiPhaseTransducer();
      parser.removeStatusListener(listener);
      //the call to finish needs to be handled from here now as it
      //was removed from the .jj file
      transducer.addStatusListener(listener);
      transducer.finish();
      transducer.removeStatusListener(listener);

    } catch (gate.jape.parser.ParseException e) {
      throw japeException(
        "Batch: error parsing transducer: " + e.getMessage(), e);
    } catch (java.io.IOException e) {
      throw japeException(
        "Batch: couldn't open JAPE file: " + e.getMessage(), e);
    }
  } // parseJape

  /**
   * Builds a JapeException carrying the given message and, where possible,
   * the original exception as its cause so the stack trace is not lost.
   */
  private static JapeException japeException(String message, Throwable cause) {
    JapeException wrapped = new JapeException(message);
    try {
      wrapped.initCause(cause);
    } catch (IllegalStateException ignored) {
      // initCause refuses when a cause has already been set; in that case
      // the message-only exception is still thrown, as before.
    }
    return wrapped;
  }

  /**
   * Process the given collection: every document is transduced with its
   * default annotation set used both as input and as output.
   */
  public void transduce(Corpus coll) throws JapeException, ExecutionException {
    // for each doc run the transducer
    Iterator iter = coll.iterator();
    while(iter.hasNext()) {
      Document doc = (Document) iter.next();
      transduce(doc, doc.getAnnotations(), doc.getAnnotations());
    }
  } // transduce(coll)

  /**
   * Process a single document, using its default annotation set as input and
   * output. Unlike the three argument form, this always calls the transducer,
   * even when the annotation set is empty.
   */
  public void transduce(Document doc) throws JapeException, ExecutionException {
    transducer.transduce(doc, doc.getAnnotations(), doc.getAnnotations());
  } // transduce(doc)

  /**
   * Process a single document.
   *
   * @param doc the document to process
   * @param inputAS the annotations to match against; when null or empty the
   *   call returns without running the transducer
   * @param outputAS the annotation set handed to the transducer for output
   */
  public void transduce(Document doc, AnnotationSet inputAS,
                        AnnotationSet outputAS) throws JapeException,
                                                       ExecutionException {
    //no need to transduce empty document
    if (inputAS == null || inputAS.isEmpty())
      return;
    transducer.transduce(doc, inputAS, outputAS);

  } // transduce(doc)

  /**
   * Command line entry point. The former implementation (option parsing for
   * -j, -c and -v followed by a batch run) has been disabled, so this method
   * currently does nothing; it is kept so that existing launchers still find
   * a <CODE>main</CODE> method.
   */
  public static void main(String args[]) {
  } // main

  /** Set verbosity. */
  public void setVerbose(boolean turtleSoup) { verbose = turtleSoup; }

  /**
   * Print an error message followed by the usage summary to the error
   * stream. This method does not terminate the JVM.
   */
  public void usage(String errorMessage) {
    String usageMessage =
      "usage: java gate.jape.Batch.main [-v] " +
        "-j japefile(.ser|.jape|.jar) " +
        "(-c CollectionName | filenames)";

    Err.println(errorMessage);
    Err.println(usageMessage);

  } // usage

  /** Print a message to the output stream, but only in verbose mode. */
  public void message(String mess) {
    if(verbose) Out.println("Batch: " + mess);
  } // message

  /** Set the feature map attached to this object. */
  public void setFeatures(gate.FeatureMap newFeatures) {
    features = newFeatures;
  }

  /** Get the feature map attached to this object (may be null). */
  public gate.FeatureMap getFeatures() {
    return features;
  }

  /** Unregister a progress listener; unknown listeners are ignored. */
  public synchronized void removeProgressListener(ProgressListener l) {
    if (progressListeners != null && progressListeners.contains(l)) {
      Vector v = (Vector) progressListeners.clone();
      v.removeElement(l);
      progressListeners = v;
    }
  }

  /**
   * Register a progress listener. Null and already registered listeners are
   * ignored.
   */
  public synchronized void addProgressListener(ProgressListener l) {
    // a null entry would make every later fire... call fail
    if (l == null) return;
    Vector v = progressListeners == null
      ? new Vector(2) : (Vector) progressListeners.clone();
    if (!v.contains(l)) {
      v.addElement(l);
      progressListeners = v;
    }
  }

  /** Tell every registered progress listener about a new progress value. */
  protected void fireProgressChanged(int e) {
    if (progressListeners != null) {
      Vector listeners = progressListeners;
      int count = listeners.size();
      for (int i = 0; i < count; i++) {
        ((ProgressListener) listeners.elementAt(i)).progressChanged(e);
      }
    }
  }

  /** Tell every registered progress listener that processing has finished. */
  protected void fireProcessFinished() {
    if (progressListeners != null) {
      Vector listeners = progressListeners;
      int count = listeners.size();
      for (int i = 0; i < count; i++) {
        ((ProgressListener) listeners.elementAt(i)).processFinished();
      }
    }
  }

  /** Unregister a status listener; unknown listeners are ignored. */
  public synchronized void removeStatusListener(StatusListener l) {
    if (statusListeners != null && statusListeners.contains(l)) {
      Vector v = (Vector) statusListeners.clone();
      v.removeElement(l);
      statusListeners = v;
    }
  }

  /**
   * Register a status listener. Null and already registered listeners are
   * ignored.
   */
  public synchronized void addStatusListener(StatusListener l) {
    // a null entry would make every later fireStatusChanged call fail
    if (l == null) return;
    Vector v = statusListeners == null
      ? new Vector(2) : (Vector) statusListeners.clone();
    if (!v.contains(l)) {
      v.addElement(l);
      statusListeners = v;
    }
  }

  /** Pass a status message to every registered status listener. */
  protected void fireStatusChanged(String e) {
    if (statusListeners != null) {
      Vector listeners = statusListeners;
      int count = listeners.size();
      for (int i = 0; i < count; i++) {
        ((StatusListener) listeners.elementAt(i)).statusChanged(e);
      }
    }
  }

  /**
   * Sets the ontology to be used by the transducers
   * @param ontology the ontology handed on to the transducer
   */
  public void setOntology(gate.creole.ontology.Ontology ontology) {
    transducer.setOntology(ontology);
  }

  /** Returns the value last given to {@link #setEnableDebugging}. */
  public boolean isEnableDebugging() {
    return enableDebugging;
  }

  /**
   * Stores the debugging flag and, when a transducer exists, passes it on.
   */
  public void setEnableDebugging(boolean enableDebugging) {
    this.enableDebugging = enableDebugging;
    //propagate
    if(transducer != null) transducer.setEnableDebugging(enableDebugging);
  }

} // class Batch