1   /*
2    *  Copyright (c) 1998-2005, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan 19/11/2002
10   *
11   *  $Id: MachineLearningPR.java,v 1.13 2006/03/09 13:33:19 ian_roberts Exp $
12   *
13   */
14  package gate.creole.ml;
15  
16  import java.util.*;
17  
18  import org.jdom.Element;
19  import org.jdom.JDOMException;
20  import org.jdom.input.SAXBuilder;
21  
22  import gate.*;
23  import gate.creole.*;
24  import gate.gui.ActionsPublisher;
25  import gate.util.*;
26  
27  /**
28   * This processing resource is used to train a machine learning algorithm with
29   * data extracted from a corpus.
30   */
31  
32  public class MachineLearningPR extends AbstractLanguageAnalyser
33                         implements gate.gui.ActionsPublisher{
34  
35    public MachineLearningPR(){
36      actionList = new ArrayList();
37      actionList.add(null);
38    }
39  
40    /**
41     * This will make sure that any resources allocated by an ml wrapper get
42     * released. This is needed in the case of those wrappers that call
43     * native code, as in such cases there is a need to realese dynamically
44     * allocated memory.
45     */
46    public void cleanup() {
47      // First call cleanup in the parent, in case any clean up needs to be done
48      // there.
49      super.cleanup();
50  
51      // So long as an ML Engine (wrapper) is associated with the processing
52      // resource, call its cleanup method.
53      if (engine!=null) {
54        engine.cleanUp();
55      }
56    }
57  
58    /** Initialise this resource, and return it. */
59    public Resource init() throws ResourceInstantiationException {
60      if(configFileURL == null){
61        throw new ResourceInstantiationException(
62          "No configuration file provided!");
63      }
64  
65      org.jdom.Document jdomDoc;
66      SAXBuilder saxBuilder = new SAXBuilder(false);
67      try {
68      try{
69        jdomDoc = saxBuilder.build(configFileURL);
70      }catch(JDOMException jde){
71        throw new ResourceInstantiationException(jde);
72      }
73      } catch (java.io.IOException ex) {
74        throw new ResourceInstantiationException(ex);
75      }
76  
77      //go through the jdom document to extract the data we need
78      Element rootElement = jdomDoc.getRootElement();
79      if(!rootElement.getName().equals("ML-CONFIG"))
80        throw new ResourceInstantiationException(
81          "Root element of dataset defintion file is \"" + rootElement.getName() +
82          "\" instead of \"ML-CONFIG\"!");
83  
84      //create the dataset defintion
85      Element datasetElement = rootElement.getChild("DATASET");
86      if(datasetElement == null) throw new ResourceInstantiationException(
87        "No dataset definition provided in the configuration file!");
88      try{
89        datasetDefinition = new DatasetDefintion(datasetElement);
90      }catch(GateException ge){
91        throw new ResourceInstantiationException(ge);
92      }
93  
94      //create the engine
95      Element engineElement = rootElement.getChild("ENGINE");
96      if(engineElement == null) throw new ResourceInstantiationException(
97        "No engine option provided in the configuration file!");
98      Element engineClassElement = engineElement.getChild("WRAPPER");
99      if(engineClassElement == null) throw new ResourceInstantiationException(
100       "No ML engine class provided!");
101     String engineClassName = engineClassElement.getTextTrim();
102     try{
103       // load MLEngine class from GATE Classloader
104       Class engineClass =
105         Class.forName(engineClassName, true, Gate.getClassLoader());
106       engine = (MLEngine)engineClass.newInstance();
107     }catch(ClassNotFoundException cnfe){
108       throw new ResourceInstantiationException(
109         "ML engine class:" + engineClassName + "not found!");
110     }catch(IllegalAccessException iae){
111       throw new ResourceInstantiationException(iae);
112     }catch(InstantiationException ie){
113       throw new ResourceInstantiationException(ie);
114     }
115 
116     // See if batch classification mode had been set.
117     if (engineElement.getChild("BATCH-MODE-CLASSIFICATION") == null) {
118       batchModeClassification = false;
119     } else {
120       // checks wether the engine supports batch mode
121       // engines must implement AdvancedMLEngine (extending MLengine)
122       // to be asked about this functionality
123       if (engine instanceof AdvancedMLEngine){
124         batchModeClassification = ((AdvancedMLEngine)engine).supportsBatchMode();
125       }
126       else batchModeClassification = false;
127     }
128 
129     engine.setDatasetDefinition(datasetDefinition);
130     engine.setOptions(engineElement.getChild("OPTIONS"));
131     engine.setOwnerPR(this);
132     try{
133       engine.init();
134     }catch(GateException ge){
135       throw new ResourceInstantiationException(ge);
136     }
137 
138     return this;
139   } // init()
140 
141 
142   /**
143    * Run the resource.
144    */
145   public void execute() throws ExecutionException {
146     interrupted = false;
147     //check the input
148     if (document == null) {
149       throw new ExecutionException(
150           "No document provided!"
151           );
152     }
153 
154     if (inputASName == null ||
155         inputASName.equals(""))
156       annotationSet = document.getAnnotations();
157     else
158       annotationSet = document.getAnnotations(inputASName);
159 
160     if (training.booleanValue()) {
161       fireStatusChanged(
162           "Collecting training data from " + document.getName() + "...");
163     }
164     else {
165       fireStatusChanged(
166           "Applying ML model to " + document.getName() + "...");
167     }
168     fireProgressChanged(0);
169     AnnotationSet anns = annotationSet.
170                          get(datasetDefinition.getInstanceType());
171     annotations = (anns == null || anns.isEmpty()) ?
172                   new ArrayList() : new ArrayList(anns);
173     Collections.sort(annotations, new OffsetComparator());
174     Iterator annotationIter = annotations.iterator();
175     int index = 0;
176     int size = annotations.size();
177 
178     //create the cache structure
179     cache = new Cache();
180 
181     if (!batchModeClassification || training.booleanValue()) {
182       // This code covers the case when instances are going to be passed to
183       // the wrapper one at a time, which is always the case with training,
184       // and the case with classification when we are not using batch mode.
185       while (annotationIter.hasNext()) {
186         Annotation instanceAnn = (Annotation) annotationIter.next();
187         List attributeValues = new ArrayList(datasetDefinition.
188                                              getAttributes().size());
189         //find the values for all attributes
190         Iterator attrIter = datasetDefinition.getAttributes().iterator();
191         while (attrIter.hasNext()) {
192           Attribute attr = (Attribute) attrIter.next();
193           if (attr.isClass && !training.booleanValue()) {
194             //we're not training so the class will be undefined
195             attributeValues.add(null);
196           }
197           else {
198             attributeValues.add(cache.getAttributeValue(index, attr));
199           }
200         }
201 
202         if (training.booleanValue()) {
203           engine.addTrainingInstance(attributeValues);
204         }
205         else {
206           Object result = engine.classifyInstance(attributeValues);
207           if (result instanceof Collection) {
208             Iterator resIter = ( (Collection) result).iterator();
209             while (resIter.hasNext())
210               updateDocument(resIter.next(), index);
211           }
212           else {
213             updateDocument(result, index);
214           }
215         }
216 
217         cache.shift();
218         //every 10 instances fire an event
219         if (index % 10 == 0) {
220           fireProgressChanged(index * 100 / size);
221           if (isInterrupted())
222             throw new ExecutionInterruptedException();
223         }
224         index++;
225       }
226 
227     }
228     else {
229       // This code covers the case when all the instances in a document will be 
230       // passed to the
231       // wrapper as a batch. This is necessary to achieve efficient performance
232       // with some wrappers.
233 
234       // This list is needed to collect all the test instances.
235       List instancesToBeClassified = new ArrayList();
236 
237       while (annotationIter.hasNext()) {
238         Annotation instanceAnn = (Annotation) annotationIter.next();
239         List attributeValues = new ArrayList(datasetDefinition.
240                                              getAttributes().size());
241         //find the values for all attributes
242         Iterator attrIter = datasetDefinition.getAttributes().iterator();
243         while (attrIter.hasNext()) {
244           Attribute attr = (Attribute) attrIter.next();
245           if (attr.isClass) {
246             //we're not training so the class will be undefined
247             attributeValues.add(null);
248           }
249           else {
250             attributeValues.add(cache.getAttributeValue(index, attr));
251           }
252         }
253 
254         // Instead of classifying the instance, just add it to the list of
255         // instances that need classifying.
256         instancesToBeClassified.add(attributeValues);
257 
258         cache.shift();
259 
260         index++;
261       }
262 
263       // Now all the data is collected in instances to be classified, we can
264       // actually get the wrapper to classify them.
265       List classificationResults = engine.batchClassifyInstances(
266           instancesToBeClassified);
267 
268       // Now go through the document and add all the annotations appropriately,
269       // given the output of the wrapper.
270 
271       // Start with the first instance again.
272       index = 0;
273       Iterator resultsIterator = classificationResults.iterator();
274       while (resultsIterator.hasNext()) {
275 
276         Object result = resultsIterator.next();
277         if (result instanceof Collection) {
278           Iterator resIter = ( (Collection) result).iterator();
279           while (resIter.hasNext())
280             updateDocument(resIter.next(), index);
281         }
282         else {
283           updateDocument(result, index);
284         }
285 
286         // Move index on so that it points at the next instance.
287         index++;
288       }
289     }
290     annotations = null;
291   } // execute()
292 
293 
294   protected void updateDocument(Object classificationResult, int instanceIndex){
295     //interpret the result according to the attribute semantics
296     Attribute classAttr = datasetDefinition.getClassAttribute();
297     String type = classAttr.getType();
298     String feature = classAttr.getFeature();
299     List classValues = classAttr.getValues();
300     FeatureMap features = Factory.newFeatureMap();
301     boolean shouldCreateAnnotation = true;
302     if(classValues != null && !classValues.isEmpty()){
303       //nominal attribute -> AnnotationType.feature
304       //the result is the value for the feature
305       String featureValue = (String)classificationResult;
306       features.put(feature, featureValue);
307     }else{
308       if(feature == null){
309         //boolean attribute
310         shouldCreateAnnotation = classificationResult.equals("true");
311       }else{
312         //numeric attribute
313         String featureValue = classificationResult.toString();
314         features.put(feature, featureValue);
315       }
316     }
317 
318     if(shouldCreateAnnotation){
319       //generate the new annotation
320       int coveredInstanceIndex = instanceIndex + classAttr.getPosition();
321       if(coveredInstanceIndex >= 0 &&
322          coveredInstanceIndex < annotations.size()){
323         Annotation coveredInstance = (Annotation)annotations.
324                                      get(coveredInstanceIndex);
325         annotationSet.add(coveredInstance.getStartNode(),
326                           coveredInstance.getEndNode(),
327                           type, features);
328       }
329     }
330   }
331 
332 
333   /**
334    * Gets the list of actions that can be performed on this resource.
335    * @return a List of Action objects (or null values)
336    */
337   public List getActions(){
338     List result = new ArrayList();
339     result.addAll(actionList);
340     if(engine instanceof ActionsPublisher){
341       result.addAll(((ActionsPublisher)engine).getActions());
342     }
343     return result;
344   }
345 
346   protected class Cache{
347     public Cache(){
348       //find the sizes for the two caches
349       int forwardCacheSize = 0;
350       int backwardCacheSize = 0;
351       Iterator attrIter = datasetDefinition.getAttributes().iterator();
352       while(attrIter.hasNext()){
353         Attribute anAttribute = (Attribute)attrIter.next();
354         if(anAttribute.getPosition() > 0){
355           //forward looking
356           if(anAttribute.getPosition() > forwardCacheSize){
357             forwardCacheSize = anAttribute.getPosition();
358           }
359         }else if(anAttribute.getPosition() < 0){
360           //backward looking
361           if(-anAttribute.getPosition() > backwardCacheSize){
362             backwardCacheSize = -anAttribute.getPosition();
363           }
364         }
365       }
366       //create the caches filled with null values
367       forwardCache = new ArrayList(forwardCacheSize);
368       for(int i =0; i < forwardCacheSize; i++) forwardCache.add(null);
369       backwardCache = new ArrayList(backwardCacheSize);
370       for(int i =0; i < backwardCacheSize; i++) backwardCache.add(null);
371     }
372 
373     /**
374      * Finds the value of a specified attribute for a particular instance.
375      * @param instanceIndex the index of the current instance in the annotations
376      * List.
377      * @param attribute the attribute whose value needs to be found
378      * @return a String representing the value for the attribute.
379      */
380     public String getAttributeValue(int instanceIndex, Attribute attribute){
381       //sanity check
382       int actualPosition = instanceIndex + attribute.getPosition();
383       if(actualPosition < 0 || actualPosition >= annotations.size()) return null;
384 
385       //check caches first
386       if(attribute.getPosition() == 0){
387         //current instance
388         if(currentAttributes == null) currentAttributes = new HashMap();
389         return getValue(attribute, instanceIndex, currentAttributes);
390       }else if(attribute.getPosition() > 0){
391         //check forward cache
392         Map attributesMap = (Map)forwardCache.get(attribute.getPosition() - 1);
393         if(attributesMap == null){
394           attributesMap = new HashMap();
395           forwardCache.set(attribute.getPosition() - 1, attributesMap);
396         }
397         return getValue(attribute, actualPosition, attributesMap);
398       }else if(attribute.getPosition() < 0){
399         //check bacward cache
400         Map attributesMap = (Map)backwardCache.get(-attribute.getPosition() - 1);
401         if(attributesMap == null){
402           attributesMap = new HashMap();
403           backwardCache.set(-attribute.getPosition() - 1, attributesMap);
404         }
405         return getValue(attribute, actualPosition, attributesMap);
406       }
407       //we should never get here
408       throw new LuckyException(
409         "Attribute position is neither 0, nor negative nor positive!");
410     }
411 
412     /**
413      * Notifies the cache that it should advance its internal structures one
414      * step forward.
415      */
416     public void shift(){
417       if(backwardCache.isEmpty()){
418         //no backward caching, all attributes have position "0" or more
419         //nothing to do
420       }else{
421         backwardCache.remove(backwardCache.size() - 1);
422         backwardCache.add(0, currentAttributes);
423       }
424       if(forwardCache.isEmpty()){
425         //no forward caching, all attributes have position "0" or less
426         if(currentAttributes != null) currentAttributes.clear();
427       }else{
428         currentAttributes = (Map) forwardCache.remove(0);
429         forwardCache.add(null);
430       }
431     }
432 
433     /**
434      * Finds the value for a particular attribute and returns it.
435      * If the value is not present in the cache it will be retrieved from the
436      * document and the cache will be updated.
437      * @param attribute the attribute whose value is requested.
438      * @param cache the Map containing the cache for the appropriate position
439      * for the attribute
440      * @param instanceIndex the index of the instance annotation which is
441      * covered by the sought attribute
442      * @return a String value.
443      */
444     protected String getValue(Attribute attribute,
445                               int instanceIndex,
446                               Map cache){
447       String value = null;
448       String annType = attribute.getType();
449       String featureName = attribute.getFeature();
450       Map typeData = (Map)cache.get(annType);
451       if(typeData != null){
452         if(featureName == null){
453           //we're only interested in the presence of the annotation
454           value = (String)typeData.get(null);
455         }else{
456           value = (String)typeData.get(featureName);
457         }
458       }else{
459         //type data was null -> nothing known about this type of annotations
460         //get the insformation; update the cache and return the right value
461         Annotation instanceAnnot = (Annotation)annotations.get(instanceIndex);
462        
463         typeData = new HashMap();
464         cache.put(annType, typeData);
465         // The annotation retrieved by its index is in a default type
466       // (default : Token). We need to search for overlapping types
467       // only if the Type needed is not the one we already have
468       // (which seems quite reasonable given that most Attributes are
469       // likely to be based on Token informations)
470       
471       if (instanceAnnot.getType().equals(annType)){
472         typeData.putAll(instanceAnnot.getFeatures());
473         typeData.put(null, "true");
474         
475         String stringvalue = (String)typeData.get(featureName);
476         if(featureName == null) return "true";
477         return stringvalue;
478       }
479       
480       // here we search for annotations of another type
481       // first restrict to the needed type
482       // then limit to those covering the current token
483       AnnotationSet typeSubset = annotationSet.get(annType);
484       AnnotationSet coverSubset = null;
485       if (typeSubset!=null) coverSubset = typeSubset.get(
486                 annType,
487                 instanceAnnot.getStartNode().getOffset(),
488                 instanceAnnot.getEndNode().getOffset());
489       
490         if(coverSubset == null || coverSubset.isEmpty()){
491           //no such annotations at given location
492           typeData.put(null, "false");
493           if(featureName == null) value = "false";
494           else value = null;
495         }else{
496           typeData.putAll(((Annotation)coverSubset.iterator().next()).
497                           getFeatures());
498           typeData.put(null, "true");
499           if(featureName == null) value = "true";
500           else value = (String)typeData.get(featureName);
501         }
502       }
503       return value;
504     }
505 
506     /**
507      * Stores cached data with attribute values for instances placed
508      * <b>after</b> the current instance.
509      * For each instance (i.e. for each position in the list) the data is a Map
510      * with annotationTypes as keys. For each annotation type the data stored is
511      * another Map with feature names as keys and feature values as values.
512      * The <tt>null</tt> key is used for a boolean value (stored as one of the
513      * &quot;true&quot; or &quot;false&quot; strings) signifying the presence
514      * (or lack of presence) of the required type of annotation at the location.
515      * forwardCache[2].get("Lookup").get(null) == "false" means that no lookup
516      * annotation covers the second instance to the right from the current
517      * instance.
518      */
519     protected List forwardCache;
520 
521     /**
522      * Stores cached data with attribute values for instances placed
523      * <b>before</b> the current instance.
524      * For each instance (i.e. for each position in the list) the data is a Map
525      * with annotationTypes as keys. For each annotation type the data stored is
526      * another Map with feature names as keys and feature values as values.
527      * The <tt>null</tt> key is used for a boolean value (stored as one of the
528      * &quot;true&quot; or &quot;false&quot; strings) signifying the presence
529      * (or lack of presence) of the required type of annotation at the location.
530      * backwardCache[2].get("Lookup").get(null) == "false" means that no lookup
531      * annotation covers the second instance to the left from the current
532      * instance.
533      */
534     protected List backwardCache;
535 
536     /**
537      * A Map
538      * with annotationTypes as keys. For each annotation type the data stored is
539      * another Map with feature names as keys and feature values as values.
540      * The <tt>null</tt> key is used for a boolean value (stored as one of the
541      * &quot;true&quot; or &quot;false&quot; strings) signifying the presence
542      * (or lack of presence) of the required type of annotation at the location.
543      * currentAttributes.get(Lookup).get(null) == "false" means that the current
544      * instance is not covered by a Lookup annotation.
545      * currentAttributes.get(Lookup) == null menas nothing is known about Lookup
546      * annotations caovering the current instance.
547      */
548     protected Map currentAttributes;
549 
550   }
551 
552 
553   public void setInputASName(String inputASName) {
554     this.inputASName = inputASName;
555   }
556   public String getInputASName() {
557     return inputASName;
558   }
559   public java.net.URL getConfigFileURL() {
560     return configFileURL;
561   }
562   public void setConfigFileURL(java.net.URL configFileURL) {
563     this.configFileURL = configFileURL;
564   }
565   public void setTraining(Boolean training) {
566     this.training = training;
567   }
568   public Boolean getTraining() {
569     return training;
570   }
571   public MLEngine getEngine() {
572     return engine;
573   }
574   public void setEngine(MLEngine engine) {
575     this.engine = engine;
576   }
577 
578   private java.net.URL configFileURL;
579   protected DatasetDefintion datasetDefinition;
580 
581   protected MLEngine engine;
582 
583   protected String inputASName;
584 
585   protected AnnotationSet annotationSet;
586 
587   protected List annotations;
588 
589   protected List actionList;
590 
591   protected Cache cache;
592   private Boolean training;
593 
594   /**
595    * This member will be set to true if instances are to be passed to the
596    * wrapper in batches, rather than one instance at a time and if the engine
597    * supports this functionality.
598    */
599   protected boolean batchModeClassification;
600 }
601