1   /*
2    *  Copyright (c) 2004, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Mike Dowman 30-03-2004
10   *
11   *  $Id: MaxentWrapper.java,v 1.6 2005/10/25 15:03:18 julien_nioche Exp $
12   *
13   */
14  
15  package gate.creole.ml.maxent;
16  
17  import gate.creole.ml.*;
18  import gate.util.GateException;
19  import gate.creole.ExecutionException;
20  import java.util.List;
21  import java.util.Iterator;
22  
23  /**
24   * Wrapper class for the Maxent machine learning algorithm.
25   * @see <a href="http://maxent.sourceforge.net/index.html">Maxent homepage</a>
26   */
27  public class MaxentWrapper
28      implements AdvancedMLEngine, gate.gui.ActionsPublisher {
29  
30    boolean DEBUG=false;
31  
32    /**
33     * This constructor sets up action list so that these actions (loading and
34     * saving models and data) will be available from a context menu in the
35     * gui).
36     *
37     * There is no option to load or save data sets, as maxent does not support
38     * this. If there is a need to save data sets, then this can be done using
39     * weka.wrapper instead.
40     */
41    public MaxentWrapper() {
42      actionsList = new java.util.ArrayList();
43      actionsList.add(new LoadModelAction());
44      actionsList.add(new SaveModelAction());
45      actionsList.add(null);
46    }
47  
48    /**
49     * No clean up is needed for this wrapper, so this is just added because its
50     * in the interface.
51     */
52    public void cleanUp() {
53    }
54  
55    /**
56     * Some wrappers allow batch classification, but this one doesn't, so if
57     * it's ever called just inform the user about this by throwing an exception.
58     *
59     * @param instances This parameter is not used.
60     * @return Nothing is ever returned - an exception is always thrown.
61     * @throws ExecutionException
62     */
63    public List batchClassifyInstances(java.util.List instances)
64        throws ExecutionException {
65      throw new ExecutionException("The Maxent wrapper does not support "+
66                                   "batch classification. Remove the "+
67                                   "<BATCH-MODE-CLASSIFICATION/> entry "+
68                                   "from the XML configuration file and "+
69                                   "try again.");
70    }
71  
72    /**
73     * Take a representation of the part of the XML configuration file
74     * which corresponds to <OPTIONS>, and store it.
75     *
76     * @throws GateException
77     */
78    public void setOptions(org.jdom.Element optionsElem) {
79      this.optionsElement = optionsElem;
80    }
81  
82    /**
83     * Extract the options from the stored Element, and verifiy that they are
84     * all valid. Store them in the class's fields.
85     *
86     * @throws ResourceInstansitaionException
87     */
88    private void extractAndCheckOptions() throws gate.creole.
89        ResourceInstantiationException {
90      setCutoff(optionsElement);
91      setConfidenceThreshold(optionsElement);
92      setVerbose(optionsElement);
93      setIterations(optionsElement);
94      setSmoothing(optionsElement);
95      setSmoothingObservation(optionsElement);
96    }
97  
98    /**
99     * Set the verbose field appropriately, depending on whether <VERBOSE> is
100    * specified in the configuration file.
101    */
102   private void setVerbose(org.jdom.Element optionsElem) {
103     if (optionsElem.getChild("VERBOSE") == null) {
104       verbose = false;
105     }
106     else {
107       verbose = true;
108     }
109   }
110 
111   /**
112    * Set the smoothing field appropriately, depending on whether <SMOOTHING> is
113    * specified in the configuration file.
114    */
115   private void setSmoothing(org.jdom.Element optionsElem) {
116     if (optionsElem.getChild("SMOOTHING") == null) {
117       smoothing = false;
118     }
119     else {
120       smoothing = true;
121     }
122   }
123 
124   /**
125    * Set the smoothing observation field appropriately, depending on what value
126    * is specified for <SMOOTHING-OBSERVATION> in the configuration file.
127    */
128   private void setSmoothingObservation(org.jdom.Element optionsElem) throws
129       gate.creole.ResourceInstantiationException {
130     String smoothingObservationString
131         = optionsElem.getChildTextTrim("SMOOTHING-OBSERVATION");
132     if (smoothingObservationString != null) {
133       try {
134         smoothingObservation = Double.parseDouble(smoothingObservationString);
135       }
136       catch (NumberFormatException e) {
137         throw new gate.creole.ResourceInstantiationException("Unable to parse " +
138             "<SMOOTHING-OBSERVATION> value in maxent configuration file.");
139       }
140     }
141     else {
142       smoothingObservation = 0.0;
143     }
144   }
145 
146   /**
147    * See if a cutoff is specified in the congif file. If it is set the cutoff
148    * field, otherwise set cutoff to its default value.
149    */
150   private void setConfidenceThreshold(org.jdom.Element optionsElem) throws gate.
151       creole.ResourceInstantiationException {
152     String confidenceThresholdString
153         = optionsElem.getChildTextTrim("CONFIDENCE-THRESHOLD");
154     if (confidenceThresholdString != null) {
155       try {
156         confidenceThreshold = Double.parseDouble(confidenceThresholdString);
157       }
158       catch (NumberFormatException e) {
159         throw new gate.creole.ResourceInstantiationException("Unable to parse " +
160             "<CONFIDENCE-THRESHOLD> value in maxent configuration file.");
161       }
162       if (confidenceThreshold < 0.0 || confidenceThreshold > 1) {
163         throw new gate.creole.ResourceInstantiationException(
164             "<CONFIDENCE-THRESHOLD> in maxent configuration"
165             + " file must be set to a value between 0 and 1."
166             + " (It is a probability.)");
167       }
168     }
169     else {
170       confidenceThreshold = 0.0;
171     }
172   }
173 
174   /**
175    * See if a cutoff is specified in the congif file. If it is set the cutoff
176    * field, otherwise set cutoff to its default value.
177    */
178   private void setCutoff(org.jdom.Element optionsElem) throws gate.creole.
179       ResourceInstantiationException {
180     String cutoffString = optionsElem.getChildTextTrim("CUT-OFF");
181     if (cutoffString != null) {
182       try {
183         cutoff = Integer.parseInt(cutoffString);
184       }
185       catch (NumberFormatException e) {
186         throw new gate.creole.ResourceInstantiationException(
187             "Unable to parse <CUT-OFF> value in maxent " +
188             "configuration file. It must be an integer.");
189       }
190     }
191     else {
192       cutoff = 0;
193     }
194   }
195 
196   /**
197    * See if a value for how many iterations should be performed during training
198    * is specified in the congif file. If it is set the iterations field,
199    * otherwise set it to its default value, 10.
200    */
201   private void setIterations(org.jdom.Element optionsElem) throws gate.creole.
202       ResourceInstantiationException {
203     String iterationsString = optionsElem.getChildTextTrim("ITERATIONS");
204     if (iterationsString != null) {
205       try {
206         iterations = Integer.parseInt(iterationsString);
207       }
208       catch (NumberFormatException e) {
209         throw new gate.creole.ResourceInstantiationException(
210             "Unable to parse <ITERATIONS> value in maxent " +
211             "configuration file. It must be an integer.");
212       }
213     }
214     else {
215       iterations = 0;
216     }
217   }
218 
219   /**
220    * This is called to add a new training instance to the data set collected
221    * in this wrapper object.
222    *
223    * @param attributeValues A list of String objects, each of which corresponds
224    * to an attribute value. For boolean attributes the values will be true or
225    * false.
226    */
227   public void addTrainingInstance(List attributeValues) {
228     markIndicesOnFeatures(attributeValues);
229     trainingData.add(attributeValues);
230     datasetChanged = true;
231   }
232 
233   /**
234    * Annotate the features (but not the outcome), by prepending the index of
235    * their location in the list of attributes, followed by a colon. This is
236    * because all features are true or false, but it is important that maxent
237    * does not confuse a true in one position with a true in another when, for
238    * example, calculating the cutoff.
239    *
240    * @param attributeValues a list of String objects listing all the
241    * feature values and the outcome value for an instance.
242    */
243   void markIndicesOnFeatures(List attributeValues) {
244     for (int i=0; i<attributeValues.size(); ++i) {
245       // Skip the outcome (a.k.a. the class).
246       if (i != datasetDefinition.getClassIndex())
247         attributeValues.set(i, i+":"+(String)attributeValues.get(i));
248     }
249   }
250 
251   /**
252    * Set the data set defition for this classifier.
253    *
254    * @param definition A specification of the types and allowable values of
255        * all the attributes, as specified in the <DATASET> part of the configuration
256    * file.
257    */
258   public void setDatasetDefinition(DatasetDefintion definition) {
259     this.datasetDefinition = definition;
260   }
261 
262   /**
263    * Tests that the attributes specified in the DatasetDefinition are valid for
264    * maxent. That is that all the attributes except for the class attribute are
265    * boolean, and that class is boolean or nominal, as that is a requirement of
266    * the maxent implementation used.
267    */
268   private void checkDatasetDefinition() throws gate.creole.
269       ResourceInstantiationException {
270     // Now go through the dataset definition, and check that each attribute is
271     // of the right kind.
272     List attributes = datasetDefinition.getAttributes();
273     Iterator attributeIterator = attributes.iterator();
274     while (attributeIterator.hasNext()) {
275       gate.creole.ml.Attribute currentAttribute
276           = (gate.creole.ml.Attribute) attributeIterator.next();
277       if (currentAttribute.semanticType() != gate.creole.ml.Attribute.BOOLEAN) {
278         if (currentAttribute.semanticType() != gate.creole.ml.Attribute.NOMINAL
279             || !currentAttribute.isClass()) {
280           throw new gate.creole.ResourceInstantiationException(
281               "Error in maxent configuration file. All " +
282               "attributes except the <CLASS/> attribute " +
283               "must be boolean, and the <CLASS/> attribute" +
284               " must be boolean or nominal");
285         }
286       }
287     }
288   }
289 
290   /**
291    * This method first sets the static parameters of GIS to reflect those
292    * specified in the configuration file, then it trains the model using the
293    * data collected up to this point, and stores the model in maxentClassifier.
294    */
295   private void initialiseAndTrainClassifier() {
296     opennlp.maxent.GIS.PRINT_MESSAGES = verbose;
297     opennlp.maxent.GIS.SMOOTHING_OBSERVATION = smoothingObservation;
298 
299     // Actually create and train the model, and store it for later use.
300     if (DEBUG) {
301       System.out.println("Number of training instances: "+trainingData.size());
302       System.out.println("Class index: "+datasetDefinition.getClassIndex());
303       System.out.println("Iterations: "+iterations);
304       System.out.println("Cutoff: "+cutoff);
305       System.out.println("Confidence threshold: "+confidenceThreshold);
306       System.out.println("Verbose: "+verbose);
307       System.out.println("Smoothing: "+smoothing);
308       System.out.println("Smoothing observation: "+smoothingObservation);
309 
310       System.out.println("");
311       System.out.println("TRAINING DATA\n");
312       System.out.println(trainingData);
313     }
314     maxentClassifier = opennlp.maxent.GIS.trainModel(
315         new GateEventStream(trainingData, datasetDefinition.getClassIndex()),
316         iterations, cutoff,smoothing,verbose);
317   }
318 
319   /**
320    * Decide on the outcome for the instance, based on the values of all the
321    * maxent features.
322    *
323    * N.B. Unless this function was previously called, and there has been no new
324    * data added since, the model will be trained when it is called. This could
325    * result in calls to this function taking a long time to execute.
326    *
327    * @param attributeValues A list of all the attributes, including the one that
328    * corresponds to the maxent outcome (the <CLASS/> attribute). The value of
329    * outcome is arbitrary.
330    *
331    * @return A string value giving the nominal value of the outcome or, if the
332    * outcome is boolean, a java String with value "true" or "false"
333    *
334    * @throws ExecutionException
335    */
336   public Object classifyInstance(List attributeValues) throws
337       ExecutionException {
338     // First we need to check whether we need to create a new model.
339     // If either we've never made a model, or some new data has been added, then
340     // we need to train a new model.
341     if (maxentClassifier == null || datasetChanged)
342       initialiseAndTrainClassifier();
343       // The data now reflects the model, so keep a note of this so we don't
344       // have to retrain the model if using the same data.
345     datasetChanged=false;
346 
347     // We need to mark indices on the features, so that they will be
348     // consistent with those on which the model was trained.
349     markIndicesOnFeatures(attributeValues);
350 
351       // When classifying, we need to remove the outcome from the List of
352       // attributes. (N.B. we must do this after marking indices, so that
353       // we don't end up with different indices for features after the class.
354     attributeValues.remove(datasetDefinition.getClassIndex());
355 
356     // Then try to classify stuff.
357     if (confidenceThreshold == 0) { // If no confidence threshold has been set
358       // then just use simple classification.
359       return maxentClassifier.
360           getBestOutcome(maxentClassifier.eval(
361           (String[])attributeValues.toArray(new String[0])));
362     }
363     else { // Otherwise, add all outcomes that are over the threshold.
364       double[] outcomeProbabilities = maxentClassifier.eval(
365           (String[]) attributeValues.toArray(new String[0]));
366 
367       List allOutcomesOverThreshold = new java.util.ArrayList();
368       for (int i = 0; i < outcomeProbabilities.length; i++) {
369         if (outcomeProbabilities[i] >= confidenceThreshold) {
370           allOutcomesOverThreshold.add(maxentClassifier.getOutcome(i));
371         }
372       }
373       return allOutcomesOverThreshold;
374     }
375   } // classifyInstance
376 
377   /**
378    * Initialises the classifier and prepares for running. Before calling this
379    * method, the datasetDefinition and optionsElement fields should have been
380    * set using calls to the appropriate methods.
381    * @throws GateException If it is not possible to initialise the classifier
382    * for any reason.
383    */
384   public void init() throws GateException {
385     //see if we can shout about what we're doing
386     sListener = null;
387     java.util.Map listeners = gate.gui.MainFrame.getListeners();
388     if (listeners != null) {
389       sListener = (gate.event.StatusListener)
390                   listeners.get("gate.event.StatusListener");
391     }
392 
393     if (sListener != null) {
394       sListener.statusChanged("Setting classifier options...");
395     }
396     extractAndCheckOptions();
397 
398     if (sListener != null) {
399       sListener.statusChanged("Checking dataset definition...");
400     }
401     checkDatasetDefinition();
402 
403     // N.B. We don't initialise the classifier here, because maxent classifiers,
404     // are both initialised and trained at the same time. Hence initialisation
405     // takes place in the method classifyInstance.
406 
407     //initialise the dataset
408     if (sListener != null) {
409       sListener.statusChanged("Initialising dataset...");
410 
411     }
412     trainingData = new java.util.ArrayList();
413 
414     if (sListener != null) {
415       sListener.statusChanged("");
416     }
417   } // init
418 
419   /**
420    * Loads the state of this engine from previously saved data.
421    * @param is An open InputStream from which the model will be loaded.
422    */
423   public void load(java.io.InputStream is) throws java.io.IOException {
424     if (sListener != null) {
425       sListener.statusChanged("Loading model...");
426 
427     }
428     java.io.ObjectInputStream ois = new java.io.ObjectInputStream(is);
429 
430     try {
431       maxentClassifier = (opennlp.maxent.MaxentModel) ois.readObject();
432       trainingData = (java.util.List) ois.readObject();
433       datasetDefinition = (DatasetDefintion) ois.readObject();
434       datasetChanged = ois.readBoolean();
435 
436       cutoff = ois.readInt();
437       confidenceThreshold = ois.readDouble();
438       iterations = ois.readInt();
439       verbose = ois.readBoolean();
440       smoothing = ois.readBoolean();
441       smoothingObservation = ois.readDouble();
442     }
443     catch (ClassNotFoundException cnfe) {
444       throw new gate.util.GateRuntimeException(cnfe.toString());
445     }
446     ois.close();
447 
448     if (sListener != null) {
449       sListener.statusChanged("");
450     }
451   }
452 
453   /**
454    * Saves the state of the engine for reuse at a later time.
455    * @param os An open output stream to which the model will be saved.
456    */
457   public void save(java.io.OutputStream os) throws java.io.IOException {
458     if (sListener != null) {
459       sListener.statusChanged("Saving model...");
460 
461     }
462     java.io.ObjectOutputStream oos = new java.io.ObjectOutputStream(os);
463 
464     oos.writeObject(maxentClassifier);
465     oos.writeObject(trainingData);
466     oos.writeObject(datasetDefinition);
467     oos.writeBoolean(datasetChanged);
468 
469     oos.writeInt(cutoff);
470     oos.writeDouble(confidenceThreshold);
471     oos.writeInt(iterations);
472     oos.writeBoolean(verbose);
473     oos.writeBoolean(smoothing);
474     oos.writeDouble(smoothingObservation);
475 
476     oos.flush();
477     oos.close();
478 
479     if (sListener != null) {
480       sListener.statusChanged("");
481     }
482   }
483 
484   /**
485    * Gets the list of actions that can be performed on this resource.
486    * @return a List of Action objects (or null values)
487    */
488   public java.util.List getActions() {
489     return actionsList;
490   }
491 
492   /**
493    * Registers the PR using the engine with the engine itself.
494    * @param pr the processing resource that owns this engine.
495    */
496   public void setOwnerPR(gate.ProcessingResource pr) {
497     this.owner = pr;
498   }
499 
500   public DatasetDefintion getDatasetDefinition() {
501     return datasetDefinition;
502   }
503 
504   public boolean supportsBatchMode(){
505     return false;
506   }
507   
508   /**
509    * This allows the model, including its parameters to be saved to a file.
510    */
511   protected class SaveModelAction
512       extends javax.swing.AbstractAction {
513     public SaveModelAction() {
514       super("Save model");
515       putValue(SHORT_DESCRIPTION, "Saves the ML model to a file");
516     }
517 
518     /**
519      * This function will open a file chooser, and then call the save function
520      * to actually save the model. (It is not normally called directly by the
521      * user, but will be called as the result of the save model menu option
522      * being selected.)
523      */
524     public void actionPerformed(java.awt.event.ActionEvent evt) {
525       Runnable runnable = new Runnable() {
526         public void run() {
527           javax.swing.JFileChooser fileChooser
528               = gate.gui.MainFrame.getFileChooser();
529           fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter());
530           fileChooser.setFileSelectionMode(javax.swing.JFileChooser.FILES_ONLY);
531           fileChooser.setMultiSelectionEnabled(false);
532           if (fileChooser.showSaveDialog(null)
533               == javax.swing.JFileChooser.APPROVE_OPTION) {
534             java.io.File file = fileChooser.getSelectedFile();
535             try {
536               gate.gui.MainFrame.lockGUI("Saving ML model...");
537               save(new java.util.zip.GZIPOutputStream(
538                   new java.io.FileOutputStream(
539                   file.getCanonicalPath(), false)));
540             }
541             catch (java.io.IOException ioe) {
542               javax.swing.JOptionPane.showMessageDialog(null,
543                   "Error!\n" +
544                   ioe.toString(),
545                   "GATE", javax.swing.JOptionPane.ERROR_MESSAGE);
546               ioe.printStackTrace(gate.util.Err.getPrintWriter());
547             }
548             finally {
549               gate.gui.MainFrame.unlockGUI();
550             }
551           }
552         }
553       };
554       Thread thread = new Thread(runnable, "ModelSaver(serialisation)");
555       thread.setPriority(Thread.MIN_PRIORITY);
556       thread.start();
557     }
558   }
559 
560   /**
561    * This reloads a file that was previously saved using the SaveModelAction
562    * class. A maxent ml processing resource must already exist before this
563    * action can be performed.
564    */
565   protected class LoadModelAction
566       extends javax.swing.AbstractAction {
567     public LoadModelAction() {
568       super("Load model");
569       putValue(SHORT_DESCRIPTION, "Loads a ML model from a file");
570     }
571 
572     /**
573      * This function will open a file chooser, and then call the load function
574      * to actually load the model. (It is not normally called directly by the
575      * user, but will be called as the result of the load model menu option
576      * being selected.)
577      */
578     public void actionPerformed(java.awt.event.ActionEvent evt) {
579       Runnable runnable = new Runnable() {
580         public void run() {
581           javax.swing.JFileChooser fileChooser
582               = gate.gui.MainFrame.getFileChooser();
583           fileChooser.setFileFilter(fileChooser.getAcceptAllFileFilter());
584           fileChooser.setFileSelectionMode(javax.swing.JFileChooser.FILES_ONLY);
585           fileChooser.setMultiSelectionEnabled(false);
586           if (fileChooser.showOpenDialog(null)
587               == javax.swing.JFileChooser.APPROVE_OPTION) {
588             java.io.File file = fileChooser.getSelectedFile();
589             try {
590               gate.gui.MainFrame.lockGUI("Loading model...");
591               load(new java.util.zip.GZIPInputStream(
592                   new java.io.FileInputStream(file)));
593             }
594             catch (java.io.IOException ioe) {
595               javax.swing.JOptionPane.showMessageDialog(null,
596                   "Error!\n" +
597                   ioe.toString(),
598                   "GATE", javax.swing.JOptionPane.ERROR_MESSAGE);
599               ioe.printStackTrace(gate.util.Err.getPrintWriter());
600             }
601             finally {
602               gate.gui.MainFrame.unlockGUI();
603             }
604           }
605         }
606       };
607       Thread thread = new Thread(runnable, "ModelLoader(serialisation)");
608       thread.setPriority(Thread.MIN_PRIORITY);
609       thread.start();
610     }
611   }
612 
613   protected gate.creole.ml.DatasetDefintion datasetDefinition;
614 
615   /**
616    * The Maxent classifier used by this wrapper
617    */
618   protected opennlp.maxent.MaxentModel maxentClassifier;
619 
620   /**
621    * This List stores all the data that has been collected. Each item is a
622    * List of Strings, each of which is an attribute. In maxent terms, these
623    * are the features and the outcome - the position of the outcome can be found
624    * by referring to the the datasetDefition object.
625    */
626   protected List trainingData;
627 
628   /**
629    * The JDom element contaning the options fro this wrapper.
630    */
631   protected org.jdom.Element optionsElement;
632 
633   /**
634    * Marks whether the dataset was changed since the last time the classifier
635    * was built.
636    */
637   protected boolean datasetChanged = false;
638 
639   /*
640    *  This list stores the actions that will be available on the context menu
641    *  in the GUI.
642    */
643   protected List actionsList;
644 
645   protected gate.ProcessingResource owner;
646 
647   protected gate.event.StatusListener sListener;
648 
649   /**
650    * The following members are set by the <OPTIONS> part of the config file,
651    * and control the parameters used for training the model, and for
652    * classifying instances. They are initialised with their default values,
653    * but may be changed when setOptions is called.
654    */
655   protected int cutoff = 0;
656   protected double confidenceThreshold = 0;
657   protected int iterations = 10;
658   protected boolean verbose = false;
659   protected boolean smoothing = false;
660   protected double smoothingObservation = 0.1;
661 
662 } // MaxentWrapper