1   package gate.creole.morph;
2   
3   
4   /*
5    *  Morph.java
6    *
7    * Copyright (c) 1998-2005, The University of Sheffield.
8    *
9    * This file is part of GATE (see http://gate.ac.uk/), and is free
10   * software, licenced under the GNU Library General Public License,
11   * Version 2, June1991.
12   *
13   * A copy of this licence is included in the distribution in the file
14   * licence.html, and is also available at http://gate.ac.uk/gate/licence.html.
15   *
16   *  Niraj Aswani, 13/10/2003
17   *
18   *  $Id: Morph.java,v 1.12 2005/01/11 13:51:33 ian Exp $
19   */
20  
21  
22  import java.net.URL;
23  import java.util.Iterator;
24  
25  import gate.*;
26  import gate.creole.*;
27  import gate.util.GateRuntimeException;
28  
29  /**
30   * Description: This class is a wrapper for {@link gate.creole.morph.Interpret},
31   * the Morphological Analyzer.
32   */
33  public class Morph
34      extends AbstractLanguageAnalyser
35      implements ProcessingResource {
36  
37  
38    /** Document to be processed by the morpher, must be provided at Runtime. */
39    private gate.Document document;
40  
41    /** File which cotains rules to be processed */
42    private URL rulesFile;
43  
44    /** Instance of BaseWord class - English Morpher */
45    private Interpret interpret;
46  
47    /** Feature Name that should be displayed for the root word */
48    private String rootFeatureName;
49  
50    /** Feature Name that should be displayed for the affix */
51    private String affixFeatureName;
52  
53    /** The name of the annotation set used for input */
54    private String annotationSetName;
55  
56    /** Boolean value that tells if parser should behave in caseSensitive mode */
57    private Boolean caseSensitive;
58  
59    private Boolean considerPOSTag;
60  
61    /** Default Constructor */
62    public Morph() {
63    }
64  
65    /**
66     * This method creates the instance of the BaseWord - English Morpher and
67     * returns the instance of current class with different attributes and
68     * the instance of BaseWord class wrapped into it.
69     * @return Resource
70     * @throws ResourceInstantiationException
71     */
72    public Resource init() throws ResourceInstantiationException {
73      interpret = new Interpret();
74      if (rulesFile == null) {
75        // no rule file is there, simply run the interpret to interpret it and
76        throw new ResourceInstantiationException("\n\n No Rule File Provided");
77      }
78  
79      // compile the rules
80      interpret.init(rulesFile);
81  
82      return this;
83    }
84  
85    /**
86     * Method is executed after the init() method has finished its execution.
87     * <BR>Method does the following operations:
88     * <OL type="1">
89     * <LI> creates the annotationSet
90     * <LI> fetches word tokens from the document, one at a time
91     * <LI> runs the morpher on each individual word token
92     * <LI> finds the root and the affix for that word
93     * <LI> adds them as features to the current token
94     * @throws ExecutionException
95     */
96    public void execute() throws ExecutionException {
97      // lets start the progress and initialize the progress counter
98      fireProgressChanged(0);
99  
100     // If no document provided to process throw an exception
101     if (document == null) {
102       fireProcessFinished();
103       throw new GateRuntimeException("No document to process!");
104     }
105 
106     // get the annotationSet name provided by the user, or otherwise use the
107     // default method
108     AnnotationSet inputAs = (annotationSetName == null ||
109         annotationSetName.length() == 0) ?
110         document.getAnnotations() :
111         document.getAnnotations(annotationSetName);
112 
113     // Morpher requires English tokenizer to be run before running the Morpher
114     // Fetch tokens from the document
115     AnnotationSet tokens = inputAs.get(TOKEN_ANNOTATION_TYPE);
116     if (tokens == null || tokens.isEmpty()) {
117       fireProcessFinished();
118       throw new ExecutionException("Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher");
119       //javax.swing.JOptionPane.showMessageDialog(null, "Either "+document.getName()+" does not have any contents or \n run the POS Tagger first and then Morpher"); ;
120       //return;
121     }
122 
123     // create iterator to get access to each and every individual token
124     Iterator tokensIter = tokens.iterator();
125 
126     // variables used to keep track on progress
127     int tokenSize = tokens.size();
128     int tokensProcessed = 0;
129     int lastReport = 0;
130 
131     //lets process each token one at a time
132     while (tokensIter != null && tokensIter.hasNext()) {
133       Annotation currentToken = (Annotation) tokensIter.next();
134       String tokenValue = (String) (currentToken.getFeatures().
135                                     get(TOKEN_STRING_FEATURE_NAME));
136       if(considerPOSTag != null && considerPOSTag.booleanValue() && !currentToken.getFeatures().containsKey(TOKEN_CATEGORY_FEATURE_NAME)) {
137         fireProcessFinished();
138         throw new ExecutionException("please run the POS Tagger first and then Morpher");
139         //javax.swing.JOptionPane.showMessageDialog(null, "please run the POS Tagger first and then Morpher"); ;
140         //return;
141       }
142 
143       String posCategory = (String) (currentToken.getFeatures().get(TOKEN_CATEGORY_FEATURE_NAME));
144       if(posCategory == null) {
145         posCategory = "*";
146       }
147 
148       if(considerPOSTag == null || !considerPOSTag.booleanValue()) {
149         posCategory = "*";
150       }
151 
152       // run the Morpher
153       if(!caseSensitive.booleanValue()) {
154         tokenValue = tokenValue.toLowerCase();
155       }
156 
157       String baseWord = interpret.runMorpher(tokenValue, posCategory);
158       String affixWord = interpret.getAffix();
159 
160       // no need to add affix feature if it is null
161       if (affixWord != null) {
162         currentToken.getFeatures().put(affixFeatureName, affixWord);
163       }
164       // add the root word as a feature
165       currentToken.getFeatures().put(rootFeatureName, baseWord);
166 
167       // measure the progress and update every after 100 tokens
168       tokensProcessed++;
169       if(tokensProcessed - lastReport > 100){
170         lastReport = tokensProcessed;
171         fireProgressChanged(tokensProcessed * 100 /tokenSize);
172       }
173     }
174     // process finished, acknowledge user about this.
175     fireProcessFinished();
176   }
177 
178   // getter and setter method
179   /**
180    * Sets the document to be processed
181    * @param document - document to be processed
182    */
183   public void setDocument(gate.Document document) {
184     this.document = document;
185   }
186 
187 
188   /**
189    * This method should only be called after init()
190    * @param word
191    * @return the rootWord
192    */
193   public String findBaseWord(String word, String cat) {
194     return interpret.runMorpher(word, cat);
195   }
196 
197   /**
198    * This method should only be called after init()
199    * @param word
200    * @return the afix of the rootWord
201    */
202   public String findAffix(String word, String cat) {
203     interpret.runMorpher(word, cat);
204     return interpret.getAffix();
205   }
206 
207 
208   /**
209    * Returns the document under process
210    */
211   public gate.Document getDocument() {
212     return this.document;
213   }
214 
215   /**
216    * Sets the rule file to be processed
217    * @param rulesFileURL - rule File name to be processed
218    */
219   public void setRulesFile(URL rulesFile) {
220     this.rulesFile = rulesFile;
221   }
222 
223   /**
224    * Returns the document under process
225    */
226   public URL getRulesFile() {
227     return this.rulesFile;
228   }
229 
230   /**
231    * Returns the feature name that has been currently set to display the root
232    * word
233    */
234   public String getRootFeatureName() {
235     return rootFeatureName;
236   }
237 
238   /**
239    * Sets the feature name that should be displayed for the root word
240    * @param rootFeatureName
241    */
242   public void setRootFeatureName(String rootFeatureName) {
243     this.rootFeatureName = rootFeatureName;
244   }
245 
246   /**
247    * Returns the feature name that has been currently set to display the affix
248    * word
249    */
250   public String getAffixFeatureName() {
251     return affixFeatureName;
252   }
253 
254   /**
255    * Sets the feature name that should be displayed for the affix
256    * @param affixFeatureName
257    */
258   public void setAffixFeatureName(String affixFeatureName) {
259     this.affixFeatureName = affixFeatureName;
260   }
261 
262   /**
263    * Returns the name of the AnnotationSet that has been provided to create
264    * the AnnotationSet
265    */
266   public String getAnnotationSetName() {
267     return annotationSetName;
268   }
269 
270   /**
271    * Sets the AnnonationSet name, that is used to create the AnnotationSet
272    * @param annotationSetName
273    */
274   public void setAnnotationSetName(String annotationSetName) {
275     this.annotationSetName = annotationSetName;
276   }
277 
278   /**
279    * A method which returns if the parser is in caseSenstive mode
280    * @return a {@link Boolean} value.
281    */
282   public Boolean getCaseSensitive() {
283     return this.caseSensitive;
284   }
285 
286   /**
287    * Sets the caseSensitive value, that is used to tell parser if it should
288    * convert document to lowercase before parsing
289    */
290   public void setCaseSensitive(java.lang.Boolean value) {
291     this.caseSensitive = value;
292   }
293 
294   public Boolean getConsiderPOSTag() {
295     return this.considerPOSTag;
296   }
297 
298   public void setConsiderPOSTag(Boolean value) {
299     this.considerPOSTag = value;
300   }
301 }
302