hepple.postag.POSTagger (Java2HTML)

1   /*
2    *  POSTagger.java
3    *
4    *  Copyright (c) 2001-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  HepTag was originally written by Mark Hepple, this version contains
12   *  modifications by Valentin Tablan and Niraj Aswani.
13   *
14   *  $Id: POSTagger.java,v 1.2 2005/10/18 10:01:26 ian_roberts Exp $
15   */
16  
17  /*
18   * INSTRUCTIONS for STAND-ALONE USE
19   * 
20   * SYNOPSIS
21   *     java hepple.postag.POSTagger [options] file1 [file2 ...]
22   * OPTIONS:
23   *     -h, --help : displays this message
24   *     -l, --lexicon <lexicon file> : uses specified lexicon
25   *     -r, --rules <rules file> : uses specified rules
26   * 
27   * NOTE: requires gnu.getopt package
28   */
29  
30  /**
31   * Title:        HepTag
32   * Description:  Mark Hepple's POS tagger
33   * Copyright:    Copyright (c) 2001
34   * Company:      University of Sheffield
35   * @author Mark Hepple
36   * @version 1.0
37   */
38  package hepple.postag;
39  
40  
41  import java.io.*;
42  import java.net.URL;
43  import java.util.*;
44  
45  import gnu.getopt.*;
46  
47  import hepple.postag.rules.*;
48  
49  /**
50   * A Java POS Tagger
51   *
52   * Author: Mark Hepple (hepple@dcs.shef.ac.uk)
53   *
54   * Input:  An ascii text file in "Brill input format", i.e. one
55   *        sentence per line, tokens separated by spaces.
56   *
57   * Output: Same text with each token tagged, i.e. "token" -> "token/tag".
58   *        Output is just streamed to std-output, so commonly will direct
59   *        into some target file.
60   *
61   * Revision: 13/9/00. Version 1.0.
62   *
63   * Comments:
64   *
65   * Implements a version of the decision list based tagging method
66   * described in:
67   *
68   * M. Hepple. 2000. Independence and Commitment: Assumptions for Rapid
69   * Training and Execution of Rule-based Part-of-Speech Taggers.
70   * Proceedings of the 38th Annual Meeting of the Association for
71   * Computational Linguistics (ACL-2000). Hong Kong, October 2000.
72   *
73   * Modified by Niraj Aswani/Ian Roberts to allow explicit specification of the
74   * character encoding to use when reading rules and lexicon files.
75   *
76   * $Id: POSTagger.java,v 1.2 2005/10/18 10:01:26 ian_roberts Exp $
77   *
78   */
79  
80  public class POSTagger {
81  
82  //    static final int MAXTAGS = 200;
83  
84      protected Map rules;
85  //    public Rule[] rules = new Rule[MAXTAGS];
86  //    public Rule[] lastRules = new Rule[MAXTAGS];
87  
88  
89      Lexicon lexicon;
90  
91      /** Niraj */
92      private String encoding;
93      /* End */
94  
95      static final String staart = "STAART";
96  
97      private String[] staartLex = { staart };
98      private String[] deflex_NNP = { "NNP"};
99      private String[] deflex_JJ  = { "JJ"};
100     private String[] deflex_CD  = { "CD"};
101     private String[] deflex_NNS = { "NNS"};
102     private String[] deflex_RB  = { "RB"};
103     private String[] deflex_VBG = { "VBG"};
104     private String[] deflex_NN  = { "NN"};
105 
106     public String[] wordBuff  = { staart,staart,staart,staart,
107         staart,staart,staart };
108 
109     public String[] tagBuff   = { staart,staart,staart,staart,
110         staart,staart,staart };
111     public String[][] lexBuff = { staartLex,staartLex,staartLex,
112          staartLex,staartLex,staartLex,
113          staartLex };
114 
115     /**
116      * Construct a POS tagger using the platform's native encoding to read the
117      * lexicon and rules files.
118      */
119     public POSTagger(URL lexiconURL, URL rulesURL) throws InvalidRuleException,
120                                                           IOException {
121       this(lexiconURL, rulesURL, null);
122     }
123     
124     /**
125      * Construct a POS tagger using the specified encoding to read the lexicon
126      * and rules files.
127      */
128     public POSTagger(URL lexiconURL, URL rulesURL, String encoding) throws InvalidRuleException,
129                                                           IOException{
130       this.encoding = encoding;
131       this.lexicon = new Lexicon(lexiconURL, encoding);
132       rules = new HashMap();
133       readRules(rulesURL);
134     }
135 
136   /**
137    * Creates a new rule of the required type according to the provided ID.
138    * @param ruleId the ID for the rule to be created
139    */
140   public Rule createNewRule(String ruleId) throws InvalidRuleException{
141     try{
142       String className = "hepple.postag.rules.Rule_" + ruleId;
143       Class ruleClass = Class.forName(className);
144       return (Rule)ruleClass.newInstance();
145     }catch(Exception e){
146       throw new InvalidRuleException("Could not create rule " + ruleId + "!\n" +
147                                      e.toString());
148     }
149   }
150 
151   /**
152    * Runs the tagger over a set of sentences.
153    * @param sentences a {@link java.util.List} of {@link java.util.List}s
154    * of words to be tagged. Each list is a sentence represented as a list of
155    * words.
156    * @return a {@link java.util.List} of {@link java.util.List}s of
157    * {@link java.lang.String}[]. A list of tagged sentences, each sentence
158    * being itself a list having pairs of strings as elements with
159    * the word on the first position and the tag on the second.
160    */
161   public List runTagger(List sentences){
162     List output = new ArrayList();
163     List taggedSentence = new ArrayList();
164     Iterator sentencesIter = sentences.iterator();
165     while(sentencesIter.hasNext()){
166       List sentence = (List)sentencesIter.next();
167       Iterator wordsIter = sentence.iterator();
168       while(wordsIter.hasNext()){
169         String newWord = (String)wordsIter.next();
170         oneStep(newWord, taggedSentence);
171       }//while(wordsIter.hasNext())
172       //finished adding all the words from a sentence, add six more 
173       //staarts to flush all words out of the tagging buffer
174       for(int i = 0; i < 6; i++){
175         oneStep(staart, taggedSentence);
176       }
177       //we have a new finished sentence
178       output.add(taggedSentence);
179       taggedSentence = new ArrayList();
180     }//while(sentencesIter.hasNext())
181     return output;
182   }
183 
184 
185   /**
186    * This method sets the encoding that POS tagger uses to read rules and the
187    * lexicons.
188    *
189    * @deprecated The rules and lexicon are read at construction time, so
190    * setting the encoding later will have no effect.
191    */
192   public void setEncoding(String encoding) {
193     throw new IllegalStateException("Cannot change encoding once POS tagger "
194                                   + "has been constructed.  Use the three "
195                                   + "argument constructor to specify "
196                                   + "encoding.");
197   }
198 
199   /**
200    * Adds a new word to the window of 7 words (on the last position) and tags
201    * the word currently in the middle (i.e. on position 3). This function
202    * also reads the word on the first position and adds its tag to the
203    * taggedSentence structure as this word would be lost at the next advance.
204    * If this word completes a sentence then it returns true otherwise it
205    * returns false.
206    * @param word the new word
207    * @param taggedSentence a List of pairs of strings representing the results
208    * of tagging the current sentence so far.
209    * @return returns true if a full sentence is now tagged, otherwise false.
210    */
211   protected boolean oneStep(String word, List taggedSentence){
212     //add the new word at the end of the text window
213     for (int i=1 ; i<7 ; i++) {
214       wordBuff[i-1] = wordBuff[i];
215       tagBuff[i-1] = tagBuff[i];
216       lexBuff[i-1] = lexBuff[i];
217     }
218     wordBuff[6] = word;
219     lexBuff[6] = classifyWord(word);
220     tagBuff[6] = lexBuff[6][0];
221 
222     //apply the rules to the word in the middle of the text window
223     //Try to fire a rule for the current lexical entry. It may be the case that
224     //no rule applies.
225     List rulesToApply = (List)rules.get(lexBuff[3][0]);
226     if(rulesToApply != null && rulesToApply.size() > 0){
227       Iterator rulesIter = rulesToApply.iterator();
228       //find the first rule that applies, fire it and stop.
229       while(rulesIter.hasNext() && !((Rule)rulesIter.next()).apply(this)){}
230     }
231 
232     //save the tagged word from the first position
233     String taggedWord = wordBuff[0];
234     if(taggedWord != staart){
235       taggedSentence.add(new String[]{taggedWord, tagBuff[0]});
236       if(wordBuff[1] == staart){
237         //wordTag[0] was the end of a sentence
238         return true;
239       }//if(wordBuff[1] == staart)
240     }//if(taggedWord != staart)
241     return false;
242 
243   }//protected List oneStep(String word, List taggedSentence)
244 
245   /**
246    * Reads the rules from the rules input file
247    */
248   public void readRules(URL rulesURL) throws IOException, InvalidRuleException{
249     BufferedReader rulesReader;
250     if(encoding == null) {
251       rulesReader = new BufferedReader(new InputStreamReader(rulesURL.
252           openStream()));
253     } else {
254       rulesReader = new BufferedReader(new InputStreamReader(rulesURL.
255           openStream(), this.encoding));
256     }
257 
258     String line;
259     Rule newRule;
260 
261     line = rulesReader.readLine();
262     while(line != null){
263       List ruleParts = new ArrayList();
264       StringTokenizer tokens = new StringTokenizer(line);
265       while (tokens.hasMoreTokens()) ruleParts.add(tokens.nextToken());
266       if (ruleParts.size() < 3) throw new InvalidRuleException(line);
267 
268       newRule = createNewRule((String)ruleParts.get(2));
269       newRule.initialise(ruleParts);
270       List existingRules = (List)rules.get(newRule.from);
271       if(existingRules == null){
272         existingRules = new ArrayList();
273         rules.put(newRule.from, existingRules);
274       }
275       existingRules.add(newRule);
276 
277       line = rulesReader.readLine();
278     }//while(line != null)
279   }//public void readRules()
280 
281   public void showRules(){
282     System.out.println(rules);
283   }
284 
285   /**
286    * Attempts to classify an unknown word.
287    * @param wd the word to be classified
288    */
289   private String[] classifyWord(String wd){
290     String[] result;
291 
292     if (wd == staart) return staartLex;
293 
294     List categories = (List)lexicon.get(wd);
295     if(categories != null){
296       result = new String[categories.size()];
297       for(int i = 0; i < result.length; i++){
298         result[i] = (String)categories.get(i);
299       }
300       return result;
301     }
302 
303     //no lexical entry for the word. Try to guess
304     if ('A' <= wd.charAt(0) && wd.charAt(0) <= 'Z') return deflex_NNP;
305 
306     for (int i=1 ; i < wd.length()-1 ; i++)
307       if (wd.charAt(i) == '-') return deflex_JJ;
308 
309     for (int i=0 ; i < wd.length() ; i++)
310       if ('0' <= wd.charAt(i) && wd.charAt(i) <= '9') return deflex_CD;
311 
312     if (wd.endsWith("ed") ||
313         wd.endsWith("us") ||
314         wd.endsWith("ic") ||
315         wd.endsWith("ble") ||
316         wd.endsWith("ive") ||
317         wd.endsWith("ary") ||
318         wd.endsWith("ful") ||
319         wd.endsWith("ical") ||
320         wd.endsWith("less")) return deflex_JJ;
321 
322     if (wd.endsWith("s")) return deflex_NNS;
323 
324     if (wd.endsWith("ly")) return deflex_RB;
325 
326     if (wd.endsWith("ing")) return deflex_VBG;
327 
328     return deflex_NN;
329   }//private String[] classifyWord(String wd)
330 
331 
332   /**
333    * Main method. Runs the tagger using the arguments to find the resources
334    * to be used for initialisation and the input file.
335    */
336   public static void main(String[] args){
337     if(args.length == 0) help();
338     try{
339       LongOpt[] options = new LongOpt[]{
340         new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
341         new LongOpt("lexicon", LongOpt.NO_ARGUMENT, null, 'l'),
342         new LongOpt("rules", LongOpt.NO_ARGUMENT, null, 'r')
343       };
344       Getopt getopt = new Getopt("HepTag", args, "hl:r:", options);
345       String lexiconUrlString = null;
346       String rulesUrlString = null;
347       int opt;
348       while( (opt = getopt.getopt()) != -1 ){
349         switch(opt) {
350           // -h
351           case 'h':{
352             help();
353             System.exit(0);
354             break;
355           }
356           // -l new lexicon
357           case 'l':{
358             lexiconUrlString = getopt.getOptarg();
359             break;
360           }
361           // -l new lexicon
362           case 'r':{
363             rulesUrlString = getopt.getOptarg();
364             break;
365           }
366           default:{
367             System.err.println("Invalid option " +
368                                args[getopt.getOptind() -1] + "!");
369             System.exit(1);
370           }
371         }//switch(opt)
372       }//while( (opt = g.getopt()) != -1 )
373       String[] fileNames = new String[args.length - getopt.getOptind()];
374       for(int i = getopt.getOptind(); i < args.length; i++){
375        fileNames[i - getopt.getOptind()] = args[i];
376       }
377 
378       URL lexiconURL = (lexiconUrlString == null) ?
379                        POSTagger.class.
380                        getResource("/hepple/resources/sample_lexicon") :
381                        new File(lexiconUrlString).toURL();
382 
383       URL rulesURL = (rulesUrlString == null) ?
384                        POSTagger.class.
385                        getResource("/hepple/resources/sample_ruleset.big") :
386                        new File(rulesUrlString).toURL();
387 
388       POSTagger tagger = new POSTagger(lexiconURL, rulesURL);
389 
390       for(int i = 0; i < fileNames.length; i++){
391         String file = fileNames[i];
392         BufferedReader reader = new BufferedReader(new FileReader(file));
393         String line = reader.readLine();
394 
395         while(line != null){
396           StringTokenizer tokens = new StringTokenizer(line);
397           List sentence = new ArrayList();
398           while(tokens.hasMoreTokens()) sentence.add(tokens.nextToken());
399           List sentences = new ArrayList();
400           sentences.add(sentence);
401           List result = tagger.runTagger(sentences);
402 
403           Iterator iter = result.iterator();
404           while(iter.hasNext()){
405             List sentenceFromTagger = (List)iter.next();
406             Iterator sentIter = sentenceFromTagger.iterator();
407             while(sentIter.hasNext()){
408               String[] tag = (String[])sentIter.next();
409               System.out.print(tag[0] + "/" + tag[1]);
410               if(sentIter.hasNext()) System.out.print(" ");
411               else System.out.println();
412             }//while(sentIter.hasNext())
413           }//while(iter.hasNext())
414           line = reader.readLine();
415         }//while(line != null)
416 //
417 //
418 //
419 //        List result = tagger.runTagger(readInput(file));
420 //        Iterator iter = result.iterator();
421 //        while(iter.hasNext()){
422 //          List sentence = (List)iter.next();
423 //          Iterator sentIter = sentence.iterator();
424 //          while(sentIter.hasNext()){
425 //            String[] tag = (String[])sentIter.next();
426 //            System.out.print(tag[0] + "/" + tag[1]);
427 //            if(sentIter.hasNext()) System.out.print(" ");
428 //            else System.out.println();
429 //          }//while(sentIter.hasNext())
430 //        }//while(iter.hasNext())
431       }//for(int i = 0; i < fileNames.length; i++)
432     }catch(Exception e){
433       e.printStackTrace();
434     }
435   }//public static void main(String[] args)
436 
437   /**
438    * Prints the help message
439    */
440   private static void help(){
441     System.out.println(
442       "NAME\n" +
443       "HepTag - a Part-of-Speech tagger\n" +
444       "see http://www.dcs.shef.ac.uk/~hepple/papers/acl00/abstract.html \n\n" +
445       "SYNOPSIS\n\tjava hepple.postag.POSTagger [options] file1 [file2 ...]\n\n" +
446       "OPTIONS:\n" +
447       "-h, --help \n\tdisplays this message\n" +
448       "-l, --lexicon <lexicon file>\n\tuses specified lexicon\n" +
449       "-r, --rules <rules file>\n\tuses specified rules");
450   }
451 
452   /**
453    * Reads one input file and creates the structure needed by the tagger
454    * for input.
455    */
456   private static List readInput(String file) throws IOException{
457     BufferedReader reader = new BufferedReader(new FileReader(file));
458     String line = reader.readLine();
459     List result = new ArrayList();
460     while(line != null){
461       StringTokenizer tokens = new StringTokenizer(line);
462       List sentence = new ArrayList();
463       while(tokens.hasMoreTokens()) sentence.add(tokens.nextToken());
464       result.add(sentence);
465       line = reader.readLine();
466     }//while(line != null)
467     return result;
468   }//private static List readInput(File file) throws IOException
469 
470 }//public class POSTagger
471