1
16
17
29
30
38 package hepple.postag;
39
40
41 import java.io.*;
42 import java.net.URL;
43 import java.util.*;
44
45 import gnu.getopt.*;
46
47 import hepple.postag.rules.*;
48
49
79
80 public class POSTagger {
81
82
84 protected Map rules;
85
88
89 Lexicon lexicon;
90
91
92 private String encoding;
93
94
95 static final String staart = "STAART";
96
97 private String[] staartLex = { staart };
98 private String[] deflex_NNP = { "NNP"};
99 private String[] deflex_JJ = { "JJ"};
100 private String[] deflex_CD = { "CD"};
101 private String[] deflex_NNS = { "NNS"};
102 private String[] deflex_RB = { "RB"};
103 private String[] deflex_VBG = { "VBG"};
104 private String[] deflex_NN = { "NN"};
105
106 public String[] wordBuff = { staart,staart,staart,staart,
107 staart,staart,staart };
108
109 public String[] tagBuff = { staart,staart,staart,staart,
110 staart,staart,staart };
111 public String[][] lexBuff = { staartLex,staartLex,staartLex,
112 staartLex,staartLex,staartLex,
113 staartLex };
114
115
119 public POSTagger(URL lexiconURL, URL rulesURL) throws InvalidRuleException,
120 IOException {
121 this(lexiconURL, rulesURL, null);
122 }
123
124
128 public POSTagger(URL lexiconURL, URL rulesURL, String encoding) throws InvalidRuleException,
129 IOException{
130 this.encoding = encoding;
131 this.lexicon = new Lexicon(lexiconURL, encoding);
132 rules = new HashMap();
133 readRules(rulesURL);
134 }
135
136
140 public Rule createNewRule(String ruleId) throws InvalidRuleException{
141 try{
142 String className = "hepple.postag.rules.Rule_" + ruleId;
143 Class ruleClass = Class.forName(className);
144 return (Rule)ruleClass.newInstance();
145 }catch(Exception e){
146 throw new InvalidRuleException("Could not create rule " + ruleId + "!\n" +
147 e.toString());
148 }
149 }
150
151
161 public List runTagger(List sentences){
162 List output = new ArrayList();
163 List taggedSentence = new ArrayList();
164 Iterator sentencesIter = sentences.iterator();
165 while(sentencesIter.hasNext()){
166 List sentence = (List)sentencesIter.next();
167 Iterator wordsIter = sentence.iterator();
168 while(wordsIter.hasNext()){
169 String newWord = (String)wordsIter.next();
170 oneStep(newWord, taggedSentence);
171 } for(int i = 0; i < 6; i++){
175 oneStep(staart, taggedSentence);
176 }
177 output.add(taggedSentence);
179 taggedSentence = new ArrayList();
180 } return output;
182 }
183
184
185
192 public void setEncoding(String encoding) {
193 throw new IllegalStateException("Cannot change encoding once POS tagger "
194 + "has been constructed. Use the three "
195 + "argument constructor to specify "
196 + "encoding.");
197 }
198
199
211 protected boolean oneStep(String word, List taggedSentence){
212 for (int i=1 ; i<7 ; i++) {
214 wordBuff[i-1] = wordBuff[i];
215 tagBuff[i-1] = tagBuff[i];
216 lexBuff[i-1] = lexBuff[i];
217 }
218 wordBuff[6] = word;
219 lexBuff[6] = classifyWord(word);
220 tagBuff[6] = lexBuff[6][0];
221
222 List rulesToApply = (List)rules.get(lexBuff[3][0]);
226 if(rulesToApply != null && rulesToApply.size() > 0){
227 Iterator rulesIter = rulesToApply.iterator();
228 while(rulesIter.hasNext() && !((Rule)rulesIter.next()).apply(this)){}
230 }
231
232 String taggedWord = wordBuff[0];
234 if(taggedWord != staart){
235 taggedSentence.add(new String[]{taggedWord, tagBuff[0]});
236 if(wordBuff[1] == staart){
237 return true;
239 } } return false;
242
243 }
245
248 public void readRules(URL rulesURL) throws IOException, InvalidRuleException{
249 BufferedReader rulesReader;
250 if(encoding == null) {
251 rulesReader = new BufferedReader(new InputStreamReader(rulesURL.
252 openStream()));
253 } else {
254 rulesReader = new BufferedReader(new InputStreamReader(rulesURL.
255 openStream(), this.encoding));
256 }
257
258 String line;
259 Rule newRule;
260
261 line = rulesReader.readLine();
262 while(line != null){
263 List ruleParts = new ArrayList();
264 StringTokenizer tokens = new StringTokenizer(line);
265 while (tokens.hasMoreTokens()) ruleParts.add(tokens.nextToken());
266 if (ruleParts.size() < 3) throw new InvalidRuleException(line);
267
268 newRule = createNewRule((String)ruleParts.get(2));
269 newRule.initialise(ruleParts);
270 List existingRules = (List)rules.get(newRule.from);
271 if(existingRules == null){
272 existingRules = new ArrayList();
273 rules.put(newRule.from, existingRules);
274 }
275 existingRules.add(newRule);
276
277 line = rulesReader.readLine();
278 } }
281 public void showRules(){
282 System.out.println(rules);
283 }
284
285
289 private String[] classifyWord(String wd){
290 String[] result;
291
292 if (wd == staart) return staartLex;
293
294 List categories = (List)lexicon.get(wd);
295 if(categories != null){
296 result = new String[categories.size()];
297 for(int i = 0; i < result.length; i++){
298 result[i] = (String)categories.get(i);
299 }
300 return result;
301 }
302
303 if ('A' <= wd.charAt(0) && wd.charAt(0) <= 'Z') return deflex_NNP;
305
306 for (int i=1 ; i < wd.length()-1 ; i++)
307 if (wd.charAt(i) == '-') return deflex_JJ;
308
309 for (int i=0 ; i < wd.length() ; i++)
310 if ('0' <= wd.charAt(i) && wd.charAt(i) <= '9') return deflex_CD;
311
312 if (wd.endsWith("ed") ||
313 wd.endsWith("us") ||
314 wd.endsWith("ic") ||
315 wd.endsWith("ble") ||
316 wd.endsWith("ive") ||
317 wd.endsWith("ary") ||
318 wd.endsWith("ful") ||
319 wd.endsWith("ical") ||
320 wd.endsWith("less")) return deflex_JJ;
321
322 if (wd.endsWith("s")) return deflex_NNS;
323
324 if (wd.endsWith("ly")) return deflex_RB;
325
326 if (wd.endsWith("ing")) return deflex_VBG;
327
328 return deflex_NN;
329 }
331
332
336 public static void main(String[] args){
337 if(args.length == 0) help();
338 try{
339 LongOpt[] options = new LongOpt[]{
340 new LongOpt("help", LongOpt.NO_ARGUMENT, null, 'h'),
341 new LongOpt("lexicon", LongOpt.NO_ARGUMENT, null, 'l'),
342 new LongOpt("rules", LongOpt.NO_ARGUMENT, null, 'r')
343 };
344 Getopt getopt = new Getopt("HepTag", args, "hl:r:", options);
345 String lexiconUrlString = null;
346 String rulesUrlString = null;
347 int opt;
348 while( (opt = getopt.getopt()) != -1 ){
349 switch(opt) {
350 case 'h':{
352 help();
353 System.exit(0);
354 break;
355 }
356 case 'l':{
358 lexiconUrlString = getopt.getOptarg();
359 break;
360 }
361 case 'r':{
363 rulesUrlString = getopt.getOptarg();
364 break;
365 }
366 default:{
367 System.err.println("Invalid option " +
368 args[getopt.getOptind() -1] + "!");
369 System.exit(1);
370 }
371 } } String[] fileNames = new String[args.length - getopt.getOptind()];
374 for(int i = getopt.getOptind(); i < args.length; i++){
375 fileNames[i - getopt.getOptind()] = args[i];
376 }
377
378 URL lexiconURL = (lexiconUrlString == null) ?
379 POSTagger.class.
380 getResource("/hepple/resources/sample_lexicon") :
381 new File(lexiconUrlString).toURL();
382
383 URL rulesURL = (rulesUrlString == null) ?
384 POSTagger.class.
385 getResource("/hepple/resources/sample_ruleset.big") :
386 new File(rulesUrlString).toURL();
387
388 POSTagger tagger = new POSTagger(lexiconURL, rulesURL);
389
390 for(int i = 0; i < fileNames.length; i++){
391 String file = fileNames[i];
392 BufferedReader reader = new BufferedReader(new FileReader(file));
393 String line = reader.readLine();
394
395 while(line != null){
396 StringTokenizer tokens = new StringTokenizer(line);
397 List sentence = new ArrayList();
398 while(tokens.hasMoreTokens()) sentence.add(tokens.nextToken());
399 List sentences = new ArrayList();
400 sentences.add(sentence);
401 List result = tagger.runTagger(sentences);
402
403 Iterator iter = result.iterator();
404 while(iter.hasNext()){
405 List sentenceFromTagger = (List)iter.next();
406 Iterator sentIter = sentenceFromTagger.iterator();
407 while(sentIter.hasNext()){
408 String[] tag = (String[])sentIter.next();
409 System.out.print(tag[0] + "/" + tag[1]);
410 if(sentIter.hasNext()) System.out.print(" ");
411 else System.out.println();
412 } } line = reader.readLine();
415 } } }catch(Exception e){
433 e.printStackTrace();
434 }
435 }
437
440 private static void help(){
441 System.out.println(
442 "NAME\n" +
443 "HepTag - a Part-of-Speech tagger\n" +
444 "see http://www.dcs.shef.ac.uk/~hepple/papers/acl00/abstract.html \n\n" +
445 "SYNOPSIS\n\tjava hepple.postag.POSTagger [options] file1 [file2 ...]\n\n" +
446 "OPTIONS:\n" +
447 "-h, --help \n\tdisplays this message\n" +
448 "-l, --lexicon <lexicon file>\n\tuses specified lexicon\n" +
449 "-r, --rules <rules file>\n\tuses specified rules");
450 }
451
452
456 private static List readInput(String file) throws IOException{
457 BufferedReader reader = new BufferedReader(new FileReader(file));
458 String line = reader.readLine();
459 List result = new ArrayList();
460 while(line != null){
461 StringTokenizer tokens = new StringTokenizer(line);
462 List sentence = new ArrayList();
463 while(tokens.hasMoreTokens()) sentence.add(tokens.nextToken());
464 result.add(sentence);
465 line = reader.readLine();
466 } return result;
468 }
470 }