1   /*
2    *  Lexicon.java
3    *
4    *  Copyright (c) 2001-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  HepTag was originally written by Mark Hepple, this version contains
12   *  modifications by Valentin Tablan and Niraj Aswani.
13   *
14   *  $Id: Lexicon.java,v 1.1 2005/09/30 14:48:12 ian_roberts Exp $
15   */
16  package hepple.postag;
17  
18  /**
19   * Title:        HepTag
20   * Description:  Mark Hepple's POS tagger
21   * Copyright:    Copyright (c) 2001
22   * Company:      University of Sheffield
23   * @author Mark Hepple
24   * @version 1.0
25   */
26  
27  import java.util.*;
28  import java.io.*;
29  import java.net.URL;
30  
31  /**
32   * A {@link java.util.HashMap} that maps from lexical entry
33   * ({@link java.lang.String}) to possible POS categories
 * ({@link java.util.List}).
35   */
class Lexicon extends HashMap {

  /**
   * Character encoding that was passed to the constructor, or
   * <code>null</code> when none was given.  It is recorded at construction
   * time only; nothing in this class reads it back.  (Added by Niraj.)
   */
  private String encoding;


  /**
   * Always fails: the lexicon is fully read inside the constructor, so there
   * is nothing left for a later encoding change to affect.
   *
   * @param encoding ignored.
   * @throws IllegalStateException always.
   * @deprecated The lexicon file is read at construction time, so setting the
   * encoding later will have no effect.  Use the two argument constructor to
   * set the encoding.
   */
  public void setEncoding(String encoding) {
    throw new IllegalStateException("Cannot change encoding once POS tagger "
                                  + "has been constructed.  Use the three "
                                  + "argument constructor to specify "
                                  + "encoding.");
  }
  /* End */

  /**
   * Constructor.  Reads the lexicon without an explicit character encoding,
   * i.e. it delegates to {@link #Lexicon(URL, String)} with a
   * <code>null</code> encoding.
   *
   * @param lexiconURL an URL for the file containing the lexicon.
   * @throws IOException if the lexicon cannot be opened or read.
   */
  public Lexicon(URL lexiconURL) throws IOException{
    this(lexiconURL, null);
  }

  /**
   * Constructor.  Reads the lexicon line by line.  On each line the first
   * whitespace-delimited token is the lexical entry and every following token
   * is one of its categories; the entry is mapped to a {@link java.util.List}
   * of those category strings (an empty list if the line holds only the
   * entry).  If the same entry occurs on several lines, the last line wins.
   * Lines that are empty or contain only whitespace are skipped.
   *
   * @param lexiconURL an URL for the file containing the lexicon.
   * @param encoding the character encoding to use for reading the lexicon,
   * or <code>null</code> to use the platform default encoding.
   * @throws IOException if the lexicon cannot be opened or read, or if the
   * named encoding is not supported.
   */
  public Lexicon(URL lexiconURL, String encoding) throws IOException{
    this.encoding = encoding;

    // Open the raw stream first and close *it* in the finally block: building
    // the InputStreamReader may itself fail (unsupported encoding), and the
    // stream must not leak in that case either.
    InputStream lexiconStream = lexiconURL.openStream();
    try {
      Reader decoder = (encoding == null)
          ? new InputStreamReader(lexiconStream)
          : new InputStreamReader(lexiconStream, encoding);
      BufferedReader lexiconReader = new BufferedReader(decoder);

      String line;
      while((line = lexiconReader.readLine()) != null){
        StringTokenizer tokens = new StringTokenizer(line);
        // A blank line has no entry token; calling nextToken() on it would
        // throw NoSuchElementException, so just move on to the next line.
        if(!tokens.hasMoreTokens()) continue;

        String entry = tokens.nextToken();
        List categories = new ArrayList();
        while(tokens.hasMoreTokens()) categories.add(tokens.nextToken());
        put(entry, categories);
      }//while((line = lexiconReader.readLine()) != null)
    } finally {
      lexiconStream.close();
    }
  }//public Lexicon(URL lexiconURL, String encoding) throws IOException

}//class Lexicon
93