1   /*
2    *  Copyright (c) 1998-2005, The University of Sheffield.
3    *
4    *  This file is part of GATE (see http://gate.ac.uk/), and is free
5    *  software, licenced under the GNU Library General Public License,
6    *  Version 2, June 1991 (in the distribution as file licence.html,
7    *  and also available at http://gate.ac.uk/gate/licence.html).
8    *
9    *  Valentin Tablan, 01 Feb 2000
10   *
11   *  $Id: SentenceSplitter.java,v 1.29 2005/06/30 17:03:08 valyt Exp $
12   */
13  
14  
15  package gate.creole.splitter;
16  
17  import gate.*;
18  import gate.creole.*;
19  import gate.creole.gazetteer.DefaultGazetteer;
20  import gate.event.ProgressListener;
21  import gate.event.StatusListener;
22  import gate.util.GateRuntimeException;
23  import gate.util.InvalidOffsetException;
24  import gate.util.LuckyException;
25  /**
26   * A sentence splitter. This is module contains a tokeniser, a
27   * gazetteer and a Jape grammar. This class is used so we can have a different
28   * entry in the creole.xml file describing the default resources and to add
29   * some minor processing after running the components in order to extract the
30   * results in a usable form.
31   */
32  public class SentenceSplitter extends AbstractLanguageAnalyser{
33  
34    public static final String
35      SPLIT_DOCUMENT_PARAMETER_NAME = "document";
36  
37    public static final String
38      SPLIT_INPUT_AS_PARAMETER_NAME = "inputASName";
39  
40    public static final String
41      SPLIT_OUTPUT_AS_PARAMETER_NAME = "outputASName";
42  
43    public static final String
44      SPLIT_ENCODING_PARAMETER_NAME = "encoding";
45  
46    public static final String
47      SPLIT_GAZ_URL_PARAMETER_NAME = "gazetteerListsURL";
48  
49    public static final String
50      SPLIT_TRANSD_URL_PARAMETER_NAME = "transducerURL";
51  
52    public Resource init()throws ResourceInstantiationException{
53      //create all the componets
54      FeatureMap params;
55      FeatureMap features;
56  
57      //gazetteer
58      fireStatusChanged("Creating the gazetteer");
59      params = Factory.newFeatureMap();
60      if(gazetteerListsURL != null)
61        params.put(DefaultGazetteer.DEF_GAZ_LISTS_URL_PARAMETER_NAME,
62                                               gazetteerListsURL);
63      params.put(DefaultGazetteer.DEF_GAZ_ENCODING_PARAMETER_NAME, encoding);
64      features = Factory.newFeatureMap();
65      Gate.setHiddenAttribute(features, true);
66  
67  
68      gazetteer = (DefaultGazetteer)Factory.createResource(
69                      "gate.creole.gazetteer.DefaultGazetteer",
70                      params, features);
71      gazetteer.setName("Gazetteer " + System.currentTimeMillis());
72      fireProgressChanged(10);
73  
74      //transducer
75      fireStatusChanged("Creating the JAPE transducer");
76  
77      params = Factory.newFeatureMap();
78      if(transducerURL != null)
79        params.put(Transducer.TRANSD_GRAMMAR_URL_PARAMETER_NAME, transducerURL);
80      params.put(Transducer.TRANSD_ENCODING_PARAMETER_NAME, encoding);
81      features = Factory.newFeatureMap();
82      Gate.setHiddenAttribute(features, true);
83  
84      transducer = (Transducer)Factory.createResource(
85                      "gate.creole.Transducer",
86                      params, features);
87      transducer.setName("Transducer " + System.currentTimeMillis());
88  
89      fireProgressChanged(100);
90      fireProcessFinished();
91  
92      return this;
93    }
94  
95    public void execute() throws ExecutionException{
96      interrupted = false;
97      //set the runtime parameters
98      FeatureMap params;
99      if(inputASName != null && inputASName.equals("")) inputASName = null;
100     if(outputASName != null && outputASName.equals("")) outputASName = null;
101     try{
102       fireProgressChanged(0);
103       params = Factory.newFeatureMap();
104       params.put(DefaultGazetteer.DEF_GAZ_DOCUMENT_PARAMETER_NAME, document);
105       params.put(DefaultGazetteer.DEF_GAZ_ANNOT_SET_PARAMETER_NAME, inputASName);
106       gazetteer.setParameterValues(params);
107 
108       params = Factory.newFeatureMap();
109       params.put(Transducer.TRANSD_DOCUMENT_PARAMETER_NAME, document);
110       params.put(Transducer.TRANSD_INPUT_AS_PARAMETER_NAME, inputASName);
111       params.put(Transducer.TRANSD_OUTPUT_AS_PARAMETER_NAME, inputASName);
112       transducer.setParameterValues(params);
113     }catch(Exception e){
114       throw new ExecutionException(e);
115     }
116     ProgressListener pListener = null;
117     StatusListener sListener = null;
118     fireProgressChanged(5);
119 
120     //run the gazetteer
121     if(isInterrupted()) throw new ExecutionInterruptedException(
122         "The execution of the \"" + getName() +
123         "\" sentence splitter has been abruptly interrupted!");
124     pListener = new IntervalProgressListener(5, 10);
125     sListener = new StatusListener(){
126       public void statusChanged(String text){
127         fireStatusChanged(text);
128       }
129     };
130     gazetteer.addProgressListener(pListener);
131     gazetteer.addStatusListener(sListener);
132     gazetteer.execute();
133     gazetteer.removeProgressListener(pListener);
134     gazetteer.removeStatusListener(sListener);
135 
136     //run the transducer
137     if(isInterrupted()) throw new ExecutionInterruptedException(
138         "The execution of the \"" + getName() +
139         "\" sentence splitter has been abruptly interrupted!");
140     pListener = new IntervalProgressListener(11, 90);
141     transducer.addProgressListener(pListener);
142     transducer.addStatusListener(sListener);
143     transducer.execute();
144     transducer.removeProgressListener(pListener);
145     transducer.removeStatusListener(sListener);
146 
147     //get pointers to the annotation sets
148     AnnotationSet inputAS = (inputASName == null) ?
149                             document.getAnnotations() :
150                             document.getAnnotations(inputASName);
151 
152     AnnotationSet outputAS = (outputASName == null) ?
153                              document.getAnnotations() :
154                              document.getAnnotations(outputASName);
155 
156     //copy the results to the output set if they are different
157     if(inputAS != outputAS){
158       outputAS.addAll(inputAS.get(SENTENCE_ANNOTATION_TYPE));
159     }
160 
161     //create one big sentence if none were found
162     AnnotationSet sentences = outputAS.get(SENTENCE_ANNOTATION_TYPE);
163     if(sentences == null || sentences.isEmpty()){
164       //create an annotation covering the entire content
165       try{
166         outputAS.add(new Long(0), document.getContent().size(), 
167                 SENTENCE_ANNOTATION_TYPE, Factory.newFeatureMap());
168       }catch(InvalidOffsetException ioe){
169         throw new GateRuntimeException(ioe);
170       }
171     }else{
172       //add a sentence covering all the tokens after the last sentence
173       Long endSentences = sentences.lastNode().getOffset();
174       AnnotationSet remainingTokens = inputAS.get(TOKEN_ANNOTATION_TYPE, endSentences,
175                                                   inputAS.lastNode().getOffset());
176       if(remainingTokens != null && !remainingTokens.isEmpty()){
177         try{
178           outputAS.add(remainingTokens.firstNode().getOffset(),
179                        remainingTokens.lastNode().getOffset(),
180                        SENTENCE_ANNOTATION_TYPE,
181                        Factory.newFeatureMap());
182         }catch(InvalidOffsetException ioe){
183           throw new ExecutionException(ioe);
184         }
185       }
186     }
187     fireProcessFinished();
188   }//execute()
189 
190   /**
191    * Notifies all the PRs in this controller that they should stop their
192    * execution as soon as possible.
193    */
194   public synchronized void interrupt(){
195     interrupted = true;
196     gazetteer.interrupt();
197     transducer.interrupt();
198   }
199 
200   public void setTransducerURL(java.net.URL newTransducerURL) {
201     transducerURL = newTransducerURL;
202   }
203   public java.net.URL getTransducerURL() {
204     return transducerURL;
205   }
206   DefaultGazetteer gazetteer;
207   Transducer transducer;
208   private java.net.URL transducerURL;
209   private String encoding;
210   private java.net.URL gazetteerListsURL;
211 
212 
213   public void setEncoding(String newEncoding) {
214     encoding = newEncoding;
215   }
216   public String getEncoding() {
217     return encoding;
218   }
219   public void setGazetteerListsURL(java.net.URL newGazetteerListsURL) {
220     gazetteerListsURL = newGazetteerListsURL;
221   }
222   public java.net.URL getGazetteerListsURL() {
223     return gazetteerListsURL;
224   }
225   public void setInputASName(String newInputASName) {
226     inputASName = newInputASName;
227   }
228 
229   public String getInputASName() {
230     return inputASName;
231   }
232   public void setOutputASName(String newOutputASName) {
233     outputASName = newOutputASName;
234   }
235   public String getOutputASName() {
236     return outputASName;
237   }
238 
239 
240 
241   private static final boolean DEBUG = false;
242   private String inputASName;
243   private String outputASName;
244 }//public class SentenceSplitter extends Nerc