1   /*
2    *  DefaultTokeniser.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Valentin Tablan, 2000
12   *
13   *  $Id: SimpleTokeniser.java,v 1.17 2005/01/11 13:51:33 ian Exp $
14   */
15  
16  package gate.creole.tokeniser;
17  
18  import java.io.*;
19  import java.lang.reflect.Field;
20  import java.lang.reflect.Modifier;
21  import java.util.*;
22  
23  import gate.*;
24  import gate.creole.*;
25  import gate.util.*;
26  
27  //import EDU.auburn.VGJ.graph.ParseError;
28  
29  /** Implementation of a Unicode rule based tokeniser.
 * The tokeniser gets its rules from an {@link java.io.InputStream
 * InputStream} or a {@link java.io.Reader Reader} which should be sent to one
 * of the constructors.
 * The implementation is based on a finite state machine that is built from
 * the set of rules.
 * A rule has two sides, the left hand side (LHS) and the right hand side (RHS)
36   * that are separated by the ">" character. The LHS represents a
37   * regular expression that will be matched against the input while the RHS
38   * describes a Gate2 annotation in terms of annotation type and attribute-value
39   * pairs.
 * The matching is done using Unicode enumerated types as defined by the {@link
 * java.lang.Character Character} class. At the time of writing this class the
 * supported Unicode categories were:
43   * <ul>
44   * <li>UNASSIGNED
45   * <li>UPPERCASE_LETTER
46   * <li>LOWERCASE_LETTER
47   * <li>TITLECASE_LETTER
48   * <li>MODIFIER_LETTER
49   * <li>OTHER_LETTER
50   * <li>NON_SPACING_MARK
51   * <li>ENCLOSING_MARK
52   * <li>COMBINING_SPACING_MARK
53   * <li>DECIMAL_DIGIT_NUMBER
54   * <li>LETTER_NUMBER
55   * <li>OTHER_NUMBER
56   * <li>SPACE_SEPARATOR
57   * <li>LINE_SEPARATOR
58   * <li>PARAGRAPH_SEPARATOR
59   * <li>CONTROL
60   * <li>FORMAT
61   * <li>PRIVATE_USE
62   * <li>SURROGATE
63   * <li>DASH_PUNCTUATION
64   * <li>START_PUNCTUATION
65   * <li>END_PUNCTUATION
66   * <li>CONNECTOR_PUNCTUATION
67   * <li>OTHER_PUNCTUATION
68   * <li>MATH_SYMBOL
69   * <li>CURRENCY_SYMBOL
70   * <li>MODIFIER_SYMBOL
71   * <li>OTHER_SYMBOL
72   * </ul>
 * The accepted operators for the LHS are "+", "*" and "|" having the usual
 * interpretations of "1 to n occurrences", "0 to n occurrences" and
 * "boolean OR".
76   * For instance this is a valid LHS:
77   * <br>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+
78   * <br>meaning an uppercase letter followed by one or more lowercase letters.
79   *
80   * The RHS describes an annotation that is to be created and inserted in the
81   * annotation set provided in case of a match. The new annotation will span the
82   * text that has been recognised. The RHS consists in the annotation type
83   * followed by pairs of attributes and associated values.
84   * E.g. for the LHS above a possible RHS can be:<br>
85   * Token;kind=upperInitial;<br>
86   * representing an annotation of type &quot;Token&quot; having one attribute
87   * named &quot;kind&quot; with the value &quot;upperInitial&quot;<br>
 * The entire rule will be:<br>
89   * <pre>"UPPERCASE_LETTER" "LOWERCASE_LETTER"+ > Token;kind=upperInitial;</pre>
90   * <br>
91   * The tokeniser ignores all the empty lines or the ones that start with # or
92   * //.
93   *
94   */
95  public class SimpleTokeniser extends AbstractLanguageAnalyser{
  /** The name of the "document" runtime parameter. */
  public static final String
    SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

  /** The name of the "annotationSetName" runtime parameter. */
  public static final String
    SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  /** The name of the "rulesURL" init-time parameter. */
  public static final String
    SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

  /** The name of the "encoding" init-time parameter. */
  public static final String
    SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

  /** Debug flag
   */
  private static final boolean DEBUG = false;

  /**
   * Creates a tokeniser
   */
  public SimpleTokeniser(){
  }
117 
118   /**
119    * Initialises this tokeniser by reading the rules from an external source (provided through an URL) and building
120    * the finite state machine at the core of the tokeniser.
121    *
122    * @exception ResourceInstantiationException
123    */
124   public Resource init() throws ResourceInstantiationException{
125     Reader rulesReader;
126     try{
127       if(rulesURL != null){
128         rulesReader = new InputStreamReader(rulesURL.openStream(), encoding);
129       }else{
130         //no init data, Scream!
131         throw new ResourceInstantiationException(
132           "No URL provided for the rules!");
133       }
134       initialState = new FSMState(this);
135       BufferedReader bRulesReader = new BufferedReader(rulesReader);
136       String line = bRulesReader.readLine();
137       ///String toParse = "";
138       StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);
139 
140       while (line != null){
141         if(line.endsWith("\\")){
142           ///toParse += line.substring(0,line.length()-1);
143           toParse.append(line.substring(0,line.length()-1));
144         }else{
145           /*toParse += line;
146           parseRule(toParse);
147           toParse = "";
148           */
149           toParse.append(line);
150           parseRule(toParse.toString());
151           toParse.delete(0,toParse.length());
152         }
153         line = bRulesReader.readLine();
154       }
155       eliminateVoidTransitions();
156     }catch(java.io.IOException ioe){
157       throw new ResourceInstantiationException(ioe);
158     }catch(TokeniserException te){
159       throw new ResourceInstantiationException(te);
160     }
161     return this;
162   }
163 
164   /**
165    * Prepares this Processing resource for a new run.
166    */
167   public void reset(){
168     document = null;
169   }
170 
171   /** Parses one input line containing a tokeniser rule.
172    * This will create the necessary FSMState objects and the links
173    * between them.
174    *
175    * @param line the string containing the rule
176    */
177   void parseRule(String line)throws TokeniserException{
178     //ignore comments
179     if(line.startsWith("#")) return;
180 
181     if(line.startsWith("//")) return;
182 
183     StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
184     FSMState newState = new FSMState(this);
185 
186     initialState.put(null, newState);
187     FSMState finalState = parseLHS(newState, st, LHStoRHS);
188     String rhs = "";
189 
190     if(st.hasMoreTokens()) rhs = st.nextToken("\f");
191 
192     if(rhs.length() > 0)finalState.setRhs(rhs);
193   } // parseRule
194 
  /** Parses a part or the entire LHS.
   *
   * @param startState a FSMState object representing the initial state for
   *     the small FSM that will recognise the (part of) the rule parsed by this
   *     method.
   * @param st a {@link java.util.StringTokenizer StringTokenizer} that
   *     provides the input
   * @param until the string that marks the end of the section to be
   *     recognised. This method will first be called by {@link
   *     #parseRule(String)} with &quot;&gt;&quot; in order to parse the entire
   *     LHS. When necessary it will make itself another call to {@link #parseLHS
   *     parseLHS} to parse a region of the LHS (e.g. a
   *     &quot;(&quot;,&quot;)&quot; enclosed part).
   * @return the final state of the FSM fragment built for the parsed section
   */
  FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
       throws TokeniserException{

    FSMState currentState = startState;
    //true while the branches of a "|" alternation are being collected
    boolean orFound = false;
    //the end states of the alternatives collected so far
    List orList = new LinkedList();
    String token;
    token = skipIgnoreTokens(st);

    if(null == token) return currentState;

    FSMState newState;
    Integer typeId;
    UnicodeType uType;

    bigwhile: while(!token.equals(until)){
      if(token.equals("(")){//(..)
        //parenthesised group: recurse until the matching ")"
        newState = parseLHS(currentState, st,")");
      } else if(token.equals("\"")){//"unicode_type"
        String sType = parseQuotedString(st, "\"");
        newState = new FSMState(this);
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        //transition restricted to this Unicode category
        currentState.put(uType ,newState);
      } else {// a type with no quotes
        String sType = token;
        newState = new FSMState(this);
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType ,newState);
      }
      //treat the operators
      token = skipIgnoreTokens(st);
      if(null == token) throw
        new InvalidRuleException("Tokeniser rule ended too soon!");

      if(token.equals("|")) {
        //remember this alternative's end state and keep collecting branches
        orFound = true;
        orList.add(newState);
        token = skipIgnoreTokens(st);
        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");

        continue bigwhile;
      } else if(orFound) {//done parsing the "|"
        //join all the alternatives' end states into a single fresh state
        //via void (unrestricted) transitions
        orFound = false;
        orList.add(newState);
        newState = new FSMState(this);
        Iterator orListIter = orList.iterator();

        while(orListIter.hasNext())
          ((FSMState)orListIter.next()).put(null, newState);
        orList.clear();
      }

      if(token.equals("+")) {
        //"1 to n occurrences": a void back-edge lets the last element repeat;
        //matching then continues from a fresh state
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      } else if(token.equals("*")) {
        //"0 to n occurrences": like "+" but a void forward edge also allows
        //skipping the element entirely
        currentState.put(null,newState);
        newState.put(null,currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null,newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      }
      currentState = newState;
    }
    return currentState;
  } // parseLHS
299 
300   /** Parses from the given string tokeniser until it finds a specific
301    * delimiter.
302    * One use for this method is to read everything until the first quote.
303    *
304    * @param st a {@link java.util.StringTokenizer StringTokenizer} that
305    *     provides the input
306    * @param until a String representing the end delimiter.
307    */
308   String parseQuotedString(StringTokenizer st, String until)
309     throws TokeniserException {
310 
311     String token;
312 
313     if(st.hasMoreElements()) token = st.nextToken();
314     else return null;
315 
316     ///String type = "";
317     StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);
318 
319     while(!token.equals(until)){
320       //type += token;
321       type.append(token);
322       if(st.hasMoreElements())token = st.nextToken();
323       else throw new InvalidRuleException("Tokeniser rule ended too soon!");
324     }
325     return type.toString();
326   } // parseQuotedString
327 
328   /** Skips the ignorable tokens from the input returning the first significant
329    * token.
330    * The ignorable tokens are defined by {@link #ignoreTokens a set}
331    */
332   protected static String skipIgnoreTokens(StringTokenizer st){
333     Iterator ignorables;
334     boolean ignorableFound = false;
335     String currentToken;
336 
337     while(true){
338       if(st.hasMoreTokens()){
339         currentToken = st.nextToken();
340         ignorables = ignoreTokens.iterator();
341         ignorableFound = false;
342 
343         while(!ignorableFound && ignorables.hasNext()){
344           if(currentToken.equals((String)ignorables.next()))
345             ignorableFound = true;
346         }
347 
348         if(!ignorableFound) return currentToken;
349       } else return null;
350     }
351   }//skipIgnoreTokens
352 
  /** Computes the lambda-closure (aka epsilon closure) of the given set of
   * states, that is the set of states that are accessible from any of the
   * states in the given set using only unrestricted (void) transitions.
   * (The previous javadoc, "Converts the finite state machine to a
   * deterministic one", described the caller, not this method.)
   *
   * @param s the set of {@link FSMState} objects to be closed over
   * @return a set containing all the states accessible from the given states
   *     via transitions that bear no restrictions.
   */
  private AbstractSet lambdaClosure(Set s){

    //the stack/queue used by the algorithm
    LinkedList list = new LinkedList(s);

    //the set to be returned
    AbstractSet lambdaClosure = new HashSet(s);

    FSMState top;
    FSMState currentState;
    Set nextStates;
    Iterator statesIter;

    //worklist loop: follow null (unrestricted) transitions until no new
    //state is discovered
    while(!list.isEmpty()) {
      top = (FSMState)list.removeFirst();
      nextStates = top.nextSet(null);

      if(null != nextStates){
        statesIter = nextStates.iterator();

        while(statesIter.hasNext()) {
          currentState = (FSMState)statesIter.next();
          if(!lambdaClosure.contains(currentState)){
            lambdaClosure.add(currentState);
            list.addFirst(currentState);
          }//if(!lambdaClosure.contains(currentState))
        }//while(statesIter.hasNext())

      }//if(null != nextStates)
    }
    return lambdaClosure;
  } // lambdaClosure
396 
397   /** Converts the FSM from a non-deterministic to a deterministic one by
398    * eliminating all the unrestricted transitions.
399    */
400   void eliminateVoidTransitions() throws TokeniserException {
401 
402     //kalina:clear() faster than init() which is called with init()
403     newStates.clear();
404     Set sdStates = new HashSet();
405     LinkedList unmarkedDStates = new LinkedList();
406     DFSMState dCurrentState = new DFSMState(this);
407     Set sdCurrentState = new HashSet();
408 
409     sdCurrentState.add(initialState);
410     sdCurrentState = lambdaClosure(sdCurrentState);
411     newStates.put(sdCurrentState, dCurrentState);
412     sdStates.add(sdCurrentState);
413 
414     //find out if the new state is a final one
415     Iterator innerStatesIter = sdCurrentState.iterator();
416     String rhs;
417     FSMState currentInnerState;
418     Set rhsClashSet = new HashSet();
419     boolean newRhs = false;
420 
421     while(innerStatesIter.hasNext()){
422       currentInnerState = (FSMState)innerStatesIter.next();
423       if(currentInnerState.isFinal()){
424         rhs = currentInnerState.getRhs();
425         rhsClashSet.add(rhs);
426         dCurrentState.rhs = rhs;
427         newRhs = true;
428       }
429     }
430 
431     if(rhsClashSet.size() > 1){
432       Err.println("Warning, rule clash: " +  rhsClashSet +
433                          "\nSelected last definition: " + dCurrentState.rhs);
434     }
435 
436     if(newRhs)dCurrentState.buildTokenDesc();
437     rhsClashSet.clear();
438     unmarkedDStates.addFirst(sdCurrentState);
439     dInitialState = dCurrentState;
440     Set nextSet;
441 
442     while(!unmarkedDStates.isEmpty()){
443       //Out.println("\n\n=====================" + unmarkedDStates.size());
444       sdCurrentState = (Set)unmarkedDStates.removeFirst();
445       for(int type = 0; type < maxTypeId; type++){
446       //Out.print(type);
447         nextSet = new HashSet();
448         innerStatesIter = sdCurrentState.iterator();
449 
450         while(innerStatesIter.hasNext()){
451           currentInnerState = (FSMState)innerStatesIter.next();
452           Set tempSet = currentInnerState.nextSet(type);
453           if(null != tempSet) nextSet.addAll(tempSet);
454         }//while(innerStatesIter.hasNext())
455 
456         if(!nextSet.isEmpty()){
457           nextSet = lambdaClosure(nextSet);
458           dCurrentState = (DFSMState)newStates.get(nextSet);
459 
460           if(dCurrentState == null){
461 
462             //we have a new DFSMState
463             dCurrentState = new DFSMState(this);
464             sdStates.add(nextSet);
465             unmarkedDStates.add(nextSet);
466 
467             //check to see whether the new state is a final one
468             innerStatesIter = nextSet.iterator();
469             newRhs =false;
470 
471             while(innerStatesIter.hasNext()){
472               currentInnerState = (FSMState)innerStatesIter.next();
473               if(currentInnerState.isFinal()){
474                 rhs = currentInnerState.getRhs();
475                 rhsClashSet.add(rhs);
476                 dCurrentState.rhs = rhs;
477                 newRhs = true;
478               }
479             }
480 
481             if(rhsClashSet.size() > 1){
482               Err.println("Warning, rule clash: " +  rhsClashSet +
483                             "\nSelected last definition: " + dCurrentState.rhs);
484             }
485 
486             if(newRhs)dCurrentState.buildTokenDesc();
487             rhsClashSet.clear();
488             newStates.put(nextSet, dCurrentState);
489           }
490           ((DFSMState)newStates.get(sdCurrentState)).put(type,dCurrentState);
491         } // if(!nextSet.isEmpty())
492 
493       } // for(byte type = 0; type < 256; type++)
494 
495     } // while(!unmarkedDStates.isEmpty())
496 
497   } // eliminateVoidTransitions
498 
499   /** Returns a string representation of the non-deterministic FSM graph using
500    * GML (Graph modelling language).
501    */
502   public String getFSMgml(){
503     String res = "graph[ \ndirected 1\n";
504     ///String nodes = "", edges = "";
505     StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
506                  edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
507 
508     Iterator fsmStatesIter = fsmStates.iterator();
509     while (fsmStatesIter.hasNext()){
510       FSMState currentState = (FSMState)fsmStatesIter.next();
511       int stateIndex = currentState.getIndex();
512       /*nodes += "node[ id " + stateIndex +
513                " label \"" + stateIndex;
514         */
515         nodes.append("node[ id ");
516         nodes.append(stateIndex);
517         nodes.append(" label \"");
518         nodes.append(stateIndex);
519 
520              if(currentState.isFinal()){
521               ///nodes += ",F\\n" + currentState.getRhs();
522               nodes.append(",F\\n" + currentState.getRhs());
523              }
524              ///nodes +=  "\"  ]\n";
525              nodes.append("\"  ]\n");
526       ///edges += currentState.getEdgesGML();
527       edges.append(currentState.getEdgesGML());
528     }
529     res += nodes.toString() + edges.toString() + "]\n";
530     return res;
531   } // getFSMgml
532 
533   /** Returns a string representation of the deterministic FSM graph using
534    * GML.
535    */
536   public String getDFSMgml() {
537     String res = "graph[ \ndirected 1\n";
538     ///String nodes = "", edges = "";
539     StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
540                  edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);
541 
542     Iterator dfsmStatesIter = dfsmStates.iterator();
543     while (dfsmStatesIter.hasNext()) {
544       DFSMState currentState = (DFSMState)dfsmStatesIter.next();
545       int stateIndex = currentState.getIndex();
546 /*      nodes += "node[ id " + stateIndex +
547                " label \"" + stateIndex;
548 */
549         nodes.append("node[ id ");
550         nodes.append(stateIndex);
551         nodes.append(" label \"");
552         nodes.append(stateIndex);
553 
554              if(currentState.isFinal()){
555 ///              nodes += ",F\\n" + currentState.getRhs();
556               nodes.append(",F\\n" + currentState.getRhs());
557              }
558 ///             nodes +=  "\"  ]\n";
559              nodes.append("\"  ]\n");
560 ///      edges += currentState.getEdgesGML();
561         edges.append(currentState.getEdgesGML());
562     }
563     res += nodes.toString() + edges.toString() + "]\n";
564     return res;
565   } // getDFSMgml
566 
  //no doc required: javadoc will copy it from the interface
  /** Returns the feature map associated with this resource. */
  public FeatureMap getFeatures(){
    return features;
  } // getFeatures

  /** Sets the feature map for this resource. */
  public void setFeatures(FeatureMap features){
    this.features = features;
  } // setFeatures
577 
578   /**
579    * The method that does the actual tokenisation.
580    */
581   public void execute() throws ExecutionException {
582     interrupted = false;
583     AnnotationSet annotationSet;
584     //check the input
585     if(document == null) {
586       throw new ExecutionException(
587         "No document to tokenise!"
588       );
589     }
590 
591     if(annotationSetName == null ||
592        annotationSetName.equals("")) annotationSet = document.getAnnotations();
593     else annotationSet = document.getAnnotations(annotationSetName);
594 
595     fireStatusChanged(
596         "Tokenising " + document.getName() + "...");
597 
598     String content = document.getContent().toString();
599     int length = content.length();
600     char currentChar;
601 
602     DFSMState graphPosition = dInitialState;
603 
604     //the index of the first character of the token trying to be recognised
605     int tokenStart = 0;
606 
607     //the index of the last character of the last token recognised
608     int lastMatch = -1;
609 
610     DFSMState lastMatchingState = null;
611     DFSMState nextState;
612     String tokenString;
613     int charIdx = 0;
614     int oldCharIdx = 0;
615     FeatureMap newTokenFm;
616 
617     while(charIdx < length){
618       currentChar = content.charAt(charIdx);
619 //      Out.println(
620 //      currentChar + typesMnemonics[Character.getType(currentChar)+128]);
621       nextState = graphPosition.next(((Integer)typeIds.get(
622                   new Integer(Character.getType(currentChar)))).intValue());
623 
624       if( null != nextState ) {
625         graphPosition = nextState;
626         if(graphPosition.isFinal()) {
627           lastMatch = charIdx;
628           lastMatchingState = graphPosition;
629         }
630         charIdx ++;
631       } else {//we have a match!
632         newTokenFm = Factory.newFeatureMap();
633 
634         if (null == lastMatchingState) {
635           tokenString = content.substring(tokenStart, tokenStart +1);
636           newTokenFm.put("type","UNKNOWN");
637           newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
638           newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
639                          Integer.toString(tokenString.length()));
640 
641           try {
642             annotationSet.add(new Long(tokenStart),
643                               new Long(tokenStart + 1),
644                               "DEFAULT_TOKEN", newTokenFm);
645           } catch (InvalidOffsetException ioe) {
646             //This REALLY shouldn't happen!
647             ioe.printStackTrace(Err.getPrintWriter());
648           }
649           // Out.println("Default token: " + tokenStart +
650           //             "->" + tokenStart + " :" + tokenString + ";");
651           charIdx  = tokenStart + 1;
652         } else {
653           tokenString = content.substring(tokenStart, lastMatch + 1);
654           newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
655           newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
656                          Integer.toString(tokenString.length()));
657 
658           for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
659             newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
660                            lastMatchingState.getTokenDesc()[i][1]);
661           //Out.println(lastMatchingState.getTokenDesc()[i][0] + "=" +
662           //                       lastMatchingState.getTokenDesc()[i][1]);
663           }
664 
665 
666           try {
667             annotationSet.add(new Long(tokenStart),
668                             new Long(lastMatch + 1),
669                             lastMatchingState.getTokenDesc()[0][0], newTokenFm);
670           } catch(InvalidOffsetException ioe) {
671             //This REALLY shouldn't happen!
672             throw new GateRuntimeException(ioe.toString());
673           }
674 
675           // Out.println(lastMatchingState.getTokenDesc()[0][0] +
676           //              ": " + tokenStart + "->" + lastMatch +
677           //              " :" + tokenString + ";");
678           charIdx = lastMatch + 1;
679         }
680 
681         lastMatchingState = null;
682         graphPosition = dInitialState;
683         tokenStart = charIdx;
684       }
685 
686       if((charIdx - oldCharIdx > 256)){
687         fireProgressChanged((100 * charIdx )/ length );
688         oldCharIdx = charIdx;
689         if(isInterrupted()) throw new ExecutionInterruptedException();
690       }
691 
692     } // while(charIdx < length)
693 
694     if (null != lastMatchingState) {
695       tokenString = content.substring(tokenStart, lastMatch + 1);
696       newTokenFm = Factory.newFeatureMap();
697       newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
698       newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
699                      Integer.toString(tokenString.length()));
700 
701       for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
702         newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
703                        lastMatchingState.getTokenDesc()[i][1]);
704       }
705 
706 
707       try {
708         annotationSet.add(new Long(tokenStart),
709                           new Long(lastMatch + 1),
710                           lastMatchingState.getTokenDesc()[0][0], newTokenFm);
711       } catch(InvalidOffsetException ioe) {
712         //This REALLY shouldn't happen!
713         throw new GateRuntimeException(ioe.toString());
714       }
715 
716     }
717 
718     reset();
719     fireProcessFinished();
720     fireStatusChanged("Tokenisation complete!");
721   } // run
722 
723   /**
724    * Sets the value of the <code>rulesURL</code> property which holds an URL
725    * to the file containing the rules for this tokeniser.
726    * @param newRulesURL
727    */
728   public void setRulesURL(java.net.URL newRulesURL) {
729     rulesURL = newRulesURL;
730   }
731   /**
732    * Gets the value of the <code>rulesURL</code> property hich holds an
733    * URL to the file containing the rules for this tokeniser.
734    */
735   public java.net.URL getRulesURL() {
736     return rulesURL;
737   }
738   /**    */
739   public void setAnnotationSetName(String newAnnotationSetName) {
740     annotationSetName = newAnnotationSetName;
741   }
742   /**    */
743   public String getAnnotationSetName() {
744     return annotationSetName;
745   }
746   public void setRulesResourceName(String newRulesResourceName) {
747     rulesResourceName = newRulesResourceName;
748   }
749   public String getRulesResourceName() {
750     return rulesResourceName;
751   }
752   public void setEncoding(String newEncoding) {
753     encoding = newEncoding;
754   }
755   public String getEncoding() {
756     return encoding;
757   }
758 
759   /**    */
760   protected FeatureMap features  = null;
761 
762   /** the annotations et where the new annotations will be adde
763    */
764   protected String annotationSetName;
765 
766   /** The initial state of the non deterministic machin
767    */
768   protected FSMState initialState;
769 
770   /** A set containng all the states of the non deterministic machin
771    */
772   protected Set fsmStates = new HashSet();
773 
774   /** The initial state of the deterministic machin
775    */
776   protected DFSMState dInitialState;
777 
778   /** A set containng all the states of the deterministic machin
779    */
780   protected Set dfsmStates = new HashSet();
781 
782   /** The separator from LHS to RH
783    */
784   static String LHStoRHS = ">";
785 
786   /** A set of string representing tokens to be ignored (e.g. blanks
787    */
788   static Set ignoreTokens;
789 
790   /** maps from int (the static value on {@link java.lang.Character} to int
791    * the internal value used by the tokeniser. The ins values used by the
792    * tokeniser are consecutive values, starting from 0 and going as high as
793    * necessary.
794    * They map all the public static int members on{@link java.lang.Character}
795    */
796   public static Map typeIds;
797 
798   /** The maximum int value used internally as a type i
799    */
800   public static int maxTypeId;
801 
802   /** Maps the internal type ids to the type name
803    */
804   public static String[] typeMnemonics;
805 
806   /** Maps from type names to type internal id
807    */
808   public static Map stringTypeIds;
809 
810   /**
811    * This property holds an URL to the file containing the rules for this tokeniser
812    *
813    */
814 
815   /**    */
816   static protected String defaultResourceName =
817                             "creole/tokeniser/DefaultTokeniser.rules";
818 
819   private String rulesResourceName;
820   private java.net.URL rulesURL;
821   private String encoding;
822   private transient Vector progressListeners;
823   //kalina: added this as method to minimise too many init() calls
824   protected transient Map newStates = new HashMap();
825 
826 
  /** The static initialiser will inspect the class {@link java.lang.Character}
    * using reflection to find all the public static members and will map them
    * to ids starting from 0.
    * After that it will build all the static data: {@link #typeIds}, {@link
    * #maxTypeId}, {@link #typeMnemonics}, {@link #stringTypeIds}
    */
  static{
    Field[] characterClassFields;

    try{
      characterClassFields = Class.forName("java.lang.Character").getFields();
    }catch(ClassNotFoundException cnfe){
      throw new LuckyException("Could not find the java.lang.Character class!");
    }

    Collection staticFields = new LinkedList();
    // JDK 1.4 introduced directionality constants that have the same values as
    //character types; we need to skip those as well
    for(int i = 0; i< characterClassFields.length; i++)
      if(Modifier.isStatic(characterClassFields[i].getModifiers()) &&
         characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1)
        staticFields.add(characterClassFields[i]);

    typeIds = new HashMap();
    //NOTE(review): maxTypeId is derived from ALL retained static fields,
    //while ids below are only assigned to byte-typed ones, so maxTypeId may
    //exceed the largest id actually assigned - confirm this is intended
    maxTypeId = staticFields.size() -1;
    typeMnemonics = new String[maxTypeId + 1];
    stringTypeIds = new HashMap();

    Iterator staticFieldsIter = staticFields.iterator();
    Field currentField;
    int currentId = 0;
    String fieldName;

    try {
      while(staticFieldsIter.hasNext()){
        currentField = (Field)staticFieldsIter.next();
        //only the byte-typed constants are Unicode category values
        if(currentField.getType().toString().equals("byte")){
          fieldName = currentField.getName();
          typeIds.put(new Integer(currentField.getInt(null)),
                                    new Integer(currentId));
          typeMnemonics[currentId] = fieldName;
          stringTypeIds.put(fieldName, new Integer(currentId));
          currentId++;
        }
      }
    } catch(Exception e) {
      throw new LuckyException(e.toString());
    }

    //whitespace tokens that the rule parser skips over
    ignoreTokens = new HashSet();
    ignoreTokens.add(" ");
    ignoreTokens.add("\t");
    ignoreTokens.add("\f");
  }
881 
882 } // class DefaultTokeniser