package gate.creole.tokeniser;

import java.io.*;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.util.*;

import gate.*;
import gate.creole.*;
import gate.util.*;

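/**
 * A rule-based Unicode tokeniser. The tokenisation rules are read from an
 * external file (the <code>rulesURL</code> parameter); they are compiled into
 * a non-deterministic finite state machine over Unicode character categories,
 * which is then converted into a deterministic one used by {@link #execute()}
 * to annotate the tokens found in the document content.
 *
 * Each rule maps a pattern of character categories onto an annotation type
 * and a set of features, e.g. (illustrative example only, see the default
 * rules file for the exact syntax):
 * <pre>
 * "UPPERCASE_LETTER" "LOWERCASE_LETTER"* &gt; Token;orth=upperInitial;kind=word;
 * </pre>
 * The category names are the static constants defined by
 * {@link java.lang.Character} (UPPERCASE_LETTER, DECIMAL_DIGIT_NUMBER, etc.).
 */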
public class SimpleTokeniser extends AbstractLanguageAnalyser{

  public static final String
    SIMP_TOK_DOCUMENT_PARAMETER_NAME = "document";

  public static final String
    SIMP_TOK_ANNOT_SET_PARAMETER_NAME = "annotationSetName";

  public static final String
    SIMP_TOK_RULES_URL_PARAMETER_NAME = "rulesURL";

  public static final String
    SIMP_TOK_ENCODING_PARAMETER_NAME = "encoding";

  private static final boolean DEBUG = false;

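  /** Default (empty) constructor; all initialisation is done in {@link #init()}. */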
  public SimpleTokeniser(){
  }

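  /**
   * Initialises this tokeniser: reads the rules from {@link #rulesURL} using
   * the configured encoding, builds the non-deterministic FSM and converts it
   * into its deterministic equivalent.
   *
   * @return this resource
   * @throws ResourceInstantiationException if no rules URL was supplied or
   *   the rules cannot be read or parsed
   */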
  public Resource init() throws ResourceInstantiationException{
    Reader rulesReader;
    try{
      if(rulesURL != null){
        rulesReader = new InputStreamReader(rulesURL.openStream(), encoding);
      }else{
        throw new ResourceInstantiationException(
          "No URL provided for the rules!");
      }
      initialState = new FSMState(this);
      BufferedReader bRulesReader = new BufferedReader(rulesReader);
      String line = bRulesReader.readLine();
      StringBuffer toParse = new StringBuffer(Gate.STRINGBUFFER_SIZE);

      while(line != null){
        if(line.endsWith("\\")){
          // lines ending in "\" are continued on the next line
          toParse.append(line.substring(0, line.length() - 1));
        }else{
          // a complete rule has been read; parse it
          toParse.append(line);
          parseRule(toParse.toString());
          toParse.delete(0, toParse.length());
        }
        line = bRulesReader.readLine();
      }
      // convert the non-deterministic FSM into a deterministic one
      eliminateVoidTransitions();
    }catch(java.io.IOException ioe){
      throw new ResourceInstantiationException(ioe);
    }catch(TokeniserException te){
      throw new ResourceInstantiationException(te);
    }
    return this;
  }

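  /** Clears the document reference so this tokeniser can be reused on another document. */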
  public void reset(){
    document = null;
  }

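  /**
   * Parses one rule of the form LHS &gt; RHS. The LHS is compiled into new
   * states and transitions of the non-deterministic FSM; the RHS (the token
   * description) is stored on the final state of the rule.
   *
   * @param line a complete rule (continuation lines already joined)
   */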
  void parseRule(String line)throws TokeniserException{
    // lines starting with '#' or '//' are comments
    if(line.startsWith("#")) return;
    if(line.startsWith("//")) return;

    StringTokenizer st = new StringTokenizer(line, "()+*|\" \t\f>", true);
    FSMState newState = new FSMState(this);

    initialState.put(null, newState);
    FSMState finalState = parseLHS(newState, st, LHStoRHS);
    String rhs = "";

    if(st.hasMoreTokens()) rhs = st.nextToken("\f");

    if(rhs.length() > 0) finalState.setRhs(rhs);
  }

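  /**
   * Parses the left hand side of a rule, adding the necessary states and
   * transitions to the non-deterministic FSM. Handles sequences, quoted and
   * unquoted Unicode category names, grouping with parentheses, alternation
   * ("|") and the "+" and "*" operators.
   *
   * @param startState the state to start building from
   * @param st the tokenizer positioned on the rule text
   * @param until the token that ends this (sub)expression, e.g. ")" or "&gt;"
   * @return the final state reached after consuming the (sub)expression
   */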
  FSMState parseLHS(FSMState startState, StringTokenizer st, String until)
       throws TokeniserException{

    FSMState currentState = startState;
    boolean orFound = false;
    List orList = new LinkedList();
    String token;
    token = skipIgnoreTokens(st);

    if(null == token) return currentState;

    FSMState newState;
    Integer typeId;
    UnicodeType uType;

    bigwhile: while(!token.equals(until)){
      if(token.equals("(")){
        // a parenthesised sub-expression
        newState = parseLHS(currentState, st, ")");
      }else if(token.equals("\"")){
        // a quoted Unicode category name
        String sType = parseQuotedString(st, "\"");
        newState = new FSMState(this);
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType, newState);
      }else{
        // an unquoted Unicode category name
        String sType = token;
        newState = new FSMState(this);
        typeId = (Integer)stringTypeIds.get(sType);

        if(null == typeId)
          throw new InvalidRuleException("Invalid type: \"" + sType + "\"");
        else uType = new UnicodeType(typeId.intValue());

        currentState.put(uType, newState);
      }
      token = skipIgnoreTokens(st);
      if(null == token) throw
        new InvalidRuleException("Tokeniser rule ended too soon!");

      if(token.equals("|")){
        // collect the alternatives; they are joined when the disjunction ends
        orFound = true;
        orList.add(newState);
        token = skipIgnoreTokens(st);
        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");

        continue bigwhile;
      }else if(orFound){
        // the disjunction has ended: link all alternatives to a common new
        // state via lambda (null) transitions
        orFound = false;
        orList.add(newState);
        newState = new FSMState(this);
        Iterator orListIter = orList.iterator();

        while(orListIter.hasNext())
          ((FSMState)orListIter.next()).put(null, newState);
        orList.clear();
      }

      if(token.equals("+")){
        // one or more: a lambda transition back allows repetition
        newState.put(null, currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null, newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      }else if(token.equals("*")){
        // zero or more: lambda transitions both skip forward and loop back
        currentState.put(null, newState);
        newState.put(null, currentState);
        currentState = newState;
        newState = new FSMState(this);
        currentState.put(null, newState);
        token = skipIgnoreTokens(st);

        if(null == token) throw
          new InvalidRuleException("Tokeniser rule ended too soon!");
      }
      currentState = newState;
    }
    return currentState;
  }

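  /**
   * Reads tokens from the string tokenizer until the closing quote is found
   * and returns the concatenated content, or <code>null</code> if there are
   * no more tokens.
   *
   * @param st the tokenizer to read from
   * @param until the token that closes the quoted string (i.e. "\"")
   */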
  String parseQuotedString(StringTokenizer st, String until)
       throws TokeniserException{

    String token;

    if(st.hasMoreElements()) token = st.nextToken();
    else return null;

    StringBuffer type = new StringBuffer(Gate.STRINGBUFFER_SIZE);

    while(!token.equals(until)){
      type.append(token);
      if(st.hasMoreElements()) token = st.nextToken();
      else throw new InvalidRuleException("Tokeniser rule ended too soon!");
    }
    return type.toString();
  }

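  /**
   * Returns the next token from the tokenizer that is not in the
   * {@link #ignoreTokens} set (i.e. skips whitespace), or <code>null</code>
   * if there are no more tokens.
   */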
  protected static String skipIgnoreTokens(StringTokenizer st){
    Iterator ignorables;
    boolean ignorableFound = false;
    String currentToken;

    while(true){
      if(st.hasMoreTokens()){
        currentToken = st.nextToken();
        ignorables = ignoreTokens.iterator();
        ignorableFound = false;

        while(!ignorableFound && ignorables.hasNext()){
          if(currentToken.equals((String)ignorables.next()))
            ignorableFound = true;
        }

        if(!ignorableFound) return currentToken;
      }else return null;
    }
  }

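  /**
   * Computes the lambda closure (a.k.a. epsilon closure) of a set of FSM
   * states: the set of all states reachable from the given states through
   * null (lambda) transitions only.
   *
   * @param s the set of {@link FSMState} to start from
   * @return the lambda closure of <code>s</code>
   */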
  private AbstractSet lambdaClosure(Set s){
    // the list of states still to be processed
    LinkedList list = new LinkedList(s);

    AbstractSet lambdaClosure = new HashSet(s);

    FSMState top;
    FSMState currentState;
    Set nextStates;
    Iterator statesIter;

    while(!list.isEmpty()){
      top = (FSMState)list.removeFirst();
      nextStates = top.nextSet(null);

      if(null != nextStates){
        statesIter = nextStates.iterator();

        while(statesIter.hasNext()){
          currentState = (FSMState)statesIter.next();
          if(!lambdaClosure.contains(currentState)){
            lambdaClosure.add(currentState);
            list.addFirst(currentState);
          }
        }
      }
    }
    return lambdaClosure;
  }

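  /**
   * Converts the non-deterministic FSM built from the rules into a
   * deterministic one using the classical subset construction: each
   * deterministic state corresponds to a lambda-closed set of
   * non-deterministic states. If more than one rule accepts in the same
   * deterministic state, a warning is printed and the last right hand side
   * encountered is kept.
   *
   * @throws TokeniserException if the token descriptions cannot be built
   */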
  void eliminateVoidTransitions() throws TokeniserException{

    newStates.clear();
    Set sdStates = new HashSet();
    LinkedList unmarkedDStates = new LinkedList();
    DFSMState dCurrentState = new DFSMState(this);
    Set sdCurrentState = new HashSet();

    // the initial deterministic state is the lambda closure of the
    // non-deterministic initial state
    sdCurrentState.add(initialState);
    sdCurrentState = lambdaClosure(sdCurrentState);
    newStates.put(sdCurrentState, dCurrentState);
    sdStates.add(sdCurrentState);

    Iterator innerStatesIter = sdCurrentState.iterator();
    String rhs;
    FSMState currentInnerState;
    Set rhsClashSet = new HashSet();
    boolean newRhs = false;

    while(innerStatesIter.hasNext()){
      currentInnerState = (FSMState)innerStatesIter.next();
      if(currentInnerState.isFinal()){
        rhs = currentInnerState.getRhs();
        rhsClashSet.add(rhs);
        dCurrentState.rhs = rhs;
        newRhs = true;
      }
    }

    if(rhsClashSet.size() > 1){
      Err.println("Warning, rule clash: " + rhsClashSet +
                  "\nSelected last definition: " + dCurrentState.rhs);
    }

    if(newRhs) dCurrentState.buildTokenDesc();
    rhsClashSet.clear();
    unmarkedDStates.addFirst(sdCurrentState);
    dInitialState = dCurrentState;
    Set nextSet;

    while(!unmarkedDStates.isEmpty()){
      sdCurrentState = (Set)unmarkedDStates.removeFirst();
      for(int type = 0; type < maxTypeId; type++){
        // gather all states reachable on this character type from any of the
        // non-deterministic states in the current deterministic state
        nextSet = new HashSet();
        innerStatesIter = sdCurrentState.iterator();

        while(innerStatesIter.hasNext()){
          currentInnerState = (FSMState)innerStatesIter.next();
          Set tempSet = currentInnerState.nextSet(type);
          if(null != tempSet) nextSet.addAll(tempSet);
        }

        if(!nextSet.isEmpty()){
          nextSet = lambdaClosure(nextSet);
          dCurrentState = (DFSMState)newStates.get(nextSet);

          if(dCurrentState == null){
            // a new deterministic state has been discovered
            dCurrentState = new DFSMState(this);
            sdStates.add(nextSet);
            unmarkedDStates.add(nextSet);

            innerStatesIter = nextSet.iterator();
            newRhs = false;

            while(innerStatesIter.hasNext()){
              currentInnerState = (FSMState)innerStatesIter.next();
              if(currentInnerState.isFinal()){
                rhs = currentInnerState.getRhs();
                rhsClashSet.add(rhs);
                dCurrentState.rhs = rhs;
                newRhs = true;
              }
            }

            if(rhsClashSet.size() > 1){
              Err.println("Warning, rule clash: " + rhsClashSet +
                          "\nSelected last definition: " + dCurrentState.rhs);
            }

            if(newRhs) dCurrentState.buildTokenDesc();
            rhsClashSet.clear();
            newStates.put(nextSet, dCurrentState);
          }
          ((DFSMState)newStates.get(sdCurrentState)).put(type, dCurrentState);
        }
      }
    }
  }

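  /**
   * Returns a GML (Graph Modelling Language) representation of the
   * non-deterministic FSM, useful for debugging the rule compilation.
   */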
  public String getFSMgml(){
    String res = "graph[ \ndirected 1\n";
    StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
                 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);

    Iterator fsmStatesIter = fsmStates.iterator();
    while(fsmStatesIter.hasNext()){
      FSMState currentState = (FSMState)fsmStatesIter.next();
      int stateIndex = currentState.getIndex();

      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        nodes.append(",F\\n" + currentState.getRhs());
      }
      nodes.append("\" ]\n");
      edges.append(currentState.getEdgesGML());
    }
    res += nodes.toString() + edges.toString() + "]\n";
    return res;
  }

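  /**
   * Returns a GML representation of the deterministic FSM, useful for
   * debugging the subset construction.
   */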
  public String getDFSMgml(){
    String res = "graph[ \ndirected 1\n";
    StringBuffer nodes = new StringBuffer(Gate.STRINGBUFFER_SIZE),
                 edges = new StringBuffer(Gate.STRINGBUFFER_SIZE);

    Iterator dfsmStatesIter = dfsmStates.iterator();
    while(dfsmStatesIter.hasNext()){
      DFSMState currentState = (DFSMState)dfsmStatesIter.next();
      int stateIndex = currentState.getIndex();

      nodes.append("node[ id ");
      nodes.append(stateIndex);
      nodes.append(" label \"");
      nodes.append(stateIndex);

      if(currentState.isFinal()){
        nodes.append(",F\\n" + currentState.getRhs());
      }
      nodes.append("\" ]\n");
      edges.append(currentState.getEdgesGML());
    }
    res += nodes.toString() + edges.toString() + "]\n";
    return res;
  }

  /** Returns the features of this resource. */
  public FeatureMap getFeatures(){
    return features;
  }

  /** Sets the features of this resource. */
  public void setFeatures(FeatureMap features){
    this.features = features;
  }

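  /**
   * Tokenises the document set as the <code>document</code> parameter. The
   * content is traversed character by character, driving the deterministic
   * FSM by the Unicode category of each character; the longest match found is
   * annotated. Characters matching no rule produce "DEFAULT_TOKEN"
   * annotations with the feature type=UNKNOWN. Annotations are written to the
   * annotation set named by <code>annotationSetName</code>, or to the default
   * set if no name is given.
   *
   * @throws ExecutionException if no document has been set
   */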
  public void execute() throws ExecutionException{
    interrupted = false;
    AnnotationSet annotationSet;
    if(document == null){
      throw new ExecutionException("No document to tokenise!");
    }

    if(annotationSetName == null ||
       annotationSetName.equals("")) annotationSet = document.getAnnotations();
    else annotationSet = document.getAnnotations(annotationSetName);

    fireStatusChanged("Tokenising " + document.getName() + "...");

    String content = document.getContent().toString();
    int length = content.length();
    char currentChar;

    DFSMState graphPosition = dInitialState;

    // the start of the token currently being matched
    int tokenStart = 0;
    // the position of the last character of the longest match so far
    int lastMatch = -1;

    DFSMState lastMatchingState = null;
    DFSMState nextState;
    String tokenString;
    int charIdx = 0;
    int oldCharIdx = 0;
    FeatureMap newTokenFm;

    while(charIdx < length){
      currentChar = content.charAt(charIdx);
      // advance the DFSM on the Unicode category of the current character
      nextState = graphPosition.next(((Integer)typeIds.get(
                  new Integer(Character.getType(currentChar)))).intValue());

      if(null != nextState){
        graphPosition = nextState;
        if(graphPosition.isFinal()){
          lastMatch = charIdx;
          lastMatchingState = graphPosition;
        }
        charIdx++;
      }else{
        // the automaton is stuck: emit the longest match found so far
        newTokenFm = Factory.newFeatureMap();

        if(null == lastMatchingState){
          // no rule matched: annotate a single character as an unknown token
          tokenString = content.substring(tokenStart, tokenStart + 1);
          newTokenFm.put("type", "UNKNOWN");
          newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
          newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                         Integer.toString(tokenString.length()));

          try{
            annotationSet.add(new Long(tokenStart),
                              new Long(tokenStart + 1),
                              "DEFAULT_TOKEN", newTokenFm);
          }catch(InvalidOffsetException ioe){
            // this should never happen
            ioe.printStackTrace(Err.getPrintWriter());
          }
          charIdx = tokenStart + 1;
        }else{
          tokenString = content.substring(tokenStart, lastMatch + 1);
          newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
          newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                         Integer.toString(tokenString.length()));

          for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
            newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                           lastMatchingState.getTokenDesc()[i][1]);
          }

          try{
            annotationSet.add(new Long(tokenStart),
                              new Long(lastMatch + 1),
                              lastMatchingState.getTokenDesc()[0][0], newTokenFm);
          }catch(InvalidOffsetException ioe){
            throw new GateRuntimeException(ioe.toString());
          }

          // resume matching right after the token just annotated
          charIdx = lastMatch + 1;
        }

        lastMatchingState = null;
        graphPosition = dInitialState;
        tokenStart = charIdx;
      }

      if((charIdx - oldCharIdx > 256)){
        fireProgressChanged((100 * charIdx) / length);
        oldCharIdx = charIdx;
        if(isInterrupted()) throw new ExecutionInterruptedException();
      }

    }
    // annotate the last token if the document ended while matching
    if(null != lastMatchingState){
      tokenString = content.substring(tokenStart, lastMatch + 1);
      newTokenFm = Factory.newFeatureMap();
      newTokenFm.put(TOKEN_STRING_FEATURE_NAME, tokenString);
      newTokenFm.put(TOKEN_LENGTH_FEATURE_NAME,
                     Integer.toString(tokenString.length()));

      for(int i = 1; i < lastMatchingState.getTokenDesc().length; i++){
        newTokenFm.put(lastMatchingState.getTokenDesc()[i][0],
                       lastMatchingState.getTokenDesc()[i][1]);
      }

      try{
        annotationSet.add(new Long(tokenStart),
                          new Long(lastMatch + 1),
                          lastMatchingState.getTokenDesc()[0][0], newTokenFm);
      }catch(InvalidOffsetException ioe){
        throw new GateRuntimeException(ioe.toString());
      }
    }

    reset();
    fireProcessFinished();
    fireStatusChanged("Tokenisation complete!");
  }

  /** Sets the URL of the rules file. */
  public void setRulesURL(java.net.URL newRulesURL) {
    rulesURL = newRulesURL;
  }

  /** Returns the URL of the rules file. */
  public java.net.URL getRulesURL() {
    return rulesURL;
  }

  /** Sets the name of the annotation set where tokens will be created. */
  public void setAnnotationSetName(String newAnnotationSetName) {
    annotationSetName = newAnnotationSetName;
  }

  /** Returns the name of the annotation set where tokens will be created. */
  public String getAnnotationSetName() {
    return annotationSetName;
  }

  public void setRulesResourceName(String newRulesResourceName) {
    rulesResourceName = newRulesResourceName;
  }

  public String getRulesResourceName() {
    return rulesResourceName;
  }

  /** Sets the character encoding used to read the rules file. */
  public void setEncoding(String newEncoding) {
    encoding = newEncoding;
  }

  public String getEncoding() {
    return encoding;
  }

  /** The features of this resource. */
  protected FeatureMap features = null;

  /** The name of the annotation set used for the generated tokens. */
  protected String annotationSetName;

  /** The initial state of the non-deterministic machine. */
  protected FSMState initialState;

  /** The set of states of the non-deterministic machine. */
  protected Set fsmStates = new HashSet();

  /** The initial state of the deterministic machine. */
  protected DFSMState dInitialState;

  /** The set of states of the deterministic machine. */
  protected Set dfsmStates = new HashSet();

  /** The separator between the LHS and the RHS of a rule. */
  static String LHStoRHS = ">";

  /** The tokens ignored when parsing the rules (whitespace). */
  static Set ignoreTokens;

  /** Maps from the Unicode category values defined on
   *  {@link java.lang.Character} to the internal type ids. */
  public static Map typeIds;

  /** An upper bound on the internal type ids. */
  public static int maxTypeId;

  /** Maps from internal type ids to category names. */
  public static String[] typeMnemonics;

  /** Maps from category names to internal type ids. */
  public static Map stringTypeIds;

  /** The default rules file, shipped as a GATE resource. */
  static protected String defaultResourceName =
    "creole/tokeniser/DefaultTokeniser.rules";

  private String rulesResourceName;
  private java.net.URL rulesURL;
  private String encoding;
  private transient Vector progressListeners;

  /** Maps sets of non-deterministic states to the deterministic states built
   *  from them during {@link #eliminateVoidTransitions()}. */
  protected transient Map newStates = new HashMap();

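  /**
   * Static initialisation: uses reflection to collect the static Unicode
   * category constants (the <code>byte</code> fields of
   * {@link java.lang.Character}, excluding the DIRECTIONALITY_* ones) and
   * builds the mappings between their values, their names and the internal
   * type ids used for the FSM transitions.
   */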
  static{
    Field[] characterClassFields;

    try{
      characterClassFields = Class.forName("java.lang.Character").getFields();
    }catch(ClassNotFoundException cnfe){
      throw new LuckyException("Could not find the java.lang.Character class!");
    }

    Collection staticFields = new LinkedList();
    for(int i = 0; i < characterClassFields.length; i++)
      if(Modifier.isStatic(characterClassFields[i].getModifiers()) &&
         characterClassFields[i].getName().indexOf("DIRECTIONALITY") == -1)
        staticFields.add(characterClassFields[i]);

    typeIds = new HashMap();
    maxTypeId = staticFields.size() - 1;
    typeMnemonics = new String[maxTypeId + 1];
    stringTypeIds = new HashMap();

    Iterator staticFieldsIter = staticFields.iterator();
    Field currentField;
    int currentId = 0;
    String fieldName;

    try{
      while(staticFieldsIter.hasNext()){
        currentField = (Field)staticFieldsIter.next();
        if(currentField.getType().toString().equals("byte")){
          fieldName = currentField.getName();
          typeIds.put(new Integer(currentField.getInt(null)),
                      new Integer(currentId));
          typeMnemonics[currentId] = fieldName;
          stringTypeIds.put(fieldName, new Integer(currentId));
          currentId++;
        }
      }
    }catch(Exception e){
      throw new LuckyException(e.toString());
    }

    ignoreTokens = new HashSet();
    ignoreTokens.add(" ");
    ignoreTokens.add("\t");
    ignoreTokens.add("\f");
  }

}