1
18 package gate.creole.gazetteer;
19
20 import java.util.*;
21
22 import gate.*;
23 import gate.creole.*;
24 import gate.util.*;
25
26
49 public class DefaultGazetteer extends AbstractGazetteer {
50
51
53 private static final boolean DEBUG = false;
54
55 public static final String
56 DEF_GAZ_DOCUMENT_PARAMETER_NAME = "document";
57
58 public static final String
59 DEF_GAZ_ANNOT_SET_PARAMETER_NAME = "annotationSetName";
60
61 public static final String
62 DEF_GAZ_LISTS_URL_PARAMETER_NAME = "listsURL";
63
64 public static final String
65 DEF_GAZ_ENCODING_PARAMETER_NAME = "encoding";
66
67 public static final String
68 DEF_GAZ_CASE_SENSITIVE_PARAMETER_NAME = "caseSensitive";
69
70
71
72 protected Map listsByNode;
73
74
77 public DefaultGazetteer(){
78 }
79
80
83 public Resource init()throws ResourceInstantiationException{
84 fsmStates = new HashSet();
85 initialState = new FSMState(this);
86 if(listsURL == null){
87 throw new ResourceInstantiationException (
88 "No URL provided for gazetteer creation!");
89 }
90 definition = new LinearDefinition();
91 definition.setURL(listsURL);
92 definition.load();
93 int linesCnt = definition.size();
94 listsByNode = definition.loadLists();
95 Iterator inodes = definition.iterator();
96
97 int nodeIdx = 0;
98 LinearNode node;
99 while (inodes.hasNext()) {
100 node = (LinearNode) inodes.next();
101 fireStatusChanged("Reading " + node.toString());
102 fireProgressChanged(++nodeIdx * 100 / linesCnt);
103 readList(node,true);
104 } fireProcessFinished();
106 return this;
107 }
108
109
110
118 protected void readList(LinearNode node, boolean add)
119 throws ResourceInstantiationException{
120 String listName, majorType, minorType, languages;
121 if ( null == node ) {
122 throw new ResourceInstantiationException(" LinearNode node is null ");
123 }
124
125 listName = node.getList();
126 majorType = node.getMajorType();
127 minorType = node.getMinorType();
128 languages = node.getLanguage();
129 GazetteerList gazList = (GazetteerList)listsByNode.get(node);
130 if (null == gazList) {
131 throw new ResourceInstantiationException("gazetteer list not found by node");
132 }
133
134 Iterator iline = gazList.iterator();
135
136 Lookup lookup = new Lookup(listName,majorType, minorType, languages);
137 lookup.list = node.getList();
138 if ( null != mappingDefinition){
139 MappingNode mnode = mappingDefinition.getNodeByList(lookup.list);
140 if (null!=mnode){
141 lookup.oClass = mnode.getClassID();
142 lookup.ontology = mnode.getOntologyID();
143 }
144 }
146 String line;
147 while(iline.hasNext()){
148 line = iline.next().toString();
149 if(add)addLookup(line, lookup);
150 else removeLookup(line, lookup);
151 }
152 }
154
160
189 public void addLookup(String text, Lookup lookup) {
191 char currentChar;
192 FSMState currentState = initialState;
193 FSMState nextState;
194 Lookup oldLookup;
195 boolean isSpace;
196
197 for(int i = 0; i< text.length(); i++) {
198 currentChar = text.charAt(i);
199 isSpace = Character.isWhitespace(currentChar);
200 if(isSpace) currentChar = ' ';
201 else currentChar = (caseSensitive.booleanValue()) ?
202 currentChar :
203 Character.toUpperCase(currentChar) ;
204 nextState = currentState.next(currentChar);
205 if(nextState == null){
206 nextState = new FSMState(this);
207 currentState.put(currentChar, nextState);
208 if(isSpace) nextState.put(' ',nextState);
209 }
210 currentState = nextState;
211 }
213 currentState.addLookup(lookup);
214
216 }
219
224
244 public void removeLookup(String text, Lookup lookup) {
246 char currentChar;
247 FSMState currentState = initialState;
248 FSMState nextState;
249 Lookup oldLookup;
250
251 for(int i = 0; i< text.length(); i++) {
252 currentChar = text.charAt(i);
253 if(Character.isWhitespace(currentChar)) currentChar = ' ';
254 nextState = currentState.next(currentChar);
255 if(nextState == null) return; currentState = nextState;
257 } currentState.removeLookup(lookup);
259 }
262
265 public String getFSMgml() {
266 String res = "graph[ \ndirected 1\n";
267 StringBuffer nodes = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE),
269 edges = new StringBuffer(gate.Gate.STRINGBUFFER_SIZE);
270 Iterator fsmStatesIter = fsmStates.iterator();
271 while (fsmStatesIter.hasNext()){
272 FSMState currentState = (FSMState)fsmStatesIter.next();
273 int stateIndex = currentState.getIndex();
274
277 nodes.append("node[ id ");
278 nodes.append(stateIndex);
279 nodes.append(" label \"");
280 nodes.append(stateIndex);
281
282 if(currentState.isFinal()){
283 nodes.append(",F\\n");
285 nodes.append(currentState.getLookupSet());
286 }
287 nodes.append("\" ]\n");
289 edges.append(currentState.getEdgesGML());
291 }
292 res += nodes.toString() + edges.toString() + "]\n";
293 return res;
294 }
296
297
303 public static boolean isWordInternal(char ch){
304 return Character.isLetter(ch) ||
305 Character.getType(ch) == Character.COMBINING_SPACING_MARK ||
306 Character.getType(ch) == Character.NON_SPACING_MARK;
307 }
308
309
313 public void execute() throws ExecutionException{
314 interrupted = false;
315 AnnotationSet annotationSet;
316 if(document == null) {
318 throw new ExecutionException(
319 "No document to process!"
320 );
321 }
322
323 if(annotationSetName == null ||
324 annotationSetName.equals("")) annotationSet = document.getAnnotations();
325 else annotationSet = document.getAnnotations(annotationSetName);
326
327 fireStatusChanged("Doing lookup in " +
328 document.getName() + "...");
329 String content = document.getContent().toString();
330 int length = content.length();
331
335 char currentChar;
337 FSMState currentState = initialState;
339 FSMState nextState;
340 FSMState lastMatchingState = null;
341 int matchedRegionEnd = 0;
342 int matchedRegionStart = 0;
343 int charIdx = 0;
344 int oldCharIdx = 0;
345 FeatureMap fm;
346 Lookup currentLookup;
347
348
358 while(charIdx < length) {
360 currentChar = content.charAt(charIdx);
361 if(Character.isWhitespace(currentChar)) currentChar = ' ';
362 else currentChar = caseSensitive.booleanValue() ?
363 currentChar :
364 Character.toUpperCase(currentChar);
365 nextState = currentState.next(currentChar);
367 if(nextState == null) {
368
370 if(lastMatchingState != null){
372 Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
374
375 while(lookupIter.hasNext()) {
376 currentLookup = (Lookup)lookupIter.next();
377 fm = Factory.newFeatureMap();
378 fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
379 if (null!= currentLookup.oClass && null!=currentLookup.ontology){
380 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
381 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
382 }
383 if(null != currentLookup.minorType) {
384 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
385 if(null != currentLookup.languages)
386 fm.put("language", currentLookup.languages);
387 }
388 try {
389 annotationSet.add(new Long(matchedRegionStart),
390 new Long(matchedRegionEnd + 1),
391 LOOKUP_ANNOTATION_TYPE,
392 fm);
393 } catch(InvalidOffsetException ioe) {
394 throw new LuckyException(ioe.toString());
395 }
396 } lastMatchingState = null;
398 }
399
400 charIdx = matchedRegionStart + 1;
402 matchedRegionStart = charIdx;
403 currentState = initialState;
404
405 } else{ currentState = nextState;
407 if(currentState.isFinal() &&
409 (
410 (!wholeWordsOnly.booleanValue())
411 ||
412 ((matchedRegionStart == 0 ||
413 !isWordInternal(content.charAt(matchedRegionStart - 1)))
414 &&
415 (charIdx + 1 >= content.length() ||
416 !isWordInternal(content.charAt(charIdx + 1)))
417 )
418 )
419 ){
420 matchedRegionEnd = charIdx;
421 lastMatchingState = currentState;
422 }
423 charIdx ++;
424 if(charIdx == content.length()){
425 if(lastMatchingState != null){
428 Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
430
431 while(lookupIter.hasNext()) {
432 currentLookup = (Lookup)lookupIter.next();
433 fm = Factory.newFeatureMap();
434 fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
435 if (null!= currentLookup.oClass && null!=currentLookup.ontology){
436 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
437 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
438 }
439 if(null != currentLookup.minorType) {
440 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
441 if(null != currentLookup.languages)
442 fm.put("language", currentLookup.languages);
443 }
444 try {
445 annotationSet.add(new Long(matchedRegionStart),
446 new Long(matchedRegionEnd + 1),
447 LOOKUP_ANNOTATION_TYPE,
448 fm);
449 } catch(InvalidOffsetException ioe) {
450 throw new LuckyException(ioe.toString());
451 }
452 } lastMatchingState = null;
454 }
455
456 charIdx = matchedRegionStart + 1;
458 matchedRegionStart = charIdx;
459 currentState = initialState;
460 }
461 }
462 if(charIdx - oldCharIdx > 256) {
463 fireProgressChanged((100 * charIdx )/ length );
464 oldCharIdx = charIdx;
465 if(isInterrupted()) throw new ExecutionInterruptedException(
466 "The execution of the " + getName() +
467 " gazetteer has been abruptly interrupted!");
468 }
469 }
471 if(lastMatchingState != null) {
472 Iterator lookupIter = lastMatchingState.getLookupSet().iterator();
473 while(lookupIter.hasNext()) {
474 currentLookup = (Lookup)lookupIter.next();
475 fm = Factory.newFeatureMap();
476 fm.put(LOOKUP_MAJOR_TYPE_FEATURE_NAME, currentLookup.majorType);
477 if (null!= currentLookup.oClass && null!=currentLookup.ontology){
478 fm.put(LOOKUP_CLASS_FEATURE_NAME,currentLookup.oClass);
479 fm.put(LOOKUP_ONTOLOGY_FEATURE_NAME,currentLookup.ontology);
480 }
481
482 if(null != currentLookup.minorType)
483 fm.put(LOOKUP_MINOR_TYPE_FEATURE_NAME, currentLookup.minorType);
484 try{
485 annotationSet.add(new Long(matchedRegionStart),
486 new Long(matchedRegionEnd + 1),
487 LOOKUP_ANNOTATION_TYPE,
488 fm);
489 } catch(InvalidOffsetException ioe) {
490 throw new GateRuntimeException(ioe.toString());
491 }
492 } }
494 fireProcessFinished();
495 fireStatusChanged("Lookup complete!");
496 }
498
499
501 protected FSMState initialState;
502
503
505 protected Set fsmStates;
506
507
510 public Set lookup(String singleItem) {
511 char currentChar;
512 Set set = new HashSet();
513 FSMState currentState = initialState;
514 FSMState nextState;
515
516 for(int i = 0; i< singleItem.length(); i++) {
517 currentChar = singleItem.charAt(i);
518 if(Character.isWhitespace(currentChar)) currentChar = ' ';
519 nextState = currentState.next(currentChar);
520 if(nextState == null) {
521 return set;
522 }
523 currentState = nextState;
524 } set = currentState.getLookupSet();
526 return set;
527 }
528
529 public boolean remove(String singleItem) {
530 char currentChar;
531 FSMState currentState = initialState;
532 FSMState nextState;
533 Lookup oldLookup;
534
535 for(int i = 0; i< singleItem.length(); i++) {
536 currentChar = singleItem.charAt(i);
537 if(Character.isWhitespace(currentChar)) currentChar = ' ';
538 nextState = currentState.next(currentChar);
539 if(nextState == null) {
540 return false;
541 } currentState = nextState;
543 } currentState.lookupSet = new HashSet();
545 return true;
546 }
547
548 public boolean add(String singleItem, Lookup lookup) {
549 addLookup(singleItem,lookup);
550 return true;
551 }
552
553 public static interface Iter
555 {
556 public boolean hasNext();
557 public char next();
558 }
560
564 public static class CharMap
565 {
566 char[] itemsKeys = null;
567 Object[] itemsObjs = null;
568
569
572 void resize(int index)
573 {
574 int newsz = itemsKeys.length + 1;
575 char[] tempKeys = new char[newsz];
576 Object[] tempObjs = new Object[newsz];
577 int i;
578 for (i= 0; i < index; i++)
579 {
580 tempKeys[i] = itemsKeys[i];
581 tempObjs[i] = itemsObjs[i];
582 }
583 for (i= index+1; i < newsz; i++)
584 {
585 tempKeys[i] = itemsKeys[i-1];
586 tempObjs[i] = itemsObjs[i-1];
587 }
588
589 itemsKeys = tempKeys;
590 itemsObjs = tempObjs;
591 }
593
596 Object get(char key)
597 {
598 if (itemsKeys == null) return null;
599 int index = Arrays.binarySearch(itemsKeys, key);
600 if (index<0)
601 return null;
602 return itemsObjs[index];
603 }
604
607 Object put(char key, Object value)
608 {
609 if (itemsKeys == null)
610 {
611 itemsKeys = new char[1];
612 itemsKeys[0] = key;
613 itemsObjs = new Object[1];
614 itemsObjs[0] = value;
615 return value;
616 } int index = Arrays.binarySearch(itemsKeys, key);
618 if (index<0)
619 {
620 index = ~index;
621 resize(index);
622 itemsKeys[index] = key;
623 itemsObjs[index] = value;
624 }
625 return itemsObjs[index];
626 }
640
641 }
643 }