1
15
16 package gate.corpora;
17
18 import java.io.*;
19 import java.net.URL;
20 import java.util.*;
21
22 import gate.*;
23 import gate.creole.AbstractLanguageResource;
24 import gate.creole.ResourceInstantiationException;
25 import gate.creole.ir.*;
26 import gate.event.*;
27 import gate.persist.PersistenceException;
28 import gate.security.SecurityException;
29 import gate.util.*;
30
31
39 public class SerialCorpusImpl extends
40 AbstractLanguageResource
41 implements Corpus, CreoleListener,
42 DatastoreListener, IndexedCorpus {
43
44
// Compile-time switch guarding the Out.prln diagnostics used throughout this class.
45 private static final boolean DEBUG = false;
46
// Explicit serialization version identifier for this class.
47 static final long serialVersionUID = 3632609241787241616L;
48
// Registered CorpusListeners. Transient; readObject re-creates it as an empty Vector.
// The add/remove listener methods replace it with a modified clone rather than mutating it.
49 protected transient Vector corpusListeners;
// One DocumentData entry (document name + persistent ID) per corpus member.
// Not declared transient, unlike the other per-document state below.
50 protected java.util.List docDataList = null;
51
// Parallel to docDataList: the Document held at each index, or null while that
// document is not loaded (get(int) fills the slot on demand).
52 protected transient List documents = null;
55
// Index manager obtained from the IR engine named by the index definition; null when none is set.
56 protected transient IndexManager indexManager= null;
// Pending index changes, passed to indexManager.sync(...) by thisResourceWritten().
// They are only appended to while indexManager is non-null.
57 protected transient List addedDocs = null;
58 protected transient List removedDocIDs = null;
59 protected transient List changedDocs = null;
60
/**
 * Creates an empty instance. The document lists are left unset here; they are
 * assigned by {@link #setTransientSource(Object)} or during deserialization.
 */
public SerialCorpusImpl() {
  super();
}
63
64
/**
 * Builds a serial corpus mirroring the given corpus: copies its name and
 * features, records one {@code DocumentData} per document name (with no
 * persistent ID), copies the documents and registers this object as a
 * creole listener.
 *
 * @param tCorpus the corpus to copy from
 */
protected SerialCorpusImpl(Corpus tCorpus){
  setName(tCorpus.getName());
  setFeatures(tCorpus.getFeatures());

  // one entry per document name; the persistent ID is not known at this point
  docDataList = new ArrayList();
  for (Iterator nameIter = tCorpus.getDocumentNames().iterator();
       nameIter.hasNext(); ) {
    String docName = (String) nameIter.next();
    docDataList.add(new DocumentData(docName, null));
  }

  // keep the documents themselves alongside their data entries
  documents = new ArrayList();
  documents.addAll(tCorpus);

  Gate.getCreoleRegister().addCreoleListener(this);
}
89
90
95 public List getDocumentNames(){
96 List docsNames = new ArrayList();
97 if(docDataList == null)
98 return docsNames;
99 Iterator iter = docDataList.iterator();
100 while (iter.hasNext()) {
101 DocumentData data = (DocumentData) iter.next();
102 docsNames.add(data.getDocumentName());
103 }
104 return docsNames;
105 }
106
107
110 public void setDocumentPersistentID(int index, Object persID){
111 if (index >= docDataList.size()) return;
112 ((DocumentData)docDataList.get(index)).setPersistentID(persID);
113 if (DEBUG) Out.prln("IDs are now: " + docDataList);
114 }
115
116
122 public String getDocumentName(int index){
123 if (index >= docDataList.size()) return "No such document";
124
125 return ((DocumentData) docDataList.get(index)).getDocumentName();
126 }
127
128
132 public void unloadDocument(int index) {
133 if ( (! isDocumentLoaded(index)) && isPersistentDocument(index))
136 return;
137
138 Document doc = (Document) documents.get(index);
141 try {
142 if (doc.getLRPersistenceId() == null) {
144 doc = (Document) this.getDataStore().adopt(doc, null);
145 this.getDataStore().sync(doc);
146 this.setDocumentPersistentID(index, doc.getLRPersistenceId());
147 } else this.getDataStore().sync(doc);
149
150 documents.set(index, null);
153
154 } catch (PersistenceException ex) {
155 throw new GateRuntimeException("Error unloading document from corpus"
156 + "because document sync failed: " + ex.getMessage());
157 } catch (gate.security.SecurityException ex1) {
158 throw new GateRuntimeException("Error unloading document from corpus"
159 + "because of document access error: " + ex1.getMessage());
160 }
161
162 }
163
164
167 public void unloadDocument(Document doc) {
168 if (DEBUG) Out.prln("Document to be unloaded :" + doc.getName());
169 int index = findDocument(doc);
171 if (index == -1)
172 return;
173 if (DEBUG) Out.prln("Index of doc: " + index);
174 if (DEBUG) Out.prln("Size of corpus: " + documents.size());
175 unloadDocument(index);
176 }
178
179
182 public boolean isDocumentLoaded(int index) {
183 if (documents == null || documents.isEmpty()) return false;
184 return documents.get(index) != null;
185 }
186
187
191 public boolean isPersistentDocument(int index) {
192 if (documents == null || documents.isEmpty()) return false;
193 return (((DocumentData)docDataList.get(index)).getPersistentID() != null);
194 }
195
196
202 public void cleanup() {
203 if (DEBUG) Out.prln("serial corpus cleanup called");
204 if (corpusListeners != null)
205 corpusListeners = null;
206 if (documents != null)
207 documents.clear();
208 docDataList.clear();
209 Gate.getCreoleRegister().removeCreoleListener(this);
210 if (this.dataStore != null) {
211 this.dataStore.removeDatastoreListener(this);
212 }
213 }
214
215
230 public void populate(URL directory, FileFilter filter, String encoding,
231 boolean recurseDirectories)
232 throws IOException, ResourceInstantiationException{
233 CorpusImpl.populate(this, directory, filter, encoding, recurseDirectories);
234 }
235
236
237 public synchronized void removeCorpusListener(CorpusListener l) {
238 if (corpusListeners != null && corpusListeners.contains(l)) {
239 Vector v = (Vector) corpusListeners.clone();
240 v.removeElement(l);
241 corpusListeners = v;
242 }
243 }
244 public synchronized void addCorpusListener(CorpusListener l) {
245 Vector v = corpusListeners == null ? new Vector(2) : (Vector) corpusListeners.clone();
246 if (!v.contains(l)) {
247 v.addElement(l);
248 corpusListeners = v;
249 }
250 }
251 protected void fireDocumentAdded(CorpusEvent e) {
252 if (corpusListeners != null) {
253 Vector listeners = corpusListeners;
254 int count = listeners.size();
255 for (int i = 0; i < count; i++) {
256 ((CorpusListener) listeners.elementAt(i)).documentAdded(e);
257 }
258 }
259 }
260 protected void fireDocumentRemoved(CorpusEvent e) {
261 if (corpusListeners != null) {
262 Vector listeners = corpusListeners;
263 int count = listeners.size();
264 for (int i = 0; i < count; i++) {
265 ((CorpusListener) listeners.elementAt(i)).documentRemoved(e);
266 }
267 }
268 }
269 public void resourceLoaded(CreoleEvent e) {
270 }
271
272 public void resourceRenamed(Resource resource, String oldName,
273 String newName){}
274
275 public void resourceUnloaded(CreoleEvent e) {
276 Resource res = e.getResource();
277 if (res instanceof Document) {
278 Document doc = (Document) res;
279 if (DEBUG)
280 Out.prln("resource Unloaded called ");
281 if (doc.getDataStore() != this.getDataStore()) {
283 this.remove(doc);
284 } else {
285 int index = indexOf(res);
287 if (index < 0)
288 return;
289 documents.set(index, null);
290 if (DEBUG)
291 Out.prln("corpus: document "+ index + " unloaded and set to null");
292 } }
294 }
295 public void datastoreOpened(CreoleEvent e) {
296 }
297 public void datastoreCreated(CreoleEvent e) {
298 }
299 public void datastoreClosed(CreoleEvent e) {
300 if (! e.getDatastore().equals(this.getDataStore()))
301 return;
302 if (this.getDataStore() != null)
303 this.getDataStore().removeDatastoreListener(this);
304 Factory.deleteResource(this);
307 }
308
311 public void resourceAdopted(DatastoreEvent evt){
312 }
313
314
317 public void resourceDeleted(DatastoreEvent evt){
318 DataStore ds = (DataStore)evt.getSource();
319 if (!ds.equals(this.dataStore))
321 return;
322
323 Object docID = evt.getResourceID();
324 if (docID == null)
325 return;
326
327 if (DEBUG) Out.prln("Resource deleted called for: " + docID);
328 if (docID.equals(this.getLRPersistenceId())) {
331 Factory.deleteResource(this);
332 return;
333 }
335 boolean isDirty=false;
336 for (int i=0; i< docDataList.size(); i++) {
339 DocumentData docData = (DocumentData)docDataList.get(i);
340 if (docID.equals(docData.getPersistentID())) {
343 remove(i);
344 isDirty = true;
345 } }
348 if (isDirty)
349 try {
350 this.dataStore.sync(this);
351 } catch (PersistenceException ex) {
352 throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
353 } catch (SecurityException sex) {
354 throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
355 }
356 }
358
361 public void resourceWritten(DatastoreEvent evt){
362 if (evt.getResourceID().equals(this.getLRPersistenceId())) {
363 thisResourceWritten();
364 }
365 }
366
367
368
369
372 public int size() {
373 return docDataList.size();
374 }
375
376 public boolean isEmpty() {
377 return docDataList.isEmpty();
378 }
379
380 public boolean contains(Object o){
381
385 if(! (o instanceof Document))
386 return false;
387
388 int index = findDocument((Document) o);
389 if (index < 0)
390 return false;
391 else
392 return true;
393 }
394
395 public Iterator iterator(){
396 return new Iterator(){
397 Iterator docDataIter = docDataList.iterator();
398
399 public boolean hasNext() {
400 return docDataIter.hasNext();
401 }
402
403 public Object next(){
404
405 DocumentData docData = (DocumentData) docDataIter.next();
407 int index = docDataList.indexOf(docData);
408 return SerialCorpusImpl.this.get(index);
409 }
410
411 public void remove() {
412 throw new UnsupportedOperationException("SerialCorpusImpl does not " +
413 "support remove in the iterators");
414 }
415 };
417 }
419 public String toString() {
420 return "document data " + docDataList.toString() + " documents " + documents;
421 }
422
423 public Object[] toArray(){
424 throw new MethodNotImplementedException(
426 "toArray() is not implemented for SerialCorpusImpl");
427 }
428
429 public Object[] toArray(Object[] a){
430 throw new MethodNotImplementedException(
432 "toArray(Object[] a) is not implemented for SerialCorpusImpl");
433 }
434
435 public boolean add(Object o){
436 if (! (o instanceof Document) || o == null)
437 return false;
438 Document doc = (Document) o;
439
440 if (doc.getDataStore() != null
442 && !this.dataStore.equals(doc.getDataStore())) {
443 Err.prln("Error: Persistent corpus can only accept documents " +
444 "from its own datastore!");
445 return false;
446 }
448 DocumentData docData = new DocumentData(doc.getName(),
453 doc.getLRPersistenceId());
454 boolean result = docDataList.add(docData);
455 documents.add(doc);
456 documentAdded(doc);
457 fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
458 doc,
459 docDataList.size()-1,
460 CorpusEvent.DOCUMENT_ADDED));
461
462 return result;
463 }
464
465 public boolean remove(Object o){
466 if (DEBUG) Out.prln("SerialCorpus:Remove object called");
467 if (! (o instanceof Document))
468 return false;
469 Document doc = (Document) o;
470
471 int index = findDocument(doc);
473 if (index == -1)
474 return false;
475
476 if(index < docDataList.size()) { docDataList.remove(index);
478 Document oldDoc = (Document) documents.remove(index);
479 if (DEBUG) Out.prln("documents after remove of " + oldDoc.getName()
480 + " are " + documents);
481 documentRemoved(oldDoc.getLRPersistenceId().toString());
482 fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
483 oldDoc,
484 index,
485 CorpusEvent.DOCUMENT_REMOVED));
486 }
487
488 return true;
489 }
490
491 public int findDocument(Document doc) {
492 boolean found = false;
493 DocumentData docData = null;
494
495 int index = documents.indexOf(doc);
497 if (index > -1 && index < docDataList.size())
498 return index;
499
500 Iterator iter = docDataList.iterator();
502 for (index = 0; iter.hasNext(); index++) {
503 docData = (DocumentData) iter.next();
504 if (docData.getDocumentName().equals(doc.getName()) &&
505 docData.getPersistentID().equals(doc.getLRPersistenceId())) {
506 found = true;
507 break;
508 }
509 }
510 if (found && index < docDataList.size())
511 return index;
512 else
513 return -1;
514 }
516 public boolean containsAll(Collection c){
517 Iterator iter = c.iterator();
518 while (iter.hasNext()) {
519 if (! contains(iter.next()))
520 return false;
521 }
522 return true;
523 }
524
525 public boolean addAll(Collection c){
526 boolean allAdded = true;
527 Iterator iter = c.iterator();
528 while (iter.hasNext()) {
529 if (! add(iter.next()))
530 allAdded = false;
531 }
532 return allAdded;
533 }
534
535 public boolean addAll(int index, Collection c){
536 throw new UnsupportedOperationException();
537 }
538
539 public boolean removeAll(Collection c){
540 boolean allRemoved = true;
541 Iterator iter = c.iterator();
542 while (iter.hasNext()) {
543 if (! remove(iter.next()))
544 allRemoved = false;
545 }
546 return allRemoved;
547
548 }
549
550 public boolean retainAll(Collection c){
551 throw new UnsupportedOperationException();
552 }
553
554 public void clear(){
555 documents.clear();
556 docDataList.clear();
557 }
558
559 public boolean equals(Object o){
560 if (! (o instanceof SerialCorpusImpl))
561 return false;
562 SerialCorpusImpl oCorpus = (SerialCorpusImpl) o;
563 if ((this == null && oCorpus != null) || (oCorpus == null && this != null))
564 return false;
565 if (oCorpus == this)
566 return true;
567 if ((oCorpus.lrPersistentId == this.lrPersistentId ||
568 ( this.lrPersistentId != null &&
569 this.lrPersistentId.equals(oCorpus.lrPersistentId))
570 )
571 &&
572 oCorpus.name.equals(this.name)
573 &&
574 (oCorpus.dataStore == this.dataStore
575 || oCorpus.dataStore.equals(this.dataStore))
576 &&
577 oCorpus.docDataList.equals(docDataList))
578 return true;
579 return false;
580 }
581
582 public int hashCode(){
583 return docDataList.hashCode();
584 }
585
586 public Object get(int index){
587 if (index >= docDataList.size())
588 return null;
589
590 Object res = documents.get(index);
591
592 if (DEBUG)
593 Out.prln("SerialCorpusImpl: get(): index " + index + "result: " + res);
594
595 if (res == null) {
597 FeatureMap features = Factory.newFeatureMap();
598 features.put(DataStore.DATASTORE_FEATURE_NAME, this.dataStore);
599 try {
600 features.put(DataStore.LR_ID_FEATURE_NAME,
601 ((DocumentData)docDataList.get(index)).getPersistentID());
602 Resource lr = Factory.createResource( "gate.corpora.DocumentImpl",
603 features);
604 if (DEBUG)
605 Out.prln("Loaded document :" + lr.getName());
606 res = lr;
608
609 documents.set(index, lr);
611 } catch (ResourceInstantiationException ex) {
612 Err.prln("Error reading document inside a serialised corpus.");
613 throw new GateRuntimeException(ex.getMessage());
614 }
615 }
616
617 return res;
618 }
619
620 public Object set(int index, Object element){
621 throw new gate.util.MethodNotImplementedException();
622
632 }
633
634 public void add(int index, Object o){
635 if (! (o instanceof Document) || o == null)
636 return;
637 Document doc = (Document) o;
638
639 DocumentData docData = new DocumentData(doc.getName(),
640 doc.getLRPersistenceId());
641 docDataList.add(index, docData);
642
643 documents.add(index, doc);
644 documentAdded(doc);
645 fireDocumentAdded(new CorpusEvent(SerialCorpusImpl.this,
646 doc,
647 index,
648 CorpusEvent.DOCUMENT_ADDED));
649
650 }
651
652 public Object remove(int index){
653 if (DEBUG) Out.prln("Remove index called");
654
655 boolean isLoaded = isDocumentLoaded(index);
656 Document removed = (Document) get(index);
657 documentRemoved(removed.getLRPersistenceId().toString());
658 if (!isLoaded){
659 unloadDocument(removed);
660 }
661
662 docDataList.remove(index);
663 Document res = (Document) documents.remove(index);
664 fireDocumentRemoved(new CorpusEvent(SerialCorpusImpl.this,
665 res,
666 index,
667 CorpusEvent.DOCUMENT_REMOVED));
668 return res;
669
670 }
671
672 public int indexOf(Object o){
673 if (o instanceof Document)
674 return findDocument((Document) o);
675
676 return -1;
677 }
678
679 public int lastIndexOf(Object o){
680 throw new gate.util.MethodNotImplementedException();
681 }
682
683 public ListIterator listIterator(){
684 throw new gate.util.MethodNotImplementedException();
685 }
686
687 public ListIterator listIterator(int index){
688 throw new gate.util.MethodNotImplementedException();
689 }
690
691
695 public List subList(int fromIndex, int toIndex){
696 throw new gate.util.MethodNotImplementedException();
697 }
698
699 public void setDataStore(DataStore dataStore)
700 throws gate.persist.PersistenceException {
701 super.setDataStore( dataStore);
702 if (this.dataStore != null)
703 this.dataStore.addDatastoreListener(this);
704 }
705
706 public void setTransientSource(Object source) {
707 if (! (source instanceof Corpus))
708 return;
709
710 if (this.dataStore != null && this.lrPersistentId != null)
716 return;
717
718 Corpus tCorpus = (Corpus) source;
719
720 this.setName(tCorpus.getName());
722 this.setFeatures(tCorpus.getFeatures());
723
724 docDataList = new ArrayList();
725 Iterator iter = tCorpus.getDocumentNames().iterator();
727 while (iter.hasNext())
728 docDataList.add(new DocumentData((String) iter.next(), null));
729
730 documents = new ArrayList();
732 documents.addAll(tCorpus);
733
734 this.addedDocs = new Vector();
735 this.removedDocIDs = new Vector();
736 this.changedDocs = new Vector();
737
738 Gate.getCreoleRegister().addCreoleListener(this);
740
741 }
742
743 public Object getTransientSource() {
746 return null;
747 }
748
749
750 public Resource init() throws gate.creole.ResourceInstantiationException {
751 super.init();
752
753 return this;
754
755 }
756
757
758
764 private void readObject(ObjectInputStream s)
765 throws IOException, ClassNotFoundException {
766 s.defaultReadObject();
767 documents = new ArrayList(docDataList.size());
768 for (int i = 0; i < docDataList.size(); i++)
769 documents.add(null);
770 corpusListeners = new Vector();
771 Gate.getCreoleRegister().addCreoleListener(this);
773 if (this.dataStore != null)
774 this.dataStore.addDatastoreListener(this);
775
776 IndexDefinition definition = (IndexDefinition) this.getFeatures().get(
778 GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY);
779 if (definition != null){
780 String className = definition.getIrEngineClassName();
781 try{
782 Class aClass = Class.forName(className, true, Gate.getClassLoader());
784 IREngine engine = (IREngine)aClass.newInstance();
785 this.indexManager = engine.getIndexmanager();
786 this.indexManager.setIndexDefinition(definition);
787 this.indexManager.setCorpus(this);
788 }catch(Exception e){
789 e.printStackTrace(Err.getPrintWriter());
790 }
791 this.addedDocs = new Vector();
799 this.removedDocIDs = new Vector();
800 this.changedDocs = new Vector();
801 }
802 }
804 public void setIndexDefinition(IndexDefinition definition) {
805 if (definition != null){
806 this.getFeatures().put(GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY,
807 definition);
808
809 String className = definition.getIrEngineClassName();
810 try{
811 Class aClass = Class.forName(className, true, Gate.getClassLoader());
813 IREngine engine = (IREngine)aClass.newInstance();
814 this.indexManager = engine.getIndexmanager();
815 this.indexManager.setIndexDefinition(definition);
816 this.indexManager.setCorpus(this);
817 }catch(Exception e){
818 e.printStackTrace(Err.getPrintWriter());
819 }
820 this.addedDocs = new Vector();
828 this.removedDocIDs = new Vector();
829 this.changedDocs = new Vector();
830 }
831 }
832
833 public IndexDefinition getIndexDefinition() {
834 return (IndexDefinition) this.getFeatures().get(
835 GateConstants.CORPUS_INDEX_DEFINITION_FEATURE_KEY);
836 }
837
838 public IndexManager getIndexManager() {
839 return this.indexManager;
840 }
841
842 public IndexStatistics getIndexStatistics(){
843 return (IndexStatistics) this.getFeatures().get(
844 GateConstants.CORPUS_INDEX_STATISTICS_FEATURE_KEY);
845 }
846
847 private void documentAdded(Document doc) {
848 if (indexManager != null){
849 addedDocs.add(doc);
850 }
851 }
852
853 private void documentRemoved(String lrID) {
854 if (indexManager != null) {
855 removedDocIDs.add(lrID);
856 }
857 }
858
859 private void thisResourceWritten() {
860 if (indexManager != null) {
861 try {
862 for (int i = 0; i<documents.size(); i++) {
863 if (documents.get(i) != null) {
864 Document doc = (Document) documents.get(i);
865 if (!addedDocs.contains(doc) && doc.isModified()) {
866 changedDocs.add(doc);
867 }
868 }
869 }
870 indexManager.sync(addedDocs, removedDocIDs, changedDocs);
871 } catch (IndexException ie) {
872 ie.printStackTrace();
873 }
874 }
875 }
876
877 }
878