1   /*
2    *  CorpusImpl.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: CorpusImpl.java,v 1.56 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.*;
19  import java.net.URL;
20  import java.util.*;
21  
22  import gate.*;
23  import gate.creole.AbstractLanguageResource;
24  import gate.creole.ResourceInstantiationException;
25  import gate.event.*;
26  import gate.util.Err;
27  import gate.util.Strings;
28  
29  /** Corpora are sets of Document. They are ordered by lexicographic collation
30    * on Url.
31    */
32  public class CorpusImpl extends AbstractLanguageResource
33                          implements Corpus, CreoleListener {
34  
35    /** Debug flag */
36    private static final boolean DEBUG = false;
37  
38    public CorpusImpl(){
39      supportList = Collections.synchronizedList(new VerboseList());
40      Gate.getCreoleRegister().addCreoleListener(this);
41    }
42  
43  
44    /**
45     * Gets the names of the documents in this corpus.
46     * @return a {@link List} of Strings representing the names of the documents
47     * in this corpus.
48     */
49    public List getDocumentNames(){
50      ArrayList res = new ArrayList(supportList.size());
51      Iterator docIter = supportList.iterator();
52      while(docIter.hasNext()){
53        res.add(((Document)docIter.next()).getName());
54      }
55      return res;
56    }
57  
58    /**
59     * Gets the name of a document in this corpus.
60     * @param index the index of the document
61     * @return a String value representing the name of the document at
62     * <tt>index</tt> in this corpus.
63     */
64    public String getDocumentName(int index){
65      return ((Document)supportList.get(index)).getName();
66    }
67  
68    /**
69     * This method does not make sense for transient corpora, so it does
70     * nothing.
71     */
72    public void unloadDocument(Document doc) {
73      return;
74    }
75  
76  
77    /**
78     * The underlying list that holds the documents in this corpus.
79     */
80    protected List supportList = null;
81  
82    /**
83     * A proxy list that stores the actual data in an internal list and forwards
84     * all operations to that one but it also fires the appropiate corpus events
85     * when necessary.
86     * It also does some type checking so only Documents are accepted as corpus
87     * members.
88     */
89    protected class VerboseList extends AbstractList implements Serializable{
90  
91  
92  
93      VerboseList(){
94        data = new ArrayList();
95      }
96  
97      public Object get(int index){
98        return data.get(index);
99      }
100 
101     public int size(){
102       return data.size();
103     }
104 
105     public Object set(int index, Object element){
106       if(element instanceof Document){
107         Document oldDoc = (Document)data.set(index, element);
108         Document newDoc = (Document)element;
109 
110         //fire the 2 events
111         fireDocumentRemoved(new CorpusEvent(CorpusImpl.this,
112                                             oldDoc,
113                                             index,
114                                             CorpusEvent.DOCUMENT_REMOVED));
115         fireDocumentAdded(new CorpusEvent(CorpusImpl.this,
116                                           newDoc,
117                                           index,
118                                           CorpusEvent.DOCUMENT_ADDED));
119         return oldDoc;
120       }else{
121         throw new UnsupportedOperationException(
122           getClass().getName() +
123           " only accepts gate.Document values as members!\n" +
124           element.getClass().getName() + " is not a gate.Document");
125       }
126     }
127 
128     public void add(int index, Object element){
129       if(element instanceof Document){
130         data.add(index, element);
131 
132         //fire the event
133         fireDocumentAdded(new CorpusEvent(CorpusImpl.this,
134                                           (Document)element,
135                                           index,
136                                           CorpusEvent.DOCUMENT_ADDED));
137       }else{
138         throw new UnsupportedOperationException(
139           getClass().getName() +
140           " only accepts gate.Document values as members!\n" +
141           element.getClass().getName() + " is not a gate.Document");
142       }
143     }
144 
145     public Object remove(int index){
146       Document oldDoc = (Document)data.remove(index);
147 
148       fireDocumentRemoved(new CorpusEvent(CorpusImpl.this,
149                                           oldDoc,
150                                           index,
151                                           CorpusEvent.DOCUMENT_REMOVED));
152       return oldDoc;
153     }
154 
155     /**
156      * The List containing the actual data.
157      */
158     ArrayList data;
159   }
160 
161   /**
162    * This method returns true when the document is already loaded in memory
163    */
164   public boolean isDocumentLoaded(int index) {
165     return true;
166   }
167 
168 
169   protected void clearDocList() {
170     if (supportList == null)
171       return;
172     supportList.clear();
173   }
174 
175 
176   //List methods
177   //java docs will be automatically copied from the List interface.
178 
179   public int size() {
180     return supportList.size();
181   }
182 
183   public boolean isEmpty() {
184     return supportList.isEmpty();
185   }
186 
187   public boolean contains(Object o){
188     return supportList.contains(o);
189   }
190 
191   public Iterator iterator(){
192     return supportList.iterator();
193   }
194 
195   public Object[] toArray(){
196     return supportList.toArray();
197   }
198 
199   public Object[] toArray(Object[] a){
200     return supportList.toArray(a);
201   }
202 
203   public boolean add(Object o){
204     return supportList.add(o);
205   }
206 
207   public boolean remove(Object o){
208     return supportList.remove(o);
209   }
210 
211   public boolean containsAll(Collection c){
212     return supportList.containsAll(c);
213   }
214 
215   public boolean addAll(Collection c){
216     return supportList.addAll(c);
217   }
218 
219   public boolean addAll(int index, Collection c){
220     return supportList.addAll(index, c);
221   }
222 
223   public boolean removeAll(Collection c){
224     return supportList.removeAll(c);
225   }
226 
227   public boolean retainAll(Collection c){
228     return supportList.retainAll(c);
229   }
230 
231   public void clear(){
232     supportList.clear();
233   }
234 
235   public boolean equals(Object o){
236     if (! (o instanceof CorpusImpl))
237       return false;
238 
239     return supportList.equals(o);
240   }
241 
242   public int hashCode(){
243     return supportList.hashCode();
244   }
245 
246   public Object get(int index){
247     return supportList.get(index);
248   }
249 
250   public Object set(int index, Object element){
251     return supportList.set(index, element);
252   }
253 
254   public void add(int index, Object element){
255     supportList.add(index, element);
256   }
257 
258   public Object remove(int index){
259     return supportList.remove(index);
260   }
261 
262   public int indexOf(Object o){
263     return supportList.indexOf(o);
264   }
265 
266   public int lastIndexOf(Object o){
267     return lastIndexOf(o);
268   }
269 
270   public ListIterator listIterator(){
271     return supportList.listIterator();
272   }
273 
274   public ListIterator listIterator(int index){
275     return supportList.listIterator(index);
276   }
277 
278   public List subList(int fromIndex, int toIndex){
279     return supportList.subList(fromIndex, toIndex);
280   }
281 
282 
283   /** Construction */
284 
285   public void cleanup(){
286     Gate.getCreoleRegister().removeCreoleListener(this);
287   }
288 
289   /** Initialise this resource, and return it. */
290   public Resource init() {
291     if(documentsList != null && !documentsList.isEmpty()){
292       addAll(documentsList);
293     }
294     return this;
295   } // init()
296 
297 
298   /**
299    * Fills the provided corpus with documents created on the fly from selected
300    * files in a directory. Uses a {@link FileFilter} to select which files will
301    * be used and which will be ignored.
302    * A simple file filter based on extensions is provided in the Gate
303    * distribution ({@link gate.util.ExtensionFileFilter}).
304    * @param corpus the corpus to be populated
305    * @param directory the directory from which the files will be picked. This
306    * parameter is an URL for uniformity. It needs to be a URL of type file
307    * otherwise an InvalidArgumentException will be thrown.
308    * @param filter the file filter used to select files from the target
309    * directory. If the filter is <tt>null</tt> all the files will be accepted.
310    * @param encoding the encoding to be used for reading the documents
311    * @param recurseDirectories should the directory be parsed recursively?. If
312    * <tt>true</tt> all the files from the provided directory and all its
313    * children directories (on as many levels as necessary) will be picked if
314    * accepted by the filter otherwise the children directories will be ignored.
315    */
316   public static void populate(Corpus corpus, URL directory, FileFilter filter,
317                               String encoding, boolean recurseDirectories)
318                      throws IOException {
319     //check input
320     if(!directory.getProtocol().equalsIgnoreCase("file"))
321       throw new IllegalArgumentException(
322         "The URL provided is not of type \"file:\"!");
323 
324     File dir = new File(directory.getPath());
325     if(!dir.exists())
326       throw new FileNotFoundException(dir.toString());
327 
328     if(!dir.isDirectory())
329       throw new IllegalArgumentException(
330         dir.getAbsolutePath() + " is not a directory!");
331 
332     //populate the corpus
333     File[] files = dir.listFiles(filter);
334     if(files != null){
335       for(int i = 0; i < files.length; i++){
336         File aFile = files[i];
337         if(aFile.isDirectory()){
338           //recurse dir if required
339           if(recurseDirectories){
340             populate(corpus, aFile.toURL(), filter,
341                      encoding, recurseDirectories);
342           }
343         }else{
344           //create the doc
345           StatusListener sListener = (StatusListener)
346                                      gate.gui.MainFrame.getListeners().
347                                      get("gate.event.StatusListener");
348           if(sListener != null) sListener.statusChanged(
349             "Reading: " + aFile.getName());
350           String docName = aFile.getName() + "_" + Gate.genSym();
351           FeatureMap params = Factory.newFeatureMap();
352           params.put(Document.DOCUMENT_URL_PARAMETER_NAME, aFile.toURL());
353           if(encoding != null)
354             params.put(Document.DOCUMENT_ENCODING_PARAMETER_NAME, encoding);
355 
356           try {
357             Document doc = (Document)Factory.createResource(
358                 DocumentImpl.class.getName(), params, null, docName
359               );
360             corpus.add(doc);
361             if(corpus.getLRPersistenceId() != null){
362               //persistent corpus -> unload the document
363               corpus.unloadDocument(doc);
364               Factory.deleteResource(doc);
365             }
366           } catch(ResourceInstantiationException e) {
367             String nl = Strings.getNl();
368             Err.prln(
369               "WARNING: Corpus.populate could not intantiate document" + nl +
370               "  Document name was: " + docName + nl +
371               "  Exception was: " + e + nl + nl
372             );
373           }
374           if(sListener != null) sListener.statusChanged(
375             aFile.getName() + " read");
376         }
377       }
378     }
379   }//public static void populate
380 
381   /**
382    * Fills this corpus with documents created from files in a directory.
383    * @param filter the file filter used to select files from the target
384    * directory. If the filter is <tt>null</tt> all the files will be accepted.
385    * @param directory the directory from which the files will be picked. This
386    * parameter is an URL for uniformity. It needs to be a URL of type file
387    * otherwise an InvalidArgumentException will be thrown.
388    * An implementation for this method is provided as a static method at
389    * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}.
390    * @param encoding the encoding to be used for reading the documents
391    * @param recurseDirectories should the directory be parsed recursively?. If
392    * <tt>true</tt> all the files from the provided directory and all its
393    * children directories (on as many levels as necessary) will be picked if
394    * accepted by the filter otherwise the children directories will be ignored.
395    */
396   public void populate(URL directory, FileFilter filter, String encoding,
397                        boolean recurseDirectories)
398               throws IOException, ResourceInstantiationException{
399     populate(this, directory, filter, encoding, recurseDirectories);
400   }
401 
402   public synchronized void removeCorpusListener(CorpusListener l) {
403     if (corpusListeners != null && corpusListeners.contains(l)) {
404       Vector v = (Vector) corpusListeners.clone();
405       v.removeElement(l);
406       corpusListeners = v;
407     }
408   }
409   public synchronized void addCorpusListener(CorpusListener l) {
410     Vector v = corpusListeners == null ? new Vector(2) : (Vector) corpusListeners.clone();
411     if (!v.contains(l)) {
412       v.addElement(l);
413       corpusListeners = v;
414     }
415   }
416 
417   /** Freeze the serialization UID. */
418   static final long serialVersionUID = -1113142759053898456L;
419   private transient Vector corpusListeners;
420   protected transient java.util.List documentsList;
421 
422 
423   protected void fireDocumentAdded(CorpusEvent e) {
424     if (corpusListeners != null) {
425       Vector listeners = corpusListeners;
426       int count = listeners.size();
427       for (int i = 0; i < count; i++) {
428         ((CorpusListener) listeners.elementAt(i)).documentAdded(e);
429       }
430     }
431   }
432   protected void fireDocumentRemoved(CorpusEvent e) {
433     if (corpusListeners != null) {
434       Vector listeners = corpusListeners;
435       int count = listeners.size();
436       for (int i = 0; i < count; i++) {
437         ((CorpusListener) listeners.elementAt(i)).documentRemoved(e);
438       }
439     }
440   }
441   public void setDocumentsList(java.util.List documentsList) {
442     this.documentsList = documentsList;
443   }
444   public java.util.List getDocumentsList() {
445     return documentsList;
446   }
447   public void resourceLoaded(CreoleEvent e) {
448   }
449   public void resourceUnloaded(CreoleEvent e) {
450     Resource res = e.getResource();
451     //remove all occurences
452     if(res instanceof Document) while(contains(res)) remove(res);
453   }
454 
455   public void resourceRenamed(Resource resource, String oldName,
456                               String newName){
457   }
458 
459   public void datastoreOpened(CreoleEvent e) {
460   }
461   public void datastoreCreated(CreoleEvent e) {
462   }
463   public void datastoreClosed(CreoleEvent e) {
464   }
465 } // class CorpusImpl
466