1   /*
2    *  SimpleCorpus.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 23/Jul/2004
12   *
13   *  $Id: SimpleCorpus.java,v 1.2 2005/01/11 13:51:30 ian Exp $
14   */
15  
16  package gate;
17  import java.io.IOException;
18  import java.net.URL;
19  import java.util.List;
20  
21  import gate.creole.ResourceInstantiationException;
22  import java.io.FileFilter;
23  
24  import gate.util.NameBearer;
25  
26  /** Corpora are lists of Document. TIPSTER equivalent: Collection.
27    */
28  public interface SimpleCorpus extends LanguageResource, List, NameBearer {
29  
30    public static final String CORPUS_NAME_PARAMETER_NAME = "name";
31    public static final String CORPUS_DOCLIST_PARAMETER_NAME = "documentsList";
32  
33    /**
34     * Gets the names of the documents in this corpus.
35     * @return a {@link List} of Strings representing the names of the documents
36     * in this corpus.
37     */
38    public List getDocumentNames();
39  
40    /**
41     * Gets the name of a document in this corpus.
42     * @param index the index of the document
43     * @return a String value representing the name of the document at
44     * <tt>index</tt> in this corpus.
45     */
46    public String getDocumentName(int index);
47  
48    /**
49     * Fills this corpus with documents created on the fly from selected files in
50     * a directory. Uses a {@link FileFilter} to select which files will be used
51     * and which will be ignored.
52     * A simple file filter based on extensions is provided in the Gate
53     * distribution ({@link gate.util.ExtensionFileFilter}).
54     * @param directory the directory from which the files will be picked. This
55     * parameter is an URL for uniformity. It needs to be a URL of type file
56     * otherwise an InvalidArgumentException will be thrown.
57     * An implementation for this method is provided as a static method at
58     * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}.
59     * @param filter the file filter used to select files from the target
60     * directory. If the filter is <tt>null</tt> all the files will be accepted.
61     * @param encoding the encoding to be used for reading the documents
62     * @param recurseDirectories should the directory be parsed recursively?. If
63     * <tt>true</tt> all the files from the provided directory and all its
64     * children directories (on as many levels as necessary) will be picked if
65     * accepted by the filter otherwise the children directories will be ignored.
66     */
67    public void populate(URL directory, FileFilter filter,
68                         String encoding, boolean recurseDirectories)
69                         throws IOException, ResourceInstantiationException;
70  
71  
72  } // interface SimpleCorpus
73