| SimpleCorpus.java |
1 /*
2 * SimpleCorpus.java
3 *
4 * Copyright (c) 1998-2005, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Kalina Bontcheva, 23/Jul/2004
12 *
13 * $Id: SimpleCorpus.java,v 1.2 2005/01/11 13:51:30 ian Exp $
14 */
15
16 package gate;
17 import java.io.IOException;
18 import java.net.URL;
19 import java.util.List;
20
21 import gate.creole.ResourceInstantiationException;
22 import java.io.FileFilter;
23
24 import gate.util.NameBearer;
25
26 /** Corpora are lists of Document. TIPSTER equivalent: Collection.
27 */
28 public interface SimpleCorpus extends LanguageResource, List, NameBearer {
29
30 public static final String CORPUS_NAME_PARAMETER_NAME = "name";
31 public static final String CORPUS_DOCLIST_PARAMETER_NAME = "documentsList";
32
33 /**
34 * Gets the names of the documents in this corpus.
35 * @return a {@link List} of Strings representing the names of the documents
36 * in this corpus.
37 */
38 public List getDocumentNames();
39
40 /**
41 * Gets the name of a document in this corpus.
42 * @param index the index of the document
43 * @return a String value representing the name of the document at
44 * <tt>index</tt> in this corpus.
45 */
46 public String getDocumentName(int index);
47
48 /**
49 * Fills this corpus with documents created on the fly from selected files in
50 * a directory. Uses a {@link FileFilter} to select which files will be used
51 * and which will be ignored.
52 * A simple file filter based on extensions is provided in the Gate
53 * distribution ({@link gate.util.ExtensionFileFilter}).
54 * @param directory the directory from which the files will be picked. This
55 * parameter is an URL for uniformity. It needs to be a URL of type file
56 * otherwise an InvalidArgumentException will be thrown.
57 * An implementation for this method is provided as a static method at
58 * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}.
59 * @param filter the file filter used to select files from the target
60 * directory. If the filter is <tt>null</tt> all the files will be accepted.
61 * @param encoding the encoding to be used for reading the documents
62 * @param recurseDirectories should the directory be parsed recursively?. If
63 * <tt>true</tt> all the files from the provided directory and all its
64 * children directories (on as many levels as necessary) will be picked if
65 * accepted by the filter otherwise the children directories will be ignored.
66 */
67 public void populate(URL directory, FileFilter filter,
68 String encoding, boolean recurseDirectories)
69 throws IOException, ResourceInstantiationException;
70
71
72 } // interface SimpleCorpus
73