SimpleCorpus.java |
/*
 *  SimpleCorpus.java
 *
 *  Copyright (c) 1998-2005, The University of Sheffield.
 *
 *  This file is part of GATE (see http://gate.ac.uk/), and is free
 *  software, licenced under the GNU Library General Public License,
 *  Version 2, June 1991 (in the distribution as file licence.html,
 *  and also available at http://gate.ac.uk/gate/licence.html).
 *
 *  Kalina Bontcheva, 23/Jul/2004
 *
 *  $Id: SimpleCorpus.java,v 1.2 2005/01/11 13:51:30 ian Exp $
 */

package gate;

import java.io.FileFilter;
import java.io.IOException;
import java.net.URL;
import java.util.List;

import gate.creole.ResourceInstantiationException;
import gate.util.NameBearer;

/**
 * Corpora are lists of Document. TIPSTER equivalent: Collection.
 *
 * <p>The raw {@link List} types used by this interface are kept as they are
 * so that existing implementations and callers remain source compatible.
 */
public interface SimpleCorpus extends LanguageResource, List, NameBearer {

  // Members of an interface are implicitly public (and constants implicitly
  // static final), so the modifiers are not spelled out below.

  /**
   * The parameter name <code>"name"</code>; presumably the creation parameter
   * that carries the corpus name (confirm against the code that uses it).
   */
  String CORPUS_NAME_PARAMETER_NAME = "name";

  /**
   * The parameter name <code>"documentsList"</code>; presumably the creation
   * parameter that carries the initial documents (confirm against the code
   * that uses it).
   */
  String CORPUS_DOCLIST_PARAMETER_NAME = "documentsList";

  /**
   * Gets the names of the documents in this corpus.
   *
   * @return a {@link List} of Strings representing the names of the documents
   * in this corpus.
   */
  List getDocumentNames();

  /**
   * Gets the name of a document in this corpus.
   *
   * @param index the index of the document
   * @return a String value representing the name of the document at
   * <code>index</code> in this corpus.
   */
  String getDocumentName(int index);

  /**
   * Fills this corpus with documents created on the fly from selected files
   * in a directory. Uses a {@link FileFilter} to select which files will be
   * used and which will be ignored.
   * A simple file filter based on extensions is provided in the Gate
   * distribution ({@link gate.util.ExtensionFileFilter}).
   *
   * <p>An implementation for this method is provided as a static method at
   * {@link gate.corpora.CorpusImpl#populate(Corpus, URL, FileFilter, String, boolean)}.
   *
   * @param directory the directory from which the files will be picked. This
   * parameter is a URL for uniformity. It needs to be a URL of type file,
   * otherwise an exception will be thrown. (Earlier versions of this
   * documentation named <code>InvalidArgumentException</code>, which is not a
   * JDK class; presumably <code>IllegalArgumentException</code> is meant -
   * confirm against the implementation.)
   * @param filter the file filter used to select files from the target
   * directory. If the filter is <code>null</code> all the files will be
   * accepted.
   * @param encoding the encoding to be used for reading the documents
   * @param recurseDirectories should the directory be parsed recursively? If
   * <code>true</code>, all the files from the provided directory and all its
   * children directories (on as many levels as necessary) will be picked if
   * accepted by the filter; otherwise the children directories will be
   * ignored.
   * @throws IOException declared by this method; presumably raised when
   * accessing the directory or reading a file fails (confirm against the
   * implementation).
   * @throws ResourceInstantiationException declared by this method;
   * presumably raised when a document cannot be created from a selected file
   * (confirm against the implementation).
   */
  void populate(URL directory, FileFilter filter,
                String encoding, boolean recurseDirectories)
      throws IOException, ResourceInstantiationException;

} // interface SimpleCorpus