1   /*
2    *  CorpusSaver.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Kalina Bontcheva, 22/Nov/2001
12   *
13   *  $Id: CorpusSaver.java,v 1.11 2005/01/11 13:51:37 ian Exp $
14   */
15  
16  package gate.util;
17  
18  import java.io.File;
19  import java.text.NumberFormat;
20  import java.util.*;
21  
22  import gate.*;
23  import gate.creole.ExecutionException;
24  import gate.creole.ResourceInstantiationException;
25  
26  public class CorpusSaver {
27  
28    private static final boolean DEBUG = true;
29  
30    public CorpusSaver() {
31    }
32  
33    public void init() {
34      if (saveMode) {
35        File path = new File(dsPath);
36        try {
37         ds = Factory.openDataStore("gate.persist.SerialDataStore",
38                                    path.toURL().toString());
39        } catch (Exception ex) {
40          throw new gate.util.GateRuntimeException(ex.getMessage());
41        }
42  
43        try {
44          Corpus corpus = Factory.newCorpus("bnc");
45          LanguageResource lr = ds.adopt(corpus, null);
46          ds.sync(lr);
47          theCorpus = (Corpus) lr;
48        } catch (Exception ex) {
49          throw new GateRuntimeException(ex.getMessage());
50        }
51      }
52  
53      if (processMode)
54        initPRs();
55  
56    }
57  
58    public void initPRs() {
59      try {
60        if (applicationFile == null)
61          Out.prln("Application not set!");
62        Out.prln("App file is: " + applicationFile.getAbsolutePath());
63        application = (Controller) gate.util.persistence.PersistenceManager
64                                     .loadObjectFromFile(applicationFile);
65      } catch (Exception ex) {
66        throw new GateRuntimeException("Corpus Saver: "+ex.getMessage());
67      }
68    }//initPRs
69  
70    public void execute() {
71      execute(startDir);
72      try {
73        if (saveMode) {
74          ds.sync(theCorpus);
75          Factory.deleteResource(theCorpus);
76          if (ds != null)
77            ds.close();
78        }
79        if (application != null) {
80          Iterator iter = new ArrayList(application.getPRs()).iterator();
81          while (iter.hasNext())
82            Factory.deleteResource((Resource) iter.next());
83        }
84      } catch (Exception ex) {
85        throw new GateRuntimeException(ex.getMessage());
86      }
87    }
88  
89    public void execute(File dir) {
90      if (dir == null || (saveMode && ds == null))
91        return;
92      //first set the current directory to be the given one
93      currDir = dir;
94      Out.prln("Processing directory: " + currDir);
95  
96      ArrayList files = new ArrayList();
97      ArrayList dirs = new ArrayList();
98      File[] dirArray = currDir.listFiles();
99      for (int i = 0; i < dirArray.length; i++) {
100       if (dirArray[i].isDirectory())
101         dirs.add(dirArray[i]);
102       else if (dirArray[i].isFile())
103         files.add(dirArray[i]);
104     }
105 
106     saveFiles(files);
107 
108     //if no more subdirs left, return
109     if (dirs.isEmpty())
110       return;
111 
112     //there are more subdirectories to traverse, so iterate through
113     for (int j = 0; j < dirs.size(); j++)
114       execute((File) dirs.get(j));
115 
116   }//execute(dir)
117 
118 
119   public static void main(String[] args) throws GateException {
120     Gate.init();
121 
122     CorpusSaver corpusSaver1 = new CorpusSaver();
123 
124     if(args.length < 2)
125       throw new GateException("usage: [-process|-process-only] source_directory datastore_path application");
126     int i = 0;
127     while (i < args.length && args[i].startsWith("-")) {
128       if(args[i].equals("-process")) {
129         Out.prln("Processing and saving the corpus enabled. <P>");
130         corpusSaver1.setProcessMode(true);
131       } else if (args[i].equals("-process_only")) {
132         Out.prln("Processing only enabled. <P>");
133         corpusSaver1.setSaveMode(false);
134         corpusSaver1.setProcessMode(true);
135       }
136       i++; //just ignore the option, which we do not recognise
137     }//while
138 
139     String dirName = args[i];
140     File dir = new File(dirName);
141     if (!dir.isDirectory())
142       throw new GateRuntimeException("Corpus directory should be "
143                                      + "provided as a parameter");
144 
145     if(i+1 >= args.length)
146       throw new GateRuntimeException("Datastore path not provided");
147 
148     if (corpusSaver1.getSaveMode()) {
149       String storagePath = args[i + 1];
150       File storage = new File(storagePath);
151       if (!storage.isDirectory())
152         throw new GateRuntimeException("Please provide path to an existing "
153                                        + "GATE serial datastore");
154       corpusSaver1.setDSPath(storagePath);
155     }
156 
157     //get the last argument which is the application
158     if (corpusSaver1.getProcessMode()) {
159       i++;
160       String appName = args[i];
161       File appFile = new File(appName);
162       if (!appFile.isFile())
163         throw new GateException("Please provide an existing GATE application");
164       else
165         corpusSaver1.setApplicationFile(appFile);
166     }
167 
168     Out.prln("Initialising GATE please wait...");
169     corpusSaver1.init();
170     corpusSaver1.setStartDir(dir);
171     Out.prln("Processing...");
172     double timeBefore = System.currentTimeMillis();
173     corpusSaver1.execute();
174     double timeAfter = System.currentTimeMillis();
175     Out.prln("Done in " +
176       NumberFormat.getInstance().format((timeAfter-timeBefore)/1000)
177       + " seconds");
178 
179   }
180 
181   public void setStartDir(File newDir) {
182     startDir = newDir;
183   }
184 
185   public void setProcessMode(boolean mode) {
186     processMode = mode;
187   }
188 
189   public boolean getProcessMode() {
190     return processMode;
191   }
192 
193   public void setSaveMode(boolean mode) {
194     saveMode = mode;
195   }
196 
197   public boolean getSaveMode() {
198     return saveMode;
199   }
200 
201   public void setDSPath(String path){
202     dsPath = path;
203   }
204 
205   public void setApplicationFile(File newAppFile) {
206     applicationFile = newAppFile;
207   }
208 
209 
210   protected void saveFiles(List files) {
211     if (files==null || files.isEmpty() ||
212         (saveMode && (theCorpus == null || ds == null)))
213       return;
214 
215     for(int i=0; i<files.size(); i++) {
216       try {
217         Document doc = Factory.newDocument(((File)files.get(i)).toURL());
218         doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURL().toString()));
219         Out.prln("Storing document: " + doc.getName());
220         //first process it with ANNIE if in process mode
221         if (processMode)
222           processDocument(doc);
223 
224         //then store it in the DS and add to corpus
225         if (saveMode) {
226           LanguageResource lr = ds.adopt(doc, null);
227           theCorpus.add(lr);
228           theCorpus.unloadDocument( (Document) lr);
229 
230           if (lr != doc)
231             Factory.deleteResource(lr);
232         }
233         Factory.deleteResource(doc);
234       } catch (Exception ex) {
235         throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage());
236       }
237     }//for
238   }//saveFiles
239 
240   protected void processDocument(Document doc) {
241     try {
242       if (application instanceof CorpusController) {
243         Corpus tempCorpus = Factory.newCorpus("temp");
244         tempCorpus.add(doc);
245         ((CorpusController)application).setCorpus(tempCorpus);
246         application.execute();
247         Factory.deleteResource(tempCorpus);
248         tempCorpus = null;
249       } else {
250         Iterator iter = application.getPRs().iterator();
251         while (iter.hasNext())
252           ((ProcessingResource) iter.next()).setParameterValue("document", doc);
253         application.execute();
254       }
255     } catch (ResourceInstantiationException ex) {
256       throw new RuntimeException("Error executing application: "
257                                     + ex.getMessage());
258     } catch (ExecutionException ex) {
259       throw new RuntimeException("Error executing application: "
260                                     + ex.getMessage());
261     }
262   }
263 
264 
265   /**
266    * The directory from which we should generate/evaluate the corpus
267    */
268   private File startDir;
269   private File currDir;
270 
271   private DataStore ds;
272   private Corpus theCorpus;
273   private String annotSetName = "NE";
274   private String dsPath = "d:\\bnc";
275   private Controller application = null;
276   private File applicationFile = null;
277 
278   private boolean processMode = false;
279   private boolean saveMode = true;
280 }
281