1
15
16 package gate.util;
17
18 import java.io.File;
19 import java.text.NumberFormat;
20 import java.util.*;
21
22 import gate.*;
23 import gate.creole.ExecutionException;
24 import gate.creole.ResourceInstantiationException;
25
26 public class CorpusSaver {
27
28 private static final boolean DEBUG = true;
29
30 public CorpusSaver() {
31 }
32
33 public void init() {
34 if (saveMode) {
35 File path = new File(dsPath);
36 try {
37 ds = Factory.openDataStore("gate.persist.SerialDataStore",
38 path.toURL().toString());
39 } catch (Exception ex) {
40 throw new gate.util.GateRuntimeException(ex.getMessage());
41 }
42
43 try {
44 Corpus corpus = Factory.newCorpus("bnc");
45 LanguageResource lr = ds.adopt(corpus, null);
46 ds.sync(lr);
47 theCorpus = (Corpus) lr;
48 } catch (Exception ex) {
49 throw new GateRuntimeException(ex.getMessage());
50 }
51 }
52
53 if (processMode)
54 initPRs();
55
56 }
57
58 public void initPRs() {
59 try {
60 if (applicationFile == null)
61 Out.prln("Application not set!");
62 Out.prln("App file is: " + applicationFile.getAbsolutePath());
63 application = (Controller) gate.util.persistence.PersistenceManager
64 .loadObjectFromFile(applicationFile);
65 } catch (Exception ex) {
66 throw new GateRuntimeException("Corpus Saver: "+ex.getMessage());
67 }
68 }
70 public void execute() {
71 execute(startDir);
72 try {
73 if (saveMode) {
74 ds.sync(theCorpus);
75 Factory.deleteResource(theCorpus);
76 if (ds != null)
77 ds.close();
78 }
79 if (application != null) {
80 Iterator iter = new ArrayList(application.getPRs()).iterator();
81 while (iter.hasNext())
82 Factory.deleteResource((Resource) iter.next());
83 }
84 } catch (Exception ex) {
85 throw new GateRuntimeException(ex.getMessage());
86 }
87 }
88
89 public void execute(File dir) {
90 if (dir == null || (saveMode && ds == null))
91 return;
92 currDir = dir;
94 Out.prln("Processing directory: " + currDir);
95
96 ArrayList files = new ArrayList();
97 ArrayList dirs = new ArrayList();
98 File[] dirArray = currDir.listFiles();
99 for (int i = 0; i < dirArray.length; i++) {
100 if (dirArray[i].isDirectory())
101 dirs.add(dirArray[i]);
102 else if (dirArray[i].isFile())
103 files.add(dirArray[i]);
104 }
105
106 saveFiles(files);
107
108 if (dirs.isEmpty())
110 return;
111
112 for (int j = 0; j < dirs.size(); j++)
114 execute((File) dirs.get(j));
115
116 }
118
119 public static void main(String[] args) throws GateException {
120 Gate.init();
121
122 CorpusSaver corpusSaver1 = new CorpusSaver();
123
124 if(args.length < 2)
125 throw new GateException("usage: [-process|-process-only] source_directory datastore_path application");
126 int i = 0;
127 while (i < args.length && args[i].startsWith("-")) {
128 if(args[i].equals("-process")) {
129 Out.prln("Processing and saving the corpus enabled. <P>");
130 corpusSaver1.setProcessMode(true);
131 } else if (args[i].equals("-process_only")) {
132 Out.prln("Processing only enabled. <P>");
133 corpusSaver1.setSaveMode(false);
134 corpusSaver1.setProcessMode(true);
135 }
136 i++; }
139 String dirName = args[i];
140 File dir = new File(dirName);
141 if (!dir.isDirectory())
142 throw new GateRuntimeException("Corpus directory should be "
143 + "provided as a parameter");
144
145 if(i+1 >= args.length)
146 throw new GateRuntimeException("Datastore path not provided");
147
148 if (corpusSaver1.getSaveMode()) {
149 String storagePath = args[i + 1];
150 File storage = new File(storagePath);
151 if (!storage.isDirectory())
152 throw new GateRuntimeException("Please provide path to an existing "
153 + "GATE serial datastore");
154 corpusSaver1.setDSPath(storagePath);
155 }
156
157 if (corpusSaver1.getProcessMode()) {
159 i++;
160 String appName = args[i];
161 File appFile = new File(appName);
162 if (!appFile.isFile())
163 throw new GateException("Please provide an existing GATE application");
164 else
165 corpusSaver1.setApplicationFile(appFile);
166 }
167
168 Out.prln("Initialising GATE please wait...");
169 corpusSaver1.init();
170 corpusSaver1.setStartDir(dir);
171 Out.prln("Processing...");
172 double timeBefore = System.currentTimeMillis();
173 corpusSaver1.execute();
174 double timeAfter = System.currentTimeMillis();
175 Out.prln("Done in " +
176 NumberFormat.getInstance().format((timeAfter-timeBefore)/1000)
177 + " seconds");
178
179 }
180
181 public void setStartDir(File newDir) {
182 startDir = newDir;
183 }
184
185 public void setProcessMode(boolean mode) {
186 processMode = mode;
187 }
188
189 public boolean getProcessMode() {
190 return processMode;
191 }
192
193 public void setSaveMode(boolean mode) {
194 saveMode = mode;
195 }
196
197 public boolean getSaveMode() {
198 return saveMode;
199 }
200
201 public void setDSPath(String path){
202 dsPath = path;
203 }
204
205 public void setApplicationFile(File newAppFile) {
206 applicationFile = newAppFile;
207 }
208
209
210 protected void saveFiles(List files) {
211 if (files==null || files.isEmpty() ||
212 (saveMode && (theCorpus == null || ds == null)))
213 return;
214
215 for(int i=0; i<files.size(); i++) {
216 try {
217 Document doc = Factory.newDocument(((File)files.get(i)).toURL());
218 doc.setName(Files.getLastPathComponent(((File)files.get(i)).toURL().toString()));
219 Out.prln("Storing document: " + doc.getName());
220 if (processMode)
222 processDocument(doc);
223
224 if (saveMode) {
226 LanguageResource lr = ds.adopt(doc, null);
227 theCorpus.add(lr);
228 theCorpus.unloadDocument( (Document) lr);
229
230 if (lr != doc)
231 Factory.deleteResource(lr);
232 }
233 Factory.deleteResource(doc);
234 } catch (Exception ex) {
235 throw new GateRuntimeException(ex.getClass() + " " + ex.getMessage());
236 }
237 } }
240 protected void processDocument(Document doc) {
241 try {
242 if (application instanceof CorpusController) {
243 Corpus tempCorpus = Factory.newCorpus("temp");
244 tempCorpus.add(doc);
245 ((CorpusController)application).setCorpus(tempCorpus);
246 application.execute();
247 Factory.deleteResource(tempCorpus);
248 tempCorpus = null;
249 } else {
250 Iterator iter = application.getPRs().iterator();
251 while (iter.hasNext())
252 ((ProcessingResource) iter.next()).setParameterValue("document", doc);
253 application.execute();
254 }
255 } catch (ResourceInstantiationException ex) {
256 throw new RuntimeException("Error executing application: "
257 + ex.getMessage());
258 } catch (ExecutionException ex) {
259 throw new RuntimeException("Error executing application: "
260 + ex.getMessage());
261 }
262 }
263
264
265
268 private File startDir;
269 private File currDir;
270
271 private DataStore ds;
272 private Corpus theCorpus;
273 private String annotSetName = "NE";
274 private String dsPath = "d:\\bnc";
275 private Controller application = null;
276 private File applicationFile = null;
277
278 private boolean processMode = false;
279 private boolean saveMode = true;
280 }
281