1
15
16 package gate.email;
17
18 import java.io.*;
19 import java.util.*;
20
21 import junit.framework.Assert;
22
23 import gate.*;
24 import gate.event.StatusListener;
25
26
31 public class EmailDocumentHandler{
32
33
34 private static final boolean DEBUG = false;
35
36 private String content = null;
37 private long documentSize = 0;
38
39
42 public EmailDocumentHandler() {
43 setUp();
44 }
46
49 public EmailDocumentHandler( gate.Document aGateDocument,
50 Map aMarkupElementsMap,
51 Map anElement2StringMap
52 ) {
53
54 gateDocument = aGateDocument;
55
56 if (basicAS == null)
58 basicAS = gateDocument.getAnnotations(
59 GateConstants.ORIGINAL_MARKUPS_ANNOT_SET_NAME);
60
61 markupElementsMap = aMarkupElementsMap;
62 element2StringMap = anElement2StringMap;
63 setUp();
64 }
66
76 public void annotateMessages() throws IOException,
77 gate.util.InvalidOffsetException {
78 BufferedReader gateDocumentReader = null;
80 content = gateDocument.getContent().toString();
83 documentSize = gateDocument.getContent().size().longValue();
85
86 gateDocumentReader = new BufferedReader(new InputStreamReader(
89 new ByteArrayInputStream(content.getBytes())));
90
91
96
100
104
108 String line = null;
109 String aFieldName = null;
110
111 long cursor = 0;
112 long endEmail = 0;
113 long startEmail = 0;
114 long endHeader = 0;
115 long startHeader = 0;
116 long endBody = 0;
117 long startBody = 0;
118 long endField = 0;
119 long startField = 0;
120
121 boolean insideAnEmail = false;
122 boolean insideHeader = false;
123 boolean insideBody = false;
124 boolean emailReadBefore = false;
125 boolean fieldReadBefore = false;
126
127 long nlSize = detectNLSize();
128
129
131 while ((line = gateDocumentReader.readLine()) != null){
133 if (lineBeginsMessage(line)){
138 if ((++ emails % EMAILS_RATE) == 0)
142 fireStatusChangedEvent("Reading emails : " + emails);
143 if (true == emailReadBefore){
146 endEmail = cursor - nlSize ;
150 endBody = cursor - nlSize;
152 createAnnotation("Body",startBody,endBody,null);
154 createAnnotation("Message",startEmail,endEmail,null);
156 }
157 emailReadBefore = true;
160 startEmail = cursor;
163 startHeader = cursor;
165 cursor += line.length() + nlSize;
168 insideAnEmail = true;
170 insideHeader = true;
172 fieldReadBefore = false;
174 continue;
176 } if (false == insideAnEmail){
178 cursor += line.length() + nlSize;
181 continue;
183 } if (true == insideHeader){
186 if (line.equals("")){
189 insideHeader = false;
192 endHeader = cursor - nlSize;
194 cursor += line.length() + nlSize;
196 startBody = cursor;
198 if (true == fieldReadBefore){
200 endField = endHeader;
201 createAnnotation(aFieldName, startField, endField, null);
203 createAnnotation("Header",startHeader,endHeader,null);
205 } continue;
208 } if (lineBeginsWithField(line)){
212 if (true == fieldReadBefore){
215 endField = cursor - nlSize;
217 createAnnotation(aFieldName, startField, endField, null);
219 } fieldReadBefore = true;
221 aFieldName = getFieldName();
222 startField = cursor + aFieldName.length() + ":".length();
223 } cursor += line.length() + nlSize;
228 continue;
230 } cursor += line.length() + nlSize;
235 } if (true == emailReadBefore){
240 endBody = cursor - nlSize;
241 endEmail = cursor - nlSize;
242 createAnnotation("Body",startBody,endBody,null);
244 createAnnotation("Message",startEmail,endEmail,null);
246 }
247 }
252
258 private int detectNLSize() {
259
260 char[] document = null;
262
263 document = gateDocument.getContent().toString().toCharArray();
265
266 for (int i=0; i<document.length; i++){
269 if (document[i] == '\n'){
270
271 if (
274 (((i+1) < document.length) && (document[i+1] == '\r'))
275 ||
276 (((i-1) >= 0) && (document[i-1] == '\r'))
277 ) return 2;
278 else return 1;
279 }
280 }
281 return 0;
284
285 }
287
291 private void createAnnotation(String anAnnotationName, long anAnnotationStart,
292 long anAnnotationEnd, FeatureMap aFeatureMap)
293 throws gate.util.InvalidOffsetException{
294
295
305 if (canCreateAnnotation(anAnnotationStart,anAnnotationEnd,documentSize)){
306 if (aFeatureMap == null)
307 aFeatureMap = Factory.newFeatureMap();
308 basicAS.add( new Long(anAnnotationStart),
309 new Long(anAnnotationEnd),
310 anAnnotationName.toLowerCase(),
311 aFeatureMap);
312 } }
317 private boolean canCreateAnnotation(long start,
318 long end,
319 long gateDocumentSize){
320
321 if (start < 0 || end < 0 ) return false;
322 if (start > end ) return false;
323 if ((start > gateDocumentSize) || (end > gateDocumentSize)) return false;
324 return true;
325 }
327
333 private boolean lineBeginsMessage(String aTextLine){
334 int score = 0;
335
336 StringTokenizer tokenizer = new StringTokenizer(aTextLine," ");
340
341 String firstToken = null;
343 if (tokenizer.hasMoreTokens())
344 firstToken = tokenizer.nextToken();
345 else return false;
346
347 firstToken.trim();
349
350 if (!firstToken.equals("From"))
354 return false;
355
356 while (tokenizer.hasMoreTokens()){
358
359 String token = tokenizer.nextToken();
361 token.trim();
362
363 if (hasAMeaning(token))
365 score += 1;
366 }
367
368 if (score >= 5) return true;
370 else return false;
371
372 }
374
382 private boolean lineBeginsWithField(String aTextLine){
383 if (containsSemicolon(aTextLine)){
384 StringTokenizer tokenizer = new StringTokenizer(aTextLine,":");
385
386 String firstToken = null;
388
389 if (tokenizer.hasMoreTokens())
390 firstToken = tokenizer.nextToken();
391 else return false;
392
393 if (firstToken != null){
394 firstToken.trim();
396 if (containsWhiteSpaces(firstToken)) return false;
397
398 fieldName = firstToken;
400 }
401 return true;
402 } else return false;
403
404 }
406
409 private boolean containsWhiteSpaces(String aString) {
410 for (int i = 0; i<aString.length(); i++)
411 if (Character.isWhitespace(aString.charAt(i))) return true;
412 return false;
413 }
415
418 private boolean containsSemicolon(String aString) {
419 for (int i = 0; i<aString.length(); i++)
420 if (aString.charAt(i) == ':') return true;
421 return false;
422 }
424
427 private boolean hasAMeaning(String aToken) {
428 if (day.contains(aToken)) return true;
430
431 if (month.contains(aToken)) return true;
433
434 if (zone.contains(aToken)) return true;
436
437 Integer dayNumberOrYear = null;
439 try{
440 dayNumberOrYear = new Integer(aToken);
441 } catch (NumberFormatException e){
442 dayNumberOrYear = null;
443 }
444
445 if (dayNumberOrYear != null) {
447 int number = dayNumberOrYear.intValue();
448
449 if ((number > 0) && (number < 32)) return true;
451
452 if ((number > 1900) && (number < 3000)) return true;
454
455 if ((number >= 0) && (number <= 99)) return true;
457 }
458 if (isTime(aToken)) return true;
460
461 return false;
462 }
464
467 private boolean isTime(String aToken) {
468 StringTokenizer st = new StringTokenizer(aToken,":");
469
470 String hourString = null;
472 if (st.hasMoreTokens())
473 hourString = st.nextToken();
474
475 if (hourString == null) return false;
477
478 Integer hourInteger = null;
480 try{
481 hourInteger = new Integer(hourString);
482 } catch (NumberFormatException e){
483 hourInteger = null;
484 }
485 if (hourInteger == null) return false;
486
487 int hour = hourInteger.intValue();
491 if ( (hour < 0) || (hour > 23) ) return false;
492
493
496 String minutesString = null;
498 if (st.hasMoreTokens())
499 minutesString = st.nextToken();
500
501 if (minutesString == null) return false;
503
504 Integer minutesInteger = null;
506 try {
507 minutesInteger = new Integer (minutesString);
508 } catch (NumberFormatException e){
509 minutesInteger = null;
510 }
511
512 if (minutesInteger == null) return false;
513
514 int minutes = minutesInteger.intValue();
518 if ( (minutes < 0) || (minutes > 59) ) return false;
519
520 String secondsString = null;
522 if (st.hasMoreTokens())
523 secondsString = st.nextToken();
524
525 if (secondsString == null) return false;
527
528 Integer secondsInteger = null;
530 try {
531 secondsInteger = new Integer (secondsString);
532 } catch (NumberFormatException e){
533 secondsInteger = null;
534 }
535 if (secondsInteger == null) return false;
536
537 int seconds = secondsInteger.intValue();
541 if ( (seconds < 0) || (seconds > 59) ) return false;
542
543 if (st.hasMoreTokens()) return false;
546
547 return true;
549 }
551
554 private void setUp(){
555 day = new HashSet();
556 day.add("Mon");
557 day.add("Tue");
558 day.add("Wed");
559 day.add("Thu");
560 day.add("Fri");
561 day.add("Sat");
562 day.add("Sun");
563
564 month = new HashSet();
565 month.add("Jan");
566 month.add("Feb");
567 month.add("Mar");
568 month.add("Apr");
569 month.add("May");
570 month.add("Jun");
571 month.add("Jul");
572 month.add("Aug");
573 month.add("Sep");
574 month.add("Oct");
575 month.add("Nov");
576 month.add("Dec");
577
578 zone = new HashSet();
579 zone.add("UT");
580 zone.add("GMT");
581 zone.add("EST");
582 zone.add("EDT");
583 zone.add("CST");
584 zone.add("CDT");
585 zone.add("MST");
586 zone.add("MDT");
587 zone.add("PST");
588 zone.add("PDT");
589 }
591
597 private String getFieldName() {
598 if (fieldName == null) return new String("");
599 else return fieldName;
600 }
602
604
607 public void addStatusListener(StatusListener listener){
608 myStatusListeners.add(listener);
609 }
610
613 public void removeStatusListener(StatusListener listener){
614 myStatusListeners.remove(listener);
615 }
616
617
621 protected void fireStatusChangedEvent(String text){
622 Iterator listenersIter = myStatusListeners.iterator();
623 while(listenersIter.hasNext())
624 ((StatusListener)listenersIter.next()).statusChanged(text);
625 }
626
627 private static final int EMAILS_RATE = 16;
628
629 private String tmpDocContent = null;
632
633 private gate.Document gateDocument = null;
635
636 private gate.AnnotationSet basicAS = null;
638
639 private Map markupElementsMap = null;
641
642 private Map element2StringMap = null;
644
645 protected List myStatusListeners = new LinkedList();
647
648 private int emails = 0;
650
651 private String fieldName = null;
655
656 private Collection day = null;
657 private Collection month = null;
658 private Collection zone = null;
659
660
661
663
666 private void testContainsSemicolon() {
667 String str1 = "X-Sender: oana@derwent";
668 String str2 = "X-Sender oana@derwent";
669 String str3 = ":X-Sender oana@derwent";
670 String str4 = "X-Sender oana@derwent:";
671
672 Assert.assertTrue((containsSemicolon(str1) == true));
673 Assert.assertTrue((containsSemicolon(str2)== false));
674 Assert.assertTrue((containsSemicolon(str3) == true));
675 Assert.assertTrue((containsSemicolon(str4) == true));
676 }
678
681 private void testContainsWhiteSpaces(){
682 String str1 = "Content-Type: TEXT/PLAIN; charset=US-ASCII";
683 String str2 = "Content-Type:TEXT/PLAIN;charset=US-ASCII";
684 String str3 = " Content-Type:TEXT/PLAIN;charset=US-ASCII";
685 String str4 = "Content-Type:TEXT/PLAIN;charset=US-ASCII ";
686
687 Assert.assertTrue((containsWhiteSpaces(str1) == true));
688 Assert.assertTrue((containsWhiteSpaces(str2) == false));
689 Assert.assertTrue((containsWhiteSpaces(str3) == true));
690 Assert.assertTrue((containsWhiteSpaces(str4) == true));
691 }
693
696 private void testHasAMeaning() {
697 String str1 = "12:05:22";
698 String str2 = "Sep";
699 String str3 = "Fri";
700 String str4 = "2000";
701 String str5 = "GMT";
702 String str6 = "Date: Wed, 13 Sep 2000 13:05:22 +0100 (BST)";
703 String str7 = "12:75:22";
704 String str8 = "September";
705 String str9 = "Friday";
706
707 Assert.assertTrue((hasAMeaning(str1) == true));
708 Assert.assertTrue((hasAMeaning(str2) == true));
709 Assert.assertTrue((hasAMeaning(str3) == true));
710 Assert.assertTrue((hasAMeaning(str4) == true));
711 Assert.assertTrue((hasAMeaning(str5) == true));
712 Assert.assertTrue((hasAMeaning(str6) == false));
713 Assert.assertTrue((hasAMeaning(str7) == false));
714 Assert.assertTrue((hasAMeaning(str8) == false));
715 Assert.assertTrue((hasAMeaning(str9) == false));
716 }
718
721 private void testIsTime() {
722 String str1 = "13:05:22";
723 String str2 = "13/05/22";
724 String str3 = "24:05:22";
725
726 Assert.assertTrue((isTime(str1) == true));
727 Assert.assertTrue((isTime(str2) == false));
728 Assert.assertTrue((isTime(str3) == false));
729 }
731
734 private void testLineBeginsMessage(){
735 String str1 = "From oana@dcs.shef.ac.uk Wed Sep 13 13:05:23 2000";
736 String str2 = "Date: Wed, 13 Sep 2000 13:05:22 +0100 (BST)";
737 String str3 = "From oana@dcs.shef.ac.uk Sep 13 13:05:23 2000";
738
739 Assert.assertTrue((lineBeginsMessage(str1) == true));
740 Assert.assertTrue((lineBeginsMessage(str2) == false));
741 Assert.assertTrue((lineBeginsMessage(str3) == false));
742
743 }
745
748 private void testLineBeginsWithField() {
749 String str1 = "Message-ID: <Pine.SOL.3.91.1000913130311.19537A-10@derwent>";
750 String str2 = "%:ContentType TEXT/PLAIN; charset=US-ASCII";
751
752 Assert.assertTrue((lineBeginsWithField(str1) == true));
753 Assert.assertTrue((lineBeginsWithField(str2) == true));
754 }
756
759 public void testSelf(){
760 testContainsSemicolon();
761 testContainsWhiteSpaces();
762 testHasAMeaning();
763 testIsTime();
764 testLineBeginsMessage();
765 testLineBeginsWithField();
766 }
768 }