1   /*
2    *  DocumentContentImpl.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Hamish Cunningham, 11/Feb/2000
12   *
13   *  $Id: DocumentContentImpl.java,v 1.29 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.*;
19  import java.net.URL;
20  
21  import gate.DocumentContent;
22  import gate.util.InvalidOffsetException;
23  
24  /** Represents the commonalities between all sorts of document contents.
25    */
26  public class DocumentContentImpl implements DocumentContent
27  {
28    /** Debug flag */
29    private static final boolean DEBUG = false;
30  
31    /** Buffer size for reading
32     *  16k is 4 times the block size on most filesystems
33     *  so it should be efficient for most cases
34     *  */
35    private static final int INTERNAL_BUFFER_SIZE  = 16*1024;
36  
37    /** Default construction */
38    public DocumentContentImpl() {
39      content = new String();
40    } // default construction
41  
42    /** Contruction from URL and offsets. */
43    public DocumentContentImpl(URL u, String encoding, Long start, Long end)
44    throws IOException {
45  
46      int readLength = 0;
47      char[] readBuffer = new char[INTERNAL_BUFFER_SIZE];
48  
49      BufferedReader uReader = null;
50      StringBuffer buf = new StringBuffer();
51      char c;
52      long s = 0, e = Long.MAX_VALUE, counter = 0;
53      if(start != null && end != null) {
54        s = start.longValue();
55        e = end.longValue();
56      }
57  
58      if(encoding != null && !encoding.equalsIgnoreCase("")) {
59        uReader = new BufferedReader(
60          new InputStreamReader(u.openStream(), encoding), INTERNAL_BUFFER_SIZE
61        );
62      } else {
63        uReader = new BufferedReader(
64          new InputStreamReader(u.openStream()), INTERNAL_BUFFER_SIZE
65        );
66      };
67  
68      // 1. skip S characters
69      uReader.skip(s);
70  
71      // 2. how many character shall I read?
72      long toRead = e - s;
73  
74      // 3. read gtom source into buffer
75      while (
76        toRead > 0 &&
77        (readLength = uReader.read(readBuffer, 0, INTERNAL_BUFFER_SIZE)) != -1
78      ) {
79        if (toRead <  readLength) {
80          //well, if toRead(long) is less than readLenght(int)
81          //then there can be no overflow, so the cast is safe
82          readLength = (int)toRead;
83        }
84  
85        buf.append(readBuffer, 0, readLength);
86        toRead -= readLength;
87      }
88  
89      // 4.close reader
90      uReader.close();
91  
92      content = new String(buf);
93      originalContent = content;
94    } // Contruction from URL and offsets
95  
96    /** Propagate changes to the document content. */
97    void edit(Long start, Long end, DocumentContent replacement)
98    {
99      int s = start.intValue(), e = end.intValue();
100     String repl = ((DocumentContentImpl) replacement).content;
101     StringBuffer newContent = new StringBuffer(content);
102     newContent.replace(s, e, repl);
103     content = newContent.toString();
104   } // edit(start,end,replacement)
105 
106   /** The contents under a particular span. */
107   public DocumentContent getContent(Long start, Long end)
108     throws InvalidOffsetException
109   {
110     if(! isValidOffsetRange(start, end))
111       throw new InvalidOffsetException();
112 
113     return new DocumentContentImpl(
114       content.substring(start.intValue(), end.intValue())
115     );
116   } // getContent(start, end)
117 
118   /** Returns the String representing the content in case of a textual document.
119     * NOTE: this is a temporary solution until we have a more generic one.
120     */
121   public String toString(){
122     return content;
123   }
124 
125   /** The size of this content (e.g. character length for textual
126     * content).
127     */
128   public Long size() {
129     return new Long(content.length());
130   } // size()
131 
132   /** Check that an offset is valid */
133   boolean isValidOffset(Long offset) {
134     if(offset == null)
135       return false;
136 
137     long o = offset.longValue();
138     long len = content.length();
139     if(o > len || o < 0)
140       return false;
141 
142     return true;
143   } // isValidOffset
144 
145   /** Check that both start and end are valid offsets and that
146     * they constitute a valid offset range
147     */
148   boolean isValidOffsetRange(Long start, Long end) {
149     return
150       isValidOffset(start) && isValidOffset(end) &&
151       start.longValue() <= end.longValue();
152   } // isValidOffsetRange(start,end)
153 
154   /** Two documents are the same if their contents is the same
155    */
156   public boolean equals(Object other) {
157     if (!(other instanceof DocumentContentImpl)) return false;
158 
159     DocumentContentImpl docImpl = (DocumentContentImpl) other;
160     return content.equals(docImpl.toString());
161   } // equals
162 
163   /** Calculate the hash value for the object. */
164   public int hashCode(){ return toString().hashCode(); }
165 
166   /** Just for now - later we have to cater for different types of
167     * content.
168     */
169   String content;
170 
171   /**
172    * For preserving the original content of the document.
173    * The edit command didn't affect on the original content.
174    * If you construct the content by URL the originalContent will keep
175    * whole information retrieved by URL even you set some start and end.
176    */
177   String originalContent;
178 
179   /**
180    * Return the original content of the document received during the loading
181    * phase or on construction from string.
182    */
183   public String getOriginalContent() { return originalContent; }
184 
185   /** For ranges */
186   public DocumentContentImpl(String s)
187     { content = s; originalContent = content; }
188 
189   /** Freeze the serialization UID. */
190   static final long serialVersionUID = -1426940535575467461L;
191 } // class DocumentContentImpl
192