1   /*
2    *  RepositioningInfo.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Angel Kirilov, 04/January/2002
12   *
13   *  $Id: RepositioningInfo.java,v 1.11 2005/01/11 13:51:31 ian Exp $
14   */
15  
16  package gate.corpora;
17  
18  import java.io.Serializable;
19  import java.util.ArrayList;
20  
21  import gate.util.Out;
22  
23  /**
24   * RepositioningInfo keep information about correspondence of positions
25   * between the original and extracted document content. With this information
26   * this class could be used for computing of this correspondence in the strict
27   * way (return -1 where is no correspondence)
28   * or in "flow" way (return near computable position)
29   */
30  
31  public class RepositioningInfo extends ArrayList {
32  
33    /** Freeze the serialization UID. */
34    static final long serialVersionUID = -2895662600168468559L;
35    /** Debug flag */
36    private static final boolean DEBUG = false;
37  
38    /**
39     * Just information keeper inner class. No significant functionality.
40     */
41    public class PositionInfo implements Serializable {
42  
43      /** Freeze the serialization UID. */
44      static final long serialVersionUID = -7747351720249898499L;
45  
46      /** Data members for one peace of text information */
47      private long m_origPos, m_origLength, m_currPos, m_currLength;
48  
49      /** The only constructor. We haven't set methods for data members. */
50      public PositionInfo(long orig, long origLen, long curr, long currLen) {
51        m_origPos = orig;
52        m_origLength = origLen;
53        m_currPos = curr;
54        m_currLength = currLen;
55      } // PositionInfo
56  
57      /** Position in the extracted (and probably changed) content */
58      public long getCurrentPosition() {
59        return m_currPos;
60      } // getCurrentPosition
61  
62      /** Position in the original content */
63      public long getOriginalPosition() {
64        return m_origPos;
65      } // getOriginalPosition
66  
67      /** Length of peace of text in the original content */
68      public long getOriginalLength() {
69        return m_origLength;
70      } // getOriginalLength
71  
72      /** Length of peace of text in the extracted content */
73      public long getCurrentLength() {
74        return m_currLength;
75      } // getCurrentLength
76  
77      /** For debug purposes */
78      public String toString() {
79        return "("+m_origPos+","+m_origLength+","
80                  +m_currPos+","+m_currLength+")";
81      } // toString
82    } // class PositionInfo
83  
84    /** Default constructor */
85    public RepositioningInfo() {
86      super();
87    } // RepositioningInfo
88  
89    /** Create a new position information record. */
90    public void addPositionInfo(long origPos, long origLength,
91                                long currPos, long currLength) {
92      // sorted add of new position
93      int insertPos = 0;
94      PositionInfo lastPI;
95  
96      for(int i = size(); i>0; i--) {
97        lastPI = (PositionInfo) get(i-1);
98        if(lastPI.getOriginalPosition() < origPos) {
99          insertPos = i;
100         break;
101       } // if - sort key
102     } // for
103 
104     add(insertPos, new PositionInfo(origPos, origLength, currPos, currLength));
105   } // addPositionInfo
106 
107   /** Compute position in extracted content by position in the original content.
108    *  If there is no correspondence return -1.
109    */
110   public long getExtractedPos(long absPos) {
111     long result = absPos;
112     PositionInfo currPI = null;
113     int size = size();
114 
115     if(size != 0) {
116       long origPos, origLen;
117       boolean found = false;
118 
119       for(int i=0; i<size; ++i) {
120         currPI = (PositionInfo) get(i);
121         origPos = currPI.getOriginalPosition();
122         origLen = currPI.getOriginalLength();
123 
124         if(absPos <= origPos+origLen) {
125           if(absPos < origPos) {
126             // outside the range of information
127             result = -1;
128           }
129           else {
130             // current position + offset in this PositionInfo record
131             result = currPI.getCurrentPosition() + absPos - origPos;
132           } // if
133           found = true;
134           break;
135         } // if
136       } // for
137 
138       if(!found) {
139         // after the last repositioning info
140         result = -1;
141       } // if - !found
142     } // if
143 
144     return result;
145   } // getExtractedPos
146 
147   public long getOriginalPos(long relPos) {
148     return getOriginalPos(relPos, false);
149   } // getOriginalPos
150 
151   /** Compute position in original content by position in the extracted content.
152    *  If there is no correspondence return -1.
153    */
154   public long getOriginalPos(long relPos, boolean afterChar) {
155     long result = relPos;
156     PositionInfo currPI = null;
157     int size = size();
158 
159     if(size != 0) {
160       long currPos, currLen;
161       boolean found = false;
162 
163       for(int i=0; i<size; ++i) {
164         currPI = (PositionInfo) get(i);
165         currPos = currPI.getCurrentPosition();
166         currLen = currPI.getCurrentLength();
167 
168         if(afterChar && relPos == currPos+currLen) {
169           result = currPI.getOriginalPosition() + currPI.getOriginalLength();
170           found = true;
171           break;
172         } // if
173 
174         if(relPos < currPos+currLen) {
175           if(relPos < currPos) {
176             // outside the range of information
177             result = -1;
178           }
179           else {
180             // current position + offset in this PositionInfo record
181             result = currPI.getOriginalPosition() + relPos - currPos;
182           } // if
183           found = true;
184           break;
185         } // if
186       } // for
187 
188       if(!found) {
189         // after the last repositioning info
190         result = -1;
191       } // if - !found
192     } // if
193 
194     return result;
195   } // getOriginalPos
196 
197   /** Not finished yet */
198   public long getExtractedPosFlow(long absPos) {
199     long result = -1;
200     return result;
201   } // getExtractedPosFlow
202 
203   /** Not finished yet */
204   public long getOriginalPosFlow(long relPos) {
205     long result = -1;
206     return result;
207   } // getOriginalPosFlow
208 
209   /**
210    * Return the position info index containing <B>@param absPos</B>
211    * If there is no such position info return -1.
212    */
213   public int getIndexByOriginalPosition(long absPos) {
214     PositionInfo currPI = null;
215     int result = -1;
216 
217     int size = size();
218     long origPos, origLen;
219 
220     // Find with the liniear algorithm. Could be extended to binary search.
221     for(int i=0; i<size; ++i) {
222       currPI = (PositionInfo) get(i);
223       origPos = currPI.getOriginalPosition();
224       origLen = currPI.getOriginalLength();
225 
226       if(absPos <= origPos+origLen) {
227         if(absPos >= origPos) {
228           result = i;
229         } // if
230         break;
231       } // if
232     } // for
233 
234     return result;
235   } // getItemByOriginalPosition
236 
237   /**
238    * Return the position info index containing <B>@param absPos</B>
239    * or the index of record before this position.
240    * Result is -1 if the position is before the first record.
241    * Rezult is size() if the position is after the last record.
242    */
243   public int getIndexByOriginalPositionFlow(long absPos) {
244     PositionInfo currPI = null;
245 
246     int size = size();
247     int result = size;
248     long origPos, origLen;
249 
250     // Find with the liniear algorithm. Could be extended to binary search.
251     for(int i=0; i<size; ++i) {
252       currPI = (PositionInfo) get(i);
253       origPos = currPI.getOriginalPosition();
254       origLen = currPI.getOriginalLength();
255 
256       if(absPos <= origPos+origLen) {
257         // is inside of current record
258         if(absPos >= origPos) {
259           result = i;
260         }
261         else {
262           // not inside the current recort - return previous
263           result = i-1;
264         } // if
265         break;
266       } // if
267     } // for
268 
269     return result;
270   } // getItemByOriginalPositionFlow
271 
272   /**
273    *  Correct the RepositioningInfo structure for shrink/expand changes.
274    *  <br>
275    *
276    *  Normaly the text peaces have same sizes in both original text and
277    *  extracted text. But in some cases there are nonlinear substitutions.
278    *  For example the sequence "&lt;" is converted to "<".
279    *  <br>
280    *
281    *  The correction will split the corresponding PositionInfo structure to
282    *  3 new records - before correction, correction record and after correction.
283    *  Front and end records are the same maner like the original record -
284    *  m_origLength == m_currLength, since the middle record has different
285    *  values because of shrink/expand changes. All records after this middle
286    *  record should be corrected with the difference between these values.
287    *  <br>
288    *
289    *  All m_currPos above the current information record should be corrected
290    *  with (origLen - newLen) i.e.
291    *  <code> m_currPos -= origLen - newLen; </code>
292    *  <br>
293    *
294    *  @param originalPos Position of changed text in the original content.
295    *  @param origLen Length of changed peace of text in the original content.
296    *  @param newLen Length of new peace of text substiting the original peace.
297    */
298   public void correctInformation(long originalPos, long origLen, long newLen) {
299     PositionInfo currPI;
300     PositionInfo frontPI, correctPI, endPI;
301 
302     int index = getIndexByOriginalPositionFlow(originalPos);
303 
304     // correct the index when the originalPos precede all records
305     if(index == -1) {
306       index = 0;
307     } // if
308 
309    // correction of all other information records
310     // All m_currPos above the current record should be corrected with
311     // (origLen - newLen) i.e. <code> m_currPos -= origLen - newLen; </code>
312 
313     for(int i=index; i<size(); ++i) {
314       currPI = (PositionInfo) get(i);
315       currPI.m_currPos -= origLen - newLen;
316     } // for
317 
318     currPI = (PositionInfo) get(index);
319     if(originalPos >= currPI.m_origPos
320         && currPI.m_origPos + currPI.m_origLength >= originalPos + origLen) {
321       long frontLen = originalPos - currPI.m_origPos;
322 
323       frontPI = new PositionInfo(currPI.m_origPos,
324                               frontLen,
325                               currPI.m_currPos,
326                               frontLen);
327       correctPI = new PositionInfo(originalPos,
328                               origLen,
329                               currPI.m_currPos + frontLen,
330                               newLen);
331       long endLen = currPI.m_origLength - frontLen - origLen;
332       endPI = new PositionInfo(originalPos + origLen,
333                               endLen,
334                               currPI.m_currPos + frontLen + newLen,
335                               endLen);
336 
337       set(index, frontPI); // substitute old element
338       if(endPI.m_origLength > 0) {
339         add(index+1, endPI); // insert new end element
340       } // if
341       if(correctPI.m_origLength > 0) {
342         add(index+1, correctPI); // insert middle new element
343       } // if
344     } // if - substitution range check
345   } // correctInformation
346 
347   /**
348    *  Correct the original position information in the records. When some text
349    *  is shrinked/expanded by the parser. With this method is corrected the
350    *  substitution of "\r\n" with "\n".
351    */
352   public void correctInformationOriginalMove(long originalPos, long moveLen) {
353     PositionInfo currPI;
354 
355     if(DEBUG) {
356       if(originalPos < 380) // debug information restriction
357         Out.println("Before correction: "+this);
358     } // DEBUG
359 
360     int index = getIndexByOriginalPositionFlow(originalPos);
361 
362     // correct the index when the originalPos precede all records
363     if(index == -1) {
364       index = 0;
365     } // if
366 
367     // position is after all records in list
368     if(index == size()) {
369       return;
370     } // if
371 
372     for(int i = index+1; i<size(); ++i) {
373       currPI = (PositionInfo) get(i);
374       currPI.m_origPos += moveLen;
375     } // for
376 
377     currPI = (PositionInfo) get(index);
378 
379     // should we split this record to two new records (inside the record)
380     if(originalPos > currPI.m_origPos) {
381       if(originalPos < currPI.m_origPos + currPI.m_origLength) {
382         PositionInfo frontPI, endPI;
383         long frontLen = originalPos - currPI.m_origPos;
384         frontPI = new PositionInfo(currPI.m_origPos,
385                                 frontLen,
386                                 currPI.m_currPos,
387                                 frontLen);
388 
389         long endLen = currPI.m_origLength - frontLen;
390         endPI = new PositionInfo(originalPos + moveLen,
391                                 endLen,
392                                 currPI.m_currPos + frontLen,
393                                 endLen);
394         set(index, frontPI); // substitute old element
395         if(endPI.m_origLength != 0) {
396           add(index+1, endPI); // insert new end element
397         } // if - should add this record
398 
399         if(DEBUG) {
400           if(originalPos < 380) { // debug information restriction
401             Out.println("Point 2. Current: "+currPI);
402             Out.println("Point 2. frontPI: "+frontPI);
403             Out.println("Point 2. endPI: "+endPI);
404           }
405         } // DEBUG
406       } // if - inside the record
407     } // if
408     else {
409       // correction if the position is before the current record
410       currPI.m_origPos += moveLen;
411     }
412 
413     if(DEBUG) {
414       if(originalPos < 380) {
415         Out.println("Correction move: "+originalPos+", "+moveLen);
416         Out.println("Corrected: "+this);
417         Out.println("index: "+index);
418         /*
419         Exception ex = new Exception();
420         Out.println("Call point: ");
421         ex.printStackTrace();
422         */
423       }
424     } // DEBUG
425   } // correctInformationOriginalMove
426 
427 } // class RepositioningInfo