1   /*
2    *  XmlPositionCorrectionHandler.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Angel Kirilov,  4 January 2002
12   *
13   *  $Id: XmlPositionCorrectionHandler.java,v 1.8 2005/01/11 13:51:38 ian Exp $
14   */
15  
16  package gate.xml;
17  
18  import org.xml.sax.helpers.DefaultHandler;
19  
20  
/**
 * Corrects a Xerces parser bug in the position reported during parsing.
 * According to the original author, the Xerces parser cuts the processed file
 * into 16K pieces, and when the parser crosses a 16K border the offset reported
 * to characters() starts again from zero.
 *
 * The bug can be worked around by extending this content handler instead of
 * org.xml.sax.helpers.DefaultHandler.
 *
 * The real content handler should call the methods startDocument() and
 * characters() of this class in order to compute the correct position in the
 * file. The corrected position can be obtained through the protected data
 * member m_realOffset or with getRealOffset().
 */
public class XmlPositionCorrectionHandler extends DefaultHandler {

  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Size (16K) of one parser chunk; each detected wrap adds this much. */
  private static final int CHUNK_SIZE = 0x4000;

  /** A backwards jump of the offset larger than this (8K) counts as a wrap. */
  private static final int BACKWARD_JUMP_THRESHOLD = 0x2000;

  /**
   * If the offset is zero and the previous call ended beyond this mark (12K),
   * the call is also treated as a wrap.
   */
  private static final int PREVIOUS_END_THRESHOLD = 0x3000;

  /**
   * Variables for correction of 16K parser limit for offset
   */
  /** Corrected offset computed by the last processed characters() call. */
  protected long m_realOffset;
  /** Raw offset passed to the last processed characters() call. */
  private int m_lastPosition;
  /** Length passed to the last processed characters() call. */
  private int m_lastSize;
  /** Number of chunk wraps detected since the start of the document. */
  private int m_multiplyer;

  /** Constructor for initialization of variables */
  public XmlPositionCorrectionHandler() {
    m_realOffset = 0;
    m_lastPosition = 0;
    m_lastSize = 0;
    m_multiplyer = 0;
  } // XmlPositionCorrectionHandler

  /**
   * Resets all correction variables at the start of document parsing.
   *
   * @throws org.xml.sax.SAXException declared by the SAX contract; this
   *         implementation does not throw it itself
   */
  @Override
  public void startDocument() throws org.xml.sax.SAXException {
    m_realOffset = 0;
    m_lastPosition = 0;
    m_lastSize = 0;
    m_multiplyer = 0;
  } // startDocument

  /**
   * Returns the corrected offset for the last characters() call.
   *
   * @return the value of m_realOffset
   */
  public long getRealOffset() {
    return m_realOffset;
  } // getRealOffset

  /**
   * Here is the correction of the Xerces parser bug. Detects (heuristically)
   * that the reported offset has wrapped around and recomputes m_realOffset
   * as {@code wraps * 16K + offset}.
   *
   * @param text   the character buffer supplied by the parser
   * @param offset the start position reported by the parser
   * @param len    the number of characters reported by the parser
   * @throws org.xml.sax.SAXException declared by the SAX contract; this
   *         implementation does not throw it itself
   */
  @Override
  public void characters(char[] text, int offset, int len)
                  throws org.xml.sax.SAXException {
    if (offset == 0 && len == 1 && text.length <= 2) {
      // unicode char or &xxx; coding: delivered in its own tiny buffer, so
      // the offset says nothing about the position; leave the state untouched
      return;
    } // if

    // There is 16K limit for offset. Here is the correction.
    // Will catch the bug in most cases.
    boolean jumpedBackwards =
        m_lastPosition - offset > BACKWARD_JUMP_THRESHOLD;
    boolean restartedAfterLongRun =
        offset == 0 && m_lastSize + m_lastPosition > PREVIOUS_END_THRESHOLD;
    if (jumpedBackwards || restartedAfterLongRun) {
      m_multiplyer++;
    }
    m_lastPosition = offset;
    m_lastSize = len;
    // Widen to long BEFORE multiplying: the int product would overflow once
    // the corrected offset reaches 2 GiB (m_multiplyer >= 131072).
    m_realOffset = (long) m_multiplyer * CHUNK_SIZE + offset;
  } // characters

} // XmlPositionCorrectionHandler