| XmlPositionCorrectionHandler.java |
1 /*
2 * XmlPositionCorrectionHandler.java
3 *
4 * Copyright (c) 1998-2005, The University of Sheffield.
5 *
6 * This file is part of GATE (see http://gate.ac.uk/), and is free
7 * software, licenced under the GNU Library General Public License,
8 * Version 2, June 1991 (in the distribution as file licence.html,
9 * and also available at http://gate.ac.uk/gate/licence.html).
10 *
11 * Angel Kirilov, 4 January 2002
12 *
13 * $Id: XmlPositionCorrectionHandler.java,v 1.8 2005/01/11 13:51:38 ian Exp $
14 */
15
16 package gate.xml;
17
18 import org.xml.sax.helpers.DefaultHandler;
19
20
/**
 * Corrects a Xerces parser bug in the position reported during the parsing
 * process. The Xerces parser cuts the processed file into 16K pieces; when the
 * parser crosses a 16K border, the offset reported to characters() starts
 * again from zero.
 *
 * The bug can be worked around by extending this content handler instead of
 * org.xml.sax.helpers.DefaultHandler.
 *
 * The real content handler should call the startDocument() and characters()
 * methods of this class so that the correct position in the file is computed.
 * The corrected position is available through the protected data member
 * m_realOffset or through getRealOffset().
 */
public class XmlPositionCorrectionHandler extends DefaultHandler {

  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Size of one parser piece (16K); each detected wrap adds this amount. */
  private static final int BUFFER_SIZE = 0x4000;

  /**
   * If the reported offset drops by more than this amount (8K) compared to
   * the previous call, the drop is treated as a buffer wrap.
   */
  private static final int BACKWARD_JUMP_THRESHOLD = 0x2000;

  /**
   * If the reported offset is zero and the previous chunk ended beyond this
   * position (12K), the call is treated as a buffer wrap.
   */
  private static final int BUFFER_END_THRESHOLD = 0x3000;

  /**
   * Corrected offset for the last characters() call: the reported offset plus
   * 16K for every buffer wrap detected since startDocument().
   */
  protected long m_realOffset;

  /** Offset reported by the previous (non-ignored) characters() call. */
  private int m_lastPosition;

  /** Length reported by the previous (non-ignored) characters() call. */
  private int m_lastSize;

  /** Number of buffer wraps detected so far in the current document. */
  private int m_multiplyer;

  /** Constructor for initialization of variables */
  public XmlPositionCorrectionHandler() {
    resetState();
  } // XmlPositionCorrectionHandler

  /**
   * Initialization of variables on start of document parsing.
   *
   * @throws org.xml.sax.SAXException declared for compatibility with the
   *         overridden method; this implementation does not throw it
   */
  public void startDocument() throws org.xml.sax.SAXException {
    resetState();
  } // startDocument

  /**
   * Return corrected offset for last characters() call.
   *
   * @return the corrected offset (value of m_realOffset)
   */
  public long getRealOffset() {
    return m_realOffset;
  } // getRealOffset

  /**
   * Here is the correction of the Xerces parser bug. Detects when the
   * reported offset has wrapped around the 16K limit and recomputes
   * m_realOffset accordingly.
   *
   * @param text the character buffer supplied by the parser; only its length
   *        is inspected here
   * @param offset the start position reported by the parser
   * @param len the number of characters reported by the parser
   * @throws org.xml.sax.SAXException declared for compatibility with the
   *         overridden method; this implementation does not throw it
   */
  public void characters(char[] text, int offset, int len)
      throws org.xml.sax.SAXException {
    if (offset == 0 && len == 1 && text.length <= 2) {
      // unicode char or &xxx; coding: delivered in its own tiny buffer, so it
      // says nothing about the position in the main buffer; state is kept.
      return;
    } // if

    // There is 16K limit for offset. Here is the correction.
    // Will catch the bug in most cases.
    boolean jumpedBackwards =
        m_lastPosition - offset > BACKWARD_JUMP_THRESHOLD;
    boolean restartedNearBufferEnd =
        offset == 0 && m_lastSize + m_lastPosition > BUFFER_END_THRESHOLD;
    if (jumpedBackwards || restartedNearBufferEnd) {
      m_multiplyer++;
    }

    m_lastPosition = offset;
    m_lastSize = len;
    // Widen to long before multiplying: int arithmetic would overflow once
    // the corrected offset passes Integer.MAX_VALUE.
    m_realOffset = (long) m_multiplyer * BUFFER_SIZE + offset;
  } // characters

  /** Resets all position-tracking state to the start-of-document values. */
  private void resetState() {
    m_realOffset = 0;
    m_lastPosition = 0;
    m_lastSize = 0;
    m_multiplyer = 0;
  } // resetState

} // XmlPositionCorrectionHandler