XmlPositionCorrectionHandler.java |
1 /* 2 * XmlPositionCorrectionHandler.java 3 * 4 * Copyright (c) 1998-2005, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Angel Kirilov, 4 January 2002 12 * 13 * $Id: XmlPositionCorrectionHandler.java,v 1.8 2005/01/11 13:51:38 ian Exp $ 14 */ 15 16 package gate.xml; 17 18 import org.xml.sax.helpers.DefaultHandler; 19 20 21 /** 22 * This class correct a Xerces parser bug in reported position in file during 23 * the parsing process. Xerces parser cut processed file to 16K peaces. If 24 * the parser cross the 16K border reported in the characters() position is 25 * zerro. 26 * 27 * This bug could be covered if you extend this content handler instead of 28 * org.xml.sax.helpers.DefaultHandler. 29 * 30 * The real content handler should call methods startDocument() and characters() 31 * in order to compute correct position in file. The corrected position could be 32 * received throug protected data member m_realOffset or with getRealOffset(). 
// End of the original class comment that opens on the preceding source line: */

/**
 * Base SAX content handler that reconstructs an absolute character offset
 * from the buffer-relative offsets passed to {@link #characters}.
 *
 * <p>According to the original author's notes, the Xerces parser processes the
 * input in 16K pieces, so the offset it reports to {@code characters()} starts
 * again from zero each time a 16K border is crossed. This class counts those
 * wrap-arounds and adds the corresponding multiple of 16K back on.
 *
 * <p>Extend this class instead of {@code org.xml.sax.helpers.DefaultHandler}.
 * A subclass that overrides {@code startDocument()} or {@code characters()}
 * must call the super implementation so that the bookkeeping stays current;
 * the corrected position is then available through the protected field
 * {@code m_realOffset} or through {@link #getRealOffset()}.
 *
 * <p>The wrap-around detection is a heuristic; as the original comment puts
 * it, it catches the problem "in most cases".
 */
public class XmlPositionCorrectionHandler extends DefaultHandler {

  /** Debug flag */
  private static final boolean DEBUG = false;

  /** Size of one parser piece (16K): each detected wrap-around adds this. */
  private static final int BUFFER_SIZE = 0x4000;

  /** A backward jump in the reported offset larger than this counts as a wrap. */
  private static final int BACKWARD_JUMP_THRESHOLD = 0x2000;

  /**
   * When the reported offset is zero and the previous chunk ended beyond this
   * position, the restart is also counted as a wrap.
   */
  private static final int END_OF_BUFFER_THRESHOLD = 0x3000;

  /** Corrected offset computed by the last accepted characters() call. */
  protected long m_realOffset;

  /** Offset reported by the previous accepted characters() call. */
  private int m_lastPosition;

  /** Length reported by the previous accepted characters() call. */
  private int m_lastSize;

  /** Number of wrap-arounds detected so far in the current document. */
  private int m_multiplyer;

  /** Creates a handler with all position bookkeeping set to zero. */
  public XmlPositionCorrectionHandler() {
    // Uses the private helper rather than startDocument() so that no
    // overridable method runs during construction.
    resetOffsets();
  } // XmlPositionCorrectionHandler

  /**
   * Resets the position bookkeeping at the start of document parsing.
   *
   * @throws org.xml.sax.SAXException declared for subclasses; never thrown here
   */
  public void startDocument() throws org.xml.sax.SAXException {
    resetOffsets();
  } // startDocument

  /**
   * Returns the corrected offset for the last characters() call.
   *
   * @return the value of {@code m_realOffset}
   */
  public long getRealOffset() {
    return m_realOffset;
  } // getRealOffset

  /**
   * Updates the corrected offset from the values reported by the parser.
   *
   * @param text the buffer handed over by the parser; only its length is read
   * @param offset start position of the chunk as reported by the parser
   * @param len number of characters in the chunk
   * @throws org.xml.sax.SAXException declared for subclasses; never thrown here
   */
  public void characters(char[] text, int offset, int len)
                                         throws org.xml.sax.SAXException {
    if(offset == 0 && len == 1 && text.length <= 2) {
      // A single character delivered in its own tiny buffer (original note:
      // "unicode char or &xxx; coding"); it carries no position information,
      // so the bookkeeping is left untouched.
      return;
    } // if

    // Detect that the parser moved on to the next 16K piece: either the
    // reported offset jumped far backwards, or it restarted at zero after the
    // previous chunk ended near the end of the piece.
    boolean jumpedBack = m_lastPosition - offset > BACKWARD_JUMP_THRESHOLD;
    boolean restartedAtZero =
        offset == 0 && m_lastSize + m_lastPosition > END_OF_BUFFER_THRESHOLD;
    if(jumpedBack || restartedAtZero) {
      m_multiplyer++;
    }

    m_lastPosition = offset;
    m_lastSize = len;

    // Widen to long BEFORE multiplying: in int arithmetic the product
    // overflows once 131072 wrap-arounds (2 GiB of text) have been counted.
    m_realOffset = (long) m_multiplyer * BUFFER_SIZE + offset;
  } // characters

  /** Sets all position bookkeeping back to zero. */
  private void resetOffsets() {
    m_realOffset = 0;
    m_lastPosition = 0;
    m_lastSize = 0;
    m_multiplyer = 0;
  } // resetOffsets

} // XmlPositionCorrectionHandler