Sgml2Xml.java |
1 /* 2 * Sgml2Xml.java 3 * 4 * Copyright (c) 1998-2005, The University of Sheffield. 5 * 6 * This file is part of GATE (see http://gate.ac.uk/), and is free 7 * software, licenced under the GNU Library General Public License, 8 * Version 2, June 1991 (in the distribution as file licence.html, 9 * and also available at http://gate.ac.uk/gate/licence.html). 10 * 11 * Cristian URSU, 4/July/2000 12 * 13 * $Id: Sgml2Xml.java,v 1.16 2005/01/11 13:51:36 ian Exp $ 14 */ 15 16 package gate.sgml; 17 18 import java.io.File; 19 import java.io.IOException; 20 import java.net.MalformedURLException; 21 import java.util.*; 22 23 import gate.Document; 24 import gate.util.Files; 25 26 27 /** 28 * Not so fast... 29 * This class is not a realy Sgml2Xml convertor. 30 * It takes an SGML document and tries to prepare it for an XML parser 31 * For a true conversion we need an Java SGML parser... 32 * If you know one let me know.... 33 * 34 * What does it do: 35 * <ul> 36 * <li>If it finds something like this : <element attribute = value> 37 * it will produce: <element attribute = "value"> 38 * <li>If it finds something like this : <element something 39 * attribute2=value>it will produce : <element 40 * defaultAttribute="something" attribute2="value"> 41 * <li>If it finds : <element att1='value1 value2' att2="value2 42 * value3"> it will produce: <element att1="value1 value2" 43 * att2="value2 value3"> 44 * <li>If it finds : <element1> <elem>text </element1> 45 * will produce: <element1> <elem>text<elem> 46 * </element1> 47 * <li>If it find : <element1> <elem>[white spaces] 48 * </element1>, 49 * it will produce:<element1> <elem/>[white spaces]< 50 * /element1> 51 * </ul> 52 * What doesn't: 53 * <ul> 54 * <li>Doesn't expand the entities. So the entities from the SGML document 55 * must be resolved by the XML parser 56 * <li>Doesn't replace internal entities with their corresponding value 57 * </ul> 58 */ 59 60 public class Sgml2Xml{ 61 62 /** Debug flag */ 63 private static final boolean DEBUG = false; 64 65 /** 66 * The constructor initialises some member fields 67 * @param SgmlDoc the content of the Sgml document that will be modified 68 */ 69 public Sgml2Xml(String SgmlDoc){ 70 // create a new modifier 71 m_modifier = new StringBuffer(SgmlDoc); 72 // create a new dobiousElements list 73 // se the explanatin at the end of the class 74 dubiousElements = new ArrayList(); 75 stack = new Stack(); 76 } 77 78 /** 79 * The other constructor 80 * @param doc The Gate document that will be transformed to XML 81 */ 82 public Sgml2Xml(Document doc){ 83 // set as a member 84 m_doc = doc; 85 86 // create a new modifier 87 m_modifier = new StringBuffer(m_doc.getContent().toString()); 88 89 // create a new dobiousElements list 90 // se the explanatin at the end of the class 91 dubiousElements = new ArrayList(); 92 stack = new Stack(); 93 94 } 95 96 /* I keep this just in case I need some more debuging 97 98 public static void main(String[] args){ 99 Sgml2Xml convertor = 100 new Sgml2Xml("<w VVI='res trtetre\" relu = \"stop\">say 101 <w VBZ>is\n<trunc> <w UNC>th </trunc>"); 102 try{ 103 Out.println(convertor.convert()); 104 } catch (Exception e){ 105 e.printStackTrace(Err.getPrintWriter()); 106 } 107 } 108 */ 109 110 /** 111 * It analises the char that was red in state 1 112 * If it finds '<' it then goes to state 2 113 * Otherwise it stays in state 1 and keeps track about the text that is not 114 * white spaces. 115 */ 116 private void doState1(char currChar){ 117 if ('<' == currChar){ 118 // change to state 2 119 m_currState = 2; 120 if (!stack.isEmpty()){ 121 // peek the element from the top of the stack 122 CustomObject o = (CustomObject) stack.peek(); 123 // set some properties for this element 124 // first test to find out if text folows this element charPos > 0 125 if (charPos > 0){ 126 // this is not an empty element because there is text that follows 127 // set the element from the top of the stack to be a non empty one 128 o.setClosePos(charPos); 129 o.setEmpty(false); 130 // reset the charPos 131 charPos = 0; 132 }//if (charPos > 0) 133 }//if (!stack.isEmpty()) 134 }//if ('<' == m_currChar) 135 // if currChar is not whiteSpace then save the position of the last 136 // char that was read 137 if (('<' != currChar) && !isWhiteSpace(currChar)) 138 charPos = m_cursor; 139 }//doState1 140 141 /** 142 We came from state 1 and just read '<' 143 If currChar == '/' -> state 11 144 If is a char != white spaces -> state 3 145 stay in state 2 while there are only white spaces 146 */ 147 private void doState2(char currChar){ 148 if ('/' == currChar){ 149 // go to state 11 150 m_currState = 11; 151 } 152 // if currChar is a char != white spaces then go to state 3 153 if (('/' != m_currChar) && !isWhiteSpace(m_currChar)){ 154 // save the position where starts the element's name 155 // we need that in order to be able to read the current tag name 156 // this name it will be read from m_modifier using the substring() method 157 elemNameStart = m_cursor -1; 158 // go to state 3 159 m_currState = 3; 160 } 161 }// doState2 162 163 /** 164 * Just read the first char from the element's name and now analize the next 165 * char. 166 * If '>' the elem name was a single char -> state 1 167 * IF is WhiteSpaces -> state 4 168 * Otherwise stay in state 3 and read the elemnt's name 169 */ 170 private void doState3(char currChar){ 171 if ( '>' == currChar ){ 172 173 // save the pos where the element's name ends 174 elemNameEnd = m_cursor - 1; 175 176 // this is also the pos where to insert '/' for empty elements. 177 // In this case we have this situation <w> sau < w> 178 closePos = m_cursor - 1; 179 180 // get the name of the element 181 elemName = m_modifier.substring(elemNameStart,elemNameEnd); 182 183 // we put the element into stack 184 // we think in this point that the element is empty... 185 performFinalAction(elemName, closePos); 186 187 // go to state 1 188 m_currState = 1; 189 } 190 if (isWhiteSpace(currChar)){ 191 // go to state 4 192 m_currState = 4; 193 194 // save the pos where the element's name ends 195 elemNameEnd = m_cursor - 1; 196 197 // get the name of the element 198 elemName = m_modifier.substring(elemNameStart,elemNameEnd); 199 } 200 }// doState3 201 202 /** 203 * We read the name of the element and we prepare for '>' or attributes 204 * '>' -> state 1 205 * any char !- white space -> state 5 206 */ 207 private void doState4(char currChar){ 208 if ( '>' == currChar ){ 209 // this is also the pos where to insert '/' for empty elements in this case 210 closePos = m_cursor -1 ; 211 212 // we put the element into stack 213 // we think in this point that the element is empty... 214 performFinalAction(elemName, closePos); 215 216 // go to state 1 217 m_currState = 1; 218 } 219 if (( '>' != currChar ) && !isWhiteSpace(currChar)){ 220 // we just read the first char from the attrib name or attrib value.. 221 // go to state 5 222 m_currState = 5; 223 224 // remember the position where starts the attrib or the value of an attrib 225 attrStart = m_cursor - 1; 226 } 227 } // doState4 228 229 /** 230 * '=' -> state 6 231 * '>' -> state 4 (we didn't read an attribute but a value of the 232 * defaultAtt ) 233 * WS (white spaces) we don't know yet if we read an attribute or the value 234 * of the defaultAttr -> state 10 235 * This state modifies the content onf m_modifier ... it adds text 236 */ 237 private void doState5(char currChar){ 238 if ( '=' == currChar ) 239 m_currState = 6; 240 if ( '>' == currChar ){ 241 // this mean that the attribute was a value and we have to create 242 // a default attribute 243 // the same as in state 10 244 attrEnd = m_cursor - 1 ; 245 m_modifier.insert(attrEnd,'"'); 246 m_modifier.insert(attrStart,"defaultAttr=\""); 247 248 // go to state 4 249 m_currState = 4; 250 251 // parse again the entire sequence from state 4 before reading any char 252 m_cursor = attrStart; 253 } 254 if (isWhiteSpace(currChar)){ 255 // go to state 10 256 m_currState = 10; 257 258 // record the position where ends this attribute 259 attrEnd = m_cursor - 1; 260 } 261 } // doState5 262 263 /** 264 * IF we read ' or " then we have to get prepared to read everything until 265 * the next ' or " 266 * If we read a char then -> state 8; 267 * Stay here while we read WS 268 */ 269 private void doState6(char currChar){ 270 if ( ('\'' == currChar) || ('"' == currChar) ){ 271 endPair = currChar; 272 if ('\'' == currChar){ 273 274 // we have to replace ' with " 275 m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\""); 276 } 277 m_currState = 7; 278 } 279 if ( ('\'' != currChar) && ('"' != currChar) && !isWhiteSpace(currChar)){ 280 281 // this means that curChar is any char 282 m_currState = 8; 283 284 // every value must be inside this pair"" 285 m_modifier.insert(m_cursor - 1, '"'); 286 287 // insert implies the modification of m_cursor 288 // we increment m_cursor in order to say in the same position and to 289 // anulate the efect of insert. 290 m_cursor ++; 291 } 292 }// doState6 293 294 /** 295 * If we find the pair ' or " go to state 9 296 * Otherwhise read everything and stay in state 7 297 * If in state 7 we read '>' then we add automaticaly a " at the end and go 298 * to state 1 299 */ 300 private void doState7(char currChar){ 301 //if ( ('\'' == currChar) || ('"' == currChar) ){ 302 if ( endPair == currChar ){ 303 if ('\'' == currChar){ 304 305 // we have to replace ' with " 306 m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\""); 307 } 308 // reset the endPair 309 endPair = ' '; 310 m_currState = 9; 311 } 312 313 if ('>' == currChar){ 314 // go to state 1 315 m_currState = 1; 316 317 // insert the final " ata the end 318 m_modifier.insert(m_cursor - 1, '"'); 319 320 // go to te current possition (because of insert) 321 m_cursor ++; 322 323 performFinalAction(elemName, m_cursor - 1); 324 } 325 326 }// doState7 327 328 /** 329 * If '>' go to state 1 330 * If WS go to state 9 331 * Stays in state 8 and read the attribute's value 332 */ 333 private void doState8(char currChar){ 334 335 if ('>' == currChar){ 336 // go to state 1 337 m_currState = 1; 338 339 // complete the end " ( <elem attr="value> ) 340 m_modifier.insert(m_cursor - 1, '"'); 341 342 // go to te current possition (because of insert) 343 m_cursor ++; 344 345 // we finished to read a beggining tag 346 // see the method definition for more details 347 performFinalAction(elemName, m_cursor - 1); 348 } 349 if (isWhiteSpace(currChar)){ 350 // go to state 9 351 m_currState = 9; 352 353 // add the ending " char 354 m_modifier.insert(m_cursor - 1, '"'); 355 356 // increment the cursor in order to anulate the effect of insert 357 m_cursor ++; 358 } 359 } // doState8 360 /** 361 * Here we prepare to read another attrib, value pair (any char -> state 5) 362 * If '>' we just read a beggining tag -> state 1 363 * Stay here while read WS 364 */ 365 private void doState9(char currChar){ 366 if ('>' == currChar){ 367 // go to state 1 368 m_currState = 1; 369 370 // add the object to the stack 371 performFinalAction(elemName, m_cursor - 1); 372 } 373 if (('>' != currChar) && !isWhiteSpace(m_currChar)){ 374 // this is the same as state 4->5 375 m_currState = 5; 376 attrStart = m_cursor - 1; 377 } 378 }//doState9 379 380 /** 381 * If any C -> state 4 382 * If '=' state 6 383 * Stays here while reads WS 384 */ 385 private void doState10(char currChar){ 386 if ('=' == currChar) 387 m_currState = 6; 388 if ( ('=' != currChar) && !isWhiteSpace(currChar)){ 389 // this mean that the attribute was a value and we have to create 390 // a default attribute 391 m_modifier.insert(attrEnd,'"'); 392 m_modifier.insert(attrStart,"defaultAttr=\""); 393 394 // go to state 4 395 m_currState = 4; 396 397 m_cursor = attrStart; 398 } 399 }// doState10 400 401 /** 402 * We are preparing to read the and definition of an element 403 * Stays in this state while reading WS 404 */ 405 private void doState11(char currChar){ 406 if (!isWhiteSpace(currChar)){ 407 m_currState = 12; 408 elemNameStart = m_cursor - 1; 409 } 410 } // doState11 411 412 /** 413 * Here we read the element's name ...this is an end tag 414 * Stays here while reads a char 415 */ 416 private void doState12(char currChar) { 417 if ('>' == currChar){ 418 elemNameEnd = m_cursor - 1; 419 elemName = m_modifier.substring(elemNameStart,elemNameEnd); 420 performActionWithEndElem(elemName); 421 m_currState = 1; 422 } 423 if (isWhiteSpace(currChar)){ 424 m_currState = 13; 425 elemNameEnd = m_cursor - 1; 426 } 427 }//doState12 428 429 /** 430 * If '>' -> state 1 431 * Stays here while reads WS 432 */ 433 private void doState13(char currChar) { 434 if ('>' == currChar){ 435 elemName = m_modifier.substring(elemNameStart,elemNameEnd); 436 performActionWithEndElem(elemName); 437 m_currState = 1; 438 } 439 } // doState13 440 441 /** 442 This method is responsable with document conversion 443 */ 444 public String convert()throws IOException,MalformedURLException { 445 while (thereAreCharsToBeProcessed()) { 446 // read() gets the next char and increment the m_cursor 447 m_currChar = read(); 448 switch(m_currState){ 449 case 1: doState1(m_currChar);break; 450 case 2: doState2(m_currChar);break; 451 case 3: doState3(m_currChar);break; 452 case 4: doState4(m_currChar);break; 453 case 5: doState5(m_currChar);break; 454 case 6: doState6(m_currChar);break; 455 case 7: doState7(m_currChar);break; 456 case 8: doState8(m_currChar);break; 457 case 9: doState9(m_currChar);break; 458 case 10: doState10(m_currChar);break; 459 case 11: doState11(m_currChar);break; 460 case 12: doState12(m_currChar);break; 461 case 13: doState13(m_currChar);break; 462 }// switch(m_currState) 463 }// while (thereAreCharsToBeProcessed()) 464 465 // put all the elements from the stack into the dubiousElements list 466 // we do that in order to colect all the dubious elements 467 while (!stack.isEmpty()) { 468 CustomObject obj = (CustomObject) stack.pop(); 469 dubiousElements.add(obj); 470 } 471 472 // sort the dubiousElements list descending on closePos... 473 // This is vital for the alghorithm because we have to make 474 // all the modifications from the bottom to the top... 475 // If we fail to do that, insert will change indices and 476 // CustomObject.getClosePos() will not be acurate anymore. 477 Collections.sort(dubiousElements, new MyComparator()); 478 479 //here we resolve all the dubious Elements... 480 // see the description of makeFinalModifications() method 481 ListIterator listIterator = dubiousElements.listIterator(); 482 while (listIterator.hasNext()){ 483 CustomObject obj = (CustomObject) listIterator.next(); 484 makeFinalModifications(obj); 485 } 486 487 //finally add the XML prolog 488 m_modifier.insert(0,"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"); 489 //Out.println(m_modifier.toString()); 490 /* 491 // get a InputStream from m_modifier and write it into a temp file 492 // finally return the URI of the new XML document 493 ByteArrayInputStream is = new ByteArrayInputStream( 494 m_modifier.toString().getBytes() 495 ); 496 */ 497 // this method is in gate.util package 498 File file = Files.writeTempFile(m_modifier.toString(),"UTF-8"); 499 500 //return m_doc.getSourceURL().toString(); 501 return file.toURL().toString(); 502 }// convert() 503 504 /** 505 * This method tests to see if there are more char to be read 506 * It will return false when there are no more chars to be read 507 */ 508 private boolean thereAreCharsToBeProcessed() { 509 if (m_cursor < m_modifier.length()) return true; 510 else return false; 511 }//thereAreCharsToBeProcessed 512 513 /** 514 * This method reads a char and increments the m_cursor 515 */ 516 private char read(){ 517 return m_modifier.charAt(m_cursor ++); 518 }//read 519 520 /** 521 * This is the action when we finished to read the entire tag 522 * The action means that we put the tag into stack and consider that is empty 523 * as default 524 */ 525 private void performFinalAction(String elemName, int pos) { 526 // create anew CustomObject 527 CustomObject obj = new CustomObject(); 528 529 // set its properties 530 obj.setElemName(elemName); 531 obj.setClosePos(pos); 532 533 // default we consider every element to be empty 534 // in state 1 we modify that if the element is followed by text 535 obj.setEmpty(true); 536 stack.push(obj); 537 } // performFinalAction 538 539 /** 540 * This is the action performed when an end tag is read. 541 * The action consists in colecting all the dubiosElements(elements without 542 * an end tag). They are considered dubious because we don't know if they 543 * are empty or may be closed... Only the DTD can provide this information. 544 * We don't have a DTD so we will consider that all dubious elements 545 * followed by text will close at the end of the text... 546 * If a dubious element is followed by another element then is 547 * automaticaly considered an empty element. 548 * 549 * @param elemName is the the name of the end tag that was read 550 */ 551 private void performActionWithEndElem(String elemName) { 552 CustomObject obj = null; 553 boolean stop = false; 554 555 // get all the elements that are dubious from the stack 556 // the iteration will stop when an element is equal with elemName 557 while (!stack.isEmpty() && !stop){ 558 559 // eliminate the object from the stack 560 obj = (CustomObject) stack.pop(); 561 562 //if its elemName is equal with the param elemName we stop the itteration 563 if (obj.getElemName().equalsIgnoreCase(elemName)) stop = true; 564 565 // otherwhise add the element to the doubiousElements list 566 else dubiousElements.add(obj); 567 } 568 }//performActionWithEndElem 569 570 /** 571 * This method is called after we read the entire SGML document 572 * It resolves the dobious Elements this way: 573 * <ul> 574 * <li> 575 * 1. We don't have a DTD so we will consider that all dubious elements 576 * followed by text will close at the end of the text... 577 * <li> 578 * 2. If a dubious element is followed by another element then is 579 automaticaly considered an empty element. 580 * 581 * An element is considered dubious when we don't know if it is empty 582 * or may be closed... 583 * 584 * @param aCustomObject an object from the dubiousElements list 585 */ 586 private void makeFinalModifications(CustomObject aCustomObject) { 587 String endElement = null; 588 // if the element is empty then we add / before > like this: 589 // <w> -> <w/> 590 if (aCustomObject.isEmpty()) 591 m_modifier.insert(aCustomObject.getClosePos(),"/"); 592 // otherwhise we create an end element 593 // <w> -> </w> 594 else{ 595 // create the end element 596 endElement = "</" + aCustomObject.getElemName() + ">"; 597 // insert it where the closePos indicates 598 m_modifier.insert(aCustomObject.getClosePos(), endElement); 599 } 600 } // makeFinalModifications 601 602 /** 603 * Tests if c is a white space char 604 */ 605 private boolean isWhiteSpace(char c) { 606 return Character.isWhitespace(c); 607 } 608 609 // this is a gate Document... It's content will be transferred to 610 // m_modifier 611 private Document m_doc = null; 612 613 // this is the modifier that will transform an SGML document into an 614 // XML document 615 private StringBuffer m_modifier = null; 616 617 // we need the stack to be able to remember the order of the tags 618 private Stack stack = null; 619 620 // this is a list with all the tags that are not colsed... 621 // some of them are empty tags and some of them are not... 622 private List dubiousElements = null; 623 624 // this is tre current position inside the modifier 625 private int m_cursor = 0; 626 627 // the current state of the SGML2XML automata 628 private int m_currState = 1; 629 630 // the char that was read from the m_modifier @ position m_cursor 631 private char m_currChar = ' '; 632 633 // the fields above are used by the convert method and its auxiliary functions 634 // like doState1...13() 635 636 // indicates the last position of a text character (one which is not a white 637 // space) 638 // it is used in doState1() when we have to decide if an element is empty or 639 // not 640 // We decide that based on this field 641 // If the charPos > 0 then it means that the object from the top of stack 642 // is followed by text and we consider that is not empty 643 private int charPos = 0; 644 645 // is the current tag name 646 private String elemName = null; 647 648 // indicates where in the m_modifier begins the current tag elemName 649 private int elemNameStart = 0; 650 651 // indicates where in the m_modifier ends the current tag elemName 652 // we need that in order to be able to read the current tag name 653 // this name it will be read from m_modifier using the substring() method 654 // it will be something like this : 655 // elemName = m_modifier.substring(elemNameStart,elemNameEnd) 656 // Eg: <w attr1=val1> -> <[elemNameStart]w[elemNameEnd] [attr1=val1> 657 private int elemNameEnd = 0; 658 659 // this is the position there a start tag ends like this: 660 // Eg: <w attr1=val1> -> <w attr1=val1 [closePos]> 661 private int closePos = 0; 662 663 //this is the position where an attribute starts... 664 // we need it when we have to add the defaultAttr (see state 5) 665 private int attrStart = 0; 666 667 //this is the position where an attribute ends... 668 // we need it when we have to add the defaultAttr (see state 5) or to add " 669 // Eg: <w attr1=val1> -> <w [attrStart]attr1[attrEnd]=val1> 670 private int attrEnd = 0; 671 672 // endPair field is used in states 6 and 7.... 673 // When we read something like this : 674 // attr=' val1 val2 val3' endPair remembers what is the pair for the beginning 675 // string 676 // Note that a combination like: attr = ' val1 val2 " will have an unexpected 677 // behaviour... 678 // We need this field when we have the following situation 679 // attr1 = " val1 val2 ' val3" . We need to know what is the end pair for ". 680 // In this case we can't allow ' to be the endPair 681 private char endPair = ' '; 682 683 } // class Sgml2Xml 684 685 /** 686 * The objects belonging to this class are used inside the stack 687 */ 688 class CustomObject { 689 690 // constructor 691 public CustomObject() { 692 elemName = null; 693 closePos = 0; 694 empty = false; 695 } 696 697 // accessor 698 public String getElemName() { 699 return elemName; 700 } 701 702 public int getClosePos() { 703 return closePos; 704 } 705 706 public boolean isEmpty() { 707 return empty; 708 } 709 710 // modifiers 711 void setElemName(String anElemName) { 712 elemName = anElemName; 713 } 714 715 void setClosePos(int aPos){ 716 closePos = aPos; 717 } 718 719 void setEmpty(boolean anEmptyValue) { 720 empty = anEmptyValue; 721 } 722 723 // data fields 724 private String elemName = null; 725 726 private int closePos = 0; 727 728 private boolean empty = false; 729 730 } // CustomObject 731 732 class MyComparator implements Comparator { 733 734 public MyComparator() { 735 } 736 737 public int compare(Object o1, Object o2) { 738 if ( !(o1 instanceof CustomObject) || 739 !(o2 instanceof CustomObject)) return 0; 740 741 CustomObject co1 = (CustomObject) o1; 742 CustomObject co2 = (CustomObject) o2; 743 int result = 0; 744 if (co1.getClosePos() < co2.getClosePos()) result = -1; 745 if (co1.getClosePos() == co2.getClosePos()) result = 0; 746 if (co1.getClosePos() > co2.getClosePos()) result = 1; 747 748 return -result; 749 } // compare 750 751 }// class MyComparator 752