1   /*
2    *  Sgml2Xml.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  4/July/2000
12   *
13   *  $Id: Sgml2Xml.java,v 1.16 2005/01/11 13:51:36 ian Exp $
14   */
15  
16  package gate.sgml;
17  
18  import java.io.File;
19  import java.io.IOException;
20  import java.net.MalformedURLException;
21  import java.util.*;
22  
23  import gate.Document;
24  import gate.util.Files;
25  
26  
27  /**
28    * Not so fast...
29    * This class is not a realy Sgml2Xml convertor.
30    * It takes an SGML document and tries to prepare it for an XML parser
31    * For a true conversion we need an Java SGML parser...
32    * If you know one let me know....
33    *
34    * What does it do:
35    * <ul>
36    *  <li>If it finds something like this : &lt;element attribute = value&gt;
37    *      it will produce: &lt;element attribute = "value"&gt;
38    *  <li>If it finds something like this : &lt;element something
39    *      attribute2=value&gt;it will produce : &lt;element
40    *      defaultAttribute="something" attribute2="value"&gt;
41    *  <li>If it finds : &lt;element att1='value1 value2' att2="value2
42    *      value3"&gt; it will produce: &lt;element att1="value1 value2"
43    *      att2="value2 value3"&gt;
44    *  <li>If it finds : &lt;element1&gt; &lt;elem&gt;text &lt;/element1&gt;
45    *      will produce: &lt;element1&gt; &lt;elem&gt;text&lt;elem&gt;
46    *      &lt;/element1&gt;
47    *  <li>If it find : &lt;element1&gt; &lt;elem&gt;[white spaces]
48    *      &lt;/element1&gt;,
49    *      it will produce:&lt;element1&gt; &lt;elem/&gt;[white spaces]&lt;
50    *      /element1&gt;
51    * </ul>
52    * What doesn't:
53    * <ul>
54    *  <li>Doesn't expand the entities. So the entities from the SGML document
55    *      must be resolved by the XML parser
56    *  <li>Doesn't replace internal entities with their corresponding value
57    * </ul>
58    */
59  
60  public class Sgml2Xml{
61  
62    /** Debug flag */
63    private static final boolean DEBUG = false;
64  
65    /**
66      * The constructor initialises some member fields
67      * @param SgmlDoc the content of the Sgml document that will be modified
68      */
69    public Sgml2Xml(String SgmlDoc){
70      // create a new modifier
71      m_modifier = new StringBuffer(SgmlDoc);
72      // create a new dobiousElements list
73      // se the explanatin at the end of the class
74      dubiousElements = new ArrayList();
75      stack = new Stack();
76    }
77  
78    /**
79      * The other constructor
80      * @param doc The Gate document that will be transformed to XML
81      */
82    public Sgml2Xml(Document doc){
83      // set as a member
84      m_doc = doc;
85  
86      // create a new modifier
87      m_modifier = new StringBuffer(m_doc.getContent().toString());
88  
89      // create a new dobiousElements list
90      // se the explanatin at the end of the class
91      dubiousElements = new ArrayList();
92      stack = new Stack();
93  
94    }
95  
96  /*  I keep this just in case I need some more debuging
97  
98    public static void main(String[] args){
99      Sgml2Xml convertor =
100       new Sgml2Xml("<w VVI='res trtetre\" relu = \"stop\">say
101       <w VBZ>is\n<trunc> <w UNC>th </trunc>");
102     try{
103       Out.println(convertor.convert());
104     } catch (Exception e){
105       e.printStackTrace(Err.getPrintWriter());
106     }
107   }
108   */
109 
110   /**
111     * It analises the char that was red in state 1
112     * If it finds '<' it then goes to state 2
113     * Otherwise it stays in state 1 and keeps track about the text that is not
114     * white spaces.
115     */
116   private void doState1(char currChar){
117     if ('<' == currChar){
118       // change to state 2
119       m_currState = 2;
120       if (!stack.isEmpty()){
121         // peek the element from the top of the stack
122         CustomObject o = (CustomObject) stack.peek();
123         // set some properties for this element
124         // first test to find out if text folows this element charPos > 0
125         if (charPos > 0){
126           // this is not an empty element because there is text that follows
127           // set the element from the top of the stack to be a non empty one
128           o.setClosePos(charPos);
129           o.setEmpty(false);
130           // reset the charPos
131           charPos = 0;
132         }//if (charPos > 0)
133       }//if (!stack.isEmpty())
134     }//if ('<' == m_currChar)
135     // if currChar is not whiteSpace then save the position of the last
136     // char that was read
137     if (('<' != currChar) && !isWhiteSpace(currChar))
138       charPos = m_cursor;
139   }//doState1
140 
141   /**
142     We came from state 1 and just read '<'
143     If currChar == '/' -> state 11
144     If is a char != white spaces -> state 3
145     stay in state 2 while there are only white spaces
146   */
147   private void doState2(char currChar){
148     if ('/' == currChar){
149       // go to state 11
150       m_currState = 11;
151     }
152     // if currChar is a char != white spaces  then go to state 3
153     if (('/' != m_currChar) && !isWhiteSpace(m_currChar)){
154       // save the position where starts the element's name
155       // we need that in order to be able to read the current tag name
156       // this name it will be read from m_modifier using the substring() method
157       elemNameStart = m_cursor -1;
158       // go to state 3
159       m_currState = 3;
160     }
161   }// doState2
162 
163   /**
164     * Just read the first char from the element's name and now analize the next
165     * char.
166     * If '>' the elem name was a single char -> state 1
167     * IF is WhiteSpaces -> state 4
168     * Otherwise stay in state 3 and read the elemnt's name
169     */
170   private void doState3(char currChar){
171     if ( '>' == currChar ){
172 
173       // save the pos where the element's name ends
174       elemNameEnd = m_cursor - 1;
175 
176       // this is also the pos where to insert '/' for empty elements.
177       // In this case we have this situation <w> sau < w>
178       closePos = m_cursor - 1;
179 
180       // get the name of the element
181       elemName = m_modifier.substring(elemNameStart,elemNameEnd);
182 
183       // we put the element into stack
184       // we think in this point that the element is empty...
185       performFinalAction(elemName, closePos);
186 
187       // go to state 1
188       m_currState = 1;
189     }
190     if (isWhiteSpace(currChar)){
191       // go to state 4
192       m_currState = 4;
193 
194       // save the pos where the element's name ends
195       elemNameEnd = m_cursor - 1;
196 
197       // get the name of the element
198       elemName = m_modifier.substring(elemNameStart,elemNameEnd);
199     }
200   }// doState3
201 
202   /**
203     * We read the name of the element and we prepare for '>' or attributes
204     * '>' -> state 1
205     * any char !- white space -> state 5
206     */
207   private void doState4(char currChar){
208     if ( '>' == currChar ){
209       // this is also the pos where to insert '/' for empty elements in this case
210       closePos = m_cursor -1 ;
211 
212       // we put the element into stack
213       // we think in this point that the element is empty...
214       performFinalAction(elemName, closePos);
215 
216       // go to state 1
217       m_currState = 1;
218     }
219     if (( '>' != currChar ) && !isWhiteSpace(currChar)){
220       // we just read the first char from the attrib name or attrib value..
221       // go to state 5
222       m_currState = 5;
223 
224       // remember the position where starts the attrib or the value of an attrib
225       attrStart = m_cursor - 1;
226     }
227   } // doState4
228 
229   /**
230     * '=' -> state 6
231     * '>' -> state 4 (we didn't read an attribute but a value of the
232     * defaultAtt )
233     * WS (white spaces) we don't know yet if we read an attribute or the value
234     * of the defaultAttr -> state 10
235     * This state modifies the content onf m_modifier ... it adds text
236     */
237   private void doState5(char currChar){
238     if ( '=' == currChar )
239           m_currState = 6;
240     if ( '>' == currChar ){
241       // this mean that the attribute was a value and we have to create
242       // a default attribute
243       // the same as in state 10
244       attrEnd = m_cursor - 1 ;
245       m_modifier.insert(attrEnd,'"');
246       m_modifier.insert(attrStart,"defaultAttr=\"");
247 
248       // go to state 4
249       m_currState = 4;
250 
251       // parse again the entire sequence from state 4 before reading any char
252       m_cursor = attrStart;
253     }
254     if (isWhiteSpace(currChar)){
255       // go to state 10
256       m_currState = 10;
257 
258       // record the position where ends this attribute
259       attrEnd = m_cursor - 1;
260     }
261   } // doState5
262 
263   /**
264     * IF we read ' or " then we have to get prepared to read everything until
265     * the next ' or "
266     * If we read a char then -> state 8;
267     * Stay here while we read WS
268     */
269   private void doState6(char currChar){
270     if ( ('\'' == currChar) || ('"' == currChar) ){
271       endPair = currChar;
272       if ('\'' == currChar){
273 
274         // we have to replace ' with "
275         m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\"");
276       }
277       m_currState = 7;
278     }
279     if ( ('\'' != currChar) && ('"' != currChar) && !isWhiteSpace(currChar)){
280 
281       // this means that curChar is any char
282       m_currState = 8;
283 
284       // every value must be inside this pair""
285       m_modifier.insert(m_cursor - 1, '"');
286 
287       // insert implies the modification of m_cursor
288       // we increment m_cursor in order to say in the same position and to
289       // anulate the efect of insert.
290       m_cursor ++;
291     }
292   }// doState6
293 
294   /**
295     * If we find the pair ' or " go to state 9
296     * Otherwhise read everything and stay in state 7
297     * If in state 7 we read '>' then we add automaticaly a " at the end and go
298     * to state 1
299     */
300   private void doState7(char currChar){
301     //if ( ('\'' == currChar) || ('"' == currChar) ){
302     if ( endPair == currChar ){
303       if ('\'' == currChar){
304 
305         // we have to replace ' with "
306         m_modifier = m_modifier.replace(m_cursor - 1, m_cursor,"\"");
307       }
308       // reset the endPair
309       endPair = ' ';
310       m_currState = 9;
311     }
312 
313     if ('>' == currChar){
314       // go to state 1
315       m_currState = 1;
316 
317       // insert the final " ata the end
318       m_modifier.insert(m_cursor - 1, '"');
319 
320       // go to te current possition (because of insert)
321       m_cursor ++;
322 
323       performFinalAction(elemName, m_cursor - 1);
324     }
325 
326   }// doState7
327 
328   /**
329     * If '>' go to state 1
330     * If WS go to state 9
331     * Stays in state 8 and read the attribute's value
332     */
333   private void doState8(char currChar){
334 
335     if ('>' == currChar){
336       // go to state 1
337       m_currState = 1;
338 
339       // complete the end " ( <elem attr="value> )
340       m_modifier.insert(m_cursor - 1, '"');
341 
342       // go to te current possition (because of insert)
343       m_cursor ++;
344 
345       // we finished to read a beggining tag
346       // see the method definition for more details
347       performFinalAction(elemName, m_cursor - 1);
348     }
349     if (isWhiteSpace(currChar)){
350       // go to state 9
351       m_currState = 9;
352 
353       // add the ending " char
354       m_modifier.insert(m_cursor - 1, '"');
355 
356       // increment the cursor in order to anulate the effect of insert
357       m_cursor ++;
358     }
359   } // doState8
360   /**
361     * Here we prepare to read another attrib, value pair (any char -> state 5)
362     * If '>' we just read a beggining tag -> state 1
363     * Stay here while read WS
364     */
365   private void doState9(char currChar){
366     if ('>' == currChar){
367       // go to state 1
368       m_currState = 1;
369 
370       // add the object to the stack
371       performFinalAction(elemName, m_cursor - 1);
372     }
373     if (('>' != currChar) && !isWhiteSpace(m_currChar)){
374       // this is the same as state 4->5
375       m_currState = 5;
376       attrStart = m_cursor - 1;
377     }
378   }//doState9
379 
380   /**
381     * If any C -> state 4
382     * If '=' state 6
383     * Stays here while reads WS
384     */
385   private void doState10(char currChar){
386     if ('=' == currChar)
387              m_currState = 6;
388     if ( ('=' != currChar) && !isWhiteSpace(currChar)){
389       // this mean that the attribute was a value and we have to create
390       // a default attribute
391       m_modifier.insert(attrEnd,'"');
392       m_modifier.insert(attrStart,"defaultAttr=\"");
393 
394       // go to state 4
395       m_currState = 4;
396 
397       m_cursor = attrStart;
398     }
399   }// doState10
400 
401   /**
402     * We are preparing to read the and definition of an element
403     * Stays in this state while reading WS
404     */
405   private void doState11(char currChar){
406     if (!isWhiteSpace(currChar)){
407       m_currState = 12;
408       elemNameStart = m_cursor - 1;
409     }
410   } // doState11
411 
412   /**
413     * Here we read the element's name ...this is an end tag
414     * Stays here while reads a char
415     */
416   private void doState12(char currChar) {
417     if ('>' == currChar){
418       elemNameEnd = m_cursor - 1;
419       elemName = m_modifier.substring(elemNameStart,elemNameEnd);
420       performActionWithEndElem(elemName);
421       m_currState = 1;
422     }
423     if (isWhiteSpace(currChar)){
424       m_currState = 13;
425       elemNameEnd = m_cursor - 1;
426     }
427   }//doState12
428 
429   /**
430     * If '>' -> state 1
431     * Stays here while reads WS
432     */
433   private void doState13(char currChar) {
434     if ('>' == currChar){
435       elemName = m_modifier.substring(elemNameStart,elemNameEnd);
436       performActionWithEndElem(elemName);
437       m_currState = 1;
438     }
439   } // doState13
440 
441   /**
442     This method is responsable with document conversion
443   */
444   public String convert()throws IOException,MalformedURLException {
445     while (thereAreCharsToBeProcessed()) {
446       // read() gets the next char and increment the m_cursor
447       m_currChar = read();
448       switch(m_currState){
449         case 1:   doState1(m_currChar);break;
450         case 2:   doState2(m_currChar);break;
451         case 3:   doState3(m_currChar);break;
452         case 4:   doState4(m_currChar);break;
453         case 5:   doState5(m_currChar);break;
454         case 6:   doState6(m_currChar);break;
455         case 7:   doState7(m_currChar);break;
456         case 8:   doState8(m_currChar);break;
457         case 9:   doState9(m_currChar);break;
458         case 10:  doState10(m_currChar);break;
459         case 11:  doState11(m_currChar);break;
460         case 12:  doState12(m_currChar);break;
461         case 13:  doState13(m_currChar);break;
462       }// switch(m_currState)
463     }// while (thereAreCharsToBeProcessed())
464 
465     // put all the elements from the stack into the dubiousElements list
466     // we do that in order to colect all the dubious elements
467     while (!stack.isEmpty()) {
468       CustomObject obj = (CustomObject) stack.pop();
469       dubiousElements.add(obj);
470     }
471 
472     // sort the dubiousElements list descending on closePos...
473     // This is vital for the alghorithm because we have to make
474     // all the modifications from the bottom to the top...
475     // If we fail to do that, insert will change indices and
476     // CustomObject.getClosePos() will not be acurate anymore.
477     Collections.sort(dubiousElements, new MyComparator());
478 
479     //here we resolve all the dubious Elements...
480     // see the description of makeFinalModifications() method
481     ListIterator listIterator = dubiousElements.listIterator();
482     while (listIterator.hasNext()){
483       CustomObject obj = (CustomObject) listIterator.next();
484       makeFinalModifications(obj);
485     }
486 
487     //finally add the XML prolog
488     m_modifier.insert(0,"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
489     //Out.println(m_modifier.toString());
490 /*
491     // get a InputStream from m_modifier and write it into a temp file
492     // finally return the URI of the new XML document
493     ByteArrayInputStream is = new ByteArrayInputStream(
494                                               m_modifier.toString().getBytes()
495                                                        );
496 */
497     // this method is in gate.util package
498     File file = Files.writeTempFile(m_modifier.toString(),"UTF-8");
499 
500     //return m_doc.getSourceURL().toString();
501     return file.toURL().toString();
502   }// convert()
503 
504   /**
505     * This method tests to see if there are more char to be read
506     * It will return false when there are no more chars to be read
507     */
508   private boolean thereAreCharsToBeProcessed() {
509     if (m_cursor < m_modifier.length()) return true;
510     else return false;
511   }//thereAreCharsToBeProcessed
512 
513   /**
514     * This method reads a char and increments the m_cursor
515     */
516   private char read(){
517     return m_modifier.charAt(m_cursor ++);
518   }//read
519 
520   /**
521     * This is the action when we finished to read the entire tag
522     * The action means that we put the tag into stack and consider that is empty
523     * as default
524     */
525   private void performFinalAction(String elemName, int pos) {
526     // create anew CustomObject
527     CustomObject obj = new CustomObject();
528 
529     // set its properties
530     obj.setElemName(elemName);
531     obj.setClosePos(pos);
532 
533     // default we consider every element to be empty
534     // in state 1 we modify that if the element is followed by text
535     obj.setEmpty(true);
536     stack.push(obj);
537   } // performFinalAction
538 
539   /**
540     * This is the action performed when an end tag is read.
541     * The action consists in colecting all the dubiosElements(elements without
542     * an end tag). They are considered dubious because we don't know if they
543     * are empty or may be closed... Only the DTD can provide this information.
544     * We don't have a DTD so we will consider that all dubious elements
545     * followed by text will close at the end of the text...
546     * If a dubious element is followed by another element then is
547     * automaticaly considered an empty element.
548     *
549     * @param elemName is the the name of the end tag that was read
550     */
551   private void performActionWithEndElem(String elemName) {
552     CustomObject obj    = null;
553     boolean      stop = false;
554 
555     // get all the elements that are dubious from the stack
556     // the iteration will stop when an element is equal with elemName
557     while (!stack.isEmpty() && !stop){
558 
559       // eliminate the object from the stack
560       obj = (CustomObject) stack.pop();
561 
562       //if its elemName is equal with the param elemName we stop the itteration
563       if (obj.getElemName().equalsIgnoreCase(elemName)) stop = true;
564 
565       // otherwhise add the element to the doubiousElements list
566       else dubiousElements.add(obj);
567     }
568   }//performActionWithEndElem
569 
570   /**
571     * This method is called after we read the entire SGML document
572     * It resolves the dobious Elements this way:
573     * <ul>
574     * <li>
575     * 1. We don't have a DTD so we will consider that all dubious elements
576     *    followed by text will close at the end of the text...
577     * <li>
578     * 2. If a dubious element is followed by another element then is
579         automaticaly considered an empty element.
580     *
581     * An element is considered dubious when we don't know if it is  empty
582     * or may be closed...
583     *
584     * @param aCustomObject an object from the dubiousElements list
585     */
586   private void makeFinalModifications(CustomObject aCustomObject) {
587     String endElement = null;
588     // if the element is empty then we add / before > like this:
589     // <w> -> <w/>
590     if (aCustomObject.isEmpty())
591         m_modifier.insert(aCustomObject.getClosePos(),"/");
592     // otherwhise we create an end element
593     // <w> -> </w>
594     else{
595       // create the end element
596       endElement = "</" + aCustomObject.getElemName() + ">";
597       // insert it where the closePos indicates
598       m_modifier.insert(aCustomObject.getClosePos(), endElement);
599     }
600   } // makeFinalModifications
601 
602   /**
603     * Tests if c is a white space char
604     */
605   private boolean isWhiteSpace(char c) {
606     return Character.isWhitespace(c);
607   }
608 
609   // this is a gate Document... It's content will be transferred to
610   // m_modifier
611   private Document m_doc = null;
612 
613   // this is the modifier that will transform an SGML document into an
614   // XML document
615   private StringBuffer m_modifier = null;
616 
617   // we need the stack to be able to remember the order of the tags
618   private Stack stack = null;
619 
620   // this is a list with all the tags that are not colsed...
621   // some of them are empty tags and some of them are not...
622   private List dubiousElements = null;
623 
624   // this is tre current position inside the modifier
625   private int m_cursor = 0;
626 
627   // the current state of the SGML2XML automata
628   private int m_currState = 1;
629 
630   // the char that was read from the m_modifier @ position m_cursor
631   private char m_currChar = ' ';
632 
633   // the fields above are used by the convert method and its auxiliary functions
634   // like doState1...13()
635 
636   // indicates the last position of a text character (one which is not a white
637   // space)
638   // it is used in doState1() when we have to decide if an element is empty or
639   // not
640   // We decide that based on this field
641   // If the charPos > 0 then it means that the object from the top of stack
642   // is followed by text and we consider that is not empty
643   private int charPos = 0;
644 
645   // is the current tag name
646   private String elemName = null;
647 
648   // indicates where in the m_modifier begins the current tag elemName
649   private int elemNameStart = 0;
650 
651   // indicates where in the m_modifier ends the current tag elemName
652   // we need that in order to be able to read the current tag name
653   // this name it will be read from m_modifier using the substring() method
654   // it will be something like this :
655   // elemName = m_modifier.substring(elemNameStart,elemNameEnd)
656   // Eg: <w attr1=val1> -> <[elemNameStart]w[elemNameEnd] [attr1=val1>
657   private int elemNameEnd = 0;
658 
659   // this is the position there a start tag ends like this:
660   // Eg: <w attr1=val1>  -> <w attr1=val1 [closePos]>
661   private int closePos = 0;
662 
663   //this is the position where an attribute starts...
664   // we need it when we have to add the defaultAttr (see state 5)
665   private int attrStart = 0;
666 
667     //this is the position where an attribute ends...
668   // we need it when we have to add the defaultAttr (see state 5) or to add "
669   // Eg: <w attr1=val1> -> <w [attrStart]attr1[attrEnd]=val1>
670   private int attrEnd = 0;
671 
672   // endPair field is used in states 6 and 7....
673   // When we read something like this :
674   // attr=' val1 val2 val3' endPair remembers what is the pair for the beginning
675   // string
676   // Note that a combination like: attr = ' val1 val2 " will have an unexpected
677   // behaviour...
678   // We need this field when we have the following situation
679   // attr1 = " val1 val2 ' val3" . We need to know what is the end pair for ".
680   // In this case we can't allow ' to be the endPair
681   private char endPair = ' ';
682 
683 } // class Sgml2Xml
684 
/**
  * Describes one start tag that was read from the document. The objects
  * belonging to this class are the ones kept inside the stack: each records
  * the name of the element, a position inside the buffer and whether the
  * element is currently considered empty.
  */
class CustomObject {

  // the name of the element
  private String elemName;

  // a position inside the buffer associated with the element
  private int closePos;

  // true when the element is considered to be empty
  private boolean empty;

  /** Creates an object with no name, position 0 and the empty flag off. */
  public CustomObject() {
    this.elemName = null;
    this.closePos = 0;
    this.empty = false;
  }

  /** @return the name of the element, null if it was never set */
  public String getElemName() {
    return this.elemName;
  }

  /** @return the position recorded for this element */
  public int getClosePos() {
    return this.closePos;
  }

  /** @return true if the element is marked as empty */
  public boolean isEmpty() {
    return this.empty;
  }

  /** Sets the name of the element. */
  void setElemName(String anElemName) {
    this.elemName = anElemName;
  }

  /** Sets the position recorded for this element. */
  void setClosePos(int aPos){
    this.closePos = aPos;
  }

  /** Marks the element as empty (true) or not empty (false). */
  void setEmpty(boolean anEmptyValue) {
    this.empty = anEmptyValue;
  }

} // CustomObject
731 
732 class MyComparator implements Comparator {
733 
734   public MyComparator() {
735   }
736 
737   public int compare(Object o1, Object o2) {
738     if ( !(o1 instanceof CustomObject) ||
739          !(o2 instanceof CustomObject)) return 0;
740 
741     CustomObject co1 = (CustomObject) o1;
742     CustomObject co2 = (CustomObject) o2;
743     int result = 0;
744     if (co1.getClosePos() <   co2.getClosePos())  result = -1;
745     if (co1.getClosePos() ==  co2.getClosePos())  result =  0;
746     if (co1.getClosePos() >   co2.getClosePos())  result =  1;
747 
748     return -result;
749   } // compare
750 
751 }// class MyComparator
752