1   /*
2    *  HtmlLinksExtractor.java
3    *
4    *  Copyright (c) 1998-2005, The University of Sheffield.
5    *
6    *  This file is part of GATE (see http://gate.ac.uk/), and is free
7    *  software, licenced under the GNU Library General Public License,
8    *  Version 2, June 1991 (in the distribution as file licence.html,
9    *  and also available at http://gate.ac.uk/gate/licence.html).
10   *
11   *  Cristian URSU,  16/Nov/2001
12   *
13   *  $Id: HtmlLinksExtractor.java,v 1.7 2005/01/11 13:51:37 ian Exp $
14   */
15  
16  package gate.util;
17  
18  import java.io.*;
19  import java.util.*;
20  
21  import javax.swing.text.BadLocationException;
22  import javax.swing.text.MutableAttributeSet;
23  import javax.swing.text.html.HTML;
24  import javax.swing.text.html.HTMLEditorKit;
25  import javax.swing.text.html.HTMLEditorKit.ParserCallback;
26  import javax.swing.text.html.parser.ParserDelegator;
27  
28  /**
29   * This class extracts links from HTML files.
30   * <B>It has been hacked</B> to build the contents of
31   * <A HREF="http://gate.ac.uk/sitemap.html">http://gate.ac.uk/sitemap.html</A>;
32   * you <B>probably don't want to use it</B> for anything else!
33   * <P>
34   * Implements the behaviour of the HTML reader.
35   * Methods of an object of this class are called by the HTML parser when
36   * events will appear.
37   */
38  public class HtmlLinksExtractor extends ParserCallback {
39  
40    /** Debug flag */
41    private static final boolean DEBUG = false;
42  
43    /** The tag currently being processed */
44    private HTML.Tag currentTag = null;
45  
46    /** whether we've done a title before */
47    static boolean firstTitle = true;
48  
49    /** will contain &lt;/UL&gt; after first title */
50    static String endUl = "";
51  
52    /** Name of the file we're currently processing */
53    static String currFile = "";
54  
55    /** Path to the file we're currently processing */
56    static String currPath = "";
57  
58    /** This method is called when the HTML parser encounts the beginning
59      * of a tag that means that the tag is paired by an end tag and it's
60      * not an empty one.
61      */
62    public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
63  
64      currentTag = t;
65      if (HTML.Tag.A == t){
66        Out.pr("<LI><" + t);
67        String href = "";
68        Enumeration e = a.getAttributeNames();
69        while(e.hasMoreElements()) {
70          HTML.Attribute name = (HTML.Attribute) e.nextElement();
71          String value = (String) a.getAttribute(name);
72  
73          if(name == HTML.Attribute.HREF) {
74            if(
75              value.startsWith("http:") || value.startsWith("HTTP:") ||
76              value.startsWith("file:") || value.startsWith("FILE:") ||
77              value.startsWith("mailto:") || value.startsWith("MAILTO:") ||
78              value.startsWith("ftp:") || value.startsWith("FTP:")
79            )
80              Out.pr(" HREF=\"" + value + "\"");
81            else { // if it is a relative path....
82              Out.pr(" HREF=\"" + currPath + "/" + value + "\"");
83            }
84          }
85        } // while
86  
87        Out.pr(">");
88      }// End if
89  
90      if (HTML.Tag.TITLE == t){
91        Out.pr(endUl + "<H3>");
92        if(firstTitle) { firstTitle = false; endUl = "</UL>"; }
93      }// End if
94  
95    }//handleStartTag
96  
97    private void printAttributes(MutableAttributeSet a){
98      if (a == null) return;
99      // Take all the attributes an put them into the feature map
100     if (0 != a.getAttributeCount()){
101       Enumeration enumeration = a.getAttributeNames();
102       while (enumeration.hasMoreElements()){
103         Object attribute = enumeration.nextElement();
104         Out.pr(" "+ attribute.toString() + "=\"" +
105                                   a.getAttribute(attribute).toString()+"\"");
106       }// End while
107     }// End if
108   }// printAttributes();
109 
110    /** This method is called when the HTML parser encounts the end of a tag
111      * that means that the tag is paired by a beginning tag
112      */
113   public void handleEndTag(HTML.Tag t, int pos){
114     currentTag = null;
115 
116     if (HTML.Tag.A == t)
117       Out.pr("</"+t+">\n");
118     if (HTML.Tag.TITLE == t)
119       Out.pr(
120         "</H3></A>\n\n<P>Links in: <A HREF=\"" + currFile +
121         "\">" + currFile + "</A>:\n<UL>\n"
122       );
123 
124   }//handleEndTag
125 
126   /** This method is called when the HTML parser encounts an empty tag
127     */
128   public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
129     if (HTML.Tag.A == t){
130       Out.pr("<"+t);
131       printAttributes(a);
132       Out.pr("/>\n");
133     }// End if
134 
135     if (HTML.Tag.TITLE == t){
136       Out.pr("<"+t);
137       printAttributes(a);
138       Out.pr("/>\n");
139     }// End if
140   } // handleSimpleTag
141 
142   /** This method is called when the HTML parser encounts text (PCDATA)*/
143   public void handleText(char[] text, int pos){
144 
145     if(HTML.Tag.A == currentTag){
146       //text of tag A
147       String tagText = new String(text);
148       Out.pr(tagText);
149     }// End if
150 
151     if(HTML.Tag.TITLE == currentTag){
152       //text of tag A
153       String tagText = new String(text);
154       Out.pr(tagText);
155     }// End if
156 
157   }// end handleText();
158 
159   /**
160     * This method is called when the HTML parser encounts an error
161     * it depends on the programmer if he wants to deal with that error
162     */
163   public void handleError(String errorMsg, int pos) {
164     //Out.println ("ERROR CALLED : " + errorMsg);
165   }
166 
167   /** This method is called once, when the HTML parser reaches the end
168     * of its input streamin order to notify the parserCallback that there
169     * is nothing more to parse.
170     */
171   public void flush() throws BadLocationException{
172   }// flush
173 
174   /** This method is called when the HTML parser encounts a comment
175     */
176   public void handleComment(char[] text, int pos) {
177   }
178 
179   /**
180    * Given a certain folder it lists recursively all the files contained
181    * in that folder. It returns a list of strings representing the file
182    * names
183    */
184   private static List listAllFiles(File aFile, Set foldersToIgnore){
185     java.util.List sgmlFileNames = new ArrayList();
186     java.util.List foldersToExplore = new ArrayList();
187     if (!aFile.isDirectory()){
188       // add the file to the file list
189       sgmlFileNames.add(aFile.getPath());
190       return sgmlFileNames;
191     }// End if
192     listFilesRec(aFile,sgmlFileNames,foldersToExplore, foldersToIgnore);
193     return sgmlFileNames;
194   } // listAllFiles();
195 
196   /** Helper method for listAllFiles */
197   private static void listFilesRec(File aFile,
198                                   java.util.List fileNames,
199                                   java.util.List foldersToExplore,
200                                   Set foldersToIgnore){
201 
202     String[] fileList = aFile.list();
203     for (int i=0; i< fileList.length; i++){
204       File tmpFile = new File(aFile.getPath()+"\\"+fileList[i]);
205       if (tmpFile.isDirectory()){
206         // If the file is not included
207         if (!foldersToIgnore.contains(tmpFile.getName())) {  //fileList[i])) {
208           if(DEBUG) {
209             Err.prln("adding dir: " + tmpFile);
210             Err.prln("  name: " + tmpFile.getName());
211           }
212           foldersToExplore.add(tmpFile);
213         }
214       }else{
215         // only process .html files
216         if(
217           ( fileList[i].toLowerCase().endsWith(".html") ) ||
218           ( fileList[i].toLowerCase().endsWith(".htm") )
219         ) fileNames.add(tmpFile.getPath());
220       }// End if
221     }// End for
222 
223     while(!foldersToExplore.isEmpty()){
224       File folder = (File)foldersToExplore.get(0);
225       foldersToExplore.remove(0);
226       listFilesRec(folder,fileNames,foldersToExplore,foldersToIgnore);
227     }//End while
228 
229   } // listFilesRec();
230 
231   /** Extract links from all .html files below a directory */
232   public static void main(String[] args){
233     HTMLEditorKit.Parser  parser = new ParserDelegator();
234     // create a new Htmldocument handler
235     HtmlLinksExtractor htmlDocHandler = new HtmlLinksExtractor();
236 
237     if (args.length == 0){
238       Out.prln(
239         "Eg: java HtmlLinksExtractor g:\\tmp\\relative javadoc img > results.txt"
240       );
241       return;
242     }
243     // Create a folder file File
244     File htmlFolder = new File(args[0]);
245     Set foldersToIgnore = new HashSet();
246     for(int i = 1; i<args.length; i++)
247       foldersToIgnore.add(args[i]);
248 
249     List htmlFileNames = listAllFiles(htmlFolder,foldersToIgnore);
250     //Collections.sort(htmlFileNames);
251     while (!htmlFileNames.isEmpty()){
252       try{
253         String htmlFileName = (String) htmlFileNames.get(0);
254         currFile = htmlFileName;
255         currPath = new File(currFile).getParent().toString();
256         htmlFileNames.remove(0);
257 
258         Out.prln("\n\n<A HREF=\"file://" + htmlFileName + "\">");
259         Reader reader = new FileReader(htmlFileName);
260         // parse the HTML document
261         parser.parse(reader, htmlDocHandler, true);
262       } catch (IOException e){
263         e.printStackTrace(System.out);
264       }// End try
265     }// End while
266     System.err.println("done.");
267   }// main
268 
269 }//End class HtmlLinksExtractor
270 
271 
272 
273