1
15
16 package gate.util;
17
18 import java.io.*;
19 import java.util.*;
20
21 import javax.swing.text.BadLocationException;
22 import javax.swing.text.MutableAttributeSet;
23 import javax.swing.text.html.HTML;
24 import javax.swing.text.html.HTMLEditorKit;
25 import javax.swing.text.html.HTMLEditorKit.ParserCallback;
26 import javax.swing.text.html.parser.ParserDelegator;
27
28
38 public class HtmlLinksExtractor extends ParserCallback {
39
40
41 private static final boolean DEBUG = false;
42
43
44 private HTML.Tag currentTag = null;
45
46
47 static boolean firstTitle = true;
48
49
50 static String endUl = "";
51
52
53 static String currFile = "";
54
55
56 static String currPath = "";
57
58
62 public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
63
64 currentTag = t;
65 if (HTML.Tag.A == t){
66 Out.pr("<LI><" + t);
67 String href = "";
68 Enumeration e = a.getAttributeNames();
69 while(e.hasMoreElements()) {
70 HTML.Attribute name = (HTML.Attribute) e.nextElement();
71 String value = (String) a.getAttribute(name);
72
73 if(name == HTML.Attribute.HREF) {
74 if(
75 value.startsWith("http:") || value.startsWith("HTTP:") ||
76 value.startsWith("file:") || value.startsWith("FILE:") ||
77 value.startsWith("mailto:") || value.startsWith("MAILTO:") ||
78 value.startsWith("ftp:") || value.startsWith("FTP:")
79 )
80 Out.pr(" HREF=\"" + value + "\"");
81 else { Out.pr(" HREF=\"" + currPath + "/" + value + "\"");
83 }
84 }
85 }
87 Out.pr(">");
88 }
90 if (HTML.Tag.TITLE == t){
91 Out.pr(endUl + "<H3>");
92 if(firstTitle) { firstTitle = false; endUl = "</UL>"; }
93 }
95 }
97 private void printAttributes(MutableAttributeSet a){
98 if (a == null) return;
99 if (0 != a.getAttributeCount()){
101 Enumeration enumeration = a.getAttributeNames();
102 while (enumeration.hasMoreElements()){
103 Object attribute = enumeration.nextElement();
104 Out.pr(" "+ attribute.toString() + "=\"" +
105 a.getAttribute(attribute).toString()+"\"");
106 } } }
110
113 public void handleEndTag(HTML.Tag t, int pos){
114 currentTag = null;
115
116 if (HTML.Tag.A == t)
117 Out.pr("</"+t+">\n");
118 if (HTML.Tag.TITLE == t)
119 Out.pr(
120 "</H3></A>\n\n<P>Links in: <A HREF=\"" + currFile +
121 "\">" + currFile + "</A>:\n<UL>\n"
122 );
123
124 }
126
128 public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos){
129 if (HTML.Tag.A == t){
130 Out.pr("<"+t);
131 printAttributes(a);
132 Out.pr("/>\n");
133 }
135 if (HTML.Tag.TITLE == t){
136 Out.pr("<"+t);
137 printAttributes(a);
138 Out.pr("/>\n");
139 } }
142
143 public void handleText(char[] text, int pos){
144
145 if(HTML.Tag.A == currentTag){
146 String tagText = new String(text);
148 Out.pr(tagText);
149 }
151 if(HTML.Tag.TITLE == currentTag){
152 String tagText = new String(text);
154 Out.pr(tagText);
155 }
157 }
159
163 public void handleError(String errorMsg, int pos) {
164 }
166
167
171 public void flush() throws BadLocationException{
172 }
174
176 public void handleComment(char[] text, int pos) {
177 }
178
179
184 private static List listAllFiles(File aFile, Set foldersToIgnore){
185 java.util.List sgmlFileNames = new ArrayList();
186 java.util.List foldersToExplore = new ArrayList();
187 if (!aFile.isDirectory()){
188 sgmlFileNames.add(aFile.getPath());
190 return sgmlFileNames;
191 } listFilesRec(aFile,sgmlFileNames,foldersToExplore, foldersToIgnore);
193 return sgmlFileNames;
194 }
196
197 private static void listFilesRec(File aFile,
198 java.util.List fileNames,
199 java.util.List foldersToExplore,
200 Set foldersToIgnore){
201
202 String[] fileList = aFile.list();
203 for (int i=0; i< fileList.length; i++){
204 File tmpFile = new File(aFile.getPath()+"\\"+fileList[i]);
205 if (tmpFile.isDirectory()){
206 if (!foldersToIgnore.contains(tmpFile.getName())) { if(DEBUG) {
209 Err.prln("adding dir: " + tmpFile);
210 Err.prln(" name: " + tmpFile.getName());
211 }
212 foldersToExplore.add(tmpFile);
213 }
214 }else{
215 if(
217 ( fileList[i].toLowerCase().endsWith(".html") ) ||
218 ( fileList[i].toLowerCase().endsWith(".htm") )
219 ) fileNames.add(tmpFile.getPath());
220 } }
223 while(!foldersToExplore.isEmpty()){
224 File folder = (File)foldersToExplore.get(0);
225 foldersToExplore.remove(0);
226 listFilesRec(folder,fileNames,foldersToExplore,foldersToIgnore);
227 }
229 }
231
232 public static void main(String[] args){
233 HTMLEditorKit.Parser parser = new ParserDelegator();
234 HtmlLinksExtractor htmlDocHandler = new HtmlLinksExtractor();
236
237 if (args.length == 0){
238 Out.prln(
239 "Eg: java HtmlLinksExtractor g:\\tmp\\relative javadoc img > results.txt"
240 );
241 return;
242 }
243 File htmlFolder = new File(args[0]);
245 Set foldersToIgnore = new HashSet();
246 for(int i = 1; i<args.length; i++)
247 foldersToIgnore.add(args[i]);
248
249 List htmlFileNames = listAllFiles(htmlFolder,foldersToIgnore);
250 while (!htmlFileNames.isEmpty()){
252 try{
253 String htmlFileName = (String) htmlFileNames.get(0);
254 currFile = htmlFileName;
255 currPath = new File(currFile).getParent().toString();
256 htmlFileNames.remove(0);
257
258 Out.prln("\n\n<A HREF=\"file://" + htmlFileName + "\">");
259 Reader reader = new FileReader(htmlFileName);
260 parser.parse(reader, htmlDocHandler, true);
262 } catch (IOException e){
263 e.printStackTrace(System.out);
264 } } System.err.println("done.");
267 }
269 }
271
272
273