public class HTMLTextParser { FileInputStream fin = null; StringBuffer TextBuffer = null; InputSource inSource = null;
// HTMLTextParser Constructor 2 public HTMLTextParser() { } //Gets the text content from Nodes recursively void processNode(Node node) { if (node == null) return;
//Process a text node if (node.getNodeType() == node.TEXT_NODE) { TextBuffer.append(node.getNodeValue()); } else if (node.hasChildNodes()) { //Process the Node's children NodeList childList = node.getChildNodes(); int childLen = childList.getLength(); for (int count = 0; count < childLen; count ++) processNode(childList.item(count)); } else return; } // Extracts text from HTML Document String htmltoText(String fileName) { DOMFragmentParser parser = new DOMFragmentParser();
System.out.println("Parsing text from HTML file " + fileName + "...."); File f = new File(fileName);
if (!f.isFile()) { System.out.println("File " + fileName + " does not exist."); return null; } try { fin = new FileInputStream(f); } catch (Exception e) { System.out.println("Unable to open HTML file " + fileName + " for reading."); return null; } try { inSource = new InputSource(fin); } catch (Exception e) { System.out.println("Unable to open Input source from HTML file " + fileName); return null; } CoreDocumentImpl codeDoc = new CoreDocumentImpl(); DocumentFragment doc = codeDoc.createDocumentFragment(); try { parser.parse(inSource, doc); } catch (Exception e) { System.out.println("Unable to parse HTML file " + fileName); return null; } TextBuffer = new StringBuffer(); //Node is a super interface of DocumentFragment, so no typecast needed: processNode(doc); System.out.println("Done."); return TextBuffer.toString(); } // Writes the parsed text from HTML to a file void writeTexttoFile(String htmlText, String fileName) { System.out.println("\nWriting HTML text to output text file " + fileName + "...."); try { PrintWriter pw = new PrintWriter(fileName); pw.print(htmlText); pw.close(); } catch (Exception e) { System.out.println("An exception occurred in writing the html text to file."); e.printStackTrace(); } System.out.println("Done."); } // Extracts text from an HTML Document and writes it to a text file113: public static void main(String args[]) { if (args.length != 2) { System.out.println("Usage: java HTMLTextParser "); System.exit(1); } HTMLTextParser htmlTextParserObj = new HTMLTextParser(); String htmlToText = htmlTextParserObj.htmltoText(args[0]);
if (htmlToText == null) { System.out.println("HTML to Text Conversion failed."); } else { System.out.println("\nThe text parsed from the HTML Document....\n" + htmlToText) htmlTextParserObj.writeTexttoFile(htmlToText, args[1]); } } } } } }
/**
 * Strips markup from HTML by collecting the text callbacks emitted by the
 * Swing HTML parser.
 *
 * <p>Call {@link #parse(Reader)} and then {@link #getText()} to obtain the text
 * of a document. Each call to {@code parse} discards the text gathered by the
 * previous one.
 */
public class ConvertHtmlToText extends HTMLEditorKit.ParserCallback {
    /** Text collected so far; starts empty so getText() is safe before parse(). */
    StringBuffer s = new StringBuffer();

    /** Creates a converter holding no text. */
    public ConvertHtmlToText() {
    }

    /**
     * Parses the HTML supplied by {@code in}, replacing any previously
     * collected text. The reader is not closed by this method.
     *
     * @param in source of the HTML markup
     * @throws IOException if reading from {@code in} fails
     */
    public void parse(Reader in) throws IOException {
        s = new StringBuffer();
        ParserDelegator delegator = new ParserDelegator();
        // Boolean.TRUE is the parser's ignoreCharSet argument.
        delegator.parse(in, this, Boolean.TRUE);
    }

    /**
     * Parser callback: appends one run of document text to the buffer.
     *
     * @param text the characters of the text run
     * @param pos  position reported by the parser (unused)
     */
    @Override
    public void handleText(char[] text, int pos) {
        s.append(text);
    }

    /**
     * Returns the text collected by the most recent {@link #parse(Reader)} call.
     *
     * @return the collected text; empty if nothing has been parsed yet
     */
    public String getText() {
        return s.toString();
    }

    /**
     * Prints the text content of an HTML file to standard output.
     *
     * @param args optional; {@code args[0]} is the file to convert. When
     *             omitted, {@code Hello.html} in the working directory is used.
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : "Hello.html";
        try {
            ConvertHtmlToText parser = new ConvertHtmlToText();
            FileReader in = new FileReader(path);
            try {
                parser.parse(in);
            } finally {
                // Release the file even when parsing fails.
                in.close();
            }
            System.out.println(parser.getText());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}