HTMLParserTxt

Dear Sir,

Please help me if you can. I want to convert an HTML file to a text file. I am sending you my code below; could you please fix the errors?

How do I get the following plugins (the libraries imported by the code)? I mean, where can I download them from?

Thanks a lot for your help.

Code:

package DBFin;

import java.io.File;
import java.io.FileInputStream;
import java.io.PrintWriter;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.apache.xerces.dom.CoreDocumentImpl;

public class HTMLTextParser
{
FileInputStream fin = null;
StringBuffer TextBuffer = null;
InputSource inSource = null;

// HTMLTextParser Constructor 2
public HTMLTextParser()
{ }
//Gets the text content from Nodes recursively
void processNode(Node node)
{
if (node == null) return;

//Process a text node
if (node.getNodeType() == node.TEXT_NODE)
{
TextBuffer.append(node.getNodeValue());
}
else
if (node.hasChildNodes())
{
//Process the Node's children
NodeList childList = node.getChildNodes();
int childLen = childList.getLength();
for (int count = 0; count < childLen; count ++)
processNode(childList.item(count));
}
else return;
}
// Extracts text from HTML Document
String htmltoText(String fileName)
{
DOMFragmentParser parser = new DOMFragmentParser();

System.out.println("Parsing text from HTML file " + fileName + "....");
File f = new File(fileName);

if (!f.isFile())
{
System.out.println("File " + fileName + " does not exist.");
return null;
}
try
{
fin = new FileInputStream(f);
}
catch (Exception e)
{
System.out.println("Unable to open HTML file " + fileName + " for reading.");
return null;
}
try
{
inSource = new InputSource(fin);
}
catch (Exception e)
{
System.out.println("Unable to open Input source from HTML file " + fileName);
return null;
}
CoreDocumentImpl codeDoc = new CoreDocumentImpl();
DocumentFragment doc = codeDoc.createDocumentFragment();
try
{
parser.parse(inSource, doc);
}
catch (Exception e)
{
System.out.println("Unable to parse HTML file " + fileName);
return null;
}
TextBuffer = new StringBuffer();
//Node is a super interface of DocumentFragment, so no typecast needed:
processNode(doc);
System.out.println("Done.");
return TextBuffer.toString();
}
// Writes the parsed text from HTML to a file
void writeTexttoFile(String htmlText, String fileName)
{
System.out.println("\nWriting HTML text to output text file " + fileName + "....");
try
{
PrintWriter pw = new PrintWriter(fileName);
pw.print(htmlText);
pw.close();
} catch
(Exception e)
{
System.out.println("An exception occurred in writing the html text to file.");
e.printStackTrace();
}
System.out.println("Done.");
}
// Extracts text from an HTML Document and writes it to a text file113:
public static void main(String args[])
{
if (args.length != 2)
{
System.out.println("Usage: java HTMLTextParser ");
System.exit(1);
}
HTMLTextParser htmlTextParserObj = new HTMLTextParser();
String htmlToText = htmlTextParserObj.htmltoText(args[0]);

if (htmlToText == null)
{
System.out.println("HTML to Text Conversion failed.");
}
else
{
System.out.println("\nThe text parsed from the HTML Document....\n" + htmlToText)
htmlTextParserObj.writeTexttoFile(htmlToText, args[1]);
}
}
}
}
}
}

View Answers

April 29, 2010 at 11:10 AM

Hi Friend,

Try the following code:

import java.io.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;

/**
 * Extracts the plain text of an HTML document using the Swing HTML parser.
 *
 * <p>The class is a {@link HTMLEditorKit.ParserCallback}: the parser calls
 * {@link #handleText(char[], int)} for each run of text it finds, and the runs
 * are concatenated in the order they are reported, with no separator.
 */
public class ConvertHtmlToText extends HTMLEditorKit.ParserCallback {
    // Text collected by the most recent parse; empty until parse() is called.
    StringBuffer s = new StringBuffer();

    /** Creates a converter holding no text. */
    public ConvertHtmlToText() {}

    /**
     * Parses the HTML read from {@code in}, replacing any previously
     * collected text. The reader is not closed by this method.
     *
     * @param in source of the HTML markup
     * @throws IOException if reading from {@code in} fails
     */
    public void parse(Reader in) throws IOException {
        s = new StringBuffer();
        ParserDelegator delegator = new ParserDelegator();
        // Third argument is ParserDelegator's ignoreCharSet flag.
        delegator.parse(in, this, Boolean.TRUE);
    }

    /**
     * Parser callback: appends one run of document text.
     *
     * @param text the characters of the text run
     * @param pos  position reported by the parser (unused)
     */
    @Override
    public void handleText(char[] text, int pos) {
        s.append(text);
    }

    /**
     * Returns the text collected by the most recent {@link #parse(Reader)}.
     *
     * @return the collected text; empty if nothing has been parsed yet
     */
    public String getText() {
        return s.toString();
    }

    /**
     * Prints the text content of an HTML file to standard output.
     *
     * @param args optional; {@code args[0]} is the HTML file to read,
     *             defaulting to {@code Hello.html}
     */
    public static void main(String[] args) {
        String fileName = args.length > 0 ? args[0] : "Hello.html";
        // try-with-resources closes the reader even when parsing fails.
        try (FileReader in = new FileReader(fileName)) {
            ConvertHtmlToText parser = new ConvertHtmlToText();
            parser.parse(in);
            System.out.println(parser.getText());
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }
}

Hope that it will be helpful for you.
Thanks

HTMLParserTxt

Questions by Category