Fathi
HTMLParserTxt
1 Answer(s)      3 years and a month ago
Posted in : Java Interview Questions

Dear Sir,

Please help me if you can. I want to convert an HTML file to a text file. I am sending you the code below; could you please fix the errors?

How do I get the following plugins (the libraries imported in the code below)? I mean, where can I download them from?

Thanks a lot for your help.

Code:

package DBFin;

import java.io.File;
import java.io.FileInputStream;
import java.io.PrintWriter;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.apache.xerces.dom.CoreDocumentImpl;

public class HTMLTextParser
{
FileInputStream fin = null;
StringBuffer TextBuffer = null;
InputSource inSource = null;

// HTMLTextParser Constructor 2
public HTMLTextParser()
{ }
//Gets the text content from Nodes recursively
void processNode(Node node)
{
if (node == null) return;

//Process a text node
if (node.getNodeType() == node.TEXT_NODE)
{
TextBuffer.append(node.getNodeValue());
}
else
if (node.hasChildNodes())
{
//Process the Node's children
NodeList childList = node.getChildNodes();
int childLen = childList.getLength();
for (int count = 0; count < childLen; count ++)
processNode(childList.item(count));
}
else return;
}
// Extracts text from HTML Document
String htmltoText(String fileName)
{
DOMFragmentParser parser = new DOMFragmentParser();

System.out.println("Parsing text from HTML file " + fileName + "....");
File f = new File(fileName);

if (!f.isFile())
{
System.out.println("File " + fileName + " does not exist.");
return null;
}
try
{
fin = new FileInputStream(f);
}
catch (Exception e)
{
System.out.println("Unable to open HTML file " + fileName + " for reading.");
return null;
}
try
{
inSource = new InputSource(fin);
}
catch (Exception e)
{
System.out.println("Unable to open Input source from HTML file " + fileName);
return null;
}
CoreDocumentImpl codeDoc = new CoreDocumentImpl();
DocumentFragment doc = codeDoc.createDocumentFragment();
try
{
parser.parse(inSource, doc);
}
catch (Exception e)
{
System.out.println("Unable to parse HTML file " + fileName);
return null;
}
TextBuffer = new StringBuffer();
//Node is a super interface of DocumentFragment, so no typecast needed:
processNode(doc);
System.out.println("Done.");
return TextBuffer.toString();
}
// Writes the parsed text from HTML to a file
void writeTexttoFile(String htmlText, String fileName)
{
System.out.println("\nWriting HTML text to output text file " + fileName + "....");
try
{
PrintWriter pw = new PrintWriter(fileName);
pw.print(htmlText);
pw.close();
} catch
(Exception e)
{
System.out.println("An exception occurred in writing the html text to file.");
e.printStackTrace();
}
System.out.println("Done.");
}
// Extracts text from an HTML Document and writes it to a text file113:
public static void main(String args[])
{
if (args.length != 2)
{
System.out.println("Usage: java HTMLTextParser ");
System.exit(1);
}
HTMLTextParser htmlTextParserObj = new HTMLTextParser();
String htmlToText = htmlTextParserObj.htmltoText(args[0]);

if (htmlToText == null)
{
System.out.println("HTML to Text Conversion failed.");
}
else
{
System.out.println("\nThe text parsed from the HTML Document....\n" + htmlToText)
htmlTextParserObj.writeTexttoFile(htmlToText, args[1]);
}
}
}
}
}
}
View Answers

April 29, 2010 at 11:10 AM


Hi Friend,

Try the following code:

import java.io.*;
import javax.swing.text.html.*;
import javax.swing.text.html.parser.*;

/**
 * Collects the text content of an HTML document using the HTML parser that
 * ships with Swing, so no third-party library is required.
 *
 * <p>The class is a parser callback: {@link #parse(Reader)} runs the parser
 * with this object as the callback, and every text chunk the parser reports
 * is appended to an internal buffer that {@link #getText()} returns.
 */
public class ConvertHtmlToText extends HTMLEditorKit.ParserCallback {
    /** Text gathered by the most recent {@link #parse(Reader)} call; null before the first call. */
    StringBuffer s;

    public ConvertHtmlToText() {}

    /**
     * Parses HTML from the reader, replacing any previously collected text.
     * The reader is not closed by this method.
     *
     * @param in source of the HTML markup
     * @throws IOException if reading from {@code in} fails
     */
    public void parse(Reader in) throws IOException {
        s = new StringBuffer();
        ParserDelegator delegator = new ParserDelegator();
        // Third argument is ignoreCharSet: keep parsing with the supplied reader
        // even if the document declares a different charset.
        delegator.parse(in, this, true);
    }

    /**
     * Callback invoked by the parser for each run of text; appends it to the buffer.
     *
     * @param text the characters of the text run
     * @param pos  position of the text in the document (unused)
     */
    @Override
    public void handleText(char[] text, int pos) {
        s.append(text);
    }

    /**
     * Returns the text collected by the last {@link #parse(Reader)} call.
     *
     * @return the collected text, or an empty string if nothing has been parsed yet
     */
    public String getText() {
        // Guard against being called before parse(), when the buffer is still null.
        return s == null ? "" : s.toString();
    }

    /**
     * Prints the text of an HTML file to standard output.
     *
     * @param args optional; {@code args[0]} is the HTML file to read,
     *             defaulting to {@code Hello.html} in the working directory
     */
    public static void main(String[] args) {
        String fileName = args.length > 0 ? args[0] : "Hello.html";
        FileReader in = null;
        try {
            in = new FileReader(fileName);
            ConvertHtmlToText parser = new ConvertHtmlToText();
            parser.parse(in);
            System.out.println(parser.getText());
        }
        catch (Exception e) {
            e.printStackTrace();
        }
        finally {
            // Close the file even when opening succeeded but parsing failed.
            if (in != null) {
                try {
                    in.close();
                }
                catch (IOException ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }
    }
}

Hope that it will be helpful for you.
Thanks









Related Pages:

Ask Questions?

Are you facing a programming issue, such as compilation errors, or unable to find the code you are looking for?

Ask your questions, and our development team will try to answer them.