Convert PDF to XML in Java


 

Convert PDF to XML in Java

In this section, you will learn how to convert a PDF file to XML using the Java programming language.

In this section, you will learn how to convert a PDF file to XML using the Java programming language.

Convert PDF to XML File in Java

In this Java tutorial section, you will learn how to convert a PDF file to XML using a Java program. We have used the iText API for this purpose. To read the hello.pdf file, we use the PdfReader class. The page data is first read as bytes and then, with the help of a StringBuffer, converted back into a string. Next we use StreamResult, which acts as a holder for a transformation result in XML. After that, the Transformer class processes XML from a variety of sources and writes the transformation output to a variety of sinks. The TransformerHandler listens for SAX ContentHandler parse events and transforms them into a result. The startElement() and endElement() methods of the TransformerHandler class create the tags in the XML file. In SAX, startElement() is invoked at the beginning of every element and endElement() at the end of every element in the XML document.

Here is the code:

import java.io.*;
import java.util.*;
import org.xml.sax.*;
import javax.xml.parsers.*;
import javax.xml.transform.*;
import org.xml.sax.helpers.*;
import javax.xml.transform.sax.*;
import javax.xml.transform.stream.*;
import com.lowagie.text.*;
import com.lowagie.text.pdf.*;

/**
 * Reads the string tokens from the first page of {@code C:\hello.pdf} and
 * writes them to {@code data.xml} as
 * {@code <Roseindia><Message>...</Message></Roseindia>}.
 *
 * <p>The XML is produced by driving a SAX {@link TransformerHandler} directly:
 * {@link #initXML()} opens the document and the root element,
 * {@link #process(String)} writes one {@code Message} element, and
 * {@link #closeXML()} closes the root element and the document. The three
 * methods share state through the static fields below, so they must be called
 * in that order.
 */
public class ConvertPDFToXML {
	/** Destination of the generated XML; must be set before {@link #initXML()}. */
	static StreamResult streamResult;
	/** SAX handler that serializes the events into {@link #streamResult}. */
	static TransformerHandler handler;
	/** Reusable (empty) attribute list passed to every startElement call. */
	static AttributesImpl atts;

	/**
	 * Converts the first page of the hard-coded input PDF to XML.
	 *
	 * <p>Failures are reported on standard error instead of being silently
	 * discarded, and the reader is always closed.
	 *
	 * @param args unused
	 * @throws IOException declared for compatibility; conversion errors are
	 *             caught and reported rather than propagated
	 */
	public static void main(String[] args) throws IOException {
		PdfReader reader = null;
		try {
			reader = new PdfReader("C:\\hello.pdf");
			// getPageContent copes with /Contents being a single stream, an
			// array of streams or a direct object, unlike a hard cast of the
			// dictionary entry to PRIndirectReference.
			byte[] streamBytes = reader.getPageContent(1);
			PRTokeniser tokenizer = new PRTokeniser(streamBytes);

			// Keep only the string operands of the content stream.
			StringBuffer text = new StringBuffer();
			while (tokenizer.nextToken()) {
				if (tokenizer.getTokenType() == PRTokeniser.TK_STRING) {
					text.append(tokenizer.getStringValue());
				}
			}

			streamResult = new StreamResult("data.xml");
			initXML();
			process(text.toString());
			closeXML();
		} catch (Exception e) {
			// Report the failure; an empty catch made every error invisible.
			System.err.println("Failed to convert PDF to XML: " + e);
			e.printStackTrace();
		} finally {
			if (reader != null) {
				reader.close();
			}
		}
	}

	/**
	 * Creates the serializing handler, binds it to {@link #streamResult},
	 * starts the document and opens the {@code Roseindia} root element.
	 *
	 * @throws ParserConfigurationException declared for compatibility
	 * @throws TransformerConfigurationException if the handler cannot be created
	 * @throws SAXException if the handler rejects a SAX event
	 */
	public static void initXML() throws ParserConfigurationException,
			TransformerConfigurationException, SAXException {
		SAXTransformerFactory tf = (SAXTransformerFactory) SAXTransformerFactory
				.newInstance();

		handler = tf.newTransformerHandler();
		Transformer serializer = handler.getTransformer();
		serializer.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
		serializer.setOutputProperty(
				"{http://xml.apache.org/xslt}indent-amount", "4");
		serializer.setOutputProperty(OutputKeys.INDENT, "yes");
		handler.setResult(streamResult);
		handler.startDocument();
		atts = new AttributesImpl();
		handler.startElement("", "", "Roseindia", atts);
	}

	/**
	 * Writes a {@code Message} element containing the part of {@code s} that
	 * precedes the first {@code |} character (all of {@code s} if it has none).
	 *
	 * <p>{@link String#split(String)} drops trailing empty strings, so an input
	 * made up only of {@code |} characters yields an empty array; in that case
	 * an empty {@code Message} element is written instead of failing with an
	 * {@link ArrayIndexOutOfBoundsException}.
	 *
	 * @param s text extracted from the PDF
	 * @throws SAXException if the handler rejects a SAX event
	 */
	public static void process(String s) throws SAXException {
		String[] elements = s.split("\\|");
		String message = elements.length > 0 ? elements[0] : "";
		atts.clear();
		handler.startElement("", "", "Message", atts);
		handler.characters(message.toCharArray(), 0, message.length());
		handler.endElement("", "", "Message");
	}

	/**
	 * Closes the {@code Roseindia} root element and ends the document.
	 *
	 * @throws SAXException if the handler rejects a SAX event
	 */
	public static void closeXML() throws SAXException {
		handler.endElement("", "", "Roseindia");
		handler.endDocument();
	}
}

Ads