My StAX based Message Shredder

So far I have worked a lot with Java and XML, but mostly with the DOM parser. For my current project I had to split a big XML document into smaller ‘messages’ and put these small messages on a queue. If you have the same experience with DOM as I do, you know that this is not going to work without a lot of available memory. That is why I chose a streaming parser, and to be more specific a StAX parser.
One thing you have to take care of is the namespaces in the original ‘batch’ XML document. These namespaces can be declared on the root element, and when you just ‘copy’ inner XML fragments into new XML documents you lose the namespace declarations made at a ‘higher’ level. For example, see this batch XML document:



	
		
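<?xml version="1.0" encoding="UTF-8"?>
<prefix:batch xmlns:prefix="http://www.pascalalma.net/xml">
	<prefix:messages>
		<prefix:xml-message>
			<prefix:id>1</prefix:id>
			<prefix:body>
				<prefix:text>Message number one</prefix:text>
			</prefix:body>
		</prefix:xml-message>
		<prefix:xml-message>
			<prefix:id>2</prefix:id>
			<prefix:body>
				<prefix:text>Message number two</prefix:text>
			</prefix:body>
		</prefix:xml-message>
		<prefix:xml-message>
			<prefix:id>3</prefix:id>
			<prefix:body>
				<prefix:text>Message number three</prefix:text>
			</prefix:body>
		</prefix:xml-message>
	</prefix:messages>
</prefix:batch>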
	

What we need is an XML document for each ‘prefix:xml-message’ tag we encounter. Simply copying and pasting that piece of XML is not going to work, because we would then miss the declaration of the ‘prefix’ namespace. To avoid that, I collect every namespace declaration I encounter before I reach the ‘prefix:xml-message’ tag and write this collection of namespaces onto the root element of the new XML document. (The class below identifies the message element through its MSG_TAG constant, which uses the local name ‘message’; adjust that constant to the namespace and element name used in your document.) Here is the complete class doing the whole thing:

package net.pascalalma.xml;

import java.io.InputStream;
import java.io.StringWriter;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import javax.xml.namespace.QName;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import javax.xml.stream.XMLStreamWriter;

import org.codehaus.stax2.XMLInputFactory2;
import org.codehaus.stax2.XMLOutputFactory2;

/**
 * Splits a large 'batch' XML document into small, self-contained XML messages using StAX.
 *
 * <p>The batch document is read as a stream. Every element whose qualified name equals
 * {@link #MSG_TAG} is copied into its own XML fragment. Namespace declarations found on the
 * elements read before the message element are collected and re-declared on the root of each
 * extracted message, so that prefixes declared 'higher up' in the batch stay bound.
 *
 * <p>The static initializer selects the Woodstox StAX implementation through JVM-wide system
 * properties, so Woodstox must be on the classpath.
 */
public class StaxParser {

   /** Qualified name of the element that marks one message inside the batch document. */
   private static final QName MSG_TAG = new QName("http://www.pascalalma.net/xml", "message");
   private static final String ENCODING = "UTF-8";

   protected static XMLInputFactory xmlInFactory;
   protected static XMLOutputFactory xmlOutFactory;

   static {
      // These properties are JVM-wide: every later StAX factory lookup in this JVM is affected.
      System.setProperty("javax.xml.stream.XMLInputFactory", "com.ctc.wstx.stax.WstxInputFactory");
      System.setProperty("javax.xml.stream.XMLOutputFactory", "com.ctc.wstx.stax.WstxOutputFactory");
      System.setProperty("javax.xml.stream.XMLEventFactory", "com.ctc.wstx.stax.WstxEventFactory");

      // NOTE(review): DTD / external-entity processing is left at the factory defaults. If the
      // batch documents can come from an untrusted source, consider disabling
      // XMLInputFactory.SUPPORT_DTD and XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES.
      xmlInFactory = XMLInputFactory2.newInstance();
      xmlOutFactory = XMLOutputFactory2.newInstance();
   }

   /**
    * Creates a stream reader over the given input, decoding it with {@link #ENCODING}.
    * @param xml the batch document
    * @return a reader positioned at the start of the document
    * @throws XMLStreamException if the reader cannot be created
    */
   private XMLStreamReader getXMLStream(InputStream xml) throws XMLStreamException {
      return xmlInFactory.createXMLStreamReader(xml, ENCODING);
   }

   /**
    * Reads the batch document, extracts every {@link #MSG_TAG} element as a separate XML
    * message and passes it to {@link #handleMessage(String)}.
    *
    * <p>The stream reader created here is closed before returning; the supplied input stream
    * itself is not closed and remains the caller's responsibility.
    *
    * @param xml the batch document
    * @return the number of messages that were extracted and handled
    * @throws Exception if the document cannot be parsed or a message cannot be written
    */
   public long parseBatchDocument(InputStream xml) throws Exception {

      // Start at zero so the return value is the actual number of handled messages.
      long counter = 0;
      XMLStreamReader xmlReader = getXMLStream(xml);

      // Namespaces declared on the elements read before a message element (prefix -> URI).
      // A null key denotes the default namespace.
      Map<String, String> globalNamespaces = new HashMap<String, String>();

      try {
         // Handle the current event first and advance afterwards, so that the final
         // END_DOCUMENT event is processed too (hasNext() is already false at that point).
         boolean more = true;
         while (more) {
            switch (xmlReader.getEventType()) {
               case XMLStreamConstants.START_DOCUMENT:
                  System.out.println("Document started");
                  break;
               case XMLStreamConstants.START_ELEMENT:
                  if (MSG_TAG.equals(xmlReader.getName())) {
                     // Start of a 'small' xml-message. extractMessage leaves the reader on
                     // the END_ELEMENT that closes this message.
                     String xmlMsg = extractMessage(xmlReader, globalNamespaces);
                     handleMessage(xmlMsg);
                     counter++;
                  } else {
                     // Some higher-level element: remember its namespace declarations so
                     // they can be re-declared on every extracted message.
                     int nrOfNamespaces = xmlReader.getNamespaceCount();
                     for (int i = 0; i < nrOfNamespaces; i++) {
                        globalNamespaces.put(xmlReader.getNamespacePrefix(i), xmlReader.getNamespaceURI(i));
                     }
                  }
                  break;
               case XMLStreamConstants.END_DOCUMENT:
                  System.out.println("Document ended");
                  break;
               default:
                  // Other events outside a message are not relevant for the shredder.
                  break;
            }
            more = xmlReader.hasNext();
            if (more) {
               xmlReader.next();
            }
         }
      } catch (XMLStreamException e) {
         throw new Exception(e);
      } finally {
         try {
            // Releases parser resources; does not close the underlying input stream.
            xmlReader.close();
         } catch (XMLStreamException ignored) {
            // Best-effort cleanup: a failure here must not mask the result or the
            // exception of the actual parse.
         }
      }
      return counter;
   }

   /**
    * Copies the message element the reader is currently positioned on, including all of its
    * content, into a new XML fragment. On return the reader is positioned on the END_ELEMENT
    * event that closes the message element.
    *
    * @param xmlReader the batch xml, positioned on the START_ELEMENT of a message
    * @param namespaces the namespaces declared at a higher level (prefix -> URI, null key for
    *        the default namespace)
    * @return the xml-message
    * @throws Exception if any exception occurs
    */
   private String extractMessage(XMLStreamReader xmlReader, Map<String, String> namespaces)
         throws Exception {
      // Until we reach the matching end tag we write everything to a buffer that will
      // contain the xml-message content.
      StringWriter sw = new StringWriter();
      XMLStreamWriter xmlMsgWriter = null;

      try {
         xmlMsgWriter = xmlOutFactory.createXMLStreamWriter(sw);

         // Write the new root element.
         writeStartElement(xmlReader, xmlMsgWriter);

         // Declare the higher-level namespaces on the new root. Declarations made on the
         // message element itself are added as well and take precedence.
         Map<String, String> rootNamespaces = new HashMap<String, String>(namespaces);
         int ownNamespaces = xmlReader.getNamespaceCount();
         for (int i = 0; i < ownNamespaces; i++) {
            rootNamespaces.put(xmlReader.getNamespacePrefix(i), xmlReader.getNamespaceURI(i));
         }
         for (Map.Entry<String, String> ns : rootNamespaces.entrySet()) {
            writeNamespace(xmlMsgWriter, ns.getKey(), ns.getValue());
         }
         copyAttributes(xmlReader, xmlMsgWriter);

         // Track nesting so that a nested element with the same name as the message element
         // does not end the extraction early.
         int depth = 1;
         while (depth > 0 && xmlReader.hasNext()) {
            switch (xmlReader.next()) {
               case XMLStreamConstants.ATTRIBUTE:
                  copyAttributes(xmlReader, xmlMsgWriter);
                  break;
               case XMLStreamConstants.CDATA:
                  xmlMsgWriter.writeCData(xmlReader.getText());
                  break;
               case XMLStreamConstants.CHARACTERS:
                  xmlMsgWriter.writeCharacters(xmlReader.getText());
                  break;
               case XMLStreamConstants.COMMENT:
                  xmlMsgWriter.writeComment(xmlReader.getText());
                  break;
               case XMLStreamConstants.DTD:
                  xmlMsgWriter.writeDTD(xmlReader.getText());
                  break;
               case XMLStreamConstants.NAMESPACE:
                  copyNamespaces(xmlReader, xmlMsgWriter);
                  break;
               case XMLStreamConstants.PROCESSING_INSTRUCTION:
                  copyProcessingInstruction(xmlReader, xmlMsgWriter);
                  break;
               case XMLStreamConstants.ENTITY_REFERENCE:
                  xmlMsgWriter.writeEntityRef(xmlReader.getLocalName());
                  break;
               case XMLStreamConstants.START_ELEMENT:
                  depth++;
                  writeStartElement(xmlReader, xmlMsgWriter);
                  copyNamespaces(xmlReader, xmlMsgWriter);
                  copyAttributes(xmlReader, xmlMsgWriter);
                  break;
               case XMLStreamConstants.END_ELEMENT:
                  xmlMsgWriter.writeEndElement();
                  // When depth reaches zero this was the end of the message element; the
                  // loop stops without advancing the reader any further.
                  depth--;
                  break;
               case XMLStreamConstants.SPACE:
                  // ignore spaces
                  break;
               default:
                  System.out.println("Unknown eventType = " + xmlReader.getEventType());
                  break;
            }
         }

         xmlMsgWriter.flush();
      } catch (XMLStreamException e) {
         throw new Exception(e);
      } finally {
         if (xmlMsgWriter != null) {
            try {
               xmlMsgWriter.close();
            } catch (XMLStreamException ignored) {
               // Best-effort cleanup: the content has already been flushed to the buffer,
               // and a close failure must not mask an earlier exception.
            }
         }
      }
      return sw.toString();
   }

   /**
    * Writes the start tag of the element the reader is positioned on. Elements without a
    * namespace are written by local name only, and a null prefix is treated as the empty
    * (default) prefix.
    * @param xmlReader XMLStreamReader positioned on a START_ELEMENT
    * @param xmlWriter XMLStreamWriter
    * @throws XMLStreamException if any xmlStream related exception occurs
    */
   private void writeStartElement(XMLStreamReader xmlReader, XMLStreamWriter xmlWriter)
         throws XMLStreamException {
      String nsUri = xmlReader.getNamespaceURI();
      if (nsUri == null || nsUri.length() == 0) {
         xmlWriter.writeStartElement(xmlReader.getLocalName());
      } else {
         String prefix = xmlReader.getPrefix();
         xmlWriter.writeStartElement(prefix == null ? "" : prefix, xmlReader.getLocalName(), nsUri);
      }
   }

   /**
    * Writes one namespace declaration; a null or empty prefix is written as the default
    * namespace, and a null URI is written as the empty string.
    * @param xmlWriter XMLStreamWriter
    * @param prefix the namespace prefix, null or empty for the default namespace
    * @param uri the namespace URI
    * @throws XMLStreamException if any xmlStream related exception occurs
    */
   private void writeNamespace(XMLStreamWriter xmlWriter, String prefix, String uri)
         throws XMLStreamException {
      String safeUri = (uri == null) ? "" : uri;
      if (prefix == null || prefix.length() == 0) {
         xmlWriter.writeDefaultNamespace(safeUri);
      } else {
         xmlWriter.writeNamespace(prefix, safeUri);
      }
   }

   /**
    * Copies the processing instruction at the current position in the xmlReader to the
    * xmlWriter.
    * @param xmlReader XMLStreamReader positioned on a PROCESSING_INSTRUCTION
    * @param xmlWriter XMLStreamWriter
    * @throws XMLStreamException if any xmlStream related exception occurs
    */
   private void copyProcessingInstruction(XMLStreamReader xmlReader, XMLStreamWriter xmlWriter)
         throws XMLStreamException {
      String data = xmlReader.getPIData();
      if (data == null || data.length() == 0) {
         xmlWriter.writeProcessingInstruction(xmlReader.getPITarget());
      } else {
         xmlWriter.writeProcessingInstruction(xmlReader.getPITarget(), data);
      }
   }

   /**
    * Copies all attributes found at the current position in the xmlReader to the current
    * position in the xmlWriter. Attributes without a namespace are written by local name only.
    * @param xmlReader XMLStreamReader
    * @param xmlWriter XMLStreamWriter
    * @throws XMLStreamException if any xmlStream related exception occurs
    */
   private void copyAttributes(XMLStreamReader xmlReader, XMLStreamWriter xmlWriter) throws XMLStreamException {
      int numOfAttr = xmlReader.getAttributeCount();
      for (int i = 0; i < numOfAttr; i++) {
         String prefix = xmlReader.getAttributePrefix(i);
         String nsUri = xmlReader.getAttributeNamespace(i);
         boolean qualified = prefix != null && prefix.length() > 0 && nsUri != null && nsUri.length() > 0;
         if (qualified) {
            xmlWriter.writeAttribute(prefix, nsUri, xmlReader.getAttributeLocalName(i),
                  xmlReader.getAttributeValue(i));
         } else {
            xmlWriter.writeAttribute(xmlReader.getAttributeLocalName(i), xmlReader.getAttributeValue(i));
         }
      }
   }

   /**
    * Copies all namespaces found at the current position in the xmlReader to the current
    * position in the xmlWriter.
    * @param xmlReader XMLStreamReader
    * @param xmlWriter XMLStreamWriter
    * @throws XMLStreamException if any xmlStream related exception occurs
    */
   private void copyNamespaces(XMLStreamReader xmlReader, XMLStreamWriter xmlWriter) throws XMLStreamException {
      int nrOfNamespaces = xmlReader.getNamespaceCount();
      for (int i = 0; i < nrOfNamespaces; i++) {
         writeNamespace(xmlWriter, xmlReader.getNamespacePrefix(i), xmlReader.getNamespaceURI(i));
      }
   }

   /**
    * Handles one extracted message; this implementation prints it to standard output
    * between two separator lines.
    * @param xml the extracted xml-message
    */
   private void handleMessage(String xml) {
      System.out.println("====================================================");
      System.out.println(xml);
      System.out.println("+++++++++++++++++++++++++++++++++++++++++++++++++++");
   }
}

To run this class you will need the following libraries: stax-api-1.0.1.jar and wstx-asl-3.1.1.jar.
Have fun with it!

About Pascal Alma

Pascal is a senior IT consultant and has been working in IT since 1997. He is monitoring the latest development in new technologies (Mobile, Cloud, Big Data) closely and particularly interested in Java open source tool stacks, cloud related technologies like AWS and mobile development like building iOS apps with Swift. Specialties: Java/JEE/Spring Amazon AWS API/REST Big Data Continuous Delivery Swift/iOS
This entry was posted in XML/ XSD/ XSLT and tagged . Bookmark the permalink.