在上一篇文章里说到了SAX的快速入门,现在让我们来看看它的一个具体应用吧。
现在有这样的一个XML document:
<?xml version="1.0" encoding="UTF-8"?> <store> <other_tag1 /> <type name="book" type_id="1" /> <other_tag3 /> <bookstore> <other_tag2 /> <address addr="Shanghai,China" /> <other_tag4 /> <book category="COOKING" title="Everyday Italian" author="Giada De Laurentiis" year="2005" price="30.00" /> <book category="CHILDREN" title="Harry Potter" author="J K. Rowling" year="2005" price="29.99" /> </bookstore> </store>
我们要利用SAX提取<type name="book" type_id="1" />, <address addr="Shanghai,China" />, <book/>这几个nodes的信息,我们应该怎么做呢?
现在有这样的一个思路:利用SAX,当一遇到node的localName 为“type”,"address","book"的时候,就停下来抓取信息。这时候,我觉得一个方法就是在
startElement里面加上if/ else if/ else 这样的判断,这样虽然很直接明了,但是很傻,因为如果要解析的xml内容少还好,如果要抓取的信息量极大的话,那得
要多少个if/ else if/ else ?!现在我们换一个新的方法来处理这件事情:把需要追踪的元素形成一个“类似XML元素树”,例如,我们要追踪的全部元素为“store”,
“type”,"bookstore","book",树的结构为:
               store
                 |
         ----------------
         |              |
       type         bookstore
                        |
                  -------------
                  |           |
               address       book
追踪的代码:
// Register one tracker per element path that should be followed while parsing.
root.track("store",new StoreTracker());
root.track("store/type",new TypeTracker());
root.track("store/bookstore",new BookStoreTracker());
root.track("store/bookstore/address",new AddrTracker());
root.track("store/bookstore/book",new BookTracker());
形成“XML元素树”的关键代码,这是TagTracker.java里的一部分:
public void track(String tagName,TagTracker tracker) { int slashOffset = tagName.indexOf("/"); if(slashOffset < 0) { trackers.put(tagName,tracker); } else if(slashOffset == 0) { // "/a/b" --> "a/b" and continue. track(tagName.substring(1),tracker); } else { String topTagName = tagName.substring(0,slashOffset); String remainderOfTagName = tagName.substring(slashOffset + 1); TagTracker child = trackers.get(topTagName); if(child == null) { child = new TagTracker(); trackers.put(topTagName,child); } child.track(remainderOfTagName,tracker); } }
这样,在整个"root" element下有"store"节点,在“store”下有“type”和“bookstore”节点,在"bookstore"下又有“address”和"book"节点,形成了我们实际需要追踪的
“元素树”.
我们知道,SAX是通过startElement(String namespaceURI,String localName,String qName,Attributes attr)和endElement(String namespaceURI,String localName,String qName)来获取元素的uri,localName,attributes等信息的(如果要获取元素里的文本信息,需要实现characters(char[] ch,int start,int length)回调)。每一个元素都应该有一个特定的方法来收集这个元素的信息(包括元素里的文本信息):
// get information of the element "type" private class TypeTracker extends TagTracker { public TypeTracker() { } @Override public void onStart(String namespaceURI,Attributes attr) throws Exception { String name = attr.getValue("name"); String typeId = attr.getValue("type_id"); // handle these info. ... } @Override public void onEnd(String namespaceURI,CharArrayWriter contents) { // get the characters data inside the element String text = contents.toString(); // handle this text... } }而SAX的startElement和endElement方法会分别调用上述的两个方法,把namespaceURI,qName,attributes等信息传递过去
好了,现在让我们来看看,是怎么让SAX挑出我们所需要追踪的元素并解析的吧(自动忽略其他的元素)。
TagTracker.java的全部代码:
package com.desmond.xml.sax; import java.io.CharArrayWriter; import java.util.Hashtable; import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.xml.sax.Attributes; public class TagTracker { private static Log log = LogFactory.getLog(TagTracker.class); private Hashtable<String,TagTracker> trackers = new Hashtable<String,TagTracker>(); // use to skip these un-choiced elements private static SkippingTagTracker skip = new SkippingTagTracker(); public TagTracker() { } /** * track all elements need to be tracked. * @param tagName the absolute path of the tracked element * @param tracker the detail handler to parse a special element */ public void track(String tagName,TagTracker tracker) { int slashOffset = tagName.indexOf("/"); if (slashOffset < 0) { // if it is a simple tag name (no "/" sperators) simple add it. trackers.put(tagName,tracker); } else if (slashOffset == 0) { // "/a/b" --> "a/b" and continue. track(tagName.substring(1),slashOffset); String remainderOfTagName = tagName.substring(slashOffset + 1); TagTracker child = trackers.get(topTagName); if (child == null) { child = new TagTracker(); trackers.put(topTagName,child); } child.track(remainderOfTagName,tracker); } } /** * start to parse a element,which will be invoked by SAX's startElement. * @param namespaceURI * @param localName * @param qName * @param attr * @param tagStack "tracked element tree" * @throws Exception */ public void startElement(String namespaceURI,Attributes attr,Stack<TagTracker> tagStack) throws Exception { TagTracker tracker = trackers.get(localName); // not found this tag track. if (tracker == null) { log.debug("Skipping tag:[" + localName + "]"); tagStack.push(skip); } else { log.debug("Tracking tag:[" + localName + "]"); onDeactivate(); tracker.onStart(namespaceURI,attr); tagStack.push(tracker); } } /** * end to parse a element,which will be invoked by SAX's endElement. 
* @param namespaceURI * @param localName * @param qName * @param contents * @param tagStack current element * @throws Exception */ public void endElement(String namespaceURI,CharArrayWriter contents,Stack tagStack) throws Exception { log.debug("Finished tracking tag:[" + localName + "]"); try { onEnd(namespaceURI,contents); } catch (Exception e) { e.printStackTrace(); throw e; } // clean up the stack tagStack.pop(); // send the reactivate event TagTracker activeTracker = (TagTracker) tagStack.peek(); if (activeTracker != null) { log.debug("Reactivating pervIoUs tag tracker."); activeTracker.onReactivate(); } } /** * detail method to start to parse the special element. * @param namespaceURI * @param localName * @param qName * @param attr * @throws Exception */ public void onStart(String namespaceURI,Attributes attr) throws Exception { } public void onDeactivate() throws Exception { } /** * detail method to end to parse the special element. * @param namespaceURI * @param localName * @param qName * @param contents */ public void onEnd(String namespaceURI,CharArrayWriter contents) { } public void onReactivate() throws Exception { } }在startElement中,我们会去“元素树”中选择当前的元素,从而去判断当前的元素释放应该被解析:
TagTracker tracker = trackers.get(localName); // not found this tag track. if (tracker == null) { log.debug("Skipping tag:[" + localName + "]"); tagStack.push(skip); } else { log.debug("Tracking tag:[" + localName + "]"); onDeactivate(); tracker.onStart(namespaceURI,attr); tagStack.push(tracker); }
如果tracker为null,说明这个元素不是我们想要解析的那些,因此要"跳过",如何去跳过,这里用到了另一个类SkippingTagTracker,它所做的事情就是去跳过这个
元素,代码如下:
package com.desmond.xml.sax;

import java.io.CharArrayWriter;
import java.util.Stack;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.xml.sax.Attributes;

/**
 * Tracker used for elements that are not tracked. It only keeps the tracker
 * stack balanced: every start pushes this tracker and every end pops it, so
 * nested elements below a skipped element are skipped as well.
 */
public class SkippingTagTracker extends TagTracker {

    private static final Log log = LogFactory.getLog(SkippingTagTracker.class);

    /**
     * Pushes this tracker for the skipped element; no callback is invoked.
     *
     * @param namespaceURI namespace URI of the element
     * @param localName    local name of the element
     * @param qName        qualified name of the element
     * @param attr         attributes of the element (ignored)
     * @param tagStack     stack of the trackers currently being processed
     */
    @Override
    public void startElement(String namespaceURI, String localName, String qName,
            Attributes attr, Stack<TagTracker> tagStack) {
        log.debug("Skipping tag[" + localName + "]...");
        tagStack.push(this);
    }

    /**
     * Pops this tracker when the skipped element ends; no callback is invoked.
     *
     * @param namespaceURI namespace URI of the element
     * @param localName    local name of the element
     * @param qName        qualified name of the element
     * @param contents     character data collected for the element (ignored)
     * @param tagStack     stack of the trackers currently being processed
     */
    @Override
    public void endElement(String namespaceURI, String localName, String qName,
            CharArrayWriter contents, Stack tagStack) {
        log.debug("Finished skipping tag:[" + localName + "]");
        tagStack.pop();
    }
}
如果tracker有值,我们就要开始解析这个元素了。这时,我们调用前面提到的“一个特定的方法来收集这个元素的信息”,即:
tracker.onStart(namespaceURI,attr);
调用完成之后,我们把当前这个元素的tracker压入栈中。如果这个元素没有子元素,那么它将会在endElement中从栈顶弹出;如果有的话,先处理它的子元素,等所有的子元素都处理完了,才调用endElement结束这个元素(这个也是SAX处理元素的规则,这里只是用到了这一点而已)。
综上所述,整个事件的处理流程是:使用TagTracker追踪所需要的元素-------> 利用TagTracker的track方法递归调用形成“元素树”-------> 利用这棵“元素树”去判断当前的元素是不是应该被解析-------> 不被解析就跳过,被解析,再去判断它的子元素。就这样递归地完成整个解析过程。
附全部代码(SaxMapper.java/ SkippingTagTracker.java/ TagTracker.java/ TestMain.java,共四个类).
SaxMapper.java
package com.desmond.xml.sax; import java.io.ByteArrayInputStream; import java.io.CharArrayWriter; import java.io.File; import java.io.IOException; import java.util.Stack; import org.apache.commons.configuration.Configuration; import org.apache.commons.io.FileUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.xml.sax.Attributes; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.XMLReader; import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.XMLReaderFactory; public class SaxMapper extends DefaultHandler{ private static final Log log = LogFactory.getLog(SaxMapper.class); private String file = ""; protected Stack<TagTracker> tagStack = new Stack<TagTracker>(); protected XMLReader xr; protected CharArrayWriter contents = new CharArrayWriter(); protected boolean parSEOnly; protected Configuration config; public SaxMapper() throws Exception { try { xr = XMLReaderFactory.createXMLReader(); } catch (SAXException e) { e.printStackTrace(); } log.info("Creating the tag tracker network."); tagStack.push(createTagTrackerNetwork()); log.info("Tag Tracker network created."); } @Override public void startElement(String namespaceURI,Attributes attr) throws SAXException { contents.reset(); TagTracker ativeTracker = (TagTracker) tagStack.peek(); try { ativeTracker.startElement(namespaceURI,attr,tagStack); } catch(Exception e) { e.printStackTrace(); throw new SAXException(e); } } @Override public void endElement(String namespaceURI,String qName) throws SAXException { TagTracker activeTracker = (TagTracker) tagStack.peek(); try { activeTracker.endElement(namespaceURI,contents,tagStack); } catch(Exception e) { e.printStackTrace(); throw new SAXException(e); } } @Override public void characters(char[] ch,int length) throws SAXException { contents.write(ch,start,length); } protected InputSource getSource(String fileName) throws IOException { File xmlFile = new File(fileName); byte[] 
xmlBytes = FileUtils.readFileToByteArray(xmlFile); return new InputSource(new ByteArrayInputStream(xmlBytes)); } protected void parseXML() throws IOException,Exception { parse(getSource(getFileName())); } protected void parse(InputSource in) throws Exception{ parSEOnly = true; xr.setContentHandler(this); log.info("start to parse..."); xr.parse(in); log.info("end to parse..."); } protected TagTracker createTagTrackerNetwork() { TagTracker root = new TagTracker(); root.track("store",new BookTracker()); return root; } protected String getFileName() { return file; } protected void setFileName(String fileName) { this.file = fileName; } private class StoreTracker extends TagTracker { public StoreTracker() { } @Override public void onStart(String namespaceURI,Attributes attr) throws Exception { } @Override public void onEnd(String namespaceURI,CharArrayWriter contents) { } } // get information of the element "type" private class TypeTracker extends TagTracker { public TypeTracker() { } @Override public void onStart(String namespaceURI,CharArrayWriter contents) { // get the characters data inside the element String text = contents.toString(); // handle this text... } } private class BookStoreTracker extends TagTracker { public BookStoreTracker() { } } private class AddrTracker extends TagTracker { public AddrTracker() { } @Override public void onStart(String namespaceURI,CharArrayWriter contents) { } } private class BookTracker extends TagTracker { public BookTracker() { } @Override public void onStart(String namespaceURI,CharArrayWriter contents) { } } }
SkippingTagTracker.java
package com.desmond.xml.sax; import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.xml.sax.Attributes; public class SkippingTagTracker extends TagTracker { private static Log log = LogFactory.getLog(SkippingTagTracker.class); public void startElement(String namespaceURI,Stack tagStack) { log.debug("Finished skipping tag:[" + localName + "]"); tagStack.pop(); } }
TagTracker.java
package com.desmond.xml.sax; import java.io.CharArrayWriter; import java.util.Hashtable; import java.util.Stack; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.xml.sax.Attributes; public class TagTracker { private static Log log = LogFactory.getLog(TagTracker.class); private Hashtable<String,CharArrayWriter contents) { } public void onReactivate() throws Exception { } }
TestMain.java
package com.desmond.xml.sax;

/**
 * Command-line entry point: parses the XML file named by the first argument
 * with a {@link SaxMapper}.
 */
public class TestMain {

    /**
     * Creates the mapper and, when a file name is supplied, parses that file;
     * otherwise prints a hint to standard output.
     *
     * @param args command-line arguments; {@code args[0]} is the XML file to parse
     * @throws Exception if the mapper cannot be created or parsing fails
     */
    public static void main(String[] args) throws Exception {
        final SaxMapper saxMapper = new SaxMapper();

        // Nothing to parse without a file name.
        if (args.length == 0) {
            System.out.println("no file configurated! please configurate it.");
            return;
        }

        saxMapper.setFileName(args[0]);
        saxMapper.parseXML();
    }
}