小数据量的情况这里就不多做描述了,网上已有很多资料。本文要处理的是大约 50 GB 的 XML 格式地理数据:先将其解析并转换为 JSON 格式,再批量上传到 MongoDB 数据库中。如果有更好的建议,欢迎指正。
解析xml数据
import java.util.ArrayList; import java.util.List; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.Locator; import org.xml.sax.SAXException; import com.mongodb.DBObject; /* * @author * @time 2015-11-8 * 主要是是implements ContentHandler,主要实现接口ContentHandler中的startDocument()、endDocument()、startElement()、endElement() * 另外自定义方法writeToMongoDB()、storeDBMongo() * */ public class MyContentHandler implements ContentHandler { private StringBuffer buf; private String ctitle; private String cns; private String cid; private String ctext; private String ctimestamp; private int idnumber=0; List<Data> listdata=new ArrayList<Data>(); List list=new ArrayList(); @Override public void setDocumentLocator(Locator locator) { // TODO Auto-generated method stub } @Override public void startDocument() throws SAXException { // TODO Auto-generated method stub buf=new StringBuffer(); System.out.println("*******解析开始*******"); } @Override public void endDocument() throws SAXException { // TODO Auto-generated method stub try { writeToMongoDB(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("*******解析结束*******"); } //把数据导入MongoDB数据库中 private void writeToMongoDB() throws Exception { // TODO Auto-generated method stub List<DBObject> dblist=new ArrayList<DBObject>(); for(Data d:listdata){ dblist.add(BSONT.mapToBSON(d.toJSONMap())); } MongoDBT.writeListToMongo("IP",27017,"databaseName","collectionName",dblist); } @Override public void startPrefixMapping(String prefix,String uri) throws SAXException { // TODO Auto-generated method stub } @Override public void endPrefixMapping(String prefix) throws SAXException { // TODO Auto-generated method stub } @Override public void startElement(String uri,String localName,String qName,Attributes attributes) throws SAXException { // TODO Auto-generated method stub if(qName=="page"){ idnumber=1; } if(qName=="title"){ ctitle=qName; }else if(qName=="ns"){ cns=qName; }else 
if(qName=="id"&&idnumber==1){ cid=qName; idnumber=0; }else if(qName=="timestamp"){ ctimestamp=qName; }else if(qName=="text"){ ctext=qName; } } @Override public void endElement(String uri,String qName) throws SAXException { // TODO Auto-generated method stub if(ctitle==qName){ String sss=buf.toString(); ctitle=""; list.add(sss); buf.setLength(0); }else if(cns==qName){ cns=""; String sss=buf.toString(); list.add(sss); buf.setLength(0); }else if(cid==qName){ cid=""; String sss=buf.toString(); list.add(sss); buf.setLength(0); }else if(ctimestamp==qName){ ctimestamp=""; String sss=buf.toString(); list.add(sss); buf.setLength(0); }else if(ctext==qName){ ctext=""; String sss=buf.toString(); list.add(sss); buf.setLength(0); //有些sss中虽然有重定向标记,但没有“[[”和“]]”,那么就会出现String的index不在范围内的问题 if((sss.toUpperCase().contains("#REDIRECT")||sss.contains("#重定向"))&&sss.contains("[[")&&sss.contains("]]")){ int i=sss.indexOf("[["); int j=sss.indexOf("]]"); String s=sss.substring(i+2,j); list.add(s); list.add("redirect"); }else{ list.add(""); list.add("article"); } } if(qName=="page"){ storeDBMongo(list); } } private void storeDBMongo(List lt) { // TODO Auto-generated method stub for(int i=0;i<list.size();i++){ System.out.println(lt.get(i)); } try { Data data=new Data(); data.setTitle(list.get(0).toString()); data.setNamespace(list.get(1).toString()); data.setId(list.get(2).toString()); data.setLastEsited(list.get(3).toString()); data.setMarkup(list.get(4).toString()); data.setTarget(list.get(5).toString()); data.setType(list.get(6).toString()); listdata.add(data); if(listdata.size()>=300){ writeToMongoDB(); listdata.clear(); } list.clear(); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } } @Override public void characters(char[] ch,int start,int length) throws SAXException { // TODO Auto-generated method stub if(ctitle=="title"){ buf.append(new String(ch,start,length)); }else if(cns=="ns"){ buf.append(new String(ch,length)); }else if(cid=="id"){ buf.append(new 
String(ch,length)); list.add(new String(ch,length)); }else if(ctimestamp=="timestamp"){ buf.append(new String(ch,length)); }else if(ctext=="text"){ buf.append(new String(ch,length)); } } @Override public void ignorableWhitespace(char[] ch,int length) throws SAXException { // TODO Auto-generated method stub } @Override public void processingInstruction(String target,String data) throws SAXException { // TODO Auto-generated method stub } @Override public void skippedEntity(String name) throws SAXException { // TODO Auto-generated method stub } }
自定义类Data、JSONT
import java.util.HashMap;
import java.util.Map;

/**
 * Mutable value holder for one parsed record.
 *
 * <p>All properties are plain strings with a getter/setter pair each;
 * {@link #toJSONMap()} exposes them as a key/value map.
 */
public class Data {

    private String title;
    private String namespace;
    private String id;
    private String lastEsited;
    private String markup;
    private String target;
    private String type;

    /** @return the record id */
    public String getId() {
        return this.id;
    }

    /** @param id the record id */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the namespace value */
    public String getNamespace() {
        return this.namespace;
    }

    /** @param namespace the namespace value */
    public void setNamespace(String namespace) {
        this.namespace = namespace;
    }

    /** @return the record type */
    public String getType() {
        return this.type;
    }

    /** @param type the record type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the markup text */
    public String getMarkup() {
        return this.markup;
    }

    /** @param markup the markup text */
    public void setMarkup(String markup) {
        this.markup = markup;
    }

    /** @return the value stored under the (historically misspelled) "lastEsited" property */
    public String getLastEsited() {
        return this.lastEsited;
    }

    /** @param lastEsited the value to store under the "lastEsited" property */
    public void setLastEsited(String lastEsited) {
        this.lastEsited = lastEsited;
    }

    /** @return the target value */
    public String getTarget() {
        return this.target;
    }

    /** @param target the target value */
    public void setTarget(String target) {
        this.target = target;
    }

    /**
     * Copies the seven properties into a fresh map keyed by property name.
     * Unset properties are present with a {@code null} value.
     *
     * @return a new, independent {@link HashMap} on every call
     */
    public Map<String,Object> toJSONMap() {
        final Map<String,Object> fields = new HashMap<String,Object>();
        fields.put("id", id);
        fields.put("namespace", namespace);
        fields.put("type", type);
        fields.put("title", title);
        fields.put("markup", markup);
        fields.put("lastEsited", lastEsited);
        fields.put("target", target);
        return fields;
    }
}
/*
 * NextMap-Crawler Module
 *
 * Copyright (C) 2002-2014,Institute of Geographic Sciences and Natural Resources Research,
 * Chinese Academy of Sciences
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * version 2.1 of the License.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
*/ import java.io.IOException; import java.io.StringWriter; import java.util.List; import java.util.Map; import com.fasterxml.jackson.databind.ObjectMapper; /** * * @author zhuhaichuan * @date 2015-11-8 * * */ public class JSONT { public static String mapToJSONString(Map map) { StringWriter sw = new StringWriter(); try { ObjectMapper mapper = new ObjectMapper(); mapper.writeValue(sw,map); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return sw.toString(); } /** * * @param list * @return */ public static String listToJSONString(List list) { StringWriter sw = new StringWriter(); try { ObjectMapper mapper = new ObjectMapper(); mapper.writeValue(sw,list); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return sw.toString(); } /** * * @param list * @return */ public static String beanToJSONString(Object bean) { StringWriter sw = new StringWriter(); try { ObjectMapper mapper = new ObjectMapper(); mapper.writeValue(sw,bean); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return sw.toString(); } /** * * @param jsonstr * @return */ public static Map jsonToMap(String jsonstr) { Map map = null; try { ObjectMapper mapper = new ObjectMapper(); map = mapper.readValue(jsonstr,Map.class); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return map; } /** * * @param jsonstr * @return */ public static List jsonToList(String jsonstr) { List list = null; try { ObjectMapper mapper = new ObjectMapper(); list = mapper.readValue(jsonstr,List.class); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return list; } }还有就是类MyErrorHandler
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

/**
 * SAX {@link ErrorHandler} that prints every reported problem to standard
 * output as a five-line block: banner, line number, column number, message
 * and closing rule. No callback throws, so the handler itself never aborts
 * the parse.
 */
public class MyErrorHandler implements ErrorHandler {

    /** Prints a parser warning. */
    @Override
    public void warning(SAXParseException exception) throws SAXException {
        report("*******WARNING******", "exception信息:", "********************", exception);
    }

    /** Prints a recoverable parser error. */
    @Override
    public void error(SAXParseException exception) throws SAXException {
        report("******* ERROR ******", "exception信息:", "********************", exception);
    }

    /** Prints a fatal parser error. */
    @Override
    public void fatalError(SAXParseException exception) throws SAXException {
        report("******** FATAL ERROR ********", "exception信息", "*****************************", exception);
    }

    /**
     * Writes one report block to standard output.
     *
     * @param banner       first line of the block
     * @param messageLabel text printed directly in front of the exception message
     * @param footer       last line of the block
     * @param problem      the parse problem being reported
     */
    private static void report(String banner, String messageLabel, String footer,
            SAXParseException problem) {
        System.out.println(banner);
        System.out.println("行号:" + problem.getLineNumber());
        System.out.println("列号:" + problem.getColumnNumber());
        System.out.println(messageLabel + problem.getMessage());
        System.out.println(footer);
    }
}
自定义MongoDBT类
import java.util.ArrayList;
import java.util.List;

import com.mongodb.DB;
import com.mongodb.DBCollection;
import com.mongodb.DBObject;
import com.mongodb.Mongo;

/**
 * Minimal helper for bulk-inserting documents into a MongoDB collection.
 */
public class MongoDBT {

    /**
     * Inserts all documents of {@code list} into {@code dbname.collname} on
     * the given server, using a connection that is opened for this call and
     * closed again before returning, whether or not the insert succeeds.
     *
     * <p>A {@code null} or empty list is a no-op: no connection is opened.
     *
     * <p>Each call opens its own connection, so callers should pass reasonably
     * large batches rather than single documents.
     *
     * @param ip       host name or address of the MongoDB server
     * @param port     port of the MongoDB server
     * @param dbname   target database name
     * @param collname target collection name
     * @param list     documents to insert
     * @throws Exception if connecting or inserting fails
     */
    public static void writeListToMongo(String ip, int port, String dbname, String collname,
            List<DBObject> list) throws Exception {
        if (list == null || list.isEmpty()) {
            return;
        }
        Mongo mongo = new Mongo(ip, port);
        try {
            DB db = mongo.getDB(dbname);
            DBCollection collection = db.getCollection(collname);
            collection.insert(list);
        } finally {
            // Always release the connection, including when the insert throws.
            mongo.close();
        }
    }
}