我尝试使用
XML minidom解析这个巨大的XML文档.虽然它在示例文件上运行良好,但在尝试处理真实文件时(约400 MB)它会阻塞系统.
我尝试从codereview为我的xml文件调整代码(它以流式方式处理数据而不是一次性内存加载),由于元素的嵌套特性,我无法隔离数据集.我之前处理过简单的XML文件,但不是像这样的内存密集型任务.
这是正确的方法吗?
如何将清单和发布者ID与每本书相关联?这就是我计划最终将两个表联系起来的方式.
任何反馈都非常感谢.
是book.xml
<BookDatabase> <BookHeader> <Name>BookData</Name> <BookUniverse>All</BookUniverse> <AsOfDate>2010-05-02</AsOfDate> <Version>1.1</Version> </BookHeader> <InventoryBody> <Inventory ID="12"> <PublisherClass ID="34"> <Publisher> <PublisherDetails> <Name>Microsoft Press</Name> <Type>Tech</Type> <ID>7462</ID> </PublisherDetails> </Publisher> </PublisherClass> <BookList> <Listing> <BookListSummary> <Date>2009-01-30</Date> </BookListSummary> <Book> <BookDetail ID="67"> <BookName>Code Complete 2</BookName> <Author>Steve McConnell</Author> <Pages>960</Pages> <ISBN>0735619670</ISBN> </BookDetail> <BookDetail ID="78"> <BookName>Application Architecture Guide 2</BookName> <Author>Microsoft Team</Author> <Pages>496</Pages> <ISBN>073562710X</ISBN> </BookDetail> </Book> </Listing> </BookList> </Inventory> <Inventory ID="64"> <PublisherClass ID="154"> <Publisher> <PublisherDetails> <Name>O'Reilly Media</Name> <Type>Tech</Type> <ID>7484</ID> </PublisherDetails> </Publisher> </PublisherClass> <BookList> <Listing> <BookListSummary> <Date>2009-03-30</Date> </BookListSummary> <Book> <BookDetail ID="98"> <BookName>Head First Design Patterns</BookName> <Author>Kathy Sierra</Author> <Pages>688</Pages> <ISBN>0596007124</ISBN> </BookDetail> </Book> </Listing> </BookList> </Inventory> </InventoryBody> </BookDatabase>
Python代码:
import sys import os #import MysqLdb from lxml import etree CATEGORIES = set(['BookHeader','Inventory','PublisherClass','PublisherDetails','BookDetail']) SKIP_CATEGORIES = set(['BookHeader']) DATA_ITEMS = ["Name","Type","ID","BookName","Author","Pages","ISBN"] def clear_element(element): element.clear() while element.getprevIoUs() is not None: del element.getparent()[0] def extract_book_elements(context): for event,element in context: if element.tag in CATEGORIES: yield element clear_element(element) def fast_iter2(context): for bookCounter,element in enumerate(extract_book_elements(context)): books = [book.text for book in element.findall("BookDetail")] bookdetail = { 'element' : element.tag,'ID' : element.get('ID') } for data_item in DATA_ITEMS: data = element.find(data_item) if data is not None: bookdetail[data_item] = data if bookdetail['element'] not in SKIP_CATEGORIES: #populate_database(bookdetail,books,cursor) print bookdetail,books print "========>",bookCounter,"<=======" def main(): #cursor = connectToDatabase() #cursor.execute("""SET NAMES utf8""") context = etree.iterparse("book.xml",events=("start","end")) #fast_iter(context,cursor) fast_iter2(context) #cursor.close() if __name__ == '__main__': main()
Python输出:
$python lxmletree_book.py ========> 0 <======= ========> 1 <======= {'ID': '12','element': 'Inventory'} [] ========> 2 <======= {'ID': '34','element': 'PublisherClass'} [] ========> 3 <======= {'Name': <Element Name at 0x105140af0>,'Type': <Element Type at 0x105140b40>,'ID': <Element ID at 0x105140b90>,'element': 'PublisherDetails'} [] ========> 4 <======= {'ID': None,'element': 'PublisherDetails'} [] ========> 5 <======= {'ID': None,'element': 'PublisherClass'} [] ========> 6 <======= {'ISBN': <Element ISBN at 0x105140eb0>,'Name': <Element Name at 0x105140dc0>,'Author': <Element Author at 0x105140e10>,'ID': '67','element': 'BookDetail','Pages': <Element Pages at 0x105140e60>} [] ========> 7 <======= {'ID': None,'element': 'BookDetail'} [] ========> 8 <======= {'ISBN': <Element ISBN at 0x1051460a0>,'Name': <Element Name at 0x105140f50>,'Author': <Element Author at 0x105140fa0>,'ID': '78','Pages': <Element Pages at 0x105146050>} [] ========> 9 <======= {'ID': None,'element': 'BookDetail'} [] ========> 10 <======= {'ID': None,'element': 'Inventory'} [] ========> 11 <======= {'ID': '64','element': 'Inventory'} [] ========> 12 <======= {'ID': '154','element': 'PublisherClass'} [] ========> 13 <======= {'Name': <Element Name at 0x105146230>,'Type': <Element Type at 0x105146280>,'ID': <Element ID at 0x1051462d0>,'element': 'PublisherDetails'} [] ========> 14 <======= {'ID': None,'element': 'PublisherDetails'} [] ========> 15 <======= {'ID': None,'element': 'PublisherClass'} [] ========> 16 <======= {'ISBN': <Element ISBN at 0x1051465f0>,'Name': <Element Name at 0x105146500>,'Author': <Element Author at 0x105146550>,'ID': '98','Pages': <Element Pages at 0x1051465a0>} [] ========> 17 <======= {'ID': None,'element': 'BookDetail'} [] ========> 18 <======= {'ID': None,'element': 'Inventory'} [] ========> 19 <=======
期望的输出(最终存储在MysqL中 – 现在是Python中的List):
Publishers InventoryID PublisherClassID Name Type ID 12 34 Microsoft Press Tech 7462 64 154 O'Reilly Media Tech 7484 Books PublisherID BookDetailID Name Author Pages ISBN 7462 67 Code Complete 2 Steve McConnell 960 0735619670 7462 78 Application Architecture Guide 2 Microsoft Team 496 073562710X 7484 98 Head First Design Patterns Kathy Sierra 688 0596007124
您可以尝试这样的事情:
import MysqLdb from lxml import etree import config def fast_iter(context,func,args=[],kwargs={}): # http://www.ibm.com/developerworks/xml/library/x-hiperfparse/ # Author: Liza Daly for event,elem in context: func(elem,*args,**kwargs) elem.clear() while elem.getprevIoUs() is not None: del elem.getparent()[0] del context def extract_paper_elements(element,cursor): pub={} pub['InventoryID']=element.attrib['ID'] try: pub['PublisherClassID']=element.xpath('PublisherClass/@ID')[0] except IndexError: pub['PublisherClassID']=None pub['PublisherClassID']=element.xpath('PublisherClass/@ID')[0] for key in ('Name','Type','ID'): try: pub[key]=element.xpath( 'PublisherClass/Publisher/PublisherDetails/{k}/text()'.format(k=key))[0] except IndexError: pub[key]=None sql='''INSERT INTO Publishers (InventoryID,PublisherClassID,Name,Type,ID) VALUES (%s,%s,%s) ''' args=[pub.get(key) for key in ('InventoryID','PublisherClassID','Name','ID')] print(args) # cursor.execute(sql,args) for bookdetail in element.xpath('descendant::BookList/Listing/Book/BookDetail'): pub['BookDetailID']=bookdetail.attrib['ID'] for key in ('BookName','Author','Pages','ISBN'): try: pub[key]=bookdetail.xpath('{k}/text()'.format(k=key))[0] except IndexError: pub[key]=None sql='''INSERT INTO Books (PublisherID,BookDetailID,Author,Pages,ISBN) VALUES (%s,%s) ''' args=[pub.get(key) for key in ('ID','BookDetailID','BookName','ISBN')] # cursor.execute(sql,args) print(args) def main(): context = etree.iterparse("book.xml",events=("end",),tag='Inventory') connection=MysqLdb.connect( host=config.HOST,user=config.USER,passwd=config.PASS,db=config.MYDB) cursor=connection.cursor() fast_iter(context,extract_paper_elements,args=(cursor,)) cursor.close() connection.commit() connection.close() if __name__ == '__main__': main()
>不要使用fast_iter2. original fast_iter
分开了来自特定处理功能的有用实用程序(extract_paper_elements). fast_iter2将两者混合在一起让你没有可重复的代码.>如果在etree.iterparse(“book.xml”中设置tag参数,events =(“end”,tag =’Inventory’)然后是你的处理函数extract_paper_elements只会看到Inventory元素.>给定Inventory元素,您可以使用xpath方法进行挖掘向下并刮掉所需的数据.> args和kwargs参数被添加到fast_iter so cursor中可以传递给extract_paper_elements.