第一步,从PDF转成TXT
注意:去除空格、空行等
# -*- coding: utf-8 -*-
"""Step 1 of the pipeline: dump the text boxes of pdf/sln.pdf into txt/sln.txt.

Every LTTextBox found by pdfminer becomes one output line: the newlines inside
the box are removed and surrounding whitespace is stripped.
"""
__author__ = 'wangfei'

import io
import sys
import os

from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *


def extract_text_boxes(pdf_path):
    """Return one stripped, single-line string per text box in the PDF.

    Args:
        pdf_path: path of the PDF file to read.

    Returns:
        A list of strings in the order pdfminer yields pages and layout
        objects.

    Raises:
        PDFTextExtractionNotAllowed: the document forbids text extraction.
    """
    lines = []
    # The parser reads lazily from the handle, so it has to stay open until
    # every page has been processed; `with` closes it afterwards (the
    # original script never closed it).
    with open(pdf_path, 'rb') as fp:
        # Build a parser on the file object and a document on top of it.
        parser = PDFParser(fp)
        doc = PDFDocument(parser)
        # Refuse documents that do not allow conversion to text.
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed
        # The resource manager holds resources shared between pages.
        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # Process the pages one at a time. After process_page() the device
        # returns the page layout, which holds the objects parsed from that
        # page (text boxes, figures, images, ...); only text boxes carry the
        # text we want.
        for page in PDFPage.create_pages(doc):
            interpreter.process_page(page)
            layout = device.get_result()
            for item in layout:
                if isinstance(item, LTTextBox):
                    # Join the box into a single line by deleting the '\n'.
                    text = item.get_text().replace('\n', '')
                    lines.append(text.strip())
    return lines


def write_lines(lines, txt_path):
    """Write each string of `lines` on its own line of a UTF-8 text file.

    Args:
        lines: iterable of text strings (assumed to be unicode text as
            returned by pdfminer's get_text() - TODO confirm for the
            installed pdfminer version).
        txt_path: destination file; its directory is created when missing.
    """
    out_dir = os.path.dirname(txt_path)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    # An explicit UTF-8 text writer replaces the old
    # reload(sys)/sys.setdefaultencoding('utf-8') hack. In text mode '\n' is
    # translated to os.linesep, which is what the original wrote by hand.
    with io.open(txt_path, 'w', encoding='utf-8') as out:
        out.writelines(u'%s\n' % line for line in lines)


# Paths are hard-coded for now; the plan is to walk the files of a folder
# instead. Runs on execution, exactly as the original flat script did.
write_lines(extract_text_boxes('pdf/sln.pdf'), 'txt/sln.txt')
第二步,从TXT到XML
首先是XML处理类(保存为 XMLGenerator.py,供后面的脚本导入):
__author__ = 'wangfei'

import xml.dom.minidom as Dom


class XMLGenerator(object):
    """Build an XML document node by node and write it to a file.

    Wraps one xml.dom.minidom Document (`self.doc`) together with the name of
    the file it is written to (`self.xml_name`).
    """

    def __init__(self, xml_name):
        """Start an empty document that genXML() will write to `xml_name`."""
        self.doc = Dom.Document()
        self.xml_name = xml_name

    def createNode(self, node_name):
        """Return a new, not yet attached element with tag `node_name`."""
        return self.doc.createElement(node_name)

    def addNode(self, node, pre_node=None):
        """Attach `node` under `pre_node`, or as document root when it is None.

        Returns:
            The node that was attached, so calls can be chained.
        """
        if pre_node is not None:
            pre_node.appendChild(node)
        else:
            self.doc.appendChild(node)
        return node

    def setNodeAttr(self, node, att_name, value):
        """Set attribute `att_name` of `node` to `value`.

        The previous version took no node parameter and referenced an
        undefined name, so it raised NameError on every call; the node is now
        the first argument, which is how callers already pass it.
        """
        node.setAttribute(att_name, value)

    def setNodeValue(self, cur_node, value):
        """Append `value` to `cur_node` as a text child."""
        cur_node.appendChild(self.doc.createTextNode(value))

    def genXML(self):
        """Write the document to `self.xml_name`, tab-indented, as UTF-8."""
        # With an explicit encoding toprettyxml() returns already encoded
        # data, so the file is opened in binary mode; `with` guarantees the
        # handle is closed even when the write fails.
        payload = self.doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8")
        with open(self.xml_name, "wb") as f:
            f.write(payload)
根据文本信息处理TXT文档,文本信息包括章节标题标记、图像、公式等
#! /usr/bin/env python
#coding:utf-8
"""Step 2 of the pipeline: turn txt/sln.txt into a <div>-based XML outline.

Line 1 of the text file is the title, line 2 the introduction. Every later
line is either a "2.<n>" section heading, a "2.<n>.<m>" subsection heading,
a paragraph, or is skipped. The XML file is named after the lower-cased title.
"""
import io
import os
import sys
import linecache
import re

from XMLGenerator import *

fileName = "txt/sln.txt"

# Patterns are compiled once instead of being rebuilt for every input line.
SECTION_RE = re.compile(r'^2\.(\d+)')
SUBSECTION_RE = re.compile(r'^2\.(\d+)\.(\d+)')
# Lines starting with "H.", two or more digits, "Fig" or "Chapter" are never
# treated as paragraphs.
NOT_PARAGRAPH_RE = re.compile(r'^(?:H\.|\d\d+|Fig|Chapter)')
# A non-heading line needs at least this many space-separated words to count
# as a paragraph.
MIN_PARAGRAPH_WORDS = 9


def _add_element(xml, tag, parent=None, node_id=None, css_class=None, text=None):
    """Create a `tag` element, set its id/class, attach it and return it.

    Attributes are set directly on the DOM element returned by createNode().
    `parent=None` makes the element the document root.
    """
    node = xml.createNode(tag)
    if node_id is not None:
        node.setAttribute("id", node_id)
    if css_class is not None:
        node.setAttribute("class", css_class)
    xml.addNode(node, parent)
    if text is not None:
        xml.setNodeValue(node, text)
    return node


def build_document(lines, xml_name):
    """Build the XML tree for `lines` and return the XMLGenerator holding it.

    Args:
        lines: the lines of the text file, in order, newline included.
        xml_name: file name handed to XMLGenerator for the later genXML().
    """
    xml = XMLGenerator(xml_name)
    article = _add_element(xml, "div", None, "article")
    title_node = _add_element(xml, "div", article, "tittle",
                              "ltx_title ltx_title_document")
    # NOTE(review): the intro <div> is nested inside the title <div>, as in
    # the original script - confirm this nesting is intended.
    intro_node = _add_element(xml, "div", title_node, "intru", "ltx_p")

    section_node = None
    # Where the next paragraph goes and how its id starts. Text seen before
    # the first heading is attached to the article root so it is not lost.
    para_parent = article
    para_prefix = "s2.0.0"
    para_count = 0  # paragraphs since the last heading

    for num, raw in enumerate(lines):
        text = raw.strip()
        if num == 0:
            xml.setNodeValue(title_node, text)
            continue
        if num == 1:
            xml.setNodeValue(intro_node, text)
            continue
        if not text:
            continue  # blank (or whitespace-only) line

        m_sec = SECTION_RE.search(raw)
        m_sub = SUBSECTION_RE.search(raw)
        if m_sub is not None:
            # "2.<n>.<m>": subsection of the current section (or of the
            # article when no section heading has been seen yet).
            owner = section_node if section_node is not None else article
            sub_node = _add_element(
                xml, "div", owner,
                "s" + "2.ss" + m_sub.group(1) + ".sss" + m_sub.group(2),
                "ltx_subsection")
            _add_element(xml, "h2", sub_node, None,
                         "ltx_title ltx_titleh_subsection", text)
            para_parent = sub_node
            # The original repeated the section number here; the second
            # component is the subsection number.
            para_prefix = ("s2." + str(int(m_sub.group(1))) + "." +
                           str(int(m_sub.group(2))))
            para_count = 0
        elif m_sec is not None:
            # "2.<n>": a new section directly under the article.
            section_node = _add_element(xml, "div", article,
                                        "s2.ss" + m_sec.group(1),
                                        "ltx_section")
            _add_element(xml, "h2", section_node, None,
                         "ltx_title ltx_titleh_section", text)
            para_parent = section_node
            para_prefix = "s2.ss" + str(int(m_sec.group(1)))
            para_count = 0
        elif (NOT_PARAGRAPH_RE.search(raw) is None
              and len(raw.split(' ')) >= MIN_PARAGRAPH_WORDS):
            # Paragraph: numbered from 1 after each heading and attached to
            # the innermost open section or subsection.
            para_count += 1
            _add_element(xml, "div", para_parent,
                         para_prefix + ".p" + str(para_count),
                         "ltx_para", text)
    return xml


def main():
    """Read `fileName`, build the outline and write "<title>.xml"."""
    try:
        # Reading as UTF-8 text replaces the old
        # reload(sys)/sys.setdefaultencoding('utf-8') hack.
        with io.open(fileName, 'r', encoding='utf-8') as fobj:
            lines = fobj.readlines()
    except IOError as e:
        print("*** file open error: %s" % e)
        return

    tittle = lines[0].strip().lower() if lines else ""
    if not tittle:
        # Empty input or empty first line: fall back to the input's base
        # name rather than producing a file called ".xml".
        tittle = os.path.splitext(os.path.basename(fileName))[0]
    build_document(lines, tittle + ".xml").genXML()


# Runs on execution, exactly as the original flat script did.
main()