PDF-TXT-XML

第一步，从PDF转成TXT

注意：转换时需要去除文本中多余的空格、换行和空行等。

__author__ = 'wangfei'
# -*- coding: utf-8 -*-
import sys
import os
reload(sys)
sys.setdefaultencoding('utf-8')
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
# Step 1 of the pipeline: extract the text boxes of one PDF and write them,
# one per line, to a text file.
# The input path is hard-coded for now; later this should iterate over the
# files found in a folder.

# One entry per non-empty text box, in the order pdfminer yields them.
text_boxes = []

# Pages are parsed from the open file while iterating, so all pdfminer work
# stays inside the ``with`` block; the handle is closed even if parsing fails.
with open('pdf/sln.pdf', 'rb') as fp:
    # Build a PDF parser on top of the file object.
    parser = PDFParser(fp)
    # Create the PDF document object.
    doc = PDFDocument(parser)
    # Refuse to continue when the document does not allow text extraction.
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    # Resource manager holding resources shared between pages.
    rsrcmgr = PDFResourceManager()
    # Device object that aggregates each page into a layout tree.
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    # Process the pages one at a time. ``layout`` is the LTPage result for
    # the page; only its LTTextBox children are used here.
    for page in PDFPage.create_pages(doc):
        interpreter.process_page(page)
        layout = device.get_result()
        for x in layout:
            if isinstance(x, LTTextBox):
                # Join the lines of a box by deleting the newlines, then
                # trim surrounding whitespace.
                string = x.get_text().replace('\n', '').strip()
                # Skip boxes that end up empty so the output has no blank
                # lines.
                if string:
                    text_boxes.append(string)

# Write the collected text into the txt directory, one box per line.
ls = os.linesep
with open('txt/sln.txt', 'wb') as fObj:
    # The file is binary, so encode explicitly instead of relying on the
    # interpreter's default encoding.
    fObj.writelines([('%s%s' % (x, ls)).encode('utf-8') for x in text_boxes])

第二步，从TXT到XML

首先是 XML 处理类（保存为 XMLGenerator.py，供后面的脚本通过 from XMLGenerator import * 导入）：

__author__ = 'wangfei'

import xml.dom.minidom as Dom


class XMLGenerator:
    """Small helper around ``xml.dom.minidom`` for building and saving a document.

    The instance owns one ``Document`` (``self.doc``) and the path it will be
    written to (``self.xml_name``).
    """

    def __init__(self, xml_name):
        """Create an empty document that ``genXML`` will write to *xml_name*."""
        self.doc = Dom.Document()
        self.xml_name = xml_name

    def createNode(self, node_name):
        """Return a new, not yet attached element named *node_name*."""
        return self.doc.createElement(node_name)

    def addNode(self, node, pre_node=None):
        """Attach *node* to *pre_node*, or to the document when *pre_node* is None.

        Returns *node* so the caller can keep working with it.
        """
        parent = self.doc if pre_node is None else pre_node
        parent.appendChild(node)
        return node

    def setNodeAttr(self, node, att_name, value):
        """Set attribute *att_name* of *node* to *value*.

        The element is taken as the first argument; the previous version read
        an undefined name and therefore always raised ``NameError``.
        """
        node.setAttribute(att_name, value)

    def setNodeValue(self, cur_node, value):
        """Append *value* to *cur_node* as a text node."""
        cur_node.appendChild(self.doc.createTextNode(value))

    def genXML(self):
        """Write the document to ``self.xml_name`` as pretty-printed UTF-8 XML."""
        # toprettyxml() returns encoded bytes when ``encoding`` is given, so
        # the file is opened in binary mode; ``with`` closes it on error too.
        with open(self.xml_name, "wb") as f:
            f.write(self.doc.toprettyxml(indent="\t", newl="\n", encoding="utf-8"))

根据文本信息处理TXT文档，文本信息包括章节标题标记、图像、公式等

#! /usr/bin/env python
#coding:utf-8
import sys
import linecache
import re
from XMLGenerator import *

reload(sys)
sys.setdefaultencoding('utf-8')


# Step 2 of the pipeline: turn the text file into an XML tree of <div>
# elements. Line 1 of the input fills the "tittle" node, line 2 the "intru"
# node; every later non-empty line is classified as a section heading, a
# subsection heading, a paragraph, or is skipped.
fileName = "txt/sln.txt"

# Line classifiers, compiled once instead of on every input line.
# NOTE(review): the leading "2" is hard-coded, so only headings of the form
# "2.x" / "2.x.y" are recognised -- confirm this is intended.
p_set = re.compile(r'^2\.(\d+)')         # section heading such as "2.3"
p_sub = re.compile(r'^2\.(\d+)\.(\d+)')  # subsection heading such as "2.3.1"
p_num = re.compile(r'^\d\d+')            # line starting with two or more digits
p_fig = re.compile(r'^Fig')
p_ch = re.compile(r'^Chapter')
p_h = re.compile(r'^H\.')

try:
    fobj = open(fileName, 'r')
except IOError as e:
    print("*** file open error: %s" % e)
else:
    # The lower-cased first line of the input is used as the output file name.
    tittle = linecache.getline(fileName, 1).lower().strip('\n')
    xmlFile = XMLGenerator(tittle + ".xml")

    # Root element: <div id="article">.
    node_article = xmlFile.createNode("div")
    xmlFile.setNodeAttr(node_article, "id", "article")
    xmlFile.addNode(node=node_article)

    # Title element, child of the article.
    node_tittle = xmlFile.createNode("div")
    xmlFile.setNodeAttr(node_tittle, "id", "tittle")
    xmlFile.setNodeAttr(node_tittle, "class", "ltx_title ltx_title_document")
    xmlFile.addNode(node_tittle, node_article)

    # "intru" element, child of the title element.
    node_intru = xmlFile.createNode("div")
    xmlFile.setNodeAttr(node_intru, "id", "intru")
    xmlFile.setNodeAttr(node_intru, "class", "ltx_p")
    xmlFile.addNode(node_intru, node_tittle)

    # Placeholders for the current section / subsection. They are not
    # attached to the document.
    # NOTE(review): content that appears before the first heading is attached
    # to these detached nodes and therefore does not reach the output.
    node_section = xmlFile.createNode("div")
    node_sub = xmlFile.createNode("div")

    # Bookkeeping that is written here but not read again in this script.
    # n_sec maps (section, subsection) to the index k of the heading line,
    # with subsection 0 standing for the section heading itself; a dict is
    # used so that large section numbers cannot overflow a fixed table.
    n_sec = {}
    section = []

    k = 0            # running index of the lines that were kept
    sec = 0          # value of k at the most recent section heading
    sub = 0          # value of k at the most recent subsection heading
    sec_info = 0     # number of the most recent section
    sub_info_1 = 0   # section part of the most recent subsection number
    sub_info_2 = 0   # subsection part of the most recent subsection number

    # ``with`` closes the input even if processing a line raises.
    with fobj:
        for num, eachLine in enumerate(fobj):
            text = eachLine.strip()
            if num == 0:
                xmlFile.setNodeValue(node_tittle, text)
                continue
            if num == 1:
                xmlFile.setNodeValue(node_intru, text)
                continue
            if eachLine == "\n":
                continue

            m_set = p_set.search(eachLine)
            m_sub = p_sub.search(eachLine)
            words = len(eachLine.split(' '))
            excluded = (
                p_h.search(eachLine) is not None
                or p_num.search(eachLine) is not None
                or p_fig.search(eachLine) is not None
                or p_ch.search(eachLine) is not None
            )
            # Headings are always kept; any other line is kept only when it
            # matches none of the exclusion patterns and has at least 9
            # space-separated words.
            if m_set is None and (excluded or words < 9):
                continue

            k += 1
            section.append(text)

            if m_set is not None and m_sub is None:
                # Section heading: open a new section under the article.
                sec_info = int(m_set.group(1))
                n_sec[(sec_info, 0)] = k
                sec = k
                node_section = xmlFile.createNode("div")
                xmlFile.setNodeAttr(node_section, "id", "s2.ss" + m_set.group(1))
                xmlFile.setNodeAttr(node_section, "class", "ltx_section")
                xmlFile.addNode(node_section, node_article)

                node_st = xmlFile.createNode("h2")
                xmlFile.setNodeAttr(node_st, "class", "ltx_title ltx_titleh_section")
                xmlFile.addNode(node_st, node_section)
                xmlFile.setNodeValue(node_st, text)
            elif m_set is not None and m_sub is not None:
                # Subsection heading: open a new subsection under the
                # current section.
                sub_info_1 = int(m_sub.group(1))
                sub_info_2 = int(m_sub.group(2))
                n_sec[(sub_info_1, sub_info_2)] = k
                sub = k
                node_sub = xmlFile.createNode("div")
                xmlFile.setNodeAttr(
                    node_sub, "id",
                    "s" + "2.ss" + m_sub.group(1) + ".sss" + m_sub.group(2))
                xmlFile.setNodeAttr(node_sub, "class", "ltx_subsection")
                xmlFile.addNode(node_sub, node_section)

                node_st = xmlFile.createNode("h2")
                xmlFile.setNodeAttr(node_st, "class", "ltx_title ltx_titleh_subsection")
                xmlFile.addNode(node_st, node_sub)
                xmlFile.setNodeValue(node_st, text)
            else:
                # Paragraph: it belongs to whichever heading came last. The
                # paragraph number counts kept lines since that heading.
                node_para = xmlFile.createNode("div")
                if sec > sub:
                    para_id = "s2.ss" + str(sec_info) + ".p" + str(k - sec)
                    parent = node_section
                else:
                    para_id = ("s2." + str(sub_info_1) + "." + str(sub_info_2)
                               + ".p" + str(k - sub))
                    parent = node_sub
                xmlFile.setNodeAttr(node_para, "id", para_id)
                xmlFile.setNodeAttr(node_para, "class", "ltx_para")
                xmlFile.addNode(node_para, parent)
                xmlFile.setNodeValue(node_para, text)

    # Write the finished tree to disk.
    xmlFile.genXML()

PDF-TXT-XML

猜你在找的XML相关文章