# coding:utf-8
"""Store the headers of unread 163.com mails in a MySQL table.

Python 2 script.  It logs in through ``Login163`` (provided by the
``login163`` module), pages through the unread-mail listing, which the
server returns as XML, simplifies that XML with regular expressions, parses
it with expat and inserts one row per mail into the ``mails`` table.
"""
# The star import stays first, as in the original file, so that the explicit
# imports below take precedence over any name ``login163`` re-exports.
from login163 import *

import getpass
import re
import sys
import urllib
import urllib2
from contextlib import closing
from xml.parsers import expat

import MySQLdb

# Request body for the listMessages call; the two %s slots are the start
# offset and the page size.
_UNREAD_QUERY_TEMPLATE = (
    '<?xml version="1.0"?><object><int name="fid">1</int>'
    '<boolean name="skipLockedFolders">false</boolean>'
    '<string name="order">date</string>'
    '<boolean name="desc">true</boolean>'
    '<int name="start">%s</int><int name="limit">%s</int>'
    '<boolean name="topFirst">false</boolean>'
    '<object name="filterFlags"><boolean name="read">false</boolean></object>'
    '<boolean name="returnTotal">true</boolean>'
    '<boolean name="returnTag">true</boolean></object>'
)

# Nested <object> elements would reset the record being built by
# Mail_Handler, so they are removed before parsing.
_NESTED_OBJECT_RE = re.compile(
    r'<object name="ctrls">.*?</object>|<object name="flags" />|<object name="flags">.*?</object>',
    re.S)
# Reduce the escaped "display name <address>" form to the bare address.
_FROM_RE = re.compile(r'<string name="from">.*?;(.*?@.*?)&.*?</string>')
_TO_RE = re.compile(r'<string name="to">.*?;(.*?@.*?)&.*?</string>')

# NOTE(review): the original call supplied only three positional arguments
# ('192.168.110.142', 'admin', 'test') to a constructor that requires four,
# so it raised TypeError before connecting.  The three original values are
# kept in positional order (host, user, password); DB_NAME is a placeholder
# guess -- set it to the schema that holds the `mails` table.
# NOTE(review): credentials are hard-coded; consider moving them to
# configuration.
DB_HOST = '192.168.110.142'
DB_USER = 'admin'
DB_PASSWORD = 'test'
DB_NAME = 'test'

# Number of unread mails requested per call.
PAGE_SIZE = 5


class mail163(Login163):
    """Unread-mail reader built on top of the ``Login163`` session.

    Relies on ``self.sid`` and ``self.headers``, which are not defined in
    this file -- presumably set by ``Login163``; verify against that class.
    """

    def get_unread_mail(self, start, limit):
        """Fetch up to ``limit`` unread mails beginning at offset ``start``.

        Returns the raw XML response when it contains the substring
        'subject' (i.e. at least one mail was listed), otherwise None.
        """
        postdata = urllib.urlencode(
            {'var': _UNREAD_QUERY_TEMPLATE % (start, limit)})
        url = ('http://twebmail.mail.163.com/js5/s?sid=' + self.sid +
               '&func=mBox:listMessages&deftabclick=t2&deftabclick=undefined'
               '&from=toolbar&type=unread&mBoxentry=1')
        req = urllib2.Request(url=url, data=postdata, headers=self.headers)
        # closing() releases the HTTP response even if read() raises.
        with closing(urllib2.urlopen(req)) as response:
            res = response.read()
        if 'subject' in res:
            return res
        return None

    def format(self, xml_data):
        """Simplify the listing XML and return the result.

        Drops the nested "ctrls"/"flags" objects and rewrites the "from" and
        "to" strings so that only the e-mail address remains.
        """
        xml_data = _NESTED_OBJECT_RE.sub('', xml_data)
        xml_data = _FROM_RE.sub(r'<string name="from">\1</string>', xml_data)
        xml_data = _TO_RE.sub(r'<string name="to">\1</string>', xml_data)
        return xml_data


class Db_Connect(object):
    """Thin wrapper around a MySQLdb connection."""

    def __init__(self, db_host, user, pwd, db_name, charset="utf8",
                 use_unicode=True):
        """Open the connection; print the error and exit(1) on failure."""
        try:
            # Keyword arguments make sure the credentials and the schema are
            # actually used (the original passed only the host).
            self.conn = MySQLdb.connect(
                host=db_host, user=user, passwd=pwd, db=db_name,
                charset=charset, use_unicode=use_unicode)
        except MySQLdb.OperationalError as e:
            print('Connect %s Failed' % db_host)
            print(e.args)
            sys.exit(1)

    def insert(self, sql, params=None):
        """Execute one statement and commit it.

        ``params`` is optional; when given, the values are bound by the
        driver (``%s`` placeholders) instead of being formatted into the SQL
        text.  Returns the value of ``cursor.execute`` on success, or None
        after printing the error when MySQLdb raises Warning or
        IntegrityError.
        """
        cursor = self.conn.cursor()
        try:
            n = cursor.execute(sql, params)
            # MySQLdb does not autocommit by default; without this the row
            # is lost on transactional tables when the connection closes.
            self.conn.commit()
            return n
        except (MySQLdb.Warning, MySQLdb.IntegrityError) as e:
            print(e.args)
            return None
        finally:
            cursor.close()

    def close(self):
        """Close the underlying connection."""
        self.conn.close()


class Mail_Handler(object):
    """expat callbacks that turn each <object> element into one table row.

    ``db_conn`` must provide ``insert(sql, params)`` as ``Db_Connect`` does.
    """

    # XML attribute names collected for every mail, in column order.
    _FIELDS = ('id', 'from', 'to', 'subject', 'size')
    # One driver-bound placeholder per column (the original template had
    # three placeholders for five values and could never be formatted).
    _INSERT_SQL = ('insert into mails(id,from_mail,to_mail,subject,size) '
                   'values(%s,%s,%s,%s,%s)')

    def __init__(self, data, db_conn):
        self.flag = False       # True while inside an element with attributes
        self.mail = {}          # fields of the mail currently being read
        self.curr_attrib = ''   # key under which character data is stored
        self.data = data        # XML text to parse
        self.db_conn = db_conn

    def start(self, name, attributes):
        """Start-tag callback: begin a new record or a new field."""
        if name == 'object':
            self.mail = {}
        # For <string name="id">...</string> the first attribute value is
        # "id", which becomes the key of the field.
        values = list(attributes.values())
        if values:
            self.curr_attrib = values[0]
            # Start from an empty value so that an empty element still
            # yields a key and chunks can be appended in character().
            self.mail[self.curr_attrib] = ''
            self.flag = True

    def end(self, name):
        """End-tag callback: on </object>, insert the collected record."""
        if name == 'object':
            try:
                values = [self.mail[field] for field in self._FIELDS]
                values[-1] = int(values[-1])    # the size column is numeric
            except (KeyError, ValueError):
                # Not a complete mail record; skip it instead of aborting
                # the whole parse.
                print('Skip incomplete mail record: %r' % (self.mail,))
            else:
                self.db_conn.insert(self._INSERT_SQL, tuple(values))
        self.flag = False

    def character(self, data):
        """Character-data callback: append text to the current field.

        expat may deliver the text of one element in several calls, so the
        chunks are concatenated rather than overwritten.
        """
        if self.flag:
            self.mail[self.curr_attrib] = (
                self.mail.get(self.curr_attrib, '') + data)

    def parser(self):
        """Parse ``self.data``, inserting a row for every complete mail."""
        p = expat.ParserCreate()
        p.StartElementHandler = self.start
        p.EndElementHandler = self.end
        p.CharacterDataHandler = self.character
        # The whole document is passed at once, so this is the final call.
        p.Parse(self.data, True)


def main():
    """Prompt for 163.com credentials and copy all unread mails to MySQL."""
    db_conn = Db_Connect(DB_HOST, DB_USER, DB_PASSWORD, DB_NAME)
    try:
        username = raw_input('Enter you email:')
        password = getpass.getpass('Enter you password:')
        login = mail163(username, password)
        sid = login.login()     # a falsy sid is treated as a failed login
        if not sid:
            return
        start = 0               # offset of the first mail of the next page
        while True:
            res = login.get_unread_mail(start, PAGE_SIZE)
            if res is None:     # no more unread mails
                break
            Mail_Handler(login.format(res), db_conn).parser()
            start += PAGE_SIZE
    finally:
        # Close the connection even when login or parsing raises.
        db_conn.close()


if __name__ == '__main__':
    main()
这段代码读取163邮箱中未读邮件的标题等信息,使用expat解析返回的XML数据,并将结果插入MySQL数据库。Login163类是"爬虫"一节中定义的类。