首页 > python 爬取三国志 终端打印正常;保存成 txt文件 出现乱码 。

python 爬取三国志 终端打印正常;保存成 txt文件 出现乱码 。

源码:

#-*-coding:utf-8-*-
__author__="vpersie9"

import urllib2
import re
import threading
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

class Spider(object):
    def __init__(self):
        self.url="http://guoxue.lishichunqiu.com/shibu/sanguozhi/"
        self.user_agent='Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'
        self.headers={'User-Agent':self.user_agent}
    
    def getTool(self,url):
        try:
            request=urllib2.Request(url,headers=self.headers)
            response=urllib2.urlopen(request)
            result=response.read().decode('utf-8')
            return result
        except urllib2.HTTPError,e:
            print e
        except urllib2.URLError,e:
            if hasattr(e,"code"):
                print e.code
            if hasattr(e,"reason"):
                print e.reason
    
    def getTitle(self):
        try:
            result=self.getTool(self.url)
            selector=etree.HTML(result)
            main=selector.xpath('//table[@class="box no_doc"]/tbody/tr/td')[0]
            contentURL=main.xpath('ul//a/@href')
            title=main.xpath('ul//a/text()')
            return contentURL,title
        except Exception,e:
            print e

    def getContent(self,URLnum,file):
        try:
            title=self.getTitle()[1][URLnum]
            if URLnum==0:            
                response=self.getTool(self.getTitle()[0][URLnum])
                html=etree.HTML(response)
                main=html.xpath('//div[@id="maincontent"]/div[@id="content"]')            
                main=main[0].xpath('string(.)')
                Compile=re.compile(u'(.*?)历史春秋网www.lishichunqiu.co',re.S)
                context=re.search(Compile,main).group(1)
                print u'-------正在爬取第%d卷--------'%(URLnum+1)
                
                file.write(title+"\n\n")
                file.write(context+"\n\n")
            else:
                result=self.getTool(self.getTitle()[0][URLnum])
                selector=etree.HTML(result)
                content=selector.xpath('//div[@id="maincontent"]/div[@id="content"]')
                 
                
                print u'-------正在爬取第%d卷--------'%(URLnum+1)    
            
                file.write(title+"\n\n")
                file.write(content[0].xpath('string(.)')+"\n\n")
                print content[0].xpath('string(.)')
        except Exception,e:
            print e

class Threads(threading.Thread):

    def __init__(self,func,args,name):
        super(Threads,self).__init__()
        self.func=func
        self.args=args
        self.name=name
        
    def run(self):
        self.func(*self.args)

class Manager(object):
    
    def __init__(self):
        self.spider=Spider()
        self.threads=Threads
    
    def control(self):
        with open('history.odt','wb+') as file:
            URLnum=len(self.spider.getTitle()[0])
            threads_function=[]
            for item in range(URLnum):
                function=self.threads(self.spider.getContent,(item,file),self.spider.getContent.__name__)
                threads_function.append(function)
            for i in range(URLnum):
                threads_function[i].start()
            for i in range(URLnum):
                threads_function[i].join()
            print u"所有页面爬取完成"
        file.close()
        print u"所有数据保存完成"

if __name__=="__main__":
    manager=Manager()
    manager.control()

txt 文件的乱码格式为:
ꃂ胣肀苩鶊볯鞭볤鞋볯覹飩낖蟩몺胣覱迥銾ꛧ讹郥龹胣覱鳦몺鳨貼鳦膧鿧薾胣뚗鯧麷믤论볥閣雥뢛볯鶊뻥躻매貼ꏨ鎰諨낛볯鲀郥뒹뿨莸跥貼뷤뎇ꓥ蚰蛥貼냥꾾胣鶀諨뮗럥뾥ꓥ袮뫥늾ꗥꮣ볯肾뻤覄胣袅룤骮鯧麷볯鶊룤ꮃ苩膘鷧芀藥뮸蟥뎇菩貼룤궯ꓥ螥매貼鏦몸菩꒻볯膿맥覱ꓥ袮胣肉鳥薸룤覜닦ꦻ볯몺룤

请问这是怎么回事?


使用 OFFICE 打开

【热门文章】
【热门文章】