源码:
#-*-coding:utf-8-*-
__author__="vpersie9"
import urllib2
import re
import threading
from lxml import etree
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
class Spider(object):
    """Scraper for the San Guo Zhi text hosted on guoxue.lishichunqiu.com.

    Fetches the index page, extracts per-volume URLs and titles, and
    appends each volume's plain text to a caller-supplied file object.
    """

    def __init__(self):
        # Index page that lists every volume of the work.
        self.url = "http://guoxue.lishichunqiu.com/shibu/sanguozhi/"
        # Desktop Chrome UA so the site serves the normal HTML page.
        self.user_agent = 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'
        self.headers = {'User-Agent': self.user_agent}
        # Cached (urls, titles) pair. Fix: the original called getTitle()
        # -- and therefore re-downloaded the index page -- up to three
        # times for every single volume scraped.
        self._index_cache = None

    def getTool(self, url):
        """Fetch *url* and return the body decoded as UTF-8, or None on error."""
        response = None
        try:
            request = urllib2.Request(url, headers=self.headers)
            response = urllib2.urlopen(request)
            return response.read().decode('utf-8')
        except urllib2.HTTPError as e:
            # HTTPError must be caught before URLError (it is a subclass).
            print(e)
        except urllib2.URLError as e:
            if hasattr(e, "code"):
                print(e.code)
            if hasattr(e, "reason"):
                print(e.reason)
        finally:
            # Fix: the original never closed the HTTP response.
            if response is not None:
                response.close()

    def getTitle(self):
        """Return (urls, titles) for all volumes, fetching the index once.

        Returns None if the index page could not be fetched or parsed.
        """
        if self._index_cache is not None:
            return self._index_cache
        try:
            result = self.getTool(self.url)
            selector = etree.HTML(result)
            main = selector.xpath('//table[@class="box no_doc"]/tbody/tr/td')[0]
            contentURL = main.xpath('ul//a/@href')
            title = main.xpath('ul//a/text()')
            self._index_cache = (contentURL, title)
            return self._index_cache
        except Exception as e:
            print(e)

    def getContent(self, URLnum, file):
        """Fetch volume *URLnum* and append its title and body text to *file*.

        Errors are printed and swallowed so one failed volume does not
        abort the other scraping threads.
        """
        try:
            urls, titles = self.getTitle()
            title = titles[URLnum]
            # Fix: fetch/parse once instead of duplicating the whole
            # pipeline in two near-identical branches.
            page = self.getTool(urls[URLnum])
            selector = etree.HTML(page)
            node = selector.xpath('//div[@id="maincontent"]/div[@id="content"]')
            text = node[0].xpath('string(.)')
            if URLnum == 0:
                # The first page carries a trailing site watermark; keep
                # only the text that precedes it.
                pattern = re.compile(u'(.*?)历史春秋网www.lishichunqiu.co', re.S)
                text = re.search(pattern, text).group(1)
            print(u'-------正在爬取第%d卷--------' % (URLnum + 1))
            file.write(title + "\n\n")
            file.write(text + "\n\n")
            # Fix: dropped the debug print that dumped the entire volume
            # text to stdout on every non-first volume.
        except Exception as e:
            print(e)
class Threads(threading.Thread):
    """Thin Thread wrapper: runs a stored callable with stored arguments.

    The *name* argument also becomes the thread's name (it overwrites the
    default assigned by threading.Thread.__init__).
    """

    def __init__(self, func, args, name):
        super(Threads, self).__init__()
        # Keep everything run() needs on the instance.
        self.func, self.args, self.name = func, args, name

    def run(self):
        """Invoke the stored callable in this thread."""
        target, arguments = self.func, self.args
        target(*arguments)
class Manager(object):
    """Drives the scrape: spawns one thread per volume, all writing to one file."""

    def __init__(self):
        self.spider = Spider()
        # Thread class kept as an attribute, matching the original design.
        self.threads = Threads

    def control(self):
        """Scrape every volume concurrently and write them to history.odt.

        NOTE(review): all worker threads write to the same file object
        without a lock, so volume order is nondeterministic and writes
        may interleave -- consider serializing the writes.
        """
        # Renamed the local from `file` (shadowed the builtin) to `out`.
        with open('history.odt', 'wb+') as out:
            volume_count = len(self.spider.getTitle()[0])
            workers = [
                self.threads(self.spider.getContent, (i, out),
                             self.spider.getContent.__name__)
                for i in range(volume_count)
            ]
            for worker in workers:
                worker.start()
            for worker in workers:
                worker.join()
            print(u"所有页面爬取完成")
            # Fix: removed the redundant file.close() inside the with-block;
            # the context manager already closes the file.
        print(u"所有数据保存完成")
if __name__ == "__main__":
    # Script entry point: run the full scrape end to end.
    Manager().control()
用 Office 打开脚本保存的 history.odt 文件时,显示的乱码内容如下:
ꃂ胣肀苩鶊볯鞭볤鞋볯覹飩낖蟩몺胣覱迥銾ꛧ讹郥龹胣覱鳦몺鳨貼鳦膧鿧薾胣뚗鯧麷믤论볥閣雥뢛볯鶊뻥躻매貼ꏨ鎰諨낛볯鲀郥뒹뿨莸跥貼뷤뎇ꓥ蚰蛥貼냥꾾胣鶀諨뮗럥뾥ꓥ袮뫥늾ꗥꮣ볯肾뻤覄胣袅룤骮鯧麷볯鶊룤ꮃ苩膘鷧芀藥뮸蟥뎇菩貼룤궯ꓥ螥매貼鏦몸菩꒻볯膿맥覱ꓥ袮胣肉鳥薸룤覜닦ꦻ볯몺룤
请问这是怎么回事?(文件是用 Office 打开的;用普通文本编辑器按 UTF-8 打开时是否正常,尚未确认。)