我几乎把浏览器里所有的 header 项都加进去了,还是没用。以下是代码:
# Zhihu spider: simulates the legacy email/password login flow of zhihu.com.
class ZhiHuSpider(object):
    def __init__(self):
        # Base site URL and captcha endpoint; the 'r=' value (a millisecond
        # timestamp) is appended per-request in getCaptchaCode().
        self.baseUrl = 'http://www.zhihu.com'
        self.captchaUrl = 'http://www.zhihu.com/captcha.gif?r='
        # Cookie jar + opener so the session cookies issued at login are
        # automatically sent with every later request through this opener.
        self.cookies = cookiejar.CookieJar()
        self.opener = request.build_opener(request.HTTPCookieProcessor(self.cookies))

    def getXsrf(self):
        """Fetch the home page and extract the hidden ``_xsrf`` CSRF token.

        Returns the token string, or None when the markup no longer matches
        (e.g. the site changed its page layout).
        """
        with self.opener.open(self.baseUrl) as f:
            html = f.read().decode('utf-8')
        xsrf_match = re.search(r'name="_xsrf" value="(.*?)"/>', html)
        if xsrf_match:
            return xsrf_match.group(1).strip()

    def getCaptchaCode(self):
        """Download the current captcha image and save it as ./code.gif."""
        import time
        # FIX: the original request left 'r=' empty; the site's login page
        # appends a millisecond timestamp here (it also defeats caching).
        url = self.captchaUrl + str(int(time.time() * 1000))
        with self.opener.open(url) as f1:
            with open('code.gif', 'wb') as f2:
                f2.write(f1.read())
        print('验证码已保存在本地!请查看!')

    def login(self):
        """Simulate login: CSRF token + credentials + manually typed captcha.

        Writes the response page to zhihu.html on success; prints the
        failure reason on URLError.
        """
        form = {
            '_xsrf': self.getXsrf(),
            # Placeholders — substitute the real account credentials.
            'email': 'email',
            'password': 'password',
        }
        self.getCaptchaCode()
        code = input('请输入验证码:')
        form['captcha'] = code
        postdata = parse.urlencode(form)
        # FIX: '/#signin' actually POSTs to '/' because a URL fragment is
        # never sent to the server; the email-login endpoint at the time
        # this code targeted was /login/email.
        req = request.Request(self.baseUrl + '/login/email', postdata.encode('utf-8'))
        # Browser-like headers; X-Requested-With marks the POST as AJAX.
        req.add_header('Host', 'www.zhihu.com')
        req.add_header('Origin', 'http://www.zhihu.com')
        req.add_header('Referer', 'http://www.zhihu.com/')
        req.add_header('X-Requested-With', 'XMLHttpRequest')
        req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/45.0.2454.101 Chrome/45.0.2454.101 Safari/537.36')
        try:
            with self.opener.open(req) as f1:
                # FIX: write with explicit UTF-8 so the Chinese response page
                # saves correctly on platforms with a non-UTF-8 default.
                with open('zhihu.html', 'w', encoding='utf-8') as f2:
                    f2.write(f1.read().decode('utf-8'))
        except error.URLError as e:
            print('登录失败, 原因:', e.reason)
if __name__ == '__main__':
    # Script entry point: build a spider and run the interactive login flow.
    zhihu_spider = ZhiHuSpider()
    zhihu_spider.login()
这是因为知乎有反爬虫机制,单纯补全请求头是不够的。可以参考 GitHub 上现成的知乎爬虫类库,Python 2.7 和 Python 3.4 版本都有。