首页 > Python模拟登录知乎,为什么是403 forbidden?

Python模拟登录知乎,为什么是403 forbidden?

我几乎把浏览器里所有的header项目都加进去了还是没用,以下是代码:

# Zhihu crawler: fetches the _xsrf token and captcha, then POSTs the
# login form through a cookie-aware opener so the session persists.
class ZhiHuSpider(object):
    def __init__(self):
        self.baseUrl = 'http://www.zhihu.com'
        self.captchaUrl = 'http://www.zhihu.com/captcha.gif?r='
        # One shared opener with a CookieJar: the _xsrf cookie set on the
        # first GET must accompany the captcha GET and the login POST.
        self.cookies = cookiejar.CookieJar()
        self.opener = request.build_opener(request.HTTPCookieProcessor(self.cookies))

    # Fetch the home page and extract the hidden _xsrf form token.
    # Returns the token string, or None when the page markup no longer
    # matches the pattern (site layout changed).
    def getXsrf(self):
        with self.opener.open(self.baseUrl) as f:
            html = f.read().decode('utf-8')
        xsrf_match = re.search(r'name="_xsrf" value="(.*?)"/>', html)
        if xsrf_match:
            return xsrf_match.group(1).strip()
        return None

    # Download the captcha image and save it to ./code.gif for the user
    # to read manually.
    def getCaptchaCode(self):
        with self.opener.open(self.captchaUrl) as f1:
            with open('code.gif', 'wb') as f2:
                f2.write(f1.read())
        print('验证码已保存在本地!请查看!')

    # Perform the login: build the form (token + credentials + captcha),
    # POST it with browser-like headers, and dump the response to
    # zhihu.html for inspection.
    def login(self):
        form = { '_xsrf': self.getXsrf(),
                 'email': 'email',
                 'password': 'password' }
        self.getCaptchaCode()
        code = input('请输入验证码:')
        form['captcha'] = code
        postdata = parse.urlencode(form)
        # BUG FIX: the original posted to baseUrl + '/#signin'. A URL
        # fragment is client-side only — urllib never sends it, so the
        # POST actually hit the homepage and the server answered 403.
        # Post to the real email-login endpoint instead.
        req = request.Request(self.baseUrl + '/login/email', postdata.encode('utf-8'))
        req.add_header('Host', 'www.zhihu.com')
        req.add_header('Origin', 'http://www.zhihu.com')
        req.add_header('Referer', 'http://www.zhihu.com/')
        req.add_header('X-Requested-With', 'XMLHttpRequest')
        req.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/45.0.2454.101 Chrome/45.0.2454.101 Safari/537.36')
        try:
            with self.opener.open(req) as f1:
                # encoding='utf-8' is required: the response body is
                # UTF-8 text, and the platform default (e.g. GBK on
                # Windows) would raise UnicodeEncodeError on write.
                with open('zhihu.html', 'w', encoding='utf-8') as f2:
                    f2.write(f1.read().decode('utf-8'))
        except error.URLError as e:
            print('登录失败, 原因:', e.reason)

   

if __name__ == '__main__':
    # Script entry point: run the interactive login flow.
    ZhiHuSpider().login()

因为知乎反爬了。


知乎有反爬虫的机制,可以参考 GitHub 上的知乎爬虫类库,Python 2.7 和 Python 3.4 版本都有。

【热门文章】
【热门文章】