
Python crawler

I keep hitting 403 errors while scraping Douban. How do I get around this?

import requests
from bs4 import BeautifulSoup
import urllib
import re

username = 'xxxxxxxxxx@qq.com'
password = 'xxxxxx'
loginUrl = 'https://accounts.douban.com/login'
redir="https://www.douban.com/people/141360318/notes"
formData = {
    "redir": redir,
    "form_email": username,
    "form_password": password,
    "login": u'登录'
}
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 6.1) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    # Advertise only encodings requests can decode out of the box (no 'br'/brotli).
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.8,zh-CN;q=0.6,zh;q=0.4',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    # Content-Length is omitted on purpose: requests computes it for each POST,
    # and header values have to be strings anyway.
    'Content-Type': 'application/x-www-form-urlencoded'
}
with requests.Session() as s:
    # First POST: submit the login form without a captcha answer.
    r = s.post(loginUrl, data=formData, headers=headers)
    page = r.content
    # Dump the returned HTML so it can be inspected to see whether the login succeeded.
    with open('login.html', 'w') as fp:
        fp.write(page)
    soup = BeautifulSoup(page, "html.parser")

    # Douban normally answers a scripted login with a captcha form
    # (this line raises if no captcha image is present).
    captchaAddr = soup.find('img', id='captcha_image')['src']
    print captchaAddr

    reCaptchaID = r'<input type="hidden" name="captcha-id" value="(.*?)"/'
    captchaID = re.findall(reCaptchaID, page)
    # print captchaID

    # Save the captcha image and ask the user to type it in.
    urllib.urlretrieve(captchaAddr, "captcha.jpg")
    captcha = raw_input('please input the captcha:')

    # Second POST: resubmit the form together with the captcha solution.
    formData['captcha-solution'] = captcha
    formData['captcha-id'] = captchaID[0]  # findall returns a list; send the string, not the list
    r = s.post(loginUrl, data=formData, headers=headers)
    # print r.content
    print r.url
    print r.status_code
    print r.cookies

    # The session already carries the login cookies, so they do not need to be passed again.
    r = s.get('https://www.douban.com/people/141360318/notes')
    print r.headers
    print r.status_code
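
It also helps to pin down which request actually draws the 403 and whether the login went through at all. A small diagnostic sketch that could replace the final prints inside the with requests.Session() block above; the check against accounts.douban.com is an assumption about where Douban leaves you after a failed login, not documented behaviour:

    def show(label, resp):
        # One line per response makes it obvious where the 403 first appears.
        print label, resp.status_code, resp.url
        if resp.status_code == 403:
            # The first few hundred bytes are usually enough to see the block page.
            print resp.content[:300]

    show('second login POST:', r)
    # Assumption: a successful login redirects away from accounts.douban.com.
    if 'accounts.douban.com' in r.url:
        print 'still on the login page - the login itself probably failed'

    r = s.get('https://www.douban.com/people/141360318/notes')
    show('notes GET:', r)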

Douban is not something you can scrape just because you want to!
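
If the login is fine and the 403 only shows up after a few requests, it is most likely Douban's anti-scraping layer rate-limiting the account or IP. A sketch of a slower, politer fetch loop, assuming the logged-in session s from the question; the URL list, the delays and the single-retry policy are arbitrary illustrative choices, not anything Douban documents:

import time

noteUrls = [
    'https://www.douban.com/people/141360318/notes',
    # ...more pages of the same user's notes would go here
]

for url in noteUrls:
    r = s.get(url)
    if r.status_code == 403:
        # Back off once and retry; if it is still blocked, stop instead of hammering the site.
        time.sleep(30)
        r = s.get(url)
        if r.status_code == 403:
            print 'still blocked on', url
            break
    # Parse the page here, e.g. BeautifulSoup(r.content, "html.parser")
    time.sleep(2)  # pause between requests; the exact value is a guess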
