
How should I fix HTTP Error 400 in a Python 3 crawler?

#encoding:UTF-8
import urllib.request
from io import BytesIO   # gzip payloads are bytes, so BytesIO rather than StringIO
import gzip
import json

m = {}          # stock code -> follower count
SZ = []
SH = []
SHcode = []
SHname = 'SH{code}'

# Shanghai A-share codes: 600000-601999 and 603000-603999
for i in range(600000, 602000):
    SHcode.append(i)
for i in range(603000, 604000):
    SHcode.append(i)
for i in SHcode:
    SH.append(SHname.format(code=i))

# Shenzhen codes: SZ000000-SZ002735, zero-padded to six digits
for x in range(2736):
    SZ.append('SZ{:06d}'.format(x))

SH.extend(SZ)   # combined list of all codes to query

# route every request through an HTTP proxy
proxy_support = urllib.request.ProxyHandler({'http': '23.94.37.50:3128'})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

for stockcode in SH:
    url = 'http://xueqiu.com/recommend/pofriends.json?type=1&code={code}&start=0&count=14&_=1448755098444'
    url = url.format(code=stockcode)   # interpolate the single code, not the whole SH list
    req = urllib.request.Request(url, headers = {
        'Host': 'xueqiu.com',
        'Connection': 'keep-alive',
        'Cache-Control': 'no-cache',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Referer':'http://xueqiu.com/S/SZ002024',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': 's=9e711qyz8y; xq_a_token=3fb8ee46a1428ecd37fb311807023ff326fb2805; __utmt=1; Hm_lvt_1db88642e346389874251b5a1eded6e3=1448387849,1448535056,1448549194,1448755071; Hm_lpvt_1db88642e346389874251b5a1eded6e3=1448755084; __utma=1.442364867.1448436216.1448579882.1448755070.12; __utmb=1.3.10.1448755070; __utmc=1; __utmz=1.1448436216.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
        'RA-Ver': '3.0.7',
        'RA-Sid':'655102BF-20150723-085431-c809af-3fa054',
        'X-Forwarded-For': '23.94.37.50',
    })        
    oper = urllib.request.urlopen(req)
    if oper.info().get('Content-Encoding') == 'gzip':   # server compressed the response
        content = oper.read()
        data    = BytesIO(content)            # gzip needs a bytes buffer, not StringIO
        gzipper = gzip.GzipFile(fileobj=data)
        html    = gzipper.read()
    else:
        html = oper.read()

    html = html.decode('utf-8', 'ignore')   # the JSON payload is UTF-8-encoded, not GBK

    result = json.loads(html)    # parse the JSON response
    b = result['totalcount']

    if b == 0:
        continue
    else:
        m[stockcode] = b
        output = 'stock code: {stockcode}  followers: {b}'
        output = output.format(stockcode=stockcode, b=b)
        print(output)


n = list(m.keys())   # snapshot the codes before m is rebound below
m = sorted(m.items(), key=lambda d: d[1], reverse=True)   # sort by follower count, descending

with open('C:/Python34/test.txt', 'wt') as f:
    print(n, file=f)    # write the list of codes with followers to disk

print(m)
    
    

I wrote this crawler yesterday to scrape follower counts for some stocks on Xueqiu. It ran fine when I tested it yesterday, but today, without changing a single line, it stopped working.
Normally this kind of thing is the site's anti-crawling measures, and setting the headers fixes it, but my headers are copied verbatim from the browser's request, so in theory they should be fine. In case my IP was banned, I also configured a few proxy IPs through proxy_support, but it still fails.

I really can't figure out where the problem is, so I'm asking for your help.

Update: if I send the request without any headers, I get Error 403 Forbidden.
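
A quick way to see what the server is actually rejecting is to print the URL before sending it and to catch urllib.error.HTTPError so the status and response body become visible. This is a minimal debugging sketch around the same urllib calls used above (the fetch name is just for illustration):

import urllib.request
import urllib.error

def fetch(url, headers):
    # An over-long or malformed URL (e.g. a whole list interpolated
    # into {code}) shows up immediately here.
    print('requesting:', url[:200])
    req = urllib.request.Request(url, headers=headers)
    try:
        return urllib.request.urlopen(req)
    except urllib.error.HTTPError as e:
        # 400/403 responses often carry a body that explains the rejection
        print('HTTP', e.code, e.reason)
        print(e.read()[:500])
        raise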


Is your cookie hardcoded? It may have expired.
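
If a stale cookie is the problem, one option is to let a CookieJar pick up fresh cookies from a normal page visit instead of hardcoding them. A minimal sketch, assuming xueqiu.com sets the needed session tokens (such as xq_a_token) on a plain homepage request:

import urllib.request
import http.cookiejar

jar = http.cookiejar.CookieJar()
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(jar))
opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64)')]

opener.open('http://xueqiu.com/')    # the homepage response sets session cookies
for c in jar:
    print(c.name, '=', c.value)      # check that xq_a_token etc. arrived

# Requests made through this opener now send the fresh cookies automatically.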


You don't need that many request headers; try keeping only User-Agent and Cookie.
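
For reference, a stripped-down version of the request with just those two headers (the code SH600000 is one of the codes generated above; the cookie value is a placeholder, substitute your own current session string):

import urllib.request

url = 'http://xueqiu.com/recommend/pofriends.json?type=1&code=SH600000&start=0&count=14'
req = urllib.request.Request(url, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
    'Cookie': 'xq_a_token=<your token>; s=<your session id>',   # placeholder values
})
oper = urllib.request.urlopen(req)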
