[Help] How do I fix timeouts when crawling pages in a loop?

First, the output from a run:

    Crawling page 1..
    Traceback (most recent call last):
      File "fetch2.py", line 48, in <module>
        get_Pages(html)
      File "fetch2.py", line 42, in get_Pages
        urllib.urlretrieve(ob_url, '%s2.html' % (i+1))
      File "/usr/lib/python2.7/urllib.py", line 94, in urlretrieve
        return _urlopener.retrieve(url, filename, reporthook, data)
      File "/usr/lib/python2.7/urllib.py", line 240, in retrieve
        fp = self.open(url, data)
      File "/usr/lib/python2.7/urllib.py", line 208, in open
        return getattr(self, name)(url)
      File "/usr/lib/python2.7/urllib.py", line 345, in open_http
        h.endheaders(data)
      File "/usr/lib/python2.7/httplib.py", line 975, in endheaders
        self._send_output(message_body)
      File "/usr/lib/python2.7/httplib.py", line 835, in _send_output
        self.send(msg)
      File "/usr/lib/python2.7/httplib.py", line 797, in send
        self.connect()
      File "/usr/lib/python2.7/httplib.py", line 778, in connect
        self.timeout, self.source_address)
      File "/usr/lib/python2.7/socket.py", line 571, in create_connection
        raise err
    IOError: [Errno socket error] [Errno 110] Connection timed out

As the title says: I wrote a script to download a series of pages in a loop. Here is the script (the traceback above refers to these line numbers):

     1 # coding:utf-8
     2 import urllib
     3 import urllib2
     4 from bs4 import BeautifulSoup
     5 import re
     6 import mechanize
     7
     8
     9
    10 def get_Html(set_url):
    11
    12     # br = mechanize.Browser()
    13     # br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Chrome/17.0.963.56 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
    14     # browser = br.open(set_url)
    15     # set_url = 'http://search.jd.com/Search?keyword=iphone6'
    16     html = urllib2.urlopen(set_url)
    17     return html
    18     # local = 'jd.html'
    19     # urllib.urlretrieve(set_url, local)
    20     # print 'finished'
    21
    22 def get_Pages(Html):
    23
    24     soup = BeautifulSoup(Html, 'lxml')
    25     link_head = soup.find_all('link')[0]
    26     firstPart_url = link_head['href']
    27
    28     pgNum = soup.find_all(class_='fp-text')[0].find_all('i')[0]
    29     round_Num = int(pgNum.string)  # page count from the tag, used as the loop bound
    30
    31     get_ob_url = soup.find_all(class_='f-search')[0].find_all('a')[0]
    32     res = get_ob_url['data-url']
    33     reg = re.compile('.*6&')
    34     secondPart_url = reg.findall(res)[0]
    35     # print secondPart_url
    36
    37     # print range(round_Num)  # check that the int() conversion worked
    38     for i in range(round_Num):
    39         ob_url = 'http:' + firstPart_url + secondPart_url + 'page=%s&click=0' % (i+1)  # URL assembled
    40         # print ob_url
    41         print 'Crawling page ' + str(i+1) + '..'
    42         urllib.urlretrieve(ob_url, '%s2.html' % (i+1))
    43         print 'Page ' + str(i+1) + ' downloaded'
    44
    45
    46
    47 html = get_Html('http://search.jd.com/Search?keyword=iphone6')  # starting URL
    48 get_Pages(html)

When I test a single URL on its own, urllib.urlretrieve downloads it quickly,
like this:

    finished
    finished
    finished
    finished

The test code:

    import ..
    ...
    for i in range(4):
        urllib.urlretrieve('http://search.jd.com/Search?keyword=iphone6&page=%s&click=0' % (i+1), '%s.html' % (i+1))
        print 'finished'

So where exactly is the problem? Any pointers would be much appreciated!


First of all, your pasted code still has line numbers in it... so it can't be copied and run for testing. Also, to save a downloaded page you can simply write it to disk with file.write(). And for what it's worth, your test code didn't actually download any pages when I ran it.
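
A minimal sketch of what I mean by saving a page with write() (the URL and the output filename here are just examples):

    # Fetch one page with urllib2 and save the raw bytes to a local file.
    import urllib2

    page = urllib2.urlopen('http://search.jd.com/Search?keyword=iphone6').read()
    f = open('jd.html', 'w')
    f.write(page)
    f.close()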

My answer just got downvoted, so let me move my comment up here.

Your code is mostly fine, but the URL you assemble is missing a '/', and I save the downloaded pages with file.write(). Run the code below and see whether its output matches what you want. I also tidied up your code a bit for style, e.g. the use of spaces and blank lines.

    #!/usr/bin/env python
    # encoding: utf-8

    import urllib2
    from bs4 import BeautifulSoup
    import re
    import time


    def get_Html(set_url):
        html = urllib2.urlopen(set_url)
        return html


    def get_Pages(Html):
        soup = BeautifulSoup(Html, 'lxml')
        link_head = soup.find_all('link')[0]
        firstPart_url = link_head['href']
        pgNum = soup.find_all(class_='fp-text')[0].find_all('i')[0]
        round_Num = int(pgNum.string)  # page count from the tag, used as the loop bound
        get_ob_url = soup.find_all(class_='f-search')[0].find_all('a')[0]
        res = get_ob_url['data-url']
        reg = re.compile('.*6&')
        secondPart_url = reg.findall(res)[0]
        for i in range(round_Num):
            time.sleep(1)  # pause between requests
            ob_url = 'http:' + firstPart_url + '/' + secondPart_url + 'page=%s&click=0' % (i + 1)  # URL assembled
            print ob_url
            htmlpage = urllib2.urlopen(ob_url).read()
            f = open('%s.html' % (i + 1), 'w')
            f.write(htmlpage)
            f.close()
            print 'Page ' + str(i + 1) + ' saved'


    html = get_Html('http://search.jd.com/Search?keyword=iphone6')  # starting URL
    get_Pages(html)
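
One more thought on the timeout itself: your traceback ends in [Errno 110] Connection timed out, so besides pacing requests with time.sleep it can help to set an explicit timeout and retry failed fetches, instead of letting one slow connection kill the whole loop. A rough sketch of that idea (the helper name and the 10-second/3-retry values are my own picks, not something from this thread):

    import time
    import urllib2


    def fetch_with_retry(url, retries=3, timeout=10):
        # Try the request a few times before giving up. In Python 2,
        # urllib2.URLError and socket.timeout both derive from IOError,
        # so one except clause covers timeouts and refused connections.
        for attempt in range(retries):
            try:
                return urllib2.urlopen(url, timeout=timeout).read()
            except IOError:
                time.sleep(2)  # back off briefly before retrying
        raise IOError('giving up on %s after %d attempts' % (url, retries))

Used in the loop above, htmlpage = fetch_with_retry(ob_url) would replace the bare urllib2.urlopen(ob_url).read() call. The time.sleep(1) in the loop also matters: firing requests at the server as fast as possible is a common reason for connections being dropped or timing out.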