利用 Scrapy 爬取淘宝的数据时,出现了如下的错误:
Traceback (most recent call last):
File "/usr/local/bin/scrapy", line 4, in <module>
execute()
File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 143, in execute
_run_print_help(parser, _run_command, cmd, args, opts)
File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 89, in _run_print_help
func(*a, **kw)
File "/usr/local/lib/python2.7/dist-packages/scrapy/cmdline.py", line 150, in _run_command
cmd.run(args, opts)
File "/usr/local/lib/python2.7/dist-packages/scrapy/commands/crawl.py", line 48, in run
spider = crawler.spiders.create(spname, **opts.spargs)
File "/usr/local/lib/python2.7/dist-packages/scrapy/spidermanager.py", line 48, in create
return spcls(**spider_kwargs)
File "/srv/sxrapy_test/shiyifang/shiyifang/spiders/shiyifang_spider.py", line 32, in __init__
self.selenium.start()
File "/usr/local/lib/python2.7/dist-packages/selenium/selenium.py", line 197, in start
result = self.get_string("getNewBrowserSession", start_args)
File "/usr/local/lib/python2.7/dist-packages/selenium/selenium.py", line 231, in get_string
result = self.do_command(verb, args)
File "/usr/local/lib/python2.7/dist-packages/selenium/selenium.py", line 220, in do_command
conn.request("POST", "/selenium-server/driver/", body, headers)
File "/usr/lib/python2.7/httplib.py", line 962, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 996, in _send_request
self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 958, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 818, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 780, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 761, in connect
self.timeout, self.source_address)
File "/usr/lib/python2.7/socket.py", line 571, in create_connection
raise err
socket.error: [Errno 111] Connection refused
源码如下:
# -*- coding: utf-8 -*-
from scrapy.spider import BaseSpider
from scrapy.selector import Selector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from selenium import selenium
from shiyifang.items import ShiyifangItem
class ShiyifangSpider(CrawlSpider):
    """Crawl taobao.com and open each matched page in a Selenium RC browser.

    Links extracted from the start page that match ``rules`` are handed to
    ``parse_page``, which loads the same URL in a Firefox instance driven
    through a Selenium RC client.

    The Selenium RC server must already be listening on ``localhost:4444``
    before the spider is instantiated: ``selenium.start()`` issues an HTTP
    POST to that server and fails with
    ``socket.error: [Errno 111] Connection refused`` when nothing is
    listening there.
    """

    name = "shiyifang"
    allowed_domains = ["taobao.com"]
    start_urls = [
        "http://www.taobao.com",
    ]
    rules = (
        # ``allow`` takes regular expressions, so ``.`` and ``?`` are escaped
        # to match the URL literally. Unescaped, ``php?spm`` means "ph" plus
        # an optional "p" followed by "spm", which never matches the real
        # ``index.php?spm=...`` link.
        Rule(
            SgmlLinkExtractor(allow=(
                r'/market/nvzhuang/index\.php\?spm=a217f\.7297021\.a214d5w\.2\.tvAive',
            )),
            callback='parse_page',
            follow=True
        ),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and start the Selenium RC browser session.

        Positional and keyword arguments are forwarded to ``CrawlSpider`` so
        that spider arguments supplied by Scrapy (``-a key=value``) do not
        raise ``TypeError``.

        Raises:
            socket.error: if no Selenium RC server accepts connections on
                ``localhost:4444``.
        """
        CrawlSpider.__init__(self, *args, **kwargs)
        self.verificationErrors = []
        # Only set to True once start() has succeeded, so that __del__ does
        # not try to stop a session that was never opened.
        self._selenium_started = False
        self.selenium = selenium("localhost", 4444, "*firefox", "http://www.taobao.com")
        self.selenium.start()
        self._selenium_started = True

    def __del__(self):
        """Stop the browser session (if one was started) and print errors."""
        # getattr guards: __del__ also runs when __init__ failed part-way,
        # in which case these attributes may not exist.
        if getattr(self, "_selenium_started", False):
            self.selenium.stop()
        print(getattr(self, "verificationErrors", []))
        # Only chain to the base class if it actually defines __del__; an
        # unconditional call raises AttributeError when it does not.
        parent_del = getattr(CrawlSpider, "__del__", None)
        if parent_del is not None:
            parent_del(self)

    def parse_page(self, response):
        """Load ``response.url`` in the Selenium browser and wait for it.

        No items are extracted here yet; the method only drives the browser
        and returns ``None``.
        """
        import time

        browser = self.selenium
        browser.open(response.url)
        # Timeout is passed as a string of milliseconds.
        browser.wait_for_page_to_load("30000")
        # Extra pause after the load event, kept from the original code.
        time.sleep(2.5)
是被淘宝禁止爬取了,还是需要添加其它的一些方法?
我也遇到了同样的问题,之前只单独做过scrapy和selenium,现在将两者结合在一起倒还是头一次,有点摸不着门道,不知道楼主现在解决这个问题了没有呢?