要抓多个页面,用for循环把每一页的地址写出来,但是有错误:
报错信息如下(注:所爬取网页的编码是 gbk):
raise TypeError('Request url must be str or unicode, got %s:' % type(url).__name__)
TypeError: Request url must be str or unicode, got list:
爬虫代码如下:
#-*-coding:utf8-*-
import scrapy
from scrapy_redis.spiders import RedisSpider
from scrapy.http import Request
from scrapy.selector import Selector
from Articlespider.items import ArticlespiderItem
import re
class Articlespider(RedisSpider):
    """Crawl article list pages 1-100 and scrape each article's name
    and content from its detail page.

    Root cause of the reported TypeError: ``SelectorList.extract()``
    returns a *list* of strings, but ``scrapy.Request`` requires a
    single ``str`` URL. Use ``extract_first()`` (equivalently
    ``extract()[0]``) to obtain one URL string.
    """
    name = "article"
    redis_key = 'article:start_urls'
    allowed_domains = ["yyyy.com"]
    start_urls = ['http://www.yyyy.com/articles/articles-list.php'
    ]

    def start_requests(self):
        """Generate one request per list page, pages 1 through 100.

        The original used ``range(1, 100)``, which stops at page 99
        despite the stated intent of covering pages 1-100; ``range`` is
        half-open, so the upper bound must be 101.
        """
        for page in range(1, 101):
            yield scrapy.Request(
                "http://www.yyyy.com/articles/articles-list.php?pagenum=%s" % page)

    def parse(self, response):
        """Extract each article's detail-page URL from a list page and
        schedule it for :meth:`parse_articleInfo`."""
        for article in response.xpath('//div[@class="articlename"]'):
            # BUG FIX: .extract() returns a list -> TypeError inside
            # Request(). .extract_first() returns a single str, or None
            # when the <a> has no href.
            url = article.xpath('a/@href').extract_first()
            if url:
                # urljoin resolves relative hrefs against the current
                # page; absolute URLs pass through unchanged.
                yield Request(response.urljoin(url),
                              callback=self.parse_articleInfo)

    def parse_articleInfo(self, response):
        """Scrape the article name and body text from a detail page.

        NOTE(review): the two absolute XPaths use different top-level
        divs (div[4] vs div[5]) — copied from the original; verify
        against the actual page structure.
        """
        item = ArticlespiderItem()
        item['articlename'] = response.xpath(
            '/html/body/div[4]/div[2]/div[1]/div[1]/h3/text()').extract()
        item['content'] = response.xpath(
            '/html/body/div[5]/div[2]/div[1]/div[3]/p/text()').extract()
        yield item
修正方法:extract() 返回的是列表,取其中第一个元素才是字符串形式的 URL:
articleUrl=eacharticle.xpath('a/@href').extract()[0]