# Spider code for the tuniu.com crawler.
import re
import json
from scrapy.selector import Selector
# Import Spider, falling back for older Scrapy releases that only
# provide BaseSpider. Catch ImportError specifically -- a bare except
# would also swallow KeyboardInterrupt/SystemExit and real bugs.
try:
    from scrapy.spider import Spider
except ImportError:
    from scrapy.spider import BaseSpider as Spider
from scrapy.utils.response import get_base_url
from scrapy.utils.url import urljoin_rfc
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor as sle
from tuniu.items import *
from tuniu.misc.log import *
class TuniuSpider(CrawlSpider):
    """Crawl tuniu.com and scrape prices from guide/listing pages.

    Two link-extraction rules follow guide URLs
    (``/guide/d-...-NNNNNN/...``) and "whole" listing URLs
    (``/.../whole-...-0/...``); each matched page is handed to
    :meth:`parse_item`.
    """
    name = "tuniu"
    allowed_domains = ["tuniu.com"]
    start_urls = [
        "http://www.tuniu.com/"
    ]
    rules = [
        # Raw strings so regex escapes such as \d and \- reach the regex
        # engine intact (non-raw "\d" is an invalid string escape in py3).
        Rule(sle(allow=(r"/guide/d\-(.*?)\-\d{1,6}/.*",)),
             follow=True, callback='parse_item'),
        Rule(sle(allow=(r"/.*?/whole-.*?-0/.*",)),
             follow=True, callback='parse_item'),
    ]

    def parse_item(self, response):
        """Extract one TuniuItem (with a 'price' field) per content node.

        :param response: the downloaded page.
        :returns: a list of TuniuItem instances, possibly empty.
        """
        items = []
        sel = Selector(response)
        base_url = get_base_url(response)  # kept for parity; currently unused
        # NOTE(review): css('content') matches <content> *elements*; if the
        # page actually uses class="content", this should be '.content'
        # -- confirm against the live markup.
        for node in sel.css('content'):
            prices = node.css('.num span').xpath('text()').extract()
            if not prices:
                # No price text under this node: skip it instead of
                # raising IndexError as the original extract()[0] did.
                continue
            item = TuniuItem()
            item['price'] = prices[0]
            items.append(item)
        info('parsed ' + str(response))
        return items

    def _process_request(self, request):
        """Log the outgoing request and return it unchanged."""
        info('process ' + str(request))
        return request