首页 > Scrapy爬取汽车信息数据

Scrapy爬取汽车信息数据

# -*- coding: utf-8 -*-
import scrapy
class CarinfosSpider(scrapy.Spider):
    """Crawl xgo.com.cn from the brand index down to one item per car model.

    Callback chain:
        parse           brand index page      -> one request per brand
        parse_brand     brand page            -> one request per series
        parse_type      series page           -> one request per model
        parse_car_link  model page            -> request for its parameter page
        parse_car       model parameter page  -> one item (a plain dict)

    ``parse_cars`` is kept as a standalone callback that extracts the model
    links of a series page using context already present in ``response.meta``.

    Brand, series ("type") and model ("car") IDs are sequential integers
    starting at 0, handed out in the order the pages are processed.
    """

    name = 'carinfos'
    start_urls = (
        'http://www.xgo.com.cn/brand.html',
    )

    # Keys forwarded from callback to callback through ``Request.meta``.
    _CONTEXT_KEYS = ('brand_id', 'brand_name', 'type_id', 'type_name')

    # Position of the parameter-page link inside the model page's nav box.
    _PARAM_NAV_INDEX = 3

    # All specification values of a model are read from these table cells.
    _SPEC_XPATH = '//div[@id="peizhi"]//td[@class="bor-l"]/text()'

    # Item key -> position of its value among the cells matched by
    # ``_SPEC_XPATH``. Positions and key spellings are taken unchanged from
    # the original code; they depend on the site's table layout.
    _SPEC_FIELDS = (
        ('where', 0),
        ('level', 1),
        ('year', 2),
        ('displacement', 3),
        ('maximumSpeed', 4),
        ('officialAcceleration', 5),
        ('ministryOfIntegratedFuelConsumption', 6),
        ('vehicleQuality', 7),
        ('longHighWith', 9),
        ('bodyStructure1', 15),
        ('doorNum', 17),
        ('seatNum', 18),
        ('mailVolume', 19),
        ('model', 22),
        ('intakeForm', 24),
        ('fuelForm', 38),
        ('fuel', 39),
        ('fuleWay', 40),
        ('environmentalProtection', 43),
        ('powerType', 44),
        ('gearbox', 50),
        ('drivingMethod', 53),
        ('bodyStructure2', 59),
        ('frontBrakeType', 61),
        ('brakeType', 62),
        ('parkingBrakeType', 63),
    )

    def __init__(self, *args, **kwargs):
        """Initialise the spider and its ID counters."""
        super(CarinfosSpider, self).__init__(*args, **kwargs)
        # The counters live on the spider, not in Request.meta: a value copied
        # into meta is private to one request chain, so incrementing it there
        # never reaches sibling requests and every brand would receive ID 0.
        self._next_ids = {'brand': 0, 'type': 0, 'car': 0}

    def _allocate_id(self, kind):
        """Return the next unused ID for *kind* ('brand', 'type' or 'car')."""
        allocated = self._next_ids[kind]
        self._next_ids[kind] = allocated + 1
        return allocated

    @staticmethod
    def _first(values, default=None):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    def _context(self, response, **extra):
        """Build the meta dict for a follow-up request.

        Copies the known context keys that are present in ``response.meta``
        and then applies *extra* on top.
        """
        meta = dict(
            (key, response.meta[key])
            for key in self._CONTEXT_KEYS
            if key in response.meta
        )
        meta.update(extra)
        return meta

    def _model_requests(self, response, meta):
        """Yield one ``parse_car_link`` request per model link on a series page."""
        links = response.css(
            '#theanchor .car_banner_r ul li p a::attr(href)').extract()
        for link in links:
            # Copy per request so requests never share one mutable meta dict.
            yield scrapy.Request(
                response.urljoin(link),
                callback=self.parse_car_link,
                meta=dict(meta),
            )

    def parse(self, response):
        """Parse the start URL and request every brand page.

        Follows links such as http://www.xgo.com.cn/brand/abt/.
        """
        links = response.xpath('//div[@class="l"]/a[1]/@href').extract()
        for link in links:
            # urljoin leaves absolute links untouched and resolves relative ones.
            yield scrapy.Request(
                response.urljoin(link), callback=self.parse_brand)

    def parse_brand(self, response):
        """Parse a brand page and request the ``items.html`` page of each series.

        A series link such as http://www.xgo.com.cn/4990/ is turned into
        http://www.xgo.com.cn/4990/items.html.
        """
        brand_id = self._allocate_id('brand')
        brand_name = self._first(
            response.css('.brand_logo+h1::text').extract())
        # TODO: the brand logo URL ('.brand_logo img::attr(src)') was extracted
        # by the original code but never recorded; add it to the item if needed.

        links = response.css('.car-list p a::attr(href)').extract()
        self.logger.debug('Series links of brand %s: %s', brand_id, links)
        for link in links:
            # Normalise the trailing slash so the suffix is always a new
            # path segment, whether or not the href ends with '/'.
            full_url = response.urljoin(link).rstrip('/') + '/items.html'
            yield scrapy.Request(
                full_url,
                callback=self.parse_type,
                meta={'brand_id': brand_id, 'brand_name': brand_name},
            )

    def parse_type(self, response):
        """Parse a series page: assign its ID, read its name, request its models.

        Follows model links such as
        http://product.xgo.com.cn/other/index190852.shtml.
        """
        type_id = self._allocate_id('type')
        type_name = self._first(
            response.css('.car_banner_l .num::text').extract())
        meta = self._context(response, type_id=type_id, type_name=type_name)
        # NOTE(review): the original requested an undefined ``full_url`` here.
        # The model links are read from this same page because the series name
        # and the model links both use '.car_banner_*' selectors - confirm
        # against the live site.
        for request in self._model_requests(response, meta):
            yield request

    def parse_cars(self, response):
        """Request every model page linked from a series page.

        Uses the brand/series context already stored in ``response.meta``.
        """
        return self._model_requests(response, self._context(response))

    def parse_car_link(self, response):
        """Follow a model page's nav-box link to its parameter page.

        Presumably this leads to pages such as
        http://product.xgo.com.cn/191/190852/param.shtml - confirm on the site.
        """
        nav_links = response.css('.cxk-navbox ul li a::attr(href)').extract()
        if len(nav_links) <= self._PARAM_NAV_INDEX:
            # Skip the model instead of failing the callback with IndexError.
            self.logger.warning(
                'No parameter-page link found on %s', response.url)
            return
        link = response.urljoin(nav_links[self._PARAM_NAV_INDEX])
        yield scrapy.Request(
            link, callback=self.parse_car, meta=self._context(response))

    def parse_car(self, response):
        """Parse a model parameter page and yield the model as a dict item.

        Keys are the variable names used by the original code. A specification
        whose table cell is missing from the page is stored as ``None``.
        """
        meta = response.meta
        # Evaluate the specification XPath once and index into the result.
        cells = response.xpath(self._SPEC_XPATH).extract()

        item = {
            'manufacturers1_id': meta.get('brand_id'),
            'manufacturers1': meta.get('brand_name'),
            'manufacturers2_id': meta.get('type_id'),
            'manufacturers2': meta.get('type_name'),
            'car_id': self._allocate_id('car'),
            'name': self._first(
                response.css('.offer_topnav h3 a::text').extract()),
            'price': self._first(
                response.css('.cxkmoneys .cxk-jg::text').extract()),
        }
        for field, index in self._SPEC_FIELDS:
            item[field] = cells[index] if index < len(cells) else None
        yield item
【热门文章】
【热门文章】