
Python web scraping?

I'm trying to scrape the 健康界 (cn-healthcare.com) site, but my spider never returns anything.

# -*- coding: utf-8 -*-
'''
Crawler for 健康界 (cn-healthcare) news
'''
import time

import requests
# scrapy.contrib and scrapy.log are deprecated; use scrapy.spiders and self.logger
from scrapy import Request
from scrapy.spiders import CrawlSpider

from Scrapy_demo.items import HealthItem

class HealthSpider(CrawlSpider):

    name = 'health'
    # No trailing slash here: 'www.cn-healthcare.com/' never matches a hostname,
    # so the offsite middleware silently drops every follow-up request.
    allowed_domains = ['www.cn-healthcare.com']
    download_delay = 1
    start_urls = ['http://www.cn-healthcare.com/']

    def parse(self, response):
        # Article links on the index page
        index_urls = response.css('div.indextitle-text a::attr(href)').extract()
        for detail_link in index_urls:
            print('detail_link', detail_link)
            if detail_link:
                yield Request(url=detail_link, callback=self.parse_items)

        # The column listing is a JSON API; the trailing parameter is a
        # millisecond timestamp used as a cache buster.
        result = str(int(time.time() * 1000))
        first_url = 'http://www.cn-healthcare.com/api/column//kxw/{page_index}?_=%s' % result

        for index in range(100):
            next_url = first_url.replace('{page_index}', str(index))
            print(next_url)
            try:
                # blocking call; fine for a quick test, but a scrapy.Request
                # with a callback would be more idiomatic here
                r = requests.get(next_url).json()  # dict
            except ValueError:
                self.logger.warning('Page %s did not return valid JSON, skipping', next_url)
                continue
            for entry in r.get('data') or []:  # 'data' may be missing or null
                obj = 'http://www.cn-healthcare.com/' + entry['url']
                # yield inside the loop; the original yielded after it,
                # so only the last article of each page was requested
                yield Request(url=obj, callback=self.parse_items)

    def parse_items(self, response):
        item = HealthItem()
        item['title'] = 'ok'
        print('Spider is working')
        yield item
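
Before blaming the spider, it is worth hitting the JSON endpoint directly to confirm it returns anything at all. A minimal standalone check (a sketch: the //kxw/ path, the timestamp parameter, and the 'data' key are taken from the code above; the rest of the payload shape is an assumption):

import time
import requests

ts = str(int(time.time() * 1000))  # millisecond cache-busting timestamp, as in the spider
url = 'http://www.cn-healthcare.com/api/column//kxw/0?_=%s' % ts
payload = requests.get(url).json()
print(list(payload.keys()))              # inspect what the API actually returns
for entry in payload.get('data') or []:  # empty list when 'data' is missing or null
    print(entry.get('url'))

If this prints URLs but the spider still yields nothing, check allowed_domains: a trailing slash there makes Scrapy's offsite middleware drop every follow-up request without any visible error.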


Try:

r.json().get('count')

Indexing like list[index] (or dict[key]) raises an exception when the element or key doesn't exist. The data you're scraping may simply have no 'count' key, so use .get(key, default) on the dict to supply a default value that is returned when the key is missing, as sketched below.
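
For example, a minimal sketch of the difference (the payload below is hypothetical, just mimicking a response without a 'count' key):

payload = {'data': [{'url': '/news/1.html'}]}  # hypothetical response, no 'count'

# payload['count'] would raise KeyError here
print(payload.get('count', 0))  # prints the default 0 instead of raising
print(payload.get('count'))     # prints None when no default is given

Strictly speaking, .get(key, default) is a dict method; for a list you guard the index instead (e.g. check len(data) > index before reading data[index]).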
