首页 > scrapy的spider不能调用pipeline

scrapy的spider不能调用pipeline

在debug了一下之后,我发现了有可能是bug的地方。

# -*- coding: gbk -*-
import scrapy
from scrapy.http import FormRequest
import json
import os
from datetime import datetime
from scrapy.selector import Selector
from teacherCourse.handlePic import handle
from teacherCourse.items import DetailProfItem
from teacherCourse.items import DetailProfCourseItem
from teacherCourse.items import containItem

class GetTeacherCourseSpider(scrapy.Spider):
    name = 'TeacherCourse'
#    custom_settings = {
#            'ITEM_PIPELINES': {
#                'teacherCourse.pipelines.TeacherCoursePipeline': 300,
#                }
#            }

    def __init__(self, selXNXQ='', titleCode=''):
        self.getUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx' # first
        self.vcodeUrl = 'http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx' # second
        self.postUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx' # third
        self.findSessionId = None # to save the cookies
        self.XNXQ = selXNXQ
        self.titleCode = titleCode

    def start_requests(self):
        request = scrapy.Request(self.getUrl,
               callback = self.downloadPic)
        yield request

    def downloadPic(self, response):
        # download the picture
        # find the session id
        self.findSessionId = response.headers.getlist('Set-Cookie')[0].decode().split(";")[0].split("=")
        request = scrapy.Request(self.vcodeUrl,
                cookies= {self.findSessionId[0]: self.findSessionId[1]},
                callback = self.getAndHandleYzm)
        yield request
    
    def getAndHandleYzm(self, response):
        yzm = handle(response.body)
        
        yield FormRequest(self.postUrl,
                formdata={'Sel_XNXQ': '20151',
                          'sel_zc': '011',
                          'txt_yzm': yzm,
                          'type': '2'},
                headers={
                    'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
                    'Cookie': self.findSessionId[0] + '=' + self.findSessionId[1],
                    },


                callback=self.parse)

    def parse(self, response):
        body = response.body.decode('gbk')
# bug code begin
        num = body.find('alert')
        if num != -1:
            # means CAPTCHA validation fails, need to re-request the CAPTCHA
            yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000),
            headers={
                    'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
                    'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
                    },
            callback=self.getAndHandleYzm) # re request the url to solve the validation code fail problem
# bug code done
        else:
            # parse data
            self.parseData(body)
#            item = containItem()
#            item['first'] = len(body)
#            return item # 改成yield就可以,但是还是没法解释问题
            
    def parseData(body):
        # parse data
        # 有点长就不放了,我将item输出到终端能显示结果
        yield item # 就是这句,不能yield到pipeline

我也弄不懂为什么注释这段就可以,但是这段是我当验证码出错时重复求验证码的部分,不能少。求各位大牛解释解释

Update:
我把parseData函数的body部分合并到parse函数里,然后就可以了。但是我又不懂了,为什么分开就不行呢?一定要在parse函数里yield才可以吗?

Update2:
改成这样就可以了

def parse(self, response):
    ...
    for r in self.parseData(body):
        yield r
        
def parseData(body):
    ...
    yield item

你这样试试

def parse(self, response):
    ...
    yield self.parseData(body)
    
def parseData(body):
    return ...
【热门文章】
【热门文章】