在debug了一下之后,我发现了有可能是bug的地方。
# -*- coding: gbk -*-
import scrapy
from scrapy.http import FormRequest
import json
import os
from datetime import datetime
from scrapy.selector import Selector
from teacherCourse.handlePic import handle
from teacherCourse.items import DetailProfItem
from teacherCourse.items import DetailProfCourseItem
from teacherCourse.items import containItem
class GetTeacherCourseSpider(scrapy.Spider):
name = 'TeacherCourse'
# custom_settings = {
# 'ITEM_PIPELINES': {
# 'teacherCourse.pipelines.TeacherCoursePipeline': 300,
# }
# }
def __init__(self, selXNXQ='', titleCode=''):
self.getUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx' # first
self.vcodeUrl = 'http://jwxt.dgut.edu.cn/jwweb/sys/ValidateCode.aspx' # second
self.postUrl = 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB_rpt.aspx' # third
self.findSessionId = None # to save the cookies
self.XNXQ = selXNXQ
self.titleCode = titleCode
def start_requests(self):
request = scrapy.Request(self.getUrl,
callback = self.downloadPic)
yield request
def downloadPic(self, response):
# download the picture
# find the session id
self.findSessionId = response.headers.getlist('Set-Cookie')[0].decode().split(";")[0].split("=")
request = scrapy.Request(self.vcodeUrl,
cookies= {self.findSessionId[0]: self.findSessionId[1]},
callback = self.getAndHandleYzm)
yield request
def getAndHandleYzm(self, response):
yzm = handle(response.body)
yield FormRequest(self.postUrl,
formdata={'Sel_XNXQ': '20151',
'sel_zc': '011',
'txt_yzm': yzm,
'type': '2'},
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0] + '=' + self.findSessionId[1],
},
callback=self.parse)
def parse(self, response):
body = response.body.decode('gbk')
# bug code begin
num = body.find('alert')
if num != -1:
# means CAPTCHA validation fails, need to re-request the CAPTCHA
yield scrapy.Request(self.vcodeUrl+'?t='+'%.f' % (datetime.now().microsecond / 1000),
headers={
'Referer': 'http://jwxt.dgut.edu.cn/jwweb/ZNPK/TeacherKBFB.aspx',
'Cookie': self.findSessionId[0]+'='+self.findSessionId[1]
},
callback=self.getAndHandleYzm) # re request the url to solve the validation code fail problem
# bug code done
else:
# parse data
self.parseData(body)
# item = containItem()
# item['first'] = len(body)
# return item # 改成yield就可以,但是还是没法解释问题
def parseData(body):
# parse data
# 有点长就不放了,我将item输出到终端能显示结果
yield item # 就是这句,不能yield到pipeline
我也弄不懂为什么注释这段就可以,但是这段是我当验证码出错时重复求验证码的部分,不能少。求各位大牛解释解释
Update:
我把parseData
函数的body部分合并到parse
函数里,然后就可以了。但是我又不懂了,为什么分开就不行呢?一定要在parse
函数里yield
才可以吗?
Update2:
改成这样就可以了
def parse(self, response):
...
for r in self.parseData(body):
yield r
def parseData(body):
...
yield item
你这样试试
def parse(self, response):
...
yield self.parseData(body)
def parseData(body):
return ...