
Hitting exceptions.NameError when writing a Scrapy spider

items.py

import scrapy
from scrapy.item import Item, Field

class MytestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass

class DoubanmoiveItem(Item):
    name=Field()            # movie title
    year=Field()            # release year
    score=Field()           # Douban score
    director=Field()        # director
    classification=Field()  # genre
    actor=Field()           # actors

movie_spider.py

from scrapy.selector import Selector
from scrapy.contrib.spiders import CrawlSpider,Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from mytest.items import DoubanmoiveItem

class MovieSpider(CrawlSpider):
    name="doubanmovie"
    allowed_domains=["movie.douban.com"]
    start_urls=["https://movie.douban.com/top250"]
    rules=[
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/top250\?start=\d+.*'))),    
        Rule(SgmlLinkExtractor(allow=(r'http://movie.douban.com/subject/\d+')), callback="parse_item"),
    ]

    def parse_item(self, response):
        sel=Selector(response)
        item=DoubanmoiveItem()  # must match the class imported from mytest.items
        item['name']=sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        item['year']=sel.xpath('//*[@id="content"]/h1/span[2]/text()').re(r'\((\d+)\)')
        item['score']=sel.xpath('//*[@id="interest_sectl"]/div/p[1]/strong/text()').extract()
        item['director']=sel.xpath('//*[@id="info"]/span[1]/a/text()').extract()
        item['classification']=sel.xpath('//span[@property="v:genre"]/text()').extract()  # genre; field is read by the pipeline, xpath assumed
        item['actor']=sel.xpath('//*[@id="info"]/span[3]/a[1]/text()').extract()
        return item

pipelines.py

from scrapy import log
from twisted.enterprise import adbapi
from scrapy.http import Request

import MySQLdb
import MySQLdb.cursors

class DoubanmoivePipeline(object):
    def __init__(self):
        self.dbpool = adbapi.ConnectionPool('MySQLdb',
                db = 'JY',
                user = 'root',
                passwd = '508122guoyumei',
                cursorclass = MySQLdb.cursors.DictCursor,
                charset = 'utf8',
                use_unicode = False
                )

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self.handle_error)
        return item

    def _conditional_insert(self,tx,item):
        tx.execute("select * from Moive where movieName= %s",(item['name'][0],))
        result=tx.fetchone()
        log.msg(result,level=log.DEBUG)
        print result
        if result:
            log.msg("Item already stored in db:%s" % item,level=log.DEBUG)
        else:
            classification=actor=''
            lenClassification=len(item['classification'])
            lenActor=len(item['actor'])
            # join the genre list into a '/'-separated string
            for n in xrange(lenClassification):
                classification+=item['classification'][n]
                if n<lenClassification-1:
                    classification+='/'
            # join the actor list the same way (a sibling loop, not nested)
            for n in xrange(lenActor):
                actor+=item['actor'][n]
                if n<lenActor-1:
                    actor+='/'
            # insert once, after both strings are built
            tx.execute(
                    "insert into Movie(movieName,movieReleasedate,movieScore,movieDirector,movieType,movieActor) values (%s,%s,%s,%s,%s,%s)",
                    (item['name'][0],item['year'][0],item['score'][0],item['director'][0],classification,actor))
            log.msg("Item stored in db: %s" % item, level=log.DEBUG)

    def handle_error(self, e):
        log.err(e)

settings.py

BOT_NAME = 'mytest'


SPIDER_MODULES = ['mytest.spiders']
NEWSPIDER_MODULE = 'mytest.spiders'

LOG_LEVEL='DEBUG'

DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
COOKIES_ENABLED = True


ITEM_PIPELINES = {
    'mytest.pipelines.DoubanmoviePipeline':300
}

USER_AGENT = 'mytest (+http://www.yourdomain.com)'

The error

~/mytest$ scrapy crawl doubanmovie
/usr/lib/pymodules/python2.7/scrapy/contrib/linkextractors/sgml.py:107: ScrapyDeprecationWarning: SgmlLinkExtractor is deprecated and will be removed in future releases. Please use scrapy.contrib.linkextractors.LinkExtractor
  ScrapyDeprecationWarning
2015-05-22 10:13:29+0800 [scrapy] INFO: Scrapy 0.25.0-454-gfa1039f started (bot: mytest)
2015-05-22 10:13:29+0800 [scrapy] INFO: Optional features available: ssl, http11
2015-05-22 10:13:29+0800 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'mytest.spiders', 'SPIDER_MODULES': ['mytest.spiders'], 'USER_AGENT': 'mytest (+http://www.yourdomain.com)', 'DOWNLOAD_DELAY': 2, 'BOT_NAME': 'mytest'}
2015-05-22 10:13:29+0800 [scrapy] INFO: Enabled extensions: LogStats, TelnetConsole, CloseSpider, CoreStats, SpiderState
2015-05-22 10:13:29+0800 [scrapy] INFO: Enabled downloader middlewares: HttpAuthMiddleware, DownloadTimeoutMiddleware, UserAgentMiddleware, RetryMiddleware, DefaultHeadersMiddleware, MetaRefreshMiddleware, HttpCompressionMiddleware, RedirectMiddleware, CookiesMiddleware, ChunkedTransferMiddleware, DownloaderStats
2015-05-22 10:13:29+0800 [scrapy] INFO: Enabled spider middlewares: HttpErrorMiddleware, OffsiteMiddleware, RefererMiddleware, UrlLengthMiddleware, DepthMiddleware
2015-05-22 10:13:29+0800 [-] ERROR: Unhandled error in Deferred:
2015-05-22 10:13:29+0800 [-] Unhandled Error
    Traceback (most recent call last):
      File "/usr/lib/pymodules/python2.7/scrapy/cmdline.py", line 150, in _run_command
        cmd.run(args, opts)
      File "/usr/lib/pymodules/python2.7/scrapy/commands/crawl.py", line 57, in run
        self.crawler_process.crawl(spname, **opts.spargs)
      File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 105, in crawl
        d = crawler.crawl(*args, **kwargs)
      File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 1181, in unwindGenerator
        return _inlineCallbacks(None, gen, Deferred())
    --- <exception caught here> ---
      File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 1039, in _inlineCallbacks
        result = g.send(result)
      File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 60, in crawl
        self.engine = self._create_engine()
      File "/usr/lib/pymodules/python2.7/scrapy/crawler.py", line 72, in _create_engine
        return ExecutionEngine(self, lambda _: self.stop())
      File "/usr/lib/pymodules/python2.7/scrapy/core/engine.py", line 63, in __init__
        self.scraper = Scraper(crawler)
      File "/usr/lib/pymodules/python2.7/scrapy/core/scraper.py", line 67, in __init__
        self.itemproc = itemproc_cls.from_crawler(crawler)
      File "/usr/lib/pymodules/python2.7/scrapy/middleware.py", line 50, in from_crawler
        return cls.from_settings(crawler.settings, crawler)
      File "/usr/lib/pymodules/python2.7/scrapy/middleware.py", line 29, in from_settings
        mwcls = load_object(clspath)
      File "/usr/lib/pymodules/python2.7/scrapy/utils/misc.py", line 49, in load_object
        raise NameError("Module '%s' doesn't define any object named '%s'" % (module, name))
    exceptions.NameError: Module 'mytest.pipelines' doesn't define any object named 'DoubanmoviePipeline'


The class name inside ITEM_PIPELINES in settings.py is misspelled; I ran into the same problem myself.
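
For context, the traceback shows the lookup happens in scrapy.utils.misc.load_object, which Scrapy calls for every dotted path listed in ITEM_PIPELINES. The failure can presumably be reproduced by hand with that same helper (a minimal sketch; the paths are the asker's own):

# Minimal sketch: redo the ITEM_PIPELINES lookup with the same
# helper that raises in the traceback.
from scrapy.utils.misc import load_object

load_object('mytest.pipelines.DoubanmoivePipeline')   # works: this class exists
load_object('mytest.pipelines.DoubanmoviePipeline')   # raises the NameError: no such name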


Your class name is misspelled.


You say you have a DoubanmoviePipeline class, but you only have a DoubanmoivePipeline class.
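
Either spelling works as long as settings.py and pipelines.py agree. A minimal sketch of one fix, keeping the existing class name and correcting settings.py (alternatively, rename DoubanmoivePipeline to DoubanmoviePipeline in pipelines.py and leave settings.py alone):

# settings.py: point ITEM_PIPELINES at the class actually defined in pipelines.py
ITEM_PIPELINES = {
    'mytest.pipelines.DoubanmoivePipeline': 300
}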

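One more thing, unrelated to the NameError: the first line of the log warns that SgmlLinkExtractor is deprecated. Following the warning's own suggestion, the rules can be switched to scrapy.contrib.linkextractors.LinkExtractor (a sketch against this Scrapy 0.25 layout, with the asker's patterns unchanged):

from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule

# Same two rules, using the non-deprecated extractor.
rules = [
    Rule(LinkExtractor(allow=(r'http://movie.douban.com/top250\?start=\d+.*',))),
    Rule(LinkExtractor(allow=(r'http://movie.douban.com/subject/\d+',)), callback="parse_item"),
]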