首页 > 编程 > Python > 正文

scrapy爬虫完整实例

2020-02-22 23:00:15
字体:
来源:转载
供稿:网友

本文主要通过实例介绍了scrapy框架的使用,分享了两个例子,爬豆瓣文本例程 douban 和图片例程 douban_imgs ,具体如下。

例程1: douban

目录树

douban
--douban
  --spiders
    --__init__.py
    --bookspider.py
    --douban_comment_spider.py
    --doumailspider.py
  --__init__.py
  --items.py
  --pipelines.py
  --settings.py
--scrapy.cfg

--spiders--__init__.py

# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

bookspider.py

# -*- coding:utf-8 -*-
'''by sudo rm -rf http://imchenkun.com'''
import scrapy
from douban.items import DoubanBookItem


class BookSpider(scrapy.Spider):
    """Crawl the Douban Top-250 book chart and yield a DoubanBookItem
    (name, content line, rating) for every book on every page."""
    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250'
    ]

    def parse(self, response):
        """Parse the first chart page and schedule the remaining pages.

        The first page is *this* response, so it is parsed directly.
        (Re-requesting response.url, as the original code did, is silently
        dropped by Scrapy's duplicate-request filter because that URL was
        just fetched.)
        """
        for item in self.parse_next(response):
            yield item
        # Follow every pagination link to the other chart pages.
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = page.xpath('@href').extract()[0]
            yield scrapy.Request(link, callback=self.parse_next)

    def parse_next(self, response):
        """Yield one DoubanBookItem per <tr class="item"> row on a page."""
        for item in response.xpath('//tr[@class="item"]'):
            book = DoubanBookItem()
            book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0]
            book['content'] = item.xpath('td[2]/p/text()').extract()[0]
            book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0]
            yield book

douban_comment_spider.py

# -*- coding:utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem
from urllib.parse import urlparse, parse_qs

f = Factory.create()


class MailSpider(scrapy.Spider):
    """Log in to douban.com (prompting the operator for a captcha when one
    is shown) and crawl the review pages of one movie, yielding a
    DoubanMovieCommentItem per full review.

    Ported from Python 2 to Python 3: print()/input(), urllib.parse, and
    response.text (response.body is bytes under Python 3).
    """
    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]
    # Browser-like headers; Host is switched to www.douban.com after login.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }
    # Login form; fill in your own credentials before running.
    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        # Fetch the login page first; meta['cookiejar'] opens a cookie session
        # that every later request carries forward.
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        """Submit the login form, asking the operator to solve a captcha
        by hand when the login page shows one."""
        if 'captcha_image' in response.text:
            print('Copy the link:')
            link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0]
            print(link)
            captcha_solution = input('captcha-solution:')
            # parse_qs returns a list per key; the form field wants the
            # single id string.
            captcha_id = parse_qs(urlparse(link).query, True)['id'][0]
            self.formdata['captcha-solution'] = captcha_solution
            self.formdata['captcha-id'] = captcha_id
        return [scrapy.FormRequest.from_response(
            response,
            formdata=self.formdata,
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.after_login)]

    def after_login(self, response):
        print(response.status)
        self.headers['Host'] = "www.douban.com"
        # Two requests for the same URL: one extracts the review links on the
        # page, the other walks the pagination; the second needs dont_filter
        # because the dupefilter would otherwise drop the repeated URL.
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url)
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)  # 不去重

    def parse_next_page(self, response):
        """Follow the "next page" link, re-dispatching both the link
        extractor and this pagination callback for it."""
        print(response.status)
        try:
            next_url = response.urljoin(
                response.xpath('//span[@class="next"]/a/@href').extract()[0])
        except IndexError:
            # Last page: no "next" anchor, extract()[0] raises IndexError.
            # (The original bare `except:` hid every other error too.)
            print("Next page Error")
            return
        print("下一页")
        print(next_url)
        yield scrapy.Request(url=next_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_comment_url,
                             dont_filter=True)
        yield scrapy.Request(url=next_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page,
                             dont_filter=True)

    def parse_comment_url(self, response):
        """Request every full-review page linked from a review-list page."""
        print(response.status)
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract()[0]
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract()[0]
            print(comment_title)
            print(comment_url)
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        """Scrape one full-review page into a DoubanMovieCommentItem."""
        print(response.status)
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]
            # data-original distinguishes two review-body markup variants.
            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print("data_type: " + data_type)
            # NOTE(review): "/t#####/t" looks like a typo for "\t#####\t" —
            # kept byte-identical; confirm the intended separator downstream.
            if data_type == '0':
                comment['comment'] = "/t#####/t".join(
                    map(lambda x: x.strip(),
                        item.xpath('//div[@id="link-report"]/div/p/text()').extract()))
            elif data_type == '1':
                comment['comment'] = "/t#####/t".join(
                    map(lambda x: x.strip(),
                        item.xpath('//div[@id="link-report"]/div[1]/text()').extract()))
            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            # print(comment)
            yield comment
发表评论 共有条评论
用户名: 密码:
验证码: 匿名发表