Guest User

Untitled

a guest
Dec 28th, 2020
11
8 days
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. import scrapy
  2.  
  3.  
  4. class CurrentBordSpider(scrapy.Spider):
  5.     name = 'current_bord'
  6.     allowed_domains = ['leia.5ch.net']
  7.     start_urls = ['https://leia.5ch.net/poverty/subback.html']
  8.  
  9.     # def __init__(self):
  10.     #     pass
  11.  
  12.     # def start_requests(self):
  13.     #     url = self.thread_url()
  14.     #     yield scrapy.Request(url)
  15.  
  16.     def parse(self, response, **kwargs):
  17.         # スレのリストを取得
  18.         _xpath = '//small[@id="trad"]/a/@href'
  19.  
  20.         _remove_str = 'l50'
  21.         thread_list_xpath = response.xpath(_xpath).extract()
  22.         # URLからl50の削除
  23.         thread_list = list(map(lambda x: response.urljoin(x.strip(_remove_str)), thread_list_xpath))
  24.         # 一覧にあるスレをクローリング
  25.         for _url_list in thread_list:
  26.             yield scrapy.Request(_url_list, callback=self.parse_thread)
  27.  
  28.         pass
  29.  
  30.     def parse_thread(self, response):
  31.         # スレの内容を取得
  32.         item = response.meta['item']
  33.         for root in response.xpath('//div[@class="post"]').extract():
  34.  
  35.  
  36.             print('end')
  37.         # レスの情報
  38.         # item['res_number'] = scrapy.Field()
  39.         # item['id'] = scrapy.Field()
  40.         # item['slip'] = scrapy.Field()
  41.         # item['name'] = ''
  42.         # item['res_date'] = scrapy.Field()
  43.         # item['mail'] = scrapy.Field()
  44.         # item['ip'] = scrapy.Field()
  45.         # item['res_be_number'] = scrapy.Field()
  46.         # スレ全体の情報
  47.         item['thread_url']: str = response.request.url
  48.         item['thread_tittle']: str = response.xpath('//title/text()').extract_first()
  49.         item['be_number']: str = \
  50.             response.xpath('//div[@id=1]/div/span[@class="be r2BP"]/a/@href').extract_first().split('/')[-1]
  51.         item['last_res']: int = len(response.xpath('//div[@class="post"]'))
  52.         print('end')
  53.         yield item
  54.         pass
RAW Paste Data