import scrapy


class CurrentBordSpider(scrapy.Spider):
    name = 'current_bord'
    allowed_domains = ['leia.5ch.net']
    start_urls = ['https://leia.5ch.net/poverty/subback.html']

    # def __init__(self):
    #     pass

    # def start_requests(self):
    #     url = self.thread_url()
    #     yield scrapy.Request(url)

    def parse(self, response, **kwargs):
        # Get the list of threads on the board index page
        _xpath = '//small[@id="trad"]/a/@href'
        _remove_str = 'l50'
        thread_list_xpath = response.xpath(_xpath).extract()
        # Drop the trailing 'l50' from each URL; str.strip(_remove_str) would
        # also strip leading 'l'/'5'/'0' characters, so cut the suffix instead
        thread_list = [
            response.urljoin(href[:-len(_remove_str)] if href.endswith(_remove_str) else href)
            for href in thread_list_xpath
        ]
        # Crawl every thread found in the list
        for _url in thread_list:
            yield scrapy.Request(_url, callback=self.parse_thread)

    def parse_thread(self, response):
        # Collect the contents of a single thread; nothing is passed in via
        # response.meta from parse(), so start from a fresh dict
        item = {}
        # Per-post (response) information: extraction over '//div[@class="post"]'
        # is not implemented yet
        # item['res_number'] = scrapy.Field()
        # item['id'] = scrapy.Field()
        # item['slip'] = scrapy.Field()
        # item['name'] = ''
        # item['res_date'] = scrapy.Field()
        # item['mail'] = scrapy.Field()
        # item['ip'] = scrapy.Field()
        # item['res_be_number'] = scrapy.Field()
        # Thread-wide information
        item['thread_url'] = response.request.url
        item['thread_title'] = response.xpath('//title/text()').extract_first()
        be_href = response.xpath('//div[@id=1]/div/span[@class="be r2BP"]/a/@href').extract_first()
        item['be_number'] = be_href.split('/')[-1] if be_href else None
        item['last_res'] = len(response.xpath('//div[@class="post"]'))
        yield item
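
# A minimal sketch of the Item class implied by the commented-out fields above.
# The ThreadItem name and field set are assumptions drawn from those comments,
# not part of the original paste; in a real project this would live in items.py
# and parse_thread would yield ThreadItem(...) instead of a plain dict.
class ThreadItem(scrapy.Item):
    # Thread-wide information
    thread_url = scrapy.Field()
    thread_title = scrapy.Field()
    be_number = scrapy.Field()
    last_res = scrapy.Field()
    # Per-post (response) information, not yet populated by parse_thread
    res_number = scrapy.Field()
    id = scrapy.Field()
    slip = scrapy.Field()
    name = scrapy.Field()
    res_date = scrapy.Field()
    mail = scrapy.Field()
    ip = scrapy.Field()
    res_be_number = scrapy.Field()

# Typical usage from inside a Scrapy project directory, writing the scraped
# thread summaries to a JSON feed:
#   scrapy crawl current_bord -o threads.json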