import scrapy


class CurrentBordSpider(scrapy.Spider):
    name = 'current_bord'
    allowed_domains = ['leia.5ch.net']
    start_urls = ['https://leia.5ch.net/poverty/subback.html']

    # def __init__(self):
    #     pass

    # def start_requests(self):
    #     url = self.thread_url()
    #     yield scrapy.Request(url)

    def parse(self, response, **kwargs):
        # Get the list of threads on the board index page
        _xpath = '//small[@id="trad"]/a/@href'
        _remove_str = 'l50'
        thread_list_xpath = response.xpath(_xpath).extract()
        # Drop the trailing 'l50' from each URL; str.strip(_remove_str) would
        # also strip leading 'l'/'5'/'0' characters, so cut the suffix instead
        thread_list = [
            response.urljoin(href[:-len(_remove_str)] if href.endswith(_remove_str) else href)
            for href in thread_list_xpath
        ]
        # Crawl every thread found in the list
        for _url in thread_list:
            yield scrapy.Request(_url, callback=self.parse_thread)

    def parse_thread(self, response):
        # Collect the contents of a single thread; nothing is passed in via
        # response.meta from parse(), so start from a fresh dict
        item = {}
        # Per-post (response) information: extraction over '//div[@class="post"]'
        # is not implemented yet
        # item['res_number'] = scrapy.Field()
        # item['id'] = scrapy.Field()
        # item['slip'] = scrapy.Field()
        # item['name'] = ''
        # item['res_date'] = scrapy.Field()
        # item['mail'] = scrapy.Field()
        # item['ip'] = scrapy.Field()
        # item['res_be_number'] = scrapy.Field()
        # Thread-wide information
        item['thread_url'] = response.request.url
        item['thread_title'] = response.xpath('//title/text()').extract_first()
        be_href = response.xpath('//div[@id=1]/div/span[@class="be r2BP"]/a/@href').extract_first()
        item['be_number'] = be_href.split('/')[-1] if be_href else None
        item['last_res'] = len(response.xpath('//div[@class="post"]'))
        yield item
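
# A minimal sketch of the Item class implied by the commented-out fields above.
# The ThreadItem name and field set are assumptions drawn from those comments,
# not part of the original paste; in a real project this would live in items.py
# and parse_thread would yield ThreadItem(...) instead of a plain dict.
class ThreadItem(scrapy.Item):
    # Thread-wide information
    thread_url = scrapy.Field()
    thread_title = scrapy.Field()
    be_number = scrapy.Field()
    last_res = scrapy.Field()
    # Per-post (response) information, not yet populated by parse_thread
    res_number = scrapy.Field()
    id = scrapy.Field()
    slip = scrapy.Field()
    name = scrapy.Field()
    res_date = scrapy.Field()
    mail = scrapy.Field()
    ip = scrapy.Field()
    res_be_number = scrapy.Field()

# Typical usage from inside a Scrapy project directory, writing the scraped
# thread summaries to a JSON feed:
#   scrapy crawl current_bord -o threads.json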