from urllib import request
import os
from user_agents import ua_list
import time
import random
import re
import requests
from lxml import etree
class MeiziSpider():
    """Crawler that downloads the images of every gallery linked from an index page.

    The index page is parsed for gallery links; each gallery is then walked
    page by page (``<gallery>/0``, ``<gallery>/1``, ...) and the main image of
    every page is written below ``save_dir`` in a directory named after the
    image's ``alt`` text.
    """

    # Regex capturing (image src, image alt text) from a gallery page.
    IMG_PATTERN = (' <div class="main-image"><p><a href="https://www.mzitu.com/.*?" ><img '
                   'src="(.*?)" alt="(.*?)" width=".*?" height=".*?" /></a></p>')

    # Seconds to wait on any single network operation before giving up, so a
    # stalled server cannot hang the crawl indefinitely.
    TIMEOUT = 10

    def __init__(self, save_dir='/home/ubuntu/meizi/'):
        """Create a spider.

        Args:
            save_dir: Root directory under which one sub-directory per
                gallery is created.
        """
        self.url = 'https://www.mzitu.com/all/'
        self.save_dir = save_dir

    @staticmethod
    def _safe_name(name):
        """Return *name* made safe for use as a single path component."""
        # The name comes from remote HTML: path separators would silently
        # create nested paths, and '.'/'..' would point outside the gallery
        # directory.
        cleaned = re.sub(r'[\\/]', '_', name)
        return '_' if cleaned in ('.', '..') else cleaned

    def get_html(self, url):
        """Fetch *url* with a randomly chosen User-Agent.

        Args:
            url: Address to request.

        Returns:
            The raw response body as bytes.
        """
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the connection even if read() raises.
        with request.urlopen(req, timeout=self.TIMEOUT) as res:
            return res.read()

    def re_func(self, re_bds, html):
        """Return all matches of the regex *re_bds* in *html*.

        The pattern is compiled with ``re.S`` so ``.`` also matches newlines.

        Args:
            re_bds: Regular-expression source string.
            html: Text to search.

        Returns:
            The list produced by ``findall`` (strings, or tuples of strings
            when the pattern has several groups).
        """
        return re.compile(re_bds, re.S).findall(html)

    def parse_html(self, url):
        """Download the index page at *url* and crawl every gallery it links to."""
        html = self.get_html(url).decode()
        parse_obj = etree.HTML(html)
        href_list = parse_obj.xpath(
            '//div[@class="all"]/ul[@class="archives"]/li/p[@class="url"]/a/@href')
        print("href_list:", href_list)
        self.write_html(href_list)

    def write_html(self, href_list):
        """Download each gallery in *href_list*, pausing 1-3 seconds before each one."""
        for two_url in href_list:
            print(two_url)
            # Random delay between galleries to avoid hammering the server.
            time.sleep(random.randint(1, 3))
            self.save_image(two_url)

    def save_image(self, two_url):
        """Download every page image of the gallery at *two_url*.

        Pages are requested as ``two_url/0``, ``two_url/1``, ... until a page
        contains no main image or a network/file error occurs. Errors are
        reported and end this gallery only; they are not raised to the caller.

        Args:
            two_url: URL of the gallery; also sent as the ``Referer`` header.
        """
        headers = {'Referer': two_url, 'User-Agent': random.choice(ua_list)}
        print('---------two_url-----------', two_url)
        i = 0
        while True:
            img_link = two_url + '/{}'.format(i)
            print("img_link:", img_link)
            try:
                html = requests.get(url=img_link, headers=headers,
                                    timeout=self.TIMEOUT).text
                img_html_list = self.re_func(self.IMG_PATTERN, html)
                print("img_html_list", img_html_list)
                if not img_html_list:
                    # No main image on this page: the gallery is exhausted.
                    break
                img_src, name = img_html_list[0]
                print("-----name:", name)
                name = self._safe_name(name)
                # The trailing '' keeps the trailing separator on the directory.
                direc = os.path.join(self.save_dir, name, '')
                print("direc:", direc)
                os.makedirs(direc, exist_ok=True)
                img_ = requests.get(url=img_src, headers=headers,
                                    timeout=self.TIMEOUT).content
                filename = os.path.join(direc, '{}{}.jpg'.format(name, i))
                with open(filename, 'wb') as f:
                    f.write(img_)
            except (requests.RequestException, OSError) as e:
                # Best-effort crawl: report the failure and move on to the
                # next gallery instead of aborting the whole run.
                print("stopping gallery {} at {}: {}".format(two_url, img_link, e))
                break
            i += 1
if __name__ == '__main__':
    # Script entry point: crawl the archive index and download every gallery
    # it links to.
    MeiziSpider().parse_html('https://www.mzitu.com/all')
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
- 85
- 86
- 87
- 88
发布了4 篇原创文章 · 获赞 75 · 访问量 4万+