简介
在原先案例的基础上进行了改写,加入了多线程,采集嘎嘎快
开发工具
环境使用
- Python 3.8 解释器
- Pycharm 编辑器
所使用模块
- import re
- import os
- import requests >>> pip install requests
如何安装 Python 第三方模块:
- Win + R 输入 cmd 点击确定, 输入安装命令 pip install 模块名 (pip install requests) 回车
- 在pycharm中点击Terminal(终端) 输入安装命令
实现思路
- 发送请求 (这里都是get请求)
- 模拟浏览器,构造请求头headers
- 获取数据
- 解析数据
- 提取我们想要的内容(用 re 匹配出需要的部分,建议直接在网页源码中复制需要的片段来进行匹配提取)
- 保存数据
代码实现过程
# -*- coding:utf-8 -*-
# @Author:🎈RedBalloon
# @Time:2022/10/21-16:00
# @File:new.py
import requests # 用来发送请求模块<工具>
import re # 提取数据工具
import os
import threading # 加入多线程
"""获取彼岸壁纸的4k游戏页"""
# 获取页面
def get_page(page_url, timeout=10):
    """Send a GET request that looks like it comes from a desktop browser.

    Args:
        page_url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the calling thread
            forever.

    Returns:
        The ``requests.Response`` object. Its ``encoding`` is overwritten
        with ``apparent_encoding`` so that ``.text`` is decoded with the
        charset detected from the body rather than the one guessed from
        the HTTP headers.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36',
    }
    response = requests.get(url=page_url, headers=headers, timeout=timeout)
    response.encoding = response.apparent_encoding
    return response
# 解析页面获取需要的
def parse(html):
    """Extract full-size image URLs and their titles from a listing page.

    For every ``(link, alt)`` pair found in the listing markup, the linked
    detail page is fetched with ``get_page`` and the ``src`` of the
    ``id="img"`` image is read from it.

    Args:
        html: Response-like object whose ``text`` attribute holds the
            listing page's HTML.

    Returns:
        A ``(url_list, title_list)`` tuple of equal-length lists;
        ``url_list[i]`` is the absolute image URL for ``title_list[i]``.
        Entries whose detail page contains no matching image tag are
        skipped instead of raising ``IndexError``.
    """
    base_url = 'https://pic.netbian.com'
    html_info = re.findall(r'<li><a href="(.*?)".*?alt="(.*?)" /><b>', html.text)
    url_list = []
    title_list = []
    for link, title in html_info:
        # Detail page, e.g. https://pic.netbian.com/tupian/29887.html
        second_html = get_page(base_url + link)
        img_match = re.search(r'id="img"><img src="(.*?)"', second_html.text)
        if img_match is None:
            # One malformed or blocked detail page must not abort the
            # whole listing; skip it and keep both lists aligned.
            continue
        url_list.append(base_url + img_match.group(1))
        title_list.append(title)
    return url_list, title_list
# 保存图片
def save(path, pic_content, title):
    """Write image bytes to ``<path><title>.jpg``.

    Args:
        path: Directory prefix; it is concatenated directly with the file
            name, so it must already end with a path separator.
        pic_content: Raw bytes of the image.
        title: Base name of the file. Characters that are not allowed in
            file names on common platforms (``\\ / : * ? " < > |``) are
            replaced with ``_`` so that a scraped title cannot make
            ``open`` fail or point into another directory.
    """
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
    with open(path + safe_title + '.jpg', mode='wb') as f:
        f.write(pic_content)
def download(url_link, title):
    """Download one image and store it in the local ``downloads`` directory.

    Args:
        url_link: Absolute URL of the image file.
        title: Base name to save the image under (passed on to ``save``).
    """
    url_data = get_page(url_link).content
    # Joining with an empty last component yields "downloads" plus the
    # platform's own separator, instead of a hard-coded backslash that is
    # only a separator on Windows.
    path = os.path.join("downloads", "")
    # This function runs in many threads at once; exist_ok avoids the
    # check-then-create race that raises FileExistsError.
    os.makedirs(path, exist_ok=True)
    save(path, url_data, title)
    print(url_link, title)
if __name__ == '__main__':
    # Crawl listing pages 1-2 and download every image in its own thread.
    threads = []
    for page in range(1, 3):
        # The site has no index_1 page: the first page is plain index.html.
        if page == 1:
            url = 'https://pic.netbian.com/4kyouxi/index.html'
        else:
            url = f'https://pic.netbian.com/4kyouxi/index_{page}.html'
        html_data = get_page(url)
        urls, titles = parse(html_data)
        # parse() returns two parallel lists; walk them in lockstep.
        for img_url, img_title in zip(urls, titles):
            t = threading.Thread(target=download, args=(img_url, img_title))
            t.start()
            threads.append(t)
    # Wait for every download explicitly so the script only finishes once
    # all files have been written.
    for t in threads:
        t.join()
图片上传图床地址: 带你去看
评论 (0)