代码:
import os
from hashlib import md5
from multiprocessing.pool import Pool
from urllib.parse import urlencode

import requests

GROUP_START = 1
GROUP_END = 5

# Seconds to wait on any single HTTP request; without a timeout a stalled
# connection would block its pool worker indefinitely.
REQUEST_TIMEOUT = 10

# Characters that are illegal (or act as path separators) in directory names
# on at least one major platform; each is mapped to an underscore.
_UNSAFE_DIR_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|\r\n\t'})


def get_page(offset):
    """Fetch one page of search results as parsed JSON.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON body when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network failure, timeout, or a
        body that is not valid JSON).
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '3',
        'form': 'search_tab',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError):
        # RequestException covers connection errors and timeouts; ValueError
        # covers a 200 response whose body cannot be decoded as JSON.
        pass
    return None


def get_images(json):
    """Yield one ``{'image': url, 'title': title}`` dict per image in a page.

    Args:
        json: The mapping returned by ``get_page``, or ``None`` when the
            page could not be fetched.

    Yields:
        A dict pairing each entry of an item's ``image_list`` (its ``url``
        field) with that item's ``title``. Nothing is yielded when ``json``
        is empty/``None`` or has no ``data``.
    """
    # get_page returns None on failure; treat that as an empty page instead
    # of raising AttributeError on None.get.
    data = json.get('data') if json else None
    if not data:
        return
    for item in data:
        image_list = item.get('image_list')
        title = item.get('title')
        if not image_list:
            continue
        for image in image_list:
            # Return the image URL together with its title.
            yield {
                'image': image.get('url'),
                'title': title,
            }


def _safe_dirname(title):
    """Turn an article title into a name usable as a single directory."""
    cleaned = (title or '').translate(_UNSAFE_DIR_CHARS).strip(' .')
    return cleaned or 'untitled'


def save_image(item):
    """Download one image into a directory named after its title.

    Args:
        item: A dict as yielded by ``get_images`` with keys ``'image'``
            (image URL) and ``'title'``.

    The file is stored as ``<title>/<md5 of content>.jpg``; a file whose
    path already exists is not rewritten. Network and filesystem errors are
    reported on stdout and otherwise ignored so that one bad image does not
    abort the whole run.
    """
    image_url = item.get('image')
    if not image_url:
        print('Failed to save image')
        return

    folder = _safe_dirname(item.get('title'))
    # NOTE: this replaces every occurrence of 'list' in the URL with 'large'
    # (presumably switching from the thumbnail to the full-size path).
    image_url = image_url.replace('list', 'large')
    # Only protocol-relative URLs ('//host/path') need a scheme prepended;
    # URLs that already carry one are used unchanged.
    if image_url.startswith('//'):
        image_url = 'http:' + image_url

    try:
        # exist_ok avoids a crash when several pool workers create the same
        # directory at the same moment.
        os.makedirs(folder, exist_ok=True)
        response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200:
            return
        file_path = '{0}/{1}.{2}'.format(
            folder, md5(response.content).hexdigest(), 'jpg')
        # Write the file only when the path does not exist yet.
        if os.path.exists(file_path):
            print('Already Download', file_path)
            return
        with open(file_path, 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as exc:
        print('Failed to save image', exc)


def main(offset):
    """Fetch the page at ``offset``, then print and save each of its images."""
    page = get_page(offset)
    for item in get_images(page):
        print(item)
        save_image(item)


if __name__ == '__main__':
    pool = Pool()  # create the process pool
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    try:
        # Each offset in groups is passed to main in a worker process.
        pool.map(main, groups)
    finally:
        pool.close()  # stop accepting new tasks
        pool.join()   # wait for the worker processes to exit
结果:
此时可以看到文件夹里:
随便打开一个:
Successful!