Python爬虫练习6:今日头条搜索图集
程序员文章站
2022-04-26 09:40:21
...
描述
根据关键词搜索今日头条,并下载搜索结果中各图集的图片。结果如图:
代码
import requests
from urllib.parse import urlencode
import json
import os
from hashlib import md5
from multiprocessing import Pool
def get_page_index(offset, keyword):
    """Fetch one page of Toutiao search results as raw text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search term sent as the ``keyword`` query parameter.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,  # presumably selects the gallery tab -- TODO confirm
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(data)
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        # Only network/HTTP failures are handled here; KeyboardInterrupt and
        # programming errors are allowed to propagate.
        print('请求索引出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(content):
    """Parse one search-result page, download its images and yield metadata.

    Every image URL found in an entry is passed to ``download_image`` before
    the entry's metadata dict is yielded.

    Args:
        content: JSON text of a result page, or ``None`` when the request
            failed.

    Yields:
        One dict per entry of the ``data`` array that carries an
        ``image_detail`` list. Entries without one are skipped instead of
        aborting the whole page. ``media_url`` is ``None`` when the entry has
        no such field.
    """
    try:
        content = json.loads(content)
    except (TypeError, ValueError):
        # TypeError: content is None; ValueError: malformed JSON.
        return
    if not isinstance(content, dict):
        return

    for item in content.get('data') or []:
        if not isinstance(item, dict):
            continue
        image_detail = item.get('image_detail')
        if not isinstance(image_detail, list):
            # Entry without an image list: skip it and keep going.
            continue

        img = [
            detail.get('url')
            for detail in image_detail
            if isinstance(detail, dict)
        ]
        title = item.get('title')
        for img_url in img:
            if img_url:
                download_image(title, img_url)

        media_url = item.get('media_url')
        if isinstance(media_url, str):
            media_url = 'https://www.toutiao.com' + media_url
        else:
            media_url = None

        yield {
            'article_url': item.get('article_url'),
            'title': title,
            'datetime': item.get('datetime'),
            'favorite_count': item.get('favorite_count'),
            'repin_count': item.get('repin_count'),  # not shown in the original article
            "gallary_image_count": item.get('gallary_image_count'),
            'image_detail': img,
            'media_name': item.get('media_name'),  # author
            "media_avatar_url": item.get('media_avatar_url'),
            'media_url': media_url,  # author's home page
            'comments_count': item.get('comments_count'),
            'item_id': item.get('item_id'),
            'group_id': item.get('group_id'),
        }
def download_image(title, url):
    """Download one image and save it under the keyword's directory.

    The file is written to ``<cwd>/今日头条-<KEYWORD>/<md5 of content>.jpg``.
    Because the name is the MD5 of the bytes, identical images are stored
    only once.

    Args:
        title: Title of the gallery the image belongs to; used only in the
            progress messages.
        url: Address of the image to fetch.

    Returns:
        Always ``None``. Failures are reported on stdout, not raised.
    """
    try:
        # A timeout keeps a stalled connection from blocking a worker forever.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            content = response.content
            path = '{0}/{2}-{1}'.format(os.getcwd(), KEYWORD, '今日头条')
            # exist_ok avoids the check-then-create race between the pool's
            # worker processes, which all target the same directory.
            os.makedirs(path, exist_ok=True)
            file_path = '{0}/{1}.{2}'.format(path, md5(content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(content)
                print("下载文件成功:", title, url)
        return None
    except (requests.RequestException, OSError):
        # Network and filesystem errors are best-effort; anything else
        # (including KeyboardInterrupt) propagates.
        print('下载文件出错', title, url)
        return None
def main(offset):
    """Fetch the result page at ``offset`` and process every entry on it.

    The parsed entries themselves are discarded; the generator is drained
    only for the image downloads it performs while iterating.
    """
    entries = parse_page_index(get_page_index(offset, KEYWORD))
    for _ in entries:
        continue
GROUP_START = 1  # first result page to fetch (1-based)
GROUP_END = 10  # last result page to fetch; user-adjustable
KEYWORD = "街拍"  # search keyword; user-adjustable

if __name__ == '__main__':
    # Offsets advance in steps of 20, so page i starts at offset (i - 1) * 20.
    groups = [i * 20 for i in range(GROUP_START - 1, GROUP_END)]
    # The context manager shuts the worker processes down once map() has
    # returned, instead of leaving the pool open until interpreter exit.
    with Pool() as pool:
        pool.map(main, groups)
上一篇: 简单爬今日头条街拍获取图集
下一篇: 高德百度地图如何获取附近小区酒店银行等?