欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员所需的计算机编程知识。
您现在的位置是: 首页

python爬虫练习6:今日头条搜索图集

程序员文章站 2022-04-26 09:40:21
...

描述

根据关键词下载图集等。结果如图:

python爬虫练习6:今日头条搜索图集



python爬虫练习6:今日头条搜索图集

代码

import requests
from urllib.parse import urlencode
import json
import os
from hashlib import md5
from multiprocessing import Pool

def get_page_index(offset, keyword, timeout=10):
    """Fetch one page of Toutiao gallery search results as raw text.

    Args:
        offset: Value sent as the ``offset`` query parameter.
        keyword: Search keyword sent as the ``keyword`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller forever.

    Returns:
        The response body as text when the server answers with HTTP 200;
        ``None`` for any other status code or when the request fails.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'from': 'gallery',
    }
    url = 'https://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Only network-level failures are treated as "no page"; programming
        # errors and KeyboardInterrupt are no longer swallowed.
        print('请求索引出错')
        return None
    if response.status_code == 200:
        return response.text
    return None
def parse_page_index(content):
    """Parse a search-result page, download its images, and yield metadata.

    Each entry of the ``data`` list that carries an ``image_detail`` list is
    processed independently: every image URL in it is handed to
    ``download_image`` and one summary dict is yielded. Entries that are
    malformed (not a dict, or without an ``image_detail`` list) are skipped
    instead of aborting the rest of the page.

    Args:
        content: JSON text of a search-result page, or ``None`` when the
            page could not be fetched.

    Yields:
        dict: One record per usable entry. ``image_detail`` holds the list
        of image URLs; ``media_url`` is the absolute author URL, or ``None``
        when the entry has no ``media_url``.
    """
    try:
        payload = json.loads(content)
    except (TypeError, ValueError):
        # TypeError: content is None / not text; ValueError: invalid JSON.
        return
    if not isinstance(payload, dict):
        return

    for item in payload.get('data') or []:
        if not isinstance(item, dict):
            continue
        details = item.get('image_detail')
        if not isinstance(details, list):
            # No image list on this entry: nothing to download or report.
            continue

        img = [
            detail.get('url')
            for detail in details
            if isinstance(detail, dict) and detail.get('url')
        ]
        title = item.get('title')
        for img_url in img:
            download_image(title, img_url)

        # Concatenating with a missing media_url would raise TypeError.
        media_path = item.get('media_url')
        media_url = 'https://www.toutiao.com' + media_path if media_path else None

        yield {
            'article_url': item.get('article_url'),
            'title': title,
            'datetime': item.get('datetime'),
            'favorite_count': item.get('favorite_count'),
            'repin_count': item.get('repin_count'),
            "gallary_image_count": item.get('gallary_image_count'),
            'image_detail': img,
            'media_name': item.get('media_name'),  # author
            "media_avatar_url": item.get('media_avatar_url'),
            'media_url': media_url,  # author home page
            'comments_count': item.get('comments_count'),
            'item_id': item.get('item_id'),
            'group_id': item.get('group_id'),
        }

def download_image(title, url):
    """Download one image into ``<cwd>/今日头条-<KEYWORD>/`` (best effort).

    The file is named after the MD5 hex digest of its bytes with a ``.jpg``
    extension, so identical images are written only once. Failures are
    reported on stdout and never raised to the caller.

    Args:
        title: Title of the entry the image belongs to; used only in the
            progress messages.
        url: URL of the image to download.

    Returns:
        None.
    """
    try:
        # A timeout keeps one stalled download from blocking the worker.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            content = response.content
            path = '{0}/{2}-{1}'.format(os.getcwd(), KEYWORD, '今日头条')
            # exist_ok avoids the check-then-create race between pool
            # workers, which previously surfaced as a bogus download error.
            os.makedirs(path, exist_ok=True)
            file_path = '{0}/{1}.{2}'.format(path, md5(content).hexdigest(), 'jpg')
            if not os.path.exists(file_path):
                with open(file_path, 'wb') as f:
                    f.write(content)
            print("下载文件成功:", title, url)
        return None
    except (requests.RequestException, OSError):
        # Network failures (including malformed URLs) and filesystem errors
        # are expected here; anything else is a bug and should surface.
        print('下载文件出错', title, url)
        return None

def main(offset):
    """Fetch the result page at *offset* for ``KEYWORD`` and process it.

    The records produced by ``parse_page_index`` are discarded; the
    generator is exhausted only so that its per-entry work is carried out.
    """
    page_text = get_page_index(offset, KEYWORD)
    records = parse_page_index(page_text)
    for _record in records:
        pass

GROUP_START = 1
GROUP_END = 10  # number of search pages to fetch; adjust as needed
KEYWORD = "街拍"  # search keyword; adjust as needed

if __name__ == '__main__':
    # One offset per page, advancing in steps of 20.
    groups = [page * 20 for page in range(GROUP_START - 1, GROUP_END)]
    # The context manager releases the worker processes once map() has
    # returned, instead of leaving the pool open until interpreter exit.
    with Pool() as pool:
        pool.map(main, groups)