Python in Action: Scraping Street-Style Photos from Toutiao
Hahaha~ today I finished the little task of scraping street-style photos from Toutiao. I was following a NetEase Cloud Classroom lesson, but there's plenty of my own thinking in here too! Notes as follows:
Environment: Windows 10 and Spyder. Why am I not on Ubuntu 16 with PyCharm? Because the network on my Ubuntu laptop is broken; if I ever manage to fix it, I'll definitely write a post about that. Side note: Anaconda really is a joy to use~ powerful, lightweight, beautiful, elegant~
The end result: scrape Toutiao's street-style photos and save them under a given path (in the video the teacher used MongoDB; I didn't install it). Screenshot below:
The jpg files on the left are the images the program grabbed~ Plus: if I can find the right sources, scraping naughty pics can't be far off, hiahiahia~
Here is the full code. As the screenshot shows, the program is simple: just one spider.py and one config.py.
# -*- coding: utf-8 -*-
# spider.py
import json
import re
import os
import sys
import time
from hashlib import md5
from multiprocessing import Pool

import requests
from requests import RequestException
from bs4 import BeautifulSoup

sys.path.append(r"E:\PythonProject\jiepai")  # raw string, or the backslashes get treated as escapes
import config


def get_page_index(offset, keyword):
    data = {
        'offset': offset,
        'format': 'json',
        'keyword': keyword,
        'autoload': 'true',
        'count': '20',
        'cur_tab': 1,
        'from': 'search_tab'
    }
    headers = {
        'referer': 'https://www.toutiao.com/search/?keyword=%E8%A1%97%E6%8B%8D',
        'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Mobile Safari/537.36',
        'x-requested-with': 'XMLHttpRequest',
        'cookie': 'uuid="w:eda26cb7f71b4082989cf72050143613"; _ga=GA1.2.1465496017.1517130472; UM_distinctid=1613c05db45524-05319a30e1b5b8-4323461-100200-1613c05db46aaf; tt_webid=6516025715698451976; sso_login_status=0; tt_webid=6516025715698451976; WEATHER_CITY=%E5%8C%97%E4%BA%AC; CNZZDATA1259612802=2081794518-1517129274-%7C1517449101; __tasessionId=zrltezo5h1517450759922'
    }
    url = 'https://www.toutiao.com/search_content'
    try:
        response = requests.get(url, params=data, headers=headers)
        # print(type(response))  # <class 'requests.models.Response'>
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Failed to request the index page')
        return None


def parse_page_index(html):
    # the index page is JSON; each item under 'data' may carry an article_url
    parse_data = json.loads(html)
    # print(type(parse_data))  # <class 'dict'>
    if parse_data and 'data' in parse_data.keys():
        for item in parse_data.get('data'):
            if item and 'article_url' in item.keys():
                yield item.get('article_url')


def get_page_detail(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('Failed to request the detail page', url)
        return None


def parse_page_detail(detail_html):
    soup = BeautifulSoup(detail_html, 'lxml')
    # print(soup)
    title = soup.select('title')[0].get_text()
    print(title)
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
    # the gallery data is JSON with escaped quotes, so replace \" ----> " first
    list_data = re.findall(images_pattern, detail_html.replace('\\"', '"'))
    # print(type(list_data))  # <class 'list'>
    if list_data:
        str_data = list_data[0]  # <class 'str'>
        json_data = json.loads(str_data)
        # print(type(json_data))  # dict
        if json_data and 'sub_images' in json_data.keys():
            sub_images = json_data.get('sub_images')  # <class 'list'>
            for item in sub_images:
                image_url = item.get('url')
                # print(image_url)
                download_image(image_url)
            return True


def download_image(image_url):
    # print(type(image_url))
    image_standardurl = image_url.replace('\\', '')
    print('Downloading image', image_standardurl)
    try:
        response = requests.get(image_standardurl)
        # print(response.status_code)
        if response.status_code == 200:
            save_image(response.content)
    except RequestException:
        print('Failed to request the image', image_url)


def save_image(content):
    dir_path = os.path.join(os.getcwd(), 'pictures')
    os.makedirs(dir_path, exist_ok=True)  # the original assumed this folder already existed
    # the md5 of the image bytes is the filename, so duplicates are saved only once
    file_path = '{0}/{1}.{2}'.format(dir_path, md5(content).hexdigest(), 'jpg')
    file_standardpath = file_path.replace('\\', '/')
    # print(file_standardpath)
    if not os.path.exists(file_standardpath):
        with open(file_standardpath, 'wb') as f:
            f.write(content)


def main(offset):
    html = get_page_index(offset, config.KEYWORD)
    # print(type(html))  # <class 'str'>
    if not html:
        return  # the index request failed; nothing to parse
    for url in parse_page_index(html):
        detail_html = get_page_detail(url)
        if detail_html:
            time.sleep(3)
            parse_page_detail(detail_html)


if __name__ == '__main__':
    groups = [x * 20 for x in range(config.GROUP_START, config.GROUP_END)]
    pool = Pool()
    pool.map(main, groups)
    pool.close()
    pool.join()
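By the way, in the video the teacher wrote each gallery's info into MongoDB instead of only saving files. I skipped that, but for completeness here is a minimal, untested sketch with pymongo, assuming a MongoDB instance running locally; the database and collection names 'toutiao' and 'jiepai' are placeholders I made up:

# A hedged sketch of the MongoDB step from the video, not code I ran.
from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['toutiao']  # hypothetical database name

def save_to_mongo(result):
    # result would be a dict such as {'title': ..., 'url': ..., 'images': [...]}
    if db['jiepai'].insert_one(result):  # hypothetical collection name
        print('Saved to MongoDB', result.get('title'))
        return True
    return False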
The main things to learn here are the usage of requests, BeautifulSoup, and os, plus md5 hashing and multiprocessing.
The basic syntax involved: lists, strings, and dictionaries.
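The md5 bit is worth a second look: hashing the image bytes yields a filename that depends only on the content, so identical pictures are never saved twice, even across runs. A quick illustration (the bytes are just a stand-in for response.content):

from hashlib import md5

content = b'pretend these are image bytes'  # stand-in for response.content
name = md5(content).hexdigest()  # always the same 32-char hex string for the same bytes
print(name + '.jpg')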
The overall idea: first search Toutiao for '街拍' (street style), as shown:
Then open the developer tools and look at the XHR requests under the Network tab:
What we're after is the article_url field inside the data items shown under Preview; we then fetch each article_url and harvest the images inside.
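Roughly, the index response looks like the sketch below; parse_page_index() only walks the 'data' list and pulls out each article_url (all values here are invented placeholders):

# Assumed shape of the search_content JSON (placeholder values):
# {
#     "count": 20,
#     "offset": 20,
#     "data": [
#         {"article_url": "https://www.toutiao.com/a123...", "title": "..."},
#         {"article_url": "https://www.toutiao.com/a456...", "title": "..."}
#     ]
# }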
Analyzing the image URLs takes a regular expression. gallery: JSON.parse\("(.*?)"\), means: the match starts with gallery: JSON.parse(" and ends with "), and the .*? in between matches any number of characters, but lazily, i.e. with as few repetitions as will still let the whole match succeed. Note that re.match() won't work here, because match() only matches at the very beginning of the string; re.search() would return just the first occurrence, while re.findall() conveniently returns every captured group as a list. For the differences among the three functions, see https://www.cnblogs.com/tina-python/p/5508402.html
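A tiny demo of the difference, on a made-up string shaped like the page source:

import re

sample = 'foo gallery: JSON.parse("{"sub_images": []}"), bar'  # made-up page snippet
pattern = re.compile(r'gallery: JSON.parse\("(.*?)"\),', re.S)
print(re.findall(pattern, sample))          # ['{"sub_images": []}'] -- every captured group
print(re.search(pattern, sample).group(1))  # same text, but only the first hit
print(re.match(pattern, sample))            # None: match() is anchored to the start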
The code for config.py:
# -*- coding: utf-8 -*-
#config.py
GROUP_START = 1
GROUP_END = 9
KEYWORD = '街拍'
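With these values, the list comprehension in __main__ expands to the offsets handed to Pool.map():

groups = [x * 20 for x in range(1, 9)]
print(groups)  # [20, 40, 60, 80, 100, 120, 140, 160]
# note: offset 0 (the very first page) is skipped because GROUP_START is 1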
Yep. That's it. Doesn't seem like there's much more to say~ still such a noob~ time to go pad my GitHub a bit