Python crawler: 绝对领域 (jdlingyu.com)
Preface
I tried scraping 绝对领域 (jdlingyu.com), for learning and testing purposes only.
The code is as follows (example):
import json
import os
import re  # needed by the re.sub() calls below; missing from the original imports
import urllib.request  # "import urllib" alone does not reliably expose urllib.request

import requests
from bs4 import BeautifulSoup

class JD():
    def __init__(self):
        self.url = "https://www.jdlingyu.com/"
        # a sample post page whose markup includes the site's tag cloud
        self.tagurl = "https://www.jdlingyu.com/tuji/hentai/gctt/82096.html"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        self.post = "?post_order=views"  # query string: list a tag's posts by view count

    def get_all_tag(self):
        """Collect every tag name and link from the tag cloud."""
        response = requests.get(self.tagurl, headers=self.header)
        res_html = response.content.decode("utf-8")
        allsoup = BeautifulSoup(res_html, "html.parser")
        tagcloud_soup = allsoup.find("div", attrs={"class": "tagcloud"})
        tagcloud = tagcloud_soup.find_all("a", attrs={"class": "tag-cloud-link"})
        tagdict = {}
        for tag in tagcloud:
            tagdict[tag.text] = tag.attrs["href"]
        return tagdict

    def get_a_tag_url(self, url):
        """Collect the title and link of every post on a tag's listing page (first page only)."""
        response = requests.get(url + self.post, headers=self.header)
        res_html = response.content.decode("utf-8")
        allsoup = BeautifulSoup(res_html, "html.parser")
        # the class string is copied from the site's markup, trailing space included
        gap_soup = allsoup.find("ul", attrs={"class": "b2_gap "})
        gaps = gap_soup.find_all("div", attrs={"class": "post-info"})
        gapdict = {}
        for gap in gaps:
            print(gap.find("a").attrs["href"])
            gapdict[gap.find("a").text] = gap.find("a").attrs["href"]
        return gapdict

    def tag_all_url(self):
        """Fetch the post list for every tag and save the whole index to a JSON file."""
        tagdict = self.get_all_tag()
        gapdict = {}
        for tag, url in tagdict.items():
            gapdict[tag] = self.get_a_tag_url(url)
        with open("tag_all_url.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(gapdict, ensure_ascii=False, indent=4, separators=(',', ':')))
        return gapdict

    def load_json(self) -> dict:
        """Load the saved JSON index back into a dict."""
        with open("tag_all_url.json", encoding="utf-8") as f:
            json_file = json.load(f)
        return json_file

    def get_img(self, url):
        """Download every image in a post's entry-content block into the current folder."""
        response = requests.get(url, headers=self.header)
        res_html = response.content.decode("utf-8")
        res_html = BeautifulSoup(res_html, "html.parser")
        imglist = res_html.find("div", attrs={"class": "entry-content"}).find_all("img")
        for newurl in imglist:
            newurl = newurl["src"]
            urllib.request.urlretrieve(newurl, newurl.split("/")[-1])

    def folder(self, listname):
        """Create a folder, replacing characters that are illegal in Windows file names."""
        listname = re.sub(r'[\\/:*?"<>|]', "_", listname)
        path = listname
        if os.path.exists(path):  # avoid clashing with an existing folder of the same name
            path = path + "1"
        os.makedirs(path)
        return path

    def run(self):
        json_file = self.load_json()  # requires tag_all_url() to have been run at least once
        os.chdir(self.folder("img"))
        for k, v in json_file.items():
            print("Downloading tag: " + k)
            # chdir into the path folder() actually created (it may be renamed on collision)
            os.chdir(self.folder(k))
            print("Folder created, entering it")
            for k2, v2 in v.items():
                print("Downloading post: " + k2)
                os.chdir(self.folder(k2))
                print("Folder created, entering it")
                try:
                    self.get_img(v2)
                    print("Done, going back up a level")
                except Exception:
                    print("Download failed, going back up a level")
                os.chdir("..")
            os.chdir("..")


if __name__ == '__main__':
    JD().run()
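
One gotcha worth flagging: run() reads tag_all_url.json via load_json(), so the index has to be built by tag_all_url() at least once before run() can download anything. The JSON ends up shaped like {"tag name": {"post title": "post url", ...}, ...}. Below is a minimal two-step driver sketch; the file names driver.py and jd.py, and the existence check, are my own illustrative additions, not part of the original.

# driver.py -- hypothetical driver for the JD crawler above
import os

from jd import JD  # assumes the class above was saved as jd.py

spider = JD()
if not os.path.exists("tag_all_url.json"):
    spider.tag_all_url()  # step 1: crawl the tag cloud and write the JSON index
spider.run()              # step 2: walk the index and download every post's images

Splitting the two steps this way also means the slow index-building crawl is not repeated on every run; delete tag_all_url.json to force a fresh index.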