欢迎您访问程序员文章站!本站旨在为大家分享程序员计算机编程知识!
您现在的位置是: 首页

python爬虫绝对领域

程序员文章站 2022-05-09 21:16:57
...

前言

试着爬了一下绝对领域,学习测试用


代码如下(示例):


import json
import os
import re
import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
# NOTE(review): module-level duplicates of the values set in JD.__init__;
# nothing in this file reads these two names — candidates for removal.
url = "https://www.jdlingyu.com/"
tagurl="https://www.jdlingyu.com/tuji/hentai/gctt/82096.html"




class JD():
    """Scraper for jdlingyu.com.

    Walks the site's tag cloud, collects the gallery URLs listed under
    each tag, persists the mapping to ``tag_all_url.json``, and downloads
    every image of every gallery into a nested folder tree
    ``img/<tag>/<gallery>/``.
    """

    # Characters that are illegal in Windows file names; replaced by "_"
    # when a tag/gallery title is turned into a directory name.
    # (Raw string fixes the original's non-raw '\/' escape, and the
    # backslash is now actually matched as well.)
    _ILLEGAL_CHARS = re.compile(r'[\\/:*?"<>|]')

    def __init__(self):
        self.url = "https://www.jdlingyu.com/"
        # Any article page works here: the sidebar tag cloud scraped by
        # get_all_tag() is present on every article.
        self.tagurl="https://www.jdlingyu.com/tuji/hentai/gctt/82096.html"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        # Query string appended to a tag URL so posts are ordered by views.
        self.post="?post_order=views"

    def get_all_tag(self):
        """Return ``{tag name: tag URL}`` scraped from the page's tag cloud."""
        response = requests.get(self.tagurl, headers=self.header)
        soup = BeautifulSoup(response.content.decode("utf-8"), "html.parser")
        cloud = soup.find("div", attrs={"class": "tagcloud"})
        tagdict = {}
        for tag in cloud.find_all("a", attrs={"class": "tag-cloud-link"}):
            tagdict[tag.text] = tag.attrs["href"]
        return tagdict

    def get_a_tag_url(self, url):
        """Return ``{gallery title: gallery URL}`` for one tag page.

        The page is requested with ``self.post`` appended so the most
        viewed galleries come first.
        """
        response = requests.get(url + self.post, headers=self.header)
        soup = BeautifulSoup(response.content.decode("utf-8"), "html.parser")
        # NOTE: the trailing space in "b2_gap " matches the site's markup.
        gap_soup = soup.find("ul", attrs={"class": "b2_gap "})
        gapdict = {}
        for gap in gap_soup.find_all("div", attrs={"class": "post-info"}):
            link = gap.find("a")
            print(link.attrs["href"])
            gapdict[link.text] = link.attrs["href"]
        return gapdict

    def tag_all_url(self):
        """Collect every gallery URL under every tag and dump the nested
        mapping ``{tag: {gallery title: gallery URL}}`` to
        ``tag_all_url.json``. Returns the mapping."""
        gapdict = {}
        for tag, url in self.get_all_tag().items():
            gapdict[tag] = self.get_a_tag_url(url)
        with open("tag_all_url.json", "w", encoding='utf-8') as f:
            f.write(json.dumps(gapdict, ensure_ascii=False, indent=4, separators=(',', ':')))
        return gapdict

    def load_json(self) -> dict:
        """Load and return the mapping previously written by tag_all_url()."""
        with open("tag_all_url.json", encoding="utf-8") as f:
            return json.load(f)

    def get_img(self, url):
        """Download every <img> inside the gallery page's entry-content div
        into the current working directory, named after the URL basename.

        BUG FIX: the original downloaded ``imglist[0]`` a second time after
        the loop and crashed with IndexError on galleries without images;
        the loop alone already covers every image.
        """
        response = requests.get(url, headers=self.header)
        soup = BeautifulSoup(response.content.decode("utf-8"), "html.parser")
        imglist = soup.find("div", attrs={"class": "entry-content"}).find_all("img")
        for img in imglist:
            src = img["src"]
            urllib.request.urlretrieve(src, src.split("/")[-1])

    def folder(self, listname):
        """Create a directory for *listname* in the cwd and return its path.

        Illegal filename characters are replaced with "_". If the sanitized
        name already exists, "<name>1" is created instead (original
        collision policy, kept for compatibility).
        """
        path = self._ILLEGAL_CHARS.sub("_", listname)
        if os.path.exists(path):  # avoid clobbering an existing folder
            path = path + str(1)
        os.makedirs(path)
        return path

    def run(self):
        """Download everything listed in tag_all_url.json into
        ``img/<tag>/<gallery>/`` under the current directory."""
        json_file = self.load_json()
        # BUG FIX: always chdir into the directory folder() actually
        # created (it may be "img1"/"<name>1" on a name collision); the
        # original hard-coded the unsanitized name and could chdir into
        # the wrong — or a nonexistent — directory.
        os.chdir(self.folder("img"))
        for k, v in json_file.items():
            k = self._ILLEGAL_CHARS.sub("_", k)
            print("正在下载大分类:"+k)
            tag_dir = self.folder(k)
            print("文件夹建立成功,进入文件夹中")
            os.chdir(tag_dir)
            for k2, v2 in v.items():
                k2 = self._ILLEGAL_CHARS.sub("_", k2)
                print("正在下载:"+k2)
                gallery_dir = self.folder(k2)
                print("文件夹建立成功,进入文件夹中")
                os.chdir(gallery_dir)
                try:
                    self.get_img(v2)
                    print("下载完毕,正在返回上一级")
                except Exception:
                    # best-effort: a failed gallery must not abort the crawl
                    print("下载失败,正在返回上一级")
                os.chdir("..")
            os.chdir("..")
               







if __name__ == '__main__':
    # Entry point: crawl the site and download every gallery image.
    # NOTE(review): run() expects tag_all_url.json to exist — call
    # JD().tag_all_url() once first to generate it.
    JD().run()