Python crawler: 绝对领域 (jdlingyu.com)
Preface
I tried scraping 绝对领域 (jdlingyu.com), for learning and testing purposes only.
The code is as follows (example):
import json
import os
import re  # needed by the re.sub() calls below; missing from the original imports
import urllib.request  # "import urllib" alone does not reliably expose urllib.request

import requests
from bs4 import BeautifulSoup

class JD():
    def __init__(self):
        self.url = "https://www.jdlingyu.com/"
        # a sample post page whose markup includes the site's tag cloud
        self.tagurl = "https://www.jdlingyu.com/tuji/hentai/gctt/82096.html"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        self.post = "?post_order=views"  # query string: list a tag's posts by view count

    def get_all_tag(self):
        """Collect every tag name and link from the tag cloud."""
        response = requests.get(self.tagurl, headers=self.header)
        res_html = response.content.decode("utf-8")
        allsoup = BeautifulSoup(res_html, "html.parser")
        tagcloud_soup = allsoup.find("div", attrs={"class": "tagcloud"})
        tagcloud = tagcloud_soup.find_all("a", attrs={"class": "tag-cloud-link"})
        tagdict = {}
        for tag in tagcloud:
            tagdict[tag.text] = tag.attrs["href"]
        return tagdict

    def get_a_tag_url(self, url):
        """Collect the title and link of every post on a tag's listing page (first page only)."""
        response = requests.get(url + self.post, headers=self.header)
        res_html = response.content.decode("utf-8")
        allsoup = BeautifulSoup(res_html, "html.parser")
        # the class string is copied from the site's markup, trailing space included
        gap_soup = allsoup.find("ul", attrs={"class": "b2_gap "})
        gaps = gap_soup.find_all("div", attrs={"class": "post-info"})
        gapdict = {}
        for gap in gaps:
            print(gap.find("a").attrs["href"])
            gapdict[gap.find("a").text] = gap.find("a").attrs["href"]
        return gapdict

    def tag_all_url(self):
        """Fetch the post list for every tag and save the whole index to a JSON file."""
        tagdict = self.get_all_tag()
        gapdict = {}
        for tag, url in tagdict.items():
            gapdict[tag] = self.get_a_tag_url(url)
        with open("tag_all_url.json", "w", encoding="utf-8") as f:
            f.write(json.dumps(gapdict, ensure_ascii=False, indent=4, separators=(',', ':')))
        return gapdict

    def load_json(self) -> dict:
        """Load the saved JSON index back into a dict."""
        with open("tag_all_url.json", encoding="utf-8") as f:
            json_file = json.load(f)
        return json_file

    def get_img(self, url):
        """Download every image in a post's entry-content block into the current folder."""
        response = requests.get(url, headers=self.header)
        res_html = response.content.decode("utf-8")
        res_html = BeautifulSoup(res_html, "html.parser")
        imglist = res_html.find("div", attrs={"class": "entry-content"}).find_all("img")
        for newurl in imglist:
            newurl = newurl["src"]
            urllib.request.urlretrieve(newurl, newurl.split("/")[-1])

    def folder(self, listname):
        """Create a folder, replacing characters that are illegal in Windows file names."""
        listname = re.sub(r'[\\/:*?"<>|]', "_", listname)
        path = listname
        if os.path.exists(path):  # avoid clashing with an existing folder of the same name
            path = path + "1"
        os.makedirs(path)
        return path

    def run(self):
        json_file = self.load_json()  # requires tag_all_url() to have been run at least once
        os.chdir(self.folder("img"))
        for k, v in json_file.items():
            print("Downloading tag: " + k)
            # chdir into the path folder() actually created (it may be renamed on collision)
            os.chdir(self.folder(k))
            print("Folder created, entering it")
            for k2, v2 in v.items():
                print("Downloading post: " + k2)
                os.chdir(self.folder(k2))
                print("Folder created, entering it")
                try:
                    self.get_img(v2)
                    print("Done, going back up a level")
                except Exception:
                    print("Download failed, going back up a level")
                os.chdir("..")
            os.chdir("..")


if __name__ == '__main__':
    JD().run()
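
One gotcha worth flagging: run() reads tag_all_url.json via load_json(), so the index has to be built by tag_all_url() at least once before run() can download anything. The JSON ends up shaped like {"tag name": {"post title": "post url", ...}, ...}. Below is a minimal two-step driver sketch; the file names driver.py and jd.py, and the existence check, are my own illustrative additions, not part of the original.

# driver.py -- hypothetical driver for the JD crawler above
import os

from jd import JD  # assumes the class above was saved as jd.py

spider = JD()
if not os.path.exists("tag_all_url.json"):
    spider.tag_all_url()  # step 1: crawl the tag cloud and write the JSON index
spider.run()              # step 2: walk the index and download every post's images

Splitting the two steps this way also means the slow index-building crawl is not repeated on every run; delete tag_all_url.json to force a fresh index.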