爬虫相关知识

程序员文章站 2024-02-22 15:21:22

...

爬虫相关知识

知识点

beautifulsoup

soup = BeautifulSoup(demo, "html.parser")
"""
demo 表示被解析的html格式的内容
html.parser 表示解析用的解析器
"""
print('a标签的href属性是：', soup.a.attrs['href'])  # 通过字典的方式获取a标签的href属性

soup.find(name='div', attrs={"class": "xxxxx"})  # 定位一个标签
soup.find_all(name='div', attrs={"class": "xxxxx"})  # 定位多个标签，返回一个列表类型
img.get('src')  # 获取标签的属性
div.text  # 获取标签的文本内容

find例子：
print('所有a标签的内容：', soup.find_all('a'))  # 使用find_all()方法通过标签名称查找a标签，返回的是一个列表类型
print('a标签和b标签的内容：', soup.find_all(['a', 'b']))  # 把a标签和b标签作为一个列表传递，可以一次找到a标签和b标签

脚本范例：
爬取天极网：

import os
import re
import requests
from bs4 import BeautifulSoup

# Directory that holds this script; images are saved under
# <file_dir>/7160/<album title>/.
file_dir = os.path.dirname(os.path.abspath(__file__))

# Download the album index page. The response is decoded as GBK explicitly
# before .text is read.
respon = requests.get(url='http://pic.yesky.com/c/6_3655_6.shtml')
respon.encoding = 'gbk'
text = respon.text
soup = BeautifulSoup(text, 'html.parser')

# Every <dd> inside <div class="lb_box"> is processed as one album entry.
div_obj = soup.find(name='div', attrs={"class": "lb_box"})
# find() returns None when nothing matches; fall back to an empty list so the
# loop below is simply skipped instead of raising AttributeError.
li_list = div_obj.find_all(name='dd') if div_obj is not None else []

for i in li_list:
    img = i.find(name='a')
    if img is None:
        # Entry without a link: nothing to follow.
        continue

    alt = img.get('title')
    big_image = img.get('href')
    if alt is None or big_image is None:
        # Without a title there is no folder name, without href no detail page.
        continue

    # Replace characters that are not allowed in file names with '-'.
    # Raw string avoids the invalid "\/" escape; the backslash is now part of
    # the replaced set as well, since it is a path separator on Windows.
    alt = re.sub(r'[\\/:*?"<>|]', '-', alt)
    Base_dir = os.path.join(file_dir, '7160', alt)

    # Fetch the album detail page, decoded as GBK like the index page.
    big_respon = requests.get(url=big_image)
    big_respon.encoding = 'gbk'
    big_text = big_respon.text
    big_soup = BeautifulSoup(big_text, 'html.parser')
    big_obj = big_soup.find(name='div', attrs={"class": "overview"})
    if big_obj is None:
        # Detail page has no <div class="overview">; skip this album.
        continue

    big_list = big_obj.find_all(name='img')
    for thumb in big_list:
        src = thumb.get('src')
        if src is None:
            continue
        # Thumbnail and full-size URLs differ only in the size segment, so
        # swap it to obtain the large image URL.
        big_src = src.replace('113x113', '740x-')
        # Save under the album folder using the last path segment of the URL.
        File_dir = os.path.join(Base_dir, big_src.rsplit('/', 1)[-1])
        print(File_dir)

        # Request first, then open the file: a failed request no longer
        # leaves an empty file behind.
        res = requests.get(url=big_src)
        # exist_ok=True covers both "already there" and "needs creating",
        # replacing the duplicated if/else download branches.
        os.makedirs(Base_dir, exist_ok=True)
        with open(File_dir, 'wb') as f:
            f.write(res.content)
'''

爬虫相关知识

爬虫相关知识

知识点