
Images Crawler

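This script crawls the IPM Images search results for the keyword "smut": it collects the detail-page link for every hit, writes each image's id and label to a CSV file, and downloads the full-size images to a local folder, following the "next" pagination link until the results run out.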
import urllib.request
from bs4 import BeautifulSoup
import ssl
import re
import csv

main_url = 'https://www.ipmimages.org/search/action.cfm?q=smut'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
# The same User-Agent as a list of tuples, the format an opener's
# addheaders attribute expects (used for urlretrieve in download_item).
open_headers = [('User-Agent',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36')]
common_head = 'https://www.ipmimages.org'
# Disable HTTPS certificate verification globally. This is insecure, but
# sidesteps certificate errors urllib sometimes raises on macOS installs.
ssl._create_default_https_context = ssl._create_unverified_context


def open_page(url):
    # Fetch a page with a browser-like User-Agent and return parsed soup.
    req = urllib.request.Request(url=url, headers=headers)
    html = urllib.request.urlopen(req).read()
    return BeautifulSoup(html, features='html.parser')
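# A variant sketch (an addition, not in the original): the same fetch with a
# timeout and a simple retry loop, since long crawls tend to hit transient
# network errors. The function name and parameters are illustrative.
def open_page_with_retry(url, retries=3, timeout=30):
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url=url, headers=headers)
            html = urllib.request.urlopen(req, timeout=timeout).read()
            return BeautifulSoup(html, features='html.parser')
        except OSError:  # urllib.error.URLError subclasses OSError
            if attempt == retries - 1:
                raise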


def find_items(soup):
    # Collect the unique detail-page URLs from this results page.
    # A set deduplicates repeated links automatically.
    details = set()
    links = soup.findAll('a', href=re.compile(r'/browse/detail\.cfm\?imgnum=[0-9]+'))
    for link in links:
        details.add(common_head + link.attrs['href'])
    return details


def find_next(soup):
    # All pagination links share the same href pattern, so soup.find() would
    # return whichever comes first (often "previous") and crash on the last
    # page when nothing matches. Scan all candidates for the "next" link.
    links = soup.findAll('a', href=re.compile(r'/search/action\.cfm\?q=[A-Za-z0-9]+&start=[0-9]+'))
    for link in links:
        if 'next' in link.text:
            return common_head + link.attrs['href']
    return None


def save_labels(soup, writer):
    # Each result link holds the image id in a <strong> tag; the label text
    # sits two sibling nodes after the link, so strip its leading whitespace.
    links = soup.findAll('a', href=re.compile(r'/browse/detail\.cfm\?imgnum=[0-9]+'))
    for link in links:
        writer.writerow((link.find("strong").text, link.next_sibling.next_sibling.lstrip()))
    print("save_labels end")


def download_item(url):
    # Open the detail page, pull the full-size image URL from its <meta> tag,
    # and save the file under the trailing 11 characters of that URL.
    soup = open_page(url)
    image = soup.find('meta', content=re.compile(r'https://bugwoodcloud\.org/images/'))
    download_url = image.attrs['content']
    image_id = download_url[-11:]
    # urlretrieve ignores the headers dict, so install an opener that
    # carries the same User-Agent.
    opener = urllib.request.build_opener()
    opener.addheaders = open_headers
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(download_url, "/Users/yuchu/Documents/Python/web_scraping/images/%s" % image_id)
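# Optional sketch (an addition, not in the original): a resumable variant
# that skips images already on disk, so an interrupted crawl can be rerun.
# It assumes download_item() has already installed the opener above.
import os  # would normally sit with the other imports at the top

def download_item_if_missing(url):
    soup = open_page(url)
    image = soup.find('meta', content=re.compile(r'https://bugwoodcloud\.org/images/'))
    download_url = image.attrs['content']
    target = "/Users/yuchu/Documents/Python/web_scraping/images/%s" % download_url[-11:]
    if not os.path.exists(target):
        urllib.request.urlretrieve(download_url, target)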


def search_page(url, writer):
    # Process one results page: record labels, download every image,
    # then recurse into the next page if there is one.
    print("search_page start")
    soup = open_page(url)
    save_labels(soup, writer)
    for item in find_items(soup):
        download_item(item)
    next_page = find_next(soup)
    if next_page:
        search_page(next_page, writer)
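# An iterative alternative (a sketch, not the original author's version):
# recursing once per results page would eventually exceed Python's default
# recursion limit (~1000 frames) on very large queries; a loop avoids that.
def search_pages_iteratively(url, writer):
    while url:
        soup = open_page(url)
        save_labels(soup, writer)
        for item in find_items(soup):
            download_item(item)
        url = find_next(soup)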


# newline='' is the documented way to open a file for the csv module;
# it stops the writer from inserting blank rows on Windows.
with open('/Users/yuchu/Documents/Python/web_scraping/labels/label.csv', 'w', newline='') as csvFile:
    csv_writer = csv.writer(csvFile)
    csv_writer.writerow(('id', 'label'))
    search_page(main_url, csv_writer)
    print("search_page end")
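To sanity-check the output afterwards, the CSV can be read back with the same csv module (a usage sketch, not part of the original script):

with open('/Users/yuchu/Documents/Python/web_scraping/labels/label.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip the 'id,label' header row
    for image_id, label in reader:
        print(image_id, label)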

Reposted from: https://www.jianshu.com/p/8fc5f29416b6