
Images Crawler

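This script crawls the IPM Images search results for the keyword "smut": it collects the detail-page link for every hit, writes each image's id and label to a CSV file, and downloads the full-size images to a local folder, following the "next" pagination link until the results run out.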
import urllib.request
from bs4 import BeautifulSoup
import ssl
import re
import csv

main_url = 'https://www.ipmimages.org/search/action.cfm?q=smut'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
# The same User-Agent as a list of tuples, the format an opener's
# addheaders attribute expects (used for urlretrieve in download_item).
open_headers = [('User-Agent',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36')]
common_head = 'https://www.ipmimages.org'
# Disable HTTPS certificate verification globally. This is insecure, but
# sidesteps certificate errors urllib sometimes raises on macOS installs.
ssl._create_default_https_context = ssl._create_unverified_context


def open_page(url):
    # Fetch a page with a browser-like User-Agent and return parsed soup.
    req = urllib.request.Request(url=url, headers=headers)
    html = urllib.request.urlopen(req).read()
    return BeautifulSoup(html, features='html.parser')
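# A variant sketch (an addition, not in the original): the same fetch with a
# timeout and a simple retry loop, since long crawls tend to hit transient
# network errors. The function name and parameters are illustrative.
def open_page_with_retry(url, retries=3, timeout=30):
    for attempt in range(retries):
        try:
            req = urllib.request.Request(url=url, headers=headers)
            html = urllib.request.urlopen(req, timeout=timeout).read()
            return BeautifulSoup(html, features='html.parser')
        except OSError:  # urllib.error.URLError subclasses OSError
            if attempt == retries - 1:
                raise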


def find_items(soup):
    # Collect the unique detail-page URLs from this results page.
    # A set deduplicates repeated links automatically.
    details = set()
    links = soup.findAll('a', href=re.compile(r'/browse/detail\.cfm\?imgnum=[0-9]+'))
    for link in links:
        details.add(common_head + link.attrs['href'])
    return details


def find_next(soup):
    # All pagination links share the same href pattern, so soup.find() would
    # return whichever comes first (often "previous") and crash on the last
    # page when nothing matches. Scan all candidates for the "next" link.
    links = soup.findAll('a', href=re.compile(r'/search/action\.cfm\?q=[A-Za-z0-9]+&start=[0-9]+'))
    for link in links:
        if 'next' in link.text:
            return common_head + link.attrs['href']
    return None


def save_labels(soup, writer):
    # Each result link holds the image id in a <strong> tag; the label text
    # sits two sibling nodes after the link, so strip its leading whitespace.
    links = soup.findAll('a', href=re.compile(r'/browse/detail\.cfm\?imgnum=[0-9]+'))
    for link in links:
        writer.writerow((link.find("strong").text, link.next_sibling.next_sibling.lstrip()))
    print("save_labels end")


def download_item(url):
    # Open the detail page, pull the full-size image URL from its <meta> tag,
    # and save the file under the trailing 11 characters of that URL.
    soup = open_page(url)
    image = soup.find('meta', content=re.compile(r'https://bugwoodcloud\.org/images/'))
    download_url = image.attrs['content']
    image_id = download_url[-11:]
    # urlretrieve ignores the headers dict, so install an opener that
    # carries the same User-Agent.
    opener = urllib.request.build_opener()
    opener.addheaders = open_headers
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(download_url, "/Users/yuchu/Documents/Python/web_scraping/images/%s" % image_id)
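# Optional sketch (an addition, not in the original): a resumable variant
# that skips images already on disk, so an interrupted crawl can be rerun.
# It assumes download_item() has already installed the opener above.
import os  # would normally sit with the other imports at the top

def download_item_if_missing(url):
    soup = open_page(url)
    image = soup.find('meta', content=re.compile(r'https://bugwoodcloud\.org/images/'))
    download_url = image.attrs['content']
    target = "/Users/yuchu/Documents/Python/web_scraping/images/%s" % download_url[-11:]
    if not os.path.exists(target):
        urllib.request.urlretrieve(download_url, target)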


def search_page(url, writer):
    # Process one results page: record labels, download every image,
    # then recurse into the next page if there is one.
    print("search_page start")
    soup = open_page(url)
    save_labels(soup, writer)
    for item in find_items(soup):
        download_item(item)
    next_page = find_next(soup)
    if next_page:
        search_page(next_page, writer)
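# An iterative alternative (a sketch, not the original author's version):
# recursing once per results page would eventually exceed Python's default
# recursion limit (~1000 frames) on very large queries; a loop avoids that.
def search_pages_iteratively(url, writer):
    while url:
        soup = open_page(url)
        save_labels(soup, writer)
        for item in find_items(soup):
            download_item(item)
        url = find_next(soup)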


# newline='' is the documented way to open a file for the csv module;
# it stops the writer from inserting blank rows on Windows.
with open('/Users/yuchu/Documents/Python/web_scraping/labels/label.csv', 'w', newline='') as csvFile:
    csv_writer = csv.writer(csvFile)
    csv_writer.writerow(('id', 'label'))
    search_page(main_url, csv_writer)
    print("search_page end")
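To sanity-check the output afterwards, the CSV can be read back with the same csv module (a usage sketch, not part of the original script):

with open('/Users/yuchu/Documents/Python/web_scraping/labels/label.csv', newline='') as f:
    reader = csv.reader(f)
    next(reader)  # skip the 'id,label' header row
    for image_id, label in reader:
        print(image_id, label)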

Reposted from: https://www.jianshu.com/p/8fc5f29416b6