Images Crawler
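A small crawler for the IPM Images site: it walks the search results for the query "smut", writes each image's id and label to a CSV file, and downloads the full-size images. The search term and the local paths below come from the original script and would need to be adapted to your own environment.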
import urllib.request
from bs4 import BeautifulSoup
import ssl
import re
import csv
main_url = 'https://www.ipmimages.org/search/action.cfm?q=smut'
# The same browser-like User-Agent, once as a dict for Request() and once as a
# list of tuples for build_opener().
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
open_headers = [('User-Agent',
                 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36')]
common_head = 'https://www.ipmimages.org'
# Skip HTTPS certificate verification (works around CERTIFICATE_VERIFY_FAILED errors).
ssl._create_default_https_context = ssl._create_unverified_context
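# Fetch a URL with the browser-like headers and return the page as a parsed BeautifulSoup tree.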
def open_page(url):
    req = urllib.request.Request(url=url, headers=headers)
    html = urllib.request.urlopen(req).read()
    return BeautifulSoup(html, features='html.parser')
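# Collect the absolute URLs of every image detail page ('/browse/detail.cfm?imgnum=...') on a results page.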
def find_items(soup):
    details = set()
    links = soup.find_all('a', href=re.compile(r'/browse/detail\.cfm\?imgnum=[0-9]+'))
    for link in links:
        details.add(common_head + link.attrs['href'])
    return details
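# Look through the pager links ('/search/action.cfm?q=...&start=...') and return the absolute URL
# of the 'next' page, or None when there is no further page.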
def find_next(soup):
    pager_links = soup.find_all('a', href=re.compile(r'/search/action\.cfm\?q=[A-Za-z0-9]+&start=[0-9]+'))
    for link in pager_links:
        if 'next' in link.text:
            return common_head + link.attrs['href']
    return None
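# Write one CSV row per result link: the image id shown in <strong> and the label text that follows it.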
def save_labels(soup, writer):
    links = soup.find_all('a', href=re.compile(r'/browse/detail\.cfm\?imgnum=[0-9]+'))
    for link in links:
        image_id = link.find('strong')
        if image_id is None:
            continue  # skip result links that carry no <strong> image id
        writer.writerow((image_id.text, link.next_sibling.next_sibling.lstrip()))
    print("save_labels end")
def download_item(url):
    soup = open_page(url)
    image = soup.find('meta', content=re.compile(r'https://bugwoodcloud\.org/images/'))
    if image is None:
        return
    download_url = image.attrs['content']
    # Use the filename part of the URL (e.g. 1234567.jpg) as the local name.
    image_id = download_url.rsplit('/', 1)[-1]
    # urlretrieve() goes through the installed opener, so give it the same User-Agent.
    opener = urllib.request.build_opener()
    opener.addheaders = open_headers
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(download_url, "/Users/yuchu/Documents/Python/web_scraping/images/%s" % image_id)
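# Process one results page: record the labels, download every image it links to, then recurse into
# the 'next' page until the pager runs out. Very deep result sets could hit Python's recursion limit.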
def search_page(url, writer):
    print("search_page start")
    soup = open_page(url)
    save_labels(soup, writer)
    items = find_items(soup)
    for item in items:
        download_item(item)
    next_page = find_next(soup)
    if next_page:
        search_page(next_page, writer)
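# Driver: open the label CSV, write the header row, and start crawling from the first results page.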
csvFile = open('/Users/yuchu/Documents/Python/web_scraping/labels/label.csv', 'w', newline='')
try:
    csv_writer = csv.writer(csvFile)
    csv_writer.writerow(('id', 'label'))
    search_page(main_url, csv_writer)
    print("search_page end")
finally:
    csvFile.close()
Reposted from: https://www.jianshu.com/p/8fc5f29416b6