python识别批量网站中的图片

程序员文章站 2022-07-02 17:34:26

需要实现的功能：给出一个网站列表，抓出这些网页上的图片。实现方式：下载网页源码，在源码中识别包含图片url的标签，如&lt;img&gt;、&lt;div&gt;、&lt;li&gt;。由于对html了解较少，哪些标签可能含有图片是从查看多个网站的源码中总结出来的。调用的库：Selenium（加载Chrome驱动）--获取执行JS后的源码 ......

需要实现的功能：给出一个网站列表，抓出这些网页上的图片。

实现方式：下载网页源码，在源码中识别包含图片url的标签，如<img>,<div>,<li>。由于对html了解较少，哪些标签可能含有图片是从查看多个网站的源码中总结出来的。

调用的库：selenium（加载chrome驱动）--获取执行js后的源码。

　　threading--实现多线程

代码：

from urllib.parse import urljoin,urlparse
import os
import threading
from time import ctime
from selenium import webdriver
import re

class mythread(threading.Thread):
    """Thread that runs ``func(*args)`` and keeps the return value.

    Args:
        func: Callable executed in the new thread.
        args: Positional arguments passed to ``func``.
        name: Thread name assigned after the base class is initialised.
    """

    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args
        self.is_end = False
        # Pre-set so getresult() is safe before run() finishes or if func
        # raises; without this the attribute would not exist yet.
        self.res = None

    def getresult(self):
        """Return the value produced by ``func``, or None if none was stored."""
        return self.res

    def run(self):
        """Execute ``func(*args)`` and store its return value."""
        self.res = self.func(*self.args)

def filter_in_tag(page_file, tag):
    """Collect http/https URLs that appear inside opening ``<tag ...>`` elements.

    The file is scanned line by line, so an opening tag that spans several
    lines is not matched.

    Args:
        page_file: Path of a UTF-8 encoded HTML file.
        tag: Tag name without angle brackets, e.g. ``'div'``.

    Returns:
        A list whose first item is a ``----tag----`` marker row, followed by
        every URL found, in document order.
    """
    # One pattern for both detection and extraction. The previous extraction
    # pattern ('http?://') could not match 'https://' at all, and it did not
    # stop at ')' so CSS ``url(...)`` values kept a trailing parenthesis.
    url_pattern = re.compile(r'https?://[^\'")]+')
    # Opening tags that carry at least one attribute.
    tag_pattern = re.compile('<%s [^>]*>' % (tag))

    url_in_tag = ['------------------%s--------------------' % (tag)]
    with open(page_file, 'r', encoding='utf-8') as page:
        for line in page:
            for tag_str in tag_pattern.findall(line):
                url_in_tag.extend(url_pattern.findall(tag_str))
    return url_in_tag
def process(m_url):
    """Load ``m_url`` in headless Chrome and collect image URLs and links.

    Relies on module-level names defined elsewhere in this file:
    ``filter_in_tag``, ``seen_links`` (list, appended to here) and ``err_log``
    (writable text file).

    Args:
        m_url: Absolute URL of the page to open.

    Returns:
        ``[]`` if the page could not be loaded (the error is printed and
        written to ``err_log``). Otherwise a tuple
        ``(imgs_uniq, big_files, hrefs)``:

        * ``imgs_uniq`` - marker rows plus de-duplicated URLs taken from
          ``<img src>`` and from ``<li>``/``<div>`` opening tags.
        * ``big_files`` - ``<a href>`` targets whose path extension is not
          ``.htm``/``.html``/``.shtml``/empty.
        * ``hrefs`` - remaining ``<a href>`` targets that contain the page's
          two-label domain and were not already in ``seen_links``.
    """
    import tempfile  # local import: only needed for the per-call scratch file

    big_files, hrefs = [], []

    # Run Chrome without a visible window.
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    # NOTE(review): positional driver path, ``chrome_options=`` and
    # ``find_elements_by_tag_name`` are Selenium 3 style calls kept from the
    # original; confirm against the installed Selenium version.
    driver = webdriver.Chrome(r'c:\program files (x86)\google\chrome\application\chromedriver.exe',
                              chrome_options=chrome_options)
    try:
        driver.set_page_load_timeout(30)
        try:
            driver.get(m_url)
        except Exception as e:
            err_info = 'url open error: %s\n, reason: %s\n' % (m_url, e)
            print(err_info)
            err_log.write(err_info)
            return []

        imgs = ['------------------<img src=>-----------------']
        imgs.extend(x.get_attribute('src')
                    for x in driver.find_elements_by_tag_name("img"))

        page_source = driver.page_source

        # Anchor targets; read each attribute once and skip empty ones.
        links = []
        for a_link in driver.find_elements_by_tag_name('a'):
            href = a_link.get_attribute('href')
            if href:
                links.append(href)
    finally:
        # Always release the browser, including on the load-error path.
        driver.quit()

    # filter_in_tag() works on a file path. Use a unique scratch file per call
    # so concurrent worker threads do not overwrite each other's page source.
    fd, tmp_path = tempfile.mkstemp(prefix='tmp_page_source_', suffix='.html')
    try:
        with os.fdopen(fd, 'w', encoding='utf-8') as tmp_f:
            tmp_f.write(page_source)
        for tag in ('li', 'div'):
            imgs.extend(filter_in_tag(tmp_path, tag))
    finally:
        os.remove(tmp_path)

    # Order-preserving de-duplication; also drops empty/None entries.
    imgs_uniq = list(dict.fromkeys(url for url in imgs if url))

    # Domain of the page: last two labels of the host, without credentials or
    # port. Loop-invariant, so computed once.
    host = urlparse(m_url).netloc.split('@')[-1].split(':')[0]
    dom = '.'.join(host.split('.')[-2:])

    for link in links:
        if link.startswith('mailto:'):
            continue
        if not link.startswith('http'):
            link = urljoin(m_url, link)
        f_name = urlparse(link).path.split('/')[-1]
        f_type = os.path.splitext(f_name)[1].lower()
        # '.shtml' previously lacked its leading dot, so such pages were
        # misfiled as big files.
        if f_type not in ('.htm', '.html', '.shtml', ''):
            big_files.append(link)
            continue
        # Skip links already queued and links outside the page's domain.
        if link in seen_links or dom not in link:
            continue
        hrefs.append(link)
        seen_links.append(link)
    return imgs_uniq, big_files, hrefs




##对process处理结果进行分析，得出如下统计数据：
##图片：100，http协议占比：80%，http协议下各种后缀的数量：jpg-50，gif-30
##大文件：10，http协议占比：100%，http协议下各种后缀的数量：pdf-10

def ret_analyse(url_list):
    """Summarise a URL list: size, share of plain-http URLs, extension counts.

    Entries starting with ``'-----------'`` are marker rows, not URLs. They
    are still counted in the returned length (callers subtract them), but are
    excluded from the percentage and the extension counts.

    Args:
        url_list: List of strings (URLs and optional marker rows).

    Returns:
        ``(to_len, http_perc, exts_dict)`` where ``to_len`` is
        ``len(url_list)``, ``http_perc`` is a string such as ``'80.0%'``
        (``'0'`` for an empty list), and ``exts_dict`` maps each path
        extension (e.g. ``'.jpg'``, or ``''``) to its count.
    """
    to_len = len(url_list)  # includes marker rows
    real_urls = [url for url in url_list if not url.startswith('-----------')]
    http_count = sum(1 for url in real_urls if url.startswith("http://"))

    # Divide by the number of real URLs; dividing by to_len understated the
    # ratio whenever marker rows were present.
    if real_urls:
        http_perc = '%.1f%%' % (http_count / len(real_urls) * 100)
    elif to_len > 0:
        http_perc = '0.0%'
    else:
        http_perc = '0'

    exts_dict = {}
    for url in real_urls:
        f_name = urlparse(url).path.split('/')[-1]
        f_type = os.path.splitext(f_name)[1]
        exts_dict[f_type] = exts_dict.get(f_type, 0) + 1
    return to_len, http_perc, exts_dict

##对一组url调用process函数处理，并输出结果到文本
def group_proc(url_f , urls,is_analyse) :
    """Run ``process`` on each URL and write the results to the output files.

    Relies on module-level names defined elsewhere in this file: ``process``,
    ``ret_analyse``, ``imgs_f`` and ``bigfiles_f``.

    Args:
        url_f: Writable text file receiving the per-page report.
        urls: Iterable of page URLs to process.
        is_analyse: When true, also write the statistics lines for each page.

    Returns:
        List of the follow-up page links returned by ``process`` for all URLs.
    """
    links = []  # page links (excluding big files) gathered from all URLs

    def wlog(*lines):
        """Write each line to ``url_f``; report a failed write and carry on."""
        for line in lines:
            try:
                url_f.write(line + '\n')
            except Exception as e:
                print('write eror,line:%s, err: %s'%(line,e))

    for url in urls:
        proc_ret = process(url)
        if not proc_ret:
            # process() returns [] when the page could not be loaded.
            continue
        img_list, bigfile_list, link_list = proc_ret
        wlog('*'*40, 'from: ', url)  # separator row + source row
        if is_analyse:
            # Analyse once instead of three times for the same list.
            img_total, img_perc, img_exts = ret_analyse(img_list)
            # The image list carries 3 marker rows that are not URLs.
            img_output = '图片：%d，http协议占比：%s，http协议下各种后缀的数量：%s' % (img_total - 3, img_perc, img_exts)
            big_output = '大文件：%d，http协议占比：%s，http协议下各种后缀的数量：%s' % (ret_analyse(bigfile_list))
            wlog(img_output, big_output)
        img_text = '\n'.join(img_list)
        bigfile_text = '\n'.join(bigfile_list)
        wlog('imgs:', img_text, 'bigfiles: ', bigfile_text, '*'*40)

        imgs_f.write(img_text + '\n')
        if bigfile_text:
            bigfiles_f.write(bigfile_text + '\n')
        if link_list:
            links.extend(link_list)
    return links


def main(depth):
    """Crawl the sites listed in ``urls.txt`` layer by layer.

    Each layer's links are split into up to five groups, one ``mythread`` per
    group running ``group_proc``. The links the workers return become the
    input of the next layer. The report for layer N goes to ``layerN.txt``.

    Args:
        depth: Maximum number of layers to crawl; stops early if a layer
            yields no links.
    """
    with open('urls.txt', 'r') as u_file:
        seeds = [line.strip() for line in u_file]
    # Add a scheme only where it is missing. The previous filter discarded
    # every entry that already started with 'http'. Blank lines are skipped.
    links = [link if link.startswith('http') else 'http://' + link
             for link in seeds if link]

    for layer in range(depth):
        if not links:
            break
        print('第 %d 层开始爬取...'%(layer))
        is_analyse = (layer == 0)  # statistics are written for the first layer only

        # Split the links into 5 groups; with fewer than 5 links, one per group.
        avg = len(links) // 5
        if avg == 0:
            links_grp = [[link] for link in links]
        else:
            links_grp = [links[:avg], links[avg:avg*2], links[avg*2:avg*3],
                         links[avg*3:avg*4], links[avg*4:]]

        next_links = []
        # Opened only once there is work to do, and closed even if a worker
        # raises. UTF-8 matches the other output files in this script.
        with open('layer' + str(layer) + '.txt', 'w', encoding='utf-8') as url_f:
            threads = [mythread(group_proc, (url_f, grp, is_analyse), group_proc.__name__)
                       for grp in links_grp]
            # Start all workers.
            for idx, worker in enumerate(threads):
                print('线程%d开始运行,时间：%s'%(idx,ctime()))
                worker.daemon = True
                worker.start()
            # Wait for the workers and gather the links for the next layer.
            for idx, worker in enumerate(threads):
                worker.join()
                print('线程%d运行结束,时间：%s' % (idx, ctime()))
                next_links.extend(worker.getresult() or [])
        links = next_links


if __name__=='__main__':
    # Module-level state shared with process() and group_proc().
    seen_links = []
    # Context managers close all three output files even if main() or the
    # depth conversion raises.
    with open('图片.txt', 'w',encoding='utf-8') as imgs_f, \
            open('大文件.txt', 'w',encoding='utf-8') as bigfiles_f, \
            open('err_log.txt', 'w',encoding='utf-8') as err_log:
        depth=int(input('请输入爬取深度：'))
        main(depth)
    # Keep the console window open until the user confirms.
    input('按任意键退出...')

view code

上一篇： php接口编程

下一篇： WebStorm 2018版本破解方法

python识别批量网站中的图片

利用Python对文件夹下图片数据进行批量改名的代码实例

Python中利用Scipy包的SIFT方法进行图片识别的实例教程

python 对txt中每行内容进行批量替换的方法

在Python中调用Ping命令,批量IP的方法

火狐浏览器插件BatchDownload批量下载网页中的图片

网站优化中的图片优化教程优化图片须知的8个小技巧

python批量下载图片的三种方法

对python cv2批量灰度图片并保存的实例讲解

使用Python中的cookielib模拟登录网站

打好"感情牌" 网站设计中如何使用调动用户情绪的好图片？

python识别批量网站中的图片

利用Python对文件夹下图片数据进行批量改名的代码实例

Python中利用Scipy包的SIFT方法进行图片识别的实例教程

python 对txt中每行内容进行批量替换的方法

在Python中调用Ping命令,批量IP的方法

火狐浏览器插件BatchDownload批量下载网页中的图片

网站优化中的图片优化教程 优化图片须知的8个小技巧

python批量下载图片的三种方法

对python cv2批量灰度图片并保存的实例讲解

使用Python中的cookielib模拟登录网站

打好"感情牌" 网站设计中如何使用调动用户情绪的好图片？

网站优化中的图片优化教程优化图片须知的8个小技巧