Getting started with Python web scraping: downloading girls' avatar images (no anti-scraping measures needed)


Environment: Python 3.8
IDE: PyCharm
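The script only uses two third-party packages, requests and lxml (installable with pip install requests lxml). As a quick sanity check that both are importable in the active interpreter, something like the following can be run first; this is just a convenience snippet, not part of the original script:

# Sanity check (not part of the original script): confirm the two
# third-party dependencies are importable and print their versions.
import requests
import lxml.etree

print('requests', requests.__version__)
print('lxml', lxml.etree.LXML_VERSION)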

# coding: utf-8
import requests
from lxml import etree
import re
import uuid
import time
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/63.0.3239.132 Safari/537.36'}

class Spider:
    def imgrun(self, url, count):
        # Directory where the downloaded images are saved
        imgpath = r"A:\img"
        num = 1
        self.url = url
        self.count = count
        print('url========' + url)
        res = requests.get(url=url, headers=header)
        # Decode the response as UTF-8
        res.encoding = 'utf8'
        # Print the HTTP status code returned by the site
        print(res.status_code)
        # Parse the response body into an lxml Element
        html = etree.HTML(res.content)
        # Select the image URLs with XPath
        items = html.xpath('//*[@id="main"]/div[3]/div[1]/div[2]/div/a[1]/img/@src')
        for i in items:
            imgurl = 'https:' + i
            print('Run ' + str(count) + ', image ' + str(num))
            # Optionally strip the !400x400 suffix to request the full-size image
            # imgurl = re.sub(r'!400x400', "", imgurl)
            print(imgurl)
            result = requests.get(imgurl)
            with open(imgpath + '\\' + str(uuid.uuid1()) + '.jpeg', 'wb') as f:
                f.write(result.content)
                time.sleep(1)
            num += 1
            print('download ok')

    def imgmain(self):
        # Iterate over the listing pages; index_{n}.html is page n of the category
        for i in range(103, 110):
            print("Starting run " + str(i - 1))
            url = "https://www.woyaogexing.com/touxiang/nv/index_{}.html".format(i)
            self.imgrun(url, i - 1)
            # time.sleep(1)


if __name__ == '__main__':
    sp = Spider()
    sp.imgmain()
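The download step above assumes that A:\img already exists and that every image request succeeds. For reference, here is a minimal sketch of a more defensive per-image helper; the function name save_image, the default directory, and the timeout value are illustrative choices, not part of the original code:

# A more defensive variant of the per-image download step (sketch only).
import os
import uuid

import requests


def save_image(imgurl, imgpath=r"A:\img", headers=None, timeout=10):
    """Download one image and save it under a random file name; return the path."""
    os.makedirs(imgpath, exist_ok=True)             # create the folder if it is missing
    resp = requests.get(imgurl, headers=headers, timeout=timeout)
    resp.raise_for_status()                         # raise on 4xx/5xx instead of saving an error page
    filename = os.path.join(imgpath, str(uuid.uuid1()) + '.jpeg')
    with open(filename, 'wb') as f:
        f.write(resp.content)
    return filename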