欢迎您访问程序员文章站!本站旨在为大家分享程序员计算机编程知识。
您现在的位置是: 首页

实战:利用requests和bs4,爬取快代理IP并保存

程序员文章站 2022-05-04 16:53:11
...

直接上代码

import requests
from bs4 import BeautifulSoup
import time

# Desktop Chrome User-Agent so the site serves the normal HTML listing
# instead of rejecting the default python-requests UA.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}

def get_infos(url):
    """Fetch one free-proxy listing page and return its table rows.

    Each returned element is the list of stripped strings from one <tr>
    of the proxy table (IP, port, anonymity, type, location, speed,
    last-verified time -- seven columns, per the header written in main).

    Raises requests.HTTPError on a non-2xx response and ValueError when
    the expected table is missing (e.g. the site served a block page).
    """
    # timeout so a stalled connection cannot hang the whole scrape
    resp = requests.get(url, headers=headers, timeout=10)
    # fail loudly instead of silently parsing an error/captcha page
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')
    tbody = soup.find('tbody')
    if tbody is None:
        # anti-bot page or site layout change: no proxy table present
        raise ValueError('proxy table not found on page: {}'.format(url))
    return [list(tr.stripped_strings) for tr in tbody.find_all('tr')]

def save_data(infos, f):
    """Write each scraped row to *f* as one comma-separated line.

    Exactly the first seven fields of every row are emitted; a shorter
    row raises IndexError, matching direct positional indexing.
    """
    for row in infos:
        fields = [row[i] for i in range(7)]
        f.write(','.join(fields) + '\n')

def main():
    """Scrape pages 1-10 of the kuaidaili free-proxy list into kuaidaili.csv.

    Bug fix: the original re-opened the file and wrote the header row
    inside the page loop, so the CSV ended up with one header line per
    page (10 total) interleaved with the data. The file is now opened
    once and the header written a single time per run.
    """
    base_url = "https://www.kuaidaili.com/free/inha/{}/"
    # NOTE(review): append mode is kept from the original, so repeated runs
    # still accumulate into the same file -- switch to 'w' for a clean CSV.
    with open('kuaidaili.csv', 'a', encoding='utf-8-sig') as f:
        f.write('{},{},{},{},{},{},{}\n'.format('IP', 'PORT', '匿名度', '类型', '位置', '响应速度', '最后验证时间'))
        for i_page in range(1, 11):
            infos = get_infos(base_url.format(i_page))
            save_data(infos, f)
            time.sleep(5)  # throttle between pages to avoid being blocked



if __name__ == "__main__":
    main()