实战:利用requests和bs4,爬取快代理IP并保存
程序员文章站
2022-05-04 16:53:11
...
直接上代码
import requests
from bs4 import BeautifulSoup
import time
# Browser-like User-Agent so the site serves the normal desktop page
# rather than rejecting the default python-requests identifier.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
def get_infos(url):
    """Fetch one kuaidaili listing page and return its proxy rows.

    Parameters
    ----------
    url : str
        URL of one free-proxy listing page.

    Returns
    -------
    list[list[str]]
        One inner list of stripped cell strings per table row; an empty
        list when the page has no ``<tbody>`` (layout change / blocked).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    resp = requests.get(url, headers=headers)
    # Fail fast on HTTP errors instead of silently parsing an error page.
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'lxml')
    tbody = soup.find('tbody')
    if tbody is None:
        # Original code would raise AttributeError here; returning an
        # empty page lets the caller continue with the remaining pages.
        return []
    infos = []
    for tr in tbody.find_all('tr'):
        infos.append(list(tr.stripped_strings))
    return infos
def save_data(infos, f):
    """Append each proxy row to *f* as one 7-column comma-separated line.

    Parameters
    ----------
    infos : iterable of sequences of str
        Rows as produced by ``get_infos``.
    f : writable text file object
        Destination opened by the caller (not closed here).
    """
    for info in infos:
        # Pad/truncate to exactly 7 columns so a malformed row cannot
        # raise IndexError (the original assumed len(info) >= 7).
        row = (list(info) + [''] * 7)[:7]
        f.write(','.join(row) + '\n')
def main():
    """Crawl pages 1-10 of kuaidaili's free-proxy list into kuaidaili.csv.

    Opens the CSV once, writes the header row a single time, then appends
    every page's rows. The original re-opened the file and re-wrote the
    header inside the page loop, producing 10 duplicate header rows
    interleaved with the data.
    """
    base_url = "https://www.kuaidaili.com/free/inha/{}/"
    # utf-8-sig adds a BOM so Excel detects the encoding of the Chinese
    # column names; 'a' mode is kept from the original (re-runs append).
    with open('kuaidaili.csv', 'a', encoding='utf-8-sig') as f:
        f.write('{},{},{},{},{},{},{}\n'.format('IP', 'PORT', '匿名度', '类型', '位置', '响应速度', '最后验证时间'))
        for i_page in range(1, 11):
            url = base_url.format(i_page)
            save_data(get_infos(url), f)
            # Be polite to the site: throttle between page requests.
            time.sleep(5)

if __name__ == "__main__":
    main()