import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor
def task(url):
    """Download one listing page and print the title and link of each entry.

    The page URL is printed first, then the full text of the downloaded
    page, then one ``title target_url`` line per entry found inside the
    ``div`` whose id is ``content-list``.

    Args:
        url: Address of the page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    print(url)
    r1 = requests.get(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36'
        },
        # Without a timeout a stalled server would block this worker forever.
        timeout=10,
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    r1.raise_for_status()

    # Show the text of the downloaded page.
    soup = BeautifulSoup(r1.text, 'html.parser')
    print(soup.text)

    content_list = soup.find('div', attrs={'id': 'content-list'})
    if content_list is None:
        # find() returns None when the container is absent; nothing to list.
        return

    for item in content_list.find_all('div', attrs={'class': 'item'}):
        # Look the anchor up once and skip entries that have none.
        link = item.find('a')
        if link is None:
            continue
        title = link.text.strip()
        target_url = link.get('href')
        print(title, target_url)
def run():
    """Fetch listing pages 1 through 49 concurrently on five worker threads.

    Blocks until every submitted page has been processed. A failure on one
    page is reported on stdout and does not prevent the remaining pages
    from being fetched.
    """
    submitted = {}
    # The context manager shuts the pool down and waits for all workers.
    with ThreadPoolExecutor(5) as pool:
        for i in range(1, 50):
            page_url = 'https://dig.chouti.com/all/hot/recent/%s' % i
            submitted[pool.submit(task, page_url)] = page_url

    # Every future is finished here; surface errors that would otherwise
    # stay hidden inside the discarded Future objects.
    for future, page_url in submitted.items():
        error = future.exception()
        if error is not None:
            print('failed: %s (%r)' % (page_url, error))
if __name__ == "__main__":
    # Start crawling only when executed as a script, not when imported.
    run()
Python线程池应用场景-爬虫
程序员文章站
2022-03-05 16:54:24
...
上一篇: 线程加入