欢迎您访问程序员文章站,本站旨在为大家分享程序员计算机编程知识!
您现在的位置是: 首页

Python爬虫基础编程

程序员文章站 2022-03-08 15:49:58
...

初学Python爬虫时都会从最简单的方法开始,以下为几种常见的基础做法。

"""
    简单的循环处理
"""
import requests


url_list = [
    "https://www.baidu.com",
"https://www.cnblogs.com/"
]

for url in url_list:
    result = requests.get(url)
    print(result.text)


"""
    线程池处理
"""
import requests
from concurrent.futures import ThreadPoolExecutor


def fetch_request(url):
    result = requests.get(url)
    print(result.text)


url_list = [
    "https://www.baidu.com/",
"https://www.cnblogs.com/"
]

pool = ThreadPoolExecutor(10)

for url in url_list:
    # 线程池中获取线程,执行fetch_request方法
pool.submit(fetch_request, url)

# 关闭线程池
pool.shutdown()

"""
    线程池+回调函数
"""
import requests
from concurrent.futures import ThreadPoolExecutor


def fetch_async(url):
    response = requests.get(url)
    return response


def callback(future):
    print(future.result().text)


url_list = [
    "https://www.baidu.com/",
"https://www.cnblogs.com/"
]

pool = ThreadPoolExecutor(10)

for url in url_list:
    v = pool.submit(fetch_async, url)
    # 调用回调函数
v.add_done_callback(callback)

pool.shutdown()

"""
    进程池处理
"""
import requests
from concurrent.futures import ProcessPoolExecutor


def fetch_requst(url):
    result = requests.get(url)
    print(result.text)


url_list = [
    "https://www.baidu.com/",
"https://www.cnblogs.com/"
]

if __name__ == '__main__':

    pool = ProcessPoolExecutor(max_workers=10)

    for url in url_list:
        pool.submit(fetch_requst, url)

    pool.shutdown()

"""
    进程池+回调函数
"""
import requests
from concurrent.futures import ProcessPoolExecutor


def fetch_async(url):
    response = requests.get(url)
    return response


def callback(future):
    print(future.result().text)


url_list = [
    "https://www.baidu.com/",
"https://www.cnblogs.com/"
]


if __name__ == '__main__':
    pool = ProcessPoolExecutor(10)

    for url in url_list:
        v = pool.submit(fetch_async, url)
        v.add_done_callback(callback)

    pool.shutdown()