Python3 Web Crawler for News Data Collection (multi-process, can crawl hundreds of thousands of news articles)
import requests
from bs4 import BeautifulSoup
import datetime
from multiprocessing import Pool

# Fetch a page with requests and parse it with BeautifulSoup
def requestOver(url):
    response = requests.get(url)
    response.encoding = 'utf-8'
    # Some chinanews.com pages declare gb2312; switch encodings if so
    if "gb2312" in response.text:
        response.encoding = 'gb2312'
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup
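The gb2312 check above works by scanning the decoded body for the charset name. As a more general alternative (not part of the original script), requests can guess the charset from the raw bytes via response.apparent_encoding, which also covers gbk/gb18030 pages. A minimal sketch, with a hypothetical name to avoid shadowing the function above:

# Sketch: let requests detect the charset instead of hard-coding gb2312
def request_over_detect(url):
    response = requests.get(url)
    # apparent_encoding is inferred from the response bytes by charset detection
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')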
# Save one article's title and body text to a .txt file
def download(title, url):
    soup = requestOver(url)
    tag = soup.find('div', class_="left_zw")
    if tag is None:
        return 0
    # Strip characters that Windows forbids in file names
    for ch in ':"|/\\*<>?':
        title = title.replace(ch, '')
    filename = r'E:\code\python\spider_news\dataset\eco' + '\\' + title + '.txt'
    with open(filename, 'w', encoding='utf-8', errors='ignore') as file_object:
        file_object.write(' ')
        file_object.write(title)
        file_object.write(tag.get_text())
    print('Crawling article: ' + title + " " + url)
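The chain of replace calls (condensed into a loop above) only sanitizes the title for use as a Windows file name. A regex does the same thing in one pass; this hypothetical helper is an equivalent drop-in:

import re

def safe_filename(title):
    # Remove every Windows-forbidden file name character in one pass
    return re.sub(r'[:"|/\\*<>?]', '', title)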
# Core crawl loop: scan one scroll-news index page for finance articles
def crawlAll(url):
    soup = requestOver(url)
    for s in soup.findAll("div", class_="content_list"):
        for tag in s.findAll("li"):
            sp = tag.findAll("a")
            # Keep only items whose channel link mentions 财经 (finance)
            if "财经" in str(sp):
                title = list(sp)[1].string
                # Slice the href value out of the <a ...> tag's string form
                urlAll = "http://www.chinanews.com" + str(list(sp)[1])[9:str(list(sp)[1]).find("shtml")+5]
                try:
                    download(title, urlAll)
                except Exception:
                    print("Failed to crawl article")
if __name__ == '__main__':
    pool = Pool(4)
    url1 = "http://www.chinanews.com/scroll-news/"
    date = "2020/1113"
    url2 = "/news.shtml"
    p1 = []
    # 3650 days: roughly ten years of daily news index pages
    for i in range(3650):
        # Step the date back one day and build that day's index URL
        date1 = datetime.datetime.strptime(date, "%Y/%m%d")
        date2 = datetime.timedelta(days=-1)
        date = (date1 + date2).strftime("%Y/%m%d")
        target_url = url1 + date + url2
        p1.append(target_url)
        print(target_url)
    # Crawl the 3650 index pages across 4 worker processes
    pool.map(crawlAll, p1)
    pool.close()
    pool.join()
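With four processes hitting the same host, some throttling is courteous. One way (an assumption, not part of the original script) is to pause before each request; the delay value here is arbitrary, and the function reuses the imports at the top of the script:

import time

# Sketch: pause briefly before each request so that four concurrent
# workers do not flood chinanews.com (0.5 s is an assumed value)
def polite_get(url, delay=0.5):
    time.sleep(delay)
    response = requests.get(url, timeout=10)
    response.encoding = response.apparent_encoding
    return BeautifulSoup(response.text, 'html.parser')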
Original post: https://blog.csdn.net/weixin_44485744/article/details/111994770