China Agricultural Information Network: Scraping News Details (Source Code), by 一蓑烟雨任平生


No more chit-chat; straight to the code.

Today's unlucky target is the China Agricultural Information Network (全国农业信息网).

import requests
import pymysql
from bs4 import BeautifulSoup  # used to parse the HTML
import uuid
import time
url = "http://www.agri.cn/V20/ZX/qgxxlb_1"  # base path of the news list
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 '
                  'Safari/537.36',
    'Accept-Language': 'zh-CN,zh;q=0.8'
}
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
cur = conn.cursor()
print("Database connection established")
for i in range(0, 20):  # scrape the first 20 list pages
    if i == 0:
        resp = requests.get("http://www.agri.cn/V20/ZX/qgxxlb_1/index.htm", headers=headers)
    else:
        resp = requests.get(f"http://www.agri.cn/V20/ZX/qgxxlb_1/index_{i}.htm", headers=headers)
    page_one = BeautifulSoup(resp.content, "html.parser")
    dd = page_one.find('td', class_='bk_7').find_all('td', class_='bj_3-2')  # one cell per news item
    for ss in dd:
        # province: the link text starts with a bracketed prefix such as "[北京",
        # so take the first three characters and strip the leading bracket
        province = ss.find('a').text[0:3].strip('[')
        productId = str(uuid.uuid1())
        # absolute URL of the detail page (the href is relative and starts with "./")
        sUrl = url + ss.find('a')['href'].lstrip('.')
        # fetch and parse the second-level (detail) page
        rp = requests.get(sUrl, headers=headers)
        page_two = BeautifulSoup(rp.content, "html.parser")
        # title
        title = page_two.find('td', class_='hui_15_cu').text
        # source: the text after the "来源" label in the info cell
        source = page_two.find('td', class_='hui_12-12').text.split('来源')[1][1:]
        # publication date, sliced out of the same info cell
        timet = page_two.find('td', class_='hui_12-12').text[3:13]
        # body: concatenate the text of every <p> inside the editor div
        article = page_two.find('div', class_='TRS_Editor').find_all('p')
        content = ''
        for aa in article:
            content = content + aa.text.strip()
        # crude keyword-based categorisation; n_type stays "" if no keyword matches
        n_type = ""
        if "食品安全" in content:
            n_type = "食品安全"
        elif "农业环境" in content:
            n_type = "农业环境"
        elif "农业病虫害" in content:
            n_type = "农业病虫害"
        elif "农业耕地浪费" in content:
            n_type = "农业耕地浪费"
        elif "农产品质量安全" in content:
            n_type = "农产品质量安全"
        sql = "insert into new_paper(id,n_source,n_title,n_timet,n_type,n_url,n_content,n_province) VALUES (%s,%s,%s,%s,%s,%s,%s,%s)"
        cur.execute(sql, (productId, source, title, timet, n_type, sUrl, content, province))
    print("Page {} scraped; committing to MySQL".format(i + 1))
    conn.commit()
    time.sleep(1)  # pause a second between pages so we don't overload the server
cur.close()
conn.close()
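
The script assumes a new_paper table already exists; the post never shows its DDL. Below is a minimal sketch of a schema that would satisfy the INSERT above. The column types and lengths are my assumptions, not the author's actual table definition.

import pymysql

# Hypothetical schema matching the eight columns of the INSERT statement;
# every type and length here is an assumption.
DDL = """
CREATE TABLE IF NOT EXISTS new_paper (
    id         VARCHAR(36) PRIMARY KEY,  -- str(uuid.uuid1())
    n_source   VARCHAR(255),             -- news source
    n_title    VARCHAR(255),             -- article title
    n_timet    VARCHAR(20),              -- publication date, stored as text
    n_type     VARCHAR(50),              -- keyword-based category
    n_url      VARCHAR(500),             -- detail-page URL
    n_content  TEXT,                     -- full article body
    n_province VARCHAR(10)               -- province from the list page
) DEFAULT CHARSET = utf8
"""

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
with conn.cursor() as cur:
    cur.execute(DDL)
conn.close()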

Let's see the results:

[screenshot: scraping results]

Data in hand; time to go home and sleep.

[screenshot: the scraped data]
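
Before actually heading to bed, a quick sanity check on the table is cheap. A minimal sketch, reusing the connection settings from the script above:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', db='zhang', charset='utf8')
with conn.cursor() as cur:
    # total number of rows inserted
    cur.execute("SELECT COUNT(*) FROM new_paper")
    print("rows scraped:", cur.fetchone()[0])
    # peek at a few records
    cur.execute("SELECT n_province, n_title, n_timet FROM new_paper LIMIT 5")
    for row in cur.fetchall():
        print(row)
conn.close()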

Related tags: python, web scraping