# Python bs4抓取新浪新闻简单版
# 程序员文章站
# 2022-05-02 22:02:41
# ...
import requests
from bs4 import BeautifulSoup
import time
import pandas
# Download the Sina "China" news index page. The timeout makes requests give
# up instead of blocking forever when the server never answers.
res = requests.get("http://news.sina.com.cn/china/", timeout=10)
# Decode the body as UTF-8 rather than whatever encoding requests guesses.
res.encoding = "utf-8"
soup = BeautifulSoup(res.text, 'html.parser')
# Collects one dict per scraped article.
newsary = []
def get_article(url, timeout=10):
    """Download one article page and extract its main fields.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before requests gives up.
            Without a timeout a stalled connection would block forever.

    Returns:
        A dict containing whichever of these keys were found on the page:
        'title' (text of the first ``.main-title`` element), 'content'
        (text of the first ``.article`` element with all whitespace
        removed), 'keywords' (text of the first ``.keywords`` element) and
        'sourse' (text of the first ``.date-source`` element). Keys whose
        element is absent are left out.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched
            (connection failure, invalid URL, timeout).
    """
    res = requests.get(url, timeout=timeout)
    # Decode the body as UTF-8 rather than whatever encoding requests guesses.
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')

    dic = {}

    # select_one returns the first match or None, so each selector is
    # evaluated once instead of once for the check and once for the lookup.
    title = soup.select_one('.main-title')
    if title is not None:
        dic['title'] = title.text

    article = soup.select_one('.article')
    if article is not None:
        # split() + join drops every run of whitespace, including newlines.
        dic['content'] = ''.join(article.text.split())

    keywords = soup.select_one('.keywords')
    if keywords is not None:
        dic['keywords'] = keywords.text

    date_source = soup.select_one('.date-source')
    if date_source is not None:
        # NOTE: the key is spelled 'sourse' on purpose -- it is the name
        # consumers of the returned dict already see.
        dic['sourse'] = date_source.text

    return dic
# Scrape every headline listed on the index page, then export the results.
count = 0
for link in soup.select('.news-item'):
    # Look the headline anchor up once; select_one gives None when absent.
    anchor = link.select_one('h2 a')
    href = anchor.get('href') if anchor is not None else None
    if not href:
        # No headline link, or a link without a usable href: nothing to
        # fetch, so skip the entry instead of crashing the whole run.
        continue
    newsary.append(get_article(href))
    count += 1
    print("已获取第" + str(count) + "条新闻")
    # NOTE: requests are issued back to back; a one-second pause used to be
    # sketched here but was never enabled.

# One row per article; a field missing from an article's dict becomes an
# empty cell in that row.
df = pandas.DataFrame(newsary)
df.to_excel("xinlang_news.xlsx")