欢迎您访问程序员文章站，本站旨在为大家提供、分享程序员计算机编程知识！
您现在的位置是: 首页

Python 爬取百度百科

程序员文章站 2024-02-19 17:58:28
...
import re
from urllib import request
from urllib.parse import quote
from bs4 import BeautifulSoup as sp

# HTTP request headers attached to every request built in look_up():
# a desktop-browser User-Agent string plus an Accept header.
header = {
    'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
    'Accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
}

# Matches a literal "[" followed by zero or more digits and a closing "]",
# e.g. "[1]", "[23]" or "[]"; look_up() removes these from the printed text.
bracket = re.compile(
    r'\[\d*]'
)


def look_up(entry):
    """Fetch the Baidu Baike page for *entry* and print its paragraphs.

    The entry title is percent-encoded and appended to the
    ``https://baike.baidu.com/item/`` base URL. The page is requested with
    the module-level ``header`` dict, parsed with BeautifulSoup, and the
    text of every ``<div class="para">`` element is printed on its own
    line after removing newlines, carriage returns and bracketed numeric
    markers such as ``[1]`` (as matched by the module-level ``bracket``
    pattern).

    Args:
        entry: Title of the encyclopedia entry to look up.

    Returns:
        None. The cleaned paragraphs are written to standard output.

    Raises:
        urllib.error.URLError: If the request fails (``HTTPError`` for
            non-success HTTP status codes is a subclass).
        OSError: If the connection or read times out.
    """
    url = "https://baike.baidu.com/item/" + quote(entry)
    req = request.Request(url, headers=header)

    # Use the response as a context manager so the underlying socket is
    # closed deterministically, and bound the wait so a stalled server
    # cannot hang the caller forever.
    with request.urlopen(req, timeout=10) as resp:
        html = resp.read()

    soup = sp(html, "html.parser")

    # find_all is the supported spelling; findAll is a deprecated alias.
    for para in soup.find_all('div', {'class': 'para'}):
        text = para.get_text().replace('\n', '').replace('\r', '')
        print(bracket.sub('', text))
# Example invocation: runs at module level (on execution and on import),
# fetching the entry named below and printing its paragraphs.
look_up("高等数学")