python3 Crawler Learning Diary (Part 2)
2022-03-20 23:38:05
Part two is here~ continuing to record my learning journey. Honestly I'm a bit annoyed with myself: I did write a few small scripts during this stretch, but I got lazy and didn't blog for a long time.
Today's content is based on a script for the quiz app 冲顶大会 that a skilled author published on GitHub. I followed his approach and wrote a simplified version first; his repo (linked in the original post as 大神的git传送门) is worth studying if you're interested.
I wrote the crawler with BeautifulSoup. The main functionality: pair the question with each answer option, search Baidu for that combination, and use the number of search results as a rough signal for which option is correct.
This first version is very simple, so I'm posting it as-is and will keep researching how to combine it with semantic analysis to match questions and answers better (the current accuracy is not great; a rough sketch of one idea follows the script below).
Along the way I ran into one problem: after getting the response I simply could not extract the content I wanted. It turned out my User-Agent was wrong, so the returned HTML was different (so many pitfalls; hands-on practice really matters). In the end I just copied a User-Agent string from Firefox and it worked. Looking back, it was a silly mistake.
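For reference, a minimal sketch of that check (my own snippet, not part of the original script; the query string is just a placeholder, and what Baidu actually serves to urllib's default User-Agent may vary):

from urllib import parse, request

url = 'https://www.baidu.com/s?wd=' + parse.quote('冲顶大会')
browser_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
}

# fetch the same search page with urllib's default User-Agent and with a browser one
default_html = request.urlopen(request.Request(url)).read()
browser_html = request.urlopen(request.Request(url, headers=browser_headers)).read()

# compare the two responses; the size and structure can differ noticeably
print('default UA:', len(default_html), 'bytes')
print('browser UA:', len(browser_html), 'bytes')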
from urllib import parse
from urllib import request
from bs4 import BeautifulSoup
import re
import jieba
import jieba.posseg as posseg
from colorama import init, Fore

init()


def open_webbrowser_count(question, choices):
    print('\n-- 方法2: 题目+选项搜索结果计数法 --\n')  # method 2: count search results for question + option
    print('Question: ' + question)
    if '不是' in question:
        # the question contains a negation ("不是"), so the option with the LOWEST count is the likely answer
        print('**请注意此题为否定题,选计数最少的**')
    # TODO: use the jieba segmentation to build a keyword-only query (currently unused)
    wordlist = posseg.cut(question)
    # Baidu returns different HTML to the default urllib User-Agent, so pretend to be Firefox
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
    counts = []
    for i in range(len(choices)):
        # search Baidu for "question + option" and read the reported result count
        url = 'https://www.baidu.com/s?wd=' + parse.quote(question + choices[i])
        req = request.Request(url=url, headers=head)
        response = request.urlopen(req)
        html = response.read()
        soup = BeautifulSoup(html, 'lxml')
        te = soup.find('div', attrs={'class': 'nums'})
        # the counter text looks like "百度为您找到相关结果约...个"
        count = re.findall("百度为您找到相关结果约(.*)个", str(te), re.S)
        if count:
            counts.append(int(count[0].replace(",", "")))
        else:
            # layout changed or the request was blocked: fall back to 0 so the script keeps running
            counts.append(0)
    output(choices, counts)


def output(choices, counts):
    counts = list(map(int, counts))
    # print(choices, counts)
    # index of the highest count
    index_max = counts.index(max(counts))
    # index of the lowest count
    index_min = counts.index(min(counts))
    if index_max == index_min:
        print(Fore.RED + "高低计数相等此方法失效!" + Fore.RESET)  # highest and lowest counts are equal, the method tells us nothing
        return
    for i in range(len(choices)):
        print()
        if i == index_max:
            # green: the option with the highest count
            print(Fore.GREEN + "{0} : {1}".format(choices[i], counts[i]) + Fore.RESET)
        elif i == index_min:
            # magenta: the option with the lowest count
            print(Fore.MAGENTA + "{0} : {1}".format(choices[i], counts[i]) + Fore.RESET)
        else:
            print("{0} : {1}".format(choices[i], counts[i]))


if __name__ == '__main__':
    question = '以下口红色号不是姨妈色的?'
    choices = ['香奈儿154', '圣罗兰204', '纪梵希62']
    open_webbrowser_count(question, choices)
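On the TODO above: wordlist is computed with jieba but never used. As a rough sketch of the semantic-matching idea mentioned earlier (my own assumption, not the original author's method), one could keep only the nouns and verbs from the question and search with a shorter keyword query instead of the full sentence:

import jieba.posseg as posseg
from urllib import parse

def build_query(question, choice):
    # keep words whose part-of-speech tag starts with 'n' (noun) or 'v' (verb)
    keywords = [w.word for w in posseg.cut(question) if w.flag and w.flag[0] in ('n', 'v')]
    # search with "keywords + option" instead of the full question sentence
    return 'https://www.baidu.com/s?wd=' + parse.quote(' '.join(keywords) + ' ' + choice)

print(build_query('以下口红色号不是姨妈色的?', '香奈儿154'))

The hypothetical build_query helper only changes how the search URL is built; the counting and output logic above could stay the same.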