python钓鱼评论爬取
程序员文章站
2022-04-08 23:20:03
...
使用 Python 爬取钓鱼网(diaoyu.com)长沙钓场的评论,并存入 txt 文件
import re#python正则分割
# from bs4 import BeautifulSoup#网页美味汤
#from selenium import webdriver#模拟点击鼠标点击网页库
#import time#时间
import requests#直接爬取网页库
def direct(secname, timeout=10):
    """Collect the district listing-page URLs found in a listing page.

    Args:
        secname: URL of the page to download.
        timeout: Seconds to wait for the server before requests gives up,
            so a stalled connection cannot hang the whole crawl.

    Returns:
        A list of every substring of the downloaded HTML that has the form
        ``http://www.diaoyu.com/diaochang/changsha/list-<id>-4-1-1.html``,
        in the order found; duplicates are kept.
    """
    html = requests.get(secname, timeout=timeout).text
    # Raw string with escaped dots. The <id> part is limited to word
    # characters and hyphens so that a single match can never run across
    # several links sharing one line, which the former greedy ".*" allowed.
    rule = r"http://www\.diaoyu\.com/diaochang/changsha/list-[\w-]*-4-1-1\.html"
    return re.findall(rule, html)
def direct2(secname, timeout=10):
    """Collect the unique fishing-spot page URLs found in a listing page.

    Args:
        secname: URL of the page to download.
        timeout: Seconds to wait for the server before requests gives up,
            so a stalled connection cannot hang the whole crawl.

    Returns:
        A list of the distinct substrings of the downloaded HTML that have
        the form ``http://www.diaoyu.com/diaochang/changsha/<digits>.html``,
        in first-seen order.
    """
    html = requests.get(secname, timeout=timeout).text
    # Raw string: "\d" in a plain string literal is an invalid escape
    # sequence. The dots are escaped so they only match a literal ".".
    rule = r"http://www\.diaoyu\.com/diaochang/changsha/\d+\.html"
    # dict.fromkeys removes duplicates while keeping page order, so the
    # result is reproducible between runs (a set gives arbitrary order).
    return list(dict.fromkeys(re.findall(rule, html)))
def direct3(secname, timeout=10):
    """Collect the unique score/comment snippets found in a page.

    Args:
        secname: URL of the page to download.
        timeout: Seconds to wait for the server before requests gives up,
            so a stalled connection cannot hang the whole crawl.

    Returns:
        A list of the distinct raw HTML snippets, in first-seen order. Each
        snippet is the full matched text, markup included: a closing span
        tag, one digit followed by the character "分", a newline, another
        closing span tag, a newline, and a single-line paragraph element.
    """
    html = requests.get(secname, timeout=timeout).text
    # Raw string: "\d" in a plain string literal is an invalid escape
    # sequence. In a raw pattern "\n" is still read by re as a newline, so
    # the matched text is unchanged.
    rule = r"</span>\d分\n</span>\n<p>.*</p>"
    # dict.fromkeys removes duplicates while keeping page order, so the
    # result is reproducible between runs (a set gives arbitrary order).
    return list(dict.fromkeys(re.findall(rule, html)))
def main():
    """Crawl fishing-spot comments and save them to a text file.

    Downloads the starting listing page, follows the listing links found on
    it (skipping the first two), visits every fishing-spot page linked from
    those listings, and gathers the score/comment snippets from each spot
    page. Every snippet is then written to the hard-coded output file as one
    line: the score, a tab, the comment text, a tab.
    """
    gg = direct("http://www.diaoyu.com/diaochang/changsha/list-0-4-1-1.html")
    print(gg)
    allcomments = []
    # The first two links are skipped; the original author's note says the
    # first district (Furong) has no content.
    for district_url in gg[2:]:
        print(district_url)
        for spot_url in direct2(district_url):
            allcomments.append(direct3(spot_url))
    outfiles = 'E:\\comments' + '.txt'
    # The with-block guarantees the file is flushed and closed even if a
    # write fails; it was previously left open.
    with open(outfiles, 'w', encoding='utf-8') as output:
        for comments in allcomments:
            for comment in comments:
                # Each snippet starts with "</span>" (7 characters), so
                # [7:9] is the digit plus "分".
                score = comment[7:9]
                print(score)
                # [21:-4] drops the 21-character markup prefix up to and
                # including "<p>" and the trailing "</p>".
                output.write(score + "\t" + comment[21:-4] + "\t\n")
# Run the crawler only when this file is executed as a script.
if __name__ == "__main__":
    main()
上一篇: BeautifulSoup