欢迎您访问程序员文章站，本站旨在为大家提供、分享程序员计算机编程知识！
您现在的位置是: 首页

python钓鱼评论爬取

程序员文章站 2022-04-08 23:20:03
...

python钓鱼评论爬取 并存入txt

import re#python正则分割
# from  bs4 import BeautifulSoup#网页美味汤
#from selenium import webdriver#模拟点击鼠标点击网页库
#import time#时间
import  requests#直接爬取网页库
def direct(secname, timeout=10):
    """Download a page and return the district listing URLs it contains.

    Args:
        secname: URL of the page to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A list of every ``.../diaochang/changsha/list-<id>-4-1-1.html``
        URL found in the page text, in the order found (duplicates kept).
    """
    r = requests.get(secname, timeout=timeout)
    html = r.text
    # Raw strings with escaped dots so "." only matches a literal dot. The
    # variable middle part excludes quotes, angle brackets and whitespace,
    # so a single match can never run across several links on one line
    # (the previous greedy ".*" could).
    rule = (
        r"http://www\.diaoyu\.com/diaochang/changsha/"
        r"""list-[^"'<>\s]*-4-1-1\.html"""
    )
    return re.findall(rule, html)
def direct2(secname, timeout=10):
    """Download a listing page and return the detail-page URLs it contains.

    Args:
        secname: URL of the page to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A list of unique ``.../diaochang/changsha/<digits>.html`` URLs, in
        the order they first appear in the page text.
    """
    r = requests.get(secname, timeout=timeout)
    html = r.text
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence. Dots are escaped so they only match a literal ".".
    rule = r"http://www\.diaoyu\.com/diaochang/changsha/\d+\.html"
    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # the result is deterministic (a set would return an arbitrary order).
    return list(dict.fromkeys(re.findall(rule, html)))
def direct3(secname, timeout=10):
    """Download a detail page and return the rating/comment snippets in it.

    Args:
        secname: URL of the page to download.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A list of unique raw HTML snippets, in first-seen order. Each
        snippet still contains the matched markup and has the form
        ``</span>N分\\n</span>\\n<p>TEXT</p>``: the two characters ``N分``
        sit at index 7-8 and ``TEXT`` starts at index 21 and ends 4
        characters before the end of the string.
    """
    r = requests.get(secname, timeout=timeout)
    html = r.text
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence. ".*?" is non-greedy so the match stops at the first "</p>"
    # instead of running to the last "</p>" on the same line.
    rule = r"</span>\d分\n</span>\n<p>.*?</p>"
    # dict.fromkeys removes duplicates while keeping first-seen order, so
    # the result is deterministic (a set would return an arbitrary order).
    return list(dict.fromkeys(re.findall(rule, html)))

def main(outfiles='E:\\comments.txt'):
    """Crawl the Changsha listing pages and save all comments to a text file.

    Starting from the ``list-0-4-1-1.html`` page, collects the listing URLs
    via ``direct``, the detail-page URLs on each of them via ``direct2``
    and the rating/comment snippets on each detail page via ``direct3``.
    One line per snippet is written as ``<rating>\\t<comment>\\t``.

    Args:
        outfiles: Path of the UTF-8 text file to (over)write. Defaults to
            the location the script has always used.
    """
    gg = direct("http://www.diaoyu.com/diaochang/changsha/list-0-4-1-1.html")
    print(gg)
    allcomments = []
    # The first two listing URLs are skipped. Original author's note: the
    # first district (Furong) has no content.
    # NOTE(review): confirm that index 0 and 1 are both meant to be skipped.
    for listing_url in gg[2:]:
        print(listing_url)
        for detail_url in direct2(listing_url):
            allcomments.append(direct3(detail_url))
    # "with" guarantees the file is flushed and closed, even if a write
    # fails part-way through; previously the handle was never closed.
    with open(outfiles, 'w', encoding='utf-8') as output:
        for snippets in allcomments:
            for snippet in snippets:
                # Fixed offsets into the snippet "</span>N分\n</span>\n<p>TEXT</p>":
                # [7:9] is the rating "N分", [21:-4] is the comment text.
                rating = str(snippet[7:9])
                print(rating)
                output.write(rating)
                output.write("\t")
                output.write(str(snippet[21:-4]))
                output.write('\t')
                output.write('\n')
# Call main() only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()