欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

利用python爬取京东商品评论

程序员文章站 2022-05-28 22:13:41
京东评论的爬取和淘宝的差不多,可以参考上两篇文章文章:利用python分析Ajax爬取淘宝评论最新Python爬取淘宝评论(2020年4月)import timeimport reimport requestsimport jsonimport randomimport csvclass JdSpider_content(): def __init__(self, productId, page, name): self.name = name #要保存为的文...

京东评论的爬取和淘宝的差不多,可以参考上两篇文章:
利用python分析Ajax爬取淘宝评论
最新Python爬取淘宝评论(2020年4月)

import time
import re
import requests
import json
import random
import csv



class JdSpider_content():
    """Spider for one page of JD.com product comments.

    Fetches the JSONP comment endpoint for a given product/page, extracts
    individual comments and hot-keyword tags, and saves results as txt,
    csv or json files named after ``name``.
    """

    def __init__(self, productId, page, name):
        self.name = name  # base filename for output files
        self.page = page  # 1-based page number to fetch
        self.productId = productId  # JD product id
        self.url = "https://club.jd.com/comment/productPageComments.action?"
        # NOTE(review): User-Agent/Cookie are placeholders — fill in real
        # values before running, or JD will reject the request.
        self.headers = {"User-Agent": "自己的User-Agent",
                        "referer": "https://item.jd.com/10999284925.html",
                        "Cookie": '自己的cookie'
                        }

    def get_page(self):
        """Fetch the configured comment page.

        Returns the parsed JSON payload as a dict, or None on any network
        or parse failure.
        """
        params = {
            "productId": self.productId,
            "page": self.page,
            "callback": "fetchJSON_comment98",
            "score": "0",  # 0 = all, 1 = negative, 2 = neutral reviews
            "sortType": "5",
            "pageSize": "10",
            "isShadowSku": "0",
            "rid": "0",
            "fold": "1"
        }
        try:
            # Single request (the original fetched the same URL twice).
            res = requests.get(self.url, params=params,
                               headers=self.headers, timeout=10)
            if res.status_code != 200:
                return None
            # Response is JSONP: fetchJSON_comment98({...});
            # extract the JSON object instead of relying on fixed offsets.
            match = re.search(r"\{.*\}", res.text, re.S)
            if match is None:
                return None
            return json.loads(match.group())
        except (requests.RequestException, json.JSONDecodeError, ValueError):
            # Narrow handling; callers treat None as "page failed".
            return None

    def get_content(self, json_data):
        """Yield one dict per comment found in ``json_data``.

        ``json_data`` is the dict returned by :meth:`get_page`; if it is
        None the page is reported as failed and nothing is yielded.
        """
        if json_data is not None:
            for item in json_data.get("comments", []):
                yield {
                    "content_time": item.get("creationTime"),
                    "type_color": item.get("productColor"),
                    "type_size": item.get("productSize"),
                    "content_name": item.get("nickname"),
                    "content_data": item.get("content"),
                }
        else:
            print("该页出错啦!")
            return None

    def get_word(self, json_data):
        """Append hot-comment keywords to ``<name>关键词.txt``, one per line."""
        if json_data is not None:
            word_list = re.findall(
                ".*?name.*?: '(.*?)'", str(json_data.get("hotCommentTagStatistics")))
            # Open once for all keywords instead of re-opening per word.
            if word_list:
                with open(self.name + "关键词.txt", "a", encoding="utf-8") as file:
                    for word in word_list:
                        file.write(word + "\n")

    def write_txt(self, data):
        """Append ``data`` to ``<name>.txt`` as pretty-printed JSON."""
        with open(self.name + ".txt", "a", encoding="utf-8") as file:
            file.write(json.dumps(data, indent=2, ensure_ascii=False))
            file.write("\n")

    def write_csv(self, data):
        """Append one comment record to ``<name>.csv``.

        Writes the header row the first time the file is used. Field names
        match the keys yielded by :meth:`get_content` (the original listed
        a non-existent ``content_type`` field, which made DictWriter raise
        ValueError on every row).
        """
        fieldnames = ["content_time", "type_color", "type_size",
                      "content_name", "content_data"]
        with open(self.name + ".csv", "a", encoding="utf-8-sig", newline='') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            if file.tell() == 0:  # new/empty file: emit header first
                writer.writeheader()
            writer.writerow(data)

    def write_json(self, data):
        """Append ``data`` to ``taobaocontent.json`` as pretty-printed JSON."""
        with open("taobaocontent.json", "a", encoding="utf-8") as file:
            file.write(json.dumps(data, indent=2, ensure_ascii=False))

    def main(self):
        """Fetch the configured page and return a generator of comment dicts."""
        json_data = self.get_page()
        # Original called get_content twice, discarding the first generator.
        return self.get_content(json_data)


if __name__ == "__main__":
    ls = []
    spider = None
    for j in range(2):
        print("\n")
        print("现在是第%d页" % (j+1))
        spider = JdSpider_content(
            productId="24155385153", page=j+1, name="祺奥")
        # Fetch each page exactly ONCE. The original called a.main() twice
        # (each call issues a fresh HTTP request) plus an extra get_page()
        # for page 1 — tripling the traffic and risking an IP ban.
        json_data = spider.get_page()
        if j == 0:
            spider.get_word(json_data)
        for item in spider.get_content(json_data):
            print(item)
            ls.append(item)
        time.sleep(random.randint(15, 20))  # throttle to avoid an IP ban; a proxy pool also works
    if spider is not None:
        spider.write_txt(ls)

本文地址:https://blog.csdn.net/m0_46412065/article/details/107468840

相关标签: 爬虫 python