利用python爬取京东商品评论
程序员文章站
2022-05-28 22:13:41
京东评论的爬取和淘宝的差不多,可以参考上两篇文章:《利用python分析Ajax爬取淘宝评论》和《最新Python爬取淘宝评论(2020年4月)》。本文给出完整的爬虫类 JdSpider_content(依赖 time、re、requests、json、random、csv),其构造参数为商品 id(productId)、页码(page)和要保存为的文件名称(name)。
京东评论的爬取和淘宝的差不多,可以参考上两篇文章:
利用python分析Ajax爬取淘宝评论
最新Python爬取淘宝评论(2020年4月)
import time
import re
import requests
import json
import random
import csv
class JdSpider_content():
    """Fetch one page of JD.com product comments and save the results.

    An instance is bound to a single product id and page number.
    ``get_page`` downloads the page, ``get_content`` turns it into flat
    comment records, and the ``write_*`` methods append data to files
    whose names start with ``name``.
    """

    def __init__(self, productId, page, name):
        """Store the request parameters and the output file prefix.

        Args:
            productId: Product id sent as the ``productId`` query parameter.
            page: Page number sent as the ``page`` query parameter.
            name: Prefix (base name) of the output files.
        """
        self.name = name  # base name of the files to save to
        self.page = page  # page number
        self.productId = productId  # product id
        self.url = "https://club.jd.com/comment/productPageComments.action?"
        # The User-Agent and Cookie values are placeholders; replace them
        # with your own before running.
        self.headers = {"User-Agent": "自己的User-Agent",
                        "referer": "https://item.jd.com/10999284925.html",
                        "Cookie": '自己的cookie'
                        }

    @staticmethod
    def _parse_jsonp(text):
        """Decode a response body that is either JSON or ``callback(JSON);``."""
        # Unwrap by pattern instead of a fixed slice so the parse does not
        # depend on the exact length of the callback name.
        wrapped = re.match(r"\s*[\w$.]+\s*\((.*)\)\s*;?\s*\Z", text, re.S)
        payload = wrapped.group(1) if wrapped else text
        return json.loads(payload)

    def get_page(self):
        """Download the comment page and return the decoded JSON.

        Returns:
            The decoded response, or ``None`` when the request fails, the
            status code is not 200, or the body cannot be decoded.
        """
        params = {
            "productId": self.productId,
            "page": self.page,
            "callback": "fetchJSON_comment98",
            "score": "0",  # 0 = normal reviews, 1 = negative, 2 = neutral
            "sortType": "5",
            "pageSize": "10",
            "isShadowSku": "0",
            "rid": "0",
            "fold": "1",
        }
        try:
            # One request only; the timeout keeps a stalled connection from
            # hanging the crawl forever.
            res = requests.get(
                self.url, params=params, headers=self.headers, timeout=10)
            if res.status_code != 200:
                return None
            return self._parse_jsonp(res.text)
        except (requests.RequestException, ValueError):
            # Network failure or undecodable body: report "no data".
            return None

    def get_content(self, json_data):
        """Yield one flat record per comment found in ``json_data``.

        Args:
            json_data: Decoded page as returned by ``get_page``, or ``None``.

        Yields:
            Dicts with the keys ``content_time``, ``type_color``,
            ``type_size``, ``content_name`` and ``content_data``.
        """
        if json_data is None:
            print("该页出错啦!")
            return
        # A missing or null "comments" entry is treated as an empty page.
        for item in json_data.get("comments") or []:
            yield {
                "content_time": item.get("creationTime"),
                "type_color": item.get("productColor"),
                "type_size": item.get("productSize"),
                "content_name": item.get("nickname"),
                "content_data": item.get("content"),
            }

    def get_word(self, json_data):
        """Append the hot-comment tag names to ``<name>关键词.txt``.

        Args:
            json_data: Decoded page as returned by ``get_page``, or ``None``.
        """
        if json_data is None:
            return
        tags = json_data.get("hotCommentTagStatistics") or []
        # Read the "name" field directly rather than regex-matching the
        # repr of the list, which misses names containing an apostrophe.
        names = [
            tag["name"] for tag in tags
            if isinstance(tag, dict) and isinstance(tag.get("name"), str)
        ]
        if not names:
            return
        with open(self.name+"关键词.txt", "a", encoding="utf-8") as file:
            file.writelines(word + "\n" for word in names)

    # Save the result as a txt file.
    def write_txt(self, data):
        """Append ``data`` as indented JSON text to ``<name>.txt``."""
        with open(self.name+".txt", "a", encoding="utf-8") as file:
            file.write(json.dumps(data, indent=2, ensure_ascii=False))
            file.write("\n")

    # Save the result as csv.
    def write_csv(self, data):
        """Append one record from ``get_content`` as a row of ``<name>.csv``.

        No header row is written.
        """
        # The column names must match the keys yielded by get_content;
        # DictWriter raises ValueError for keys missing from fieldnames.
        fieldnames = ["content_time", "type_color", "type_size",
                      "content_name", "content_data"]
        with open(self.name+".csv", "a", encoding="utf-8-sig", newline='') as file:
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writerow(data)

    # Save the result in json format.
    def write_json(self, data):
        """Append ``data`` as indented JSON to ``taobaocontent.json``."""
        # NOTE(review): the file name is hard-coded and ignores self.name,
        # unlike the other writers; it looks like a leftover from the Taobao
        # version of this script. Confirm before changing the output path.
        with open("taobaocontent.json", "a", encoding="utf-8") as file:
            file.write(json.dumps(data, indent=2, ensure_ascii=False))

    def main(self):
        """Fetch this instance's page and return the comment generator."""
        return self.get_content(self.get_page())
if __name__ == "__main__":
    # Crawl the first two comment pages of one product, print every
    # comment record, and save them all to "<name>.txt" at the end.
    ls = []
    for j in range(2):
        print("\n")
        print("现在是第%d页" % (j+1))
        a = JdSpider_content(
            productId="24155385153", page=j+1, name="祺奥")
        # Download each page exactly once and reuse the result below,
        # instead of re-requesting it for every step.
        json_data = a.get_page()
        if j == 0:
            # The hot-comment tags are only saved from the first page.
            a.get_word(json_data)
        # get_content prints a message and yields nothing when the page
        # could not be fetched, so no separate None check is needed.
        for i in a.get_content(json_data):
            print(i)
            ls.append(i)
        # Pause between pages to avoid an IP ban; a proxy pool works too.
        time.sleep(random.randint(15, 20))
    a.write_txt(ls)
本文地址:https://blog.csdn.net/m0_46412065/article/details/107468840
上一篇: 微盟“再下沉”,线下的中小企业营销服务究竟怎么玩?
下一篇: Python中range()函数