【Python 淘宝爬虫】使用 Python 抓取淘宝店铺名称、旺旺和销售量
程序员文章站
2022-05-02 20:23:00
...
一、需求分析
抓取淘宝店铺的名称、卖家旺旺昵称以及销售量。
二、效果展示
三、实现源代码
# encoding: utf-8
# Scrape Taobao shop-search results for one product category across a list of
# cities, then export shop name, seller nick, total sales, city and category
# to an Excel workbook.
import sys

# Python 2 only: make UTF-8 the implicit str/unicode codec, as the original
# script did.  Kept ahead of the remaining imports to preserve the original
# ordering.  Python 3 has no `reload` builtin and needs none of this.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')

import re
import time

import pandas as pd
import requests

start_time = time.time()

# ----------------------------- configuration -----------------------------
# Cities to search; each value is sent as the `loc` query parameter.
CITIES = [
    "郑州市", "洛阳市", "开封市", "南阳市", "安阳市", "商丘市", "新乡市",
    "平顶山市", "许昌市", "焦作市", "周口市", "信阳市", "驻马店市", "鹤壁市",
    "濮阳市", "漯河市", "三门峡市", "济源市", "长治市", "晋城市", "运城市",
    "聊城市", "菏泽市", "宿州市", "淮北市", "阜阳市", "亳州市", "蚌埠市",
    "邢台市", "邯郸市",
]
# Search keyword (product category).  Change only this to scrape another one.
CATEGORY = "女鞋"
# Pages requested per city; the `s` offset parameter advances by 20 per page.
PAGES_PER_CITY = 100
RESULTS_PER_PAGE = 20
# Seconds to wait for the server before giving up on one page.
REQUEST_TIMEOUT = 10
# Output workbook -- remember to change the file name between runs.  The
# literal is kept exactly as in the original; the script does not create the
# target directory.
OUTPUT_PATH = r'C:\\taobao\\taobao4.xlsx'
XLSX_OPTIONS = {'strings_to_urls': False}

# Patterns are compiled once instead of being rebuilt for every page.
NICK_RE = re.compile('"nick":"(.*?)","provcity":', re.S)
TOTAL_SOLD_RE = re.compile('"totalsold":(.*?),"procnt"', re.S)
RAW_TITLE_RE = re.compile('"rawTitle":"(.*?)",', re.S)


def _to_text(raw):
    """Return the response body as the interpreter's native ``str`` type.

    On Python 2 ``response.content`` already is ``str`` and is returned
    untouched, so behaviour there is unchanged.  On Python 3 it is ``bytes``
    and must be decoded before the ``str`` patterns can be applied.
    NOTE(review): UTF-8 is assumed because the original forced UTF-8 as the
    default codec -- confirm against the actual response encoding.
    """
    if isinstance(raw, str):
        return raw
    return raw.decode('utf-8', 'replace')


def _warn(message):
    """Write a diagnostic to stderr, separate from the scraped output."""
    sys.stderr.write(message + "\n")


# One list per output column.  Rows are appended to all five together so the
# lists always have equal length when the DataFrame is built.
seller = []
shop = []
sale = []
city = []
category = []  # formerly named `type`, which shadowed the builtin

# ------------------------------- scraping --------------------------------
for w in CITIES:
    print(".............................................................."+w+"....................................................")
    for i in range(1, PAGES_PER_CITY + 1):
        print("正在抓取第"+str(i)+"页....................")
        url = "https://shopsearch.taobao.com/search?app=shopsearch&spm=a230r.7195193.0.0.ShnhPc&q="+CATEGORY+"&tracelog=shopsearchnoqcat&loc="+w+"&isb=0&shop_type=&ratesum=&s="+str((i-1)*RESULTS_PER_PAGE)

        # Only the network call is guarded.  The original bare `except: pass`
        # hid every error (including Ctrl+C); a failed page is now reported
        # and skipped, and the run continues as before.
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            _warn("request failed for %s page %d: %s" % (w, i, exc))
            continue

        html = _to_text(response.content)
        page_sellers = NICK_RE.findall(html)
        page_sales = TOTAL_SOLD_RE.findall(html)
        page_shops = [title.replace(" ", "") for title in RAW_TITLE_RE.findall(html)]

        # The three fields are matched independently and paired by position.
        # Unequal counts mean the pairing cannot be trusted, and in the
        # original they made the column lists diverge so that DataFrame
        # construction failed after the whole crawl.  Skip such a page.
        if not (len(page_sellers) == len(page_sales) == len(page_shops)):
            _warn("skipping %s page %d: field counts differ (nick=%d, totalsold=%d, rawTitle=%d)"
                  % (w, i, len(page_sellers), len(page_sales), len(page_shops)))
            continue

        # Echo the values in the original order: sellers, sales, shop names.
        for each in page_sellers:
            print(each)
        for each in page_sales:
            print(each)
        for each in page_shops:
            print(each)

        seller.extend(page_sellers)
        sale.extend(page_sales)
        shop.extend(page_shops)
        city.extend([w] * len(page_sales))
        category.extend([CATEGORY] * len(page_sales))

# ----------------------------- column lengths ----------------------------
print("%d %d %d %d %d" % (len(shop), len(seller), len(sale), len(city), len(category)))

# ------------------------------- data frame ------------------------------
data = pd.DataFrame({'店铺名': shop, "卖家": seller, "销量": sale, "城市": city, "类型": category})
print(data)

# ------------------------------ Excel export -----------------------------
# Newer pandas takes engine options through `engine_kwargs`.  Older releases
# forward unknown keywords straight to xlsxwriter, which rejects
# `engine_kwargs` with TypeError, so fall back to the original `options=`.
try:
    writer = pd.ExcelWriter(OUTPUT_PATH, engine='xlsxwriter',
                            engine_kwargs={'options': XLSX_OPTIONS})
except TypeError:
    writer = pd.ExcelWriter(OUTPUT_PATH, engine='xlsxwriter', options=XLSX_OPTIONS)
data.to_excel(writer, index=False)
writer.close()

end_time = time.time()
print(u'ok,爬虫结束!')
print(u'总共耗时:' + str(end_time - start_time) + 's')
上一篇: inWatch联手高通研发4G可穿戴设备
下一篇: SpringBoot Json数据交互