【Python】利用cookies登录淘宝,爬取商品信息
程序员文章站
2023-12-24 12:17:51
...
所用到的库
import requests
import re
import csv
1.在浏览器中登录淘宝(先在开发者工具的网络面板中勾选“持续日志”,便于在页面跳转后仍能查看登录请求的信息)
2.保存登录cookies(保存到本地mycookies.txt)
3.携带cookies请求登录,4.逐页获取搜索结果页面:
def getHTML():
    """Fetch Taobao search-result pages for a keyword using saved login cookies.

    Prompts on stdin for the product keyword and the number of pages, reads
    the cookie string from the local ``mycookies.txt`` file, and requests one
    search page per iteration.

    Returns:
        The text of all fetched pages concatenated into a single string.

    Raises:
        OSError: If the cookies file cannot be opened.
        ValueError: If the page count typed by the user is not an integer.
    """
    keyword = input('请输入爬取商品的名字:')
    search_url = 'https://s.taobao.com/search'
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
    }
    path = 'C:/Users/lidantao/Desktop/python_homework/17210120721_李丹涛_17创新实验班(第十五次)/mycookies.txt'
    with open(path, 'r') as f:
        raw_cookies = f.read()

    cookies = {}
    for pair in raw_cookies.split(';'):
        pair = pair.strip()
        # A trailing ';' or a blank line yields a segment without '=', which
        # would otherwise break the two-value unpacking below.
        if '=' not in pair:
            continue
        key, value = pair.split('=', 1)
        cookies[key] = value

    pages = int(input('请输入爬取的商品页数:'))
    chunks = []
    for i in range(pages):
        # 's' is the result offset, advanced by 44 per page (presumably the
        # number of items on one result page -- confirm against the site).
        # Passing the query through ``params`` lets requests escape the
        # keyword, so characters such as '&' or '#' cannot corrupt the URL.
        r = requests.get(
            search_url,
            params={'q': keyword, 's': i * 44},
            headers=header,
            cookies=cookies,
            timeout=60,
        )
        r.encoding = r.apparent_encoding
        chunks.append(r.text)
    return ''.join(chunks)
5.查找商品信息:
def findMS(html):
    """Extract product records from the text of Taobao search-result pages.

    Five fields are collected independently with non-greedy regular
    expressions (``nick``, ``raw_title``, ``view_price``, ``item_loc``,
    ``view_sales``) and then combined by position.

    Args:
        html: Page text to search, e.g. the string returned by getHTML().

    Returns:
        A list of rows ``[shop, title, price, location, sales]``, one per
        matched title. If the field lists have different lengths, the rows
        built before the first missing field are returned.
    """
    print('='*20,'正在爬取商品信息','='*20,'\n')
    marketnames = re.findall(r'"nick":"(.*?)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    prices = re.findall(r'"view_price":"(.*?)"', html)
    citys = re.findall(r'"item_loc":"(.*?)"', html)
    pays = re.findall(r'"view_sales":"(.*?)"', html)

    data = []
    try:
        for i, title in enumerate(titles):
            data.append([marketnames[i], title, prices[i], citys[i], pays[i]])
    except IndexError:
        # One of the other field lists is shorter than ``titles``; keep the
        # rows collected so far instead of discarding everything.
        print('异常,爬取中断')
        return data

    # ``data`` is a list, so emptiness must be tested by truthiness; comparing
    # it with '' is never true and the message below would never be shown.
    if not data:
        print('='*20,'暂无此商品信息','='*20,'\n')
        return data
    print('='*20,'爬取成功','='*20,'\n')
    return data
6.保存商品信息到本地:
def download(data, path=None):
    """Write the scraped product rows to a CSV file.

    A header row is written first, followed by every row in ``data``. A
    failure to open or write the file is reported on stdout instead of being
    raised.

    Args:
        data: Iterable of rows, each a sequence of five values
            (shop, title, price, location, sales).
        path: Destination file. Defaults to the original hard-coded
            ``goods.csv`` location when omitted.

    Returns:
        None.
    """
    print('='*20,'正在保存商品信息','='*20,'\n')
    if path is None:
        path = 'C:/Users/lidantao/Desktop/python_homework/17210120721_李丹涛_17创新实验班(第十五次)/goods.csv'
    try:
        # ``with`` closes the file on every path; the previous explicit
        # f.close() failed with an unbound ``f`` whenever open() itself raised.
        # An explicit UTF-8 (with BOM) encoding keeps titles containing
        # characters outside the platform code page from aborting the save.
        with open(path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['店铺名称','商品','价格(单位:元)','店铺所在地','付款人数'])
            writer.writerows(data)
    except (OSError, UnicodeError, csv.Error):
        print('保存失败')
        return
    print('='*20,'保存成功','='*20,'\n')
7.源代码如下:
import requests
import re
import csv
def getHTML():
    """Fetch Taobao search-result pages for a keyword using saved login cookies.

    Prompts on stdin for the product keyword and the number of pages, reads
    the cookie string from the local ``mycookies.txt`` file, and requests one
    search page per iteration.

    Returns:
        The text of all fetched pages concatenated into a single string.

    Raises:
        OSError: If the cookies file cannot be opened.
        ValueError: If the page count typed by the user is not an integer.
    """
    keyword = input('请输入爬取商品的名字:')
    search_url = 'https://s.taobao.com/search'
    header = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:70.0) Gecko/20100101 Firefox/70.0'
    }
    path = 'C:/Users/lidantao/Desktop/python_homework/17210120721_李丹涛_17创新实验班(第十五次)/mycookies.txt'
    with open(path, 'r') as f:
        raw_cookies = f.read()

    cookies = {}
    for pair in raw_cookies.split(';'):
        pair = pair.strip()
        # A trailing ';' or a blank line yields a segment without '=', which
        # would otherwise break the two-value unpacking below.
        if '=' not in pair:
            continue
        key, value = pair.split('=', 1)
        cookies[key] = value

    pages = int(input('请输入爬取的商品页数:'))
    chunks = []
    for i in range(pages):
        # 's' is the result offset, advanced by 44 per page (presumably the
        # number of items on one result page -- confirm against the site).
        # Passing the query through ``params`` lets requests escape the
        # keyword, so characters such as '&' or '#' cannot corrupt the URL.
        r = requests.get(
            search_url,
            params={'q': keyword, 's': i * 44},
            headers=header,
            cookies=cookies,
            timeout=60,
        )
        r.encoding = r.apparent_encoding
        chunks.append(r.text)
    return ''.join(chunks)
def findMS(html):
    """Extract product records from the text of Taobao search-result pages.

    Five fields are collected independently with non-greedy regular
    expressions (``nick``, ``raw_title``, ``view_price``, ``item_loc``,
    ``view_sales``) and then combined by position.

    Args:
        html: Page text to search, e.g. the string returned by getHTML().

    Returns:
        A list of rows ``[shop, title, price, location, sales]``, one per
        matched title. If the field lists have different lengths, the rows
        built before the first missing field are returned.
    """
    print('='*20,'正在爬取商品信息','='*20,'\n')
    marketnames = re.findall(r'"nick":"(.*?)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    prices = re.findall(r'"view_price":"(.*?)"', html)
    citys = re.findall(r'"item_loc":"(.*?)"', html)
    pays = re.findall(r'"view_sales":"(.*?)"', html)

    data = []
    try:
        for i, title in enumerate(titles):
            data.append([marketnames[i], title, prices[i], citys[i], pays[i]])
    except IndexError:
        # One of the other field lists is shorter than ``titles``; keep the
        # rows collected so far instead of discarding everything.
        print('异常,爬取中断')
        return data

    # ``data`` is a list, so emptiness must be tested by truthiness; comparing
    # it with '' is never true and the message below would never be shown.
    if not data:
        print('='*20,'暂无此商品信息','='*20,'\n')
        return data
    print('='*20,'爬取成功','='*20,'\n')
    return data
def download(data, path=None):
    """Write the scraped product rows to a CSV file.

    A header row is written first, followed by every row in ``data``. A
    failure to open or write the file is reported on stdout instead of being
    raised.

    Args:
        data: Iterable of rows, each a sequence of five values
            (shop, title, price, location, sales).
        path: Destination file. Defaults to the original hard-coded
            ``goods.csv`` location when omitted.

    Returns:
        None.
    """
    print('='*20,'正在保存商品信息','='*20,'\n')
    if path is None:
        path = 'C:/Users/lidantao/Desktop/python_homework/17210120721_李丹涛_17创新实验班(第十五次)/goods.csv'
    try:
        # ``with`` closes the file on every path; the previous explicit
        # f.close() failed with an unbound ``f`` whenever open() itself raised.
        # An explicit UTF-8 (with BOM) encoding keeps titles containing
        # characters outside the platform code page from aborting the save.
        with open(path, 'w', newline='', encoding='utf-8-sig') as f:
            writer = csv.writer(f)
            writer.writerow(['店铺名称','商品','价格(单位:元)','店铺所在地','付款人数'])
            writer.writerows(data)
    except (OSError, UnicodeError, csv.Error):
        print('保存失败')
        return
    print('='*20,'保存成功','='*20,'\n')
def main():
    """Run the scraper end to end: fetch the pages, parse them, save the CSV."""
    # Each stage consumes the previous stage's return value directly.
    download(findMS(getHTML()))


if __name__ == "__main__":
    main()
项目效果: