Python爬取淘宝某一商品所有页面信息案例
程序员文章站
2022-06-15 19:37:53
#! /usr/bin/env python # -*- coding:utf-8 -*-"""selenim 可以模拟人去控制浏览器功能:淘宝搜索一个关键字,查询商品信息(最多100页)方式:100个页面通过url访问(不通过点击下一页或其他),模拟人去拉动下滑条,直接获取xpath路径数据fake_useragent.json文件参考本人其他博客"""from selenium import webdriverimport timeimport re,osfrom lxml im....
功能:淘宝搜索一个关键字,查询商品信息(最多100页)
#! /usr/bin/env python
# -*- coding:utf-8 -*-
"""
selenium 可以模拟人去控制浏览器
功能:淘宝搜索一个关键字,查询商品信息(最多100页)
方式:100个页面通过url访问(不通过点击下一页或其他),模拟人去拉动下滑条,直接获取xpath路径数据
fake_useragent.json文件参考本人其他博客
"""
from selenium import webdriver
import time
import re,os
from lxml import etree
from fake_useragent import UserAgent
# Lowercase aliases for the Python booleans, so that the JSON-style
# `true` / `false` literals used in the `cookies` list evaluate as-is.
true = True
false = False
# Cookies for taobao.com, one dict per cookie. The __main__ section removes the
# 'sameSite' key from each entry and hands the dict to driver.add_cookie().
# The entries rely on the lowercase `true` / `false` aliases defined above.
# NOTE(review): the layout (hostOnly / storeId / expirationDate / id) looks like
# a browser-extension cookie export of a logged-in session -- confirm the source.
# NOTE(review): these are hard-coded session credentials; the expirationDate
# values appear to be Unix timestamps, so they presumably need to be replaced
# with freshly exported cookies before the script is run -- verify.
cookies = [
{
"domain": ".taobao.com",
"expirationDate": 1631901735.114169,
"hostOnly": false,
"httpOnly": false,
"name": "_cc_",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "UIHiLt3xSw%3D%3D",
"id": 1
},
{
"domain": ".taobao.com",
"expirationDate": 1601004935.565767,
"hostOnly": false,
"httpOnly": false,
"name": "_m_h5_tk",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "83e687d147cdaf7f8c2a68c7133af57f_1600410215568",
"id": 2
},
{
"domain": ".taobao.com",
"expirationDate": 1601004935.565801,
"hostOnly": false,
"httpOnly": false,
"name": "_m_h5_tk_enc",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "009442fdcc478d9950ed3ce66c21dd42",
"id": 3
},
{
"domain": ".taobao.com",
"hostOnly": false,
"httpOnly": true,
"name": "_samesite_flag_",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": true,
"storeId": "0",
"value": "true",
"id": 4
},
{
"domain": ".taobao.com",
"hostOnly": false,
"httpOnly": false,
"name": "_tb_token_",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": true,
"storeId": "0",
"value": "e0e309163e688",
"id": 5
},
{
"domain": ".taobao.com",
"expirationDate": 2231026522,
"hostOnly": false,
"httpOnly": false,
"name": "cna",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "o9PDFx8njH0CAQ4RFiRF9qoU",
"id": 6
},
{
"domain": ".taobao.com",
"hostOnly": false,
"httpOnly": true,
"name": "cookie2",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": true,
"storeId": "0",
"value": "1108ba29bf95e3bff70758c002b8c7ed",
"id": 7
},
{
"domain": ".taobao.com",
"expirationDate": 1915606281.479938,
"hostOnly": false,
"httpOnly": true,
"name": "enc",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "pdtSBn64BcPlV1fgT5fEORzPWaSvgJIV0ams%2B80hrkVpM4nZvO26j5vnkaKwAFIlso1F2zSKKBcouyBoDYn%2FhQ%3D%3D",
"id": 8
},
{
"domain": ".taobao.com",
"expirationDate": 1631871321.27879,
"hostOnly": false,
"httpOnly": false,
"name": "hng",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "CN%7Czh-CN%7CCNY%7C156",
"id": 9
},
{
"domain": ".taobao.com",
"expirationDate": 1615962531,
"hostOnly": false,
"httpOnly": false,
"name": "isg",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": false,
"storeId": "0",
"value": "BCIimN0ciRlbQ5XAUY7_l13Rc6iEcyaNuWQR9my7ShVAP8K5VAP2nY29b3vDL54l",
"id": 10
},
{
"domain": ".taobao.com",
"expirationDate": 1615962531,
"hostOnly": false,
"httpOnly": false,
"name": "l",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": false,
"storeId": "0",
"value": "eBPpW0I4OzsyEC1yBOfwourza77OSIRAguPzaNbMiOCP995p5HJfWZr-rcT9C3GVhs6pR3kKtDETBeYBqIv4n5U62j-la_kmn",
"id": 11
},
{
"domain": ".taobao.com",
"expirationDate": 1602957735.113468,
"hostOnly": false,
"httpOnly": false,
"name": "lgc",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "%5Cu60DF%5Cu4F60%5Cu5B89%5Cu597D25",
"id": 12
},
{
"domain": ".taobao.com",
"expirationDate": 1600970537.258928,
"hostOnly": false,
"httpOnly": false,
"name": "mt",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "ci=10_1",
"id": 13
},
{
"domain": ".taobao.com",
"expirationDate": 1631901735.113612,
"hostOnly": false,
"httpOnly": false,
"name": "sgcookie",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "E100dUyv3G0MpjySnKy0v91d0mA%2Fa2nR2g5oItfOcfHB1D6PA%2Fjkz55DXzHfBYBLwwHqM38etbD5hqW5r%2BxaWIeecA%3D%3D",
"id": 14
},
{
"domain": ".taobao.com",
"hostOnly": false,
"httpOnly": false,
"name": "t",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": true,
"storeId": "0",
"value": "34b12347933292e6e4250a15939f9e14",
"id": 15
},
{
"domain": ".taobao.com",
"expirationDate": 1615962531,
"hostOnly": false,
"httpOnly": false,
"name": "tfstk",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": false,
"storeId": "0",
"value": "cO4hB3OeekoISJogcwgQGGZvQ4sOZY8rnzz_7PbvyFNgH-4NiV-w0OQRsv23y31..",
"id": 16
},
{
"domain": ".taobao.com",
"expirationDate": 1631350282,
"hostOnly": false,
"httpOnly": false,
"name": "thw",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": false,
"storeId": "0",
"value": "cn",
"id": 17
},
{
"domain": ".taobao.com",
"expirationDate": 1631901735.113949,
"hostOnly": false,
"httpOnly": false,
"name": "tracknick",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "%5Cu60DF%5Cu4F60%5Cu5B89%5Cu597D25",
"id": 18
},
{
"domain": ".taobao.com",
"hostOnly": false,
"httpOnly": false,
"name": "uc1",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": true,
"storeId": "0",
"value": "cookie14=Uoe0bUwfX7d0SA%3D%3D",
"id": 19
},
{
"domain": ".taobao.com",
"expirationDate": 1602957735.113381,
"hostOnly": false,
"httpOnly": true,
"name": "uc3",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "vt3=F8dCufbMiVgb9pacA34%3D&nk2=rTA8Cvy7qWi6fA%3D%3D&id2=UU269ZzG9qTDdg%3D%3D&lg2=VFC%2FuZ9ayeYq2g%3D%3D",
"id": 20
},
{
"domain": ".taobao.com",
"expirationDate": 1602957735.11385,
"hostOnly": false,
"httpOnly": true,
"name": "uc4",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "nk4=0%40r8F0jBGF7Pz7qQEXlpLx9O3aFH%2F2&id4=0%40U2%2F9poVQih6CHOa6I%2B4oAQnBDpQi",
"id": 21
},
{
"domain": ".taobao.com",
"hostOnly": false,
"httpOnly": false,
"name": "v",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": true,
"storeId": "0",
"value": "0",
"id": 22
},
{
"domain": ".taobao.com",
"expirationDate": 1600423098,
"hostOnly": false,
"httpOnly": false,
"name": "xlly_s",
"path": "/",
"sameSite": "no_restriction",
"secure": true,
"session": false,
"storeId": "0",
"value": "1",
"id": 23
},
{
"domain": "s.taobao.com",
"hostOnly": true,
"httpOnly": false,
"name": "alitrackid",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": true,
"storeId": "0",
"value": "www.taobao.com",
"id": 24
},
{
"domain": "s.taobao.com",
"hostOnly": true,
"httpOnly": true,
"name": "JSESSIONID",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": true,
"storeId": "0",
"value": "672120E2C0B4AB8048A221FFD276B810",
"id": 25
},
{
"domain": "s.taobao.com",
"hostOnly": true,
"httpOnly": false,
"name": "lastalitrackid",
"path": "/",
"sameSite": "unspecified",
"secure": false,
"session": true,
"storeId": "0",
"value": "www.taobao.com",
"id": 26
}
]
def driver_chrome():
    """Create and configure the Chrome WebDriver used by the scraper.

    Reads the module-level ``driver_path`` (assigned in the ``__main__``
    section) for the chromedriver executable, and loads user-agent data from
    ``fake_useragent.json`` in the current working directory.

    Returns:
        A ``webdriver.Chrome`` instance with a 10-second implicit wait.
    """
    chrome_options = webdriver.ChromeOptions()
    # Experimental options: drop the "enable-automation" switch ...
    chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
    # ... and disable the automation extension (removes the developer warning).
    chrome_options.add_experimental_option('useAutomationExtension', False)
    # Headless mode is available but left switched off.
    # chrome_options.add_argument("--headless")
    chrome_options.add_argument("--disable-gpu")  # disable GPU acceleration
    # Load the user-agent data from a local file. Per the original author,
    # calling UserAgent(verify_ssl=False) directly may time out.
    ua = UserAgent(path=os.path.join(os.getcwd(), 'fake_useragent.json'))
    # Fix: interpolate the generated user-agent. The original passed the
    # literal text 'user-agent=ua.random', so Chrome sent "ua.random" as its UA.
    chrome_options.add_argument('user-agent={}'.format(ua.random))
    driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options)
    # Implicit wait, at most 10 seconds. It applies to the driver as a whole,
    # so setting it once here is enough.
    driver.implicitly_wait(10)
    return driver
def draw_down():
    """Drag the page's scrollbar down step by step so the page can be scraped.

    Uses the module-level ``driver``. The scroll position is set to 0.1, 0.3,
    0.5, 0.7 and 0.9 of the document's scroll height in turn, with a
    0.5-second pause before each step.
    """
    script_template = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f'
    tenths = 1
    while tenths < 11:
        # Pause first, then move the scrollbar a bit further down.
        time.sleep(0.5)
        fraction = tenths / 10  # scrollbar position as a fraction of the height
        driver.execute_script(script_template % fraction)
        tenths += 2
def serch_product():
    """Search for ``keyword`` and return the number of result pages.

    Uses the module-level ``driver`` and ``keyword``: types the keyword into
    the search box, clicks the search button, then reads the pager text.

    Returns:
        int: the first number found in the pager text.

    Raises:
        ValueError: if the pager text contains no number.
    """
    # Locate the search input box and type the keyword.
    driver.find_element_by_xpath('//*[@id="q"]').send_keys(keyword)
    # NOTE: the search button differs between the first search and later
    # ones. Only one search is performed here, so no check is needed. For a
    # later search the button would be '//*[@id="J_SearchForm"]/button'.
    driver.find_element_by_xpath('//*[@id="J_TSearchForm"]/div[1]/button').click()
    # Read the pager text, e.g. "共 100 页," and extract the number from it.
    pager_text = driver.find_element_by_xpath('//*[@id="mainsrp-pager"]/div/div/div/div[1]').text
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    match = re.search(r'(\d+)', pager_text)
    if match is None:
        raise ValueError('no page count found in pager text: {!r}'.format(pager_text))
    return int(match.group(1))
def get_product():
    """Print one dict per product card found on the current results page.

    Uses the module-level ``driver``. Each printed dict holds the card's
    title, price (with '元' appended), order count, image URL and shop name.
    """
    def text_of(node, xpath):
        # Text of the first element under `node` matching `xpath`.
        return node.find_element_by_xpath(xpath).text

    def attr_of(node, xpath, attribute):
        # Attribute value of the first element under `node` matching `xpath`.
        return node.find_element_by_xpath(xpath).get_attribute(attribute)

    # All product cards on the page: '//' matches at any depth, while the
    # leading '.' in the per-card paths below means "relative to this card".
    cards = driver.find_elements_by_xpath(
        '//div[@class="items"]/div[@class="item J_MouserOnverReq "]')
    for card in cards:
        title = text_of(card, './/div[@class="row row-2 title"]')
        cost = attr_of(card, './/a[@class="J_ClickStat"]', 'trace-price') + '元'
        orders = text_of(card, './/div[@class="deal-cnt"]')
        picture = attr_of(card, './/div[@class="pic"]/a/img', 'src')
        shop = text_of(card, './/div[@class="shop"]/a/span[2]')
        print({'标题': title, '价格': cost, '订单量': orders, '图片': picture, '名字': shop})
def next_page():
    """Scrape every result page for ``keyword`` (approach 1).

    Runs the search, scrapes the first results page, then loads each further
    page directly by URL (no "next page" click), scrolling down and scraping
    each one. Uses the module-level ``driver`` and ``keyword``.
    """
    # Local import so this function brings its own dependency into scope.
    from urllib.parse import quote

    pages = serch_product()
    draw_down()
    get_product()
    # Percent-encode the keyword so characters such as '&', '#' or spaces
    # cannot break the query string.
    encoded_keyword = quote(keyword, safe='')
    # range() visits num = 1 .. pages - 1 exactly like the original counter,
    # but also terminates when `pages` is < 1 (the original
    # `while num != pages` would loop forever in that case).
    for num in range(1, pages):
        # NOTE(review): `s` looks like the result offset with 44 items per
        # page -- confirm against the site.
        driver.get('https://s.taobao.com/search?q={}&s={}'.format(encoded_keyword, 44 * num))
        draw_down()
        get_product()
# Approach 2: fetch the page source and parse the data out of it.
def get_html_data():
    """Run the search and parse the first results page from its HTML source.

    Uses the module-level ``driver``. Every parsed product is printed and
    also collected into the returned list.

    Returns:
        list[dict]: one dict per product with the keys '标题' (title),
        '图片' (image URL), '价格' (price), '销量' (sales) and '地址'
        (location). Title and image are strings; price, sales and location
        are the raw lists returned by the XPath queries.
    """
    serch_product()  # first visit: performs the search
    html = driver.page_source  # source of the results page
    etr = etree.HTML(html)  # parse the HTML into an element tree
    divs = etr.xpath('//div[@class="grid g-clearfix"]/div[@class="items"]/div')
    shop_list = []
    for div in divs:
        images = div.xpath('.//a/img/@src')
        title_parts = div.xpath('.//div[@class="row row-2 title"]/a/text()')
        # Skip cards that lack an image or the second title text node; the
        # original indexed them unconditionally and raised IndexError.
        if not images or len(title_parts) < 2:
            continue
        price = div.xpath('.//div[@class="price g_price g_price-highlight"]/span/strong/text()')
        deal = div.xpath('.//div[@class="deal-cnt"]/text()')
        location = div.xpath('.//div[@class="location"]/text()')
        # Named `item` rather than `dict` so the builtin is not shadowed.
        item = {'标题': title_parts[1], '图片': images[0], '价格': price, '销量': deal, '地址': location}
        shop_list.append(item)
        print(item)
    # Fix: return the collected list. The original kept it local, so the
    # caller's print(shop_list) raised NameError.
    return shop_list


if __name__ == '__main__':
    # These names are module-level globals read by the functions above.
    # Raw string: '\i' and '\c' are invalid escape sequences in a plain literal.
    driver_path = r'D:\install\chromedriver.exe'
    url = "https://www.taobao.com/"
    keyword = "手机"
    driver = driver_chrome()
    try:
        driver.get(url)  # open the home page
        # Install the cookies; the 'sameSite' key is removed from each one
        # before it is handed to the driver.
        for item in cookies:
            item.pop('sameSite', None)
            driver.add_cookie(item)
        # Approach 1
        # next_page()
        # Approach 2
        shop_list = get_html_data()
        print(shop_list)
    finally:
        # Always shut down the browser and the chromedriver process.
        driver.quit()
本文地址:https://blog.csdn.net/qq_42994177/article/details/108670649