selenium+headless chrome爬虫的实现示例

程序员文章站 2022-03-27 11:47:44

python爬虫写起来非常快，虽然也可以用java，但是没有python来得简洁迅速。selenium在前面总结过，是一个自动化测试库。headless chrome是无界面的浏览器模式，和phant...

python爬虫写起来非常快，虽然也可以用java，但是没有python来得简洁迅速。

selenium在前面总结过，是一个自动化测试库。headless chrome是无界面的浏览器模式，和phantomjs类似。但是phantomjs往往会出现莫名的错误，而且速度没有headless chrome快。

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Configure Chrome to start in headless mode.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# NOTE(review): newer Selenium releases name this keyword `options`;
# confirm against the installed Selenium version.
driver = webdriver.Chrome(chrome_options=chrome_options)

爬虫的代码有一点需要注意，需要操作事件的时候最好不要直接用相应的方法，比如click。最好通过嵌入js脚本的方式进行调用。因为爬虫的代码执行速度很快，前端元素结构往往反应不过来，从而抛出元素不可见或者不存在的错误。

province_items = driver.find_element_by_class_name("city-province").find_elements_by_tag_name("a")

for province_item in province_items:
  # Trigger the click through injected JavaScript instead of calling
  # province_item.click() directly.
  driver.execute_script('arguments[0].click();', province_item)

下面来个例子，由于做电商平台，省、市、区的数据很好找，但是没有镇、街道的信息。这里通过爬虫从淘宝网将镇，街道的信息抓取下来

#! /usr/local/bin/python
# encoding: utf-8
 
'''
created on 2018年1月5日
 
@author: wulinfeng
@date: 2018-1-5
'''
 
import time
#import request

import pymysql
from selenium import webdriver
#from selenium.webdriver.common.desired_capabilities import desiredcapabilities
from selenium.webdriver.chrome.options import Options
 
def init_db():
  """Open the MySQL connection and store it in the module-level ``connection``.

  The host, user, password and database strings below look like placeholders
  and presumably must be replaced with real settings before running.
  """
  global connection
  # Keyword arguments make the meaning of each value explicit and do not
  # depend on the positional parameter order of pymysql.connect.
  connection = pymysql.connect(
    host="地址",
    user="用户名",
    password="密码",
    database="数据库",
    use_unicode=True,
    charset="utf8",
  )
 
def init_web_driver():
  """Start a headless Chrome session and store it in the module-level ``driver``."""
  global driver

  chrome_options = Options()
  chrome_options.add_argument('--headless')
  chrome_options.add_argument('--disable-gpu')

  # NOTE(review): newer Selenium releases name this keyword `options`;
  # confirm against the installed Selenium version.
  driver = webdriver.Chrome(chrome_options=chrome_options)
 
def close_db():
  """Close the module-level database ``connection`` created by ``init_db``."""
  connection.close()
  
def close_web_driver():
  """Quit the module-level browser ``driver`` created by ``init_web_driver``."""
  driver.quit()
  
def login_taobao(username, password):
  """Load the Taobao deliver_address page and log in with the given account.

  Args:
    username: text typed into the user-name input.
    password: text typed into the password input.
  """
  address_url = "https://member1.taobao.com/member/fresh/deliver_address.htm?spm=a1z08.2.0.0.7dad47611wnj46"
  driver.get(address_url)

  # NOTE(review): every element id below is lower-case; XPath attribute
  # matching is case-sensitive, so confirm they match the ids on the live page.

  # Choose the login method.
  login_method_switch = driver.find_element_by_xpath('//*[@id="j_quick2static"]')
  login_method_switch.click()

  # Log in: replace whatever is in the user-name box, then type the password.
  user_field = driver.find_element_by_xpath('//*[@id="tpl_username_1"]')
  user_field.clear()
  user_field.send_keys(username)

  password_field = driver.find_element_by_xpath('//*[@id="tpl_password_1"]')
  password_field.send_keys(password)

  submit_button = driver.find_element_by_xpath('//*[@id="j_submitstatic"]')
  submit_button.click()

  # Fixed pause after submitting (presumably to let the page react).
  time.sleep(0.5)
  
def get_data():
  """Click the ``city-title`` element, then crawl starting from the province list."""
  # Click the address selector through JavaScript instead of WebElement.click().
  driver.execute_script('arguments[0].click();', driver.find_element_by_id("city-title"))

  get_province_and_sub()
  
def get_province_and_sub():
  """Store every province listed in the selector, then crawl each one's cities.

  For each link under the ``city-province`` element, one row is inserted into
  ``region_province_t`` and committed; the link is then clicked and
  ``get_city_and_sub`` handles the level below. Links whose ``attr-id`` is
  ``"-1"`` are skipped (presumably placeholder entries).
  """
  # Get the province list.
  province_items = driver.find_element_by_class_name("city-province").find_elements_by_tag_name("a")

  # Parameterized statement: the scraped text is passed as data, so a quote
  # inside a name can neither break nor alter the SQL.
  sql = "insert into region_province_t (province_id,province) values(%s,%s)"

  for province_item in province_items:
    pid = province_item.get_attribute("attr-id")
    pname = province_item.get_attribute("title")
    if pid == "-1":
      print("continue province")
      continue

    params = (pid, pname)
    print(sql, params)
    # The context manager closes the cursor even if execute() raises.
    with connection.cursor() as cursor:
      cursor.execute(sql, params)
    connection.commit()

    # Click through JavaScript rather than province_item.click().
    driver.execute_script('arguments[0].click();', province_item)
    time.sleep(0.5)

    get_city_and_sub(pid)
    # Click tab 0 of the selector again before handling the next province.
    back_tab(0)
  
def get_city_and_sub(pid):
  """Store every city listed in the selector, then crawl each one's districts.

  Args:
    pid: province id written to the ``province_id`` column of each city row.

  Links whose ``attr-id`` is ``"-1"`` are skipped (presumably placeholder
  entries).
  """
  # Get the city list.
  city_items = driver.find_element_by_class_name("city-city").find_elements_by_tag_name("a")

  # Parameterized statement: the scraped text is passed as data, so a quote
  # inside a name can neither break nor alter the SQL.
  sql = "insert into region_city_t (city_id,city,province_id) values(%s,%s,%s)"

  for city_item in city_items:
    cid = city_item.get_attribute("attr-id")
    cname = city_item.get_attribute("title")
    if cid == "-1":
      print("continue city")
      continue

    params = (cid, cname, pid)
    print(sql, params)
    # The context manager closes the cursor even if execute() raises.
    with connection.cursor() as cursor:
      cursor.execute(sql, params)
    connection.commit()

    # Click through JavaScript rather than city_item.click().
    driver.execute_script('arguments[0].click();', city_item)
    time.sleep(1)

    get_area_and_sub(cid)
    # Click tab 1 of the selector again before handling the next city.
    back_tab(1)
    
def get_area_and_sub(cid):
  """Store every district listed in the selector, then crawl each one's towns.

  Args:
    cid: city id written to the ``city_id`` column of each district row.

  Links whose ``attr-id`` is ``"-1"`` are skipped (presumably placeholder
  entries).
  """
  # Get the county/district list.
  area_items = driver.find_element_by_class_name("city-district").find_elements_by_tag_name("a")

  # Parameterized statement: the scraped text is passed as data, so a quote
  # inside a name can neither break nor alter the SQL.
  sql = "insert into region_area_t (area_id,area,city_id) values(%s,%s,%s)"

  for area_item in area_items:
    aid = area_item.get_attribute("attr-id")
    aname = area_item.get_attribute("title")
    if aid == "-1":
      print("continue area")
      continue

    params = (aid, aname, cid)
    print(sql, params)
    # The context manager closes the cursor even if execute() raises.
    with connection.cursor() as cursor:
      cursor.execute(sql, params)
    connection.commit()

    # Click through JavaScript rather than area_item.click().
    driver.execute_script('arguments[0].click();', area_item)
    time.sleep(0.5)

    get_town_and_sub(aid)
    # Click tab 2 of the selector again before handling the next district.
    back_tab(2)
    
  
def get_town_and_sub(aid):
  """Store every town/street listed in the selector.

  Args:
    aid: district id written to the ``area_id`` column of each town row.

  Links whose ``attr-id`` is ``"-1"`` are skipped (presumably placeholder
  entries).
  """
  # Get the town list.
  town_items = driver.find_element_by_class_name("city-street").find_elements_by_tag_name("a")

  # Parameterized statement: the scraped text is passed as data, so a quote
  # inside a name can neither break nor alter the SQL.
  sql = "insert into region_town_t (town_id,town,area_id) values(%s,%s,%s)"

  for town_item in town_items:
    tid = town_item.get_attribute("attr-id")
    tname = town_item.get_attribute("title")
    if tid == "-1":
      print("continue town")
      continue

    params = (tid, tname, aid)
    print(sql, params)
    # The context manager closes the cursor even if execute() raises.
    with connection.cursor() as cursor:
      cursor.execute(sql, params)
    connection.commit()
    
def back_tab(index):
  """Click the link at position ``index`` inside the ``city-select-tab`` element.

  Args:
    index: list index of the tab link to click.
  """
  tab_links = driver.find_element_by_class_name("city-select-tab").find_elements_by_tag_name("a")
  target_tab = tab_links[index]
  # Click through JavaScript, then pause briefly before continuing.
  driver.execute_script('arguments[0].click();', target_tab)
  time.sleep(0.5)
  
# Run the crawl. The nested try/finally blocks guarantee that the browser and
# the database connection are released even when a step in between raises.
init_db()
try:
  init_web_driver()
  try:
    login_taobao("用户名", "密码")
    get_data()
  finally:
    close_web_driver()
finally:
  close_db()

到此这篇关于selenium+headless chrome爬虫的实现示例的文章就介绍到这了，更多相关selenium+headless chrome爬虫内容请搜索以前的文章或继续浏览下面的相关文章，希望大家以后多多支持！

上一篇：专门拍物品的拍照软件排行榜：手机也能拍出高品质大片

下一篇： tensorflow与numpy的版本兼容性问题的解决

selenium+headless chrome爬虫的实现示例

Java实现求子数组和的最大值算法示例

使用html2canvas.js实现页面截图并显示或上传的示例代码

Python实现手写一个类似django的web框架示例

JS实现线性表的顺序表示方法示例【经典数据结构】

Python实现的视频播放器功能完整示例

js实现网页的两个input标签内的数值加减(示例代码)

用python实现百度翻译的示例代码

php+js实现图片的上传、裁剪、预览、提交示例

Python实现获取前100组勾股数的方法示例

Python实现的堆排序算法示例