python+BeautifulSoup+selenium+mysqldb完成数据抓取
程序员文章站
2022-07-13 21:31:09
...
# coding=utf-8
'''
Created on 2017年2月20日
@author: chenkai
'''
import MySQLdb
import sys
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.remote import webelement
from selenium.webdriver.remote.webelement import WebElement
'''
连接数据库
'''
def getConn(host='127.0.0.1', user='root', passwd='123456', port=3306):
    """Open a MySQL connection that uses the ``utf8`` client character set.

    The defaults reproduce the previously hard-coded local settings, so an
    argument-less ``getConn()`` call connects exactly as before. No default
    database is selected here; callers choose one themselves.

    Args:
        host: MySQL server host name or IP address.
        user: Account name used to log in.
        passwd: Password for ``user``.
        port: TCP port of the MySQL server.

    Returns:
        The connection object returned by ``MySQLdb.connect``.
    """
    # NOTE(review): the default credentials are development placeholders;
    # supply real ones from configuration rather than editing this file.
    return MySQLdb.connect(
        host=host,
        user=user,
        passwd=passwd,
        port=port,  # previously a duplicated literal; the local was unused
        charset="utf8",
    )
def getCursor(mysqlConn):
    """Return a new cursor created from the connection ``mysqlConn``."""
    cursor = mysqlConn.cursor()
    return cursor
def closeDBConnandCur(cur, mysqlConn):
    """Close the cursor, commit pending work, and close the connection.

    On success the calls happen in the order cursor close, commit,
    connection close. If closing the cursor or committing raises, the
    connection is still closed and the exception propagates to the caller.

    Args:
        cur: Cursor to close.
        mysqlConn: Connection that is committed and then closed.
    """
    try:
        cur.close()
        # Commit before closing the connection so the executed statements
        # are persisted.
        mysqlConn.commit()
    finally:
        # Always release the connection, even when the steps above fail.
        mysqlConn.close()
# ---------------------------------------------------------------------------
# Database session
# ---------------------------------------------------------------------------
# Open the connection and obtain a cursor from it.
mysqlConn = getConn()
cur = getCursor(mysqlConn)
# Make `test` the default database for the statements issued below.
cur.execute("use test")

# ---------------------------------------------------------------------------
# Browser
# ---------------------------------------------------------------------------
options = webdriver.ChromeOptions()
# Exclude the `ignore-certificate-errors` switch from the command-line
# switches that are passed to Chrome on start-up.
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences. The resulting value is identical to the previous literal.
# NOTE(review): `executable_path` / `chrome_options` are the keyword names of
# older Selenium releases - confirm against the installed Selenium version.
driver = webdriver.Chrome(
    executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
    chrome_options=options,
)
# Load the listing page that is scraped below.
driver.get('https://sanya.nuomi.com/326')
# Scrape the shop entries from the loaded page and insert one row per shop
# into `k_bdnm_shopinfo`. Only the currently loaded page is processed; the
# pager is not followed.
shopUrl = ""
shopName = ""
index = 1001  # incremented before use, so the first inserted shop_id is 1002
try:
    page = driver.page_source
    # `page` is already decoded text, so no `from_encoding` hint is passed
    # (Beautiful Soup ignores that argument for unicode markup and warns).
    soup = BeautifulSoup(page, 'html.parser')
    div_list = soup.find_all("div", class_="contentbox")
    for con in div_list:
        link = con.a
        title = con.h4
        href = link.get("href") if link is not None else None
        if not href or title is None:
            # Entry without a link or a title: nothing usable to store.
            continue
        index += 1
        # presumably the hrefs are protocol-relative ("//host/path"), hence
        # the "https:" prefix - verify against the live markup.
        # Encode to UTF-8 bytes so the text is stored in MySQL without mojibake.
        shopUrl = ("https:" + href).encode('utf-8')
        shopName = title.get_text().encode('utf-8')
        print("%s %s" % (shopUrl, shopName))
        print('insert into k_bdnm_shopinfo values(%d,%s,%s)' % (index, shopUrl, shopName))
        try:
            # Parameterised query: the driver quotes and escapes the scraped
            # values, so quotes in a shop name can neither break nor alter
            # the statement.
            cur.execute(
                "insert into k_bdnm_shopinfo values(%s,%s,%s)",
                (index, shopUrl, shopName),
            )
        except MySQLdb.Error as e:
            # Report the failure and continue with the next shop.
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Release the browser and the database even if scraping failed.
    try:
        driver.quit()
    finally:
        closeDBConnandCur(cur, mysqlConn)  # close the cursor and the connection
'''
数据表信息
'''
-- Target table of the scraper: one row per shop.
CREATE TABLE `k_bdnm_shopinfo` (
    `shop_id`   INT(11)      NOT NULL AUTO_INCREMENT,  -- primary key
    `shop_url`  VARCHAR(300) NOT NULL,                 -- link to the shop page
    `shop_name` VARCHAR(100) NOT NULL,                 -- displayed shop title
    PRIMARY KEY (`shop_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
'''
Created on 2017年2月20日
@author: chenkai
'''
import MySQLdb
import sys
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import time
from selenium.webdriver.remote import webelement
from selenium.webdriver.remote.webelement import WebElement
'''
连接数据库
'''
def getConn(host='127.0.0.1', user='root', passwd='123456', port=3306):
    """Open a MySQL connection that uses the ``utf8`` client character set.

    The defaults reproduce the previously hard-coded local settings, so an
    argument-less ``getConn()`` call connects exactly as before. No default
    database is selected here; callers choose one themselves.

    Args:
        host: MySQL server host name or IP address.
        user: Account name used to log in.
        passwd: Password for ``user``.
        port: TCP port of the MySQL server.

    Returns:
        The connection object returned by ``MySQLdb.connect``.
    """
    # NOTE(review): the default credentials are development placeholders;
    # supply real ones from configuration rather than editing this file.
    return MySQLdb.connect(
        host=host,
        user=user,
        passwd=passwd,
        port=port,  # previously a duplicated literal; the local was unused
        charset="utf8",
    )
def getCursor(mysqlConn):
    """Return a new cursor created from the connection ``mysqlConn``."""
    cursor = mysqlConn.cursor()
    return cursor
def closeDBConnandCur(cur, mysqlConn):
    """Close the cursor, commit pending work, and close the connection.

    On success the calls happen in the order cursor close, commit,
    connection close. If closing the cursor or committing raises, the
    connection is still closed and the exception propagates to the caller.

    Args:
        cur: Cursor to close.
        mysqlConn: Connection that is committed and then closed.
    """
    try:
        cur.close()
        # Commit before closing the connection so the executed statements
        # are persisted.
        mysqlConn.commit()
    finally:
        # Always release the connection, even when the steps above fail.
        mysqlConn.close()
# ---------------------------------------------------------------------------
# Database session
# ---------------------------------------------------------------------------
# Open the connection and obtain a cursor from it.
mysqlConn = getConn()
cur = getCursor(mysqlConn)
# Make `test` the default database for the statements issued below.
cur.execute("use test")

# ---------------------------------------------------------------------------
# Browser
# ---------------------------------------------------------------------------
options = webdriver.ChromeOptions()
# Exclude the `ignore-certificate-errors` switch from the command-line
# switches that are passed to Chrome on start-up.
options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
# Raw string: the Windows path contains backslashes that must not be read as
# escape sequences. The resulting value is identical to the previous literal.
# NOTE(review): `executable_path` / `chrome_options` are the keyword names of
# older Selenium releases - confirm against the installed Selenium version.
driver = webdriver.Chrome(
    executable_path=r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe",
    chrome_options=options,
)
# Load the listing page that is scraped below.
driver.get('https://sanya.nuomi.com/326')
# Scrape the shop entries from the loaded page and insert one row per shop
# into `k_bdnm_shopinfo`. Only the currently loaded page is processed; the
# pager is not followed.
shopUrl = ""
shopName = ""
index = 1001  # incremented before use, so the first inserted shop_id is 1002
try:
    page = driver.page_source
    # `page` is already decoded text, so no `from_encoding` hint is passed
    # (Beautiful Soup ignores that argument for unicode markup and warns).
    soup = BeautifulSoup(page, 'html.parser')
    div_list = soup.find_all("div", class_="contentbox")
    for con in div_list:
        link = con.a
        title = con.h4
        href = link.get("href") if link is not None else None
        if not href or title is None:
            # Entry without a link or a title: nothing usable to store.
            continue
        index += 1
        # presumably the hrefs are protocol-relative ("//host/path"), hence
        # the "https:" prefix - verify against the live markup.
        # Encode to UTF-8 bytes so the text is stored in MySQL without mojibake.
        shopUrl = ("https:" + href).encode('utf-8')
        shopName = title.get_text().encode('utf-8')
        print("%s %s" % (shopUrl, shopName))
        print('insert into k_bdnm_shopinfo values(%d,%s,%s)' % (index, shopUrl, shopName))
        try:
            # Parameterised query: the driver quotes and escapes the scraped
            # values, so quotes in a shop name can neither break nor alter
            # the statement.
            cur.execute(
                "insert into k_bdnm_shopinfo values(%s,%s,%s)",
                (index, shopUrl, shopName),
            )
        except MySQLdb.Error as e:
            # Report the failure and continue with the next shop.
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
finally:
    # Release the browser and the database even if scraping failed.
    try:
        driver.quit()
    finally:
        closeDBConnandCur(cur, mysqlConn)  # close the cursor and the connection
'''
数据表信息
'''
-- Target table of the scraper: one row per shop.
CREATE TABLE `k_bdnm_shopinfo` (
    `shop_id`   INT(11)      NOT NULL AUTO_INCREMENT,  -- primary key
    `shop_url`  VARCHAR(300) NOT NULL,                 -- link to the shop page
    `shop_name` VARCHAR(100) NOT NULL,                 -- displayed shop title
    PRIMARY KEY (`shop_id`)
) ENGINE = InnoDB DEFAULT CHARSET = utf8;
上一篇: redis启动流程介绍
下一篇: python中的开放运算