Scrapy: Inserting into a MySQL Database (Multiple Tables)
For MySQL basics, see the earlier post in this series: Saving Data to a MySQL Database — Writing a Crawler with Scrapy (Part 2).
First make sure the two dependencies are installed:

pip install pymysql
pip install Twisted
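Before wiring anything into Scrapy, it can be worth confirming that PyMySQL can actually reach your database. A minimal sanity check follows; the connection parameters are placeholders you would replace with your own:

import pymysql

# Placeholder credentials -- substitute your own host/user/password/schema.
conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       db='scrapy_demo', charset='utf8mb4')
try:
    with conn.cursor() as cursor:
        cursor.execute("SELECT VERSION()")
        print(cursor.fetchone())  # e.g. ('5.7.30',)
finally:
    conn.close()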
The task is a master/child insert: each project has one master row plus several child rows, and the child data comes back as JSON. The approach is as follows:
# Excerpt from the spider's parse method; it assumes `import json, time` and
# `from bs4 import BeautifulSoup` at the top of the spider module.
for page in range(0, int(pages)):
    # Grab every row of the listing with find_elements_by_xpath
    included_names = self.driver.find_elements_by_xpath('/html/body/div[5]/div[2]/div/div')
    item['titles'] = ""
    item['projId'] = ""
    item['projCode'] = ""
    item['projName'] = ""
    item['itemType'] = ""
    titles = ""
    # Walk the divs we just collected
    for i in included_names:
        # To query a node's children, prefix the XPath with "."
        title = i.find_element_by_xpath('./div[4]/div[2]/div/a').text
        print(title)
        projId = i.find_element_by_xpath('./div[4]/div[2]/div/a').get_attribute('id')
        # 1. Master table: have Selenium open the detail page in a new window
        href_main = "https://XXXXXXXXXX?projId=" + projId
        newwindow = 'window.open("' + href_main + '");'
        self.driver.execute_script(newwindow)
        # Move the window handle so we operate on the newly opened page
        self.driver.switch_to_window(self.driver.window_handles[1])
        self.driver.get(href_main)
        html = self.driver.page_source
        soup = BeautifulSoup(html, 'lxml')
        json_text = soup.find('pre')
        if json_text is not None and json_text.get_text().strip():
            text = json.loads(json_text.get_text())
            print("project ID: " + text['projId'])
            print("project code: " + text['projCode'])
            projCode = text['projCode']
            print("project name: " + text['projName'])
            projName = text['projName']
            item['projId'] = projId
            item['projCode'] = projCode
            item['projName'] = projName
            # 2. Child table 1
            href_sub1 = "https://xxxxxxxxxx?projId=" + projId
            self.driver.get(href_sub1)
            html = self.driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            json_text = soup.find('pre')
            if json_text is not None and json_text.get_text().strip():
                texts = json.loads(json_text.get_text())
                # The response is a JSON array: one element per child row
                for text in texts:
                    print("contract ID: " + text['contractId'])
                    print("contract code: " + text['contractCode'])
                    print("contract name: " + text['contractName'])
                    item['contractId'] = text['contractId']
                    item['contractCode'] = text['contractCode']
                    item['projId'] = projId
                    # item['itemType'] tells the pipeline which INSERT to run
                    item['itemType'] = "sub1"
                    # ---- runs only the INSERT for sub1 ----
                    yield item
            # 3. Child table 2
            href_sub2 = "https://xxxxxxxxxxxx?projCode=" + projCode
            self.driver.get(href_sub2)
            html = self.driver.page_source
            soup = BeautifulSoup(html, 'lxml')
            json_text = soup.find('pre')
            if json_text is not None and json_text.get_text().strip():
                texts = json.loads(json_text.get_text())
                # Again a JSON array with multiple child rows
                for text in texts:
                    print("order code: " + text['orderCode'])
                    print("contract code: " + str(text['contractCode']))
                    print("project code: " + text['projectCode'])
                    item['orderCode'] = text['orderCode']
                    item['contractCode'] = text['contractCode']
                    item['projectCode'] = projCode
                    # ---- runs only the INSERT for sub2 ----
                    item['itemType'] = "sub2"
                    yield item
            else:
                print("null")
            # Master row last
            item['itemType'] = "main"
            # ---- runs only the INSERT for main ----
            yield item
        time.sleep(1)  # pause only so you can watch it work; safe to delete
        # Close the window we opened and go back to the listing
        self.driver.close()
        self.driver.switch_to_window(self.driver.window_handles[0])
        time.sleep(3)  # pause only so you can watch it work; safe to delete
    print("page: " + str(page))
    if page != int(pages) - 1:
        # next page
        self.driver.find_element_by_xpath('/html/body/div[6]/div/div[1]/div[2]/ul/li[4]/button').click()
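The spider reuses a single item across all three yields, so the Item class has to declare every field up front. The original post doesn't show it, but a minimal items.py matching the field names used above might look like this (the class name ProjectItem is my own placeholder):

import scrapy

class ProjectItem(scrapy.Item):
    # master-table fields
    titles = scrapy.Field()
    projId = scrapy.Field()
    projCode = scrapy.Field()
    projName = scrapy.Field()
    # child table 1 (contracts)
    contractId = scrapy.Field()
    contractCode = scrapy.Field()
    # child table 2 (orders)
    orderCode = scrapy.Field()
    projectCode = scrapy.Field()
    # flag the pipeline uses to pick the INSERT statement
    itemType = scrapy.Field()

Because one item instance is reused, fields left over from an earlier child row can still be present when the main item is yielded; that is harmless here, since the pipeline only reads the columns belonging to the matching itemType.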
Then override the corresponding method in the pipeline:
from XX.db.dbhelper import DBHelper


class XXXXPipeline(object):
    # Open the database helper once, when the pipeline is created
    def __init__(self):
        self.db = DBHelper()

    def process_item(self, item, spider):
        # Dispatch on itemType so each item is inserted into the right table
        if item['itemType'] == "main":
            self.db.insertmain(item)
        if item['itemType'] == "sub1":
            self.db.insertsub1(item)
        if item['itemType'] == "sub2":
            self.db.insertsub2(item)
        return item
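DBHelper itself is not shown in the original post. Below is a minimal synchronous sketch of what it could look like; only the class name and the three method names come from the pipeline above, while the table names, column lists, and connection settings are placeholders to adapt to your schema:

import pymysql

class DBHelper(object):
    def __init__(self):
        # placeholder connection settings -- adjust to your environment
        self.conn = pymysql.connect(host='127.0.0.1', user='root',
                                    password='root', db='scrapy_demo',
                                    charset='utf8mb4')

    def _execute(self, sql, params):
        # run one INSERT and commit immediately
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
        self.conn.commit()

    def insertmain(self, item):
        # hypothetical master table
        self._execute(
            "INSERT INTO proj_main (projId, projCode, projName) VALUES (%s, %s, %s)",
            (item['projId'], item['projCode'], item['projName']))

    def insertsub1(self, item):
        # hypothetical contract child table
        self._execute(
            "INSERT INTO proj_contract (contractId, contractCode, projId) VALUES (%s, %s, %s)",
            (item['contractId'], item['contractCode'], item['projId']))

    def insertsub2(self, item):
        # hypothetical order child table
        self._execute(
            "INSERT INTO proj_order (orderCode, contractCode, projectCode) VALUES (%s, %s, %s)",
            (item['orderCode'], item['contractCode'], item['projectCode']))

Finally, remember that process_item only runs if the pipeline is registered in settings.py (the module path below is a guess, match it to your project layout):

ITEM_PIPELINES = {
    'XX.pipelines.XXXXPipeline': 300,
}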