Crawling 优书网 (yousuu.com) novel data with Python and storing it in a SQL Server database
First, crawl the 优书网 书库 (bookstore) listing, paging through it to collect each book's metadata:
def get_url():
    url = "http://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page="
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    html = requests.get(url + "1", headers=headers)
    html.encoding = "UTF-8"
    js_info = xpathnode(html)
    js_info = js_info.get('Bookstore')
    total = js_info.get('total')                      # total number of books in the library
    pages = math.ceil(total / 20)                     # 20 books per page, rounded up
    urls = [url + str(i + 1) for i in range(pages)]   # list of page URLs to crawl
    return pages, urls
def xpathnode(html):  # extract the JSON state embedded in the page
    tree = etree.HTML(html.text)
    node = tree.xpath('//script/text()')  # the first script block holds the page state
    info = node[0][25:-122]               # strip the fixed-length JS wrapper around the JSON payload
    js_info = json.loads(info)
    return js_info
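The hard-coded slice [25:-122] depends on the exact JavaScript wrapper the site emits and silently breaks the moment the template changes. A more resilient sketch locates the state by a marker string and lets the JSON parser find its own end; the marker name window.__INITIAL_STATE__= is an assumption about the page source, not something confirmed here:

import json

def extract_state(html_text):
    marker = 'window.__INITIAL_STATE__='  # hypothetical marker; check the real page source
    start = html_text.find(marker)
    if start == -1:
        raise ValueError('embedded state not found')
    # raw_decode parses exactly one JSON value and ignores the JS that follows it
    state, _ = json.JSONDecoder().raw_decode(html_text[start + len(marker):].lstrip())
    return state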
def crawl():  # the core: fetch every page and store each book
    pages, url_combine = get_url()
    conn = conn_sql()
    create_tab(conn)
    cursor = conn.cursor()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    flag = 0
    for url in url_combine:  # page turning
        flag += 1
        html = requests.get(url, headers=headers)
        html.encoding = "UTF-8"
        books = xpathnode(html).get('Bookstore').get('books')
        print('progress: ' + str(round(flag * 100 / pages, 2)) + '%')
        for book in books:  # up to 20 books per page
            try:
                dt = {'bookId': book.get('bookId'),
                      'title': book.get('title'),
                      'score': book.get('score'),
                      'scorerCount': book.get('scorerCount'),
                      'author': book.get('author'),
                      'countWord': str(round(book.get('countWord') / 10000, 2)),           # word count in units of 10,000
                      'tags': str(book.get('tags')).translate(str.maketrans('', '', "'")),  # drop quotes so the list embeds in SQL
                      'updateAt': book.get('updateAt')[:10]}                                # keep only the date part
                store_to_sql(dt, conn, cursor)
            except Exception as e:
                print('error:', e)
    cursor.close()
    conn.close()
Storing into the SQL Server database (change the connection settings to your own when using this):
def store_to_sql(dt, connect, cursor):  # insert the book only if its bookId is not already present
    tbname = '[' + record + ']'
    ls = [(k, v) for k, v in dt.items() if v is not None]  # skip fields the page did not provide
    sentence = ('IF NOT EXISTS (SELECT * FROM ' + tbname + ' WHERE bookId = ' + str(ls[0][1]) + ') '
                'INSERT INTO ' + tbname + ' (' + ','.join(i[0] for i in ls) +
                ') VALUES (' + ','.join(repr(i[1]) for i in ls) + ');')
    cursor.execute(sentence)
    connect.commit()
def create_tab(conn):  # create the day's table if it does not already exist
    cursor = conn.cursor()
    sentence = ("if not exists (select * from sysobjects "
                "where id = object_id('" + record + "') and OBJECTPROPERTY(id, 'IsUserTable') = 1) "
                'CREATE TABLE [' + record + '] ('
                'NUM int IDENTITY(1,1), '
                'bookId INT NOT NULL, '
                'title VARCHAR(100), '
                'score VARCHAR(100), '
                'scorerCount float, '
                'author VARCHAR(50), '
                'countWord float, '
                'tags VARCHAR(100), '
                'updateAt date)')
    cursor.execute(sentence)
    conn.commit()
    cursor.close()
def conn_sql():  # connect to the local SQL Server instance; adjust to your own setup
    server = "127.0.0.1"
    user = "sa"
    password = "123456"
    conn = pymssql.connect(server, user, password, "novel")
    return conn
Crawl results:
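To spot-check what actually landed in the table, a small read-back sketch (assuming the date-named table and the connection settings from above):

import datetime
import pymssql

record = str(datetime.date.today()).translate(str.maketrans('', '', '-'))
conn = pymssql.connect("127.0.0.1", "sa", "123456", "novel")
cursor = conn.cursor()
cursor.execute('SELECT TOP 10 bookId, title, score, author FROM [' + record + ']')
for row in cursor.fetchall():  # print the first ten stored books
    print(row)
conn.close()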
Note: since a site update, 优书网 rate-limits crawlers, so add a sleep() call to put an interval between successive requests.
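A minimal way to do that is to pause before every page request. The one-second delay below is an assumption; the site does not document what interval it tolerates:

import time
import requests

def polite_get(url, headers, delay=1.0):
    time.sleep(delay)  # assumed interval; tune until the site stops rejecting requests
    return requests.get(url, headers=headers)

Calling polite_get in place of requests.get inside crawl() is enough; the full listing below does the same thing with an inline time.sleep(1).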
Full code:
# -*- coding: gbk -*-
import json
import math
import time
import datetime
import requests
from lxml import etree
import pymssql

# Crawl the data and store it into the SQL Server database.
record = str(datetime.date.today()).translate(str.maketrans('', '', '-'))  # today's date, e.g. 20220503, used as the table name
def get_url():
    url = "http://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page="
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    html = requests.get(url + "1", headers=headers)
    html.encoding = "UTF-8"
    js_info = xpathnode(html)
    js_info = js_info.get('Bookstore')
    total = js_info.get('total')                      # total number of books in the library
    pages = math.ceil(total / 20)                     # 20 books per page, rounded up
    urls = [url + str(i + 1) for i in range(pages)]   # list of page URLs to crawl
    return pages, urls
def xpathnode(html):  # extract the JSON state embedded in the page
    tree = etree.HTML(html.text)
    node = tree.xpath('//script/text()')  # the first script block holds the page state
    info = node[0][25:-122]               # strip the fixed-length JS wrapper around the JSON payload
    js_info = json.loads(info)
    return js_info
def crawl():  # the core: fetch every page and store each book
    pages, url_combine = get_url()
    conn = conn_sql()
    create_tab(conn)
    cursor = conn.cursor()
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    flag = 0
    for url in url_combine:  # page turning
        flag += 1
        time.sleep(1)  # throttle requests; see the note above
        html = requests.get(url, headers=headers)
        html.encoding = "UTF-8"
        books = xpathnode(html).get('Bookstore').get('books')
        print('progress: ' + str(round(flag * 100 / pages, 2)) + '%')
        for book in books:  # up to 20 books per page
            try:
                dt = {'bookId': book.get('bookId'),
                      'title': book.get('title'),
                      'score': book.get('score'),
                      'scorerCount': book.get('scorerCount'),
                      'author': book.get('author'),
                      'countWord': str(round(book.get('countWord') / 10000, 2)),           # word count in units of 10,000
                      'tags': str(book.get('tags')).translate(str.maketrans('', '', "'")),  # drop quotes so the list embeds in SQL
                      'updateAt': book.get('updateAt')[:10]}                                # keep only the date part
                store_to_sql(dt, conn, cursor)
            except Exception as e:
                print('error:', e)
    cursor.close()
    conn.close()
def store_to_sql(dt, connect, cursor):  # insert the book only if its bookId is not already present
    tbname = '[' + record + ']'
    ls = [(k, v) for k, v in dt.items() if v is not None]  # skip fields the page did not provide
    sentence = ('IF NOT EXISTS (SELECT * FROM ' + tbname + ' WHERE bookId = ' + str(ls[0][1]) + ') '
                'INSERT INTO ' + tbname + ' (' + ','.join(i[0] for i in ls) +
                ') VALUES (' + ','.join(repr(i[1]) for i in ls) + ');')
    cursor.execute(sentence)
    connect.commit()
def create_tab(conn):  # create the day's table if it does not already exist
    cursor = conn.cursor()
    sentence = ("if not exists (select * from sysobjects "
                "where id = object_id('" + record + "') and OBJECTPROPERTY(id, 'IsUserTable') = 1) "
                'CREATE TABLE [' + record + '] ('
                'NUM int IDENTITY(1,1), '
                'bookId INT NOT NULL, '
                'title VARCHAR(100), '
                'score VARCHAR(100), '
                'scorerCount float, '
                'author VARCHAR(50), '
                'countWord float, '
                'tags VARCHAR(100), '
                'updateAt date)')
    cursor.execute(sentence)
    conn.commit()
    cursor.close()
def conn_sql():  # connect to the local SQL Server instance; adjust to your own setup
    server = "127.0.0.1"
    user = "sa"
    password = "123456"
    conn = pymssql.connect(server, user, password, "novel")
    return conn
if __name__ == '__main__':
    crawl()