
Python: Crawling Yousuu Novel Data and Storing It in a SQL Database


First, crawl the Yousuu site's 书库 (bookstore) listing.
Paging through the bookstore yields each book's basic information.

def get_url():
    url = "http://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page="
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    html = requests.get(url + "1", headers=headers)
    html.encoding = "UTF-8"
    js_info = xpathnode(html)

    js_info = js_info.get('Bookstore')
    account_info = js_info.get('total')              # total number of books listed
    pages = math.ceil(account_info / 20)             # 20 books per page, round up
    url = [url + str(i + 1) for i in range(pages)]   # list of page URLs waiting to be crawled
    return pages, url
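For example, if the site reports total = 1234 books, math.ceil(1234 / 20) = 62, so get_url returns 62 page URLs.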

def xpathnode(html):    # parse the JSON state embedded in the page
    tree = etree.HTML(html.text)
    node = tree.xpath('//script/text()')   # the first <script> carries the page state
    info = node[0][25:-122]                # strip the fixed-length JS wrapper around the JSON
    js_info = json.loads(info)
    return js_info
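The fixed slice [25:-122] silently breaks whenever the site changes its markup. A more robust sketch (assuming the first <script> wraps exactly one JSON object, with arbitrary JavaScript before and after it) lets json.JSONDecoder.raw_decode find the end of the object itself:

import json

def extract_state(script_text):    # hypothetical replacement for the slicing in xpathnode
    # Locate the first '{' and decode one JSON object starting there;
    # raw_decode stops at the object's end and ignores trailing JavaScript.
    start = script_text.find('{')
    if start == -1:
        raise ValueError('no JSON object found in script text')
    obj, _end = json.JSONDecoder().raw_decode(script_text[start:])
    return obj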

def crawl():    # the core routine
    pages, url_combine = get_url()
    conn = conn_sql()
    create_tab(conn)
    cursor = conn.cursor()
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    flag = 0
    for url in url_combine:       # page turning
        flag = flag + 1
        html = requests.get(url, headers=headers)
        html.encoding = "UTF-8"
        book_js_info = xpathnode(html)
        book_js_info = book_js_info.get('Bookstore')
        book_js_info = book_js_info.get('books')
        print('rate of progress: ' + str(round(flag * 100 / pages, 2)) + '%')
        for book in book_js_info:    # the last page may hold fewer than 20 books
            try:
                dt = {'bookId': book.get('bookId'), 'title': book.get('title'),
                      'score': book.get('score'), 'scorerCount': book.get('scorerCount'),
                      'author': book.get('author'),
                      'countWord': str(round(book.get('countWord') / 10000, 2)),            # in units of 10,000 characters
                      'tags': str(book.get('tags')).translate(str.maketrans('', '', '\'')), # strip quotes from the tag list
                      'updateAt': book.get('updateAt')[:10]}                                # keep the date part only
                store_to_sql(dt, conn, cursor)
            except Exception as e:
                print('error:', e)
    cursor.close()
    conn.close()

Storing into the SQL Server database (change the connection parameters to match your own setup). Here record, defined at the top of the full listing below, is today's date as a string (e.g. '20220503') and serves as the table name:

def store_to_sql(dt, connect, cursor):    # insert the record if it is not already present
    tbname = '[' + record + ']'
    ls = [(k, v) for k, v in dt.items() if v is not None]   # drop fields with missing values
    sentence = 'IF NOT EXISTS ( SELECT * FROM ' + tbname + ' WHERE bookId = ' + str(ls[0][1]) + ') \
                INSERT INTO %s (' % tbname + ','.join([i[0] for i in ls]) + ') VALUES (' + ','.join(repr(i[1]) for i in ls) + ');'
    cursor.execute(sentence)
    connect.commit()

def create_tab(conn):    # create today's table if it does not exist
    cursor = conn.cursor()
    sentence = 'if not exists (select * from sysobjects where id = object_id(\'' + record + '\') and OBJECTPROPERTY(id, \'IsUserTable\') = 1) \
    CREATE TABLE "' + record + '" \
    (NUM INT IDENTITY(1,1),\
    bookId INT NOT NULL,\
    title VARCHAR(100),\
    score VARCHAR(100),\
    scorerCount FLOAT,\
    author VARCHAR(50),\
    countWord FLOAT,\
    tags VARCHAR(100),\
    updateAt DATE)'
    cursor.execute(sentence)
    conn.commit()
    cursor.close()

def conn_sql():    # replace server/user/password with your own settings
    server = "127.0.0.1"
    user = "sa"
    password = "123456"
    conn = pymssql.connect(server, user, password, "novel")
    return conn

Crawl results: (screenshot of the populated table omitted)

Note: since a site update, Yousuu rate-limits crawlers, so you need to add a sleep() call to insert an artificial delay between successive requests, as sketched below.
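A minimal sketch is a throttled wrapper around requests.get, called in place of the bare requests.get inside crawl() (the one-second default is a guess; tune it to what the site tolerates):

import time
import requests

def fetch(url, headers, delay=1.0):    # hypothetical helper, not in the original code
    # Sleep before each request so successive page loads are spaced out.
    time.sleep(delay)
    resp = requests.get(url, headers=headers)
    resp.encoding = "UTF-8"
    return resp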

Full code:

# -*- coding: gbk -*-
import json
import requests
import math
from lxml import etree
import pymssql
import datetime

# Crawl the data and store it in the SQL database
record = str(datetime.date.today()).translate(str.maketrans('', '', '-'))  # today's date, e.g. '20220503'; used as the table name
def get_url():
    url = "http://www.yousuu.com/bookstore/?channel&classId&tag&countWord&status&update&sort&page="
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    html = requests.get(url + "1", headers=headers)
    html.encoding = "UTF-8"
    js_info = xpathnode(html)

    js_info = js_info.get('Bookstore')
    account_info = js_info.get('total')              # total number of books listed
    pages = math.ceil(account_info / 20)             # 20 books per page, round up
    url = [url + str(i + 1) for i in range(pages)]   # list of page URLs waiting to be crawled
    return pages, url

def xpathnode(html):    # parse the JSON state embedded in the page
    tree = etree.HTML(html.text)
    node = tree.xpath('//script/text()')   # the first <script> carries the page state
    info = node[0][25:-122]                # strip the fixed-length JS wrapper around the JSON
    js_info = json.loads(info)
    return js_info

def crawl():    # the core routine
    pages, url_combine = get_url()
    conn = conn_sql()
    create_tab(conn)
    cursor = conn.cursor()
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3534.4 Safari/537.36'}
    flag = 0
    for url in url_combine:       # page turning
        flag = flag + 1
        html = requests.get(url, headers=headers)
        html.encoding = "UTF-8"
        book_js_info = xpathnode(html)
        book_js_info = book_js_info.get('Bookstore')
        book_js_info = book_js_info.get('books')
        print('rate of progress: ' + str(round(flag * 100 / pages, 2)) + '%')
        for book in book_js_info:    # the last page may hold fewer than 20 books
            try:
                dt = {'bookId': book.get('bookId'), 'title': book.get('title'),
                      'score': book.get('score'), 'scorerCount': book.get('scorerCount'),
                      'author': book.get('author'),
                      'countWord': str(round(book.get('countWord') / 10000, 2)),            # in units of 10,000 characters
                      'tags': str(book.get('tags')).translate(str.maketrans('', '', '\'')), # strip quotes from the tag list
                      'updateAt': book.get('updateAt')[:10]}                                # keep the date part only
                store_to_sql(dt, conn, cursor)
            except Exception as e:
                print('error:', e)
    cursor.close()
    conn.close()

def store_to_sql(dt, connect, cursor):    # insert the record if it is not already present
    tbname = '[' + record + ']'
    ls = [(k, v) for k, v in dt.items() if v is not None]   # drop fields with missing values
    sentence = 'IF NOT EXISTS ( SELECT * FROM ' + tbname + ' WHERE bookId = ' + str(ls[0][1]) + ') \
                INSERT INTO %s (' % tbname + ','.join([i[0] for i in ls]) + ') VALUES (' + ','.join(repr(i[1]) for i in ls) + ');'
    cursor.execute(sentence)
    connect.commit()

def create_tab(conn):    # create today's table if it does not exist
    cursor = conn.cursor()
    sentence = 'if not exists (select * from sysobjects where id = object_id(\'' + record + '\') and OBJECTPROPERTY(id, \'IsUserTable\') = 1) \
    CREATE TABLE "' + record + '" \
    (NUM INT IDENTITY(1,1),\
    bookId INT NOT NULL,\
    title VARCHAR(100),\
    score VARCHAR(100),\
    scorerCount FLOAT,\
    author VARCHAR(50),\
    countWord FLOAT,\
    tags VARCHAR(100),\
    updateAt DATE)'
    cursor.execute(sentence)
    conn.commit()
    cursor.close()

def conn_sql():    # replace server/user/password with your own settings
    server = "127.0.0.1"
    user = "sa"
    password = "123456"
    conn = pymssql.connect(server, user, password, "novel")
    return conn



if __name__ == '__main__':
    crawl()

