欢迎您访问程序员文章站,本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

python爬虫记录(二)

程序员文章站 2022-05-01 18:38:19
python爬虫记录(二) #!/user/bin/python # -*- coding: UTF-8 -*- import urllib import urll...

python爬虫记录(二)

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# (fixed shebang: was "/user/bin/python", a typo for "/usr/bin/python")

import urllib
import urllib2
import lxml
import MySQLdb
from bs4 import BeautifulSoup

import httplib
# Force HTTP/1.0 so the server closes the connection after each response;
# works around keep-alive/chunked-transfer issues with some book sites.
httplib.HTTPConnection._http_vsn = 10
httplib.HTTPConnection._http_vsn_str = 'HTTP/1.0'

# Pretend to be a desktop Chrome browser so the site does not reject us.
user_agent = '''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) 
                AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36''' 
hdr = { 'User-Agent' : user_agent }

# NOTE(review): DB credentials are hard-coded; move them to a config
# file or environment variables before sharing this script.
db = MySQLdb.connect(host="localhost", port=3306, user="root", passwd="123456", db="xiaoshuo", charset="utf8")
cursor = db.cursor()

# Leftover prefix from an older string-concatenation insert path; kept
# only because commented-out code below still references it.
str_sql2 = '''INSERT INTO `xiaoshuo`.`chapter1` (`bookId`, `chapterNum`, 
            `chapterName`, `chapterUrl`) VALUES '''

# Parameterized insert used with executemany(); the %s placeholders are
# escaped by the MySQLdb driver, so scraped titles cannot inject SQL.
str_sql3 = '''INSERT INTO `xiaoshuo`.`chapter` (`bookId`, `chapterNum`, `chapterName`, `chapterUrl`) 
            VALUES (%s,%s,%s,%s)'''

def getUrlFromDbAndGetChapterInfo():
    global str_sql2

    str_sql1 = 'select bookId, bookName, url from book1'
    cursor1 = db.cursor()
    cursor1.execute(str_sql1)
    url_list = cursor1.fetchall() 
    cursor1.close()
    print "get book url list:", url_list

    for item in url_list:
        param=[]
        bookId = item[0]
        bookName = item[1].encode('utf-8')
        url = item[2].encode('utf-8')
        print "bookId:", bookId, "bookName:", bookName, "url:", url
        getChapterInfoAndSaveInDb(bookId, url, param)
        
        try:
            cursor.executemany(str_sql3,param)
            db.commit()
        except MySQLdb.Error, e:  
            sqlError =  "Error:%s" % str(e) 
            print "sqlError:", sqlError

def getChapterInfoAndSaveInDb(bookId, url, param):
    """Download a book's index page and append one
    (bookId, chapterNum, chapterName, chapterUrl) tuple per chapter to
    `param`. The list is mutated in place; nothing is written to the DB
    here -- the caller does the INSERT.

    The page is GBK-encoded; chapter links live in <dd> elements inside
    the container matched by id="list".
    """
    request = urllib2.Request(url, headers=hdr)
    response = urllib2.urlopen(request)
    html_data = response.read().decode('gbk')
    soup = BeautifulSoup(html_data, 'lxml')
    # NOTE(review): matching <p id="list"> looks odd -- biquge-style
    # templates typically use <div id="list">; confirm against the site.
    container_list = soup.find_all('p', id='list')

    for container in container_list:
        # Fixed: the original reused the name `item` for both loops, so
        # the inner loop shadowed the outer container variable.
        for section in container.find_all('dd'):
            link = section.find('a')  # hoisted: was looked up 3 times
            chapterUrl = "https://www.biquzi.com" + link.get('href')

            # Titles are usually "<num> <name>" separated by a space.
            tmp = link.get_text().split(' ')
            if len(tmp) > 1:
                chapterNum = tmp[0].encode("utf-8")
                chapterName = tmp[1].encode("utf-8")
            else:
                # No space: fall back to splitting once on the character
                # 章 ("chapter"), e.g. "第十章名字".
                str1 = link.get_text().split(u'章', 1)
                if len(str1) == 1:
                    # Neither pattern matched -- skip malformed entries.
                    continue
                chapterNum = str1[0].encode("utf-8") + "章"
                chapterName = str1[1].encode("utf-8")

            param.append((bookId, chapterNum, chapterName, chapterUrl))

if __name__ == "__main__":
    print ("<<<-----Start Get Book Chapter And Save In Db------>>")
    getUrlFromDbAndGetChapterInfo()

    # Release the module-level DB handles opened at import time.
    cursor.close()
    db.close()