Scraping Cnblogs Search Results
程序员文章站
2023-09-29 08:54:04
Scrape the articles that a Cnblogs (博客园) search returns for a user-submitted keyword: ten articles per page, 50 pages in all, capturing each article's title, content, publication time, recommendation count, comment count, and view count.
The results are written to a SQL Server database; the code is as follows:
import requests
from lxml import etree
import pymssql
import time

# Connect to the SQL Server database
conn = pymssql.connect(host='127.0.0.1', user='sa', password='root', database='a', charset='utf8')
cursor = conn.cursor()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
    # The original post embedded the author's own (now unusable) session cookie here;
    # substitute a fresh .CnblogsCookie value captured from your own browser session.
    'Cookie': '<your cnblogs session cookie>'
}

"""
Fields collected: title, content, publication time, recommendation count, comment count, view count
"""

# Write a batch of scraped rows into the table named after the search keyword
def insert_sqlserver(key, data):
    try:
        cursor.executemany(
            "insert into {}(title,contents,create_time,view_count,comment_count,good_count) values(%s,%s,%s,%s,%s,%s)".format(key),
            data
        )
        conn.commit()
    except Exception as e:
        print(e, 'error while writing to the database')

# Walk all 50 result pages, then fetch the detail page of every article
def get_all(key, url):
    for i in range(1, 51):
        next_url = url + '&pageindex=%s' % i
        res = requests.get(next_url, headers=headers)
        response = etree.HTML(res.text)
        details = response.xpath('//div[@class="searchItem"]')
        data = []
        print(next_url)
        for detail in details:
            try:
                detail_url = detail.xpath('./h3/a[1]/@href')
                good = detail.xpath('./div/span[3]/text()')
                comments = ['0' if not detail.xpath('./div/span[4]/text()') else detail.xpath('./div/span[4]/text()')[0]]
                views = ['0' if not detail.xpath('./div/span[5]/text()') else detail.xpath('./div/span[5]/text()')[0]]
                res = requests.get(detail_url[0], headers=headers)
                response = etree.HTML(res.text)
                title = response.xpath('//a[@id="cb_post_title_url"]/text()')[0]
                contents = response.xpath('//div[@id="post_detail"]') if not response.xpath('//div[@class="postBody"]') else response.xpath('//div[@class="postBody"]')
                content = etree.tounicode(contents[0], method='html')
                create_time = response.xpath('//span[@id="post-date"]/text()')[0]
                print(detail_url[0], good[0], comments[0], views[0], title, create_time)
                data.append((title, content, create_time, views[0], comments[0], good[0]))
                time.sleep(2)
            except Exception as e:
                print(e, 'error while fetching data')
        insert_sqlserver(key, data)

# //*[@id="searchResult"]/div[2]/div[2]/h3/a

# Entry point: (re)create the data table for the keyword, then start scraping
def main(key, url):
    cursor.execute("""
        if object_id('%s','u') is not null
            drop table %s
        create table %s(
            id int not null primary key identity(1,1),
            title varchar(500),
            contents text,
            create_time datetime,
            view_count varchar(100),
            comment_count varchar(100),
            good_count varchar(100)
        )
    """ % (key, key, key))
    conn.commit()
    get_all(key, url)

if __name__ == '__main__':
    key = 'python'
    url = 'https://zzk.cnblogs.com/s?t=b&w=%s' % key
    main(key, url)
    conn.close()
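Before launching the full 50-page run, it can be worth dry-running the list-page XPath on a single results page. Below is a minimal sketch under the same assumptions as the script above (the zzk.cnblogs.com URL scheme and the searchItem class name are taken from that code and may no longer match the live site); the User-Agent here is a stand-in:

import requests
from lxml import etree

# A stand-in User-Agent; reuse the full headers dict from the scraper if needed
headers = {'User-Agent': 'Mozilla/5.0'}

# Fetch only the first results page for the keyword
test_url = 'https://zzk.cnblogs.com/s?t=b&w=python&pageindex=1'
res = requests.get(test_url, headers=headers)
tree = etree.HTML(res.text)

# Print each result's link so the XPath can be checked by eye
items = tree.xpath('//div[@class="searchItem"]')
print('items found:', len(items))
for item in items:
    links = item.xpath('./h3/a[1]/@href')
    if links:
        print(links[0])

If "items found" comes back as 0, the page markup has changed (or the request was served a login/anti-bot page) and the XPath expressions in the scraper need updating before the full run.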
Checking the database contents (the original post showed a screenshot of the populated table here):
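In place of the screenshot, here is a minimal sketch of an equivalent check, assuming the same connection parameters used above and the default 'python' table created by the script:

import pymssql

# Connect with the same credentials the scraper used
conn = pymssql.connect(host='127.0.0.1', user='sa', password='root', database='a', charset='utf8')
cursor = conn.cursor()

# Count the rows and peek at the first few titles
cursor.execute('select count(*) from python')
print('rows:', cursor.fetchone()[0])

cursor.execute('select top 5 title, create_time from python order by id')
for title, create_time in cursor.fetchall():
    print(create_time, title)

conn.close()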