Xpath实战之爬取学习猿地的猿著(上)
程序员文章站
2022-05-02 22:03:23
...
爬取猿著(代码篇)
1、爬取地址:url = https://www.lmonkey.com/essence/
2、请求头池参考自该博客分享:https://www.cnblogs.com/huangyuechujiu/p/12893982.html
3、代码部分:(爬取的数据写入json文件中)
import requests, json
from lxml import etree
# 封装为类,便于管理
class Xp_test():
    """Scrape the lmonkey.com "essence" listing and store it as JSON.

    Instantiating the class runs the whole pipeline: download the page,
    cache the raw HTML on disk, extract one record per article entry and
    write the records to ``filepath``.
    """

    # Listing page to request.
    url = 'https://www.lmonkey.com/essence'
    # Headers sent with the request.
    headers = {
        'user-Agent': "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)"
    }
    # Seconds to wait for the server; without a timeout requests can block
    # indefinitely on an unresponsive host.
    timeout = 10
    # Scraped records; replaced by a list of dicts once parth_data() has run.
    data = ''
    # Where the raw HTML response is cached before parsing.
    htmlpath = './yq.html'
    # Destination of the JSON output.
    filepath = './yq.json'

    # One node per article entry in the listing.
    _ITEM_XPATH = ('//div[contains(@class,"old_content")]'
                   '//div[contains(@class,"list-group-item-action")]')
    # Field queries, evaluated relative to a single entry node.
    _AUTHOR_XPATH = './/strong/a/text()'
    _TITLE_XPATH = './/div[contains(@class,"flex-fill")]//div/text()'
    _LINK_XPATH = './/div[contains(@class,"flex-fill")]//a/@href'

    def __init__(self):
        """Fetch the listing page, then parse it and write the result.

        Nothing is written when the server answers with a status other than
        200 or when no article could be extracted from the page.
        """
        res = requests.get(url=self.url, headers=self.headers,
                           timeout=self.timeout)
        if res.status_code != 200:
            return
        # Keep a copy of the raw response on disk; parth_data() reads it back.
        with open(self.htmlpath, 'wb') as fp:
            fp.write(res.content)
        if self.parth_data():
            self.write_data()

    @staticmethod
    def _first_nonblank(values):
        """Return the first non-blank string in *values*, stripped, or ''."""
        for value in values:
            text = value.strip()
            if text:
                return text
        return ''

    def parth_data(self):
        """Extract author, title and link of every article from the cached HTML.

        Each entry node is queried on its own, so the three fields of a
        record always come from the same entry. Querying the whole document
        three times and joining the lists by index mis-pairs the fields as
        soon as one entry yields more than one match, and raises IndexError
        when the lists differ in length.

        Returns:
            True when at least one record was stored in ``self.data``,
            False otherwise.
        """
        html = etree.parse(self.htmlpath, etree.HTMLParser())
        records = []
        for item in html.xpath(self._ITEM_XPATH):
            title = self._first_nonblank(item.xpath(self._TITLE_XPATH))
            # NOTE(review): assumes the first link inside the entry is the
            # article link - confirm against the live page markup.
            link = self._first_nonblank(item.xpath(self._LINK_XPATH))
            if not title or not link:
                # Not a complete article entry; skip it rather than emit a
                # half-filled record.
                continue
            records.append({
                'author': self._first_nonblank(item.xpath(self._AUTHOR_XPATH)),
                'title': title,
                'titleurl': link,
            })
        self.data = records
        return bool(records)

    def write_data(self):
        """Print ``self.data`` and dump it to ``filepath`` as UTF-8 JSON."""
        print(self.data)
        with open(self.filepath, 'w', encoding='utf-8') as fp:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(self.data, fp, ensure_ascii=False)
# Instantiate the scraper; its constructor sends the request and, on a 200 response, parses and writes the data.
Xp_test()
4、爬取结果:(yq.json)
[{"author": "xxyd_h5x", "title": "JetBrains开发工具正版授权领取", "titleurl": "https://www.lmonkey.com/t/lpLmQeKLg"},
{"author": "IT头条", "title": "面向回家编程!GitHub标星两万的”Python抢票教程”,我们先帮你跑了一遍", "titleurl": "https://www.lmonkey.com/t/lpLmQeKLg"},
{"author": "duke", "title": "Python教程-一文读懂运算和运算符", "titleurl": "https://www.lmonkey.com/t/lpLmQeKLg"},
{"author": "dragonsz", "title": "CentOS7 下使用 rsync+sersync 配置文件自动同步", "titleurl": "https://www.lmonkey.com/t/user/15"},
{"author": "qingqi", "title": "Python 教程-代码测试", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
{"author": "jhxspy", "title": "Python教程-强制数据类型转换", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
{"author": "xxyd_python", "title": "Python 教程-从变量开始", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
{"author": "IT头条", "title": "Python 教程-Python 安装", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
{"author": "IT头条", "title": "Python 教程-了解Python", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
{"author": "GaiJoon", "title": "喊话 JavaScript 开发者:玩 DOM 也要专业范儿", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
{"author": "IT头条", "title": "1000 行 Python 代码脚本 bug,或影响上百篇学术论文", "titleurl": "https://www.lmonkey.com/t/2zLAPzMyW"},
{"author": "IT头条", "title": "生产环境下的LAMP环境搭建", "titleurl": "https://www.lmonkey.com/t/user/168547"},
{"author": "王炸", "title": "Golang语言的主要特性与发展的环境和影响因素", "titleurl": "https://www.lmonkey.com/t/G5yvRWXyp"},
{"author": "王炸", "title": "分享 10 个有用的 Laravel 5.8 集合辅助方法", "titleurl": "https://www.lmonkey.com/t/G5yvRWXyp"}]
下一篇: Java二进制.位运算.位移运算