# 浙江高职高校网站爬虫实例 — crawler example for a Zhejiang vocational college website
import requests
from lxml import etree
import os
def write_page(title, author, source, time, test):
    """Save one scraped notice to disk as <title>.txt.

    The file contains, one per line: title, author, source, timestamp,
    then the body text (``test``) with no trailing newline.
    """
    print("正在保存:" + title)
    # 'w' truncates any existing file; the with-block closes it automatically,
    # so the original's explicit file.close() was redundant and is removed.
    with open(title + '.txt', 'w', encoding='UTF-8') as file:
        file.write('\n'.join((title, author, source, time, test)))
def spider_data(url):
    """Fetch one notice page and save it to disk via write_page.

    Expects the page to have an <h2> title, a div.zz line that splits on
    whitespace into (author, source, date, time-of-day), and body
    paragraphs inside the element with id "vsb_content".
    """
    response = requests.get(url)
    response.encoding = 'utf-8'  # pages are served as UTF-8
    html = etree.HTML(response.text)
    title = ''.join(html.xpath('//h2/text()'))
    zz = ''.join(html.xpath("//div[@class='zz']/text()"))
    test = ''.join(html.xpath('//*[@id="vsb_content"]/p/text()'))
    # Split once instead of four times, as the original did.
    parts = zz.split()
    author = parts[0]
    source = parts[1]
    time = parts[2] + " " + parts[3]  # date + time-of-day
    write_page(title, author, source, time, test)
def spider_url(url):
    """Return absolute URLs for every notice linked from one listing page."""
    response = requests.get(url)
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    all_url = html.xpath("//div[@class='right-1']/ul/li/a/@href")
    # Renamed from `url` — the original rebound its own parameter to a list.
    urls = []
    for href in all_url:
        if href.startswith("../"):
            # Drop the ".." prefix and prepend the site root.  The original
            # used lstrip('..'), which strips a *character set* of dots and
            # would mangle an href like "..../x"; a fixed slice is exact.
            urls.append('http://www.zjitc.net' + href[2:])
    return urls
def spider_list_url():
    """Return the URLs of the listing pages to crawl.

    Starts from the first notice-list page, asks the user how many pages
    to crawl, and follows each page's "下页" (next page) link num-1 times.
    """
    url = 'http://www.zjitc.net/xwzx/tztg.htm'
    num = int(input("请输入要爬取得页数:"))
    url_list = [url]  # typo `url_lsit` in the original, fixed (local name)
    for _ in range(num - 1):
        response = requests.get(url)
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)
        # Anchor whose text contains 下页; its href is relative to either
        # /xwzx/ (first page) or /xwzx/tztg/ (subsequent pages).
        url_next = html.xpath("//a[contains(string(), '下页')]/@href")[0]
        if url_next.startswith('tztg'):
            url = 'http://www.zjitc.net/xwzx/' + url_next
        else:
            url = 'http://www.zjitc.net/xwzx/tztg/' + url_next
        url_list.append(url)
    return url_list
if __name__ == "__main__":
    # The paste had `if name=="main"` — the dunders were stripped by the
    # blog's markdown renderer; restored here so the script actually runs.
    # Walk every listing page, then every notice on it, saving each to a file.
    for page_url in spider_list_url():
        for notice_url in spider_url(page_url):
            spider_data(notice_url)
# 本文地址 (source article): https://blog.csdn.net/m0_46206005/article/details/107057968