欢迎您访问程序员文章站,本站旨在为大家提供、分享程序员计算机编程知识!
您现在的位置是: 首页  >  IT编程

浙江高职高校网站爬虫实例

程序员文章站 2023-11-29 12:07:46
import requestsfrom lxml import etreeimport osdef write_page(title,author,source,time,test):#写入文件print(“正在保存:”+title)with open(title+’.txt’,‘w’,encoding=‘UTF-8’) as file:file.write(title+’\n’)file.write(author+’\n’)file.write(source+’\n’)file.writ...

import requests
from lxml import etree
import os

def write_page(title, author, source, time, test):
    """Save one scraped notice to ``<title>.txt`` in the current directory.

    The file is UTF-8 encoded and contains the title, author, source and
    publication time on one line each, followed by the body text.

    Args:
        title: Notice title; also used (sanitized) as the file name.
        author: Author field of the notice.
        source: Source field of the notice.
        time: Publication time string.
        test: Body text of the notice.
    """
    print("正在保存:" + title)
    # The title comes from scraped HTML and may contain path separators or
    # other characters that are not valid in file names; replace them so
    # open() does not fail or write outside the current directory.
    unsafe = str.maketrans({ch: "_" for ch in '\\/:*?"<>|\r\n\t'})
    filename = title.translate(unsafe) or "untitled"
    # The with-statement closes the file; no explicit close() is needed.
    with open(filename + ".txt", "w", encoding="UTF-8") as file:
        file.write("\n".join((title, author, source, time, test)))
def spider_data(url):
    """Fetch one notice page, extract its fields and save them with write_page.

    The title is taken from the page's ``<h2>`` text, the body from the
    ``<p>`` children of the element with id ``vsb_content``, and the byline
    from the ``<div class="zz">`` text. The byline is split on whitespace:
    field 0 is used as the author, field 1 as the source, and fields 2 and 3
    (joined by a space) as the publication time.

    Args:
        url: Absolute URL of a single notice page.
    """
    # A timeout keeps one unresponsive page from hanging the whole crawl.
    response = requests.get(url, timeout=10)
    response.encoding = "utf-8"
    html = etree.HTML(response.text)

    title = "".join(html.xpath("//h2/text()"))
    zz = "".join(html.xpath("//div[@class='zz']/text()"))
    test = "".join(html.xpath('//*[@id="vsb_content"]/p/text()'))

    # Split the byline once and pad it, so a page with a short or missing
    # byline yields empty fields instead of raising IndexError.
    fields = zz.split()
    fields += [""] * (4 - len(fields))
    author, source = fields[0], fields[1]
    time = " ".join(fields[2:4]).strip()

    write_page(title, author, source, time, test)

def spider_url(url):
    """Collect the notice links found on one list page.

    Only ``href`` values that start with ``../`` inside
    ``<div class="right-1">`` list items are kept; each is resolved against
    ``url`` to produce an absolute URL.

    Args:
        url: Absolute URL of a notice list page.

    Returns:
        A list of absolute notice URLs, in page order.
    """
    from urllib.parse import urljoin

    response = requests.get(url, timeout=10)
    response.encoding = 'utf-8'
    html = etree.HTML(response.text)
    hrefs = html.xpath("//div[@class='right-1']/ul/li/a/@href")
    # urljoin resolves any number of leading "../" segments correctly;
    # str.lstrip('..') strips a character set, so "../../x" became "/../x".
    return [urljoin(url, href) for href in hrefs if href.startswith("../")]

def spider_list_url():
    """Ask how many list pages to crawl and return their URLs.

    Starts at the first notice list page and follows the link whose text
    contains "下页" up to ``num - 1`` times, where ``num`` is read from
    standard input.

    Returns:
        A list of absolute list-page URLs beginning with the first page.
        It is shorter than the requested count when a page has no
        "下页" link.

    Raises:
        ValueError: If the entered page count is not an integer.
    """
    from urllib.parse import urljoin

    url = 'http://www.zjitc.net/xwzx/tztg.htm'
    num = int(input("请输入要爬取得页数:"))
    page_urls = [url]
    for _ in range(num - 1):
        response = requests.get(url, timeout=10)
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)
        next_links = html.xpath("//a[contains(string(), '下页')]/@href")
        if not next_links:
            # Last page reached: stop instead of raising IndexError.
            break
        # The href is relative to the current page ("tztg/N.htm" on the
        # first page, "N.htm" on later ones); urljoin covers both forms.
        url = urljoin(url, next_links[0])
        page_urls.append(url)
    return page_urls
# Script entry point: walk every list page, then every notice linked from it.
if __name__ == "__main__":
    for list_page in spider_list_url():
        for notice_url in spider_url(list_page):
            spider_data(notice_url)

本文地址:https://blog.csdn.net/m0_46206005/article/details/107057968