Crawling every post URL from a cnblogs (博客园) blog, counting them, and getting the corresponding titles
1. Crawl every post URL from a cnblogs blog, count them, and get the corresponding titles
import re
import requests

# Example page URL: https://www.cnblogs.com/pythonywy/default.html?page=2
# Returns the post count and the list of post URLs
def func(url):
    lis = []
    count = 1
    while True:
        count_1 = len(lis)
        response = requests.get(f'{url}default.html?page={count}')
        response = response.text
        data_1 = re.findall(' href="(.*?)"', response, re.S)
        for a in data_1:  # type:str
            if a.startswith('http'):
                if a.endswith('html'):
                    if 'archive' not in a:
                        lis.append(a)
        count += 1
        lis = set(lis)
        lis = list(lis)
        count_2 = len(lis)
        # When a new page adds no new URLs, every post has been collected
        if count_1 == count_2:
            return count_2, lis

# Get the title of a single post
def func_2(url):
    response = requests.get(url)
    response = response.text
    name = re.findall('<title>(.*?)</title>', response)
    print(name)

# Chain the two steps together and output the result as a dictionary
def func_1_deco(func_1):
    def wrapper(*args, **kwargs):
        dic = dict()
        lis = func_1(*args, **kwargs)
        print(lis)
        count = lis[0]
        url_lis = lis[1]
        dic['count'] = count
        for url in url_lis:
            response = requests.get(url)
            response = response.text
            name = re.findall('<title>(.*?)</title>', response)
            name = name[0]
            name = name.split(' ')
            name = name[0]
            print(name)
            dic[name] = url
        return dic
    return wrapper

@func_1_deco
def func(url):
    lis = []
    count = 1
    while True:
        count_1 = len(lis)
        response = requests.get(f'{url}default.html?page={count}')
        response = response.text
        data_1 = re.findall(' href="(.*?)"', response, re.S)
        for a in data_1:  # type:str
            if a.startswith('http'):
                if a.endswith('html'):
                    if 'archive' not in a:
                        lis.append(a)
        count += 1
        lis = set(lis)
        lis = list(lis)
        count_2 = len(lis)
        if count_1 == count_2:
            return count_2, lis

func('blog homepage URL')  # Note: the URL must end with /; the dict has one count entry, every other key is a title mapped to its URL
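A minimal usage sketch, assuming the homepage https://www.cnblogs.com/pythonywy/ taken from the comment above (any cnblogs homepage ending with / should work the same way); the printed count and titles depend on the blog being crawled:

# Usage sketch: the homepage URL below is only the example from the comment above;
# substitute the homepage of the blog you want to crawl, keeping the trailing /.
result = func('https://www.cnblogs.com/pythonywy/')
print(result['count'])            # total number of posts found
for title, post_url in result.items():
    if title != 'count':          # every key except 'count' is a post title
        print(title, post_url)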