An Interrupt-Resumable Crawler System (Breadth-First, Single-Process Python)
I recently had the idea of doing some Chinese-language NLP research, but with no data to work with, I first had to write a simple crawler system to collect some. Before writing it, I briefly laid out the crawling logic and goals:
- Crawl articles whose main subject is news content
- Crawl breadth-first
- Single process, single thread (to avoid being blocked)
- Maintain a pool of already-crawled urls to reduce duplicate crawling
- Maintain a pool of failed urls so they can be retried
- Maintain a recrawl pool so that index pages such as the home page can be crawled repeatedly
- Save the current working state at regular intervals, including the state of the url pools
- Add an anti-interruption mechanism so the working state survives when the process is killed manually (sketched right after this list)
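The last point is not handled inside the crawler class shown further below, which only does periodic back-ups. A minimal sketch of one way to add it, assuming signal handlers that call the spider's back_up() method before exiting (install_backup_handlers is an illustrative name, not part of the original code):

import signal
import sys

def install_backup_handlers(spider):
    """Persist the spider's pools when the process is killed manually.

    Hypothetical helper: it assumes a spider object that exposes a
    back_up() method, like the Spider class shown further below.
    """
    def handler(signum, frame):
        print("[Info] Caught signal %d, backing up pools before exit" % signum)
        spider.back_up()
        sys.exit(0)

    # SIGINT covers Ctrl-C, SIGTERM covers a plain `kill <pid>`
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)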
Overall flow chart
The code
import hashlib
import time

import requests
from bs4 import BeautifulSoup


class Spider(object):
    """
    Breadth-first crawler that keeps three url pools on disk:
    1) a pool of seed urls that still need to be crawled
    2) a pool of urls that have been crawled successfully
    3) a pool of urls that failed and may be retried
    """

    def __init__(self, seeds_name, fail_file, suce_file, recrawl_file,
                 write_path, pool_path, url_pool_max=10000,
                 write_time=600, sleep_time=10):
        """
        With sleep_time = 10 the crawler makes at most
        24 * 60 * 60 / 10 = 8640 requests per day, which helps avoid being blocked.
        The pool files under pool_path are expected to exist already.
        """
        self.seeds_name = seeds_name
        self.fail_file = fail_file
        self.suce_file = suce_file
        self.pool_path = pool_path
        self.write_path = write_path
        self.url_pool_max = url_pool_max
        # the success file stays open in append mode for the whole run
        self.suce_loader = open(pool_path + suce_file, "a")
        # load at most url_pool_max seeds into memory
        self.pool_seeds = self.read_top_n(pool_path + seeds_name, url_pool_max)
        self.pool_fail = []
        self.pool_recrawl = dict()
        self.pool_suce = dict()
        # rebuild the success pool (md5 of url -> 0) from the success file
        with open(self.pool_path + suce_file) as file_obj:
            for line in file_obj:
                line = line.strip("\n\r").split("\t")
                key = self.hash_str(line[0])
                self.pool_suce[key] = 0
        # urls listed in the recrawl file (e.g. index pages) may be crawled repeatedly
        with open(self.pool_path + recrawl_file) as file_obj:
            for line in file_obj:
                line = line.strip("\n\r")
                key = self.hash_str(line)
                self.pool_recrawl[key] = 0
        self.sleep_time = sleep_time
        self.write_time = write_time
        self.default_write_time = write_time

    def read_top_n(self, file_path, n):
        """Read at most the first n lines of a file."""
        array = []
        with open(file_path) as file_obj:
            for line in file_obj:
                line = line.strip("\n\r")
                if len(array) >= n:
                    break
                array.append(line)
        return array

    def file_append(self, file_path, array):
        """Append a list of lines to a file."""
        with open(file_path, "a") as file_obj:
            for line in array:
                file_obj.write(line + "\n")

    def hash_str(self, a):
        """md5 of a url, used as the pool key and as the output file name."""
        hash_obj = hashlib.md5(a.encode("utf-8"))
        return hash_obj.hexdigest()

    def condition_on_url(self, url):
        """Only keep urls from the target news site."""
        if "news.sina.com.cn" in url:
            return True
        return False

    def back_up(self):
        """Persist the current working state so an interrupted run can resume."""
        # append failed urls to the fail pool file
        self.file_append(self.pool_path + self.fail_file, self.pool_fail)
        self.pool_fail = []
        # append the in-memory seeds back to the seeds file to survive interrupts
        self.file_append(self.pool_path + self.seeds_name, self.pool_seeds)
        print("[Info] Pool backup complete!\n")

    def run(self):
        """Main crawl loop."""
        delete_n = len(self.pool_seeds)
        while True:
            if len(self.pool_seeds) == 0:
                # the in-memory pool is empty: drop the already consumed head of
                # the seeds file, then reload the next batch
                # (delete_top_n removes the first n lines of a file; see below)
                res = delete_top_n(self.pool_path + self.seeds_name, delete_n)
                if res != 0:
                    print("[Info] Fail to delete top n of file: %s" % (self.pool_path + self.seeds_name))
                self.pool_seeds = self.read_top_n(self.pool_path + self.seeds_name, self.url_pool_max)
                delete_n = len(self.pool_seeds)
            # note: assumes at least one seed is left after reloading
            url = self.pool_seeds.pop(0)
            # periodic back-up to protect against a system interrupt
            if self.write_time == 0:
                self.write_time = self.default_write_time
                self.back_up()
            key = self.hash_str(url)
            # skip urls crawled before, unless they are in the recrawl pool
            if key in self.pool_suce and key not in self.pool_recrawl:
                continue
            try:
                res = requests.get(url)
            except Exception:
                continue
            if res.status_code != 200:
                self.pool_fail.append(url)
                continue
            # record the url in the success file and the success pool
            self.suce_loader.write(url + "\t" + key + "\n")
            self.pool_suce[key] = 0
            # save the crawled page under its md5 key
            content = res.text
            with open(self.write_path + key, "w", encoding="utf-8") as file_obj:
                file_obj.write(content)
            # extract new urls for breadth-first expansion
            soup = BeautifulSoup(content, "html.parser")
            with open(self.pool_path + self.seeds_name, "a") as file_obj:
                for link in soup.find_all("a"):
                    seeds = link.get("href")
                    if seeds is None:
                        continue
                    if self.condition_on_url(seeds):
                        # keep the in-memory pool bounded; overflow goes to disk
                        if len(self.pool_seeds) >= self.url_pool_max:
                            file_obj.write(seeds + "\n")
                        else:
                            self.pool_seeds.append(seeds)
            self.write_time -= 1
            time.sleep(self.sleep_time)
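The run() loop calls a delete_top_n helper that the post does not show; it is expected to drop the first n lines of the seeds file and return 0 on success. Below is a minimal sketch of such a helper plus one way to start the spider; the function body, file names and paths are illustrative assumptions, not values from the original code:

def delete_top_n(file_path, n):
    """Drop the first n lines of a file in place; return 0 on success.

    Hypothetical implementation of the helper used in Spider.run().
    """
    try:
        with open(file_path) as file_obj:
            lines = file_obj.readlines()
        with open(file_path, "w") as file_obj:
            file_obj.writelines(lines[n:])
        return 0
    except IOError:
        return 1


if __name__ == "__main__":
    # illustrative file layout; the pool files and directories must already exist
    spider = Spider(seeds_name="seeds.txt",
                    fail_file="fail.txt",
                    suce_file="success.txt",
                    recrawl_file="recrawl.txt",
                    write_path="./pages/",
                    pool_path="./pools/",
                    url_pool_max=10000,
                    write_time=600,
                    sleep_time=10)
    install_backup_handlers(spider)  # from the sketch earlier in the post
    spider.run()

This sketch rewrites the whole seeds file in memory, which is fine for a small crawl; a streaming rewrite would scale better if the seeds file grows large.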
I will share the results of crawling these articles in a later post, so stay tuned!