Fixing dead processes in a Python multiprocessing crawler
I recently wrote a crawler that collects a fairly large amount of data. It ran continuously on a server, and I noticed that out of the ten processes I started, a few would always die partway through, which set the data collection back.
I later improved it so that each process's breakpoint is read back from the log and crawling resumes from there.
My workaround for the dying processes is a blunt one: every half hour, kill all the processes and restart them from their breakpoints. If you know a better approach, please leave a comment.
Usage: simply run run.py.
run.py
#-*- coding:utf-8 -*-
import os,time
import re

# Find the sqlpwd.py processes and kill them
def kill():
    # ps -e only lists the bare command name ("python"), so ps -ef is used
    # to match the full command line; grep -v grep drops the grep itself
    lines = os.popen('ps -ef | grep sqlpwd.py | grep -v grep').readlines()
    pa = re.compile(r'^\S+\s+(\d+)')   # the PID is the second column of ps -ef
    for line in lines:
        result = re.findall(pa,line)
        if result:                     # guard against lines with no PID match
            print(result[0])
            os.popen('kill -9 '+str(result[0]))

# The workers append 'FinishAll' to log.txt once every page range is done
def check():
    with open('log.txt','r') as f:
        lines = f.readlines()
    for line in lines:
        if 'FinishAll' in line:
            return True
    return False
'''
If a crawl has already started, resume from where the last scan stopped.
Rescan every half hour, to work around processes dying mid-scan.
'''
def runpwd():
    # No log yet: this is a fresh crawl, so start from the beginning
    if os.path.exists('log.txt') == False:
        os.system('python sqlpwd.py 0 &')
        time.sleep(1800)   # half an hour, matching the restart interval above
        kill()
    while True:
        print('------------- starting processes -------------------')
        os.system('python sqlpwd.py 1 &')   # mode 1: resume from the log
        time.sleep(1800)                    # let the workers run for half an hour
        print('------------- stopping processes -------------------')
        kill()
        if check() == True:
            print('All data collected')
            break
        else:
            pass

if __name__ == '__main__':
    runpwd()
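The resume step relies on the URL lines that the workers write to log.txt. Here is a minimal sketch of how a breakpoint can be recovered from that log; the sample lines are made up, but they follow the '%(asctime)s - %(message)s' format that sqlpwd.py (below) produces:

import re

# Hypothetical excerpt from log.txt, in the format sqlpwd.py writes
sample = '''2022-05-02 12:53:20 - https://www.mysql-password.com/database/20001
2022-05-02 12:53:25 - https://www.mysql-password.com/database/20002
2022-05-02 12:53:31 - https://www.mysql-password.com/database/20003'''

pa = re.compile(r'https://www.mysql-password.com/database/(\d+)')
pages = [int(p) for p in pa.findall(sample)]
print(max(pages))  # 20003 -- the last page attempted, so the crawl resumes there

Because each URL is logged before the page is fetched, resuming at the last logged page retries it rather than risking a skipped, half-finished request.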
sqlpwd.py
#-*- coding:utf-8 -*-
import requests
import re
import random
from bs4 import BeautifulSoup
from multiprocessing import Process,Pool,Lock
import multiprocessing
import time
import sys
import logging
def get_pwd(j,start,end):
    # Each worker sets up its own logger; all of them append to the same log.txt
    logger = logging.getLogger(__name__)
    logger.setLevel(level = logging.INFO)
    handler = logging.FileHandler("log.txt")
    handler.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(message)s')
    handler.setFormatter(formatter)
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    logger.addHandler(handler)
    logger.addHandler(console)
    user_agent=['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36',
        'Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10',
        'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
        'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
        ]
    headers={
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'User-Agent': random.choice(user_agent)   # rotate the UA per worker
        }
    logger.info('process '+str(j)+': crawling pages '+str(start)+' to '+str(end))
    for i in range(start,end):
        url = 'https://www.mysql-password.com/database/'+str(i)
        # The URL is logged before fetching, so the resume logic in check()
        # retries the last attempted page instead of skipping it
        logger.info(url)
        try:
            # time out stalled requests instead of hanging the worker forever
            r = requests.get(url,headers=headers,timeout=30)
            r.encoding = 'utf8'
            html = r.text
            soup = BeautifulSoup(html,'lxml')
        except Exception as e:
            print('process '+str(j)+' error:')
            print(e)
            logger.info('process '+str(j)+' error: '+str(e))
            time.sleep(10)
            continue
        filename = 'password'+str(j)+'.txt'
        try:
            # The target value sits in the third cell of each table row
            for tag in soup.find('tbody').find_all('tr'):
                ss = str(tag.find_all('td')[2].string)
                with open(filename,'a') as f:
                    f.write(ss+'\n')
        except Exception as e:
            print('process '+str(j)+' error:')
            print(e)
            logger.info('process '+str(j)+' error: '+str(e))
            time.sleep(10)
            continue
        logger.info('process '+str(j)+' finished page '+str(i))
def check():
    num = []
    mul = []
    with open('log.txt','r') as f:
        lines = f.readlines()
    # Collect every page number that was attempted, from the URLs in the log
    pa = re.compile(r'https://www.mysql-password.com/database/(\d+)')
    for line in lines:
        result = re.findall(pa,line)
        if len(result) != 0 :
            num.append(int(result[0]))
    m = 10
    start = 20000
    cha = int(((180000-start)/m))
    for j in range(0,m):
        numList = []
        info = []
        end = 20000 + cha * (j+1)
        for n in num:
            # start <= n < end, mirroring range(start, end) in run()
            if n >= start and n < end:
                numList.append(n)
        if len(numList) == 0:
            # Nothing logged for this chunk yet: restart it from the beginning
            mul.append([j, start, end])
            start = end
            continue
        print(str(j)+','+str(numList[-1])+','+str(end)+',')
        if numList[-1] + 1 == end:
            # The chunk's last page was reached; nothing left to resume
            start = end
            continue
        else:
            info.append(j)
            info.append(numList[-1])
            info.append(end)
            mul.append(info)
            start = end
    return mul
def run(type):
    if type == 0:
        m = 10
        pool = multiprocessing.Pool(m)
        # start is the first page to crawl; the 20000 and 180000 below mean
        # "crawl from page 20000 up to page 180000" -- adjust as needed
        start = 20000
        cha = int(((180000-start)/m))
        for j in range(0,m):
            end = 20000 + cha * (j+1)
            pool.apply_async(get_pwd,args=(j,start,end,))
            # range(start, end) stops at end-1, so the next chunk starts at end
            start = end
        pool.close()
        pool.join()
    else:
        mul = check()
        if len(mul) == 0:
            print('All processes have finished crawling')
            # Write the marker that run.py's check() looks for
            with open('log.txt','a') as f:
                f.write('FinishAll\n')
        else:
            pool = multiprocessing.Pool(len(mul))
            for mu in mul:
                pool.apply_async(get_pwd,args=(mu[0],mu[1],mu[2],))
            pool.close()
            pool.join()

if __name__ == '__main__':
    runtype = int(sys.argv[1])
    run(runtype)
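For reference, the chunking arithmetic in run(0) can be checked on its own; this sketch just reproduces the same constants in isolation:

# How run(0) splits pages 20000..180000 across m = 10 workers
m, start = 10, 20000
cha = (180000 - start) // m      # 16000 pages per worker
for j in range(m):
    end = 20000 + cha * (j + 1)
    print(j, start, end)         # worker j crawls range(start, end)
    start = end                  # the next chunk begins where this one stopped
# Worker 0 covers pages 20000-35999, worker 1 covers 36000-51999, and so on,
# up to worker 9, which stops at page 179999.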