Python3爬虫（十四）验证码处理

程序员文章站 2022-04-09 20:59:52

Infi-chu: http://www.cnblogs.com/Infi-chu/ 一、图形验证码识别1.使用tesserocr # 在本地存储一张验证码的图片做测试image = Image.open('test.jpg')result = tesserocr.image_to_text(ima ......

Infi-chu:

http://www.cnblogs.com/Infi-chu/

一、图形验证码识别
1.使用tesserocr

import tesserocr
from PIL import Image

# Open a captcha image stored locally and run OCR on the loaded image as a test.
image = Image.open('test.jpg')
result = tesserocr.image_to_text(image)
print(result)

# Alternative: hand tesserocr the image file path and print the text it returns.
import tesserocr
print(tesserocr.file_to_text('test.jpg'))

2.处理验证码图片
convert()方法，可将图片转化为灰度图像、二值化图像

image = image.convert('L')  # convert the image to mode 'L' (grayscale)
image.show()
image = image.convert('1')  # convert to mode '1' (bilevel). NOTE(review): the article states the default binarisation threshold is 127; Pillow dithers by default, so confirm against the Pillow docs

# Convert the image to grayscale first, then binarise it with a custom
# threshold instead of relying on convert('1').
image = image.convert('L')
threshold = 80  # grey levels below this map to 0, all others to 1
# Lookup table with one entry per possible grey level (0-255), as expected
# by Image.point().
table = [0 if level < threshold else 1 for level in range(256)]
image = image.point(table, '1')
image.show()  # the article notes the image looks clearer after this step
result = tesserocr.image_to_text(image)
print(result)

二、滑动验证码识别
滑动验证码类似拼图：需要拖动滑块，把拼图块移动到图片中的缺口位置才能通过验证
1.滑动验证码特点：
防模拟
防伪造
防暴力

2.如何识别：
采用浏览器模拟验证

3.初始化：

EMAIL = 'test@test.com'
PASSWORD = '123456'


class CrackGeetest():
    """Drives a browser against the Geetest login page to solve its slider captcha."""

    def __init__(self):
        """Start a Chrome session and store the login page URL and credentials."""
        self.url = 'https://account.geetest.com/login'
        # The Selenium driver class is spelled ``Chrome``.
        self.browser = webdriver.Chrome()
        # Explicit wait shared by the element lookups; 20 is the timeout
        # passed to WebDriverWait.
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD

4.模拟点击：

def get_geetest_button(self):
    """Wait for the initial Geetest verification button and return it.

    Returns:
        The element with class ``geetest_radar_tip`` once ``self.wait``
        reports it as clickable.
    """
    # Selenium's locator-strategy class is ``By`` (``BY`` is undefined).
    button = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip'))
    )
    return button
# Click the verification button (excerpt from inside a method: ``self`` is the cracker instance).
button = self.get_geetest_button()
button.click()

5.识别缺口：
首先对比原图和现图，利用selenium选取图片元素，得到位置和size，然后获取截图

# 
def position(self):
    """Return the on-page bounding box of the Geetest captcha image.

    Waits until the element with class ``geetest_canvas_img`` is present,
    then derives the box from the element's ``location`` and ``size``.

    Returns:
        tuple: ``(top, bottom, left, right)`` in the coordinates reported by
        the element's ``location``.
    """
    img = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img'))
    )
    # Fixed pause before reading the geometry — presumably to let the
    # captcha finish rendering; confirm whether it is still needed.
    time.sleep(2)
    location = img.location
    size = img.size
    top = location['y']
    bottom = location['y'] + size['height']
    left = location['x']
    right = location['x'] + size['width']
    return (top, bottom, left, right)
def get_geetest_image(self, name='captcha.png'):
    """Crop the captcha image out of a full-page screenshot and return it.

    Args:
        name: File name for the captcha image. It is accepted for
            compatibility but not used: nothing is written to disk here.

    Returns:
        The cropped captcha region of the screenshot.
    """
    # The Geetest bounding-box helper in this class is ``position``.
    top, bottom, left, right = self.position()
    print('验证码位置', top, bottom, left, right)
    screenshot = self.get_screenshot()
    # crop() takes the box as (left, upper, right, lower).
    captcha = screenshot.crop((left, top, right, bottom))
    # Callers compare two captures, so the crop must be handed back.
    return captcha
def get_slider(self):
    """Wait for the Geetest slider handle and return it.

    Returns:
        The element with class ``geetest_slider_button`` once ``self.wait``
        reports it as clickable.
    """
    slider = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button'))
    )
    return slider
# Click the slider handle; per the article this is what makes the gap show up in the captcha image.
slider = self.get_slider()
slider.click()
# Then call get_geetest_image() again to capture the second picture; the two captures are referred to as img1 and img2.
'''
对比图像的缺口，需要遍历图片的每一个坐标点，获取两张图片对应像素点的RGB数据，如果差距在一定范围内，则代表两个像素相同，接着继续对比下一个像素点。如果差距在一定范围之外，则说明不是相同的像素点，则该位置就是缺口位置
'''
def is_pixel_equal(self, img1, img2, x, y):
    """Tell whether two images hold (nearly) the same colour at one point.

    Args:
        img1: First image; must provide ``load()`` returning pixel access.
        img2: Second image, same requirement.
        x: Column of the pixel to compare.
        y: Row of the pixel to compare.

    Returns:
        bool: True when each of the first three channels differs by less
        than 60 between the two images, otherwise False.
    """
    tolerance = 60
    first = img1.load()[x, y]
    second = img2.load()[x, y]
    # Compare channel by channel and stop at the first one that is too far
    # apart.
    for channel in range(3):
        if abs(first[channel] - second[channel]) >= tolerance:
            return False
    return True

def get_gap(self, img1, img2):
    """Find the x offset of the gap by comparing two captcha images.

    Scans column by column, starting at x = 60, and returns the first
    column containing a pixel that ``self.is_pixel_equal`` reports as
    different between the two images.

    Args:
        img1: Reference image; its ``size`` (width, height) bounds the scan.
        img2: Image to compare against ``img1``.

    Returns:
        int: The x coordinate of the first differing column, or 60 when no
        difference is found (including when the image is 60 px wide or less).
    """
    # Columns left of this offset are skipped — presumably because the
    # slider piece itself sits there; confirm against the captcha layout.
    left = 60
    width, height = img1.size[0], img1.size[1]
    for i in range(left, width):
        for j in range(height):
            # Pass both images as separate arguments.
            if not self.is_pixel_equal(img1, img2, i, j):
                return i
    return left

6.模拟拖动：

def get_track(distance):
    """Build a list of per-step x offsets that covers ``distance``.

    The motion is modelled as uniformly accelerated in time steps of 0.2:
    acceleration 2 during the first 4/5 of the distance, then -3 for the
    remainder. Each step's displacement is rounded to an integer.

    Args:
        distance: Total distance to cover. The original snippet read this
            name without ever receiving it, so it is now an explicit
            parameter.

    Returns:
        list[int]: Rounded displacement of every step. Empty when
        ``distance`` is zero or negative. Because each step is rounded,
        the sum of the list only approximates ``distance``.
    """
    track = []
    current = 0
    mid = distance * 4 / 5  # switch from accelerating to decelerating here
    t = 0.2
    v = 0
    while current < distance:
        a = 2 if current < mid else -3
        v0 = v
        v = v0 + a * t
        # s = v0*t + a*t**2/2; note that ``^`` is bitwise XOR in Python,
        # not exponentiation.
        move = v0 * t + 1 / 2 * a * t ** 2
        current += move
        track.append(round(move))
    return track

def move_to_gap(self, slider, tracks):
    """Drag the slider along the given track, then release it.

    Args:
        slider: The slider element to press and hold.
        tracks: Iterable of horizontal offsets, applied one move at a time.
    """
    ActionChains(self.browser).click_and_hold(slider).perform()
    for x in tracks:
        # Each step is performed as its own action, moving only along x.
        ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
    # Short pause before letting go of the mouse button.
    time.sleep(0.3)
    ActionChains(self.browser).release().perform()

三、点触验证码识别
1.和12306的验证码类似
2.思路：
文字识别、图像识别
3.使用超级鹰平台识别
修改Python API

import requests
from hashlib import md5

class Chaojiying:
    """Small HTTP client for the Chaojiying captcha-recognition service."""

    def __init__(self, username, password, soft_id):
        """Store the credentials and the request defaults.

        Args:
            username: Chaojiying account name.
            password: Account password in plain text; only its MD5 hex
                digest is kept and sent.
            soft_id: Software ID sent with every request.
        """
        self.username = username
        self.password = md5(password.encode('utf-8')).hexdigest()
        self.soft_id = soft_id
        # Form fields included in every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        }

    def post_pic(self, im, codetype):
        """Upload a captcha image for recognition.

        Args:
            im: Image content to upload as the ``userfile`` form file.
            codetype: Captcha type code sent as ``codetype``.

        Returns:
            The service's response, decoded from JSON.
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('test.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def report_error(self, im_id):
        """Report a previously uploaded image as wrongly recognised.

        Args:
            im_id: Identifier of the image, sent as ``id``.

        Returns:
            The service's response, decoded from JSON.
        """
        params = {'id': im_id}
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

4.初始化：

EMAIL = 'test@test.com'
PASSWORD = ''
CHAOJIYING_USERNAME = 'test'
CHAOJIYING_PASSWORD = ''
CHAOJIYING_SOFT_ID = 893590  # software ID
CHAOJIYING_KIND = 9102  # captcha type code


class CrackTouClick():
    """Drives a browser to solve a click-the-words captcha via Chaojiying."""

    def __init__(self):
        """Start a Chrome session and create the Chaojiying client."""
        self.url = '输入要识别的网站'
        # The Selenium driver class is spelled ``Chrome``.
        self.browser = webdriver.Chrome()
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD
        # Chaojiying takes (username, password, soft_id); the captcha kind
        # is supplied per request to post_pic(), not to the constructor.
        self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD, CHAOJIYING_SOFT_ID)

5.获取验证码：

def open(self):
    """Load the target page and fill in the email and password fields."""
    self.browser.get(self.url)
    email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
    password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
    # Each credential goes into its own field.
    email.send_keys(self.email)
    password.send_keys(self.password)
def get_touclick_button(self):
    """Wait for the captcha trigger button and return it.

    Returns:
        The element with class ``touclick-hod-wrap`` once ``self.wait``
        reports it as clickable.
    """
    button = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'touclick-hod-wrap'))
    )
    return button
def get_touclick_element(self):
    """Wait for the captcha picture element and return it.

    Returns:
        The element with class ``touclick-pub-content`` once it is present.
    """
    element = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'touclick-pub-content'))
    )
    return element
def get_position(self):
    """Return the bounding box of the captcha picture element.

    Returns:
        tuple: ``(top, bottom, left, right)`` computed from the element's
        ``location`` and ``size``.
    """
    target = self.get_touclick_element()
    # One-second pause before the geometry is read.
    time.sleep(1)
    origin = target.location
    extent = target.size
    top = origin['y']
    bottom = top + extent['height']
    left = origin['x']
    right = left + extent['width']
    return (top, bottom, left, right)
def get_screenshot(self):
    """Take a browser screenshot and return it as an opened image.

    Returns:
        The image obtained by opening the PNG bytes from the browser.
    """
    png_bytes = self.browser.get_screenshot_as_png()
    # Wrap the raw bytes in an in-memory stream so they can be opened as an image.
    return Image.open(BytesIO(png_bytes))
def get_touclick_image(self, name='captcha.png'):
    """Crop the captcha picture out of a full-page screenshot.

    Args:
        name: File name for the captcha image. It is accepted for
            compatibility but not used: nothing is written to disk here.

    Returns:
        The cropped captcha region of the screenshot.
    """
    top, bottom, left, right = self.get_position()
    print('验证码位置', top, bottom, left, right)
    screenshot = self.get_screenshot()
    # crop() takes the box as (left, upper, right, lower).
    captcha = screenshot.crop((left, top, right, bottom))
    return captcha

6.识别验证码：

# Excerpt from inside a method: ``self`` is the CrackTouClick instance.
image = self.get_touclick_image()
# Serialise the cropped captcha to PNG in memory so its raw bytes can be uploaded.
bytes_array = BytesIO()
image.save(bytes_array, format='PNG')
# post_pic() takes (image bytes, captcha kind); getvalue() is a method of the buffer.
res = self.chaojiying.post_pic(bytes_array.getvalue(), CHAOJIYING_KIND)
print(res)
def get_points(self, captcha_result):
    """Parse the click coordinates out of a recognition result.

    Args:
        captcha_result: Mapping whose ``'pic_str'`` entry holds the
            coordinates as ``'x1,y1|x2,y2|...'``.

    Returns:
        list[list[int]]: One ``[x, y]`` pair per ``|``-separated group, or
        an empty list when ``'pic_str'`` is missing or empty.
    """
    pic_str = captcha_result.get('pic_str')
    # A missing or empty 'pic_str' carries no points; return early instead
    # of failing on None.split() or int('').
    if not pic_str:
        return []
    groups = pic_str.split('|')
    locations = [[int(number) for number in group.split(',')] for group in groups]
    return locations
def touch_click_words(self, locations):
    """Click each recognised point inside the captcha picture.

    Args:
        locations: Iterable of ``[x, y]`` offsets, as produced by
            ``get_points``.
    """
    for location in locations:
        print(location)
        # NOTE(review): the offset is relative to the captcha element; newer
        # Selenium releases measure it from the element's centre rather than
        # its top-left corner — confirm against the installed version.
        ActionChains(self.browser).move_to_element_with_offset(
            self.get_touclick_element(), location[0], location[1]
        ).click().perform()
        # Pause one second between clicks.
        time.sleep(1)

上一篇： Golang学习--平滑重启

下一篇： 24.C++- 抽象类(纯虚函数)、接口、多重继承

Python3爬虫（十四）验证码处理

【Python3爬虫】网络小说更好看？十四万条书籍信息告诉你

Python3爬虫关于识别检验滑动验证码的实例

Python3爬虫关于识别点触点选验证码的实例讲解

Python3爬虫里关于识别微博宫格验证码的知识点详解

python网络爬虫——验证码处理

爬虫处理普通验证码

python爬虫验证码的处理（云打码）

爬虫之简单验证码处理