Python3爬虫(十四) 验证码处理
程序员文章站
2022-04-09 20:59:52
Infi-chu: http://www.cnblogs.com/Infi-chu/ 一、图形验证码识别1.使用tesserocr # 在本地存储一张验证码的图片做测试image = Image.open('test.jpg')result = tesserocr.image_to_text(ima ......
Infi-chu:
http://www.cnblogs.com/Infi-chu/
一、图形验证码识别
1.使用tesserocr
import tesserocr
from PIL import Image

# Run OCR on a captcha image stored locally for testing.
image = Image.open('test.jpg')
result = tesserocr.image_to_text(image)
print(result)
# Recognize the text straight from the image file path (no PIL image object
# is opened first) and print the resulting string.
import tesserocr
print(tesserocr.file_to_text('test.jpg'))
2.处理验证码图片
convert()方法,可将图片转化为灰度图像、二值化图像
# Step 1: convert the captcha to a grayscale image (mode 'L').
image = image.convert('L')
image.show()

# Option A: Pillow's built-in bilevel conversion (mode '1').
# The result is kept in its own variable so the grayscale data in `image`
# stays available for Option B; thresholding an already-binarized image
# would make the custom threshold below meaningless.
# NOTE(review): per the Pillow docs convert('1') dithers by default, and the
# fixed 127 cut-off only applies with dithering disabled -- confirm if needed.
binary_default = image.convert('1')

# Option B: binarize the grayscale image with a custom threshold.
threshold = 80  # gray levels below this map to 0, all others map to 1
table = [0 if level < threshold else 1 for level in range(256)]
image = image.point(table, '1')
image.show()  # the characters should now stand out more clearly

# Run OCR on the cleaned-up image.
result = tesserocr.image_to_text(image)
print(result)
二、滑动验证码识别
滑动验证码的验证方式类似拼图:把滑块(拼图块)拖动到图片中的缺口位置,使其正好填满缺口
1.滑动验证码特点:
防模拟
防伪造
防暴力
2.如何识别:
采用浏览器模拟验证
3.初始化:
# Login credentials used by the demo.
EMAIL = 'test@test.com'
PASSWORD = '123456'


class CrackGeetest():
    """Drive a browser through the Geetest login page to solve its slider captcha."""

    def __init__(self):
        """Open a Chrome browser and store the page URL, wait helper and credentials."""
        self.url = 'https://account.geetest.com/login'
        self.browser = webdriver.Chrome()
        # Explicit waits issued through self.wait time out after 20 seconds.
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD
4.模拟点击:
# Locate the verification button.
def get_geetest_button(self):
    """Wait until the Geetest verification button is clickable and return it."""
    button = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_radar_tip')))
    return button


# Click the verification button.
# NOTE(review): excerpt -- `self` must be bound here, so these statements
# presumably live inside a CrackGeetest method; confirm against the full script.
button = self.get_geetest_button()
button.click()
5.识别缺口:
思路:先利用selenium选取验证码图片元素,得到它的位置和大小(size),再从网页截图中裁剪出验证码图片;分别获取不带缺口的原图和带缺口的图片后,对比两张图即可找出缺口位置
# Get the position and size of the captcha image.
def position(self):
    """Return the captcha image edges as ``(top, bottom, left, right)``.

    The edges are computed from the element's ``location`` and ``size``.
    """
    img = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'geetest_canvas_img')))
    time.sleep(2)
    location = img.location
    size = img.size
    top = location['y']
    bottom = location['y'] + size['height']
    left = location['x']
    right = location['x'] + size['width']
    return (top, bottom, left, right)


# Grab the captcha from a screenshot of the page.
def get_geetest_image(self, name='captcha.png'):
    """Crop the captcha out of a page screenshot and return the cropped image.

    ``name`` is accepted for compatibility but is not used by this method.
    """
    # Edges of the captcha, i.e. its top-left and bottom-right corners.
    top, bottom, left, right = self.position()
    print('验证码位置', top, bottom, left, right)
    # NOTE(review): get_screenshot() is not defined in this excerpt; it is
    # assumed to return an image object that supports crop() -- confirm.
    screenshot = self.get_screenshot()
    captcha = screenshot.crop((left, top, right, bottom))
    return captcha


# Get the slider; clicking it brings up the second image (the one with the gap).
def get_slider(self):
    """Wait until the slider button is clickable and return it."""
    slider = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'geetest_slider_button')))
    return slider


# The gap appears after the slider is clicked.
# NOTE(review): excerpt -- `self` must be bound here, so these statements
# presumably live inside a CrackGeetest method; confirm against the full script.
slider = self.get_slider()
slider.click()
# Call get_geetest_image() again to capture the second image; the two
# captures are referred to as img1 and img2 below.

# To find the gap, walk over the pixel coordinates and compare the RGB values
# of the two images at each point.  If every channel differs by less than a
# threshold the pixels count as equal and the scan continues; otherwise the
# pixels differ and that position is taken as the gap.


def is_pixel_equal(self, img1, img2, x, y):
    """Return True when both images have (nearly) the same pixel at ``(x, y)``.

    Pixels count as equal when each of the first three channels differs by
    less than the threshold of 60.
    """
    pixel1 = img1.load()[x, y]
    pixel2 = img2.load()[x, y]
    threshold = 60
    return (abs(pixel1[0] - pixel2[0]) < threshold
            and abs(pixel1[1] - pixel2[1]) < threshold
            and abs(pixel1[2] - pixel2[2]) < threshold)


def get_gap(self, img1, img2):
    """Return the x coordinate of the first column where the images differ.

    The scan starts at x = 60 and goes column by column; if no differing
    pixel is found the starting offset 60 is returned.
    """
    left = 60
    for i in range(left, img1.size[0]):
        for j in range(img1.size[1]):
            # A differing pixel marks the left edge of the gap.
            if not self.is_pixel_equal(img1, img2, i, j):
                return i
    return left
6.模拟拖动:
def get_track(self, distance):
    """Build a list of per-step offsets that covers ``distance``.

    The movement accelerates (a = 2) over the first 4/5 of the distance and
    decelerates (a = -3) afterwards, using a time step of 0.2.

    Args:
        distance: Total offset the slider has to travel.

    Returns:
        A list of integer offsets, one per time step.  Each step is rounded
        individually, so the sum may differ slightly from ``distance``.
    """
    track = []
    current = 0
    # Switch from acceleration to deceleration at 4/5 of the distance.
    mid = distance * 4 / 5
    t = 0.2
    v = 0
    while current < distance:
        a = 2 if current < mid else -3
        v0 = v
        v = v0 + a * t
        # Displacement under constant acceleration: v0*t + a*t^2 / 2.
        move = v0 * t + 1 / 2 * a * t ** 2
        current += move
        track.append(round(move))
    return track


def move_to_gap(self, slider, tracks):
    """Drag ``slider`` horizontally by each offset in ``tracks``, then release it."""
    ActionChains(self.browser).click_and_hold(slider).perform()
    for x in tracks:
        ActionChains(self.browser).move_by_offset(xoffset=x, yoffset=0).perform()
    # Short pause before letting go of the slider.
    time.sleep(0.3)
    ActionChains(self.browser).release().perform()
三、点触验证码识别
1.和12306的验证码类似
2.思路:
文字识别、图像识别
3.使用超级鹰平台识别
修改Python API
import requests
from hashlib import md5


class Chaojiying(object):
    """Minimal client for the Chaojiying captcha-recognition HTTP API."""

    def __init__(self, username, password, soft_id):
        """Store the credentials and build the parameters shared by all requests.

        The password is kept only as the hex MD5 digest of its UTF-8 encoding.
        """
        self.username = username
        self.password = md5(password.encode('utf-8')).hexdigest()
        self.soft_id = soft_id
        # Sent along with every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'
        }

    def post_pic(self, im, codetype):
        """Upload image data for recognition and return the decoded JSON reply.

        Args:
            im: Image content to upload (sent as the ``userfile`` form file).
            codetype: Captcha type code sent as ``codetype``.
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('test.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php',
                          data=params, files=files, headers=self.headers)
        return r.json()

    def report_error(self, im_id):
        """Report the image with id ``im_id`` to the error endpoint; return the JSON reply."""
        params = {'id': im_id, }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php',
                          data=params, headers=self.headers)
        return r.json()
4.初始化:
# Login credentials for the target site.
EMAIL = 'test@test.com'
PASSWORD = ''

# Chaojiying account settings.
CHAOJIYING_USERNAME = 'test'
CHAOJIYING_PASSWORD = ''
CHAOJIYING_SOFT_ID = 893590  # software ID
CHAOJIYING_KIND = 9102  # captcha type


class CrackTouClick():
    """Drive a browser through a login page protected by a click captcha."""

    def __init__(self):
        """Open a Chrome browser and set up the wait helper, credentials and client."""
        self.url = '输入要识别的网站'
        self.browser = webdriver.Chrome()
        # Explicit waits issued through self.wait time out after 20 seconds.
        self.wait = WebDriverWait(self.browser, 20)
        self.email = EMAIL
        self.password = PASSWORD
        # Chaojiying takes (username, password, soft_id); the captcha type
        # CHAOJIYING_KIND is passed per request to post_pic() instead.
        self.chaojiying = Chaojiying(CHAOJIYING_USERNAME, CHAOJIYING_PASSWORD,
                                     CHAOJIYING_SOFT_ID)
5.获取验证码:
# NOTE(review): excerpt -- every function below takes `self`, so they are
# presumably methods of CrackTouClick; confirm against the full script.
def open(self):
    """Load the page and type the stored email and password into the form."""
    self.browser.get(self.url)
    email = self.wait.until(EC.presence_of_element_located((By.ID, 'email')))
    password = self.wait.until(EC.presence_of_element_located((By.ID, 'password')))
    email.send_keys(self.email)
    password.send_keys(self.password)


def get_touclick_button(self):
    """Wait until the captcha trigger button is clickable and return it."""
    button = self.wait.until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'touclick-hod-wrap')))
    return button


def get_touclick_element(self):
    """Wait until the captcha content element is present and return it."""
    element = self.wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, 'touclick-pub-content')))
    return element


def get_position(self):
    """Return the captcha element edges as ``(top, bottom, left, right)``."""
    element = self.get_touclick_element()
    time.sleep(1)
    location = element.location
    size = element.size
    top = location['y']
    bottom = location['y'] + size['height']
    left = location['x']
    right = location['x'] + size['width']
    return (top, bottom, left, right)


def get_screenshot(self):
    """Take a PNG screenshot of the browser and return it as an opened image."""
    screenshot = self.browser.get_screenshot_as_png()
    screenshot = Image.open(BytesIO(screenshot))
    return screenshot


def get_touclick_image(self, name='captcha.png'):
    """Crop the captcha out of a page screenshot and return the cropped image.

    ``name`` is accepted for compatibility but is not used by this method.
    """
    top, bottom, left, right = self.get_position()
    print('验证码位置', top, bottom, left, right)
    screenshot = self.get_screenshot()
    captcha = screenshot.crop((left, top, right, bottom))
    return captcha
6.识别验证码:
# Capture the captcha, serialize it as PNG bytes and send it for recognition.
# NOTE(review): excerpt -- `self` must be bound here, so these statements
# presumably live inside a CrackTouClick method; confirm against the full script.
image = self.get_touclick_image()
bytes_array = BytesIO()
image.save(bytes_array, format='PNG')
res = self.chaojiying.post_pic(bytes_array.getvalue(), CHAOJIYING_KIND)
print(res)


def get_points(self, captcha_result):
    """Parse the recognition result into a list of ``[x, y]`` integer pairs.

    ``captcha_result['pic_str']`` is split on ``|`` into groups, and each
    group is split on ``,`` into integers.
    """
    groups = captcha_result.get('pic_str').split('|')
    locations = [[int(number) for number in group.split(',')] for group in groups]
    return locations


def touch_click_words(self, locations):
    """Click each location, given as an offset relative to the captcha element."""
    for location in locations:
        print(location)
        ActionChains(self.browser).move_to_element_with_offset(
            self.get_touclick_element(), location[0], location[1]).click().perform()
        # Pause for one second between clicks.
        time.sleep(1)
上一篇: Golang学习--平滑重启