Java selenium处理极验滑动验证码示例
程序员文章站
2024-04-03 17:55:52
要爬取一个网站时遇到了极验的验证码,这周都在想着怎么破解它。在网上搜了很多资料,在知乎上看到有人问过这个问题,我按照其中的思路大概实现了一下。
1.使用htmlunit(这种方式我...
要爬取一个网站时遇到了极验的验证码,这周都在想着怎么破解它。在网上搜了很多资料,在知乎上看到有人问过这个问题,我按照其中的思路大概实现了一下。
1.使用htmlunit(这种方式我没成功,模拟鼠标拖拽后轨迹没生成,可以跳过)
我用的是 Java,首先想到的是直接用 HtmlUnit,于是先做了一些初始化:
private void initwebclient() { if (webclient != null) { return; } webclient = new webclient(browserversion.firefox_24); webclient.getoptions().setproxyconfig(new proxyconfig("127.0.0.1",8888)); webclient.getoptions().setactivexnative(true); webclient.getoptions().setuseinsecuressl(true); // 配置证书 webclient.getoptions().setjavascriptenabled(true); webclient.getoptions().setcssenabled(true); webclient.setcsserrorhandler(new silentcsserrorhandler()); webclient.getoptions().setthrowexceptiononscripterror(false); webclient.getoptions().setthrowexceptiononfailingstatuscode(false); cookiemanager cookiemanager = new cookiemanager(); list<org.apache.http.cookie.cookie> httpcookies = client.getcookies();//其方式获取的cookie for (org.apache.http.cookie.cookie cookie : httpcookies) { cookiemanager.addcookie(new com.gargoylesoftware.htmlunit.util.cookie(cookie)); } webclient.setcookiemanager(cookiemanager); }
初始化代理,cookie..然后就能正常调用了
htmlpage page = webclient.getpage("http://www.qixin.com/login");//企信宝 gepageinfor(page);
下面就是我获取图片,还原图片并且模拟拖拽,(这里我觉得是有些问题的,可能是拖拽我模拟的不对导致触发的js并没有生成正确的轨迹,还请大家帮忙看看哪里错了)
private void gepageinfor(htmlpage page) { string[] img_slice={"div", "class", "gt_cut_fullbg_slice"}; string[] img_bg_slice={"div", "class", "gt_cut_bg_slice"}; htmldivision div = (htmldivision) page.getelementbyid("captcha"); int decaptcha = 0; try { byte[] img_slice_binary = client.get(getimgurl(img_slice, div, true)).getbinary();//获取图片byte byte[] img_bg_slice_binary = client.get(getimgurl(img_bg_slice, div, false)).getbinary(); //获取还原后的图片 bufferedimage geetestimg = imgtest.getgeetestimg(img_slice_binary, imgtest.imgarray); bufferedimage geetestimg2 = imgtest.getgeetestimg(img_bg_slice_binary, imgtest.imgarray); //获得图片移动位置(目前还有问题,需改用第三方图片识别) decaptcha =imgtest.decaptcha(geetestimg,geetestimg2); system.out.println(decaptcha); } catch (ioexception | fetchexception e) { e.printstacktrace(); } htmldivision div_slider_knob = get_div_slider_knob(page,"gt_slider_knob gt_show");//获取要移动div htmlpage mouseover = (htmlpage) div_slider_knob.mouseover(); htmlpage mousedownpage = (htmlpage)div_slider_knob.mousedown(); div_slider_knob = get_div_slider_knob(mousedownpage,"gt_slider_knob gt_show moving"); mousemovex(decaptcha, div_slider_knob, mousedownpage); htmlpage newpage =(htmlpage)div_slider_knob.mouseover(); // newpage =(htmlpage)div_slider_knob.mousedown(); system.out.println(newpage.asxml()); div = (htmldivision)newpage.getelementbyid("captcha"); htmlelement htmlelement = div.getelementsbyattribute("div", "class", "gt_slice gt_show moving").get(0); system.out.println(htmlelement); newpage =(htmlpage)div_slider_knob.mouseup();//触发js,轨迹没有生成 system.out.println("---------------"); system.out.println(newpage.asxml()); if (newpage.getelementbyid("captcha")!=null) {//错误重试 //gepageinfor(newpage); } } private void mousemovex(int decaptcha, htmldivision div_slider_knob, htmlpage mousedown) { mouseevent mouseevent = new mouseevent(div_slider_knob, mouseevent.type_mouse_move, false, false, false, mouseevent.button_left); mouseevent.setclientx( 
mouseevent.getclientx()+((decaptcha!=0)?decaptcha:99)); //移动x坐标 scriptresult scriptresult = mousedown.getdocumentelement().fireevent(mouseevent); } private htmldivision get_div_slider_knob(htmlpage page,string classstring) { return (htmldivision)(((htmldivision) page.getelementbyid("captcha")).getelementsbyattribute("div", "class", classstring).get(0)); } private string getimgurl(string[] img_slice, htmldivision div, boolean isneedcheckpostion) { string url =""; int[] postion = new int[2]; boolean empty = div.getelementsbyattribute(img_slice[0],img_slice[1],img_slice[2]).isempty(); if (div.haschildnodes() && !empty) { list<htmlelement> elementsbyattribute = div.getelementsbyattribute(img_slice[0],img_slice[1],img_slice[2]); for(int i = 0;i<elementsbyattribute.size();i++){ htmldivision div_img = (htmldivision)elementsbyattribute.get(i); string style = div_img.getattribute("style"); string[] imge_url_position = style.split(";"); if(stringutils.isblank(url)){//确认url url = stringutils.replacepattern(imge_url_position[0], ".*\\(", "").replace(")", ""); } if (isneedcheckpostion) {//确认图片切割postion,两张图切割方式一样 background-position: -157px -58px // string[] positions = stringutils.split(stringutils.remove(imge_url_position[1], "px").replace("-", "").replaceall(".*:", ""), null); string[] positions = stringutils.split(stringutils.removepattern(imge_url_position[1], "[^\\d+ \\s]"),null); postion[0] = integer.parseint(positions[0]); postion[1] = integer.parseint(positions[1]); int[] is = imgtest.imgarray[i]; if (is[0]!=postion[0]||is[1]!=postion[1]) { logger.debug("更新分割postion"); imgtest.imgarray[i] = postion; } system.out.println(imgtest.imgarray); isneedcheckpostion= false; } } } return url; }
对比图片获取位移的方法(decaptcha)是错的,我就不放代码了。下面是其中用来还原图片的方法:其实审查元素后你就明白怎么还原这张图片了,这里每次读取的是 10px × 58px 的小块。
public static bufferedimage getgeetestimg(byte[] binary, int[][] imgarray) throws ioexception { bufferedimage img = imageio.read(new bytearrayinputstream(binary)); list<bufferedimage> list = new arraylist<>(); for (int i=0;i< imgarray.length;i++) { bufferedimage subimage = img.getsubimage(imgarray[i][0], imgarray[i][1], 10, 58); list.add(subimage); // imageio.write(subimage, "jpg", new file("d:\\image\\imgs"+i+".jpg")); } bufferedimage mergeimageup = null; bufferedimage mergeimagedown = null; int mid = list.size()>>>1; for (int i = 0; i <mid-1 ; i++) { mergeimageup = mergeimage(mergeimageup==null?list.get(i):mergeimageup, list.get(i+1), true); } for(int i = mid;i<list.size()-1;i++){ mergeimagedown = mergeimage(mergeimagedown==null?list.get(i):mergeimagedown,list.get(i+1), true); } img = mergeimage(mergeimageup, mergeimagedown, false); return img; } public static bufferedimage mergeimage(bufferedimage img1, bufferedimage img2, boolean ishorizontal) throws ioexception { int w1 = img1.getwidth(); int h1 = img1.getheight(); int w2 = img2.getwidth(); int h2 = img2.getheight(); // 从图片中读取rgb int[] imagearrayone = new int[w1 * h1]; imagearrayone = img1.getrgb(0, 0, w1, h1, imagearrayone, 0, w1); // 逐行扫描图像中各个像素的rgb到数组中 int[] imagearraytwo = new int[w2 * h2]; imagearraytwo = img2.getrgb(0, 0, w2, h2, imagearraytwo, 0, w2); // 生成新图片 bufferedimage destimage = null; if (ishorizontal) { // 水平方向合并 destimage = new bufferedimage(w1+w2, h1, bufferedimage.type_int_rgb); destimage.setrgb(0, 0, w1, h1, imagearrayone, 0, w1); // 设置上半部分或左半部分的rgb destimage.setrgb(w1, 0, w2, h2, imagearraytwo, 0, w2); } else { // 垂直方向合并 destimage = new bufferedimage(w1, h1 + h2, bufferedimage.type_int_rgb); destimage.setrgb(0, 0, w1, h1, imagearrayone, 0, w1); // 设置上半部分或左半部分的rgb destimage.setrgb(0, h1, w2, h2, imagearraytwo, 0, w2); // 设置下半部分的rgb } return destimage; }
2.使用selenium
后来我想着是不是我模拟鼠标动作的方式有问题,于是又找到了 Selenium(2.42.2)。它也能驱动 HtmlUnit,关键是它对鼠标动作的封装似乎比较完整。
但是尝试以后发现,HtmlUnitMouse 并没有实现这个鼠标移动动作:
/**
 * Quoted from Selenium's {@code HtmlUnitMouse}: moving the pointer by an offset is
 * not implemented for the HtmlUnit driver and always fails.
 *
 * @throws UnsupportedOperationException always
 */
public void mouseMove(Coordinates where, long xOffset, long yOffset) {
    throw new UnsupportedOperationException("moving to arbitrary x,y coordinates not supported.");
}
好吧,于是调用chrome吧
// Drive a real Chrome browser through Selenium: open the login page, drag the
// captcha knob, then copy the browser cookies for later HTTP requests.
// NOTE(review): checkpage(...) and the unqualified Cookie type are declared outside
// this listing; Cookie is presumably com.gargoylesoftware.htmlunit.util.Cookie
// (six-argument constructor plus toHttpClient()) - confirm against the imports.
System.setProperty("webdriver.chrome.driver", "c:\\chromedriver.exe");

Proxy proxy = new Proxy();
// Address of the HTTP proxy the browser should use.
proxy.setHttpProxy("127.0.0.1:8888");
DesiredCapabilities capabilities = DesiredCapabilities.chrome();
capabilities.setCapability(CapabilityType.PROXY, proxy);

WebDriver driver = new ChromeDriver(capabilities);
driver.get("http://www.qixin.com/login");
driver.manage().timeouts().implicitlyWait(10, TimeUnit.SECONDS);
checkpage(driver, "return $('.gt_cut_fullbg_slice');");

System.out.println("1 page title is: " + driver.getTitle());
String pagesource = driver.getPageSource();
System.out.println(pagesource);

org.openqa.selenium.JavascriptExecutor executor = (org.openqa.selenium.JavascriptExecutor) driver;
// The DOM property is case-sensitive: "readystate" is undefined and made this check fail.
boolean equals = "complete".equals(executor.executeScript("return document.readyState"));

int movex = 99; // slide distance; fixed for now, to be replaced by the image-matching result
if (equals) {
    WebElement element = driver.findElement(By.className("gt_slider_knob"));
    Actions action = new Actions(driver);
    // dragAndDropBy takes offsets relative to the element, not absolute page
    // coordinates: press on the knob, move movex px to the right, release.
    action.dragAndDropBy(element, movex, 0).perform();
    pagesource = driver.getPageSource();
}

// Refresh the cookie set from the browser session.
Set<org.openqa.selenium.Cookie> cookies = driver.manage().getCookies();
Set<Cookie> cookies2 = new HashSet<>();
for (org.openqa.selenium.Cookie cookie : cookies) {
    // Keep each cookie's own secure flag; forcing it to true would stop the cookie
    // from being sent back over plain HTTP.
    cookies2.add(new Cookie(cookie.getDomain(), cookie.getName(), cookie.getValue(),
            cookie.getPath(), cookie.getExpiry(), cookie.isSecure()));
}
for (Cookie cookie : cookies2) {
    // Converted form for the Apache HTTP client; not used further in this snippet.
    org.apache.http.cookie.Cookie httpclient = cookie.toHttpClient();
}
System.out.println(pagesource);
这样提交的表单确实是有轨迹的,这里移动位置我先写了个固定值,可以由上面图片还原,以及一些开源的图片识别工具识别出位置。以上应该就能解决这个滑动验证码了
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持。
上一篇: JavaAPI的使用方法详解