欢迎您访问程序员文章站!本站旨在为大家分享程序员计算机编程知识!
您现在的位置是: 首页

WebMagic 的简单使用实例

程序员文章站 2022-07-05 23:21:43
...

相比前两天使用的其他网络爬虫工具,WebMagic 框架更加灵活,使用更加方便,支持多种解析方式,纠错能力更强。

废话不多说,心得全在代码中,说多以后自己看起来也费劲。

1.添加依赖

<dependency>
	<groupId>us.codecraft</groupId>
	<artifactId>webmagic-core</artifactId>
	<version>0.7.3</version>
</dependency>

2.编写 PageProcessor 接口的实现类

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Html;
import us.codecraft.webmagic.selector.Selectable;

import java.util.ArrayList;
import java.util.List;

/**
 * WebMagic {@link PageProcessor} for a job-listing page.
 *
 * <p>Every {@code dl} entry found under the listing container is converted into a
 * {@link JobDetails}, printed to standard output and appended to {@link #getJobs()}.
 * Links found inside the {@code list-job-id} element's 13th {@code div} are queued
 * as further pages to crawl.
 */
public class PCZQProcessor implements PageProcessor {
    /** Job entries accumulated by {@link #process(Page)}. */
    private List<JobDetails> jobs = new ArrayList<>();

    /** Crawler settings: retry count 3, sleep time 5000, and a desktop Chrome User-Agent header. */
    private Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(5000)
            .addHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36");

    /**
     * @return the crawler settings used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * @return the list that collects every job parsed so far
     */
    public List<JobDetails> getJobs() {
        return jobs;
    }

    /**
     * Replaces the list that parsed jobs are appended to.
     *
     * @param jobs the new backing list
     */
    public void setJobs(List<JobDetails> jobs) {
        this.jobs = jobs;
    }

    /**
     * Parses one listing page: collects its job entries and queues the links of the
     * pagination area for crawling.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        Html document = page.getHtml();

        List<Selectable> entries = document
                .xpath("//div[@data-widget='app/ms_v2/wanted/list.js#companyAjaxBid']/dl")
                .nodes();
        for (Selectable entry : entries) {
            JobDetails job = readJob(entry);
            System.out.println(job);
            jobs.add(job);
        }

        // Queue every link found in the pagination area, if there is one.
        Selectable pagination = document.xpath("//*[@id=\"list-job-id\"]/div[13]");
        List<String> nextPages = pagination.links().all();
        page.addTargetRequests(nextPages);
    }

    /**
     * Builds a {@link JobDetails} from a single {@code dl} entry of the listing.
     *
     * @param entry the {@code dl} node of one job
     * @return the populated job object
     */
    private JobDetails readJob(Selectable entry) {
        String postId = entry.xpath("//dt/a/@post_id").get();
        String postUrl = entry.xpath("//dt/a/@post_url").get();
        String companyId = entry.xpath("//dt/a/@company_id").get();
        String puid = entry.xpath("//dt/a/@puid").get();
        String title = entry.xpath("//dt/a/text()").get();
        String companyName = entry.xpath("//div[@class='clearfix']/p/text()").get();
        String address = entry.xpath("//dd[@class='pay']/text()").get();
        String time = entry.xpath("//dd[@class='pub-time']/text()").get();

        JobDetails job = new JobDetails();
        job.setPost_id(postId)
                .setPost_url(postUrl)
                .setCompany_id(companyId)
                .setPuid(puid)
                .setTitle(title)
                .setCompany_name(companyName)
                .setAddress(address)
                .setTime(time);
        return job;
    }

}

3.测试方法

    // Processor for the job-listing page; one instance is reused by every call to magic().
    PCZQProcessor pczqProcessor = new PCZQProcessor();
    // Processor for a single job-detail page; one instance is reused by every call to detail().
    JobDetailProcessor jobDetailProcessor = new JobDetailProcessor();
    /**
     * Crawls the job-listing start page with {@code pczqProcessor} and returns the
     * collected jobs rendered with {@code toString()}.
     *
     * <p>Crawl results are additionally written through a {@code JsonFilePipeline}
     * created with the path {@code outputfile/}.
     *
     * @return the string form of the jobs collected by this crawl
     * @throws IOException declared for callers; nothing in this method throws it directly
     */
    @RequestMapping("magic")
    public String magic() throws IOException {

        // NOTE(review): this literal has no scheme in front of "://" - it looks like it was
        // stripped when the snippet was published. Confirm the intended http/https prefix.
        String url = "://beijing.gj.com/qita/";

        // pczqProcessor is a shared field and only ever appends to its list, so without
        // this reset every call would also return the jobs of all previous calls.
        pczqProcessor.getJobs().clear();

        Spider.create(pczqProcessor)
                .addUrl(url)
                .addPipeline(new JsonFilePipeline("outputfile/"))
                .thread(1)
                .run();

        return pczqProcessor.getJobs().toString();
    }

    /**
     * Crawls one job-detail page with {@code jobDetailProcessor} and returns the
     * extracted fields rendered with {@code toString()}.
     *
     * <p>Crawl results are additionally written through a {@code JsonFilePipeline}
     * created with the path {@code outputfile/}.
     *
     * @return the string form of the detail map filled by this crawl
     */
    @RequestMapping("detail")
    public String detail() {
        // NOTE(review): this literal has no scheme in front of "://" - it looks like it was
        // stripped when the snippet was published. Confirm the intended http/https prefix.
        String url = "://beijing.gj.com/qita/3624203469x.htm";

        // The processor is a shared field; drop values left over from an earlier call so a
        // failed crawl cannot return stale data.
        jobDetailProcessor.getDetails().clear();

        Spider.create(jobDetailProcessor)
                .addUrl(url)
                .addPipeline(new JsonFilePipeline("outputfile/"))
                .thread(1)
                .run();

        // Return what THIS crawl produced; the previous code returned the job list of
        // pczqProcessor, which the detail crawl never touches.
        return jobDetailProcessor.getDetails().toString();
    }

4.在浏览器中访问 http://localhost:9010/magic 进行测试

WebMagic 的简单使用实例

成功解析

另一个抓取解析类 (里面包含了图片下载和图片内容识别方法 )

/**
 * WebMagic {@link PageProcessor} for a single job-detail page.
 *
 * <p>{@link #process(Page)} copies the interesting fields of the page into the
 * {@link #getDetails() details} map and publishes that map, serialized as JSON, under
 * the page field {@code "list"}. The class also carries helpers to download the
 * phone-number image and to run OCR on it; {@code process} does not call them.
 */
public class JobDetailProcessor implements PageProcessor {
    /** File the phone-number image is written to by {@link #downImage} and read from by {@link #imageUrl}. */
    private static final String PHONE_IMAGE_PATH = "/Users/kangxg/Library/ApacheTomcat/webapps/log/pic2020.png";

    /** XPath of the main content column; every detail field lives below it. */
    private static final String CONTENT = "//*[@id=\"wrapper\"]/div[4]/div[1]";

    /** XPath prefix of the attribute list items (type, pay, education, ...). */
    private static final String ATTRIBUTE_ITEM = CONTENT + "/div[3]/ul/li";

    /**
     * Matches {@code scheme://host[:port]}. Compiled once because the expression never changes.
     * Ports may have up to five digits (the previous expression stopped after four).
     */
    private static final Pattern ORIGIN_PATTERN =
            Pattern.compile("(https?|ftp)://[a-zA-Z0-9._-]+(:[0-9]{1,5})?");

    /** Crawler settings: retry count 3, sleep time 8000, and a desktop Chrome User-Agent header. */
    private final Site site = Site.me()
            .setRetryTimes(3)
            .setSleepTime(8000)
            .addHeader("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36");

    /** Fields extracted from the most recently processed page, keyed by field name. */
    private Map<String, Object> details = new HashMap<>();

    /**
     * @return the map holding the fields extracted by {@link #process(Page)}
     */
    public Map<String, Object> getDetails() {
        return details;
    }

    /**
     * Replaces the map that extracted fields are stored in.
     *
     * @param details the new backing map
     */
    public void setDetails(Map<String, Object> details) {
        this.details = details;
    }

    /**
     * Misnamed predecessor of {@link #setDetails(Map)}; kept so existing callers still compile.
     *
     * @param details the new backing map
     * @deprecated use {@link #setDetails(Map)}
     */
    @Deprecated
    public void setJobs(Map<String, Object> details) {
        setDetails(details);
    }

    /**
     * @return the crawler settings used by this processor
     */
    @Override
    public Site getSite() {
        return site;
    }

    /**
     * Extracts the job fields from a detail page into {@link #getDetails()} and stores the
     * JSON form of that map in the page field {@code "list"}.
     *
     * <p>Fields whose node is missing from the page are stored as {@code null}
     * ({@code "describe"} as an empty array) instead of aborting the whole page.
     *
     * @param page the downloaded page handed over by WebMagic
     */
    @Override
    public void process(Page page) {
        Html html = page.getHtml();

        details.put("title", html.xpath(CONTENT + "/div[1]/h1/text()").get());

        // The update line has the form "<label>:<value>"; keep only the part after the last colon.
        String update = html.xpath(CONTENT + "/div[1]/p/span[1]/text()").get();
        String updateTime = null;
        if (update != null) {
            String[] parts = update.split(":");
            updateTime = parts[parts.length - 1];
        }
        details.put("updatetime", updateTime);

        details.put("type", html.xpath(ATTRIBUTE_ITEM + "[1]/em/a/text()").get());
        details.put("pay", html.xpath(ATTRIBUTE_ITEM + "[2]/em/text()").get());
        details.put("edu", html.xpath(ATTRIBUTE_ITEM + "[3]/em/text()").get());
        details.put("work", html.xpath(ATTRIBUTE_ITEM + "[4]/em/text()").get());
        details.put("age", html.xpath(ATTRIBUTE_ITEM + "[5]/em/text()").get());
        // This expression used to lack the leading "//" that all sibling expressions have.
        details.put("number", html.xpath(ATTRIBUTE_ITEM + "[6]/em/text()").get());

        // The phone number is published as an image; store its URL built from the page's
        // scheme://host and the image's src, or null when the page has no such image.
        String phoneImageSrc = html.xpath("//*[@id=\"isShowPhoneBotton\"]/img/@src").get();
        String phone = phoneImageSrc == null
                ? null
                : getDomainForUrl(page.getUrl().toString()) + phoneImageSrc;
        details.put("phone", phone);

        details.put("contacts", html.xpath(CONTENT + "/div[7]/dl/dd[2]/text()").get());
        details.put("address", html.xpath(CONTENT + "/div[7]/dl/dd[3]/text()").get());

        details.put("worktime", html.xpath(ATTRIBUTE_ITEM + "[9]/em/text()").get());

        // Break the description on line breaks and double spaces.
        String describe = html.xpath(CONTENT + "/div[7]/div[1]/text()").get();
        String[] describes = describe == null ? new String[0] : describe.split("\n|\r|  ");
        details.put("describe", describes);

        details.put("payType", html.xpath(ATTRIBUTE_ITEM + "[9]/em/span/em/text()").get());

        page.putField("list", JSONObject.toJSONString(details));

        // Downloading the phone image (downImage) and recognising it (imageUrl) is
        // available but intentionally not triggered from here.
    }

    /**
     * Downloads the resource at {@code destUrl} with an HTTP GET and writes it to the
     * fixed phone-image file, replacing any previous content.
     *
     * @param destUrl absolute URL of the image to download
     * @throws Exception if the URL is malformed, the download fails or times out, or the
     *                   file cannot be written
     */
    public void downImage(String destUrl) throws Exception {
        URL url = new URL(destUrl);
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        try {
            conn.setRequestMethod("GET");
            // Five seconds to connect, and the same limit for each read so a stalled
            // server cannot block the caller forever.
            conn.setConnectTimeout(5 * 1000);
            conn.setReadTimeout(5 * 1000);

            // readInputStream closes the connection's input stream.
            byte[] data = readInputStream(conn.getInputStream());

            try (FileOutputStream outStream = new FileOutputStream(new File(PHONE_IMAGE_PATH))) {
                outStream.write(data);
            }
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Runs OCR on the previously downloaded phone image and returns the recognised text.
     *
     * <p>The image is read from the fixed phone-image file; {@code destUrl} is not used.
     * Tesseract is used with its default data path and language. A non-default tessdata
     * location or language would have to be configured on the instance.
     *
     * @param destUrl unused; kept for compatibility with existing callers
     * @return the text recognised in the image
     * @throws Exception if the image cannot be read or OCR fails
     */
    public String imageUrl(String destUrl) throws Exception {
        Tesseract instance = new Tesseract();
        instance.setTessVariable("user_defined_dpi", "300");

        BufferedImage image = ImageIO.read(new File(PHONE_IMAGE_PATH));
        if (image == null) {
            // ImageIO.read returns null when no registered reader understands the file.
            throw new IllegalStateException("Not a readable image: " + PHONE_IMAGE_PATH);
        }
        return instance.doOCR(convertImage(image));
    }

    /**
     * Prepares an image for OCR: converts it to grayscale and enlarges it five times
     * in both dimensions.
     *
     * @param image the image to prepare
     * @return the grayscale, enlarged image
     * @throws Exception declared for callers
     */
    public BufferedImage convertImage(BufferedImage image) throws Exception {
        BufferedImage gray = ImageHelper.convertImageToGrayscale(image);
        return ImageHelper.getScaledInstance(gray, gray.getWidth() * 5, gray.getHeight() * 5);
    }

    /**
     * Reads {@code inStream} to its end and returns everything that was read.
     * The stream is closed before returning, also when reading fails.
     *
     * @param inStream the stream to drain and close
     * @return all bytes read from the stream
     * @throws Exception if reading fails
     */
    public byte[] readInputStream(InputStream inStream) throws Exception {
        try (InputStream in = inStream) {
            ByteArrayOutputStream collected = new ByteArrayOutputStream();
            byte[] buffer = new byte[1024];
            int len;
            // read() reports -1 once the end of the stream is reached.
            while ((len = in.read(buffer)) != -1) {
                collected.write(buffer, 0, len);
            }
            return collected.toByteArray();
        }
    }

    /**
     * Concatenates two strings.
     *
     * @param string1 the leading text; must not be {@code null}
     * @param string2 the text to append
     * @return a new buffer containing {@code string1} followed by {@code string2}
     */
    public static StringBuffer mergedString(String string1, String string2) {
        StringBuffer sb = new StringBuffer(string1);
        return sb.append(string2);
    }

    /**
     * Cuts a URL down to its {@code scheme://host[:port]} part, e.g.
     * {@code "http://127.0.0.1:9040/a/b"} becomes {@code "http://127.0.0.1:9040"}.
     *
     * <p>The result is the input up to the end of the first {@code scheme://host[:port]}
     * occurrence. Only that first occurrence is considered, so a URL embedded later in
     * the path or query does not affect the result.
     *
     * @param url the URL to shorten; may be {@code null}
     * @return the shortened URL, the unchanged input when it contains no
     *         {@code http}/{@code https}/{@code ftp} URL, or an empty string for {@code null}
     */
    public String getDomainForUrl(String url) {
        if (url == null) {
            return "";
        }
        Matcher matcher = ORIGIN_PATTERN.matcher(url);
        if (matcher.find()) {
            return url.substring(0, matcher.end());
        }
        return url;
    }

}