欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

爬虫demo

程序员文章站 2022-03-22 17:10:39
...

爬虫Demo

爬虫demo

依赖

这里可以用jpa或者mybatis保存数据

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.thh</groupId>
    <artifactId>blog-spider</artifactId>
    
     <parent>
        <groupId>org.springframework.boot</groupId>
        <artifactId>spring-boot-starter-parent</artifactId>
        <version>2.1.4.RELEASE</version>
        <relativePath />
    </parent>
    <!--爬取框架webmagic-->
    <dependencies>

        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-jpa</artifactId>
        </dependency>


        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
        </dependency>


        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-data-redis</artifactId>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
        </dependency>
        <dependency>
            <groupId>io.jsonwebtoken</groupId>
            <artifactId>jjwt</artifactId>
            <version>0.6.0</version>
        </dependency>

        <!--web起步依赖-->
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-starter-web</artifactId>
        </dependency>


        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-core</artifactId>
            <version>0.7.3</version>
            <exclusions>
                <exclusion>
                    <groupId>org.slf4j</groupId>
                    <artifactId>slf4j-log4j12</artifactId>
                </exclusion>
            </exclusions>
        </dependency>

        <dependency>
            <groupId>us.codecraft</groupId>
            <artifactId>webmagic-extension</artifactId>
            <version>0.7.3</version>
        </dependency>

        <!--mybatis分页插件-->
        <dependency>
            <groupId>com.github.pagehelper</groupId>
            <artifactId>pagehelper-spring-boot-starter</artifactId>
            <version>1.2.3</version>
        </dependency>
        <dependency>
            <groupId>org.springframework.boot</groupId>
            <artifactId>spring-boot-configuration-processor</artifactId>
            <optional>true</optional>
        </dependency>

        <!-- 通用mapper -->
        <dependency>
            <groupId>tk.mybatis</groupId>
            <artifactId>mapper-spring-boot-starter</artifactId>
            <version>2.1.5</version>
        </dependency>



    </dependencies>

</project>

application.yml
server:
  port: 7777
spring:
  datasource:
    username: root
    password: 123456
    url: jdbc:mysql://192.168.139.128:3306/thh_blog?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull&transformedBitIsBoolean=true&useSSL=false&serverTimezone=GMT%2B8
    driver-class-name: com.mysql.cj.jdbc.Driver

  jpa:
    show-sql: true
    generate-ddl: false
    database: mysql
  redis:
    host: 192.168.139.128
    port: 6379

mybatis:
  mapper-locations: classpath:mappers/*.xml  #此处为Mybatis的Mapper.xml文件路径
  type-aliases-package: com.thh.spider.pojo  # 此处为实体Bean的路径

logback-spring.xml 日志

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <contextName>${APP_NAME}</contextName>

    <springProperty name="APP_NAME" scope="context" source="spring.application.name"/>
    <springProperty name="LOG_FILE" scope="context" source="logging.file" defaultValue="./logs/${APP_NAME}"/>
    <springProperty name="LOG_POINT_FILE" scope="context" source="logging.file" defaultValue="./logs/point"/>
    <springProperty name="LOG_MAXFILESIZE" scope="context" source="logback.filesize" defaultValue="50MB"/>
    <springProperty name="LOG_FILEMAXDAY" scope="context" source="logback.filemaxday" defaultValue="7"/>
    <springProperty name="ServerIP" scope="context" source="spring.cloud.client.ip-address" defaultValue="0.0.0.0"/>
    <springProperty name="ServerPort" scope="context" source="server.port" defaultValue="0000"/>

    <!-- 彩色日志 -->
    <!-- 彩色日志依赖的渲染类 -->
    <conversionRule conversionWord="clr" converterClass="org.springframework.boot.logging.logback.ColorConverter"/>
    <conversionRule conversionWord="wex"
                    converterClass="org.springframework.boot.logging.logback.WhitespaceThrowableProxyConverter"/>
    <conversionRule conversionWord="wEx"
                    converterClass="org.springframework.boot.logging.logback.ExtendedWhitespaceThrowableProxyConverter"/>

    <!-- 彩色日志格式 -->
    <property name="CONSOLE_LOG_PATTERN"
              value="[${APP_NAME}:${ServerIP}:${ServerPort}] [%clr(%X{traceid}){yellow},%clr(%X{X-B3-TraceId}){yellow}] %clr(%d{yyyy-MM-dd HH:mm:ss.SSS}){faint} %clr(%level){blue} %clr(${PID}){magenta} %clr([%thread]){orange} %clr(%logger){cyan} %m%n${LOG_EXCEPTION_CONVERSION_WORD:-%wEx}"/>
    <property name="CONSOLE_LOG_PATTERN_NO_COLOR"
              value="[${APP_NAME}:${ServerIP}:${ServerPort}] [%X{traceid},%X{X-B3-TraceId}] %d{yyyy-MM-dd HH:mm:ss.SSS} %level ${PID} [%thread] %logger %m%n${LOG_EXCEPTION_CONVERSION_WORD:-%wEx}"/>


    <!-- 控制台日志 -->
    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
        <withJansi>true</withJansi>
        <encoder>
            <pattern>${CONSOLE_LOG_PATTERN}</pattern>
            <charset>UTF-8</charset>
        </encoder>
    </appender>

    <!-- 按照每天生成常规日志文件 -->
    <appender name="ERROR" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>${LOG_FILE}/${APP_NAME}-error.log</file>
        <!-- 基于时间的分包策略 -->
        <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
            <fileNamePattern>${LOG_FILE}/${APP_NAME}-error.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
            <maxFileSize>100MB</maxFileSize>
            <!--保留时间,单位:天-->
            <maxHistory>60</maxHistory>
        </rollingPolicy>
        <encoder>
            <pattern>${CONSOLE_LOG_PATTERN_NO_COLOR}</pattern>
            <charset>UTF-8</charset>
        </encoder>
        <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
            <MaxFileSize>100MB</MaxFileSize>
        </triggeringPolicy>
        <filter class="ch.qos.logback.classic.filter.LevelFilter"><!-- 只打印错误日志 -->
            <level>ERROR</level>
            <onMatch>ACCEPT</onMatch>
            <onMismatch>DENY</onMismatch>
        </filter>
    </appender>

    <!-- 按照每天生成常规日志文件 -->
    <appender name="INFO" class="ch.qos.logback.core.rolling.RollingFileAppender">
        <file>${LOG_FILE}/${APP_NAME}-info.log</file>
        <!-- 基于时间的分包策略 -->
        <rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
            <fileNamePattern>${LOG_FILE}/${APP_NAME}-info.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
            <maxFileSize>100MB</maxFileSize>
            <!--保留时间,单位:天-->
            <maxHistory>60</maxHistory>
        </rollingPolicy>
        <encoder>
            <pattern>${CONSOLE_LOG_PATTERN_NO_COLOR}</pattern>
            <charset>UTF-8</charset>
        </encoder>
        <filter class="ch.qos.logback.classic.filter.LevelFilter">
            <level>INFO</level>
            <onMatch>ACCEPT</onMatch>
            <onMismatch>DENY</onMismatch>
        </filter>
    </appender>

    <root level="INFO">
        <appender-ref ref="STDOUT"/>
        <appender-ref ref="ERROR"/>
        <appender-ref ref="INFO"/>
    </root>

</configuration>

爬虫demo

sql
DROP TABLE IF EXISTS `t_blog_spider`;
CREATE TABLE `t_blog_spider`  (
  `uid` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '唯一uid',
  `title` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '博客标题',
  `summary` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '博客简介',
  `content` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT '博客内容',
  `tag_uid` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '标签uid',
  `click_count` int(11) NULL DEFAULT 0 COMMENT '博客点击数',
  `collect_count` int(11) NULL DEFAULT 0 COMMENT '博客收藏数',
  `file_uid` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '标题图片uid',
  `status` tinyint(1) UNSIGNED NOT NULL DEFAULT 1 COMMENT '状态',
  `create_time` timestamp(0) NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP(0) COMMENT '创建时间',
  `update_time` timestamp(0) NULL DEFAULT NULL COMMENT '更新时间',
  `admin_uid` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '管理员uid',
  `is_original` varchar(1) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '1' COMMENT '是否原创(0:不是 1:是)',
  `author` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '作者',
  `articles_part` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '文章出处',
  `blog_sort_uid` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT '博客分类UID',
  `level` tinyint(1) NULL DEFAULT 0 COMMENT '推荐等级(0:正常)',
  `is_publish` varchar(1) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '1' COMMENT '是否发布:0:否,1:是',
  `sort` int(11) NOT NULL DEFAULT 0 COMMENT '排序字段',
  PRIMARY KEY (`uid`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = '博客爬取表' ROW_FORMAT = Dynamic;
dao
package com.thh.spider.dao;

import org.springframework.data.jpa.repository.JpaRepository;
import org.springframework.data.jpa.repository.JpaSpecificationExecutor;

import com.thh.spider.pojo.Blog;
import tk.mybatis.mapper.common.Mapper;

/**
 * blog数据访问接口
 * @author Administrator
 *
 */
public interface BlogDao  extends Mapper<Blog> {

}

pipeline
package com.thh.spider.pipeline;

import com.thh.spider.dao.BlogDao;
import com.thh.spider.pojo.Blog;
import com.thh.spider.util.DownloadUtil;
import com.thh.spider.util.IdWorker;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;

import java.io.IOException;
import java.util.Date;

@Component
public class BlogPipeline implements Pipeline {


	@Autowired
	private IdWorker idWorker;

	@Autowired
	private BlogDao blogDao;


	private final String SAVE_PATH = "C:/Users/King/Desktop/tensquare/webmgicFile/piantuUrl/youxi/";


	@Override
	public void process(ResultItems res, Task task) {
		//获取title和content
		String  title = res.get("title");
		String  content = res.get("content");
		System.out.println("title: "+title);
		System.out.println("content: "+content);
		if (!StringUtils.isEmpty(title) && !StringUtils.isEmpty(content)){

			try {
				Blog blog = new Blog();
				blog.setUid(idWorker.nextId()+""); //主键
				blog.setTitle(title);                   //标题
				blog.setSummary("爬取到的页面");
				blog.setContent(content);              //博客内容
				blog.setTagUid("1");
				blog.setClickCount(0); //点击数
				blog.setCollectCount(0); //收藏数
				blog.setFile_uid(null);
				blog.setStatus(1);     //状态1
				Date now = new Date();
				blog.setCreateTime(now);//创建时间
				blog.setUpdateTime(now);
				blog.setAdminUid("1f01cd1d2f474743b241d74008b12333");//管理员uid 写死
				blog.setAuthor("作者");
				blog.setArticlesPart("辉皇博客");
				blog.setBlogSortUid("1");
				blog.setLevel(1);
				blog.setIsPublish("1");
				blog.setSort(0);
				blogDao.insertSelective(blog);
				//下载到本地
				//DownloadUtil.download("http://pic.netbian.com"+fileUrl,fileName,SAVE_PATH);
			} catch (Exception e) {
				e.printStackTrace();
			}
		}




	}


}

pojo

用到了lombok插件 如果不想安装就全部get set方法就好 不需要@Data

package com.thh.spider.pojo;

import lombok.Data;

import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import java.io.Serializable;
import java.util.Date;

/**
 * blog实体类
 * @author thh
 *
 */

@Table(name="t_blog_spider")
@Data
public class Blog implements Serializable{

	@Id
	@Column(name="uid")
	private String uid;//唯一uid

	@Column(name="title")
	private String title;//博客标题

	@Column(name="summary")
	private String summary;//博客简介

	@Column(name="content")
	private String content;//博客内容

	@Column(name="tag_uid")
	private String tagUid;//标签uid

	@Column(name="click_count")
	private Integer clickCount;//博客点击数

	@Column(name="collect_count")
	private Integer collectCount;//博客收藏数

	@Column(name="fileUid")
	private String file_uid;//标题图片uid

	@Column(name="status")
	private Integer status;//状态

	@Column(name="create_time")
	private Date createTime;//创建时间

	@Column(name="update_time")
	private Date updateTime;//更新时间

	@Column(name="admin_uid")
	private String adminUid;//管理员uid

	@Column(name="is_original")
	private String isOriginal;//是否原创(0:不是 1:是)

	@Column(name="author")
	private String author;//作者

	@Column(name="articles_part")
	private String articlesPart;//文章出处

	@Column(name="blog_sort_uid")
	private String blogSortUid;//博客分类UID


	@Column(name="level")
	private Integer level;//推荐等级(0:正常)

	@Column(name="is_publish")
	private String isPublish;//是否发布:0:否,1:是

	@Column(name="sort")
	private Integer sort;//排序字段


}

processer

package com.thh.spider.processer;

import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.List;

@Component
public class BlogProcesser implements PageProcessor {


	/**
	 * 处理我们需要的页面
	 */


	@Override
	public void process(Page page) {
		List<String> list = page.getHtml().regex("https://blog.csdn.net/[a-zA-Z0-9_]+/article/details/[0-9]{9}").all();
		this.saveBlogInfo(page);
		page.addTargetRequests(list);

	/*	if(list==null || list.size()==0){
			// 如果为空 表示这是图片详情页
			this.saveBlogInfo(page);
		}else {
			// 如果不为空 表示这是列表页 解析出详情页的url地址 放到任务队列中
			*//*for (Selectable selectable : list) {
				//获取url地址
			String details = 	selectable.links().toString();
			page.addTargetRequest(details);
			}*//*

			for (String details : list) {
				//获取url地址
				page.addTargetRequest(details);
			}
		}*/

	}

	private void saveBlogInfo(Page page) {

		//2、获取我们需要的内容: title和content
		String title =  page.getHtml().xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString();
		String content =  page.getHtml().xpath("//*[@id=\"article_content\"]").toString();


		if(title!=null){
			page.putField("title", title);
			page.putField("content", content);
		}else {
			page.setSkip(true);//跳过爬取
		}

	}

	//站点信息设置
	@Override
	public Site getSite() {
		return Site.me().setCharset("utf8").setRetryTimes(2).setSleepTime(500).setTimeOut(2000);
	}

}

定时任务类
package com.thh.spider.task;

import com.thh.spider.pipeline.BlogPipeline;
import com.thh.spider.processer.BlogProcesser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.pipeline.FilePipeline;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.RedisScheduler;

/**
 * 定时任务类
 */
@Component
public class BlogTask {

    @Autowired
    private BlogProcesser blogProcesser;

    @Autowired
    private BlogPipeline blogPipeline;

    @Autowired
    private RedisScheduler redisScheduler;




    /**
     * 爬取文章: 爬取数据库分类
     */
    //@Scheduled(cron = "0/20 * * * * ?")
    //initialDelay 任务启动后多久后执行
    //fixedDelay 多久执行一次
    @Scheduled(initialDelay = 1000,fixedDelay = 100*1000)
    public void webArticleTask(){
        //开启蜘蛛爬取内容
        Spider.create(blogProcesser)
                .addUrl("https://www.csdn.net/")
                .addPipeline(blogPipeline)
                //.setScheduler(redisScheduler) //redis来效验存储重复爬取的链接
               .setScheduler(new QueueScheduler()) //内存效验重复爬取  关闭就失效
                .thread(10)  //十个线程
                .run();
    }

}

util

下载工具类

DownloadUtil

package com.thh.spider.util;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;

/**
 * 下载工具类
 */
public class DownloadUtil {

    public static void download(String urlStr,String filename,String savePath) throws IOException {
        URL url = new URL(urlStr);
        //打开url连接
        URLConnection connection = url.openConnection();
        //请求超时时间
        connection.setConnectTimeout(5000);
        //输入流
        InputStream in = connection.getInputStream();
        //缓冲数据
        byte [] bytes = new byte[1024];
        //数据长度
        int len;
        //文件
        File file = new File(savePath);
        if(!file.exists())
            file.mkdirs();
        OutputStream out = new FileOutputStream(file.getPath()+"\\"+filename);
        //先读到bytes中
        while ((len=in.read(bytes))!=-1){
        //再从bytes中写入文件
            out.write(bytes,0,len);
        }
        //关闭IO
        out.close();
        in.close();
    }
}

idWork

package com.thh.spider.util;

import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.NetworkInterface;

/**
 * <p>名称:IdWorker.java</p>
 * <p>描述:分布式自增长ID</p>
 * <pre>
 *     Twitter的 Snowflake JAVA实现方案
 * </pre>
 * 核心代码为其IdWorker这个类实现,其原理结构如下,我分别用一个0表示一位,用—分割开部分的作用:
 * 1||0---0000000000 0000000000 0000000000 0000000000 0 --- 00000 ---00000 ---000000000000
 * 在上面的字符串中,第一位为未使用(实际上也可作为long的符号位),接下来的41位为毫秒级时间,
 * 然后5位datacenter标识位,5位机器ID(并不算标识符,实际是为线程标识),
 * 然后12位该毫秒内的当前毫秒内的计数,加起来刚好64位,为一个Long型。
 * 这样的好处是,整体上按照时间自增排序,并且整个分布式系统内不会产生ID碰撞(由datacenter和机器ID作区分),
 * 并且效率较高,经测试,snowflake每秒能够产生26万ID左右,完全满足需要。
 * <p>
 * 64位ID (42(毫秒)+5(机器ID)+5(业务编码)+12(重复累加))
 *
 * @author Polim
 */
public class IdWorker {
    // 时间起始标记点,作为基准,一般取系统的最近时间(一旦确定不能变动)
    private final static long twepoch = 1288834974657L;
    // 机器标识位数
    private final static long workerIdBits = 5L;
    // 数据中心标识位数
    private final static long datacenterIdBits = 5L;
    // 机器ID最大值
    private final static long maxWorkerId = -1L ^ (-1L << workerIdBits);
    // 数据中心ID最大值
    private final static long maxDatacenterId = -1L ^ (-1L << datacenterIdBits);
    // 毫秒内自增位
    private final static long sequenceBits = 12L;
    // 机器ID偏左移12位
    private final static long workerIdShift = sequenceBits;
    // 数据中心ID左移17位
    private final static long datacenterIdShift = sequenceBits + workerIdBits;
    // 时间毫秒左移22位
    private final static long timestampLeftShift = sequenceBits + workerIdBits + datacenterIdBits;

    private final static long sequenceMask = -1L ^ (-1L << sequenceBits);
    /* 上次生产id时间戳 */
    private static long lastTimestamp = -1L;
    // 0,并发控制
    private long sequence = 0L;

    private final long workerId;
    // 数据标识id部分
    private final long datacenterId;

    public IdWorker(){
        this.datacenterId = getDatacenterId(maxDatacenterId);
        this.workerId = getMaxWorkerId(datacenterId, maxWorkerId);
    }
    /**
     * @param workerId
     *            工作机器ID
     * @param datacenterId
     *            ***
     */
    public IdWorker(long workerId, long datacenterId) {
        if (workerId > maxWorkerId || workerId < 0) {
            throw new IllegalArgumentException(String.format("worker Id can't be greater than %d or less than 0", maxWorkerId));
        }
        if (datacenterId > maxDatacenterId || datacenterId < 0) {
            throw new IllegalArgumentException(String.format("datacenter Id can't be greater than %d or less than 0", maxDatacenterId));
        }
        this.workerId = workerId;
        this.datacenterId = datacenterId;
    }
    /**
     * 获取下一个ID
     *
     * @return
     */
    public synchronized long nextId() {
        long timestamp = timeGen();
        if (timestamp < lastTimestamp) {
            throw new RuntimeException(String.format("Clock moved backwards.  Refusing to generate id for %d milliseconds", lastTimestamp - timestamp));
        }

        if (lastTimestamp == timestamp) {
            // 当前毫秒内,则+1
            sequence = (sequence + 1) & sequenceMask;
            if (sequence == 0) {
                // 当前毫秒内计数满了,则等待下一秒
                timestamp = tilNextMillis(lastTimestamp);
            }
        } else {
            sequence = 0L;
        }
        lastTimestamp = timestamp;
        // ID偏移组合生成最终的ID,并返回ID
        long nextId = ((timestamp - twepoch) << timestampLeftShift)
                | (datacenterId << datacenterIdShift)
                | (workerId << workerIdShift) | sequence;

        return nextId;
    }

    private long tilNextMillis(final long lastTimestamp) {
        long timestamp = this.timeGen();
        while (timestamp <= lastTimestamp) {
            timestamp = this.timeGen();
        }
        return timestamp;
    }

    private long timeGen() {
        return System.currentTimeMillis();
    }

    /**
     * <p>
     * 获取 maxWorkerId
     * </p>
     */
    protected static long getMaxWorkerId(long datacenterId, long maxWorkerId) {
        StringBuffer mpid = new StringBuffer();
        mpid.append(datacenterId);
        String name = ManagementFactory.getRuntimeMXBean().getName();
        if (!name.isEmpty()) {
            /*
             * GET jvmPid
             */
            mpid.append(name.split("@")[0]);
        }
        /*
         * MAC + PID 的 hashcode 获取16个低位
         */
        return (mpid.toString().hashCode() & 0xffff) % (maxWorkerId + 1);
    }

    /**
     * <p>
     * 数据标识id部分
     * </p>
     */
    protected static long getDatacenterId(long maxDatacenterId) {
        long id = 0L;
        try {
            InetAddress ip = InetAddress.getLocalHost();
            NetworkInterface network = NetworkInterface.getByInetAddress(ip);
            if (network == null) {
                id = 1L;
            } else {
                byte[] mac = network.getHardwareAddress();
                id = ((0x000000FF & (long) mac[mac.length - 1])
                        | (0x0000FF00 & (((long) mac[mac.length - 2]) << 8))) >> 6;
                id = id % (maxDatacenterId + 1);
            }
        } catch (Exception e) {
            System.out.println(" getDatacenterId: " + e.getMessage());
        }
        return id;
    }


    public static void main(String[] args) {
        //推特  26万个不重复的ID
        IdWorker idWorker = new IdWorker(0,0);
        for (int i = 0; i <2600 ; i++) {
            System.out.println(idWorker.nextId());
        }
    }

}

启动类

package com.thh.spider;


import com.thh.spider.util.IdWorker;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;

import org.springframework.context.annotation.Bean;
import org.springframework.scheduling.annotation.EnableScheduling;
import tk.mybatis.spring.annotation.MapperScan;
import us.codecraft.webmagic.scheduler.RedisScheduler;


@SpringBootApplication
//开启定时启动
@EnableScheduling
@MapperScan(basePackages = {"com.thh.spider.dao"})
public class SpiderApplication {

    public static void main(String[] args) {


        SpringApplication.run(SpiderApplication.class,args);

    }

    //注入redis的ip地址
    @Value("${spring.redis.host}")
    private String host;

    //url去重使用的
    @Bean
    public RedisScheduler redisScheduler() {
        return new RedisScheduler(host);
    }

    @Bean
    public IdWorker idWorker() {
        return new IdWorker(1,1);
    }

}