Crawler Demo
Dependencies
Either JPA or MyBatis can be used to persist the crawled data; this pom pulls in both (the demo's dao uses the tk.mybatis mapper).
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.thh</groupId>
<artifactId>blog-spider</artifactId>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.4.RELEASE</version>
<relativePath />
</parent>
<!-- crawling framework: WebMagic -->
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<dependency>
<groupId>io.jsonwebtoken</groupId>
<artifactId>jjwt</artifactId>
<version>0.6.0</version>
</dependency>
<!-- Spring Boot web starter -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- MyBatis pagination plugin -->
<dependency>
<groupId>com.github.pagehelper</groupId>
<artifactId>pagehelper-spring-boot-starter</artifactId>
<version>1.2.3</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<!-- tk.mybatis common mapper -->
<dependency>
<groupId>tk.mybatis</groupId>
<artifactId>mapper-spring-boot-starter</artifactId>
<version>2.1.5</version>
</dependency>
</dependencies>
</project>
application.yml
server:
port: 7777
spring:
datasource:
username: root
password: 123456
url: jdbc:mysql://192.168.139.128:3306/thh_blog?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull&transformedBitIsBoolean=true&useSSL=false&serverTimezone=GMT%2B8
driver-class-name: com.mysql.cj.jdbc.Driver
jpa:
show-sql: true
generate-ddl: false
database: mysql
redis:
host: 192.168.139.128
port: 6379
mybatis:
mapper-locations: classpath:mappers/*.xml # path to the MyBatis Mapper.xml files
type-aliases-package: com.thh.spider.pojo # package containing the entity beans
logback-spring.xml (logging configuration)
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<springProperty name="APP_NAME" scope="context" source="spring.application.name"/>
<contextName>${APP_NAME}</contextName>
<springProperty name="LOG_FILE" scope="context" source="logging.file" defaultValue="./logs/${APP_NAME}"/>
<springProperty name="LOG_POINT_FILE" scope="context" source="logging.file" defaultValue="./logs/point"/>
<springProperty name="LOG_MAXFILESIZE" scope="context" source="logback.filesize" defaultValue="50MB"/>
<springProperty name="LOG_FILEMAXDAY" scope="context" source="logback.filemaxday" defaultValue="7"/>
<springProperty name="ServerIP" scope="context" source="spring.cloud.client.ip-address" defaultValue="0.0.0.0"/>
<springProperty name="ServerPort" scope="context" source="server.port" defaultValue="0000"/>
<!-- colored logging -->
<!-- converter classes that colored output depends on -->
<conversionRule conversionWord="clr" converterClass="org.springframework.boot.logging.logback.ColorConverter"/>
<conversionRule conversionWord="wex"
converterClass="org.springframework.boot.logging.logback.WhitespaceThrowableProxyConverter"/>
<conversionRule conversionWord="wEx"
converterClass="org.springframework.boot.logging.logback.ExtendedWhitespaceThrowableProxyConverter"/>
<!-- colored log pattern -->
<property name="CONSOLE_LOG_PATTERN"
value="[${APP_NAME}:${ServerIP}:${ServerPort}] [%clr(%X{traceid}){yellow},%clr(%X{X-B3-TraceId}){yellow}] %clr(%d{yyyy-MM-dd HH:mm:ss.SSS}){faint} %clr(%level){blue} %clr(${PID}){magenta} %clr([%thread]){orange} %clr(%logger){cyan} %m%n${LOG_EXCEPTION_CONVERSION_WORD:-%wEx}"/>
<property name="CONSOLE_LOG_PATTERN_NO_COLOR"
value="[${APP_NAME}:${ServerIP}:${ServerPort}] [%X{traceid},%X{X-B3-TraceId}] %d{yyyy-MM-dd HH:mm:ss.SSS} %level ${PID} [%thread] %logger %m%n${LOG_EXCEPTION_CONVERSION_WORD:-%wEx}"/>
<!-- console appender -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<withJansi>true</withJansi>
<encoder>
<pattern>${CONSOLE_LOG_PATTERN}</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- daily rolling file for error logs -->
<appender name="ERROR" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>${LOG_FILE}/${APP_NAME}-error.log</file>
<!-- size- and time-based rolling policy -->
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>${LOG_FILE}/${APP_NAME}-error.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
<maxFileSize>100MB</maxFileSize>
<!-- retention, in days -->
<maxHistory>60</maxHistory>
</rollingPolicy>
<encoder>
<pattern>${CONSOLE_LOG_PATTERN_NO_COLOR}</pattern>
<charset>UTF-8</charset>
</encoder>
<filter class="ch.qos.logback.classic.filter.LevelFilter"><!-- keep ERROR entries only -->
<level>ERROR</level>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>
<!-- daily rolling file for info logs -->
<appender name="INFO" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>${LOG_FILE}/${APP_NAME}-info.log</file>
<!-- size- and time-based rolling policy -->
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>${LOG_FILE}/${APP_NAME}-info.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
<maxFileSize>100MB</maxFileSize>
<!-- retention, in days -->
<maxHistory>60</maxHistory>
</rollingPolicy>
<encoder>
<pattern>${CONSOLE_LOG_PATTERN_NO_COLOR}</pattern>
<charset>UTF-8</charset>
</encoder>
<filter class="ch.qos.logback.classic.filter.LevelFilter">
<level>INFO</level>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>
<root level="INFO">
<appender-ref ref="STDOUT"/>
<appender-ref ref="ERROR"/>
<appender-ref ref="INFO"/>
</root>
</configuration>
SQL
DROP TABLE IF EXISTS `t_blog_spider`;
CREATE TABLE `t_blog_spider` (
`uid` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'unique uid',
`title` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'blog title',
`summary` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'blog summary',
`content` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT 'blog content',
`tag_uid` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'tag uid',
`click_count` int(11) NULL DEFAULT 0 COMMENT 'click count',
`collect_count` int(11) NULL DEFAULT 0 COMMENT 'favorite count',
`file_uid` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'cover image uid',
`status` tinyint(1) UNSIGNED NOT NULL DEFAULT 1 COMMENT 'status',
`create_time` timestamp(0) NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP(0) COMMENT 'creation time',
`update_time` timestamp(0) NULL DEFAULT NULL COMMENT 'update time',
`admin_uid` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'admin uid',
`is_original` varchar(1) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '1' COMMENT 'original or not (0: no, 1: yes)',
`author` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'author',
`articles_part` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'article source',
`blog_sort_uid` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'blog category uid',
`level` tinyint(1) NULL DEFAULT 0 COMMENT 'recommendation level (0: normal)',
`is_publish` varchar(1) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '1' COMMENT 'published (0: no, 1: yes)',
`sort` int(11) NOT NULL DEFAULT 0 COMMENT 'sort order',
PRIMARY KEY (`uid`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'crawled blog table' ROW_FORMAT = Dynamic;
dao
package com.thh.spider.dao;
import com.thh.spider.pojo.Blog;
import tk.mybatis.mapper.common.Mapper;
/**
* Blog data-access interface (tk.mybatis common mapper)
* @author Administrator
*
*/
public interface BlogDao extends Mapper<Blog> {
}
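As noted under Dependencies, JPA would work here too. A minimal sketch of the equivalent Spring Data JPA repository, assuming the Blog class were additionally annotated with javax.persistence @Entity (tk.mybatis does not need that annotation, but JPA does):
package com.thh.spider.dao;
import com.thh.spider.pojo.Blog;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Hypothetical JPA variant of BlogDao. JpaRepository already supplies
 * save/findById/findAll, so no extra methods are needed; the pipeline's
 * blogDao.insertSelective(blog) would then become blogRepository.save(blog).
 */
public interface BlogRepository extends JpaRepository<Blog, String> {
}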
pipeline
package com.thh.spider.pipeline;
import com.thh.spider.dao.BlogDao;
import com.thh.spider.pojo.Blog;
import com.thh.spider.util.DownloadUtil;
import com.thh.spider.util.IdWorker;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.Date;
@Component
public class BlogPipeline implements Pipeline {
@Autowired
private IdWorker idWorker;
@Autowired
private BlogDao blogDao;
private final String SAVE_PATH = "C:/Users/King/Desktop/tensquare/webmgicFile/piantuUrl/youxi/";
@Override
public void process(ResultItems res, Task task) {
//extract title and content
String title = res.get("title");
String content = res.get("content");
System.out.println("title: "+title);
System.out.println("content: "+content);
if (!StringUtils.isEmpty(title) && !StringUtils.isEmpty(content)){
try {
Blog blog = new Blog();
blog.setUid(idWorker.nextId()+""); //primary key
blog.setTitle(title); //title
blog.setSummary("Crawled page");
blog.setContent(content); //blog content
blog.setTagUid("1");
blog.setClickCount(0); //click count
blog.setCollectCount(0); //favorite count
blog.setFileUid(null);
blog.setStatus(1); //status = 1
Date now = new Date();
blog.setCreateTime(now); //creation time
blog.setUpdateTime(now);
blog.setAdminUid("1f01cd1d2f474743b241d74008b12333"); //admin uid, hard-coded
blog.setAuthor("author");
blog.setArticlesPart("辉皇博客");
blog.setBlogSortUid("1");
blog.setLevel(1);
blog.setIsPublish("1");
blog.setSort(0);
blogDao.insertSelective(blog);
//download to local disk
//DownloadUtil.download("http://pic.netbian.com"+fileUrl,fileName,SAVE_PATH);
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
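To sanity-check the pipeline without running a full crawl, you can hand it a hand-built ResultItems. A hypothetical smoke test (run inside the Spring context so idWorker and blogDao are injected; the Task argument is unused by this pipeline, so null is acceptable):
//hypothetical smoke test for BlogPipeline
ResultItems items = new ResultItems();
items.put("title", "test title");
items.put("content", "<div>test content</div>");
blogPipeline.process(items, null); //should insert one row into t_blog_spider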
pojo
Lombok's @Data is used here; if you'd rather not install the plugin, generate getters and setters instead and drop @Data.
package com.thh.spider.pojo;
import lombok.Data;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import java.io.Serializable;
import java.util.Date;
/**
* Blog entity
* @author thh
*
*/
@Table(name="t_blog_spider")
@Data
public class Blog implements Serializable{
@Id
@Column(name="uid")
private String uid; //unique uid
@Column(name="title")
private String title; //blog title
@Column(name="summary")
private String summary; //blog summary
@Column(name="content")
private String content; //blog content
@Column(name="tag_uid")
private String tagUid; //tag uid
@Column(name="click_count")
private Integer clickCount; //click count
@Column(name="collect_count")
private Integer collectCount; //favorite count
@Column(name="file_uid")
private String fileUid; //cover image uid
@Column(name="status")
private Integer status; //status
@Column(name="create_time")
private Date createTime; //creation time
@Column(name="update_time")
private Date updateTime; //update time
@Column(name="admin_uid")
private String adminUid; //admin uid
@Column(name="is_original")
private String isOriginal; //original or not (0: no, 1: yes)
@Column(name="author")
private String author; //author
@Column(name="articles_part")
private String articlesPart; //article source
@Column(name="blog_sort_uid")
private String blogSortUid; //blog category uid
@Column(name="level")
private Integer level; //recommendation level (0: normal)
@Column(name="is_publish")
private String isPublish; //published (0: no, 1: yes)
@Column(name="sort")
private Integer sort; //sort order
}
processer
package com.thh.spider.processer;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
@Component
public class BlogProcesser implements PageProcessor {
/**
 * Process each fetched page.
 */
@Override
public void process(Page page) {
//collect the article-detail URLs found on this page and queue them for crawling
List<String> list = page.getHtml().regex("https://blog.csdn.net/[a-zA-Z0-9_]+/article/details/[0-9]{9}").all();
this.saveBlogInfo(page);
page.addTargetRequests(list);
}
private void saveBlogInfo(Page page) {
//extract what we need: title and content
String title = page.getHtml().xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString();
String content = page.getHtml().xpath("//*[@id=\"article_content\"]").toString();
if(title!=null){
page.putField("title", title);
page.putField("content", content);
}else {
page.setSkip(true); //not an article detail page, skip it
}
}
//site settings
@Override
public Site getSite() {
return Site.me().setCharset("utf8").setRetryTimes(2).setSleepTime(500).setTimeOut(2000);
}
}
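The two XPath expressions in saveBlogInfo are tied to CSDN's page layout and silently return null once that layout changes. They can be checked offline against saved markup with WebMagic's Html selector; a minimal sketch (the HTML string below is a stand-in shaped like CSDN's structure, not a real page):
import us.codecraft.webmagic.selector.Html;

//offline check of the title XPath against a CSDN-shaped snippet
Html html = new Html("<div id=\"mainBox\"><main><div><div><div><div><h1>demo title</h1></div></div></div></div></main></div>");
String title = html.xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString();
System.out.println(title); //prints "demo title" while the expression still matches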
Scheduled task
package com.thh.spider.task;
import com.thh.spider.pipeline.BlogPipeline;
import com.thh.spider.processer.BlogProcesser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.RedisScheduler;
/**
 * Scheduled crawling task
 */
@Component
public class BlogTask {
@Autowired
private BlogProcesser blogProcesser;
@Autowired
private BlogPipeline blogPipeline;
@Autowired
private RedisScheduler redisScheduler;
/**
 * Crawl articles
 */
//@Scheduled(cron = "0/20 * * * * ?")
//initialDelay: delay before the first run after startup
//fixedDelay: interval between consecutive runs
@Scheduled(initialDelay = 1000,fixedDelay = 100*1000)
public void webArticleTask(){
//start the spider
Spider.create(blogProcesser)
.addUrl("https://www.csdn.net/")
.addPipeline(blogPipeline)
//.setScheduler(redisScheduler) //Redis-backed de-duplication of visited URLs, survives restarts
.setScheduler(new QueueScheduler()) //in-memory de-duplication, lost on shutdown
.thread(10) //ten threads
.run();
}
}
util
Download utility
DownloadUtil
package com.thh.spider.util;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
/**
 * Download utility
 */
public class DownloadUtil {
public static void download(String urlStr, String filename, String savePath) throws IOException {
URL url = new URL(urlStr);
//open the connection
URLConnection connection = url.openConnection();
//connect timeout
connection.setConnectTimeout(5000);
//make sure the target directory exists
File dir = new File(savePath);
if (!dir.exists()) {
dir.mkdirs();
}
//try-with-resources closes both streams even if the copy fails
try (InputStream in = connection.getInputStream();
OutputStream out = new FileOutputStream(new File(dir, filename))) {
//1KB copy buffer
byte[] bytes = new byte[1024];
int len;
//read a chunk into the buffer, then write it to the file
while ((len = in.read(bytes)) != -1) {
out.write(bytes, 0, len);
}
}
}
}
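A usage sketch (the URL, file name, and save path are hypothetical placeholders):
//downloads the image to C:/tmp/images/cover.jpg (hypothetical values)
DownloadUtil.download("http://pic.netbian.com/some/pic.jpg", "cover.jpg", "C:/tmp/images/");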
IdWorker
package com.thh.spider.util;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.NetworkInterface;
/**
 * <p>Name: IdWorker.java</p>
 * <p>Description: distributed auto-increment ID generator</p>
 * <pre>
 * Java implementation of Twitter's Snowflake scheme
 * </pre>
 * The core is this IdWorker class. Writing 0 for each bit and --- between the parts, the layout is:
 * 1||0---0000000000 0000000000 0000000000 0000000000 0 --- 00000 ---00000 ---000000000000
 * The first bit is unused (it doubles as the sign bit of the long); the next 41 bits are a
 * millisecond timestamp, followed by 5 datacenter ID bits, 5 worker ID bits,
 * and a 12-bit counter within the current millisecond, for exactly 64 bits in a long.
 * IDs therefore sort by time overall, collisions are avoided across a distributed system
 * (the datacenter and worker IDs disambiguate), and generation is fast: in tests,
 * Snowflake produces roughly 260,000 IDs per second, which is more than enough here.
 * <p>
 * 64-bit ID (41 bits milliseconds + 5 bits datacenter + 5 bits worker + 12 bits sequence)
 *
 * @author Polim
 */
public class IdWorker {
// epoch reference point; usually a recent fixed time (must never change once chosen)
private final static long twepoch = 1288834974657L;
// number of worker ID bits
private final static long workerIdBits = 5L;
// number of datacenter ID bits
private final static long datacenterIdBits = 5L;
// maximum worker ID
private final static long maxWorkerId = -1L ^ (-1L << workerIdBits);
// maximum datacenter ID
private final static long maxDatacenterId = -1L ^ (-1L << datacenterIdBits);
// sequence bits within one millisecond
private final static long sequenceBits = 12L;
// worker ID is shifted left by 12 bits
private final static long workerIdShift = sequenceBits;
// datacenter ID is shifted left by 17 bits
private final static long datacenterIdShift = sequenceBits + workerIdBits;
// timestamp is shifted left by 22 bits
private final static long timestampLeftShift = sequenceBits + workerIdBits + datacenterIdBits;
private final static long sequenceMask = -1L ^ (-1L << sequenceBits);
/* timestamp of the last generated ID (instance state, like sequence) */
private long lastTimestamp = -1L;
// per-millisecond sequence counter
private long sequence = 0L;
private final long workerId;
// datacenter ID part
private final long datacenterId;
public IdWorker(){
this.datacenterId = getDatacenterId(maxDatacenterId);
this.workerId = getMaxWorkerId(datacenterId, maxWorkerId);
}
/**
 * @param workerId
 * worker machine ID
 * @param datacenterId
 * datacenter ID
 */
public IdWorker(long workerId, long datacenterId) {
if (workerId > maxWorkerId || workerId < 0) {
throw new IllegalArgumentException(String.format("worker Id can't be greater than %d or less than 0", maxWorkerId));
}
if (datacenterId > maxDatacenterId || datacenterId < 0) {
throw new IllegalArgumentException(String.format("datacenter Id can't be greater than %d or less than 0", maxDatacenterId));
}
this.workerId = workerId;
this.datacenterId = datacenterId;
}
/**
 * Get the next ID
 *
 * @return the next unique ID
 */
public synchronized long nextId() {
long timestamp = timeGen();
if (timestamp < lastTimestamp) {
throw new RuntimeException(String.format("Clock moved backwards. Refusing to generate id for %d milliseconds", lastTimestamp - timestamp));
}
if (lastTimestamp == timestamp) {
// same millisecond: increment the sequence
sequence = (sequence + 1) & sequenceMask;
if (sequence == 0) {
// the sequence for this millisecond is exhausted: spin until the next millisecond
timestamp = tilNextMillis(lastTimestamp);
}
} else {
sequence = 0L;
}
lastTimestamp = timestamp;
// combine the shifted parts into the final ID and return it
long nextId = ((timestamp - twepoch) << timestampLeftShift)
| (datacenterId << datacenterIdShift)
| (workerId << workerIdShift) | sequence;
return nextId;
}
private long tilNextMillis(final long lastTimestamp) {
long timestamp = this.timeGen();
while (timestamp <= lastTimestamp) {
timestamp = this.timeGen();
}
return timestamp;
}
private long timeGen() {
return System.currentTimeMillis();
}
/**
 * <p>
 * Compute the worker ID (bounded by maxWorkerId)
 * </p>
 */
protected static long getMaxWorkerId(long datacenterId, long maxWorkerId) {
StringBuffer mpid = new StringBuffer();
mpid.append(datacenterId);
String name = ManagementFactory.getRuntimeMXBean().getName();
if (!name.isEmpty()) {
/*
* GET jvmPid
*/
mpid.append(name.split("@")[0]);
}
/*
 * take the low 16 bits of the hashcode of MAC + PID
 */
return (mpid.toString().hashCode() & 0xffff) % (maxWorkerId + 1);
}
/**
 * <p>
 * Compute the datacenter ID part
 * </p>
 */
protected static long getDatacenterId(long maxDatacenterId) {
long id = 0L;
try {
InetAddress ip = InetAddress.getLocalHost();
NetworkInterface network = NetworkInterface.getByInetAddress(ip);
if (network == null) {
id = 1L;
} else {
byte[] mac = network.getHardwareAddress();
id = ((0x000000FF & (long) mac[mac.length - 1])
| (0x0000FF00 & (((long) mac[mac.length - 2]) << 8))) >> 6;
id = id % (maxDatacenterId + 1);
}
} catch (Exception e) {
System.out.println(" getDatacenterId: " + e.getMessage());
}
return id;
}
public static void main(String[] args) {
//sanity check: print a batch of unique IDs
IdWorker idWorker = new IdWorker(0,0);
for (int i = 0; i <2600 ; i++) {
System.out.println(idWorker.nextId());
}
}
}
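Given the layout above (41-bit timestamp, 5-bit datacenter, 5-bit worker, 12-bit sequence), the parts can be unpacked again by shifting and masking. A small sketch mirroring the shift constants:
//unpack a snowflake ID produced by IdWorker
long id = new IdWorker(1, 1).nextId();
long millis = (id >> 22) + 1288834974657L; //add twepoch back to recover epoch millis
long datacenterId = (id >> 17) & 0x1F; //5 bits
long workerId = (id >> 12) & 0x1F; //5 bits
long sequence = id & 0xFFF; //12 bits
System.out.println(millis + " " + datacenterId + " " + workerId + " " + sequence);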
Startup class
package com.thh.spider;
import com.thh.spider.util.IdWorker;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;
import org.springframework.scheduling.annotation.EnableScheduling;
import tk.mybatis.spring.annotation.MapperScan;
import us.codecraft.webmagic.scheduler.RedisScheduler;
@SpringBootApplication
//enable scheduled tasks
@EnableScheduling
@MapperScan(basePackages = {"com.thh.spider.dao"})
public class SpiderApplication {
public static void main(String[] args) {
SpringApplication.run(SpiderApplication.class,args);
}
//inject the Redis host from application.yml
@Value("${spring.redis.host}")
private String host;
//used for URL de-duplication
@Bean
public RedisScheduler redisScheduler() {
return new RedisScheduler(host);
}
@Bean
public IdWorker idWorker() {
return new IdWorker(1,1);
}
}