Crawler Demo
Dependencies
Either JPA or MyBatis can be used to persist the crawled data; this pom pulls in both (the demo's dao uses the tk.mybatis mapper).
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.thh</groupId>
<artifactId>blog-spider</artifactId>
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.1.4.RELEASE</version>
<relativePath />
</parent>
<!-- crawling framework: WebMagic -->
<dependencies>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-jpa</artifactId>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-redis</artifactId>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
</dependency>
<dependency>
<groupId>io.jsonwebtoken</groupId>
<artifactId>jjwt</artifactId>
<version>0.6.0</version>
</dependency>
<!-- Spring Boot web starter -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-core</artifactId>
<version>0.7.3</version>
<exclusions>
<exclusion>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>us.codecraft</groupId>
<artifactId>webmagic-extension</artifactId>
<version>0.7.3</version>
</dependency>
<!-- MyBatis pagination plugin -->
<dependency>
<groupId>com.github.pagehelper</groupId>
<artifactId>pagehelper-spring-boot-starter</artifactId>
<version>1.2.3</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<!-- tk.mybatis common mapper -->
<dependency>
<groupId>tk.mybatis</groupId>
<artifactId>mapper-spring-boot-starter</artifactId>
<version>2.1.5</version>
</dependency>
</dependencies>
</project>
application.yml
server:
port: 7777
spring:
datasource:
username: root
password: 123456
url: jdbc:mysql://192.168.139.128:3306/thh_blog?useUnicode=true&characterEncoding=utf-8&zeroDateTimeBehavior=convertToNull&transformedBitIsBoolean=true&useSSL=false&serverTimezone=GMT%2B8
driver-class-name: com.mysql.cj.jdbc.Driver
jpa:
show-sql: true
generate-ddl: false
database: mysql
redis:
host: 192.168.139.128
port: 6379
mybatis:
mapper-locations: classpath:mappers/*.xml # path to the MyBatis Mapper.xml files
type-aliases-package: com.thh.spider.pojo # package containing the entity beans
logback-spring.xml (logging configuration)
<?xml version="1.0" encoding="UTF-8"?>
<configuration>
<springProperty name="APP_NAME" scope="context" source="spring.application.name"/>
<contextName>${APP_NAME}</contextName>
<springProperty name="LOG_FILE" scope="context" source="logging.file" defaultValue="./logs/${APP_NAME}"/>
<springProperty name="LOG_POINT_FILE" scope="context" source="logging.file" defaultValue="./logs/point"/>
<springProperty name="LOG_MAXFILESIZE" scope="context" source="logback.filesize" defaultValue="50MB"/>
<springProperty name="LOG_FILEMAXDAY" scope="context" source="logback.filemaxday" defaultValue="7"/>
<springProperty name="ServerIP" scope="context" source="spring.cloud.client.ip-address" defaultValue="0.0.0.0"/>
<springProperty name="ServerPort" scope="context" source="server.port" defaultValue="0000"/>
<!-- colored logging -->
<!-- converter classes that colored output depends on -->
<conversionRule conversionWord="clr" converterClass="org.springframework.boot.logging.logback.ColorConverter"/>
<conversionRule conversionWord="wex"
converterClass="org.springframework.boot.logging.logback.WhitespaceThrowableProxyConverter"/>
<conversionRule conversionWord="wEx"
converterClass="org.springframework.boot.logging.logback.ExtendedWhitespaceThrowableProxyConverter"/>
<!-- colored log pattern -->
<property name="CONSOLE_LOG_PATTERN"
value="[${APP_NAME}:${ServerIP}:${ServerPort}] [%clr(%X{traceid}){yellow},%clr(%X{X-B3-TraceId}){yellow}] %clr(%d{yyyy-MM-dd HH:mm:ss.SSS}){faint} %clr(%level){blue} %clr(${PID}){magenta} %clr([%thread]){orange} %clr(%logger){cyan} %m%n${LOG_EXCEPTION_CONVERSION_WORD:-%wEx}"/>
<property name="CONSOLE_LOG_PATTERN_NO_COLOR"
value="[${APP_NAME}:${ServerIP}:${ServerPort}] [%X{traceid},%X{X-B3-TraceId}] %d{yyyy-MM-dd HH:mm:ss.SSS} %level ${PID} [%thread] %logger %m%n${LOG_EXCEPTION_CONVERSION_WORD:-%wEx}"/>
<!-- console appender -->
<appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
<withJansi>true</withJansi>
<encoder>
<pattern>${CONSOLE_LOG_PATTERN}</pattern>
<charset>UTF-8</charset>
</encoder>
</appender>
<!-- daily rolling file for error logs -->
<appender name="ERROR" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>${LOG_FILE}/${APP_NAME}-error.log</file>
<!-- size- and time-based rolling policy -->
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>${LOG_FILE}/${APP_NAME}-error.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
<maxFileSize>100MB</maxFileSize>
<!-- retention, in days -->
<maxHistory>60</maxHistory>
</rollingPolicy>
<encoder>
<pattern>${CONSOLE_LOG_PATTERN_NO_COLOR}</pattern>
<charset>UTF-8</charset>
</encoder>
<filter class="ch.qos.logback.classic.filter.LevelFilter"><!-- keep ERROR entries only -->
<level>ERROR</level>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>
<!-- daily rolling file for info logs -->
<appender name="INFO" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>${LOG_FILE}/${APP_NAME}-info.log</file>
<!-- size- and time-based rolling policy -->
<rollingPolicy class="ch.qos.logback.core.rolling.SizeAndTimeBasedRollingPolicy">
<fileNamePattern>${LOG_FILE}/${APP_NAME}-info.%d{yyyy-MM-dd}.%i.log</fileNamePattern>
<maxFileSize>100MB</maxFileSize>
<!-- retention, in days -->
<maxHistory>60</maxHistory>
</rollingPolicy>
<encoder>
<pattern>${CONSOLE_LOG_PATTERN_NO_COLOR}</pattern>
<charset>UTF-8</charset>
</encoder>
<filter class="ch.qos.logback.classic.filter.LevelFilter">
<level>INFO</level>
<onMatch>ACCEPT</onMatch>
<onMismatch>DENY</onMismatch>
</filter>
</appender>
<root level="INFO">
<appender-ref ref="STDOUT"/>
<appender-ref ref="ERROR"/>
<appender-ref ref="INFO"/>
</root>
</configuration>
SQL
DROP TABLE IF EXISTS `t_blog_spider`;
CREATE TABLE `t_blog_spider` (
`uid` varchar(64) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT 'unique uid',
`title` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'blog title',
`summary` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'blog summary',
`content` longtext CHARACTER SET utf8 COLLATE utf8_general_ci NULL COMMENT 'blog content',
`tag_uid` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'tag uid',
`click_count` int(11) NULL DEFAULT 0 COMMENT 'click count',
`collect_count` int(11) NULL DEFAULT 0 COMMENT 'favorite count',
`file_uid` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'cover image uid',
`status` tinyint(1) UNSIGNED NOT NULL DEFAULT 1 COMMENT 'status',
`create_time` timestamp(0) NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP(0) COMMENT 'creation time',
`update_time` timestamp(0) NULL DEFAULT NULL COMMENT 'update time',
`admin_uid` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'admin uid',
`is_original` varchar(1) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '1' COMMENT 'original or not (0: no, 1: yes)',
`author` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'author',
`articles_part` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'article source',
`blog_sort_uid` varchar(32) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL COMMENT 'blog category uid',
`level` tinyint(1) NULL DEFAULT 0 COMMENT 'recommendation level (0: normal)',
`is_publish` varchar(1) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '1' COMMENT 'published (0: no, 1: yes)',
`sort` int(11) NOT NULL DEFAULT 0 COMMENT 'sort order',
PRIMARY KEY (`uid`) USING BTREE
) ENGINE = InnoDB CHARACTER SET = utf8 COLLATE = utf8_general_ci COMMENT = 'crawled blog table' ROW_FORMAT = Dynamic;
dao
package com.thh.spider.dao;
import com.thh.spider.pojo.Blog;
import tk.mybatis.mapper.common.Mapper;
/**
* Blog data-access interface (tk.mybatis common mapper)
* @author Administrator
*
*/
public interface BlogDao extends Mapper<Blog> {
}
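As noted under Dependencies, JPA would work here too. A minimal sketch of the equivalent Spring Data JPA repository, assuming the Blog class were additionally annotated with javax.persistence @Entity (tk.mybatis does not need that annotation, but JPA does):
package com.thh.spider.dao;
import com.thh.spider.pojo.Blog;
import org.springframework.data.jpa.repository.JpaRepository;
/**
 * Hypothetical JPA variant of BlogDao. JpaRepository already supplies
 * save/findById/findAll, so no extra methods are needed; the pipeline's
 * blogDao.insertSelective(blog) would then become blogRepository.save(blog).
 */
public interface BlogRepository extends JpaRepository<Blog, String> {
}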
pipeline
package com.thh.spider.pipeline;
import com.thh.spider.dao.BlogDao;
import com.thh.spider.pojo.Blog;
import com.thh.spider.util.DownloadUtil;
import com.thh.spider.util.IdWorker;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import org.springframework.util.StringUtils;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.pipeline.Pipeline;
import java.util.Date;
@Component
public class BlogPipeline implements Pipeline {
@Autowired
private IdWorker idWorker;
@Autowired
private BlogDao blogDao;
private final String SAVE_PATH = "C:/Users/King/Desktop/tensquare/webmgicFile/piantuUrl/youxi/";
@Override
public void process(ResultItems res, Task task) {
//extract title and content
String title = res.get("title");
String content = res.get("content");
System.out.println("title: "+title);
System.out.println("content: "+content);
if (!StringUtils.isEmpty(title) && !StringUtils.isEmpty(content)){
try {
Blog blog = new Blog();
blog.setUid(idWorker.nextId()+""); //primary key
blog.setTitle(title); //title
blog.setSummary("Crawled page");
blog.setContent(content); //blog content
blog.setTagUid("1");
blog.setClickCount(0); //click count
blog.setCollectCount(0); //favorite count
blog.setFileUid(null);
blog.setStatus(1); //status = 1
Date now = new Date();
blog.setCreateTime(now); //creation time
blog.setUpdateTime(now);
blog.setAdminUid("1f01cd1d2f474743b241d74008b12333"); //admin uid, hard-coded
blog.setAuthor("author");
blog.setArticlesPart("辉皇博客");
blog.setBlogSortUid("1");
blog.setLevel(1);
blog.setIsPublish("1");
blog.setSort(0);
blogDao.insertSelective(blog);
//download to local disk
//DownloadUtil.download("http://pic.netbian.com"+fileUrl,fileName,SAVE_PATH);
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
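To sanity-check the pipeline without running a full crawl, you can hand it a hand-built ResultItems. A hypothetical smoke test (run inside the Spring context so idWorker and blogDao are injected; the Task argument is unused by this pipeline, so null is acceptable):
//hypothetical smoke test for BlogPipeline
ResultItems items = new ResultItems();
items.put("title", "test title");
items.put("content", "<div>test content</div>");
blogPipeline.process(items, null); //should insert one row into t_blog_spider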
pojo
Lombok's @Data is used here; if you'd rather not install the plugin, generate getters and setters instead and drop @Data.
package com.thh.spider.pojo;
import lombok.Data;
import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.Id;
import javax.persistence.Table;
import java.io.Serializable;
import java.util.Date;
/**
* Blog entity
* @author thh
*
*/
@Table(name="t_blog_spider")
@Data
public class Blog implements Serializable{
@Id
@Column(name="uid")
private String uid; //unique uid
@Column(name="title")
private String title; //blog title
@Column(name="summary")
private String summary; //blog summary
@Column(name="content")
private String content; //blog content
@Column(name="tag_uid")
private String tagUid; //tag uid
@Column(name="click_count")
private Integer clickCount; //click count
@Column(name="collect_count")
private Integer collectCount; //favorite count
@Column(name="file_uid")
private String fileUid; //cover image uid
@Column(name="status")
private Integer status; //status
@Column(name="create_time")
private Date createTime; //creation time
@Column(name="update_time")
private Date updateTime; //update time
@Column(name="admin_uid")
private String adminUid; //admin uid
@Column(name="is_original")
private String isOriginal; //original or not (0: no, 1: yes)
@Column(name="author")
private String author; //author
@Column(name="articles_part")
private String articlesPart; //article source
@Column(name="blog_sort_uid")
private String blogSortUid; //blog category uid
@Column(name="level")
private Integer level; //recommendation level (0: normal)
@Column(name="is_publish")
private String isPublish; //published (0: no, 1: yes)
@Column(name="sort")
private Integer sort; //sort order
}
processer
package com.thh.spider.processer;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import java.util.List;
@Component
public class BlogProcesser implements PageProcessor {
/**
 * Process each fetched page.
 */
@Override
public void process(Page page) {
//collect the article-detail URLs found on this page and queue them for crawling
List<String> list = page.getHtml().regex("https://blog.csdn.net/[a-zA-Z0-9_]+/article/details/[0-9]{9}").all();
this.saveBlogInfo(page);
page.addTargetRequests(list);
}
private void saveBlogInfo(Page page) {
//extract what we need: title and content
String title = page.getHtml().xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString();
String content = page.getHtml().xpath("//*[@id=\"article_content\"]").toString();
if(title!=null){
page.putField("title", title);
page.putField("content", content);
}else {
page.setSkip(true); //not an article detail page, skip it
}
}
//site settings
@Override
public Site getSite() {
return Site.me().setCharset("utf8").setRetryTimes(2).setSleepTime(500).setTimeOut(2000);
}
}
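The two XPath expressions in saveBlogInfo are tied to CSDN's page layout and silently return null once that layout changes. They can be checked offline against saved markup with WebMagic's Html selector; a minimal sketch (the HTML string below is a stand-in shaped like CSDN's structure, not a real page):
import us.codecraft.webmagic.selector.Html;

//offline check of the title XPath against a CSDN-shaped snippet
Html html = new Html("<div id=\"mainBox\"><main><div><div><div><div><h1>demo title</h1></div></div></div></div></main></div>");
String title = html.xpath("//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1/text()").toString();
System.out.println(title); //prints "demo title" while the expression still matches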
Scheduled task
package com.thh.spider.task;
import com.thh.spider.pipeline.BlogPipeline;
import com.thh.spider.processer.BlogProcesser;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.scheduler.QueueScheduler;
import us.codecraft.webmagic.scheduler.RedisScheduler;
/**
 * Scheduled crawling task
 */
@Component
public class BlogTask {
@Autowired
private BlogProcesser blogProcesser;
@Autowired
private BlogPipeline blogPipeline;
@Autowired
private RedisScheduler redisScheduler;
/**
 * Crawl articles
 */
//@Scheduled(cron = "0/20 * * * * ?")
//initialDelay: delay before the first run after startup
//fixedDelay: interval between consecutive runs
@Scheduled(initialDelay = 1000,fixedDelay = 100*1000)
public void webArticleTask(){
//start the spider
Spider.create(blogProcesser)
.addUrl("https://www.csdn.net/")
.addPipeline(blogPipeline)
//.setScheduler(redisScheduler) //Redis-backed de-duplication of visited URLs, survives restarts
.setScheduler(new QueueScheduler()) //in-memory de-duplication, lost on shutdown
.thread(10) //ten threads
.run();
}
}
util
Download utility
DownloadUtil
package com.thh.spider.util;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
/**
 * Download utility
 */
public class DownloadUtil {
public static void download(String urlStr, String filename, String savePath) throws IOException {
URL url = new URL(urlStr);
//open the connection
URLConnection connection = url.openConnection();
//connect timeout
connection.setConnectTimeout(5000);
//make sure the target directory exists
File dir = new File(savePath);
if (!dir.exists()) {
dir.mkdirs();
}
//try-with-resources closes both streams even if the copy fails
try (InputStream in = connection.getInputStream();
OutputStream out = new FileOutputStream(new File(dir, filename))) {
//1KB copy buffer
byte[] bytes = new byte[1024];
int len;
//read a chunk into the buffer, then write it to the file
while ((len = in.read(bytes)) != -1) {
out.write(bytes, 0, len);
}
}
}
}
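A usage sketch (the URL, file name, and save path are hypothetical placeholders):
//downloads the image to C:/tmp/images/cover.jpg (hypothetical values)
DownloadUtil.download("http://pic.netbian.com/some/pic.jpg", "cover.jpg", "C:/tmp/images/");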
IdWorker
package com.thh.spider.util;
import java.lang.management.ManagementFactory;
import java.net.InetAddress;
import java.net.NetworkInterface;
/**
 * <p>Name: IdWorker.java</p>
 * <p>Description: distributed auto-increment ID generator</p>
 * <pre>
 * Java implementation of Twitter's Snowflake scheme
 * </pre>
 * The core is this IdWorker class. Writing 0 for each bit and --- between the parts, the layout is:
 * 1||0---0000000000 0000000000 0000000000 0000000000 0 --- 00000 ---00000 ---000000000000
 * The first bit is unused (it doubles as the sign bit of the long); the next 41 bits are a
 * millisecond timestamp, followed by 5 datacenter ID bits, 5 worker ID bits,
 * and a 12-bit counter within the current millisecond, for exactly 64 bits in a long.
 * IDs therefore sort by time overall, collisions are avoided across a distributed system
 * (the datacenter and worker IDs disambiguate), and generation is fast: in tests,
 * Snowflake produces roughly 260,000 IDs per second, which is more than enough here.
 * <p>
 * 64-bit ID (41 bits milliseconds + 5 bits datacenter + 5 bits worker + 12 bits sequence)
 *
 * @author Polim
 */
public class IdWorker {
// epoch reference point; usually a recent fixed time (must never change once chosen)
private final static long twepoch = 1288834974657L;
// number of worker ID bits
private final static long workerIdBits = 5L;
// number of datacenter ID bits
private final static long datacenterIdBits = 5L;
// maximum worker ID
private final static long maxWorkerId = -1L ^ (-1L << workerIdBits);
// maximum datacenter ID
private final static long maxDatacenterId = -1L ^ (-1L << datacenterIdBits);
// sequence bits within one millisecond
private final static long sequenceBits = 12L;
// worker ID is shifted left by 12 bits
private final static long workerIdShift = sequenceBits;
// datacenter ID is shifted left by 17 bits
private final static long datacenterIdShift = sequenceBits + workerIdBits;
// timestamp is shifted left by 22 bits
private final static long timestampLeftShift = sequenceBits + workerIdBits + datacenterIdBits;
private final static long sequenceMask = -1L ^ (-1L << sequenceBits);
/* timestamp of the last generated ID (instance state, like sequence) */
private long lastTimestamp = -1L;
// per-millisecond sequence counter
private long sequence = 0L;
private final long workerId;
// datacenter ID part
private final long datacenterId;
public IdWorker(){
this.datacenterId = getDatacenterId(maxDatacenterId);
this.workerId = getMaxWorkerId(datacenterId, maxWorkerId);
}
/**
 * @param workerId
 * worker machine ID
 * @param datacenterId
 * datacenter ID
 */
public IdWorker(long workerId, long datacenterId) {
if (workerId > maxWorkerId || workerId < 0) {
throw new IllegalArgumentException(String.format("worker Id can't be greater than %d or less than 0", maxWorkerId));
}
if (datacenterId > maxDatacenterId || datacenterId < 0) {
throw new IllegalArgumentException(String.format("datacenter Id can't be greater than %d or less than 0", maxDatacenterId));
}
this.workerId = workerId;
this.datacenterId = datacenterId;
}
/**
 * Get the next ID
 *
 * @return the next unique ID
 */
public synchronized long nextId() {
long timestamp = timeGen();
if (timestamp < lastTimestamp) {
throw new RuntimeException(String.format("Clock moved backwards. Refusing to generate id for %d milliseconds", lastTimestamp - timestamp));
}
if (lastTimestamp == timestamp) {
// same millisecond: increment the sequence
sequence = (sequence + 1) & sequenceMask;
if (sequence == 0) {
// the sequence for this millisecond is exhausted: spin until the next millisecond
timestamp = tilNextMillis(lastTimestamp);
}
} else {
sequence = 0L;
}
lastTimestamp = timestamp;
// combine the shifted parts into the final ID and return it
long nextId = ((timestamp - twepoch) << timestampLeftShift)
| (datacenterId << datacenterIdShift)
| (workerId << workerIdShift) | sequence;
return nextId;
}
private long tilNextMillis(final long lastTimestamp) {
long timestamp = this.timeGen();
while (timestamp <= lastTimestamp) {
timestamp = this.timeGen();
}
return timestamp;
}
private long timeGen() {
return System.currentTimeMillis();
}
/**
 * <p>
 * Compute the worker ID (bounded by maxWorkerId)
 * </p>
 */
protected static long getMaxWorkerId(long datacenterId, long maxWorkerId) {
StringBuffer mpid = new StringBuffer();
mpid.append(datacenterId);
String name = ManagementFactory.getRuntimeMXBean().getName();
if (!name.isEmpty()) {
/*
* GET jvmPid
*/
mpid.append(name.split("@")[0]);
}
/*
 * take the low 16 bits of the hashcode of MAC + PID
 */
return (mpid.toString().hashCode() & 0xffff) % (maxWorkerId + 1);
}
/**
 * <p>
 * Compute the datacenter ID part
 * </p>
 */
protected static long getDatacenterId(long maxDatacenterId) {
long id = 0L;
try {
InetAddress ip = InetAddress.getLocalHost();
NetworkInterface network = NetworkInterface.getByInetAddress(ip);
if (network == null) {
id = 1L;
} else {
byte[] mac = network.getHardwareAddress();
id = ((0x000000FF & (long) mac[mac.length - 1])
| (0x0000FF00 & (((long) mac[mac.length - 2]) << 8))) >> 6;
id = id % (maxDatacenterId + 1);
}
} catch (Exception e) {
System.out.println(" getDatacenterId: " + e.getMessage());
}
return id;
}
public static void main(String[] args) {
//sanity check: print a batch of unique IDs
IdWorker idWorker = new IdWorker(0,0);
for (int i = 0; i <2600 ; i++) {
System.out.println(idWorker.nextId());
}
}
}
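Given the layout above (41-bit timestamp, 5-bit datacenter, 5-bit worker, 12-bit sequence), the parts can be unpacked again by shifting and masking. A small sketch mirroring the shift constants:
//unpack a snowflake ID produced by IdWorker
long id = new IdWorker(1, 1).nextId();
long millis = (id >> 22) + 1288834974657L; //add twepoch back to recover epoch millis
long datacenterId = (id >> 17) & 0x1F; //5 bits
long workerId = (id >> 12) & 0x1F; //5 bits
long sequence = id & 0xFFF; //12 bits
System.out.println(millis + " " + datacenterId + " " + workerId + " " + sequence);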
Startup class
package com.thh.spider;
import com.thh.spider.util.IdWorker;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.context.annotation.Bean;
import org.springframework.scheduling.annotation.EnableScheduling;
import tk.mybatis.spring.annotation.MapperScan;
import us.codecraft.webmagic.scheduler.RedisScheduler;
@SpringBootApplication
//enable scheduled tasks
@EnableScheduling
@MapperScan(basePackages = {"com.thh.spider.dao"})
public class SpiderApplication {
public static void main(String[] args) {
SpringApplication.run(SpiderApplication.class,args);
}
//inject the Redis host from application.yml
@Value("${spring.redis.host}")
private String host;
//used for URL de-duplication
@Bean
public RedisScheduler redisScheduler() {
return new RedisScheduler(host);
}
@Bean
public IdWorker idWorker() {
return new IdWorker(1,1);
}
}