使用 BulkProcessor 批量导入mysql数据到Elasticsearch
程序员文章站
2022-06-11 11:44:21
...
1. pom文件
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.rpp</groupId>
<artifactId>elasticsearch-demo</artifactId>
<version>1.0-SNAPSHOT</version>
<!-- Inherit dependency and plugin versions from Spring Boot 2.3.2 -->
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.3.2.RELEASE</version>
<relativePath/>
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
</properties>
<!-- Import the Spring Cloud BOM so cloud dependencies get managed versions -->
<dependencyManagement>
<dependencies>
<dependency>
<groupId>org.springframework.cloud</groupId>
<artifactId>spring-cloud-dependencies</artifactId>
<version>Hoxton.SR6</version>
<type>pom</type>
<scope>import</scope>
</dependency>
</dependencies>
</dependencyManagement>
<dependencies>
<!-- Explicit 7.6.2 pin: must match the version of the ES cluster being indexed -->
<dependency>
<groupId>org.elasticsearch.client</groupId>
<artifactId>elasticsearch-rest-high-level-client</artifactId>
<version>7.6.2</version>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-thymeleaf</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-web</artifactId>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-configuration-processor</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<!-- JDBC driver used by DBHelper; runtime scope only -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<scope>runtime</scope>
</dependency>
<!-- Spring Data Elasticsearch starter (RestHighLevelClient / ElasticsearchRestTemplate) -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-data-elasticsearch</artifactId>
</dependency>
<!-- devtools hot-reload, development only -->
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-devtools</artifactId>
<optional>true</optional>
<scope>runtime</scope>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-maven-plugin</artifactId>
</plugin>
</plugins>
</build>
</project>
2. 配置类
TransportClient 在 Elasticsearch 7 中已被标记为过时,并且在 Elasticsearch 8 中将被移除,建议改用 RestHighLevelClient 进行操作。
import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.data.elasticsearch.client.ClientConfiguration;
import org.springframework.data.elasticsearch.client.RestClients;
/**
 * Registers the Elasticsearch {@link RestHighLevelClient} bean, built from the
 * comma-separated node list in {@code spring.elasticsearch.rest.uris}.
 */
@Configuration
public class EsConfig {

    /** Comma-separated ES nodes, e.g. "192.168.8.31:9200,192.168.8.31:9201". */
    @Value("${spring.elasticsearch.rest.uris}")
    private String hostlist;

    /**
     * Builds the high-level REST client via Spring Data's ClientConfiguration.
     *
     * @return a RestHighLevelClient connected to every configured node
     */
    @Bean
    public RestHighLevelClient client() {
        // Split the configured host list; trim each entry so stray whitespace
        // around the commas ("host:9200, host:9201") does not break the URI.
        String[] hosts = hostlist.split(",");
        for (int i = 0; i < hosts.length; i++) {
            hosts[i] = hosts[i].trim();
        }
        ClientConfiguration clientConfiguration = ClientConfiguration.builder()
                .connectedTo(hosts)
                .build();
        return RestClients.create(clientConfiguration).rest();
    }
}
spring:
elasticsearch:
rest:
uris: 192.168.8.31:9200,192.168.8.31:9201,192.168.8.31:9202
详细配置示例如下:
// Example: ClientConfiguration with all common options.
HttpHeaders httpHeaders = new HttpHeaders();
httpHeaders.add("some-header", "on every request"); // fixed: statement was missing its ';'
ClientConfiguration clientConfiguration = ClientConfiguration.builder()
    .connectedTo("localhost:9200", "localhost:9291")
    .useSsl()
    .withProxy("localhost:8888")
    .withPathPrefix("ela")
    .withConnectTimeout(Duration.ofSeconds(5))
    .withSocketTimeout(Duration.ofSeconds(3))
    // fixed: the original passed an undeclared 'defaultHeaders'; the headers
    // built above are named 'httpHeaders'
    .withDefaultHeaders(httpHeaders)
    .withBasicAuth(username, password)
    // Supplier is re-evaluated per request, so the timestamp header stays fresh
    .withHeaders(() -> {
        HttpHeaders headers = new HttpHeaders();
        headers.add("currentTime", LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME));
        return headers;
    })
    // ... other options
    .build();
同样的ElasticsearchTemplate在 ES7也被ElasticsearchRestTemplate替换
@Configuration
public class RestClientConfig extends AbstractElasticsearchConfiguration {
// Factory method required by the base class; the returned client backs the
// ElasticsearchRestTemplate bean that AbstractElasticsearchConfiguration registers.
@Override
public RestHighLevelClient elasticsearchClient() {
// ClientConfiguration.localhost() targets the default node at localhost:9200
return RestClients.create(ClientConfiguration.localhost()).rest();
}
// no special bean creation needed
}
基类AbstractElasticsearchConfiguration已经提供了ElasticsearchRestTemplate Bean的初始化
3. 导入代码
/**
 * Streams every row of the given MySQL table into Elasticsearch through a
 * BulkProcessor.
 *
 * <p>Rows are read with a forward-only, read-only cursor, buffered in batches
 * of 10,000 and handed to the BulkProcessor; the final partial batch is
 * submitted after the loop. Relies on the class members {@code client},
 * {@code logger} and {@code POSITION_INDEX}.
 *
 * @param tableName table to export; it is concatenated into the SQL, so it
 *                  must come from trusted code — never from user input
 */
private void writeMySQLDataToES(String tableName) {
    BulkProcessor bulkProcessor = getBulkProcessor(client);
    Connection connection = null;
    PreparedStatement ps = null;
    ResultSet rs = null;
    try {
        connection = DBHelper.getConn();
        logger.info("start handle data :" + tableName);
        // NOTE(review): table names cannot be bound as '?' parameters; keep
        // tableName under application control to avoid SQL injection.
        String sql = "select * from " + tableName;
        ps = connection.prepareStatement(sql, ResultSet.TYPE_FORWARD_ONLY, ResultSet.CONCUR_READ_ONLY);
        // Hint the driver to fetch in chunks instead of loading the whole table.
        ps.setFetchSize(20);
        rs = ps.executeQuery();
        ResultSetMetaData colData = rs.getMetaData();
        ArrayList<HashMap<String, String>> dataList = new ArrayList<>();
        int count = 0;
        while (rs.next()) {
            count++;
            HashMap<String, String> row = new HashMap<>(128);
            // JDBC column indices are 1-based and getColumnCount() is inclusive:
            // the original loop used '<' and silently dropped the last column.
            for (int i = 1; i <= colData.getColumnCount(); i++) {
                String column = colData.getColumnName(i);
                row.put(column, rs.getString(column));
            }
            dataList.add(row);
            // Hand off every 10,000 rows; the remainder is flushed after the loop.
            if (count % 10000 == 0) {
                logger.info("mysql handle data number:" + count);
                for (HashMap<String, String> doc : dataList) {
                    bulkProcessor.add(new IndexRequest(POSITION_INDEX).source(doc));
                }
                dataList.clear();
            }
        }
        // Submit whatever is left of the final, partial batch.
        for (HashMap<String, String> doc : dataList) {
            bulkProcessor.add(new IndexRequest(POSITION_INDEX).source(doc));
        }
        bulkProcessor.flush();
    } catch (SQLException e) {
        logger.error("failed to export table " + tableName, e);
    } finally {
        // Close in reverse acquisition order. Each close is null-guarded and
        // isolated so an early failure (e.g. the connection never opened)
        // cannot raise an NPE or prevent the remaining resources from closing.
        try {
            if (rs != null) rs.close();
        } catch (Exception e) {
            logger.error("error closing ResultSet", e);
        }
        try {
            if (ps != null) ps.close();
        } catch (Exception e) {
            logger.error("error closing PreparedStatement", e);
        }
        try {
            if (connection != null) connection.close();
        } catch (Exception e) {
            logger.error("error closing Connection", e);
        }
        try {
            // Wait for in-flight bulk requests to finish before returning.
            boolean terminated = bulkProcessor.awaitClose(150L, TimeUnit.SECONDS);
            logger.info("bulkProcessor closed, fully terminated: " + terminated);
        } catch (Exception e) {
            logger.error("error closing BulkProcessor", e);
        }
    }
}
/**
 * Builds a BulkProcessor that pushes batched index requests asynchronously
 * through the given client.
 *
 * <p>Batches are sent when any threshold is hit: 5,000 actions, 100 MB of
 * payload, or a 100-second flush interval; up to 10 bulk requests may be in
 * flight concurrently, with constant 1s/3-retry backoff on rejection.
 *
 * <p>The original wrapped this in a try/catch whose handler dereferenced the
 * still-null {@code bulkProcessor} (guaranteed NPE) and could return null;
 * none of the builder calls throw checked exceptions, so the guard is gone.
 *
 * @param client the high-level REST client used to execute bulk requests
 * @return a configured, ready-to-use BulkProcessor (never null)
 */
private BulkProcessor getBulkProcessor(RestHighLevelClient client) {
    BulkProcessor.Listener listener = new BulkProcessor.Listener() {
        @Override
        public void beforeBulk(long executionId, BulkRequest request) {
            logger.info("Try to insert data number : " + request.numberOfActions());
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
            // numberOfActions() is the request size; per-item errors surface
            // through response.hasFailures(), so log them instead of hiding them.
            if (response.hasFailures()) {
                logger.error("Bulk " + executionId + " had item failures: "
                        + response.buildFailureMessage());
            } else {
                logger.info("************** Success insert data number : "
                        + request.numberOfActions() + " , id: " + executionId);
            }
        }

        @Override
        public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
            // Whole-request failure (connection refused, timeout, ...).
            logger.error("Bulk is unsuccess : " + failure + ", executionId: " + executionId);
        }
    };

    BiConsumer<BulkRequest, ActionListener<BulkResponse>> bulkConsumer =
            (request, bulkListener) -> client.bulkAsync(request, RequestOptions.DEFAULT, bulkListener);

    return BulkProcessor.builder(bulkConsumer, listener)
            .setBulkActions(5000)
            .setBulkSize(new ByteSizeValue(100L, ByteSizeUnit.MB))
            .setConcurrentRequests(10)
            .setFlushInterval(TimeValue.timeValueSeconds(100L))
            .setBackoffPolicy(BackoffPolicy.constantBackoff(TimeValue.timeValueSeconds(1L), 3))
            .build();
}
获取数据库连接工具类,导入的时候为了效率高直接使用底层的JDBC进行批量操作
import java.sql.Connection;
import java.sql.DriverManager;
/**
 * Minimal JDBC helper that opens MySQL connections for the bulk exporter.
 *
 * <p>Each call to {@link #getConn()} now returns a NEW connection. The
 * original cached the connection in a shared static field, which is not
 * thread-safe and let one caller's close() invalidate another caller's
 * connection mid-use.
 */
public class DBHelper {
    public static final String url = "jdbc:mysql://192.168.8.31:3306/position?useUnicode=true&characterEncoding=utf-8&serverTimezone=Asia/Shanghai&useSSL=false";
    public static final String name = "com.mysql.cj.jdbc.Driver";
    // NOTE(review): credentials hard-coded in source — move to external
    // configuration (environment / application.yml) before production use.
    public static final String user = "root";
    public static final String password = "root";

    /**
     * Opens a fresh connection; the caller is responsible for closing it.
     *
     * @return a new Connection, or null if driver loading / connecting failed
     */
    public static Connection getConn() {
        try {
            // Explicit driver load kept for pre-JDBC-4 compatibility; modern
            // drivers self-register via the service loader.
            Class.forName(name);
            return DriverManager.getConnection(url, user, password);
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }
}
推荐阅读
-
java实现批量导入.csv文件到mysql数据库
-
php中使用ExcelFileParser处理excel获得数据(可作批量导入到数据库使用)
-
Java利用MYSQL LOAD DATA LOCAL INFILE实现大批量导入数据到MySQL
-
使用PHPExcel实现数据批量导入到数据库
-
解决大批量数据插入mysql问题:使用mysqlimport工具从文件中导入
-
Elasticsearch-MySQL数据导入到Elasticsearch中
-
logstash从MySQL导入数据到ElasticSearch的配置
-
使用 BulkProcessor 批量导入mysql数据到Elasticsearch
-
elasticsearch 数据导入到 mysql
-
批量导入GIS数据到Elasticsearch中