欢迎您访问程序员文章站!本站旨在为大家提供并分享程序员计算机编程知识。
您现在的位置是: 首页

elasticsearch 中文分词器

程序员文章站 2024-02-21 22:47:16
...

中文分词器

使用默认分词器查询中文。

# 可以看到,默认的 standard 分词器把每个汉字都切分成了一个单独的词,对中文很不友好,所以需要使用中文分词器
curl --location --request POST 'localhost:9200/_analyze' \
--header 'Content-Type: application/json' \
--data-raw '{
    "analyzer": "standard",
    "text": "RNG今年总冠军"
}'
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
{
    "tokens": [
        {
            "token": "rng",
            "start_offset": 0,
            "end_offset": 3,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "今",
            "start_offset": 3,
            "end_offset": 4,
            "type": "<IDEOGRAPHIC>",
            "position": 1
        },
        {
            "token": "年",
            "start_offset": 4,
            "end_offset": 5,
            "type": "<IDEOGRAPHIC>",
            "position": 2
        },
        {
            "token": "总",
            "start_offset": 5,
            "end_offset": 6,
            "type": "<IDEOGRAPHIC>",
            "position": 3
        },
        {
            "token": "冠",
            "start_offset": 6,
            "end_offset": 7,
            "type": "<IDEOGRAPHIC>",
            "position": 4
        },
        {
            "token": "军",
            "start_offset": 7,
            "end_offset": 8,
            "type": "<IDEOGRAPHIC>",
            "position": 5
        }
    ]
}

常用的分词器

  • smartCN 简单友好中英文混合的文本分词器
  • IK分词器 更智能友好的中文分词器

smartCN安装与使用

# 在 Elasticsearch 安装目录下执行安装命令,然后重启
.\bin\elasticsearch-plugin.bat install analysis-smartcn
# 重启
bin/elasticsearch.bat
# 查询
POST localhost:9200/_analyze
{
    "analyzer": "smartcn",
    "text": "RNG今年总冠军"
}

>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
{
    "tokens": [
        {
            "token": "rng",
            "start_offset": 0,
            "end_offset": 3,
            "type": "word",
            "position": 0
        },
        {
            "token": "今年",
            "start_offset": 3,
            "end_offset": 5,
            "type": "word",
            "position": 1
        },
        {
            "token": "总",
            "start_offset": 5,
            "end_offset": 6,
            "type": "word",
            "position": 2
        },
        {
            "token": "冠军",
            "start_offset": 6,
            "end_offset": 8,
            "type": "word",
            "position": 3
        }
    ]
}

IK分词

# 下载对应版本
https://github.com/medcl/elasticsearch-analysis-ik/releases
# 安装:将对应版本的压缩包解压到 plugins 目录下,然后重启

# 查询
POST localhost:9200/_analyze
{
    "analyzer": "ik_max_word",
    "text": "RNG今年总冠军"
}
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
{
    "tokens": [
        {
            "token": "rng",
            "start_offset": 0,
            "end_offset": 3,
            "type": "ENGLISH",
            "position": 0
        },
        {
            "token": "今年",
            "start_offset": 3,
            "end_offset": 5,
            "type": "CN_WORD",
            "position": 1
        },
        {
            "token": "总冠军",
            "start_offset": 5,
            "end_offset": 8,
            "type": "CN_WORD",
            "position": 2
        },
        {
            "token": "冠军",
            "start_offset": 6,
            "end_offset": 8,
            "type": "CN_WORD",
            "position": 3
        }
    ]
}
相关标签: ELK