欢迎您访问程序员文章站本站旨在为大家提供分享程序员计算机编程知识!
您现在的位置是: 首页

Elasticsearch打造全文搜索引擎(二)

程序员文章站 2022-04-29 08:13:49
...

一、ES的文档、索引的CRUD操作

1. elasticsearch概念

  • 集群:一个或多个节点组织在一起
  • 节点:一个节点是集群中的一个服务器,有一个名字来标识,默认是一个随机的漫画角色的名字
  • 分片:将索引划分为多份的能力,允许水平分割和扩展容量,多个分片响应请求,提高性能和吞吐量。
  • 副本:为分片创建一份或多份拷贝的能力,在一个节点失败时其余节点可以顶上。
elasticsearch 与 mysql 的概念对照:
elasticsearch | mysql
index(索引) | 数据库
type(类型) | 表
document(文档) | 行
fields | 列

2.常用属性和类型

 

Elasticsearch打造全文搜索引擎(二)

3.内置类型

Elasticsearch打造全文搜索引擎(二)

4. CRUD操作

  • 索引的初始化操作
  • 指定分片和副本的数量
  • shards一旦设置不能修改
# Initialise the index with explicit shard and replica counts.
# number_of_shards = number of primary shards, number_of_replicas = copies per shard.
PUT lagou
{
  "settings": {
    "index": {
      "number_of_shards": 5,
      "number_of_replicas": 1
    }
  }
}

# Read settings: one index, every index, a list of indices, or no index at all.
GET lagou/_settings
GET _all/_settings
GET .kibana,lagou/_settings
GET _settings

# Update settings
PUT lagou/_settings
{
  "number_of_replicas": 2
}

# Get index information
GET _all
GET lagou

# Create / save a document
# Option 1: PUT with an explicit document id
PUT lagou/job/1
{
  "title": "python爬虫分布式开发",
  "salary_min": 15000,
  "city": "北京",
  "company": {
    "name": "百度",
    "company_addr": "北京市软件园"
  },
  "publish_date": "2019-06-15",
  "comments": 15
}

# Create a document
# Option 2: POST without an id
POST lagou/job/
{
  "title": "python django 开发工程师",
  "salary_min": 30000,
  "city": "上海",
  "company": {
    "name": "美团科技",
    "company_addr": "北京市软件园A区"
  },
  "publish_date": "2019-06-15",
  "comments": 120
}

# Fetch a document, optionally restricting which _source fields are returned
GET lagou/job/1
GET lagou/job/1?_source=title
GET lagou/job/1?_source=title,city
GET lagou/job/1?_source

# Modify a document
# Option 1: PUT the full document again under the same id
PUT lagou/job/1
{
  "title": "python爬虫分布式开发",
  "salary_min": 18000,
  "city": "广州",
  "company": {
    "name": "百度",
    "company_addr": "北京市软件园"
  },
  "publish_date": "2019-06-15",
  "comments": 15
}

# Option 2: update a single field through the _update endpoint
POST lagou/job/1/_update
{
  "doc": {
    "comments": 20
  }
}

# Delete: a single document, a type, the whole index
DELETE lagou/job/1
# NOTE(review): Elasticsearch 2.0 removed the ability to delete a type -
# confirm the next request is accepted by the cluster version in use.
DELETE lagou/job
DELETE lagou

二、mget和bulk操作

Elasticsearch打造全文搜索引擎(二)

# 批量操作

数据准备
POST lagou/job1/1
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job1/2
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job2/1
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

POST lagou/job2/2
{
  "title": "python django 开发工程师",
  "salary_min":30000,
  "city":"上海",
  "company":{
    "name":"美团科技",
    "company_addr":"北京市软件园A区"
  },
  "publish_date":"2019-06-15",
  "comments":120
}

mget批量获取
GET _mget
{
  "docs":[
      {"_index":"lagou",
       "_type":"job1",
       "_id":1
      },
      {"_index":"lagou",
       "_type":"job2",
       "_id":2
      }
    ]
}

GET lagou/_mget
{
  "docs":[
      {
       "_type":"job1",
       "_id":1
      },
      {
       "_type":"job2",
       "_id":2
      }
    ]
}

GET lagou/job1/_mget
{
  "docs":[
      {
       "_id":1
      },
      {
       "_id":2
      }
    ]
}

GET lagou/job1/_mget
{
  "ids":[1,2]
}

bulk增删改查

POST _bulk
{"index":{"_index":"lagou","_type":"job1","_id":"3"}}
{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}
{"index":{"_index":"lagou","_type":"job2","_id":"3"}}
{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}

POST _bulk
{"create":{"_index":"lagou","_type":"job1","_id":"3"}}
{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}

POST _bulk
{"delete":{"_index":"lagou","_type":"job1","_id":"3"}}

POST _bulk
{"update":{"_index":"lagou","_type":"job1","_id":"3"}}
{"doc":{"title": "python django 开发工程师","salary_min":30000,"city":"上海","company":{"name":"美团科技","company_addr":"北京市软件园A区"},"publish_date":"2019-06-15","comments":120}}

三、mapping映射和查询

1. mapping映射

Elasticsearch打造全文搜索引擎(二)

2.倒排索引

Elasticsearch打造全文搜索引擎(二)

3. 倒排索引待解决的问题

Elasticsearch打造全文搜索引擎(二)

4. 查询

Elasticsearch打造全文搜索引擎(二)

5. 操作

# mapping操作

# Create index lagou1 with an explicit mapping for type "job";
# "company" is an object field with its own nested properties.
PUT lagou1
{
  "mappings": {
    "job": {
      "properties": {
        "title": {
          "type": "text"
        },
        "salary_min": {
          "type": "integer"
        },
        "city": {
          "type": "keyword"
        },
        "company": {
          "properties": {
            "name": {
              "type": "text"
            },
            "company_addr": {
              "type": "text"
            },
            "employee_count": {
              "type": "integer"
            }
          }
        },
        "publish_date": {
          "type": "date",
          "format": "yyyy-MM-dd"
        },
        "comments": {
          "type": "integer"
        }
      }
    }
  }
}

PUT lagou1/job/1
{
  "title": "python爬虫分布式开发",
  "salary_min":15000,
  "city":"北京",
  "company":{
    "name":"百度",
    "company_addr":"北京市软件园",
    "employee_count":50
  },
  "publish_date":"2019-06-15",
  "comments":15
}

# get index mapping

GET lagou1/_mapping
GET lagou1/_mapping/job
GET _all/_mapping/job

# 查询

PUT lagou2
{
  "mappings": {
    "job":{
      "properties":{
        "title":{
          "type": "text",
          "store":true,
          "analyzer": "ik_max_word"
        },
        "company_name": {
          "type": "keyword",
          "store":true
        },
        "desc":{
          "type":"text"
        }, 
        "add_time":{
          "type":"date",
          "format":"yyyy-MM-dd"
        },
        "comments":{
          "type": "integer"
        }
      }
    }
  }
}


POST lagou2/job
{
  "title":"python django 开发工程师" ,
  "company_name":"美国科技有限公司",
  "desc":"对django的概念熟悉,熟悉python基础知识", 
  "comments":20,
  "add_time":"2017-04-01"  
}

POST lagou2/job
{
  "title":"python scrapy redis 分布式爬虫基本" ,
  "company_name":"百度科技有限公司",
  "desc":"对scrapy的概念熟悉,熟悉redis的基本操作",
  "comments":5,
  "add_time":"2017-04-15"  
} 

POST lagou2/job
{
  "title":"Elasticsearch打造搜索引擎" ,
  "company_name":"阿里巴巴科技有限公司",
  "desc":"熟悉数据结构算法,熟悉python的基本开发",
  "comments":15,
  "add_time":"2017-06-20"  
} 

POST lagou2/job
{
  "title":"python打造推荐引擎系统" ,
  "company_name":"阿里巴巴科技有限公司",
  "desc":"熟悉推荐引擎的原理以及算法、掌握C语言",
  "comments":60,
  "add_time":"2016-10-20"  
} 

# 简单查询
#查看分析器解析的结果
GET _analyze
{
  "analyzer": "ik_smart",
  "text":"Python网络开发师"
}
GET _analyze
{
  "analyzer": "ik_max_word",
  "text":"Python网络开发师"
}

#match查询 (分词查询) python 和分布式
#查询前2条结果(from=0、size=2)的title和company_name字段(desc字段的store属性不是true,因此不会返回),并按comments降序排序
GET lagou2/_search
{
 "stored_fields":["title","company_name","desc"], 
  "query":{
    "match":{
      "title":"python分布式"  
    }
  },
  "from": 0,
  "size": 2,
  "sort": [
    {
      "comments": {
        "order": "desc"
      }
    }
  ]
}

#查询comments在大于等于10、小于等于20、权重2.0的数据
GET lagou2/_search
{
  "query":{  
    "range": {
      "comments": {
        "gte": 10,
        "lte": 20,
        "boost":2.0
      }
    }
  }
}
GET lagou2/_search
{
  "query":{  
    "range": {
      "add_time": {
        "gte": "2017-04-01",
        "lte": "now"
      }
    }
  }
}

#term查询(不会对查询词做分词处理,直接按词项精确匹配,类似于keyword类型的行为)
GET lagou2/_search
{
  "query":{
    "term":{
      "title":"python"  
    }
  }
}
#terms 和用match查django分布工程  效果一样
GET lagou2/_search
{
  "query":{
    "terms":{
      "title":["django"  ,"分布"  ,"工程"  ]
    }
  }
}

#match_all
GET lagou2/_search
{
  "query":{
    "match_all":{}
  }
}
 
#match_phrase 
#短语查询
#满足所有词:既有python也有系统,两个词之间的最大间距(slop)为6个位置
GET lagou2/_search
{
  "query":{
    "match_phrase": {
      "title": {
        "query": "python系统",
        "slop":6
      }
    }
  }
}

#multi_match 多字段匹配,title的权重是desc的3倍
GET lagou2/_search
{
  "query":{
    "multi_match": { 
      "query": "python系统",
      "fields":["title^3","desc"]
    }
  }
}

# sort查询
GET lagou2/_search
{
  "query": {
    "match_all": {}
  },
  "sort": [
    {
      "comments": {
        "order": "asc"
      }
    }
  ]
}

# range范围查询
# Numeric range: comments between 20 and 60 (inclusive), boosted by 2.0
GET lagou2/_search
{
  "query": {
    "range": {
      "comments": {
        "gte": 20,
        "lte": 60,
        "boost": 2.0
      }
    }
  }
}

# Date range: add_time from 2017-06-07 up to "now"
GET lagou2/_search
{
  "query": {
    "range": {
      "add_time": {
        "gte": "2017-06-07",
        "lte": "now"
      }
    }
  }
}

#wildcard 通配符查询
GET lagou2/_search
{
  "query":{  
    "wildcard": {
      "title": {
        "value": "pyth*n",
        "boost": 2
      }
    }
  }
}

# 组合查询
#bool 查询
#用 bool 包括 must should must_not filter来完成
#格式如下
#bool:{
#  "filter":[], #不参与打分
#  "must":[],  #相当于        (salary=20 and title=Python)
#  "should":[], #相当于       (salary=20 or title=Python)
#  "must_not":[], #相当于not
#}

#建立测试数据
# Bulk-index four test documents (ids 1-4) into lagou/testjob for the bool query examples.
POST lagou/testjob/_bulk
{"index":{"_id":1}}
{"salary":10,"title":"Python"}
{"index":{"_id":2}}
{"salary":20,"title":"Scrapy"}
{"index":{"_id":3}}
{"salary":30,"title":"Django"}
{"index":{"_id":4}}
{"salary":30,"title":"Elasticsearch"}

# Cleanup request: run it only after the lagou/testjob queries that follow,
# otherwise those queries have no data to match.
# NOTE(review): Elasticsearch 2.0 removed the ability to delete a type -
# confirm this request is accepted by the cluster version in use.
DELETE lagou/testjob

#简单的过滤查询
#最简单的filter查询
#select * from testjob where salary=20 and title='Scrapy'
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "must": {
        "match":{
          "salary":20
        }
      }, 
      "filter":{ 
        "match":{
          "title":"Scrapy"
        }
      }
    }
  }
}
#select * from testjob
#where (salary=20 or title=Python) and salary!=30 and salary!=10
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "should":[
          {"term":{"salary":20}},
          {"term":{"title":"python"}}
        ],
      "must_not": [
        {"term": {"salary": "30"}},
        {"term": {"salary": "10"}}
      ] 
    }
  }
}

#where (salary=30 and title="django") or title="python"
GET lagou/testjob/_search
{
  "query":{
    "bool": { 
      "should":[
          {"term":{"title":"python"}},
          {"bool": { 
            "must":[
                {"term":{"salary":30}},
                {"term":{"title":"django"}}
              ] 
          }}
        ] 
    }
  }
}

#测试数据
POST lagou/testjob2/_bulk
{"index":{"_id":1}}
{"tags":["search"]}
{"index":{"_id":2}}
{"tags":["search","python"]}
{"index":{"_id":3}}
{"other_filed":["some data"]}
{"index":{"_id":4}}
{"tags":null}
{"index":{"_id":5}}
{"tags":["search",null]}

#处理null空值的方法
#select tags from testjob2 where tags is not null
GET lagou/testjob2/_search
{
  "query": {
    "bool": {
      "filter": {
        "exists": {
          "field": "tags"
        }
      }
    }
  }
}
#select tags from testjob2 where tags is null
GET lagou/testjob2/_search
{
  "query": {
    "bool": {
      "must_not": {
        "exists": {
          "field": "tags"
        }
      }
    }
  }
} 

 

gitee地址https://gitee.com/zhangyafeii/ArticleSpider_LcvSearch