CentOS 7: Full-Text Search with Sphinx
First, the principle:
You tell Sphinx which MySQL columns to full-text index, for example nickname and realname in a user table. If I search for "小华", Sphinx matches the term against nickname and realname, either exactly or fuzzily, ranks the hits by weight, and what comes back is a list of user IDs.
I then take those IDs back to the user table to fetch the detail rows; primary-key lookups are fast!
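In code the flow looks roughly like this; a minimal sketch assuming the sphinx PECL extension, with a hypothetical index name (user_index) and placeholder PDO credentials:

<?php
$sphinx = new SphinxClient();
$sphinx->SetServer('localhost', 9312);
$sphinx->SetArrayResult(true);
// step 1: full-text match against nickname/realname; ids come back ranked by weight
$result = $sphinx->Query('小华', 'user_index');
$ids = array();
if ($result !== false && !empty($result['matches'])) {
    foreach ($result['matches'] as $m) {
        $ids[] = (int)$m['id'];
    }
}
if ($ids) {
    // step 2: fast primary-key lookup for the detail rows
    $pdo = new PDO('mysql:host=localhost;dbname=test;charset=utf8', 'dbuser', 'dbpass');
    $sql = 'SELECT id, nickname, realname FROM user WHERE id IN (' . implode(',', $ids) . ')';
    $rows = $pdo->query($sql)->fetchAll(PDO::FETCH_ASSOC);
}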
One more note on word segmentation: Sphinx does not segment Chinese text itself, so segment it beforehand. Here is an open-source library on GitHub that does the job:
composer require lizhichao/word
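A quick look at what the segmenter produces; this sketch assumes the VicWord class and getWord() method shown in that package's README (check the package docs for the exact API):

<?php
require 'vendor/autoload.php';

$fc = new \Lizhichao\Word\VicWord('json');
// getWord() splits a sentence into words; each item's first element is the word
$words = $fc->getWord('萧山厂房出租600平方');
echo implode(' ', array_column($words, 0));   // e.g. 萧山 厂房 出租 600 平方

The query in the PHP code at the end of this post uses exactly such pre-segmented terms (萧山, 厂房, 出租, 平方).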
Here is my configuration:
#
# Minimal Sphinx configuration sample (clean, simple, functional)
#
source src1
{
type = mysql
sql_host = localhost
sql_user = atcfw_com
sql_pass = ju8s6JkD0jH
sql_db = atcfw_com
sql_port = 3306 # optional, default is 3306
sql_query_pre = SET NAMES utf8 # required if your database is utf8-encoded
sql_query = \
SELECT id, title, remark, dis \
FROM at_v_sphinx_resource
# the first column in sql_query must be the unique document id; attribute
# columns must also be selected, and group_id/date_added (left over from
# the stock sample) are not, so they stay disabled here
# sql_attr_uint = group_id
# sql_attr_timestamp = date_added
}
#############################################################################
## source src2
#############################################################################
source src2 :src1
{
sql_query = \
SELECT id, Name, dis \
FROM at_v_pms_sphinx_resource
}
source src3 :src1
{
sql_query = \
SELECT id, title, content, dis \
FROM at_v_sphinx_item
}
source src4 :src1
{
sql_query = \
SELECT id, title, description, content \
FROM at_news \
WHERE category_id NOT IN (21, 22, 23)
}
source src5 :src1
{
sql_query = \
SELECT id, title, content \
FROM at_baike_question
}
source src6 :src1
{
sql_query = \
SELECT id, title, remark \
FROM at_temp_resource
}
index at_resource
{
# index type
# optional, default is 'plain'
# known values are 'plain', 'distributed', and 'rt' (see samples below)
# type = plain
# document source(s) to index
# multi-value, mandatory
# document IDs must be globally unique across all sources
source = src1
# index files path and file name, without extension
# mandatory, path must be writable, extensions will be auto-appended
path = /usr/local/sphinx211/var/data/at_resource
# utf-8 charset table
charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
# index CJK characters as unigrams so Chinese matches without a segmenter
ngram_len = 1
ngram_chars = U+3000..U+2FA1F
# document attribute values (docinfo) storage mode
# optional, default is 'extern'
# known values are 'none', 'extern' and 'inline'
docinfo = extern
# dictionary type, 'crc' or 'keywords'
# crc is faster to index when no substring/wildcards searches are needed
# crc with substrings might be faster to search but is much slower to index
# (because all substrings are pre-extracted as individual keywords)
# keywords is much faster to index with substrings, and index is much (3-10x) smaller
# keywords supports wildcards, crc does not, and never will
# optional, default is 'keywords'
dict = keywords
# memory locking for cached data (.spa and .spi), to prevent swapping
# optional, default is 0 (do not mlock)
# requires searchd to be run from root
mlock = 0
# a list of morphology preprocessors to apply
# optional, default is empty
#
# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
# 'soundex', and 'metaphone'; additional preprocessors available from
# libstemmer are 'libstemmer_XXX', where XXX is algorithm code
# (see libstemmer_c/libstemmer/modules.txt)
#
# morphology = stem_en, stem_ru, soundex
# morphology = libstemmer_german
# morphology = libstemmer_sv
morphology = none
# minimum word length at which to enable stemming
# optional, default is 1 (stem everything)
#
# min_stemming_len = 1
# stopword files list (space separated)
# optional, default is empty
# contents are plain text, charset_table and stemming are both applied
#
# stopwords = @CONFDIR@/data/stopwords.txt
# wordforms file, in "mapfrom > mapto" plain text format
# optional, default is empty
#
# wordforms = @CONFDIR@/data/wordforms.txt
# tokenizing exceptions file
# optional, default is empty
#
# plain text, case sensitive, space insensitive in map-from part
# one "Map Several Words => ToASingleOne" entry per line
#
# exceptions = @CONFDIR@/data/exceptions.txt
# embedded file size limit
# optional, default is 16K
#
# exceptions, wordforms, and stopwords files smaller than this limit
# are stored in the index; otherwise, their paths and sizes are stored
#
# embedded_limit = 16K
# minimum indexed word length
# default is 1 (index everything)
min_word_len = 1
# ignored characters list
# optional, default value is empty
#
# ignore_chars = U+00AD
# minimum word prefix length to index
# optional, default is 0 (do not index prefixes)
#
# min_prefix_len = 0
# minimum word infix length to index
# optional, default is 0 (do not index infixes)
#
# min_infix_len = 0
# maximum substring (prefix or infix) length to index
# optional, default is 0 (do not limit substring length)
#
# max_substring_len = 8
# list of fields to limit prefix/infix indexing to
# optional, default value is empty (index all fields in prefix/infix mode)
#
# prefix_fields = filename
# infix_fields = url, domain
# expand keywords with exact forms and/or stars when searching fit indexes
# search-time only, does not affect indexing, can be 0 or 1
# optional, default is 0 (do not expand keywords)
#
# expand_keywords = 1
# n-gram length to index, for CJK indexing
# only supports 0 and 1 for now, other lengths to be implemented
# optional, default is 0 (disable n-grams)
#
# ngram_len = 1
# n-gram characters list, for CJK indexing
# optional, default is empty
#
# ngram_chars = U+3000..U+2FA1F
# phrase boundary characters list
# optional, default is empty
#
# phrase_boundary = ., ?, !, U+2026 # horizontal ellipsis
# phrase boundary word position increment
# optional, default is 0
#
# phrase_boundary_step = 100
# blended characters list
# blended chars are indexed both as separators and valid characters
# for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
# optional, default is empty
#
# blend_chars = +, &, U+23
# blended token indexing mode
# a comma separated list of blended token indexing variants
# known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
# optional, default is trim_none
#
# blend_mode = trim_tail, skip_pure
# whether to strip HTML tags from incoming documents
# known values are 0 (do not strip) and 1 (do strip)
# optional, default is 0
html_strip = 0
# what HTML attributes to index if stripping HTML
# optional, default is empty (do not index anything)
#
# html_index_attrs = img=alt,title; a=title;
# what HTML elements contents to strip
# optional, default is empty (do not strip element contents)
#
# html_remove_elements = style, script
# whether to preopen index data files on startup
# optional, default is 0 (do not preopen), searchd-only
#
# preopen = 1
# whether to enable in-place inversion (2x less disk, 90-95% speed)
# optional, default is 0 (use separate temporary files), indexer-only
#
# inplace_enable = 1
# in-place fine-tuning options
# optional, defaults are listed below
#
# inplace_hit_gap = 0 # preallocated hitlist gap size
# inplace_docinfo_gap = 0 # preallocated docinfo gap size
# inplace_reloc_factor = 0.1 # relocation buffer size within arena
# inplace_write_factor = 0.1 # write buffer size within arena
# whether to index original keywords along with stemmed versions
# enables "=exactform" operator to work
# optional, default is 0
#
# index_exact_words = 1
# position increment on overshort (less than min_word_len) words
# optional, allowed values are 0 and 1, default is 1
#
# overshort_step = 1
# position increment on stopword
# optional, allowed values are 0 and 1, default is 1
#
# stopword_step = 1
# hitless words list
# positions for these keywords will not be stored in the index
# optional, allowed values are 'all', or a list file name
#
# hitless_words = all
# hitless_words = hitless.txt
# detect and index sentence and paragraph boundaries
# required for the SENTENCE and PARAGRAPH operators to work
# optional, allowed values are 0 and 1, default is 0
#
# index_sp = 1
# index zones, delimited by HTML/XML tags
# a comma separated list of tags and wildcards
# required for the ZONE operator to work
# optional, default is empty string (do not index zones)
#
# index_zones = title, h*, th
# index per-document and average per-index field lengths, in tokens
# required for the BM25A(), BM25F() in expression ranker
# optional, default is 0 (do not index field lengths)
#
# index_field_lengths = 1
# regular expressions (regexps) to filter the fields and queries with
# gets applied to data source fields when indexing
# gets applied to search queries when searching
# multi-value, optional, default is empty list of regexps
#
# regexp_filter = \b(\d+)\" => \1inch
# regexp_filter = (blue|red) => color
# list of the words considered frequent with respect to bigram indexing
# optional, default is empty
#
# bigram_freq_words = the, a, i, you, my
# bigram indexing mode
# known values are none, all, first_freq, both_freq
# optional, default is none (do not index bigrams)
#
# bigram_index = both_freq
# snippet document file name prefix
# prepended to file names when generating snippets using load_files option
# WARNING, this is a prefix (not a path), trailing slash matters!
# optional, default is empty
#
# snippets_file_prefix = /mnt/mydocs/server1
# whether to apply stopwords before or after stemming
# optional, default is 0 (apply stopwords after stemming)
#
# stopwords_unstemmed = 0
# path to a global (cluster-wide) keyword IDFs file
# optional, default is empty (use local IDFs)
#
# global_idf = /usr/local/sphinx/var/global.idf
}
index at_pms_core_resource : at_resource
{
source = src2
path = /usr/local/sphinx211/var/data/at_pms_core_resource
}
index at_item : at_resource
{
source = src3
path = /usr/local/sphinx211/var/data/at_item
}
index at_news : at_resource
{
source = src4
path = /usr/local/sphinx211/var/data/at_news
}
index at_baike_question : at_resource
{
source = src5
path = /usr/local/sphinx211/var/data/at_baike_question
}
index at_temp_resource : at_resource
{
source = src6
path = /usr/local/sphinx211/var/data/at_temp_resource
}
indexer
{
mem_limit = 128M
}
searchd
{
listen = 9312
listen = 9306:mysql41
log = /usr/local/sphinx211/var/log/searchd.log
query_log = /usr/local/sphinx211/var/log/query.log
read_timeout = 5
max_children = 30
pid_file = /usr/local/sphinx211/var/log/searchd.pid
seamless_rotate = 1
preopen_indexes = 1
unlink_old = 1
workers = threads # for RT to work
binlog_path = /usr/local/sphinx211/var/data
}
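Note that searchd above also listens on port 9306 with the MySQL wire protocol (the mysql41 listener), so you can send SphinxQL to it from any MySQL client without the Sphinx API; a minimal sketch with mysqli (the query is illustrative):

<?php
// connect to searchd's SphinxQL listener, not to MySQL itself
$cn = mysqli_connect('127.0.0.1', '', '', '', 9306);
$res = mysqli_query($cn, "SELECT id FROM at_resource WHERE MATCH('厂房') LIMIT 10");
while ($row = mysqli_fetch_assoc($res)) {
    echo $row['id'], "\n";   // document ids, best matches first
}

The command-line client works the same way: mysql -h127.0.0.1 -P9306.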
Installing Sphinx on Linux
Reference links:
https://blog.csdn.net/ikscher/article/details/8478240
http://blog.51cto.com/4301862/1367864
wget http://sphinxsearch.com/files/sphinx-2.2.11-release.tar.gz
Installing the Sphinx PHP extension
Reference links:
https://blog.csdn.net/qq_34432348/article/details/70471842
Full walkthrough:
https://www.cnblogs.com/zuikeol/p/6801148.html
# install the igbinary PHP extension
http://pecl.php.net/package/igbinary
cd; \
wget http://pecl.php.net/get/igbinary-1.1.1.tgz; \
tar zxvf igbinary*; \
cd igbinary*; \
/usr/local/php/bin/phpize; \
./configure --with-php-config=/usr/local/php/bin/php-config; \
make; \
make install; \
mv /usr/local/php/lib/php/extensions/*/igbinary.so /usr/local/php/lib/php/extensions/;
vi /usr/local/php/etc/php.ini;
# add this line
extension=igbinary.so
# then save and exit vim
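To confirm PHP picked up the extension (restart php-fpm or Apache first):
/usr/local/php/bin/php -m | grep igbinary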
Build an index (run once per index, or pass --all to build every index in the config):
/usr/local/sphinx211/bin/indexer -c /usr/local/sphinx211/etc/sphinx.conf at_resource
To update an index without stopping the running searchd daemon, add --rotate:
/usr/local/sphinx211/bin/indexer -c /usr/local/sphinx211/etc/sphinx.conf at_resource --rotate
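To keep the indexes fresh without manual runs, a crontab entry could rotate everything periodically; a sketch (every 10 minutes, adjust to your write volume):
*/10 * * * * /usr/local/sphinx211/bin/indexer -c /usr/local/sphinx211/etc/sphinx.conf --all --rotate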
Here is my PHP code:
<?php
$sphinx = new SphinxClient();
$sphinx->SetServer('localhost', 9312);
$sphinx->SetMatchMode(SPH_MATCH_EXTENDED2);   // extended query syntax (@field, |, etc.)
$sphinx->SetArrayResult(true);
// SetLimits(offset, limit, max_matches); max_matches must be >= offset + limit,
// otherwise searchd rejects the query
$sphinx->SetLimits(0, 1000, 1000);
// field weights: a hit in dis scores 6, types/usearea 4, title 2
$sphinx->SetFieldWeights(array('dis' => 6, 'types' => 4, 'usearea' => 4, 'title' => 2));
// resources: each pre-segmented term is OR-matched against every weighted field
$at_resource = $sphinx->Query("@title(萧山)|@dis(萧山)|@types(萧山)|@usearea(萧山)|@title(厂房)|@dis(厂房)|@types(厂房)|@usearea(厂房)|@title(出租)|@dis(出租)|@types(出租)|@usearea(出租)|@title(平方)|@dis(平方)|@types(平方)|@usearea(平方)", 'at_resource');
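To close the loop from the top of the post, check the result and collect the matched document IDs before the primary-key lookup in MySQL; a sketch (the final SELECT is illustrative):

if ($at_resource === false) {
    die('Sphinx query failed: ' . $sphinx->GetLastError());
}
if ($sphinx->GetLastWarning()) {
    error_log('Sphinx warning: ' . $sphinx->GetLastWarning());
}
$ids = array();
if (!empty($at_resource['matches'])) {
    foreach ($at_resource['matches'] as $match) {
        $ids[] = (int)$match['id'];   // document ids, ordered by weight
    }
}
// step 2: fetch the detail rows fast by primary key, e.g.
// SELECT * FROM at_v_sphinx_resource WHERE id IN (...)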