您现在的位置是: 首页

centos7 sphinx全文索引

程序员文章站 2024-01-01 10:22:16


     Sphinx 会把 MySQL 数据表中预先定义好的、需要全文索引的字段导出并建立索引。例如 user 表里的 nickname(昵称)和 realname(真实姓名):假如搜索"小华",Sphinx 就会在 nickname 和 realname 两个字段中进行匹配,既可以完全匹配,也可以模糊匹配;结果按权重排序,返回的是用户 id。




composer require lizhichao/word


# Minimal Sphinx configuration sample (clean, simple, functional)

# Base MySQL source. Other sources in this file inherit from it with
# "source <name> : src1" and override only sql_query.
source src1
{
	type			= mysql

	# NOTE(review): these look like real credentials; replace them with
	# placeholders before publishing this configuration.
	sql_host		= localhost
	sql_user		= atcfw_com
	sql_pass		= ju8s6JkD0jH
	sql_db			= atcfw_com
	sql_port		= 3306	# optional, default is 3306
	sql_query_pre	= SET NAMES utf8 # keep this line enabled if the database is UTF-8 encoded

	# The first column is the document ID; the remaining columns are
	# indexed as full-text fields.
	sql_query		= \
		SELECT  id,title ,remark,dis\
		FROM at_v_sphinx_resource

	# group_id and date_added are not returned by sql_query above, so these
	# attribute declarations (left over from the sample config) cannot be
	# bound to any column. Re-enable them only after adding the columns to
	# the SELECT list.
	# sql_attr_uint		= group_id
	# sql_attr_timestamp	= date_added
}

# Derived sources: each one inherits every setting from src1 and overrides
# only sql_query (first column = document ID, other columns = text fields).
source src2 : src1
{
	sql_query		= \
		SELECT  id,Name ,dis\
		FROM at_v_pms_sphinx_resource
}

source src3 : src1
{
	sql_query		= \
		SELECT  id,title ,content,dis\
		FROM  at_v_sphinx_item
}

source src4 : src1
{
	sql_query		= \
		SELECT  id,title ,description,content\
		FROM at_news\
		where category_id not in(21,22,23)
}

source src5 : src1
{
	sql_query		= \
		SELECT  id,title ,content\
		FROM at_baike_question
}

source src6 : src1
{
	sql_query		= \
		SELECT  id,title ,remark\
		FROM at_temp_resource
}
# Base index definition. It indexes src1 and carries the shared tokenizer
# settings; other indexes can inherit from it with "index <name> : at_resource".
index at_resource
{
	# index type
	# optional, default is 'plain'
	# known values are 'plain', 'distributed', and 'rt' (see samples below)
	# type			= plain

	# document source(s) to index
	# multi-value, mandatory
	# document IDs must be globally unique across all sources
	source			= src1

	# index files path and file name, without extension
	# mandatory, path must be writable, extensions will be auto-appended
	path			= /usr/local/sphinx211/var/data/at_resource

	# character table: digits, Latin letters (folded to lowercase), underscore,
	# and Cyrillic letters (folded to lowercase)
	charset_table	= 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
	# index CJK text as single-character n-grams
	ngram_len		= 1
	ngram_chars		= U+3000..U+2FA1F

	# document attribute values (docinfo) storage mode
	# optional, default is 'extern'
	# known values are 'none', 'extern' and 'inline'
	docinfo			= extern

	# dictionary type, 'crc' or 'keywords'
	# crc is faster to index when no substring/wildcards searches are needed
	# crc with substrings might be faster to search but is much slower to index
	# (because all substrings are pre-extracted as individual keywords)
	# keywords is much faster to index with substrings, and index is much (3-10x) smaller
	# keywords supports wildcards, crc does not, and never will
	# optional, default is 'keywords'
	dict			= keywords

	# memory locking for cached data (.spa and .spi), to prevent swapping
	# optional, default is 0 (do not mlock)
	# requires searchd to be run from root
	mlock			= 0

	# a list of morphology preprocessors to apply
	# optional, default is empty
	# builtin preprocessors are 'none', 'stem_en', 'stem_ru', 'stem_enru',
	# 'soundex', and 'metaphone'; additional preprocessors available from
	# libstemmer are 'libstemmer_XXX', where XXX is algorithm code
	# (see libstemmer_c/libstemmer/modules.txt)
	# morphology		= stem_en, stem_ru, soundex
	# morphology		= libstemmer_german
	# morphology		= libstemmer_sv
	morphology		= none

	# minimum word length at which to enable stemming
	# optional, default is 1 (stem everything)
	# min_stemming_len	= 1

	# stopword files list (space separated)
	# optional, default is empty
	# contents are plain text, charset_table and stemming are both applied
	# stopwords		= @CONFDIR@/data/stopwords.txt

	# wordforms file, in "mapfrom > mapto" plain text format
	# optional, default is empty
	# wordforms		= @CONFDIR@/data/wordforms.txt

	# tokenizing exceptions file
	# optional, default is empty
	# plain text, case sensitive, space insensitive in map-from part
	# one "Map Several Words => ToASingleOne" entry per line
	# exceptions		= @CONFDIR@/data/exceptions.txt

	# embedded file size limit
	# optional, default is 16K
	# exceptions, wordforms, and stopwords files smaller than this limit
	# are stored in the index; otherwise, their paths and sizes are stored
	# embedded_limit		= 16K

	# minimum indexed word length
	# default is 1 (index everything)
	min_word_len		= 1

	# ignored characters list
	# optional, default value is empty
	# ignore_chars		= U+00AD

	# minimum word prefix length to index
	# optional, default is 0 (do not index prefixes)
	# min_prefix_len		= 0

	# minimum word infix length to index
	# optional, default is 0 (do not index infixes)
	# min_infix_len		= 0

	# maximum substring (prefix or infix) length to index
	# optional, default is 0 (do not limit substring length)
	# max_substring_len	= 8

	# list of fields to limit prefix/infix indexing to
	# optional, default value is empty (index all fields in prefix/infix mode)
	# prefix_fields		= filename
	# infix_fields		= url, domain

	# expand keywords with exact forms and/or stars when searching fit indexes
	# search-time only, does not affect indexing, can be 0 or 1
	# optional, default is 0 (do not expand keywords)
	# expand_keywords		= 1

	# n-gram length to index, for CJK indexing
	# only supports 0 and 1 for now, other lengths to be implemented
	# optional, default is 0 (disable n-grams)
	# ngram_len		= 1

	# n-gram characters list, for CJK indexing
	# optional, default is empty
	# ngram_chars		= U+3000..U+2FA1F

	# phrase boundary characters list
	# optional, default is empty
	# phrase_boundary		= ., ?, !, U+2026 # horizontal ellipsis

	# phrase boundary word position increment
	# optional, default is 0
	# phrase_boundary_step	= 100

	# blended characters list
	# blended chars are indexed both as separators and valid characters
	# for instance, AT&T will result in 3 tokens ("at", "t", and "at&t")
	# optional, default is empty
	# blend_chars		= +, &, U+23

	# blended token indexing mode
	# a comma separated list of blended token indexing variants
	# known variants are trim_none, trim_head, trim_tail, trim_both, skip_pure
	# optional, default is trim_none
	# blend_mode		= trim_tail, skip_pure

	# whether to strip HTML tags from incoming documents
	# known values are 0 (do not strip) and 1 (do strip)
	# optional, default is 0
	html_strip		= 0

	# what HTML attributes to index if stripping HTML
	# optional, default is empty (do not index anything)
	# html_index_attrs	= img=alt,title; a=title;

	# what HTML elements contents to strip
	# optional, default is empty (do not strip element contents)
	# html_remove_elements	= style, script

	# whether to preopen index data files on startup
	# optional, default is 0 (do not preopen), searchd-only
	# preopen			= 1

	# whether to enable in-place inversion (2x less disk, 90-95% speed)
	# optional, default is 0 (use separate temporary files), indexer-only
	# inplace_enable		= 1

	# in-place fine-tuning options
	# optional, defaults are listed below
	# inplace_hit_gap		= 0 # preallocated hitlist gap size
	# inplace_docinfo_gap	= 0 # preallocated docinfo gap size
	# inplace_reloc_factor	= 0.1 # relocation buffer size within arena
	# inplace_write_factor	= 0.1 # write buffer size within arena

	# whether to index original keywords along with stemmed versions
	# enables "=exactform" operator to work
	# optional, default is 0
	# index_exact_words	= 1

	# position increment on overshort (less than min_word_len) words
	# optional, allowed values are 0 and 1, default is 1
	# overshort_step		= 1

	# position increment on stopword
	# optional, allowed values are 0 and 1, default is 1
	# stopword_step		= 1

	# hitless words list
	# positions for these keywords will not be stored in the index
	# optional, allowed values are 'all', or a list file name
	# hitless_words		= all
	# hitless_words		= hitless.txt

	# detect and index sentence and paragraph boundaries
	# required for the SENTENCE and PARAGRAPH operators to work
	# optional, allowed values are 0 and 1, default is 0
	# index_sp			= 1

	# index zones, delimited by HTML/XML tags
	# a comma separated list of tags and wildcards
	# required for the ZONE operator to work
	# optional, default is empty string (do not index zones)
	# index_zones		= title, h*, th

	# index per-document and average per-index field lengths, in tokens
	# required for the BM25A(), BM25F() in expression ranker
	# optional, default is 0 (do not index field lengths)
	# index_field_lengths	= 1

	# regular expressions (regexps) to filter the fields and queries with
	# gets applied to data source fields when indexing
	# gets applied to search queries when searching
	# multi-value, optional, default is empty list of regexps
	# regexp_filter		= \b(\d+)\" => \1inch
	# regexp_filter		= (blue|red) => color

	# list of the words considered frequent with respect to bigram indexing
	# optional, default is empty
	# bigram_freq_words	= the, a, i, you, my

	# bigram indexing mode
	# known values are none, all, first_freq, both_freq
	# optional, default is none (do not index bigrams)
	# bigram_index		= both_freq

	# snippet document file name prefix
	# prepended to file names when generating snippets using load_files option
	# WARNING, this is a prefix (not a path), trailing slash matters!
	# optional, default is empty
	# snippets_file_prefix	= /mnt/mydocs/server1

	# whether to apply stopwords before or after stemming
	# optional, default is 0 (apply stopwords after stemming)
	# stopwords_unstemmed	= 0

	# path to a global (cluster-wide) keyword IDFs file
	# optional, default is empty (use local IDFs)
	# global_idf		= /usr/local/sphinx/var/global.idf
}
# Derived indexes: each inherits all settings from at_resource and overrides
# only the data source and the on-disk path.
index at_pms_core_resource : at_resource
{
	source			= src2
	path			= /usr/local/sphinx211/var/data/at_pms_core_resource
}

index at_item : at_resource
{
	source			= src3
	path			= /usr/local/sphinx211/var/data/at_item
}

index at_news : at_resource
{
	source			= src4
	path			= /usr/local/sphinx211/var/data/at_news
}

index at_baike_question : at_resource
{
	source			= src5
	path			= /usr/local/sphinx211/var/data/at_baike_question
}

index at_temp_resource : at_resource
{
	source			= src6
	path			= /usr/local/sphinx211/var/data/at_temp_resource
}

	mem_limit		= 128M

	listen			= 9312
	listen			= 9306:mysql41
	log			= /usr/local/sphinx211/var/log/searchd.log
	query_log		= /usr/local/sphinx211/var/log/query.log
	read_timeout		= 5
	max_children		= 30
	pid_file		= /usr/local/sphinx211/var/log/searchd.pid
	seamless_rotate		= 1
	preopen_indexes		= 1
	unlink_old		= 1
	workers			= threads # for RT to work
	binlog_path		= /usr/local/sphinx211/var/data












  wget http://sphinxsearch.com/files/sphinx-2.2.11-release.tar.gz













   cd; \
   wget http://pecl.php.net/get/igbinary-1.1.1.tgz; \
   tar zxvf igbinary*; \
   cd igbinary*; \
   /usr/local/php/bin/phpize; \
  ./configure --with-php-config=/usr/local/php/bin/php-config; \
  make; \
  make install; \
  mv /usr/local/php/lib/php/extensions/*/igbinary.so /usr/local/php/lib/php/extensions/;
  vi /usr/local/php/etc/php.ini;




 /usr/local/sphinx211/bin/indexer   -c /usr/local/sphinx211/etc/sphinx.conf   at_resource 


  在不停止 searchd 服务的情况下重建(更新)索引,加上 --rotate 参数:



/usr/local/sphinx211/bin/indexer -c /usr/local/sphinx211/etc/sphinx.conf   at_resource --rotate


 // Example: query the at_resource index through the SphinxClient API.
 // NOTE(review): no SetServer() call is visible here, so the client's default host/port is used - confirm.
 $sphinx = new SphinxClient();
            // Return matches as a plain array.
            $sphinx->SetArrayResult ( true );
            // Arguments are 0, 1000, 100. NOTE(review): if the third argument is max_matches, 100 is below the requested limit of 1000 - confirm intent.
            $sphinx->SetLimits (0,1000,100);
            // NOTE(review): 'types' and 'usearea' are not among the columns selected by src1's sql_query (id, title, remark, dis) - verify the field names.
            $sphinx->SetFieldWeights (array('dis'=>6,'types'=>4,'usearea'=>4,'title'=>2));// Set per-field weights; a hit in `dis` is weighted 6
  // NOTE(review): the @field(...) operators presumably require an extended match mode; no SetMatchMode() call is visible - confirm the client default.
  $at_resource =$sphinx->Query("@title(萧山)|@dis(萧山)|@types(萧山)|@usearea(萧山)|@title(厂房)|@dis(厂房)|@types(厂房)|@usearea(厂房)|@title(出租)|@dis(出租)|@types(出租)|@usearea(出租)|@title(平方)|@dis(平方)|@types(平方)|@usearea(平方)",'at_resource');// resources




相关标签: sphinx

