solr的facet源码解读（二）——facet.field

程序员文章站 2022-04-03 19:11:17

...

facet.field比facet.query要复杂得多，参数也更多，下面来看看代码，入口方法是：SimpleFacets.getFacetFieldCounts()

/**
 * Computes the facet counts for every field requested through
 * {@link FacetParams#FACET_FIELD} and merges them into a single list.
 *
 * <p>One task is created per facet field and handed to an executor. The
 * {@link FacetParams#FACET_THREADS} request parameter (default 0) controls
 * how the tasks run: 0 selects {@code directExecutor}, any other value selects
 * {@code facetExecutor}; a positive value additionally caps the number of
 * tasks in flight through a semaphore. Because the unit of work is a whole
 * field, a request with a single facet field gains nothing from this setting.
 *
 * @return the per-field facet results, in the order the tasks were submitted;
 *         empty when no facet field was requested
 * @throws IOException declared by the signature
 * @throws SyntaxError declared by the signature
 * @throws SolrException with SERVER_ERROR when the calling thread is
 *         interrupted or a task fails with a non-runtime exception
 */
public NamedList<Object> getFacetFieldCounts() throws IOException, SyntaxError {
	NamedList<Object> res = new SimpleOrderedMap<>();
	String[] facetFs = params.getParams(FacetParams.FACET_FIELD);
	if (null == facetFs) {
		return res;
	}
	// 0 means "run directly"; a non-zero value switches to facetExecutor.
	int maxThreads = req.getParams().getInt(FacetParams.FACET_THREADS, 0);
	Executor executor = maxThreads == 0 ? directExecutor : facetExecutor;
	// A non-positive value leaves concurrency effectively unbounded (Integer.MAX_VALUE permits);
	// a positive value limits the number of concurrently submitted tasks to maxThreads.
	// NOTE(review): the original walkthrough describes facetExecutor as an unbounded,
	// cached-style pool whose real limit comes from this semaphore - confirm against its definition.
	final Semaphore semaphore = new Semaphore((maxThreads <= 0) ? Integer.MAX_VALUE : maxThreads);
	// One future per facet field.
	List<Future<NamedList>> futures = new ArrayList<>(facetFs.length);
	try {
		for (String f : facetFs) {
			// parseParams is defined outside this block; the code below reads localParams, key,
			// facetValue and docs right after it, so it is presumably what populates them.
			parseParams(FacetParams.FACET_FIELD, f);
			// Optional explicit term list taken from the local params; null when there are none.
			final String termList = localParams == null ? null : localParams.get(CommonParams.TERMS);
			// Snapshot the per-field state into finals so the task does not read fields
			// that the next loop iteration may overwrite.
			final String workerKey = key;
			final String workerFacetValue = facetValue;
			final DocSet workerBase = this.docs;
			// The unit of work submitted to the executor: counts for exactly one facet field.
			Callable<NamedList> callable = new Callable<NamedList>() {
				@Override
				public NamedList call() throws Exception {
					try {
						NamedList<Object> result = new SimpleOrderedMap<>();
						if (termList != null) {
							// Count only the explicitly listed terms.
							List<String> terms = StrUtils.splitSmart(termList, ",", true);
							result.add(workerKey, getListedTermCounts(workerFacetValue, workerBase, terms));
						} else {
							// Regular path: count the terms of the field within the base doc set.
							result.add(workerKey, getTermCounts(workerFacetValue, workerBase));
						}
						return result;
					} catch (SolrException se) {
						throw se;
					} catch (Exception e) {
						throw new SolrException(ErrorCode.SERVER_ERROR,
								"Exception during facet.field: " + workerFacetValue, e);
					} finally {
						// Hand the permit back so the submitting loop can schedule the next field.
						semaphore.release();
					}
				}
			};
			RunnableFuture<NamedList> runnableFuture = new FutureTask<>(callable);
			semaphore.acquire();// may block and/or interrupt
			executor.execute(runnableFuture);// releases semaphore when done
			futures.add(runnableFuture);
		} // facetFs loop
		// Loop over futures to get the values. The order is the same as facetFs but shouldn't matter.
		for (Future<NamedList> future : futures) {
			res.addAll(future.get());
		}
		assert semaphore.availablePermits() >= maxThreads;
	} catch (InterruptedException e) {
		// Restore the interrupt flag before translating the exception, otherwise callers
		// further up the stack can no longer observe that this thread was interrupted.
		Thread.currentThread().interrupt();
		throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,"Error while processing facet fields: InterruptedException", e);
	} catch (ExecutionException ee) {
		Throwable e = ee.getCause();// unwrap
		if (e instanceof RuntimeException) {
			throw (RuntimeException) e;
		}
		throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,"Error while processing facet fields: " + e.toString(), e);
	}
	return res;
}

上面可以看到，对于多个facet.field是可以使用一个线程池的，将多个facet.field提交给不同的线程并行处理，可以提高速度，控制使用多线程的参数是facet.threads，但是如果只有一个facet.field的话，就没有用了，因为它只有一个任务。

下面最关键的就是getTermCounts方法了

/**
 * Counts the terms of {@code field} over the documents in {@code base}.
 * <p>
 * The per-field minimum count is looked up here and passed on to the
 * three-argument overload, which performs the actual faceting.
 *
 * @param field the field to facet on
 * @param base the document set the counts are restricted to
 * @return the term counts produced by the three-argument overload
 * @throws IOException propagated from the three-argument overload
 * @see FacetParams#FACET_MINCOUNT
 */
public NamedList<Integer> getTermCounts(String field, DocSet base) throws IOException {
	// Smallest number of matching documents a term needs in order to be reported.
	final Integer requestedMinCount = params.getFieldInt(field, FacetParams.FACET_MINCOUNT);
	return getTermCounts(field, requestedMinCount, base);
}

/**
 * Computes the term counts for one facet field: reads the per-field facet
 * parameters, picks a faceting method (ENUM, FCS or FC) through a series of
 * order-dependent overrides, and dispatches to the matching implementation.
 *
 * @param field the field to facet on
 * @param mincount minimum number of matching docs a term needs to be returned;
 *        when null it is derived from {@link FacetParams#FACET_ZEROS}
 * @param base the doc set to count within (per the original walkthrough: the
 *        set determined by q and fq)
 * @return the term counts; an empty list when the limit is 0
 * @throws IOException declared by the signature
 * @throws SolrException with BAD_REQUEST when a non-empty prefix is given for
 *         a numeric field handled by the FCS method
 */
private NamedList<Integer> getTermCounts(String field, Integer mincount, DocSet base) throws IOException {
	
	int offset = params.getFieldInt(field, FacetParams.FACET_OFFSET, 0);// paging offset
	int limit = params.getFieldInt(field, FacetParams.FACET_LIMIT, 100);// how many entries to return
	if (limit == 0)
		return new NamedList<>();
	
	if (mincount == null) {
		// Decide whether terms that match no doc should be collected:
		// FACET_ZEROS explicitly false => mincount 1, otherwise 0.
		Boolean zeros = params.getFieldBool(field, FacetParams.FACET_ZEROS);
		mincount = (zeros != null && !zeros) ? 1 : 0;
	}
	// Whether to also report docs that have no value in this field (original note: returned as a
	// count under a null term).
	boolean missing = params.getFieldBool(field, FacetParams.FACET_MISSING, false);
	// default to sorting if there is a limit: when no sort is given, limit > 0 sorts by count
	// (number of matching docs), otherwise by index order.
	String sort = params.getFieldParam(field, FacetParams.FACET_SORT,limit > 0 ? FacetParams.FACET_SORT_COUNT : FacetParams.FACET_SORT_INDEX);
	String prefix = params.getFieldParam(field, FacetParams.FACET_PREFIX);// prefix that terms must start with

	NamedList<Integer> counts;
	SchemaField sf = searcher.getSchema().getField(field);
	FieldType ft = sf.getType();

	// Determine the faceting method. Start from the explicitly requested one (if any);
	// the checks below may override it, and their order matters.
	final String methodStr = params.getFieldParam(field, FacetParams.FACET_METHOD);
	FacetMethod method = null;
	if (FacetParams.FACET_METHOD_enum.equals(methodStr)) {
		method = FacetMethod.ENUM;// not covered by the original walkthrough
	} else if (FacetParams.FACET_METHOD_fcs.equals(methodStr)) {
		method = FacetMethod.FCS;
	} else if (FacetParams.FACET_METHOD_fc.equals(methodStr)) {
		method = FacetMethod.FC;
	}
	// ENUM was requested but the field type has a main value prefix: fall back to FC for
	// multi-valued fields and to FCS otherwise.
	if (method == FacetMethod.ENUM && TrieField.getMainValuePrefix(ft) != null) {
		method = sf.multiValued() ? FacetMethod.FC : FacetMethod.FCS;
	}
	if (method == null && ft instanceof BoolField) {
		// Always use filters for booleans... we know the number of values is very small.
		method = FacetMethod.ENUM;
	}
	// True when the field is multi-valued or its type reports multiValuedFieldCache()
	// (original note: i.e. multi-valued or tokenized).
	final boolean multiToken = sf.multiValued() || ft.multiValuedFieldCache();
	if (method == null && ft.getNumericType() != null && !sf.multiValued()) {// no method chosen yet and the field is a single-valued numeric: prefer FCS
		// the per-segment approach is optimal for numeric field types since there are no global ords to merge and no need to create an expensive top-level reader
		method = FacetMethod.FCS;// original note: FCS can only facet single-valued fields
	}
	if (ft.getNumericType() != null && sf.hasDocValues()) {// numeric with docValues: FCS is set regardless of what was chosen above
		// only fcs is able to leverage the numeric field caches
		method = FacetMethod.FCS;
	}
	if (method == null) {// none of the rules above applied: default to FC
		method = FacetMethod.FC;
	}
	if (method == FacetMethod.FCS && multiToken) {// FCS cannot handle multi-token fields, so switch to FC
		method = FacetMethod.FC;
	}
	// ENUM on a field with docValues is replaced by FC.
	if (method == FacetMethod.ENUM && sf.hasDocValues()) {
		method = FacetMethod.FC;
	}
	if (params.getFieldBool(field, GroupParams.GROUP_FACET, false)) {// grouped faceting; not covered by the original walkthrough
	        counts = getGroupedCounts(searcher, base, field, multiToken, offset, limit, mincount, missing, sort,prefix);
	} else {
		assert method != null;
		switch (method) {
		case ENUM:
			assert TrieField.getMainValuePrefix(ft) == null;
			counts = getFacetTermEnumCounts(searcher, base, field, offset, limit, mincount, missing, sort, prefix);
			break;
		case FCS:// only reached for fields that are not multi-token (see the assert below)
			assert !multiToken;
			if (ft.getNumericType() != null/* && !sf.multiValued()*/) {// the commented-out condition was disabled by the walkthrough author: FCS is never kept for a multi-token field (see the switch to FC above), so the check is redundant here
				if (prefix != null && !prefix.isEmpty()) {
					throw new SolrException(ErrorCode.BAD_REQUEST, FacetParams.FACET_PREFIX + " is not supported on numeric types");
				}
				// NOTE(review): per the original walkthrough, NumericFacets avoids reading the term
				// dictionary where it can, unless too few results were found and mincount=0 was
				// requested - not verifiable from this block.
				counts = NumericFacets.getCounts(searcher, base, field, offset, limit, mincount, missing, sort);
			} else {// non-numeric single-valued field
				PerSegmentSingleValuedFaceting ps = new PerSegmentSingleValuedFaceting(searcher, base, field, offset, limit, mincount, missing, sort, prefix);
				// 'threads' is declared outside this block; 0 selects directExecutor, anything else facetExecutor.
				Executor executor = threads == 0 ? directExecutor : facetExecutor;
				ps.setNumThreads(threads);
				counts = ps.getFacetCounts(executor);
			}
			break;
		case FC:
			if (sf.hasDocValues()) {// the field has docValues
				counts = DocValuesFacets.getCounts(searcher, base, field, offset, limit, mincount, missing, sort, prefix);
			} else if (multiToken || TrieField.getMainValuePrefix(ft) != null) {// no docValues, and the field is multi-token or its type has a main value prefix
				UnInvertedField uif = UnInvertedField.getUnInvertedField(field, searcher);
				counts = uif.getCounts(searcher, base, offset, limit, mincount, missing, sort, prefix);
			} else {
				// No docValues, single-token field without a main value prefix.
				counts = getFieldCacheCounts(searcher, base, field, offset, limit, mincount, missing, sort, prefix);
			}
			break;
		default:
			throw new AssertionError();
		}
	}
	return counts;
}

从上面的代码来看，如果没有指定类型，如果是单值域的数字，则优先使用FCS，否则使用FC。如果指定了使用FCS，但是如果域是多值域的也会使用FC，所以我们只需要看一下FCS的数字的情况和FC的情况即可。在接下来的几篇博客中，会仔细地看下FCS的数字和非数字的两种情况以及FC的含有docValue的情况（不含有docValue的情况不看了，工作中都是含有docValue的）。

上一篇： PHP和.net中des加解密的实现方法_PHP教程

下一篇： python 是什么东东

solr的facet源码解读（二）——facet.field

solr中对于关键字置顶（竞价排名）、拉黑的源码实现已经实例讲解（二）

Tomcat源码解读系列（二）——Tomcat的核心组成和启动过程

Tomcat源码解读系列（二）——Tomcat的核心组成和启动过程

solr的facet源码解读（一）——facet.query

solr的facet源码解读（二）——facet.field

solr的facet源码解读（三）——facet.field之数字单值域类型

深入理解Vue中的Typescript(二)-vue_component源码解读

solr的facet源码解读（四）——facet.field之非数字单值域类型

第二章 SpringCloud Ribbon的源码解读