lucene入门系列（一、文档预处理）

程序员文章站 2022-03-05 12:41:53

...

package com.heming.lucene.process;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

/**
 * Document pre-processing utility: converts full-width punctuation in a text
 * file to its half-width (ASCII) equivalent and then splits the result into
 * small chunk files.
 *
 * <p>
 * All file I/O goes through {@link FileReader} / {@link FileWriter} and
 * therefore uses the platform default charset.
 *
 * @author He Ming (何明)
 */
public class FilePreprocess {

	/** Name of the intermediate, punctuation-normalised file (appended to the output prefix). */
	private static final String MERGED_FILE_NAME = "output.all";

	/** A chunk is flushed once its encoded size reaches this many bytes. */
	private static final int MAX_CHUNK_BYTES = 10240;

	/** Line terminator written after every line of a chunk file. */
	private static final String CHUNK_LINE_END = "\r\n";

	/**
	 * Replacement pairs: {@code [0]} is the full-width character, {@code [1]}
	 * the half-width character that replaces it.
	 */
	private static final String[][] PUNCTUATION_PAIRS = {
			{ "，", "," },
			{ "。", "." },
			{ "〈", "<" },
			{ "〉", ">" },
			{ "‖", "|" },
			{ "《", "<" },
			{ "》", ">" },
			{ "〔", "[" },
			{ "〕", "]" },
			{ "﹖", "?" },
			{ "？", "?" },
			{ "“", "\"" },
			{ "”", "\"" },
			{ "：", ":" },
			{ "、", "," },
			{ "（", "(" },
			{ "）", ")" },
			{ "【", "[" },
			{ "】", "]" },
			{ "—", "-" },
			{ "～", "~" },
			{ "！", "!" },
			{ "‵", "'" },
			{ "①", "1" },
			{ "②", "2" },
			{ "③", "3" },
			{ "④", "4" },
			{ "⑤", "5" },
			{ "⑥", "6" },
			{ "⑦", "7" },
			{ "⑧", "8" },
			{ "⑨", "9" },
	};

	/** Lookup table built once from {@link #PUNCTUATION_PAIRS}; never modified afterwards. */
	private static final Map<Character, Character> FULL_TO_HALF = buildReplacementTable();

	/**
	 * Runs the whole pipeline: normalises punctuation into a temporary
	 * {@code output.all} file, splits that file into chunk files and finally
	 * removes the temporary file.
	 *
	 * <p>
	 * Failures are not propagated; the stack trace is printed to standard
	 * error instead.
	 *
	 * @param file
	 *            source file to process
	 * @param outputDir
	 *            output location; it is used as a plain string prefix for the
	 *            generated file names, so it must end with a path separator
	 */
	public static void preprocess(File file, String outputDir) {

		String mergedPath = outputDir + MERGED_FILE_NAME;

		try {
			splitToSmallFiles(charactorProcess(file, mergedPath), outputDir);
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			// Remove the intermediate file whether or not processing succeeded.
			File merged = new File(mergedPath);
			if (merged.exists() && !merged.delete()) {
				System.err.println("Could not delete temporary file " + merged);
			}
		}
	}

	/**
	 * Splits a file into chunk files named {@code output0.txt},
	 * {@code output1.txt}, ... Lines are never broken: a chunk is written as
	 * soon as the accumulated lines reach {@value #MAX_CHUNK_BYTES} bytes in
	 * the platform default charset. Every line is terminated with
	 * {@code "\r\n"} in the chunk files.
	 *
	 * <p>
	 * At least one chunk file is always written (an empty input produces an
	 * empty {@code output0.txt}), but no empty trailing chunk is created when
	 * the last input line happened to complete a chunk.
	 *
	 * @param file
	 *            file to split
	 * @param outputpath
	 *            string prefix prepended to each chunk file name
	 * @throws IOException
	 *             if reading the input or writing a chunk fails
	 */
	public static void splitToSmallFiles(File file, String outputpath)
			throws IOException {

		// Counter used to generate the chunk file names.
		int chunkIndex = 0;

		StringBuilder chunk = new StringBuilder();

		// try-with-resources: the reader is closed even when a write fails,
		// so the input file can be deleted afterwards.
		try (BufferedReader reader = new BufferedReader(new FileReader(file))) {

			String line;
			while ((line = reader.readLine()) != null) {

				chunk.append(line).append(CHUNK_LINE_END);

				String text = chunk.toString();
				if (text.getBytes().length >= MAX_CHUNK_BYTES) {
					writeChunk(outputpath, chunkIndex, text);
					chunkIndex++;
					chunk.setLength(0);
				}
			}
		}

		// Write whatever is left; skip an empty remainder unless nothing has
		// been written yet.
		if (chunk.length() > 0 || chunkIndex == 0) {
			writeChunk(outputpath, chunkIndex, chunk.toString());
		}
	}

	/** Writes one chunk to {@code <outputpath>output<index>.txt}. */
	private static void writeChunk(String outputpath, int index, String text)
			throws IOException {

		String name = outputpath + "output" + index + ".txt";

		try (BufferedWriter writer = new BufferedWriter(new FileWriter(name))) {
			writer.write(text);
		}
	}

	/**
	 * Copies a file line by line, replacing full-width punctuation with its
	 * half-width equivalent. Each output line is terminated with the platform
	 * line separator.
	 *
	 * @param file
	 *            file to read
	 * @param destFile
	 *            path of the file to write
	 * @return a {@link File} for {@code destFile}
	 * @throws Exception
	 *             if reading or writing fails
	 */
	public static File charactorProcess(File file, String destFile)
			throws Exception {

		// The reader is opened first so that a missing input does not leave
		// an empty destination file behind.
		try (BufferedReader reader = new BufferedReader(new FileReader(file));
				BufferedWriter writer = new BufferedWriter(new FileWriter(destFile))) {

			String line;
			while ((line = reader.readLine()) != null) {
				writer.write(replace(line));
				writer.newLine();
			}
		}

		return new File(destFile);
	}

	/** Builds the character lookup table from {@link #PUNCTUATION_PAIRS}. */
	private static Map<Character, Character> buildReplacementTable() {

		Map<Character, Character> table = new HashMap<>();

		for (String[] pair : PUNCTUATION_PAIRS) {
			table.put(pair[0].charAt(0), pair[1].charAt(0));
		}

		return table;
	}

	/**
	 * Returns {@code line} with every character listed in
	 * {@link #PUNCTUATION_PAIRS} replaced by its half-width counterpart; all
	 * other characters are copied unchanged.
	 */
	private static String replace(String line) {

		StringBuilder converted = new StringBuilder(line.length());

		for (int i = 0; i < line.length(); i++) {
			char current = line.charAt(i);
			Character half = FULL_TO_HALF.get(current);
			converted.append(half != null ? half.charValue() : current);
		}

		return converted.toString();
	}

	/**
	 * Demo entry point: pre-processes {@code d:\test.txt} into
	 * {@code d:\test\}.
	 */
	public static void main(String[] args) {

		// Source file to pre-process (about 25 KB in the original write-up).
		String inputFile = "d:\\test.txt";

		// Location of the processed files.
		String outputDir = "d:\\test\\";

		// Create the output directory if it does not exist yet.
		File dir = new File(outputDir);
		if (!dir.exists()) {
			dir.mkdirs();
		}

		preprocess(new File(inputFile), outputDir);
	}
}

lucene入门系列（一、文档预处理）

Word 利用书签来入门一下文档中的交叉引用的用法

Word/Excel文档操作API哪家强？一张表带你了解Aspose和Spire系列全功能对比

Entity Framework 6.0 入门系列第一篇

Linux Shell脚本系列教程（一）：Shell入门

ES6 入门系列（一）ES6的前世今生

不归路系列：Python入门之旅-一定要注意缩进！！！（推荐）

ASP.Net Core 2.2 MVC入门到基本使用系列 (一)

PDF文档操作API哪家强？一张表带你了解Aspose和Spire系列PDF控件全功能对比

8天入门docker系列 —— 第五天使用aspnetcore小案例熟悉容器互联和docker-compose一键部署

8天入门docker系列 —— 第二天通过一个aspnetcore程序加深对容器的理解

lucene入门系列（一、文档预处理）

Word 利用书签来入门一下文档中的交叉引用的用法

Word/Excel文档操作API哪家强？一张表带你了解Aspose和Spire系列全功能对比

Entity Framework 6.0 入门系列 第一篇

Linux Shell脚本系列教程（一）：Shell入门

ES6 入门系列 （一）ES6的前世今生

不归路系列：Python入门之旅-一定要注意缩进！！！（推荐）

ASP.Net Core 2.2 MVC入门到基本使用系列 (一)

PDF文档操作API哪家强？一张表带你了解Aspose和Spire系列PDF控件全功能对比

8天入门docker系列 —— 第五天 使用aspnetcore小案例熟悉容器互联和docker-compose一键部署

8天入门docker系列 —— 第二天 通过一个aspnetcore程序加深对容器的理解

Entity Framework 6.0 入门系列第一篇

ES6 入门系列（一）ES6的前世今生

8天入门docker系列 —— 第五天使用aspnetcore小案例熟悉容器互联和docker-compose一键部署

8天入门docker系列 —— 第二天通过一个aspnetcore程序加深对容器的理解