lucene 实现全文检索(二):创建索引
程序员文章站
2022-07-09 11:51:17
...
indexCreate() 方法可以为 doc、docx、pdf、txt、html、xml、xls、ppt 这几种类型的文档建立索引,其他扩展名的文件会被跳过。
/**
 * Builds a Lucene index over the regular files directly inside {@code targetFileDir}.
 *
 * <p>For every file whose extension is one of doc, docx, pdf, txt, html, xml, xls or ppt,
 * the text is extracted and stored in a {@code contents} field. Each indexed document also
 * carries a {@code filename} field and an {@code indexDate} field (day resolution). Files
 * with the extension {@code index}, sub-directories and files with any other extension are
 * skipped; a file whose name ends in a dot is indexed with its name and date only.
 *
 * <p>The method returns silently when {@code targetFileDir} is not an existing directory.
 * The index writer, the index directory, the analyzer and every per-file stream are closed
 * even when extraction of a file fails.
 *
 * @param targetFileDir directory holding the files to index (not searched recursively)
 * @param indexSaveDir  directory in which the Lucene index files are written
 * @throws IOException            if the directory cannot be listed, a file cannot be read,
 *                                or the index cannot be written
 * @throws InvalidFormatException kept from the original signature; whether the Apache POI
 *                                readers raise it depends on the POI version in use
 */
public static void indexCreate(File targetFileDir, File indexSaveDir) throws IOException, InvalidFormatException {
    // isDirectory() is already false for a path that does not exist.
    if (!targetFileDir.isDirectory()) {
        return;
    }
    // listFiles() returns null (not an empty array) when the directory cannot be read.
    File[] files = targetFileDir.listFiles();
    if (files == null) {
        throw new IOException("Unable to list files in " + targetFileDir);
    }

    // One shared field type: stored, tokenized, with positions and offsets.
    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldType.setStored(true);
    fieldType.setTokenized(true);

    // Resources close in reverse order: writer first, then directory, then analyzer.
    try (Analyzer analyzer = new StandardAnalyzer();
         Directory directory = FSDirectory.open(indexSaveDir.toPath());
         IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer))) {
        for (File file : files) {
            // Sub-directories cannot be opened as a stream; skip them instead of failing.
            if (!file.isFile()) {
                continue;
            }
            String fileName = file.getName();
            // The text after the last dot is the file type; Locale.ROOT keeps the
            // lower-casing independent of the JVM's default locale.
            String fileType = fileName.substring(fileName.lastIndexOf('.') + 1)
                    .toLowerCase(java.util.Locale.ROOT);
            if (fileType.equals("index")) {
                continue;
            }

            Document doc = new Document();
            if (!fileType.isEmpty()) {
                String contents = extractContents(file, fileType);
                if (contents == null) {
                    // Unsupported extension: report it and move on to the next file.
                    System.out.println("文件类型格式错误!!!");
                    continue;
                }
                doc.add(new Field("contents", contents, fieldType));
            }
            doc.add(new Field("filename", fileName, fieldType));
            doc.add(new Field("indexDate",
                    DateTools.dateToString(new Date(), DateTools.Resolution.DAY), fieldType));
            indexWriter.addDocument(doc);
        }
        // Report how many documents the writer now holds.
        System.out.println("查看IndexWriter里面有多少个索引:" + indexWriter.numDocs());
    }
}

/**
 * Extracts the indexable text of {@code file} according to its lower-cased extension.
 *
 * @return the extracted text, or {@code null} when {@code fileType} is not supported
 */
private static String extractContents(File file, String fileType) throws IOException, InvalidFormatException {
    String text = null;
    switch (fileType) {
        case "doc": {
            try (InputStream in = new FileInputStream(file)) {
                WordExtractor extractor = new WordExtractor(in);
                try {
                    text = extractor.getText();
                } finally {
                    extractor.close();
                }
            }
            break;
        }
        case "docx": {
            try (InputStream in = new FileInputStream(file)) {
                XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(in));
                try {
                    text = extractor.getText();
                } finally {
                    extractor.close();
                }
            }
            break;
        }
        case "pdf": {
            try (InputStream in = new FileInputStream(file)) {
                PDFParser parser = new PDFParser(in);
                parser.parse();
                PDDocument pdDocument = parser.getPDDocument();
                try {
                    text = new PDFTextStripper().getText(pdDocument);
                } finally {
                    pdDocument.close();
                }
            }
            break;
        }
        case "txt":
            // Text files are decoded as GBK.
            text = FileUtils.readFileToString(file, "GBK");
            break;
        case "html":
            // HTML files are decoded as GBK; only the <body> element is indexed.
            text = textInsideElement(readText(file, "GBK"), "body");
            break;
        case "xml":
            // XML files are decoded as UTF-8; only the <content> element is indexed.
            text = textInsideElement(readText(file, "UTF-8"), "content");
            break;
        case "xls": {
            try (InputStream in = new FileInputStream(file)) {
                Workbook workbook = WorkbookFactory.create(in);
                try {
                    text = spreadsheetText(workbook);
                } finally {
                    workbook.close();
                }
            }
            break;
        }
        case "ppt": {
            try (InputStream in = new FileInputStream(file)) {
                PowerPointExtractor extractor = new PowerPointExtractor(in);
                try {
                    text = extractor.getText();
                } finally {
                    extractor.close();
                }
            }
            break;
        }
        default:
            // Unsupported type: leave text as null.
            break;
    }
    return text;
}

/** Reads the whole file with the given charset name, keeping a newline after every line. */
private static String readText(File file, String encoding) throws IOException {
    StringBuilder sb = new StringBuilder();
    try (InputStream in = new FileInputStream(file);
         BufferedReader reader = new BufferedReader(new InputStreamReader(in, encoding))) {
        String line;
        while ((line = reader.readLine()) != null) {
            // The separator keeps the last word of one line apart from the first word of the next.
            sb.append(line).append('\n');
        }
    }
    return sb.toString();
}

/**
 * Returns the text between the first {@code <tag ...>} and the following {@code </tag>},
 * or the whole input when no such pair is found.
 */
private static String textInsideElement(String text, String tag) {
    String openPrefix = "<" + tag;
    int open = text.indexOf(openPrefix);
    while (open >= 0) {
        int after = open + openPrefix.length();
        // Accept "<tag>" and "<tag attr=...>", but not a longer name such as "<tags>".
        if (after < text.length()) {
            char next = text.charAt(after);
            if (next == '>' || Character.isWhitespace(next)) {
                break;
            }
        }
        open = text.indexOf(openPrefix, open + 1);
    }
    if (open < 0) {
        return text;
    }
    int openEnd = text.indexOf('>', open);
    if (openEnd < 0) {
        return text;
    }
    int contentStart = openEnd + 1;
    int contentEnd = text.indexOf("</" + tag + ">", contentStart);
    if (contentEnd < 0) {
        return text;
    }
    return text.substring(contentStart, contentEnd);
}

/** Concatenates the string form of every defined cell in every sheet, separated by spaces. */
private static String spreadsheetText(Workbook workbook) {
    StringBuilder sb = new StringBuilder();
    int numberOfSheets = workbook.getNumberOfSheets();
    for (int sheetIndex = 0; sheetIndex < numberOfSheets; sheetIndex++) {
        Sheet sheet = workbook.getSheetAt(sheetIndex);
        int lastRow = sheet.getLastRowNum();
        for (int rowIndex = 0; rowIndex <= lastRow; rowIndex++) {
            Row row = sheet.getRow(rowIndex);
            // getRow() yields null for rows that were never defined in the sheet.
            if (row == null) {
                continue;
            }
            // getLastCellNum() is the last cell index plus one, or -1 for a row without cells.
            int cellBound = row.getLastCellNum();
            for (int col = 0; col < cellBound; col++) {
                Object cell = row.getCell(col);
                if (cell != null) {
                    sb.append(cell.toString()).append(" ");
                }
            }
        }
    }
    return sb.toString();
}
上一篇: MyBatis关系-一对多关系