lucene4.0索引txt文本

程序员文章站 2022-03-28 20:56:08

...

package com.searchtxt.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Builds a Lucene 4.0 index from the plain-text ({@code .txt}) files found in a directory.
 *
 * <p>The class is written as a JUnit 4 fixture: {@link #init()} opens the index writer before
 * the test and {@link #createIndex()} indexes every text file and closes the writer again.
 *
 * @author lijunqing
 */
public class IndexFile {

    /** Charset used to decode the text files that are indexed. */
    private static final String CONTENT_CHARSET = "GBK";

    private Directory directory;

    private String indexPath = "D://lucene/index"; // directory the index files are written to

    private String dirPath = "D://lucene/test"; // directory holding the txt source files

    private Analyzer analyzer = new IKAnalyzer();

    private IndexWriter indexWriter;

    /**
     * Opens the index directory and creates the {@link IndexWriter} used by the test.
     *
     * @throws IllegalStateException if the index cannot be opened; the original failure is
     *         kept as the cause instead of being swallowed, so the test does not fail later
     *         with an unrelated NullPointerException on the writer
     */
    @Before
    public void init() {
        try {
            directory = FSDirectory.open(new File(indexPath));
            indexWriter = getIndexWriter(directory);
        } catch (Exception e) {
            throw new IllegalStateException("索引打开异常！", e);
        }
    }

    /**
     * Collects all txt files located directly in the given directory (no recursion).
     *
     * @param dirPath path of the directory to scan
     * @return the regular files whose name ends with {@code .txt}; empty when the path is not
     *         a readable directory
     */
    public List<File> getFileList(String dirPath) {
        List<File> fileList = new ArrayList<File>();
        File[] files = new File(dirPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path does not exist or is not a directory.
            return fileList;
        }
        for (File file : files) {
            // Skip sub-directories that merely happen to be named "*.txt".
            if (file.isFile() && isTxtFile(file.getName())) {
                fileList.add(file);
            }
        }
        return fileList;
    }

    /**
     * Indexes every txt file of {@code dirPath} and closes the writer afterwards.
     *
     * @throws Exception if reading a file or writing to the index fails
     */
    @Test
    public void createIndex() throws Exception {
        try {
            for (File file : getFileList(dirPath)) {
                Document document = fileToDocument(file);
                indexWriter.addDocument(document);
                System.out.println("filename==" + document.get("filename"));
            }
            // One commit for the whole batch instead of one per document.
            indexWriter.commit();
        } finally {
            // Always release the writer (and its write lock), even when indexing fails.
            closeWriter();
        }
    }

    /**
     * Tells whether a file name denotes a txt file.
     *
     * @param fileName file name to test, may be {@code null}
     * @return {@code true} if the name ends with {@code .txt}
     */
    public boolean isTxtFile(String fileName) {
        // endsWith() rejects names such as "notes.txt.bak" that merely contain ".txt".
        return fileName != null && fileName.endsWith(".txt");
    }

    /**
     * Converts a file into a Lucene {@link Document} with the fields {@code filename},
     * {@code content} and {@code size}; all three are stored.
     *
     * @param file file to convert
     * @return the document describing the file
     * @throws Exception if the file content cannot be read
     */
    public Document fileToDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new TextField("filename", file.getName(), Store.YES));
        document.add(new TextField("content", getFileContent(file), Store.YES));
        // length() is the size of this file; getTotalSpace() would be the size of the partition.
        document.add(new LongField("size", file.length(), Store.YES));
        return document;
    }

    /**
     * Creates an {@link IndexWriter} for the given directory using this class's analyzer.
     *
     * @param dir directory the index is written to
     * @return a new index writer
     * @throws Exception if the writer cannot be created
     */
    public IndexWriter getIndexWriter(Directory dir) throws Exception {
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Closes the index writer if one has been opened.
     *
     * @throws Exception if closing the writer fails
     */
    public void closeWriter() throws Exception {
        if (indexWriter != null) {
            indexWriter.close();
        }
    }

    /**
     * Reads the whole content of a text file, decoding it as GBK.
     *
     * @param file file to read
     * @return all lines of the file joined with {@code '\n'}; empty string for an empty file
     * @throws Exception if the file cannot be opened or read
     */
    public String getFileContent(File file) throws Exception {
        FileInputStream in = new FileInputStream(file);
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, CONTENT_CHARSET));
            StringBuilder content = new StringBuilder();
            boolean first = true;
            String line;
            // Read each line exactly once: calling readLine() in both the loop condition and
            // the loop body would drop every other line.
            while ((line = br.readLine()) != null) {
                if (!first) {
                    content.append('\n');
                }
                content.append(line);
                first = false;
            }
            return content.toString();
        } finally {
            // Closing the underlying stream releases the file handle on every path.
            in.close();
        }
    }
}

说明：在 Lucene 4.0 中，TextField 接收 Reader 参数的构造方法，其源代码如下：

 public TextField(String name, Reader reader) {
    super(name, reader, TYPE_NOT_STORED);
  }

而 TYPE_NOT_STORED 表示字段内容默认不会被存储到索引中。

所以如果需要存储文件内容，就要先把文件内容读取为字符串，然后使用：

  public TextField(String name, String value, Store store) {
    super(name, value, store == Store.YES ? TYPE_STORED : TYPE_NOT_STORED);
  }

这个方法。

相关标签： java lucene 搜索文本

上一篇： canvas怎样做出黑色背景的青色烟花

下一篇： php 图片添加文字水印并添加文字阴影_PHP教程

lucene4.0索引txt文本

python实现pdf转换成word/txt纯文本文件

比较好用用的pdf转txt文本文件图片文字提取工具使用介绍

python3.4.3下逐行读入txt文本并去重的方法

怎么明确区分文本文件txt里的1和l？

Mac系统下.txt格式的纯文本怎么保存？

突破搜索引擎robots.txt限制：让搜索引擎给你的网站做外链

Python将文本去空格并保存到txt文件中的实例

python 去除txt文本中的空格、数字、特定字母等方法

文本txt文件怎么批量去掉换行并添加逗号?

[20181124]关于降序索引问题3.txt

lucene4.0索引txt文本

python实现pdf转换成word/txt纯文本文件

比较好用用的pdf转txt文本文件 图片文字提取工具使用介绍

python3.4.3下逐行读入txt文本并去重的方法

怎么明确区分文本文件txt里的1和l？

Mac系统下.txt格式的纯文本怎么保存？

突破搜索引擎robots.txt限制：让搜索引擎给你的网站做外链

Python将文本去空格并保存到txt文件中的实例

python 去除txt文本中的空格、数字、特定字母等方法

文本txt文件怎么批量去掉换行并添加逗号?

[20181124]关于降序索引问题3.txt

比较好用用的pdf转txt文本文件图片文字提取工具使用介绍