lucene4.0索引txt文本
程序员文章站
2022-03-28 20:56:08
...
package com.searchtxt.lucene;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;
import org.wltea.analyzer.lucene.IKAnalyzer;

/**
 * Indexes the plain-text (.txt) files of a directory into a Lucene 4.0 index.
 *
 * <p>The class is written as a JUnit 4 test: {@link #init()} opens the index
 * and {@link #createIndex()} adds one document per txt file.
 *
 * @author lijunqing
 */
public class IndexFile {

    private Directory directory;

    private String indexPath = "D://lucene/index"; // directory the index is written to

    private String dirPath = "D://lucene/test"; // directory holding the txt files

    private Analyzer analyzer = new IKAnalyzer();

    private IndexWriter indexWriter;

    /**
     * Opens the index directory and creates the {@link IndexWriter}.
     *
     * @throws IllegalStateException if the index cannot be opened; the
     *         underlying failure is kept as the cause
     */
    @Before
    public void init() {
        try {
            directory = FSDirectory.open(new File(indexPath));
            indexWriter = getIndexWriter(directory);
        } catch (Exception e) {
            // Fail here with the real cause instead of printing a message and
            // letting createIndex() hit a NullPointerException on a null writer.
            throw new IllegalStateException("索引打开异常! indexPath=" + indexPath, e);
        }
    }

    /**
     * Lists the regular files in a directory whose names end with ".txt".
     *
     * @param dirPath path of the directory to scan (not recursive)
     * @return the matching files; empty if there are none or if
     *         {@code dirPath} cannot be listed
     */
    public List<File> getFileList(String dirPath) {
        List<File> fileList = new ArrayList<File>();
        File[] files = new File(dirPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or an
            // I/O error occurs.
            return fileList;
        }
        for (File file : files) {
            // isFile() keeps sub-directories that happen to be named "*.txt" out.
            if (file.isFile() && isTxtFile(file.getName())) {
                fileList.add(file);
            }
        }
        return fileList;
    }

    /**
     * Builds the index: adds one document for every txt file under
     * {@code dirPath}, commits, and closes the writer.
     *
     * @throws Exception if a file cannot be read or the index cannot be written
     */
    @Test
    public void createIndex() throws Exception {
        try {
            for (File file : getFileList(dirPath)) {
                Document document = fileToDocument(file);
                indexWriter.addDocument(document);
                System.out.println("filename==" + document.get("filename"));
            }
            // One commit for the whole batch instead of one per document.
            indexWriter.commit();
        } finally {
            // Always release the writer (and its write lock), even on failure.
            closeWriter();
        }
    }

    /**
     * Tells whether a file name has the ".txt" extension.
     *
     * @param fileName the file name to check; may be {@code null}
     * @return {@code true} if {@code fileName} ends with ".txt"
     */
    public boolean isTxtFile(String fileName) {
        // endsWith, not lastIndexOf: "a.txt.bak" must not count as a txt file.
        return fileName != null && fileName.endsWith(".txt");
    }

    /**
     * Converts a file into a Lucene {@link Document} with the stored fields
     * "filename", "content" and "size".
     *
     * @param file the file to convert
     * @return the document describing {@code file}
     * @throws Exception if the file content cannot be read
     */
    public Document fileToDocument(File file) throws Exception {
        Document document = new Document();
        document.add(new TextField("filename", file.getName(), Store.YES));
        document.add(new TextField("content", getFileContent(file), Store.YES));
        // File.length() is the size of this file in bytes; getTotalSpace()
        // would be the size of the whole partition the file lives on.
        document.add(new LongField("size", file.length(), Store.YES));
        return document;
    }

    /**
     * Creates an {@link IndexWriter} on the given directory using this
     * class's analyzer.
     *
     * @param dir the directory to write the index to
     * @return a new index writer
     * @throws Exception if the writer cannot be created
     */
    public IndexWriter getIndexWriter(Directory dir) throws Exception {
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);
        return new IndexWriter(dir, iwc);
    }

    /**
     * Closes the index writer if one was created.
     *
     * @throws Exception if closing the writer fails
     */
    public void closeWriter() throws Exception {
        if (indexWriter != null) {
            indexWriter.close();
        }
    }

    /**
     * Reads a whole GBK-encoded text file into a string.
     *
     * <p>Every line is appended as {@code "\n" + line}, so a non-empty result
     * starts with a newline.
     *
     * @param file the file to read
     * @return the file content, or an empty string for an empty file
     * @throws Exception if the file cannot be opened or read
     */
    public String getFileContent(File file) throws Exception {
        FileInputStream in = new FileInputStream(file);
        try {
            BufferedReader br = new BufferedReader(new InputStreamReader(in, "GBK"));
            StringBuilder result = new StringBuilder();
            String line;
            // Call readLine() exactly once per iteration; calling it in both the
            // loop condition and the body silently drops every other line.
            while ((line = br.readLine()) != null) {
                result.append("\n").append(line);
            }
            return result.toString();
        } finally {
            // Closing the underlying stream releases the file handle even if
            // the reader could not be created or reading failed.
            in.close();
        }
    }
}
说明:在 Lucene 4.0 中,TextField 接收 Reader 参数的构造方法,其源代码是:
// Quoted TextField constructor taking a Reader: it has no Store parameter and always passes TYPE_NOT_STORED to the superclass.
public TextField(String name, Reader reader) { super(name, reader, TYPE_NOT_STORED); }
而 TYPE_NOT_STORED 表示该字段的内容默认不会被保存(只建立索引,不存储原文)。
所以这里先把文件的内容读成字符串,然后用:
// Quoted TextField constructor taking a String value: it passes TYPE_STORED when store is Store.YES, otherwise TYPE_NOT_STORED.
public TextField(String name, String value, Store store) { super(name, value, store == Store.YES ? TYPE_STORED : TYPE_NOT_STORED); }
这个方法。
上一篇: canvas怎样做出黑色背景的青色烟花