`
bevis.cn
  • 浏览: 149604 次
  • 性别: Icon_minigender_1
  • 来自: 杭州
社区版块
存档分类
最新评论

Lucene2.0中最常用的基本操作

阅读更多
<script>function StorePage(){d=document;t=d.selection?(d.selection.type!='None'?d.selection.createRange().text:''):(d.getSelection?d.getSelection():'');void(keyit=window.open('http://www.365key.com/storeit.aspx?t='+escape(d.title)+'&u='+escape(d.location.href)+'&c='+escape(t),'keyit','scrollbars=no,width=475,height=575,left=75,top=20,status=no,resizable=yes'));keyit.focus();}</script>

我已经在两个项目中接触过 Lucene。由于当时使用的 Lucene 版本是 2.0,所以这里介绍一下 2.0 中的一些常用操作:

package com.wisekernel.em.business.index.impl;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

/**
 * Maintains and searches a Lucene (2.0 API) index whose documents each
 * describe one PDF page of a periodical.
 *
 * <p>The index-mutating entry points ({@link #addIndex}, {@link #deleteIndex},
 * {@link #modifyIndex}, {@link #deleteIndexByPage}) are {@code synchronized}
 * on this instance. The class is also a {@link Runnable}: {@link #run()}
 * re-indexes every periodical of the configured magazine.
 */
public class IndexManager implements IndexIF, Runnable {

    private final Log log = LogFactory.getLog(this.getClass());

    private PathUtil pathUtil;

    private PersistenceIF persistence;

    private String magazineId;

    /** Creates an unconfigured manager; collaborators are supplied through the setters. */
    public IndexManager() {
    }

    /**
     * @param mid         id of the magazine that {@link #run()} re-indexes
     * @param persistence store used to look up pages and periodicals
     * @param pathUtil    supplies the index folder and the extracted-text folder
     */
    public IndexManager(String mid, PersistenceIF persistence, PathUtil pathUtil) {
        this.magazineId = mid;
        this.persistence = persistence;
        this.pathUtil = pathUtil;
    }

    /**
     * Adds one document per indexable page of the given periodical, then
     * optimizes the index.
     *
     * <p>The writer is opened with {@code create=false}, so the index must
     * already exist. An {@link IOException} raised while writing is logged
     * and NOT rethrown (best effort, as before); only a failure to close the
     * directory propagates.
     *
     * @param perodicalId id of the periodical whose pages are indexed
     * @throws IOException if closing the index directory fails
     */
    public synchronized void addIndex(final String perodicalId) throws IOException {
        log.info("in addIndex");
        Directory indexDir = null;
        IndexWriter indexWriter = null;
        try {
            indexDir = getDirectory();
            indexWriter = new IndexWriter(indexDir, getAnalyzer(), false);
            addDocument(indexWriter, perodicalId);
            indexWriter.optimize();
            indexWriter.close();
            indexWriter = null; // closed cleanly; nothing left to release
        } catch (IOException e) {
            log.error("addIndex failed for periodical " + perodicalId, e);
            releaseAfterFailure(indexDir, indexWriter);
        } finally {
            // indexDir is still null when getDirectory() itself failed.
            if (indexDir != null) {
                indexDir.close();
            }
        }
    }

    /** Closes a half-used writer and force-unlocks the directory; never throws. */
    private void releaseAfterFailure(Directory indexDir, IndexWriter indexWriter) {
        if (indexWriter != null) {
            try {
                indexWriter.close();
            } catch (Exception ex) {
                log.error("could not close index writer after failure", ex);
            }
        }
        if (indexDir != null) {
            try {
                IndexReader.unlock(indexDir);
            } catch (Exception ex) {
                log.error("could not unlock index directory after failure", ex);
            }
        }
    }

    /**
     * Deletes every document whose {@code periodicalId} field equals the
     * given id, then optimizes the index.
     *
     * @param perodicalId id of the periodical to remove from the index
     * @throws IOException if the index cannot be opened, modified or closed
     */
    public synchronized void deleteIndex(String perodicalId) throws IOException {
        log.info("deleteIndex(String perodicalId) begin..");
        log.info("perodicalId:" + perodicalId);
        deleteByTerm(new Term("periodicalId", perodicalId));
        log.info("deleteIndex(String perodicalId) end..");
    }

    /**
     * Deletes every document whose {@code pdfPageId} field equals the given
     * id, then optimizes the index.
     *
     * @param pdfPageId id of the PDF page to remove from the index
     * @throws IOException if the index cannot be opened, modified or closed
     */
    public synchronized void deleteIndexByPage(String pdfPageId) throws IOException {
        log.info("deleteIndex(String pdfPageId) begin..");
        log.info("pdfPageId:" + pdfPageId);
        deleteByTerm(new Term("pdfPageId", pdfPageId));
        log.info("deleteIndex(String pdfPageId) end..");
    }

    /**
     * Deletes all documents matching {@code term} and optimizes the index,
     * releasing the reader, writer and directory even when a step fails.
     */
    private void deleteByTerm(Term term) throws IOException {
        Directory indexDir = getDirectory();
        try {
            IndexReader reader = IndexReader.open(indexDir);
            try {
                // Forcibly clears any lock on the directory before deleting.
                // NOTE(review): presumably meant to recover from stale locks;
                // confirm no other process writes to this index concurrently.
                IndexReader.unlock(indexDir);
                int num = reader.deleteDocuments(term);
                log.info("delete num:" + num);
            } finally {
                reader.close();
            }

            IndexWriter indexWriter = new IndexWriter(indexDir, getAnalyzer(), false);
            try {
                indexWriter.optimize();
            } finally {
                indexWriter.close();
            }
        } finally {
            indexDir.close();
        }
    }

    /**
     * Re-indexes a periodical by deleting its documents and adding them again.
     *
     * @param perodicalId id of the periodical to re-index
     * @throws IOException if the delete step fails or the directory cannot be closed
     */
    public synchronized void modifyIndex(String perodicalId) throws IOException {
        log.info("modifyIndex(String perodicalId) begin..");
        deleteIndex(perodicalId);
        addIndex(perodicalId);
        log.info("modifyIndex(String perodicalId) end..");
    }

    /** Opens the index directory, creating the folder on disk if it is missing. */
    private Directory getDirectory() throws IOException {
        File homePath = new File(pathUtil.getIndexFolderPath().getFile().getAbsolutePath());
        if (!homePath.exists()) {
            homePath.mkdirs();
        }
        return FSDirectory.getDirectory(homePath.getAbsolutePath(), false);
    }

    /** Returns the analyzer used for every writer opened by this class. */
    private static Analyzer getAnalyzer() {
        return XFactory.getWriterAnalyzer();
    }

    /**
     * Adds one document per page of the periodical to {@code indexWriter}.
     * Pages flagged as ad pages, post-ad pages, or having a non-zero
     * cross-page number are skipped. Any exception aborts the remaining
     * pages and is logged, not rethrown.
     */
    @SuppressWarnings("unchecked")
    private void addDocument(IndexWriter indexWriter, String periodicalId) {
        log.info("begin add docuemnt");
        log.info("periodicalId:" + periodicalId);
        Collection pdfCol = getPagesByPId(periodicalId);
        // Created once per call instead of once per page; it stays a local
        // because SimpleDateFormat is not thread-safe.
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
        try {
            for (Iterator iter = pdfCol.iterator(); iter.hasNext();) {
                PdfPage element = (PdfPage) iter.next();
                if (element.isSetadpage() || element.isPostad()
                        || element.getCrosspagenum() != 0) {
                    continue;
                }
                String pathstr = pathUtil.getPdfTxtPath().getFile().getAbsolutePath()
                        + File.separator + element.getMaterial().getNewname();

                Document document = new Document();
                // UN_TOKENIZED: indexed as a single exact term.
                // NO: stored only. TOKENIZED: run through the analyzer.
                document.add(new Field("periodicalId", periodicalId,
                        Field.Store.YES, Field.Index.UN_TOKENIZED));
                document.add(new Field("materialId", element.getMaterial().getId(),
                        Field.Store.YES, Field.Index.NO));
                document.add(new Field("pdfPageId", element.getId(),
                        Field.Store.YES, Field.Index.UN_TOKENIZED));
                document.add(new Field("pagenum", String.valueOf(element.getPagenum()),
                        Field.Store.YES, Field.Index.NO));
                document.add(new Field("crossnum", String.valueOf(element.getCrosspagenum()),
                        Field.Store.YES, Field.Index.NO));
                String currentTxt = readTxt(pathstr, element.getPagenum());
                document.add(new Field("isad", Boolean.toString(element.isSetadpage()),
                        Field.Store.YES, Field.Index.TOKENIZED));
                document.add(new Field("pdfPageText", currentTxt,
                        Field.Store.YES, Field.Index.TOKENIZED));
                document.add(new Field("magazineName",
                        element.getMaterial().getMagazine().getMagazinename(),
                        Field.Store.YES, Field.Index.TOKENIZED));
                String publishDate = sdf.format(element.getPeriodical().getPublishDate());
                document.add(new Field("publishDate", publishDate,
                        Field.Store.YES, Field.Index.TOKENIZED));

                indexWriter.addDocument(document);
                log.info("add one document");
            }
            log.info("Build Index Success...");
        } catch (Exception e) {
            log.error("add document error", e);
        }
    }

    /**
     * Reads {@code <pathDir>/<pagenum>.txt} and returns its lines joined
     * with {@code '\n'}.
     *
     * @return the file content; an empty string when {@code pathDir} is not a
     *         directory; whatever was read so far when reading fails
     */
    private String readTxt(String pathDir, int pagenum) {
        StringBuilder buffer = new StringBuilder();
        File dir = new File(pathDir);
        if (!dir.isDirectory()) {
            return buffer.toString();
        }
        File realTextFile = new File(pathDir + File.separator + pagenum + ".txt");
        BufferedReader reader = null;
        try {
            // NOTE(review): FileReader decodes with the platform default
            // charset; confirm that matches how the text files are written.
            reader = new BufferedReader(new FileReader(realTextFile));
            String line;
            while ((line = reader.readLine()) != null) {
                buffer.append(line).append('\n');
            }
        } catch (IOException ex) {
            log.warn("could not read page text from " + realTextFile, ex);
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ex) {
                    log.warn("could not close " + realTextFile, ex);
                }
            }
        }
        return buffer.toString();
    }

    /** Queries the persistence layer for all pages of the given periodical. */
    private Collection getPagesByPId(String pid) {
        PdfPageFilter ppf = new PdfPageFilter();
        ppf.addEqualTo(ppf.PERIODICAL_ID, pid);
        return this.persistence.query(ppf);
    }

    public PathUtil getPathUtil() {
        return pathUtil;
    }

    public void setPathUtil(PathUtil pathUtil) {
        this.pathUtil = pathUtil;
    }

    /**
     * Searches the index and fills {@code page} with the requested slice of
     * hits, the total hit count and the total page count. Hits are sorted on
     * the {@code publishDate} field.
     *
     * <p>A failure during the search itself is logged and {@code page} is
     * returned as far as it was filled; it is not rethrown.
     *
     * @param page            paging request; also receives the results
     * @param magazineTitle   optional substring to match in the magazine name
     * @param magazineContext optional substring to match in the page text
     * @return the same {@code page} instance
     * @throws IOException      if the index directory cannot be opened
     * @throws RuntimeException with message {@code "em.indexNotFind"} when the
     *                          index files are missing
     */
    @SuppressWarnings("unchecked")
    public Page getByPage(Page page, String magazineTitle, String magazineContext) throws IOException {
        log.info("Index getByPage(Page page) begin..");
        File homePath = new File(pathUtil.getIndexFolderPath().getFile().getAbsolutePath());
        Directory indexDir = FSDirectory.getDirectory(homePath.getAbsolutePath(), false);
        IndexSearcher indexSearcher = null;
        try {
            try {
                indexSearcher = new IndexSearcher(indexDir);
            } catch (FileNotFoundException e) {
                // Same message as before, now carrying the cause.
                throw new RuntimeException("em.indexNotFind", e);
            }
            try {
                Query query = getQuery(magazineTitle, magazineContext);
                Sort sort = new Sort(new SortField("publishDate", false));
                Hits hits = indexSearcher.search(query, sort);

                int start = page.getStartAtIndex();
                int end = start + page.getPs();
                Collection items = processHits(hits, start, end);
                page.setItems(items);
                page.setCount(hits.length());

                // Ceiling division: a partial last page still counts as a page.
                int totalPages = page.getCount() / page.getPs();
                if (page.getCount() % page.getPs() != 0) {
                    totalPages = totalPages + 1;
                }
                page.setTotalPage(totalPages);
            } catch (Exception e) {
                log.error("Index getByPage(Page page) failed", e);
            }
        } finally {
            if (indexSearcher != null) {
                try {
                    indexSearcher.close();
                } catch (Exception e) {
                    log.warn("could not close index searcher", e);
                }
            }
            try {
                indexDir.close();
            } catch (Exception e) {
                log.warn("could not close index directory", e);
            }
        }
        log.info("Index getByPage(Page page) end..");
        return page;
    }

    /**
     * Converts hits in the index range {@code [start, end)} into beans.
     * The range is clamped to {@code [0, hits.length()]}.
     */
    private Collection processHits(Hits hits, int start, int end) throws IOException {
        int from = Math.max(start, 0);
        int to = Math.min(end, hits.length());
        Collection<SearchedBean> articles = new ArrayList<SearchedBean>();
        for (int i = from; i < to; i++) {
            articles.add(getArticle(hits.doc(i)));
        }
        return articles;
    }

    /**
     * Copies the stored fields of {@code doc} into a bean; the page text is
     * cut to its first 200 characters.
     */
    private SearchedBean getArticle(Document doc) {
        SearchedBean searchedBean = new SearchedBean();
        searchedBean.setPeriodicalId(doc.get("periodicalId"));
        searchedBean.setCrossnum(doc.get("crossnum"));
        searchedBean.setMagazineName(doc.get("magazineName"));
        searchedBean.setMaterialId(doc.get("materialId"));
        searchedBean.setPagenum(doc.get("pagenum"));
        searchedBean.setPdfPageId(doc.get("pdfPageId"));
        searchedBean.setIsad(doc.get("isad"));
        String text = doc.get("pdfPageText");
        // doc.get returns null when the field is not stored on this document.
        if (text != null && text.length() > 200) {
            text = text.substring(0, 200);
        }
        searchedBean.setPdfPageText(text);
        searchedBean.setPublishDate(doc.get("publishDate"));
        return searchedBean;
    }

    /**
     * Builds the search query: an optional "contains" clause on the magazine
     * name, an optional "contains" clause on the page text, and a mandatory
     * clause requiring {@code isad} to be {@code "false"}.
     */
    private Query getQuery(String magazineTitle, String magazineContext) {
        BooleanQuery query = new BooleanQuery();
        addContainsClause(query, "magazineName", magazineTitle);
        addContainsClause(query, "pdfPageText", magazineContext);
        query.add(new TermQuery(new Term("isad", "false")), BooleanClause.Occur.MUST);
        return query;
    }

    /**
     * Adds a mandatory {@code *text*} wildcard clause on {@code field};
     * null, empty or whitespace-only text adds nothing.
     */
    private static void addContainsClause(BooleanQuery query, String field, String text) {
        if (text == null) {
            return;
        }
        String needle = text.trim();
        if (needle.length() == 0) {
            return;
        }
        // NOTE(review): '*' and '?' inside the user text are not escaped and
        // therefore act as wildcards themselves.
        query.add(new WildcardQuery(new Term(field, "*" + needle + "*")),
                BooleanClause.Occur.MUST);
    }

    public PersistenceIF getPersistence() {
        return persistence;
    }

    public void setPersistence(PersistenceIF persistence) {
        this.persistence = persistence;
    }

    /**
     * Re-indexes every periodical that belongs to the given magazine.
     *
     * @param magazineId id of the magazine whose periodicals are re-indexed
     * @throws IOException if re-indexing any periodical fails
     */
    public void modifyIndexByMagazine(String magazineId) throws IOException {
        // NOTE(review): the loaded magazine is never used; the lookup is kept
        // in case callers rely on it failing for unknown ids. Confirm and
        // remove if it has no such effect.
        this.persistence.get(Magazine.class, magazineId);
        PeriodicalFilter pfilter = new PeriodicalFilter();
        pfilter.addEqualTo(pfilter.MAGAZINE_ID, magazineId);
        Collection periodicalList = this.persistence.query(pfilter);
        for (Iterator it = periodicalList.iterator(); it.hasNext();) {
            Periodical p = (Periodical) it.next();
            this.modifyIndex(p.getId());
        }
    }

    /** Re-indexes the configured magazine; an {@link IOException} is logged, not rethrown. */
    public void run() {
        log.info("thread begin===================");
        try {
            modifyIndexByMagazine(magazineId);
        } catch (IOException e) {
            log.error("re-indexing magazine " + magazineId + " failed", e);
        }
    }

    public String getMagazineId() {
        return magazineId;
    }

    public void setMagazineId(String magazineId) {
        this.magazineId = magazineId;
    }
}

分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics