<script>function StorePage(){d=document;t=d.selection?(d.selection.type!='None'?d.selection.createRange().text:''):(d.getSelection?d.getSelection():'');void(keyit=window.open('http://www.365key.com/storeit.aspx?t='+escape(d.title)+'&u='+escape(d.location.href)+'&c='+escape(t),'keyit','scrollbars=no,width=475,height=575,left=75,top=20,status=no,resizable=yes'));keyit.focus();}</script>
已经在两个项目中有机会接触 Lucene。由于之前用的 Lucene 版本是 2.0,所以这里也说一下 2.0 中的一些常用操作:
package com.wisekernel.em.business.index.impl;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
/**
 * Builds, updates and searches the Lucene full-text index of magazine PDF pages.
 *
 * <p>One index document is written per non-advertisement, non-cross-page
 * {@code PdfPage}. Index-mutating methods are {@code synchronized} on this
 * instance. When used as a {@link Runnable}, the instance re-indexes every
 * periodical of the configured magazine.
 */
public class IndexManager implements IndexIF, Runnable {

    /** Pattern used to format the stored "publishDate" field. */
    private static final String PUBLISH_DATE_PATTERN = "yyyy-MM-dd";

    /** Maximum number of characters of page text copied into a search result. */
    private static final int SUMMARY_LENGTH = 200;

    private PathUtil pathUtil;
    private PersistenceIF persistence;
    private String magazineId;
    private final Log log = LogFactory.getLog(this.getClass());

    public IndexManager() {
    }

    /**
     * @param mid         id of the magazine re-indexed by {@link #run()}
     * @param persistence persistence facade used to load pages and periodicals
     * @param pathUtil    resolves the index folder and the extracted-text folder
     */
    public IndexManager(String mid, PersistenceIF persistence, PathUtil pathUtil) {
        this.magazineId = mid;
        this.persistence = persistence;
        this.pathUtil = pathUtil;
    }

    /**
     * Adds the pages of one periodical to the existing index and optimizes it.
     *
     * <p>An {@link IOException} raised while writing is logged and NOT rethrown
     * (best effort, as before); the index write lock is force-released in that
     * case. Only a failure to close the directory propagates to the caller.
     *
     * @param perodicalId id of the periodical whose pages are indexed
     * @throws IOException if closing the index directory fails
     */
    public synchronized void addIndex(final String perodicalId)
            throws IOException {
        log.info("in addIndex");
        Directory indexDir = null;
        IndexWriter indexWriter = null;
        try {
            indexDir = getDirectory();
            indexWriter = new IndexWriter(indexDir, getAnalyzer(), false);
            addDocument(indexWriter, perodicalId);
            indexWriter.optimize();
            indexWriter.close();
            indexWriter = null;
        } catch (IOException e) {
            log.error("addIndex failed, perodicalId:" + perodicalId, e);
            if (indexWriter != null) {
                // Release the writer's files; it was previously leaked on failure.
                try {
                    indexWriter.close();
                } catch (Exception closeEx) {
                    log.error("closing IndexWriter after failure failed", closeEx);
                }
            }
            // indexDir is still null when getDirectory() itself failed.
            if (indexDir != null) {
                try {
                    IndexReader.unlock(indexDir);
                } catch (Exception ex) {
                    log.error("unlocking index after failure failed", ex);
                }
            }
        } finally {
            if (indexDir != null) {
                indexDir.close();
            }
        }
    }

    /**
     * Removes every index document of the given periodical, then optimizes the index.
     *
     * @param perodicalId value of the "periodicalId" field to delete
     * @throws IOException if the index cannot be opened or modified
     */
    public synchronized void deleteIndex(String perodicalId) throws IOException {
        this.log.info("deleteIndex(String perodicalId) begin..");
        log.info("perodicalId:" + perodicalId);
        deleteByTerm(new Term("periodicalId", perodicalId));
        this.log.info("deleteIndex(String perodicalId) end..");
    }

    /**
     * Re-indexes one periodical by deleting its documents and adding them again.
     *
     * @param perodicalId id of the periodical to re-index
     * @throws IOException if the delete step fails or the directory cannot be closed
     */
    public synchronized void modifyIndex(String perodicalId) throws IOException {
        this.log.info("modifyIndex(String perodicalId) begin..");
        deleteIndex(perodicalId);
        addIndex(perodicalId);
        log.info("modifyIndex(String perodicalId) end..");
    }

    /**
     * Deletes all documents matching {@code term} and optimizes the index,
     * closing the reader, writer and directory even when a step fails.
     */
    private void deleteByTerm(Term term) throws IOException {
        Directory indexDir = getDirectory();
        try {
            IndexReader reader = IndexReader.open(indexDir);
            try {
                // NOTE(review): force-unlocking here can break another live writer on
                // the same index; kept because existing behaviour relies on it.
                IndexReader.unlock(indexDir);
                int num = reader.deleteDocuments(term);
                log.info("delete num:" + num);
            } finally {
                reader.close();
            }
            IndexWriter indexWriter = new IndexWriter(indexDir, getAnalyzer(), false);
            try {
                indexWriter.optimize();
            } finally {
                indexWriter.close();
            }
        } finally {
            indexDir.close();
        }
    }

    /** Opens the index directory, creating the folder on disk when it is missing. */
    private Directory getDirectory() throws IOException {
        File homePath = new File(pathUtil.getIndexFolderPath().getFile()
                .getAbsolutePath());
        if (!homePath.exists()) {
            homePath.mkdirs();
        }
        return FSDirectory.getDirectory(homePath.getAbsolutePath(), false);
    }

    /** Returns the analyzer used when writing to the index. */
    private static Analyzer getAnalyzer() {
        return XFactory.getWriterAnalyzer();
    }

    /**
     * Writes one index document per indexable page of the periodical.
     *
     * <p>Advertisement pages, post-ad pages and pages with a non-zero cross-page
     * number are skipped. The first failure stops the loop; it is logged and not
     * rethrown (unchanged best-effort behaviour).
     */
    @SuppressWarnings("unchecked")
    private void addDocument(IndexWriter indexWriter, String periodicalId) {
        log.info("begin add docuemnt");
        log.info("periodicalId:" + periodicalId);
        Collection pdfCol = getPagesByPId(periodicalId);
        // Local instance, so SimpleDateFormat's lack of thread-safety is not an issue.
        SimpleDateFormat sdf = new SimpleDateFormat(PUBLISH_DATE_PATTERN);
        try {
            for (Iterator iter = pdfCol.iterator(); iter.hasNext();) {
                PdfPage element = (PdfPage) iter.next();
                // Decide whether to skip before doing any per-page work.
                if (element.isSetadpage() || element.isPostad()
                        || element.getCrosspagenum() != 0) {
                    continue;
                }
                String pathstr = pathUtil.getPdfTxtPath().getFile()
                        .getAbsolutePath()
                        + File.separator + element.getMaterial().getNewname();
                Document document = new Document();
                // Field.Index.TOKENIZED = run the value through the analyzer before indexing.
                document.add(new Field("periodicalId", periodicalId,
                        Field.Store.YES, Field.Index.UN_TOKENIZED));
                document.add(new Field("materialId", element.getMaterial()
                        .getId(), Field.Store.YES, Field.Index.NO));
                document.add(new Field("pdfPageId", element.getId(),
                        Field.Store.YES, Field.Index.UN_TOKENIZED));
                document.add(new Field("pagenum", String.valueOf(element
                        .getPagenum()), Field.Store.YES, Field.Index.NO));
                document.add(new Field("crossnum", String.valueOf(element
                        .getCrosspagenum()), Field.Store.YES, Field.Index.NO));
                String currentTxt = readTxt(pathstr, element.getPagenum());
                document.add(new Field("isad", Boolean.toString(element
                        .isSetadpage()), Field.Store.YES, Field.Index.TOKENIZED));
                document.add(new Field("pdfPageText", currentTxt,
                        Field.Store.YES, Field.Index.TOKENIZED));
                document.add(new Field("magazineName", element.getMaterial()
                        .getMagazine().getMagazinename(), Field.Store.YES,
                        Field.Index.TOKENIZED));
                String publishDate = sdf.format(element.getPeriodical().getPublishDate());
                // NOTE(review): getByPage() sorts on this field; Lucene expects sort fields
                // to hold a single un-tokenized term. Confirm the analyzer keeps the date
                // as one token, or switch to UN_TOKENIZED and rebuild the index.
                document.add(new Field("publishDate", publishDate, Field.Store.YES,
                        Field.Index.TOKENIZED));
                indexWriter.addDocument(document);
                log.info("add one document");
            }
            log.info("Build Index Success...");
        } catch (Exception e) {
            log.error("add document error", e);
        }
    }

    /**
     * Reads the extracted text of one page from {@code <pathDir>/<pagenum>.txt}.
     *
     * @return the file content with each line terminated by '\n'; an empty string
     *         when {@code pathDir} is not a directory; whatever was read so far
     *         when reading fails
     */
    private String readTxt(String pathDir, int pagenum) {
        StringBuilder buffer = new StringBuilder();
        File file = new File(pathDir);
        if (!file.isDirectory()) {
            return buffer.toString();
        }
        File realTextFile = new File(pathDir + File.separator + pagenum + ".txt");
        BufferedReader is = null;
        try {
            // NOTE(review): FileReader uses the platform default charset; confirm it
            // matches the encoding of the extracted text files.
            is = new BufferedReader(new FileReader(realTextFile));
            String text;
            while ((text = is.readLine()) != null) {
                buffer.append(text).append("\n");
            }
        } catch (Exception ex) {
            // Best effort: a missing or unreadable text file yields partial/empty text.
            log.warn("cannot read page text: " + realTextFile, ex);
        } finally {
            if (is != null) {
                try {
                    is.close();
                } catch (IOException closeEx) {
                    log.warn("cannot close page text: " + realTextFile, closeEx);
                }
            }
        }
        return buffer.toString();
    }

    /** Loads the pages whose periodical id equals {@code pid}. */
    private Collection getPagesByPId(String pid) {
        PdfPageFilter ppf = new PdfPageFilter();
        ppf.addEqualTo(ppf.PERIODICAL_ID, pid);
        return this.persistence.query(ppf);
    }

    public PathUtil getPathUtil() {
        return pathUtil;
    }

    public void setPathUtil(PathUtil pathUtil) {
        this.pathUtil = pathUtil;
    }

    /**
     * Searches the index and fills {@code page} with one window of results,
     * sorted by the "publishDate" field.
     *
     * <p>A failure during the search itself is logged and the page is returned
     * as far as it was filled (unchanged best-effort behaviour). The searcher and
     * the directory are always closed.
     *
     * @param page            supplies the start index and page size; receives the
     *                        items, total hit count and total page count
     * @param magazineTitle   optional substring filter on "magazineName"
     * @param magazineContext optional substring filter on "pdfPageText"
     * @return the same {@code page} instance
     * @throws RuntimeException with message "em.indexNotFind" when opening the
     *                          index raises {@link FileNotFoundException}
     * @throws IOException      if the index cannot be opened for another reason
     */
    @SuppressWarnings("unchecked")
    public Page getByPage(Page page, String magazineTitle, String magazineContext) throws IOException {
        this.log.info("Index getByPage(Page page) begin..");
        File homePath = new File(pathUtil.getIndexFolderPath().getFile().getAbsolutePath());
        Directory indexDir = FSDirectory.getDirectory(homePath.getAbsolutePath(), false);
        IndexSearcher indexSearcher = null;
        try {
            try {
                indexSearcher = new IndexSearcher(indexDir);
            } catch (FileNotFoundException e) {
                // Same message as before; the cause is now preserved for diagnosis.
                throw new RuntimeException("em.indexNotFind", e);
            }
            try {
                Query query = getQuery(magazineTitle, magazineContext);
                Sort sort = new Sort(new SortField("publishDate", false));
                Hits hits = indexSearcher.search(query, sort);
                int start = page.getStartAtIndex();
                int end = page.getStartAtIndex() + page.getPs();
                Collection<Object[]> items = processHits(hits, start, end);
                page.setItems(items);
                page.setCount(hits.length());
                int totalPages = page.getCount() / page.getPs();
                if (page.getCount() % page.getPs() != 0) {
                    totalPages = totalPages + 1;
                }
                page.setTotalPage(totalPages);
            } catch (Exception e) {
                log.error("Index getByPage(Page page) failed", e);
            }
        } finally {
            if (indexSearcher != null) {
                try {
                    indexSearcher.close();
                } catch (IOException closeEx) {
                    log.error("closing IndexSearcher failed", closeEx);
                }
            }
            try {
                indexDir.close();
            } catch (IOException closeEx) {
                log.error("closing index directory failed", closeEx);
            }
        }
        log.info("Index getByPage(Page page) end..");
        return page;
    }

    /**
     * Converts hits in the half-open range [start, end) into result beans;
     * {@code end} is clamped to the number of hits.
     */
    private Collection processHits(Hits hits, int start, int end) throws IOException, java.text.ParseException {
        if (end >= hits.length()) {
            end = hits.length();
        }
        Collection<SearchedBean> articles = new ArrayList<SearchedBean>();
        for (int i = start; i < end; i++) {
            articles.add(getArticle(hits.doc(i)));
        }
        return articles;
    }

    /**
     * Copies the stored fields of one index document into a result bean; the
     * page text is truncated to its first 200 characters.
     */
    private SearchedBean getArticle(Document doc) throws java.text.ParseException {
        SearchedBean searchedBean = new SearchedBean();
        searchedBean.setPeriodicalId(doc.get("periodicalId"));
        searchedBean.setCrossnum(doc.get("crossnum"));
        searchedBean.setMagazineName(doc.get("magazineName"));
        searchedBean.setMaterialId(doc.get("materialId"));
        searchedBean.setPagenum(doc.get("pagenum"));
        searchedBean.setPdfPageId(doc.get("pdfPageId"));
        searchedBean.setIsad(doc.get("isad"));
        String str = doc.get("pdfPageText");
        // doc.get() returns null when the document has no such stored field.
        if (str != null && str.length() > SUMMARY_LENGTH) {
            str = str.substring(0, SUMMARY_LENGTH);
        }
        searchedBean.setPdfPageText(str);
        searchedBean.setPublishDate(doc.get("publishDate"));
        return searchedBean;
    }

    /**
     * Builds the search query: an optional "contains" wildcard clause on
     * "magazineName", an optional one on "pdfPageText", and a mandatory
     * {@code isad:false} clause. Null, empty or whitespace-only filters are ignored.
     */
    private Query getQuery(String magazineTitile, String magazineContext) {
        BooleanQuery query = new BooleanQuery();
        // NOTE(review): '*' and '?' inside the user input act as wildcards, and a
        // leading '*' forces a scan of every term in the field.
        String title = magazineTitile == null ? "" : magazineTitile.trim();
        if (!title.equals("")) {
            Query titleQuery = new WildcardQuery(new Term("magazineName", "*" + title + "*"));
            query.add(titleQuery, BooleanClause.Occur.MUST);
        }
        String context = magazineContext == null ? "" : magazineContext.trim();
        if (!context.equals("")) {
            Query contextQuery = new WildcardQuery(new Term("pdfPageText", "*" + context + "*"));
            query.add(contextQuery, BooleanClause.Occur.MUST);
        }
        query.add(new TermQuery(new Term("isad", "false")), BooleanClause.Occur.MUST);
        return query;
    }

    public PersistenceIF getPersistence() {
        return persistence;
    }

    public void setPersistence(PersistenceIF persistence) {
        this.persistence = persistence;
    }

    /**
     * Removes the index document(s) of a single PDF page, then optimizes the index.
     *
     * @param pdfPageId value of the "pdfPageId" field to delete
     * @throws IOException if the index cannot be opened or modified
     */
    public synchronized void deleteIndexByPage(String pdfPageId) throws IOException {
        this.log.info("deleteIndex(String pdfPageId) begin..");
        log.info("pdfPageId:" + pdfPageId);
        deleteByTerm(new Term("pdfPageId", pdfPageId));
        this.log.info("deleteIndex(String pdfPageId) end..");
    }

    /**
     * Re-indexes every periodical that belongs to the given magazine.
     *
     * @param magazineId id of the magazine whose periodicals are re-indexed
     * @throws IOException if re-indexing one of the periodicals fails
     */
    public void modifyIndexByMagazine(String magazineId) throws IOException {
        // NOTE(review): the loaded magazine is never used; the lookup is kept only in
        // case it has side effects (e.g. failing for an unknown id) - confirm and remove.
        this.persistence.get(Magazine.class, magazineId);
        PeriodicalFilter pfilter = new PeriodicalFilter();
        pfilter.addEqualTo(pfilter.MAGAZINE_ID, magazineId);
        Collection periodicalList = this.persistence.query(pfilter);
        for (Iterator it = periodicalList.iterator(); it.hasNext();) {
            Periodical p = (Periodical) it.next();
            this.modifyIndex(p.getId());
        }
    }

    /** Re-indexes the configured magazine; I/O failures are logged, not rethrown. */
    public void run() {
        log.info("thread begin===================");
        try {
            modifyIndexByMagazine(magazineId);
        } catch (IOException e) {
            log.error("modifyIndexByMagazine failed, magazineId:" + magazineId, e);
        }
    }

    public String getMagazineId() {
        return magazineId;
    }

    public void setMagazineId(String magazineId) {
        this.magazineId = magazineId;
    }
}
分享到:
相关推荐
全面好用的lucene 2.0 api以及lucene 3.0 api帮助文档
开发自己的搜索引擎《lucene2.0+heritrix》一书对应的源码资料,总共有30M,只上传了几个例子. ch2-lucene入门小例子 myReserch-可用的网络搜索引擎
本软件是开发自己的搜索引擎《lucene2.0+heritrix》一书对应的源码资料,不过由于代码很大,只发布其中最全的部分,各章的源码就不发了。
lucene java 搜索引擎 比较经典的全文搜索引擎,最近发现在官方网站上找不到了,放在这里给大家,方便点,多谢谢支持!
Lucene.2.0的API文档合集 是从html装成chm的
自己学习 Lucene 2.0 和 Heritrix 之后的总结
这是Lucene 2.0+Heriterx书源代码里ch7文件夹下的lib文件,里面都是ch7项目里jar包,Lucene 2.0+Heriterx书源代码-ch7lib
lucene-2.0开发专用
lucene 2.0虽然是比较老的技术,它对实时搜索支持较差,不过很多普通的搜索引擎对实时性要求并不是很高,并且对于分词上没有太多的个性化专业要求,那么这个选择将是非常好的,提供各类分词接口、存储接口、索引接口...
Lucene.net 2.0 API,Lucene.net 2.0 dll,Lucene.net学习探索着必备。
Lucene2.0+Heritrix搜索引擎 随书光盘
Lucene.Net 2.0 源码+文档
Lucene2.0+Heritrix(ch3源代码)
lucene2.0+heritrix的随书光盘,有很多好用的lucene实例,还包括一个完整的垂直搜索引擎的设计
【别下这个哈,,】 【完整版的在】《开发自己的搜索引擎-Lucene 2.0 Heritrix》 http://download.csdn.net/source/1756566 文件大小:24.60 MB
NULL 博文链接:https://fishhappy365.iteye.com/blog/456739
lucene.net 2.0的全文检索源代码
Lucene2.0+Nutch0.8 API帮助文档,以前每次查看他们的API都得通过他们的网站去获取,实在麻烦。功夫不负有心人,通过自己的努力终于获得他们CHM格式的API,现在拿来跟大家分享一下
Lucene是apache组织的一个...总的来说用Lucene来进行建立和搜索和操作数据库是差不多的,Document可以看作是数据库的一行记录,Field可以看作是数据库的字段。用lucene实现搜索引擎就像用JDBC实现连接数据库一样简单。
开发自己的搜索引擎--Lucene 2.0+Heritrix(爬虫)第10章