最近两天看了下lucene的源码,印象还停留在2.x版本,现在都4.8了,变化不小,不过换汤不换药,还是比较容易理解的。
其实关于全文检索的倒排序,逻辑是非常简单的,“空间换时间”的概念也不复杂。
写了一段示意代码,说明一下。
以下的示意代码,采用mysql作为索引文件的存储介质。
使用“二元切分”,亦即“二元” “元切” “切分”。
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
public class TestLucene {
public static Object insertToDB(String sql) {
return 0;
}
public static List<Object> getAllFromDB(String sql) {
return null;
}
public static Object getOneFromDB(String sql) {
return null;
}
public static Map<String, Integer> analyze(String text){
Map<String, Integer> words = new HashMap<String, Integer>();
for(int i = 0; i < text.length() - 1; i++) {
String word = text.substring(i, i + 1);
int freq = 1;
if(words.keySet().contains(word)) {
freq += words.get(word);
}
words.put(word, freq);
}
return words;
}
public static void buildIndex(String text) {
//保存文档
Integer docID = (Integer)insertToDB(String.format(
"insert into docs (doc=?) values ('%s')", text));
Map<String, Integer> words = analyze(text);
for(String word : words.keySet()) {
//词频
int freq = words.get(word);
//查询是否存在word
Integer wordID = (Integer)getOneFromDB(String.format(
"select id from words where word='%s'", word));
//如果没有则创建
if(wordID == null) {
wordID = (Integer)insertToDB(String.format(
"insert into words (word=?) values ('%s')", word));
}
//创建关联关系
insertToDB(String.format(
"insert into ref_doc_words (docid=?, wordid=?, freq=?) values ('%d', '%d', '%d')",
docID, wordID, freq));
}
}
public static String buidSqlInClause(Collection coll) {
StringBuffer sb = new StringBuffer("in (");
Iterator<Object> itor = coll.iterator();
while(itor.hasNext()) {
sb.append("'").append(itor.next()).append("',");
}
sb.deleteCharAt(sb.length() - 1);
sb.append(")");
return sb.toString();
}
public static List<Object> search(String keywords) {
List<Object> docIDs = null;
//切分查询词
Map<String, Integer> keys = analyze(keywords);
//查询词对应的所有wordID
List<Object> wordIDs = getAllFromDB(String.format("select id from words where word in %s",
buidSqlInClause(keys.keySet())));
//查询文档
docIDs = getAllFromDB(String.format(
"select d.id, d.doc from docs d, ref_doc_words r where r.wordid in %s order by freq desc",
buidSqlInClause(wordIDs)));
return docIDs;
}
public static void main(String[] args) {
buildIndex("100行搞定全文检索");
buildIndex("LUCENE的逻辑很简单");
buildIndex("我是为了凑足一百行");
for(Object doc : search("检索")) {
System.out.println(doc);
}
}
}
原文:http://blog.csdn.net/deltatang/article/details/28882435