问题描述
您好,我正在尝试使用Okapi BM25算法进行一些文档相似度计算。
但是我在查询类型上遇到了问题:除了使用默认的 QueryParser 之外,其他方式我都得不到任何结果。
基本思想是为目标文档建立索引,并通过对文档内容进行查询来将它们与源文档进行比较。
这是一种非常简单的方法,但我必须让它正常工作。如果我做了什么愚蠢的事,请纠正我。
我的代码如下所示:
package de.paul.bm25;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.QueryBuilder;
public class DocumentSimilarityBM25 {
Analyzer analyzer;
Directory index;
IndexWriterConfig config;
IndexWriter writer;
IndexReader reader;
IndexSearcher searcher;
Similarity similarity = new DefaultSimilarity();
String FIELD_CONTENT = "CONTENT";
public DocumentSimilarityBM25() throws IOException {
analyzer = new KeywordAnalyzer();
index = new RAMDirectory();
config = new IndexWriterConfig(analyzer);
writer = new IndexWriter(index, config);
similarity = new BM25Similarity();
}
public void start() {
try {
index();
List<TopDocs> candidates = search();
printResults(candidates);
} catch (IOException | ParseException e) {
e.printStackTrace();
}
}
String[] srcDocuments = new String[]{
"apples are tastefull",
"apples and oranges grow an trees",
"banana are yellow and very sweet",
"this is a zero"
};
String[] trgDocuments = new String[]{
"apples oranges and banana",
"apples grow on appletrees",
"bananes have much suga. " +
"so they are high caloric",
"bananas have a curvy form",
"oranges have the orangecolor and are bigger than apples"
};
private void index() throws IOException {
for(String target :trgDocuments) {
addDoc(createDoc(target));
}
System.out.println("Number of indexed Files:" + writer.maxDoc());
writer.close();
}
private Query createQuery(Document doc) {
final DisjunctionMaxQuery qry = new DisjunctionMaxQuery(0.0f);
BooleanQuery bQuery = new BooleanQuery();
PhraseQuery pQuery = new PhraseQuery();
//MultiPhraseQuery mPhrase = new MultiPhraseQuery();
String content = doc.get(FIELD_CONTENT);
String[] terms = content.split("\\s");
for(String term : terms) {
pQuery = new PhraseQuery();
pQuery.add(new Term(FIELD_CONTENT, term));
bQuery.add(pQuery, Occur.SHOULD);
}
qry.add(bQuery);
return qry;
}
private List<TopDocs> search() throws IOException, ParseException {
List<TopDocs> candidates = new ArrayList<>();
//Query query = new org.apache.lucene.queryparser.classic.QueryParser(FIELD_CONTENT, analyzer).parse(srcDocument);
reader = DirectoryReader.open(index);
searcher = new IndexSearcher(reader);
searcher.setSimilarity(similarity);
for(String source : srcDocuments) {
Query query = createQuery(createDoc(source));
System.out.println("Query:"+query.toString());
TopDocs candidate = searcher.search(query, reader.maxDoc());
candidates.add(candidate);
}
return candidates;
}
private void printResults(List<TopDocs> candidates) throws IOException {
for(TopDocs candidate : candidates) {
prinCandidate(candidate);
}
reader.close();
}
private void prinCandidate(TopDocs candidate) throws IOException {
float maxScore = candidate.getMaxScore();
ScoreDoc[] hits = candidate.scoreDocs;
System.out.println("Found " + hits.length + " hits.");
System.out.println("MaxScore:" + maxScore);
for (int i = 0; i < hits.length; ++i) {
int docId = hits[i].doc;
Document d = searcher.doc(docId);
float score = hits[i].score;
System.out.println((i + 1)
+ ". Score: " + score
+ " " + d.get(FIELD_CONTENT) + "\t"
);
}
}
private void addDoc(Document doc) throws IOException {
writer.addDocument(doc);
writer.commit();
}
private Document createDoc(String content) throws IOException {
Document doc = new Document();
doc.add(new TextField(FIELD_CONTENT, content, Field.Store.YES));
return doc;
}
}
1楼
问题出在您使用的分析器(Analyzer)上。
KeywordAnalyzer
将整个字段索引为单个标记。
它应该用于关键字,唯一标识符,零件号等之类的东西。
但是,您正在尝试搜索文本。
改用 StandardAnalyzer,您就会开始看到结果:
public DocumentSimilarityBM25() throws IOException {
analyzer = new StandardAnalyzer();
index = new RAMDirectory();
...