当前位置: 代码迷 >> java >> 如果使用BM25算法计算文档相似度,为什么Lucene不返回匹配结果?
  详细解决方案

如果使用BM25算法计算文档相似度,为什么Lucene不返回匹配结果?

热度:61   发布时间:2023-07-26 13:52:48.0

您好,我正在尝试使用Okapi BM25算法进行一些文档相似度计算。

但是我在查询类型上遇到了问题。 除了Im使用默认Queryparser时,我无法获得结果。

基本思想是为目标文档建立索引,并通过对文档内容进行查询来将它们与源文档进行比较。

这是一种非常简单的方法,但是我必须使其起作用。 如果我做某事愚蠢,请纠正我。

我的代码如下所示:

package de.paul.bm25;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DisjunctionMaxQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.similarities.Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;

public class DocumentSimilarityBM25 {

    Analyzer analyzer;
    Directory index;
    IndexWriterConfig config;
    IndexWriter writer;
    IndexReader reader;
    IndexSearcher searcher;
    Similarity similarity = new DefaultSimilarity();
    String FIELD_CONTENT = "CONTENT";

    public DocumentSimilarityBM25() throws IOException {
        analyzer = new KeywordAnalyzer();
        index  = new RAMDirectory();
        config = new IndexWriterConfig(analyzer);
        writer = new IndexWriter(index, config);
        similarity = new BM25Similarity();
    }

    public void start() {
        try {
            index();
            List<TopDocs> candidates = search();
            printResults(candidates);
        } catch (IOException | ParseException e) {
            e.printStackTrace();
        }
    }

    String[] srcDocuments = new String[]{
        "apples are tastefull",
        "apples and oranges grow an trees",
        "banana are yellow and very sweet",
        "this is a zero"
    };

    String[] trgDocuments = new String[]{
        "apples oranges and banana",
        "apples grow on appletrees",
        "bananes have much suga. " +
        "so they are high caloric",
        "bananas have a curvy form",
        "oranges have the orangecolor and are bigger than apples"
    };

    private void index() throws IOException {
        for(String target :trgDocuments) {
            addDoc(createDoc(target));  
        }
        System.out.println("Number of indexed Files:" + writer.maxDoc());
        writer.close();
    }

    private Query createQuery(Document doc) {
        final DisjunctionMaxQuery qry = new DisjunctionMaxQuery(0.0f);
        BooleanQuery bQuery = new BooleanQuery();
        PhraseQuery pQuery = new PhraseQuery();
        //MultiPhraseQuery mPhrase = new MultiPhraseQuery();

        String content = doc.get(FIELD_CONTENT);
        String[] terms = content.split("\\s");
        for(String term : terms) {
            pQuery = new PhraseQuery();
            pQuery.add(new Term(FIELD_CONTENT, term));
            bQuery.add(pQuery, Occur.SHOULD);
        }

        qry.add(bQuery);
        return qry;
    }

    private List<TopDocs> search() throws IOException, ParseException {
        List<TopDocs> candidates = new ArrayList<>();
        //Query query = new org.apache.lucene.queryparser.classic.QueryParser(FIELD_CONTENT, analyzer).parse(srcDocument);
        reader = DirectoryReader.open(index);
        searcher = new IndexSearcher(reader);
        searcher.setSimilarity(similarity);

        for(String source : srcDocuments) {
            Query query = createQuery(createDoc(source));

            System.out.println("Query:"+query.toString());
            TopDocs candidate = searcher.search(query, reader.maxDoc());
            candidates.add(candidate);
        }

        return candidates;
    }

    private void printResults(List<TopDocs> candidates) throws IOException {
        for(TopDocs candidate : candidates) {
            prinCandidate(candidate);
        }
        reader.close();
    }

    private void prinCandidate(TopDocs candidate) throws IOException {
        float maxScore = candidate.getMaxScore();
        ScoreDoc[] hits = candidate.scoreDocs;

        System.out.println("Found " + hits.length + " hits.");
        System.out.println("MaxScore:" + maxScore);

        for (int i = 0; i < hits.length; ++i) {
            int docId = hits[i].doc;
            Document d = searcher.doc(docId);
            float score = hits[i].score;

            System.out.println((i + 1)
                    + ". Score: " + score
                    + " " + d.get(FIELD_CONTENT) + "\t"
            );
        }   
    }

    private void addDoc(Document doc) throws IOException {
        writer.addDocument(doc);
        writer.commit();
    }

    private Document createDoc(String content) throws IOException {
        Document doc = new Document();
        doc.add(new TextField(FIELD_CONTENT, content, Field.Store.YES));
        return doc;
    }

}

您的分析仪出了问题。 KeywordAnalyzer将整个字段索引为单个标记。 它应该用于关键字,唯一标识符,零件号等之类的东西。

但是,您正在尝试搜索文本。 改用StandardAnalyzer ,您将开始看到结果:

public DocumentSimilarityBM25() throws IOException {
    analyzer = new StandardAnalyzer();
    index  = new RAMDirectory();
  ...
  相关解决方案