Quick attempt to normalize the BM25 score by query length

This commit is contained in:
Jason Mancuso
2024-10-09 11:56:16 -04:00
parent 48ac888be3
commit fbd8b2b587
2 changed files with 5 additions and 3 deletions

View File

@@ -359,12 +359,14 @@ impl ProjectIndex {
let bm25_score = {
let corpus_stats =
worktree_corpus_stats.read().unwrap();
corpus_stats.calculate_bm25_score(
let score = corpus_stats.calculate_bm25_score(
query_term,
&chunk.term_frequencies.0,
bm25_params.k1,
bm25_params.b,
)
);
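// Quick hack: divide by the summed query term counts to bound the score for long queries.
// NOTE(review): if the query has no terms the divisor is 0 and the result is non-finite — confirm callers rule this out.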
score / query_term.values().sum::<u32>() as f32
};
mixing_param * embedding_score
+ (1. - mixing_param) * bm25_score

View File

@@ -86,7 +86,7 @@ pub trait Bm25Scorer {
let numerator = tf * (k1 + 1.0);
let denominator = tf + k1 * (1.0 - b + b * dl / avg_dl);
query_tf as f32 * idf * (numerator / denominator)
idf * (numerator / denominator)
})
.sum()
}