Quick attempt to normalize the BM25 score by query length
This commit is contained in:
@@ -359,12 +359,14 @@ impl ProjectIndex {
|
||||
let bm25_score = {
|
||||
let corpus_stats =
|
||||
worktree_corpus_stats.read().unwrap();
|
||||
corpus_stats.calculate_bm25_score(
|
||||
let score = corpus_stats.calculate_bm25_score(
|
||||
query_term,
|
||||
&chunk.term_frequencies.0,
|
||||
bm25_params.k1,
|
||||
bm25_params.b,
|
||||
)
|
||||
);
|
||||
// quick hack to bound the score for long queries
|
||||
score / query_term.values().sum::<u32>() as f32
|
||||
};
|
||||
mixing_param * embedding_score
|
||||
+ (1. - mixing_param) * bm25_score
|
||||
|
||||
@@ -86,7 +86,7 @@ pub trait Bm25Scorer {
|
||||
let numerator = tf * (k1 + 1.0);
|
||||
let denominator = tf + k1 * (1.0 - b + b * dl / avg_dl);
|
||||
|
||||
query_tf as f32 * idf * (numerator / denominator)
|
||||
idf * (numerator / denominator)
|
||||
})
|
||||
.sum()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user