diff --git a/crates/semantic_index/src/project_index.rs b/crates/semantic_index/src/project_index.rs
index 5a88a3aa95..86ee04402a 100644
--- a/crates/semantic_index/src/project_index.rs
+++ b/crates/semantic_index/src/project_index.rs
@@ -359,12 +359,17 @@ impl ProjectIndex {
 
                             let bm25_score = {
                                 let corpus_stats = worktree_corpus_stats.read().unwrap();
-                                corpus_stats.calculate_bm25_score(
+                                let score = corpus_stats.calculate_bm25_score(
                                     query_term,
                                     &chunk.term_frequencies.0,
                                     bm25_params.k1,
                                     bm25_params.b,
-                                )
+                                );
+                                // Normalize by the total number of query terms so the BM25
+                                // score stays bounded for long queries. The `max(1.0)` guard
+                                // keeps an empty query from evaluating 0.0 / 0.0 (NaN).
+                                let query_len: f32 = query_term.values().map(|&tf| tf as f32).sum();
+                                score / query_len.max(1.0)
                             };
 
                             mixing_param * embedding_score + (1. - mixing_param) * bm25_score
diff --git a/crates/semantic_index/src/tfidf.rs b/crates/semantic_index/src/tfidf.rs
index 13e1876ace..f12527fa3b 100644
--- a/crates/semantic_index/src/tfidf.rs
+++ b/crates/semantic_index/src/tfidf.rs
@@ -86,7 +86,7 @@ pub trait Bm25Scorer {
                 let numerator = tf * (k1 + 1.0);
                 let denominator = tf + k1 * (1.0 - b + b * dl / avg_dl);
 
-                query_tf as f32 * idf * (numerator / denominator)
+                idf * (numerator / denominator)
             })
             .sum()
     }