Quick attempt to normalize bm25 score by query length

2024-10-09 11:56:16 -04:00
parent 48ac888be3
commit fbd8b2b587
2 changed files with 5 additions and 3 deletions
--- a/crates/semantic_index/src/project_index.rs
+++ b/crates/semantic_index/src/project_index.rs
@@ -359,12 +359,14 @@ impl ProjectIndex {
                                        let bm25_score = {
                                            let corpus_stats =
                                                worktree_corpus_stats.read().unwrap();
-                                            corpus_stats.calculate_bm25_score(
+                                            let score = corpus_stats.calculate_bm25_score(
                                                query_term,
                                                &chunk.term_frequencies.0,
                                                bm25_params.k1,
                                                bm25_params.b,
-                                            )
+                                            );
+                                            // Quick hack to bound the score for long queries: normalize by query length (the sum of the query's term counts).
+                                            score / query_term.values().sum::<u32>() as f32
                                        };
                                        mixing_param * embedding_score
                                            + (1. - mixing_param) * bm25_score
--- a/crates/semantic_index/src/tfidf.rs
+++ b/crates/semantic_index/src/tfidf.rs
@@ -86,7 +86,7 @@ pub trait Bm25Scorer {
                let numerator = tf * (k1 + 1.0);
                let denominator = tf + k1 * (1.0 - b + b * dl / avg_dl);

-                query_tf as f32 * idf * (numerator / denominator)
+                idf * (numerator / denominator)
            })
            .sum()
    }