From 3cef55d89f020728ca229e340daf20138e2de596 Mon Sep 17 00:00:00 2001 From: Richard Feldman Date: Tue, 23 Jul 2024 15:29:38 -0400 Subject: [PATCH] About to split out embedding index --- Cargo.lock | 4 ++ Cargo.toml | 1 + crates/semantic_index/Cargo.toml | 1 + crates/semantic_index/src/index_embeddings.rs | 0 crates/semantic_index/src/index_summaries.rs | 0 crates/semantic_index/src/semantic_index.rs | 41 ++++++++++++++----- 6 files changed, 36 insertions(+), 11 deletions(-) create mode 100644 crates/semantic_index/src/index_embeddings.rs create mode 100644 crates/semantic_index/src/index_summaries.rs diff --git a/Cargo.lock b/Cargo.lock index b301a2c0bd..52e9fde7ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -306,6 +306,9 @@ name = "arrayvec" version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" +dependencies = [ + "serde", +] [[package]] name = "as-raw-xcb-connection" @@ -9587,6 +9590,7 @@ name = "semantic_index" version = "0.1.0" dependencies = [ "anyhow", + "arrayvec", "blake3", "client", "clock", diff --git a/Cargo.toml b/Cargo.toml index 7feb4610cc..cda658d587 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -284,6 +284,7 @@ zed_actions = { path = "crates/zed_actions" } alacritty_terminal = "0.23" any_vec = "0.13" anyhow = "1.0.57" +arrayvec = { version = "0.7.4", features = ["serde"] } ashpd = "0.9.1" async-compression = { version = "0.4", features = ["gzip", "futures-io"] } async-dispatcher = { version = "0.1" } diff --git a/crates/semantic_index/Cargo.toml b/crates/semantic_index/Cargo.toml index fda964f5c5..12fa1d8d44 100644 --- a/crates/semantic_index/Cargo.toml +++ b/crates/semantic_index/Cargo.toml @@ -19,6 +19,7 @@ crate-type = ["bin"] [dependencies] anyhow.workspace = true +arrayvec.workspace = true blake3.workspace = true client.workspace = true clock.workspace = true diff --git a/crates/semantic_index/src/index_embeddings.rs b/crates/semantic_index/src/index_embeddings.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/crates/semantic_index/src/index_summaries.rs b/crates/semantic_index/src/index_summaries.rs new file mode 100644 index 0000000000..e69de29bb2 diff --git a/crates/semantic_index/src/semantic_index.rs b/crates/semantic_index/src/semantic_index.rs index fb776cbc1a..0dde357155 100644 --- a/crates/semantic_index/src/semantic_index.rs +++ b/crates/semantic_index/src/semantic_index.rs @@ -3,6 +3,7 @@ mod embedding; mod project_index_debug_view; use anyhow::{anyhow, Context as _, Result}; +use arrayvec::ArrayString; use chunking::{chunk_text, Chunk}; use collections::{Bound, HashMap, HashSet}; use completion::CompletionProvider; @@ -495,7 +496,8 @@ struct WorktreeIndex { worktree: Model, db_connection: heed::Env, embedding_db: heed::Database>, - summary_db: heed::Database, // Key: BLAKE3 hash of source code. Val: LLM summary of that code. + file_digest_db: heed::Database>, // Key: file path. Val: BLAKE3 digest of its contents. + summary_db: heed::Database, // Key: BLAKE3 digest of a file's contents. Val: LLM summary of those contents. language_registry: Arc, fs: Arc, embedding_provider: Arc, @@ -516,7 +518,7 @@ impl WorktreeIndex { ) -> Task>> { let worktree_abs_path = worktree.read(cx).abs_path(); cx.spawn(|mut cx| async move { - let (db, summary_db) = cx + let (db, file_digest_db, summary_db) = cx .background_executor() .spawn({ let db_connection = db_connection.clone(); @@ -526,6 +528,14 @@ impl WorktreeIndex { let db_name = worktree_abs_path.to_string_lossy(); db_connection.create_database(&mut txn, Some(&db_name))? }; + let file_digest_db = { + let db_name = + // Prepend something that wouldn't be found at the beginning of an + // absolute path, so we don't get db key namespace conflicts with + // embeddings, which use the abs path as a key. + format!("digests-{}", worktree_abs_path.to_string_lossy()); + db_connection.create_database(&mut txn, Some(&db_name))? + }; let summary_db = { let db_name = // Prepend something that wouldn't be found at the beginning of an @@ -535,15 +545,17 @@ impl WorktreeIndex { db_connection.create_database(&mut txn, Some(&db_name))? }; txn.commit()?; - anyhow::Ok((db, summary_db)) + anyhow::Ok((db, file_digest_db, summary_db)) } }) .await?; + cx.new_model(|cx| { Self::new( worktree, db_connection, db, + file_digest_db, summary_db, status_tx, language_registry, @@ -560,6 +572,7 @@ impl WorktreeIndex { worktree: Model, db_connection: heed::Env, embedding_db: heed::Database>, + file_digest_db: heed::Database>, summary_db: heed::Database, status: channel::Sender<()>, language_registry: Arc, @@ -582,6 +595,7 @@ impl WorktreeIndex { db_connection, embedding_db, summary_db, + file_digest_db, worktree, language_registry, fs, @@ -664,15 +678,9 @@ impl WorktreeIndex { } fn summarize_code(code: &str, cx: &AppContext) -> impl Future> { + let start = std::time::Instant::now(); let provider = CompletionProvider::global(cx); - let model = if provider - .available_models() - .contains(&PREFERRED_SUMMARIZATION_MODEL) - { - PREFERRED_SUMMARIZATION_MODEL - } else { - provider.model() - }; + let model = PREFERRED_SUMMARIZATION_MODEL; const PROMPT_BEFORE_CODE: &str = "Summarize this code in 3 sentences, using no newlines or bullet points in the summary:"; let prompt = format!("{PROMPT_BEFORE_CODE}\n{code}"); @@ -702,6 +710,7 @@ impl WorktreeIndex { answer.push_str(chunk?.as_str()); } + log::info!("Code summarization took {:?}", start.elapsed()); Ok(answer) }) } @@ -1208,6 +1217,16 @@ struct EmbeddedChunk { embedding: Embedding, } +/// This is what blake3's to_hex() method returns - see https://docs.rs/blake3/1.5.3/src/blake3/lib.rs.html#246 +type Blake3Digest = ArrayString<{ blake3::OUT_LEN * 2 }>; + +#[derive(Debug, Serialize, Deserialize)] +struct FileDigest { + path: Arc, + mtime: Option, + digest: Blake3Digest, +} + struct SummarizeFiles { files: channel::Receiver, task: Task>,