About to split out embedding index

This commit is contained in:
Richard Feldman
2024-07-23 15:29:38 -04:00
parent 678b6120fd
commit 3cef55d89f
6 changed files with 36 additions and 11 deletions

4
Cargo.lock generated
View File

@@ -306,6 +306,9 @@ name = "arrayvec"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
dependencies = [
"serde",
]
[[package]]
name = "as-raw-xcb-connection"
@@ -9587,6 +9590,7 @@ name = "semantic_index"
version = "0.1.0"
dependencies = [
"anyhow",
"arrayvec",
"blake3",
"client",
"clock",

View File

@@ -284,6 +284,7 @@ zed_actions = { path = "crates/zed_actions" }
alacritty_terminal = "0.23"
any_vec = "0.13"
anyhow = "1.0.57"
arrayvec = { version = "0.7.4", features = ["serde"] }
ashpd = "0.9.1"
async-compression = { version = "0.4", features = ["gzip", "futures-io"] }
async-dispatcher = { version = "0.1" }

View File

@@ -19,6 +19,7 @@ crate-type = ["bin"]
[dependencies]
anyhow.workspace = true
arrayvec.workspace = true
blake3.workspace = true
client.workspace = true
clock.workspace = true

View File

@@ -3,6 +3,7 @@ mod embedding;
mod project_index_debug_view;
use anyhow::{anyhow, Context as _, Result};
use arrayvec::ArrayString;
use chunking::{chunk_text, Chunk};
use collections::{Bound, HashMap, HashSet};
use completion::CompletionProvider;
@@ -495,7 +496,8 @@ struct WorktreeIndex {
worktree: Model<Worktree>,
db_connection: heed::Env,
embedding_db: heed::Database<Str, SerdeBincode<EmbeddedFile>>,
summary_db: heed::Database<Str, Str>, // Key: BLAKE3 hash of source code. Val: LLM summary of that code.
file_digest_db: heed::Database<Str, SerdeBincode<FileDigest>>, // Key: file path. Val: BLAKE3 digest of its contents.
summary_db: heed::Database<Str, Str>, // Key: BLAKE3 digest of a file's contents. Val: LLM summary of those contents.
language_registry: Arc<LanguageRegistry>,
fs: Arc<dyn Fs>,
embedding_provider: Arc<dyn EmbeddingProvider>,
@@ -516,7 +518,7 @@ impl WorktreeIndex {
) -> Task<Result<Model<Self>>> {
let worktree_abs_path = worktree.read(cx).abs_path();
cx.spawn(|mut cx| async move {
let (db, summary_db) = cx
let (db, file_digest_db, summary_db) = cx
.background_executor()
.spawn({
let db_connection = db_connection.clone();
@@ -526,6 +528,14 @@ impl WorktreeIndex {
let db_name = worktree_abs_path.to_string_lossy();
db_connection.create_database(&mut txn, Some(&db_name))?
};
let file_digest_db = {
let db_name =
// Prepend something that wouldn't be found at the beginning of an
// absolute path, so this database's name can't collide with the
// embeddings database, which is named by the bare absolute path.
format!("digests-{}", worktree_abs_path.to_string_lossy());
db_connection.create_database(&mut txn, Some(&db_name))?
};
let summary_db = {
let db_name =
// Prepend something that wouldn't be found at the beginning of an
@@ -535,15 +545,17 @@ impl WorktreeIndex {
db_connection.create_database(&mut txn, Some(&db_name))?
};
txn.commit()?;
anyhow::Ok((db, summary_db))
anyhow::Ok((db, file_digest_db, summary_db))
}
})
.await?;
cx.new_model(|cx| {
Self::new(
worktree,
db_connection,
db,
file_digest_db,
summary_db,
status_tx,
language_registry,
@@ -560,6 +572,7 @@ impl WorktreeIndex {
worktree: Model<Worktree>,
db_connection: heed::Env,
embedding_db: heed::Database<Str, SerdeBincode<EmbeddedFile>>,
file_digest_db: heed::Database<Str, SerdeBincode<FileDigest>>,
summary_db: heed::Database<Str, Str>,
status: channel::Sender<()>,
language_registry: Arc<LanguageRegistry>,
@@ -582,6 +595,7 @@ impl WorktreeIndex {
db_connection,
embedding_db,
summary_db,
file_digest_db,
worktree,
language_registry,
fs,
@@ -664,15 +678,9 @@ impl WorktreeIndex {
}
fn summarize_code(code: &str, cx: &AppContext) -> impl Future<Output = Result<String>> {
let start = std::time::Instant::now();
let provider = CompletionProvider::global(cx);
let model = if provider
.available_models()
.contains(&PREFERRED_SUMMARIZATION_MODEL)
{
PREFERRED_SUMMARIZATION_MODEL
} else {
provider.model()
};
let model = PREFERRED_SUMMARIZATION_MODEL;
const PROMPT_BEFORE_CODE: &str = "Summarize this code in 3 sentences, using no newlines or bullet points in the summary:";
let prompt = format!("{PROMPT_BEFORE_CODE}\n{code}");
@@ -702,6 +710,7 @@ impl WorktreeIndex {
answer.push_str(chunk?.as_str());
}
log::info!("Code summarization took {:?}", start.elapsed());
Ok(answer)
})
}
@@ -1208,6 +1217,16 @@ struct EmbeddedChunk {
embedding: Embedding,
}
/// Fixed-capacity, inline string type used to hold a hex-encoded BLAKE3 digest.
///
/// The capacity is `blake3::OUT_LEN * 2` (presumably two hex characters per digest byte).
/// This is what blake3's to_hex() method returns - see https://docs.rs/blake3/1.5.3/src/blake3/lib.rs.html#246
type Blake3Digest = ArrayString<{ blake3::OUT_LEN * 2 }>;
/// Value type of the file digest database, which is keyed by file path and records the
/// BLAKE3 digest of that file's contents.
///
/// Values are stored through `SerdeBincode`, so the serde derives here define the stored
/// representation.
/// NOTE(review): field order and types are presumably part of the on-disk format - confirm
/// before reordering or changing fields.
#[derive(Debug, Serialize, Deserialize)]
struct FileDigest {
/// Path of the file this digest was computed for.
/// NOTE(review): whether this is absolute or worktree-relative is not visible here - confirm.
path: Arc<Path>,
/// Modification time associated with the file.
/// NOTE(review): presumably `None` when no mtime is available, and presumably used to skip
/// re-hashing unchanged files - verify against the code that writes this.
mtime: Option<SystemTime>,
/// BLAKE3 digest of the file's contents, as a hex string (see `Blake3Digest`).
digest: Blake3Digest,
}
struct SummarizeFiles {
files: channel::Receiver<SummarizedFile>,
task: Task<Result<()>>,