About to split out embedding index
This commit is contained in:
4
Cargo.lock
generated
4
Cargo.lock
generated
@@ -306,6 +306,9 @@ name = "arrayvec"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711"
|
||||
dependencies = [
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "as-raw-xcb-connection"
|
||||
@@ -9587,6 +9590,7 @@ name = "semantic_index"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"arrayvec",
|
||||
"blake3",
|
||||
"client",
|
||||
"clock",
|
||||
|
||||
@@ -284,6 +284,7 @@ zed_actions = { path = "crates/zed_actions" }
|
||||
alacritty_terminal = "0.23"
|
||||
any_vec = "0.13"
|
||||
anyhow = "1.0.57"
|
||||
arrayvec = { version = "0.7.4", features = ["serde"] }
|
||||
ashpd = "0.9.1"
|
||||
async-compression = { version = "0.4", features = ["gzip", "futures-io"] }
|
||||
async-dispatcher = { version = "0.1" }
|
||||
|
||||
@@ -19,6 +19,7 @@ crate-type = ["bin"]
|
||||
|
||||
[dependencies]
|
||||
anyhow.workspace = true
|
||||
arrayvec.workspace = true
|
||||
blake3.workspace = true
|
||||
client.workspace = true
|
||||
clock.workspace = true
|
||||
|
||||
0
crates/semantic_index/src/index_embeddings.rs
Normal file
0
crates/semantic_index/src/index_embeddings.rs
Normal file
0
crates/semantic_index/src/index_summaries.rs
Normal file
0
crates/semantic_index/src/index_summaries.rs
Normal file
@@ -3,6 +3,7 @@ mod embedding;
|
||||
mod project_index_debug_view;
|
||||
|
||||
use anyhow::{anyhow, Context as _, Result};
|
||||
use arrayvec::ArrayString;
|
||||
use chunking::{chunk_text, Chunk};
|
||||
use collections::{Bound, HashMap, HashSet};
|
||||
use completion::CompletionProvider;
|
||||
@@ -495,7 +496,8 @@ struct WorktreeIndex {
|
||||
worktree: Model<Worktree>,
|
||||
db_connection: heed::Env,
|
||||
embedding_db: heed::Database<Str, SerdeBincode<EmbeddedFile>>,
|
||||
summary_db: heed::Database<Str, Str>, // Key: BLAKE3 hash of source code. Val: LLM summary of that code.
|
||||
file_digest_db: heed::Database<Str, SerdeBincode<FileDigest>>, // Key: file path. Val: BLAKE3 digest of its contents.
|
||||
summary_db: heed::Database<Str, Str>, // Key: BLAKE3 digest of a file's contents. Val: LLM summary of those contents.
|
||||
language_registry: Arc<LanguageRegistry>,
|
||||
fs: Arc<dyn Fs>,
|
||||
embedding_provider: Arc<dyn EmbeddingProvider>,
|
||||
@@ -516,7 +518,7 @@ impl WorktreeIndex {
|
||||
) -> Task<Result<Model<Self>>> {
|
||||
let worktree_abs_path = worktree.read(cx).abs_path();
|
||||
cx.spawn(|mut cx| async move {
|
||||
let (db, summary_db) = cx
|
||||
let (db, file_digest_db, summary_db) = cx
|
||||
.background_executor()
|
||||
.spawn({
|
||||
let db_connection = db_connection.clone();
|
||||
@@ -526,6 +528,14 @@ impl WorktreeIndex {
|
||||
let db_name = worktree_abs_path.to_string_lossy();
|
||||
db_connection.create_database(&mut txn, Some(&db_name))?
|
||||
};
|
||||
let file_digest_db = {
|
||||
let db_name =
|
||||
// Prepend something that wouldn't be found at the beginning of an
|
||||
// absolute path, so we don't get db key namespace conflicts with
|
||||
// embeddings, which use the abs path as a key.
|
||||
format!("digests-{}", worktree_abs_path.to_string_lossy());
|
||||
db_connection.create_database(&mut txn, Some(&db_name))?
|
||||
};
|
||||
let summary_db = {
|
||||
let db_name =
|
||||
// Prepend something that wouldn't be found at the beginning of an
|
||||
@@ -535,15 +545,17 @@ impl WorktreeIndex {
|
||||
db_connection.create_database(&mut txn, Some(&db_name))?
|
||||
};
|
||||
txn.commit()?;
|
||||
anyhow::Ok((db, summary_db))
|
||||
anyhow::Ok((db, file_digest_db, summary_db))
|
||||
}
|
||||
})
|
||||
.await?;
|
||||
|
||||
cx.new_model(|cx| {
|
||||
Self::new(
|
||||
worktree,
|
||||
db_connection,
|
||||
db,
|
||||
file_digest_db,
|
||||
summary_db,
|
||||
status_tx,
|
||||
language_registry,
|
||||
@@ -560,6 +572,7 @@ impl WorktreeIndex {
|
||||
worktree: Model<Worktree>,
|
||||
db_connection: heed::Env,
|
||||
embedding_db: heed::Database<Str, SerdeBincode<EmbeddedFile>>,
|
||||
file_digest_db: heed::Database<Str, SerdeBincode<FileDigest>>,
|
||||
summary_db: heed::Database<Str, Str>,
|
||||
status: channel::Sender<()>,
|
||||
language_registry: Arc<LanguageRegistry>,
|
||||
@@ -582,6 +595,7 @@ impl WorktreeIndex {
|
||||
db_connection,
|
||||
embedding_db,
|
||||
summary_db,
|
||||
file_digest_db,
|
||||
worktree,
|
||||
language_registry,
|
||||
fs,
|
||||
@@ -664,15 +678,9 @@ impl WorktreeIndex {
|
||||
}
|
||||
|
||||
fn summarize_code(code: &str, cx: &AppContext) -> impl Future<Output = Result<String>> {
|
||||
let start = std::time::Instant::now();
|
||||
let provider = CompletionProvider::global(cx);
|
||||
let model = if provider
|
||||
.available_models()
|
||||
.contains(&PREFERRED_SUMMARIZATION_MODEL)
|
||||
{
|
||||
PREFERRED_SUMMARIZATION_MODEL
|
||||
} else {
|
||||
provider.model()
|
||||
};
|
||||
let model = PREFERRED_SUMMARIZATION_MODEL;
|
||||
const PROMPT_BEFORE_CODE: &str = "Summarize this code in 3 sentences, using no newlines or bullet points in the summary:";
|
||||
let prompt = format!("{PROMPT_BEFORE_CODE}\n{code}");
|
||||
|
||||
@@ -702,6 +710,7 @@ impl WorktreeIndex {
|
||||
answer.push_str(chunk?.as_str());
|
||||
}
|
||||
|
||||
log::info!("Code summarization took {:?}", start.elapsed());
|
||||
Ok(answer)
|
||||
})
|
||||
}
|
||||
@@ -1208,6 +1217,16 @@ struct EmbeddedChunk {
|
||||
embedding: Embedding,
|
||||
}
|
||||
|
||||
/// This is what blake3's to_hex() method returns - see https://docs.rs/blake3/1.5.3/src/blake3/lib.rs.html#246
|
||||
type Blake3Digest = ArrayString<{ blake3::OUT_LEN * 2 }>;
|
||||
|
||||
#[derive(Debug, Serialize, Deserialize)]
|
||||
struct FileDigest {
|
||||
path: Arc<Path>,
|
||||
mtime: Option<SystemTime>,
|
||||
digest: Blake3Digest,
|
||||
}
|
||||
|
||||
struct SummarizeFiles {
|
||||
files: channel::Receiver<SummarizedFile>,
|
||||
task: Task<Result<()>>,
|
||||
|
||||
Reference in New Issue
Block a user