From nobody Sun Feb 8 08:22:28 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 42DFE28AAEB for ; Sun, 11 Jan 2026 21:29:20 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1768166960; cv=none; b=mjaZ7h7q59f/cYFE2i549iAZLwz0weyVoOiZFvAgPtaGgBgTe0SfzlkP2HaQBwxg7mTBegxUaPXQrbLtwE0RPUrxug+AbOueQ02hwggw+cAyQRV3n+tpl//MlnRmHcLu9tCnGlqrb9omqjFqSnP1FtBKfscQhbtGB+5SGAmF3w4= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1768166960; c=relaxed/simple; bh=rB9ge6/sTvQ0rqdXrCN2IZzyFDW2qz+HFsSCmyubcyY=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=iV0vSPqudY5zwqc/IZiegYFNsqMinHu3vvsbb0nXBj9tNt/O7XIO11fmyUfr8wtc1TNdmpKoq575OrZTp7tIEV35SXm8VyjdmWeUE4Jk46SN2xY8Pf6oyBqLwR+NG/w0riEu/M/7slPbe7jv1dbdqV8eKmRsygdpWiTHrbpufWc= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=riE0PQip; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="riE0PQip" Received: by smtp.kernel.org (Postfix) with ESMTPSA id 590CDC4CEF7; Sun, 11 Jan 2026 21:29:19 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1768166959; bh=rB9ge6/sTvQ0rqdXrCN2IZzyFDW2qz+HFsSCmyubcyY=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=riE0PQipZiuNr8AlQf1TUzhdp4I3+eRp7RDtAKQ4SYyUSKiZtw3zcT/WJ2oUHbYTG 3Xw2BZwT445HPXsW5eNLutXFaPmTYWBDcc6Mxk8wv1Q+PaonnBpMjv4OorrxWsdbWl rU5dZx3E85/vFlbNKcUsY+UvQogaqR3WfB5zK/o7AT6ER4T3Q2QdswuD+/BzC+E2m5 
oVCv1CfKXy2kWGVuM/b44DlD46CewMEH6BEdS29hK2BtYqKwdhNOPtyqwGCf/BLQkq kugruM7zI+TYibidw82w+qw9eLE/BAUQHUhpVvyVO38pjauhQfk2+j1HpP8Um2Coo9 meqjNs9duguPg== From: Sasha Levin To: tools@kernel.org Cc: linux-kernel@vger.kernel.org, torvalds@linux-foundation.org, broonie@kernel.org, Sasha Levin Subject: [RFC v2 2/7] LLMinus: Add vectorize command with fastembed Date: Sun, 11 Jan 2026 16:29:10 -0500 Message-ID: <20260111212915.195056-3-sashal@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20260111212915.195056-1-sashal@kernel.org> References: <20251219181629.1123823-1-sashal@kernel.org> <20260111212915.195056-1-sashal@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Add the vectorize command that generates embeddings for stored conflict resolutions using the BGE-small-en-v1.5 model via fastembed. The model produces 384-dimensional vectors. Processing is batched with incremental saves after each batch for crash recovery. Resolutions with existing embeddings are skipped. This enables RAG-based similarity search for finding historical conflict resolutions similar to current merge conflicts. Also adds cosine_similarity() and init_embedding_model() helpers with corresponding tests. 
Signed-off-by: Sasha Levin --- tools/llminus/Cargo.toml | 1 + tools/llminus/src/main.rs | 157 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/tools/llminus/Cargo.toml b/tools/llminus/Cargo.toml index bdb42561a056..86740174de59 100644 --- a/tools/llminus/Cargo.toml +++ b/tools/llminus/Cargo.toml @@ -10,6 +10,7 @@ repository =3D "https://git.kernel.org/pub/scm/linux/kern= el/git/torvalds/linux.git [dependencies] anyhow =3D "1" clap =3D { version =3D "4", features =3D ["derive"] } +fastembed =3D "5" rayon =3D "1" serde =3D { version =3D "1", features =3D ["derive"] } serde_json =3D "1" diff --git a/tools/llminus/src/main.rs b/tools/llminus/src/main.rs index 508bdc085173..b97505d0cd99 100644 --- a/tools/llminus/src/main.rs +++ b/tools/llminus/src/main.rs @@ -2,6 +2,7 @@ =20 use anyhow::{bail, Context, Result}; use clap::{Parser, Subcommand}; +use fastembed::{EmbeddingModel, InitOptions, TextEmbedding}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use std::collections::HashSet; @@ -28,6 +29,12 @@ enum Commands { /// Git revision range (e.g., "v6.0..v6.1"). If not specified, lea= rns from entire history. 
range: Option<String>, }, + /// Generate embeddings for stored resolutions (for RAG similarity sea= rch) + Vectorize { + /// Batch size for embedding generation (default: 64) + #[arg(short, long, default_value =3D "64")] + batch_size: usize, + }, } =20 /// A single diff hunk representing a change region @@ -588,11 +595,118 @@ fn learn(range: Option<&str>) -> Result<()> { Ok(()) } =20 +/// Compute cosine similarity between two vectors +fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 { + if a.len() !=3D b.len() || a.is_empty() { + return 0.0; + } + + let dot: f32 =3D a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let norm_a: f32 =3D a.iter().map(|x| x * x).sum::<f32>().sqrt(); + let norm_b: f32 =3D b.iter().map(|x| x * x).sum::<f32>().sqrt(); + + if norm_a =3D=3D 0.0 || norm_b =3D=3D 0.0 { + return 0.0; + } + + dot / (norm_a * norm_b) +} + +/// Initialize the BGE-small embedding model +fn init_embedding_model() -> Result<TextEmbedding> { + TextEmbedding::try_new( + InitOptions::new(EmbeddingModel::BGESmallENV15) + .with_show_download_progress(true), + ).context("Failed to initialize embedding model") +} + +fn vectorize(batch_size: usize) -> Result<()> { + let store_path =3D Path::new(STORE_PATH); + + if !store_path.exists() { + bail!("No resolutions found. Run 'llminus learn' first."); + } + + let mut store =3D ResolutionStore::load(store_path)?; + + // Count how many need embeddings + let need_embedding: Vec<usize> =3D store + .resolutions + .iter() + .enumerate() + .filter(|(_, r)| r.embedding.is_none()) + .map(|(i, _)| i) + .collect(); + + if need_embedding.is_empty() { + println!("All {} resolutions already have embeddings.", store.reso= lutions.len()); + return Ok(()); + } + + println!("Found {} resolutions needing embeddings", need_embedding.len= ()); + println!("Initializing embedding model (BGE-small-en, ~33MB download o= n first run)..."); + + // Initialize the embedding model + let mut model =3D init_embedding_model()?; + + println!("Model loaded. 
Generating embeddings...\n"); + + // Process in batches + let total_batches =3D need_embedding.len().div_ceil(batch_size); + + for (batch_num, chunk) in need_embedding.chunks(batch_size).enumerate(= ) { + // Collect texts for this batch + let texts: Vec<String> =3D chunk + .iter() + .map(|&i| store.resolutions[i].to_embedding_text()) + .collect(); + + // Generate embeddings + let embeddings =3D model + .embed(texts, None) + .context("Failed to generate embeddings")?; + + // Assign embeddings back to resolutions + for (j, &idx) in chunk.iter().enumerate() { + store.resolutions[idx].embedding =3D Some(embeddings[j].clone(= )); + } + + // Progress report: completed batches plus this chunk's length + let done =3D batch_num * batch_size + chunk.len(); + let pct =3D (done as f64 / need_embedding.len() as f64 * 100.0).mi= n(100.0); + println!( + " Batch {}/{}: {:.1}% ({}/{})", + batch_num + 1, + total_batches, + pct, + done.min(need_embedding.len()), + need_embedding.len() + ); + + // Save after each batch (incremental progress) + store.save(store_path)?; + } + + // Final stats + let json_size =3D std::fs::metadata(store_path).map(|m| m.len()).unwra= p_or(0); + let with_embeddings =3D store.resolutions.iter().filter(|r| r.embeddin= g.is_some()).count(); + + println!("\nResults:"); + println!(" Total resolutions: {}", store.resolutions.len()); + println!(" With embeddings: {}", with_embeddings); + println!(" Embedding dimensions: 384"); + println!(" Output size: {:.2} MB", json_size as f64 / 1024.0 / 1024.0= ); + println!("\nEmbeddings saved to: {}", store_path.display()); + + Ok(()) +} + fn main() -> Result<()> { let cli =3D Cli::parse(); =20 match cli.command { Commands::Learn { range } =3D> learn(range.as_deref()), + Commands::Vectorize { batch_size } =3D> vectorize(batch_size), } } =20 @@ -613,6 +727,7 @@ fn test_learn_command_parses() { let cli =3D Cli::try_parse_from(["llminus", "learn"]).unwrap(); match cli.command { Commands::Learn { range } =3D> assert!(range.is_none()), + _ =3D> panic!("Expected Learn 
command"), } } =20 @@ -621,9 +736,51 @@ fn test_learn_command_with_range() { let cli =3D Cli::try_parse_from(["llminus", "learn", "v6.0..v6.1"]= ).unwrap(); match cli.command { Commands::Learn { range } =3D> assert_eq!(range, Some("v6.0..v= 6.1".to_string())), + _ =3D> panic!("Expected Learn command"), } } =20 + #[test] + fn test_vectorize_command_parses() { + let cli =3D Cli::try_parse_from(["llminus", "vectorize"]).unwrap(); + match cli.command { + Commands::Vectorize { batch_size } =3D> assert_eq!(batch_size,= 64), + _ =3D> panic!("Expected Vectorize command"), + } + } + + #[test] + fn test_vectorize_command_with_batch_size() { + let cli =3D Cli::try_parse_from(["llminus", "vectorize", "-b", "12= 8"]).unwrap(); + match cli.command { + Commands::Vectorize { batch_size } =3D> assert_eq!(batch_size,= 128), + _ =3D> panic!("Expected Vectorize command"), + } + } + + #[test] + fn test_cosine_similarity() { + // Identical vectors should have similarity 1.0 + let a =3D vec![1.0, 0.0, 0.0]; + let b =3D vec![1.0, 0.0, 0.0]; + assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.0001); + + // Orthogonal vectors should have similarity 0.0 + let a =3D vec![1.0, 0.0, 0.0]; + let b =3D vec![0.0, 1.0, 0.0]; + assert!((cosine_similarity(&a, &b) - 0.0).abs() < 0.0001); + + // Opposite vectors should have similarity -1.0 + let a =3D vec![1.0, 0.0, 0.0]; + let b =3D vec![-1.0, 0.0, 0.0]; + assert!((cosine_similarity(&a, &b) - (-1.0)).abs() < 0.0001); + + // Different length vectors return 0 + let a =3D vec![1.0, 0.0]; + let b =3D vec![1.0, 0.0, 0.0]; + assert_eq!(cosine_similarity(&a, &b), 0.0); + } + #[test] fn test_get_file_type() { assert_eq!(get_file_type("foo/bar.c"), "c"); --=20 2.51.0