From nobody Tue Feb 10 22:15:19 2026 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id D75E336D4FD for ; Fri, 19 Dec 2025 18:16:38 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1766168198; cv=none; b=L5doluooj0SkwcP+nb8tlS/Bi6M8NeWhSb99FhjKqBAq3AIRqZlP8ocmI/ePigiLi1qFOfsatMB1N5stkQWoVa8FHOmWBDTYvIZXh1W9HseeV4z/+MWtnFoYOY+TSi93Knba1HMqPPFHYNeXnTf6gQruSW9c410ZFubXe11MUt0= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1766168198; c=relaxed/simple; bh=hcNhg0u25/EPduVe+yXKj4bUmFVkwgt0UcSPBFb2SOI=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=e/XaUFI1QqpWhJD5I/Iqhptz3pGzJhOZwLjktJA/lkXQcJ+fdgCVuvuTuCscJ63jHeV62ZhBLkcTxedmYiqURqs/IQRHNxUbNb2Amfh2rPs2p9Zgu1ogfz9AKXO2h9VH0iitMW0ons7wK/+wOvdf/bFLtSsTaP+qPKXWnm1u6PA= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b=LSyt1bJt; arc=none smtp.client-ip=10.30.226.201 Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=kernel.org header.i=@kernel.org header.b="LSyt1bJt" Received: by smtp.kernel.org (Postfix) with ESMTPSA id EB518C19421; Fri, 19 Dec 2025 18:16:37 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=kernel.org; s=k20201202; t=1766168198; bh=hcNhg0u25/EPduVe+yXKj4bUmFVkwgt0UcSPBFb2SOI=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=LSyt1bJtJK3ewDK4J55Rbc9ekK2Um4MZngwPsbkA18qPR3v+EVBtrSk1/wTDBd6ro OeIPte5PHTviWGBqiMycaat/Gr0vYLM6sUgEvgS/nRN4VJuKaDJjmjtFktGI50VRT5 CVzoH+D9RlhhA2lbvXTssQnnzEC3ZweH7gHqyipnzbOjrQegpaZQQCWJJqW9e6MELE 
1oLp6xuaDPSTgW3jl1Dn+88qv84cYGCC0yIaUvEcnRtrM/aBfEgXMgiJ0cjILXZclx 69IOEqco/BBlhLeKWk5ufBqcKbaeV6irro4IzUuh69BWihJQIIJeTizujpO38cSH56 hpzH01B6dwU0g== From: Sasha Levin To: tools@kernel.org Cc: linux-kernel@vger.kernel.org, torvalds@linux-foundation.org, broonie@kernel.org, Sasha Levin Subject: [RFC 2/5] LLMinus: Add vectorize command with fastembed Date: Fri, 19 Dec 2025 13:16:26 -0500 Message-ID: <20251219181629.1123823-3-sashal@kernel.org> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20251219181629.1123823-1-sashal@kernel.org> References: <20251219181629.1123823-1-sashal@kernel.org> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Add the 'vectorize' command that generates embeddings for stored conflict resolutions using the BGE-small-en-v1.5 model via fastembed. Key features: - Uses fastembed v5 for local embedding generation - BGE-small model produces 384-dimensional vectors - Batch processing with configurable batch size (-b flag) - Incremental saves after each batch for crash recovery - Skips resolutions that already have embeddings - Progress reporting during vectorization This enables RAG-based similarity search for finding historical conflict resolutions that are similar to current merge conflicts. 
Also adds: - cosine_similarity() function for vector comparison - init_embedding_model() helper for model initialization - Tests for vectorize command parsing and cosine_similarity Signed-off-by: Sasha Levin <sashal@kernel.org> --- tools/llminus/Cargo.toml | 1 + tools/llminus/src/main.rs | 157 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 158 insertions(+) diff --git a/tools/llminus/Cargo.toml b/tools/llminus/Cargo.toml index bdb42561a0565..86740174de598 100644 --- a/tools/llminus/Cargo.toml +++ b/tools/llminus/Cargo.toml @@ -10,6 +10,7 @@ repository =3D "https://git.kernel.org/pub/scm/linux/kern= el/git/torvalds/linux.git [dependencies] anyhow =3D "1" clap =3D { version =3D "4", features =3D ["derive"] } +fastembed =3D "5" rayon =3D "1" serde =3D { version =3D "1", features =3D ["derive"] } serde_json =3D "1" diff --git a/tools/llminus/src/main.rs b/tools/llminus/src/main.rs index 1c61836cc93f7..32a578030b0e3 100644 --- a/tools/llminus/src/main.rs +++ b/tools/llminus/src/main.rs @@ -2,6 +2,7 @@ =20 use anyhow::{bail, Context, Result}; use clap::{Parser, Subcommand}; +use fastembed::{EmbeddingModel, InitOptions, TextEmbedding}; use rayon::prelude::*; use serde::{Deserialize, Serialize}; use std::collections::HashSet; @@ -26,6 +27,12 @@ enum Commands { /// Git revision range (e.g., "v6.0..v6.1"). If not specified, lea= rns from entire history. 
range: Option, }, + /// Generate embeddings for stored resolutions (for RAG similarity sea= rch) + Vectorize { + /// Batch size for embedding generation (default: 64) + #[arg(short, long, default_value =3D "64")] + batch_size: usize, + }, } =20 /// A single diff hunk representing a change region @@ -483,11 +490,118 @@ fn learn(range: Option<&str>) -> Result<()> { Ok(()) } =20 +/// Compute cosine similarity between two vectors +fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 { + if a.len() !=3D b.len() || a.is_empty() { + return 0.0; + } + + let dot: f32 =3D a.iter().zip(b.iter()).map(|(x, y)| x * y).sum(); + let norm_a: f32 =3D a.iter().map(|x| x * x).sum::().sqrt(); + let norm_b: f32 =3D b.iter().map(|x| x * x).sum::().sqrt(); + + if norm_a =3D=3D 0.0 || norm_b =3D=3D 0.0 { + return 0.0; + } + + dot / (norm_a * norm_b) +} + +/// Initialize the BGE-small embedding model +fn init_embedding_model() -> Result { + TextEmbedding::try_new( + InitOptions::new(EmbeddingModel::BGESmallENV15) + .with_show_download_progress(true), + ).context("Failed to initialize embedding model") +} + +fn vectorize(batch_size: usize) -> Result<()> { + let store_path =3D Path::new(STORE_PATH); + + if !store_path.exists() { + bail!("No resolutions found. Run 'llminus learn' first."); + } + + let mut store =3D ResolutionStore::load(store_path)?; + + // Count how many need embeddings + let need_embedding: Vec =3D store + .resolutions + .iter() + .enumerate() + .filter(|(_, r)| r.embedding.is_none()) + .map(|(i, _)| i) + .collect(); + + if need_embedding.is_empty() { + println!("All {} resolutions already have embeddings.", store.reso= lutions.len()); + return Ok(()); + } + + println!("Found {} resolutions needing embeddings", need_embedding.len= ()); + println!("Initializing embedding model (BGE-small-en, ~33MB download o= n first run)..."); + + // Initialize the embedding model + let mut model =3D init_embedding_model()?; + + println!("Model loaded. 
Generating embeddings...\n"); + + // Process in batches + let total_batches =3D (need_embedding.len() + batch_size - 1) / batch_= size; + + for (batch_num, chunk) in need_embedding.chunks(batch_size).enumerate(= ) { + // Collect texts for this batch + let texts: Vec =3D chunk + .iter() + .map(|&i| store.resolutions[i].to_embedding_text()) + .collect(); + + // Generate embeddings + let embeddings =3D model + .embed(texts, None) + .context("Failed to generate embeddings")?; + + // Assign embeddings back to resolutions + for (j, &idx) in chunk.iter().enumerate() { + store.resolutions[idx].embedding =3D Some(embeddings[j].clone(= )); + } + + // Progress report + let done =3D (batch_num + 1) * batch_size.min(chunk.len()); + let pct =3D (done as f64 / need_embedding.len() as f64 * 100.0).mi= n(100.0); + println!( + " Batch {}/{}: {:.1}% ({}/{})", + batch_num + 1, + total_batches, + pct, + done.min(need_embedding.len()), + need_embedding.len() + ); + + // Save after each batch (incremental progress) + store.save(store_path)?; + } + + // Final stats + let json_size =3D std::fs::metadata(store_path).map(|m| m.len()).unwra= p_or(0); + let with_embeddings =3D store.resolutions.iter().filter(|r| r.embeddin= g.is_some()).count(); + + println!("\nResults:"); + println!(" Total resolutions: {}", store.resolutions.len()); + println!(" With embeddings: {}", with_embeddings); + println!(" Embedding dimensions: 384"); + println!(" Output size: {:.2} MB", json_size as f64 / 1024.0 / 1024.0= ); + println!("\nEmbeddings saved to: {}", store_path.display()); + + Ok(()) +} + fn main() -> Result<()> { let cli =3D Cli::parse(); =20 match cli.command { Commands::Learn { range } =3D> learn(range.as_deref()), + Commands::Vectorize { batch_size } =3D> vectorize(batch_size), } } =20 @@ -508,6 +622,7 @@ fn test_learn_command_parses() { let cli =3D Cli::try_parse_from(["llminus", "learn"]).unwrap(); match cli.command { Commands::Learn { range } =3D> assert!(range.is_none()), + _ =3D> 
panic!("Expected Learn command"), } } =20 @@ -516,9 +631,51 @@ fn test_learn_command_with_range() { let cli =3D Cli::try_parse_from(["llminus", "learn", "v6.0..v6.1"]= ).unwrap(); match cli.command { Commands::Learn { range } =3D> assert_eq!(range, Some("v6.0..v= 6.1".to_string())), + _ =3D> panic!("Expected Learn command"), } } =20 + #[test] + fn test_vectorize_command_parses() { + let cli =3D Cli::try_parse_from(["llminus", "vectorize"]).unwrap(); + match cli.command { + Commands::Vectorize { batch_size } =3D> assert_eq!(batch_size,= 64), + _ =3D> panic!("Expected Vectorize command"), + } + } + + #[test] + fn test_vectorize_command_with_batch_size() { + let cli =3D Cli::try_parse_from(["llminus", "vectorize", "-b", "12= 8"]).unwrap(); + match cli.command { + Commands::Vectorize { batch_size } =3D> assert_eq!(batch_size,= 128), + _ =3D> panic!("Expected Vectorize command"), + } + } + + #[test] + fn test_cosine_similarity() { + // Identical vectors should have similarity 1.0 + let a =3D vec![1.0, 0.0, 0.0]; + let b =3D vec![1.0, 0.0, 0.0]; + assert!((cosine_similarity(&a, &b) - 1.0).abs() < 0.0001); + + // Orthogonal vectors should have similarity 0.0 + let a =3D vec![1.0, 0.0, 0.0]; + let b =3D vec![0.0, 1.0, 0.0]; + assert!((cosine_similarity(&a, &b) - 0.0).abs() < 0.0001); + + // Opposite vectors should have similarity -1.0 + let a =3D vec![1.0, 0.0, 0.0]; + let b =3D vec![-1.0, 0.0, 0.0]; + assert!((cosine_similarity(&a, &b) - (-1.0)).abs() < 0.0001); + + // Different length vectors return 0 + let a =3D vec![1.0, 0.0]; + let b =3D vec![1.0, 0.0, 0.0]; + assert_eq!(cosine_similarity(&a, &b), 0.0); + } + #[test] fn test_get_file_type() { assert_eq!(get_file_type("foo/bar.c"), "c"); --=20 2.51.0