WIP 5 add image embedding provider (local only for now)
This commit is contained in:
		| @@ -1325,6 +1325,7 @@ impl Cmd { | ||||
|                 let p_lc = provider.to_lowercase(); | ||||
|                 let prov = match p_lc.as_str() { | ||||
|                     "test-hash" | "testhash" => EmbeddingProvider::TestHash, | ||||
|                     "testimagehash" | "image-test-hash" | "imagetesthash" => EmbeddingProvider::ImageTestHash, | ||||
|                     "fastembed" | "lancefastembed" => EmbeddingProvider::LanceFastEmbed, | ||||
|                     "openai" | "lanceopenai" => EmbeddingProvider::LanceOpenAI, | ||||
|                     other => EmbeddingProvider::LanceOther(other.to_string()), | ||||
| @@ -1346,6 +1347,7 @@ impl Cmd { | ||||
|                         arr.push(Protocol::BulkString("provider".to_string())); | ||||
|                         arr.push(Protocol::BulkString(match cfg.provider { | ||||
|                             EmbeddingProvider::TestHash => "test-hash".to_string(), | ||||
|                             EmbeddingProvider::ImageTestHash => "testimagehash".to_string(), | ||||
|                             EmbeddingProvider::LanceFastEmbed => "lancefastembed".to_string(), | ||||
|                             EmbeddingProvider::LanceOpenAI => "lanceopenai".to_string(), | ||||
|                             EmbeddingProvider::LanceOther(ref s) => s.clone(), | ||||
|   | ||||
| @@ -30,8 +30,10 @@ use serde_json::json; | ||||
| #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] | ||||
| #[serde(rename_all = "snake_case")] | ||||
| pub enum EmbeddingProvider { | ||||
|     // Deterministic, local-only embedder for CI and offline development. | ||||
|     // Deterministic, local-only embedder for CI and offline development (text). | ||||
|     TestHash, | ||||
|     // Deterministic, local-only embedder for CI and offline development (image). | ||||
|     ImageTestHash, | ||||
|     // Placeholders for LanceDB-supported providers; implementers can add concrete backends later. | ||||
|     LanceFastEmbed, | ||||
|     LanceOpenAI, | ||||
| @@ -71,6 +73,8 @@ pub trait Embedder: Send + Sync { | ||||
|     } | ||||
| } | ||||
|  | ||||
| //// ----------------------------- TEXT: deterministic test embedder ----------------------------- | ||||
|  | ||||
| /// Deterministic, no-deps, no-network embedder for CI and offline dev. | ||||
| /// Algorithm: | ||||
| /// - Fold bytes of UTF-8 into 'dim' buckets with a simple rolling hash | ||||
| @@ -127,6 +131,77 @@ impl Embedder for TestHashEmbedder { | ||||
|     } | ||||
| } | ||||
|  | ||||
| //// ----------------------------- IMAGE: trait + deterministic test embedder ----------------------------- | ||||
|  | ||||
| /// Image embedding interface (separate from text to keep modality-specific inputs). | ||||
| pub trait ImageEmbedder: Send + Sync { | ||||
|     /// Human-readable provider/model name | ||||
|     fn name(&self) -> String; | ||||
|     /// Embedding dimension | ||||
|     fn dim(&self) -> usize; | ||||
|     /// Embed a single image (raw bytes) | ||||
|     fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError>; | ||||
|     /// Embed many images; default maps embed_image() over inputs | ||||
|     fn embed_many_images(&self, images: &[Vec<u8>]) -> Result<Vec<Vec<f32>>, DBError> { | ||||
|         images.iter().map(|b| self.embed_image(b)).collect() | ||||
|     } | ||||
| } | ||||
|  | ||||
/// Deterministic image embedder for CI and offline development.
///
/// It folds the input bytes into buckets, squashes each bucket with `tanh`,
/// and L2-normalizes the result.
/// NOTE: the output is NOT semantic; it is a stable, hash-like representation.
pub struct TestImageHashEmbedder {
    /// Length of every embedding vector produced.
    dim: usize,
    /// Free-form model label, reported as part of the embedder's name.
    model_name: String,
}

impl TestImageHashEmbedder {
    /// Builds an embedder that produces `dim`-length vectors and is labelled
    /// with `model_name`.
    pub fn new(dim: usize, model_name: impl Into<String>) -> Self {
        let model_name = model_name.into();
        Self { dim, model_name }
    }

    /// Scales `v` to unit Euclidean length.
    ///
    /// The vector is returned unchanged when its norm is not strictly
    /// positive (all zeros, empty, or NaN), which avoids dividing by zero.
    fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> {
        let sum_of_squares = v.iter().fold(0.0f32, |total, &c| total + c * c);
        let norm = sum_of_squares.sqrt();
        if norm > 0.0 {
            v.iter_mut().for_each(|c| *c /= norm);
        }
        v
    }
}
|  | ||||
| impl ImageEmbedder for TestImageHashEmbedder { | ||||
|     fn name(&self) -> String { | ||||
|         format!("test-image-hash:{}", self.model_name) | ||||
|     } | ||||
|  | ||||
|     fn dim(&self) -> usize { | ||||
|         self.dim | ||||
|     } | ||||
|  | ||||
|     fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError> { | ||||
|         // Deterministic fold across bytes with two rolling accumulators. | ||||
|         let mut acc = vec![0f32; self.dim]; | ||||
|         let mut h1: u32 = 0x811C9DC5; // FNV-like | ||||
|         let mut h2: u32 = 0x9E3779B9; // golden ratio | ||||
|         for (i, b) in bytes.iter().enumerate() { | ||||
|             h1 ^= *b as u32; | ||||
|             h1 = h1.wrapping_mul(16777619u32); | ||||
|             // combine with position and h2 | ||||
|             h2 = h2.wrapping_add(((i as u32).rotate_left((i % 13) as u32)) ^ h1.rotate_left((i % 7) as u32)); | ||||
|             let idx = (h1 ^ h2) as usize % self.dim; | ||||
|             // Map to [-1,1] and decay with position | ||||
|             let val = ((*b as f32) / 127.5 - 1.0) * (1.0 / (1.0 + (i as f32 / 128.0))); | ||||
|             acc[idx] += val; | ||||
|         } | ||||
|         for x in &mut acc { | ||||
|             *x = x.tanh(); | ||||
|         } | ||||
|         Ok(Self::l2_normalize(acc)) | ||||
|     } | ||||
| } | ||||
|  | ||||
| //// OpenAI embedder (supports OpenAI and Azure OpenAI via REST) | ||||
| struct OpenAIEmbedder { | ||||
|     model: String, | ||||
| @@ -320,7 +395,25 @@ pub fn create_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn Embedder>, DB | ||||
|             let inner = OpenAIEmbedder::new_from_config(config)?; | ||||
|             Ok(Arc::new(inner)) | ||||
|         } | ||||
|         EmbeddingProvider::ImageTestHash => { | ||||
|             Err(DBError("Use create_image_embedder() for image providers".into())) | ||||
|         } | ||||
|         EmbeddingProvider::LanceFastEmbed => Err(DBError("LanceFastEmbed provider not yet implemented in Rust embedding layer; configure 'test-hash' or use 'openai'".into())), | ||||
|         EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Lance provider '{}' not implemented; configure 'openai' or 'test-hash'", p))), | ||||
|     } | ||||
| } | ||||
|  | ||||
| /// Create an image embedder instance from a config. | ||||
| pub fn create_image_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn ImageEmbedder>, DBError> { | ||||
|     match &config.provider { | ||||
|         EmbeddingProvider::ImageTestHash => { | ||||
|             let dim = config.get_param_usize("dim").unwrap_or(512); | ||||
|             Ok(Arc::new(TestImageHashEmbedder::new(dim, config.model.clone()))) | ||||
|         } | ||||
|         EmbeddingProvider::TestHash | EmbeddingProvider::LanceOpenAI => { | ||||
|             Err(DBError("Configured text provider; dataset expects image provider (e.g., 'testimagehash')".into())) | ||||
|         } | ||||
|         EmbeddingProvider::LanceFastEmbed => Err(DBError("Image provider 'lancefastembed' not yet implemented".into())), | ||||
|         EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Image provider '{}' not implemented; use 'testimagehash' for now", p))), | ||||
|     } | ||||
| } | ||||
| @@ -996,6 +996,7 @@ impl RpcServer for RpcServerImpl { | ||||
|         } | ||||
|         let prov = match provider.to_lowercase().as_str() { | ||||
|             "test-hash" | "testhash" => EmbeddingProvider::TestHash, | ||||
|             "testimagehash" | "image-test-hash" | "imagetesthash" => EmbeddingProvider::ImageTestHash, | ||||
|             "fastembed" | "lancefastembed" => EmbeddingProvider::LanceFastEmbed, | ||||
|             "openai" | "lanceopenai" => EmbeddingProvider::LanceOpenAI, | ||||
|             other => EmbeddingProvider::LanceOther(other.to_string()), | ||||
| @@ -1030,6 +1031,7 @@ impl RpcServer for RpcServerImpl { | ||||
|         Ok(serde_json::json!({ | ||||
|             "provider": match cfg.provider { | ||||
|                 EmbeddingProvider::TestHash => "test-hash", | ||||
|                 EmbeddingProvider::ImageTestHash => "testimagehash", | ||||
|                 EmbeddingProvider::LanceFastEmbed => "lancefastembed", | ||||
|                 EmbeddingProvider::LanceOpenAI => "lanceopenai", | ||||
|                 EmbeddingProvider::LanceOther(ref s) => s, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user