fixed a few bugs related to vector embedding + added additional end-to-end documentation to showcase local and external embedders step-by-step + added example mock embedder Python script

This commit is contained in:
Maxime Van Hees
2025-10-16 15:30:45 +02:00
parent a8720c06db
commit df780e20a2
5 changed files with 1188 additions and 198 deletions

View File

@@ -9,7 +9,7 @@ use sha2::{Digest, Sha256};
use crate::server::Server;
use crate::options::DBOption;
use crate::admin_meta;
use crate::embedding::{EmbeddingConfig, EmbeddingProvider};
use crate::embedding::EmbeddingConfig;
use base64::{engine::general_purpose, Engine as _};
/// Database backend types
@@ -248,9 +248,7 @@ pub trait Rpc {
&self,
db_id: u64,
name: String,
provider: String,
model: String,
params: Option<HashMap<String, String>>,
config: EmbeddingConfig,
) -> RpcResult<bool>;
/// Get per-dataset embedding configuration
@@ -1008,9 +1006,7 @@ impl RpcServer for RpcServerImpl {
&self,
db_id: u64,
name: String,
provider: String,
model: String,
params: Option<HashMap<String, String>>,
config: EmbeddingConfig,
) -> RpcResult<bool> {
let server = self.get_or_create_server(db_id).await?;
if db_id == 0 {
@@ -1022,19 +1018,17 @@ impl RpcServer for RpcServerImpl {
if !server.has_write_permission() {
return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "write permission denied", None::<()>));
}
let prov = match provider.to_lowercase().as_str() {
"test-hash" | "testhash" => EmbeddingProvider::TestHash,
"testimagehash" | "image-test-hash" | "imagetesthash" => EmbeddingProvider::ImageTestHash,
"fastembed" | "lancefastembed" => EmbeddingProvider::LanceFastEmbed,
"openai" | "lanceopenai" => EmbeddingProvider::LanceOpenAI,
other => EmbeddingProvider::LanceOther(other.to_string()),
};
let cfg = EmbeddingConfig {
provider: prov,
model,
params: params.unwrap_or_default(),
};
server.set_dataset_embedding_config(&name, &cfg)
// Validate provider and dimension (only a minimal set is allowed for now)
match config.provider {
crate::embedding::EmbeddingProvider::openai
| crate::embedding::EmbeddingProvider::test
| crate::embedding::EmbeddingProvider::image_test => {}
}
if config.dim == 0 {
return Err(jsonrpsee::types::ErrorObjectOwned::owned(-32000, "Invalid embedding config: dim must be > 0", None::<()>));
}
server.set_dataset_embedding_config(&name, &config)
.map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?;
Ok(true)
}
@@ -1056,17 +1050,7 @@ impl RpcServer for RpcServerImpl {
}
let cfg = server.get_dataset_embedding_config(&name)
.map_err(|e| jsonrpsee::types::ErrorObjectOwned::owned(-32000, e.0, None::<()>))?;
Ok(serde_json::json!({
"provider": match cfg.provider {
EmbeddingProvider::TestHash => "test-hash",
EmbeddingProvider::ImageTestHash => "testimagehash",
EmbeddingProvider::LanceFastEmbed => "lancefastembed",
EmbeddingProvider::LanceOpenAI => "lanceopenai",
EmbeddingProvider::LanceOther(ref s) => s,
},
"model": cfg.model,
"params": cfg.params
}))
Ok(serde_json::to_value(&cfg).unwrap_or(serde_json::json!({})))
}
async fn lance_store_text(