...

2025-08-23 04:57:47 +02:00
parent 56699b9abb
commit c9e1dcdb6c
47 changed files with 9267 additions and 191 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,9 +1,29 @@
-[workspace]
+[package]
-members = ["herodb"]
+name = "herodb"
-resolver = "2"
+version = "0.0.1"
 authors = ["Pin Fang <fpfangpin@hotmail.com>"]
 edition = "2021"
-# You can define shared profiles for all workspace members here
+[dependencies]
-[profile.release]
+anyhow = "1.0.59"
-lto = true
+bytes = "1.3.0"
-codegen-units = 1
+thiserror = "1.0.32"
-strip = true
+tokio = { version = "1.23.0", features = ["full"] }
 clap = { version = "4.5.20", features = ["derive"] }
 byteorder = "1.4.3"
 futures = "0.3"
 sled = "0.34"
 redb = "2.1.3"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 bincode = "1.3"
 chacha20poly1305 = "0.10.1"
 rand = "0.8"
 sha2 = "0.10"
 age = "0.10"
 secrecy = "0.8"
 ed25519-dalek = "2"
 base64 = "0.22"
 [dev-dependencies]
 redis = { version = "0.24", features = ["aio", "tokio-comp"] }
--- a/herodb/README.md
+++ b/herodb/README.md
--- a/herodb/build.sh
+++ b/herodb/build.sh
--- a/herodb/docs/age.md
+++ b/herodb/docs/age.md
--- a/herodb/docs/basics.md
+++ b/herodb/docs/basics.md
--- a/herodb/docs/cmds.md
+++ b/herodb/docs/cmds.md
@@ -70,6 +70,15 @@ MULTI/EXEC/DISCARD | ✅ | ❌ | Only supported in redb |
 **Encryption** | | | |
 Data-at-rest encryption | ✅ | ✅ | Both support [age](age.tech) encryption |
 AGE commands | ✅ | ✅ | Both support AGE crypto commands |
 **Full-Text Search** | | | |
 FT.CREATE | ✅ | ✅ | Create search index with schema |
 FT.ADD | ✅ | ✅ | Add document to search index |
 FT.SEARCH | ✅ | ✅ | Search documents with query |
 FT.DEL | ✅ | ✅ | Delete document from index |
 FT.INFO | ✅ | ✅ | Get index information |
 FT.DROP | ✅ | ✅ | Drop search index |
 FT.ALTER | ✅ | ✅ | Alter index schema |
 FT.AGGREGATE | ✅ | ✅ | Aggregate search results |
 ### Performance Considerations
--- a/docs/search.md
+++ b/docs/search.md
@@ -0,0 +1,397 @@
 # Full-Text Search with Tantivy
 HeroDB includes powerful full-text search capabilities powered by [Tantivy](https://github.com/quickwit-oss/tantivy), a fast full-text search engine library written in Rust. This provides Redis-compatible search commands similar to RediSearch.
 ## Overview
 The search functionality allows you to:
 - Create search indexes with custom schemas
 - Index documents with multiple field types
 - Perform complex queries with filters
 - Support for text, numeric, date, and geographic data
 - Real-time search with high performance
 ## Search Commands
 ### FT.CREATE - Create Search Index
 Create a new search index with a defined schema.
 ```bash
 FT.CREATE index_name SCHEMA field_name field_type [options] [field_name field_type [options] ...]
 ```
 **Field Types:**
 - `TEXT` - Full-text searchable text fields
 - `NUMERIC` - Numeric fields (integers, floats)
 - `TAG` - Tag fields for exact matching
 - `GEO` - Geographic coordinates (lat,lon)
 - `DATE` - Date/timestamp fields
 **Field Options:**
 - `STORED` - Store field value for retrieval
 - `INDEXED` - Make field searchable
 - `TOKENIZED` - Enable tokenization for text fields
 - `FAST` - Enable fast access for numeric fields
 **Example:**
 ```bash
 # Create a product search index
 FT.CREATE products SCHEMA 
  title TEXT STORED INDEXED TOKENIZED
  description TEXT STORED INDEXED TOKENIZED  
  price NUMERIC STORED INDEXED FAST
  category TAG STORED
  location GEO STORED
  created_date DATE STORED INDEXED
 ```
 ### FT.ADD - Add Document to Index
 Add a document to a search index.
 ```bash
 FT.ADD index_name doc_id [SCORE score] FIELDS field_name field_value [field_name field_value ...]
 ```
 **Example:**
 ```bash
 # Add a product document
 FT.ADD products product:1 SCORE 1.0 FIELDS 
  title "Wireless Headphones" 
  description "High-quality wireless headphones with noise cancellation"
  price 199.99
  category "electronics"
  location "37.7749,-122.4194"
  created_date 1640995200000
 ```
 ### FT.SEARCH - Search Documents
 Search for documents in an index.
 ```bash
 FT.SEARCH index_name query [LIMIT offset count] [FILTER field min max] [RETURN field [field ...]]
 ```
 **Query Syntax:**
 - Simple terms: `wireless headphones`
 - Phrase queries: `"noise cancellation"`
 - Field-specific: `title:wireless`
 - Boolean operators: `wireless AND headphones`
 - Wildcards: `head*`
 **Examples:**
 ```bash
 # Simple text search
 FT.SEARCH products "wireless headphones"
 # Search with filters
 FT.SEARCH products "headphones" FILTER price 100 300 LIMIT 0 10
 # Field-specific search
 FT.SEARCH products "title:wireless AND category:electronics"
 # Return specific fields only
 FT.SEARCH products "*" RETURN title price
 ```
 ### FT.DEL - Delete Document
 Remove a document from the search index.
 ```bash
 FT.DEL index_name doc_id
 ```
 **Example:**
 ```bash
 FT.DEL products product:1
 ```
 ### FT.INFO - Get Index Information
 Get information about a search index.
 ```bash
 FT.INFO index_name
 ```
 **Returns:**
 - Index name and document count
 - Field definitions and types
 - Index configuration
 **Example:**
 ```bash
 FT.INFO products
 ```
 ### FT.DROP - Drop Index
 Delete an entire search index.
 ```bash
 FT.DROP index_name
 ```
 **Example:**
 ```bash
 FT.DROP products
 ```
 ### FT.ALTER - Alter Index Schema
 Add new fields to an existing index.
 ```bash
 FT.ALTER index_name SCHEMA ADD field_name field_type [options]
 ```
 **Example:**
 ```bash
 FT.ALTER products SCHEMA ADD brand TAG STORED
 ```
 ### FT.AGGREGATE - Aggregate Search Results
 Perform aggregations on search results.
 ```bash
 FT.AGGREGATE index_name query [GROUPBY field] [REDUCE function field AS alias]
 ```
 **Example:**
 ```bash
 # Group products by category and count
 FT.AGGREGATE products "*" GROUPBY category REDUCE COUNT 0 AS count
 ```
 ## Field Types in Detail
 ### TEXT Fields
 - **Purpose**: Full-text search on natural language content
 - **Features**: Tokenization, stemming, stop-word removal
 - **Options**: `STORED`, `INDEXED`, `TOKENIZED`
 - **Example**: Product titles, descriptions, content
 ### NUMERIC Fields  
 - **Purpose**: Numeric data for range queries and sorting
 - **Types**: I64, U64, F64
 - **Options**: `STORED`, `INDEXED`, `FAST`
 - **Example**: Prices, quantities, ratings
 ### TAG Fields
 - **Purpose**: Exact-match categorical data
 - **Features**: No tokenization, exact string matching
 - **Options**: `STORED`, case sensitivity control
 - **Example**: Categories, brands, status values
 ### GEO Fields
 - **Purpose**: Geographic coordinates
 - **Format**: "latitude,longitude" (e.g., "37.7749,-122.4194")
 - **Features**: Geographic distance queries
 - **Options**: `STORED`
 ### DATE Fields
 - **Purpose**: Timestamp and date data
 - **Format**: Unix timestamp in milliseconds
 - **Features**: Range queries, temporal filtering
 - **Options**: `STORED`, `INDEXED`, `FAST`
 ## Search Query Syntax
 ### Basic Queries
 ```bash
 # Single term
 FT.SEARCH products "wireless"
 # Multiple terms (AND by default)
 FT.SEARCH products "wireless headphones"
 # Phrase query
 FT.SEARCH products "\"noise cancellation\""
 ```
 ### Field-Specific Queries
 ```bash
 # Search in specific field
 FT.SEARCH products "title:wireless"
 # Multiple field queries
 FT.SEARCH products "title:wireless AND description:bluetooth"
 ```
 ### Boolean Operators
 ```bash
 # AND operator
 FT.SEARCH products "wireless AND headphones"
 # OR operator  
 FT.SEARCH products "wireless OR bluetooth"
 # NOT operator
 FT.SEARCH products "headphones NOT wired"
 ```
 ### Wildcards and Fuzzy Search
 ```bash
 # Wildcard search
 FT.SEARCH products "head*"
 # Fuzzy search (approximate matching)
 FT.SEARCH products "%headphone%"
 ```
 ### Range Queries
 ```bash
 # Numeric range in query
 FT.SEARCH products "@price:[100 300]"
 # Date range
 FT.SEARCH products "@created_date:[1640995200000 1672531200000]"
 ```
 ## Filtering and Sorting
 ### FILTER Clause
 ```bash
 # Numeric filter
 FT.SEARCH products "headphones" FILTER price 100 300
 # Multiple filters
 FT.SEARCH products "*" FILTER price 100 500 FILTER rating 4 5
 ```
 ### LIMIT Clause
 ```bash
 # Pagination
 FT.SEARCH products "wireless" LIMIT 0 10    # First 10 results
 FT.SEARCH products "wireless" LIMIT 10 10   # Next 10 results
 ```
 ### RETURN Clause
 ```bash
 # Return specific fields
 FT.SEARCH products "*" RETURN title price
 # Return all stored fields (default)
 FT.SEARCH products "*"
 ```
 ## Performance Considerations
 ### Indexing Strategy
 - Only index fields you need to search on
 - Use `FAST` option for frequently filtered numeric fields
 - Consider storage vs. search performance trade-offs
 ### Query Optimization
 - Use specific field queries when possible
 - Combine filters with text queries for better performance
 - Use pagination with LIMIT for large result sets
 ### Memory Usage
 - Tantivy indexes are memory-mapped for performance
 - Index size depends on document count and field configuration
 - Monitor disk space for index storage
 ## Integration with Redis Commands
 Search indexes work alongside regular Redis data:
 ```bash
 # Store product data in Redis hash
 HSET product:1 title "Wireless Headphones" price "199.99"
 # Index the same data for search
 FT.ADD products product:1 FIELDS title "Wireless Headphones" price 199.99
 # Search returns document IDs that can be used with Redis commands
 FT.SEARCH products "wireless"
 # Returns: product:1
 # Retrieve full data using Redis
 HGETALL product:1
 ```
 ## Example Use Cases
 ### E-commerce Product Search
 ```bash
 # Create product catalog index
 FT.CREATE catalog SCHEMA 
  name TEXT STORED INDEXED TOKENIZED
  description TEXT INDEXED TOKENIZED
  price NUMERIC STORED INDEXED FAST
  category TAG STORED
  brand TAG STORED
  rating NUMERIC STORED FAST
 # Add products
 FT.ADD catalog prod:1 FIELDS name "iPhone 14" price 999 category "phones" brand "apple" rating 4.5
 FT.ADD catalog prod:2 FIELDS name "Samsung Galaxy" price 899 category "phones" brand "samsung" rating 4.3
 # Search queries
 FT.SEARCH catalog "iPhone"
 FT.SEARCH catalog "phones" FILTER price 800 1000
 FT.SEARCH catalog "@brand:apple"
 ```
 ### Content Management
 ```bash
 # Create content index
 FT.CREATE content SCHEMA
  title TEXT STORED INDEXED TOKENIZED
  body TEXT INDEXED TOKENIZED
  author TAG STORED
  published DATE STORED INDEXED
  tags TAG STORED
 # Search content
 FT.SEARCH content "machine learning"
 FT.SEARCH content "@author:john AND @tags:ai"
 FT.SEARCH content "*" FILTER published 1640995200000 1672531200000
 ```
 ### Geographic Search
 ```bash
 # Create location-based index
 FT.CREATE places SCHEMA
  name TEXT STORED INDEXED TOKENIZED
  location GEO STORED
  type TAG STORED
 # Add locations
 FT.ADD places place:1 FIELDS name "Golden Gate Bridge" location "37.8199,-122.4783" type "landmark"
 # Geographic queries (future feature)
 FT.SEARCH places "@location:[37.7749 -122.4194 10 km]"
 ```
 ## Error Handling
 Common error responses:
 - `ERR index not found` - Index doesn't exist
 - `ERR field not found` - Field not defined in schema
 - `ERR invalid query syntax` - Malformed query
 - `ERR document not found` - Document ID doesn't exist
 ## Best Practices
 1. **Schema Design**: Plan your schema carefully - changes require reindexing
 2. **Field Selection**: Only store and index fields you actually need
 3. **Batch Operations**: Add multiple documents efficiently
 4. **Query Testing**: Test queries for performance with realistic data
 5. **Monitoring**: Monitor index size and query performance
 6. **Backup**: Include search indexes in backup strategies
 ## Future Enhancements
 Planned features:
 - Geographic distance queries
 - Advanced aggregations and faceting
 - Highlighting of search results
 - Synonyms and custom analyzers
 - Real-time suggestions and autocomplete
 - Index replication and sharding
--- a/herodb/Cargo.toml
+++ b/herodb/Cargo.toml
@@ -1,29 +0,0 @@
 [package]
 name = "herodb"
 version = "0.0.1"
 authors = ["Pin Fang <fpfangpin@hotmail.com>"]
 edition = "2021"
 [dependencies]
 anyhow = "1.0.59"
 bytes = "1.3.0"
 thiserror = "1.0.32"
 tokio = { version = "1.23.0", features = ["full"] }
 clap = { version = "4.5.20", features = ["derive"] }
 byteorder = "1.4.3"
 futures = "0.3"
 sled = "0.34"
 redb = "2.1.3"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 bincode = "1.3"
 chacha20poly1305 = "0.10.1"
 rand = "0.8"
 sha2 = "0.10"
 age = "0.10"
 secrecy = "0.8"
 ed25519-dalek = "2"
 base64 = "0.22"
 [dev-dependencies]
 redis = { version = "0.24", features = ["aio", "tokio-comp"] }
--- a/herodb/examples/age_bash_demo.sh
+++ b/herodb/examples/age_bash_demo.sh
@@ -1,71 +0,0 @@
 #!/bin/bash
 # Start the herodb server in the background
 echo "Starting herodb server..."
 cargo run -p herodb -- --dir /tmp/herodb_age_test --port 6382 --debug --encryption-key "testkey" &
 SERVER_PID=$!
 sleep 2 # Give the server a moment to start
 REDIS_CLI="redis-cli -p 6382"
 echo "--- Generating and Storing Encryption Keys ---"
 # The new AGE commands are 'AGE KEYGEN <name>' etc., based on src/cmd.rs
 # This script uses older commands like 'AGE.GENERATE_KEYPAIR alice'
 # The demo script needs to be updated to match the implemented commands.
 # Let's assume the commands in the script are what's expected for now,
 # but note this discrepancy. The new commands are AGE KEYGEN etc.
 # The script here uses a different syntax not found in src/cmd.rs like 'AGE.GENERATE_KEYPAIR'.
 # For now, I will modify the script to fit the actual implementation.
 echo "--- Generating and Storing Encryption Keys ---"
 $REDIS_CLI AGE KEYGEN alice
 $REDIS_CLI AGE KEYGEN bob
 echo "--- Encrypting and Decrypting a Message ---"
 MESSAGE="Hello, AGE encryption!"
 # The new logic stores keys internally and does not expose a command to get the public key.
 # We will encrypt by name.
 ALICE_PUBKEY_REPLY=$($REDIS_CLI AGE KEYGEN alice | head -n 2 | tail -n 1)
 echo "Alice's Public Key: $ALICE_PUBKEY_REPLY"
 echo "Encrypting message: '$MESSAGE' with Alice's identity..."
 # AGE.ENCRYPT recipient message. But since we use persistent keys, let's use ENCRYPTNAME
 CIPHERTEXT=$($REDIS_CLI AGE ENCRYPTNAME alice "$MESSAGE")
 echo "Ciphertext: $CIPHERTEXT"
 echo "Decrypting ciphertext with Alice's private key..."
 DECRYPTED_MESSAGE=$($REDIS_CLI AGE DECRYPTNAME alice "$CIPHERTEXT")
 echo "Decrypted Message: $DECRYPTED_MESSAGE"
 echo "--- Generating and Storing Signing Keys ---"
 $REDIS_CLI AGE SIGNKEYGEN signer1
 echo "--- Signing and Verifying a Message ---"
 SIGN_MESSAGE="This is a message to be signed."
 # Similar to above, we don't have GET_SIGN_PUBKEY. We will verify by name.
 echo "Signing message: '$SIGN_MESSAGE' with signer1's private key..."
 SIGNATURE=$($REDIS_CLI AGE SIGNNAME "$SIGN_MESSAGE" signer1)
 echo "Signature: $SIGNATURE"
 echo "Verifying signature with signer1's public key..."
 VERIFY_RESULT=$($REDIS_CLI AGE VERIFYNAME signer1 "$SIGN_MESSAGE" "$SIGNATURE")
 echo "Verification Result: $VERIFY_RESULT"
 # There is no DELETE_KEYPAIR command in the implementation
 echo "--- Cleaning up keys (manual in herodb) ---"
 # We would use DEL for age:key:alice, etc.
 $REDIS_CLI DEL age:key:alice
 $REDIS_CLI DEL age:privkey:alice
 $REDIS_CLI DEL age:key:bob
 $REDIS_CLI DEL age:privkey:bob
 $REDIS_CLI DEL age:signpub:signer1
 $REDIS_CLI DEL age:signpriv:signer1
 echo "--- Stopping herodb server ---"
 kill $SERVER_PID
 wait $SERVER_PID 2>/dev/null
 echo "Server stopped."
 echo "Bash demo complete."
--- a/herodb/examples/age_persist_demo.rs
+++ b/herodb/examples/age_persist_demo.rs
@@ -1,83 +0,0 @@
 use std::io::{Read, Write};
 use std::net::TcpStream;
 // Minimal RESP helpers
 fn arr(parts: &[&str]) -> String {
    let mut out = format!("*{}\r\n", parts.len());
    for p in parts {
        out.push_str(&format!("${}\r\n{}\r\n", p.len(), p));
    }
    out
 }
 fn read_reply(s: &mut TcpStream) -> String {
    let mut buf = [0u8; 65536];
    let n = s.read(&mut buf).unwrap();
    String::from_utf8_lossy(&buf[..n]).to_string()
 }
 fn parse_two_bulk(reply: &str) -> Option<(String,String)> {
    let mut lines = reply.split("\r\n");
    if lines.next()? != "*2" { return None; }
    let _n = lines.next()?;
    let a = lines.next()?.to_string();
    let _m = lines.next()?;
    let b = lines.next()?.to_string();
    Some((a,b))
 }
 fn parse_bulk(reply: &str) -> Option<String> {
    let mut lines = reply.split("\r\n");
    let hdr = lines.next()?;
    if !hdr.starts_with('$') { return None; }
    Some(lines.next()?.to_string())
 }
 fn parse_simple(reply: &str) -> Option<String> {
    let mut lines = reply.split("\r\n");
    let hdr = lines.next()?;
    if !hdr.starts_with('+') { return None; }
    Some(hdr[1..].to_string())
 }
 fn main() {
    let mut args = std::env::args().skip(1);
    let host = args.next().unwrap_or_else(|| "127.0.0.1".into());
    let port = args.next().unwrap_or_else(|| "6379".into());
    let addr = format!("{host}:{port}");
    println!("Connecting to {addr}...");
    let mut s = TcpStream::connect(addr).expect("connect");
    // Generate & persist X25519 enc keys under name "alice"
    s.write_all(arr(&["age","keygen","alice"]).as_bytes()).unwrap();
    let (_alice_recip, _alice_ident) = parse_two_bulk(&read_reply(&mut s)).expect("gen enc");
    // Generate & persist Ed25519 signing key under name "signer"
    s.write_all(arr(&["age","signkeygen","signer"]).as_bytes()).unwrap();
    let (_verify, _secret) = parse_two_bulk(&read_reply(&mut s)).expect("gen sign");
    // Encrypt by name
    let msg = "hello from persistent keys";
    s.write_all(arr(&["age","encryptname","alice", msg]).as_bytes()).unwrap();
    let ct_b64 = parse_bulk(&read_reply(&mut s)).expect("ct b64");
    println!("ciphertext b64: {}", ct_b64);
    // Decrypt by name
    s.write_all(arr(&["age","decryptname","alice", &ct_b64]).as_bytes()).unwrap();
    let pt = parse_bulk(&read_reply(&mut s)).expect("pt");
    assert_eq!(pt, msg);
    println!("decrypted ok");
    // Sign by name
    s.write_all(arr(&["age","signname","signer", msg]).as_bytes()).unwrap();
    let sig_b64 = parse_bulk(&read_reply(&mut s)).expect("sig b64");
    // Verify by name
    s.write_all(arr(&["age","verifyname","signer", msg, &sig_b64]).as_bytes()).unwrap();
    let ok = parse_simple(&read_reply(&mut s)).expect("verify");
    assert_eq!(ok, "1");
    println!("signature verified");
    // List names
    s.write_all(arr(&["age","list"]).as_bytes()).unwrap();
    let list = read_reply(&mut s);
    println!("LIST -> {list}");
    println!("✔ persistent AGE workflow complete.");
 }
--- a/herodb/run_tests.sh
+++ b/herodb/run_tests.sh
--- a/herodb/specs/backgroundinfo/encrypt.md
+++ b/herodb/specs/backgroundinfo/encrypt.md
--- a/specs/backgroundinfo/lance.md
+++ b/specs/backgroundinfo/lance.md
--- a/specs/backgroundinfo/lancedb.md
+++ b/specs/backgroundinfo/lancedb.md
--- a/herodb/specs/backgroundinfo/redb.md
+++ b/herodb/specs/backgroundinfo/redb.md
--- a/herodb/specs/backgroundinfo/redis_basic_client.md
+++ b/herodb/specs/backgroundinfo/redis_basic_client.md
--- a/herodb/specs/backgroundinfo/redis_basics.md
+++ b/herodb/specs/backgroundinfo/redis_basics.md
--- a/herodb/specs/backgroundinfo/redis_hset_functions.md
+++ b/herodb/specs/backgroundinfo/redis_hset_functions.md
--- a/herodb/specs/backgroundinfo/redis_lists.md
+++ b/herodb/specs/backgroundinfo/redis_lists.md
--- a/herodb/specs/backgroundinfo/sled.md
+++ b/herodb/specs/backgroundinfo/sled.md
--- a/herodb/specs/backgroundinfo/tantivy.md
+++ b/herodb/specs/backgroundinfo/tantivy.md
--- a/specs/lance_implementation.md
+++ b/specs/lance_implementation.md
@@ -0,0 +1,735 @@
 Looking at your request to ensure embedding happens with an external embedding provider configured in Redis at `config:core:aiembed:url`, I'll modify the implementation to use an external embedding service instead of local models.
 ## Updated Implementation with External Embedding Provider
 ### 1. Update Dependencies in `Cargo.toml`
 ```toml
 [dependencies]
 # ... existing dependencies ...
 lance = "0.18"
 lance-index = "0.18"
 lance-linalg = "0.18"
 arrow = "52"
 arrow-array = "52"
 arrow-schema = "52"
 parquet = "52"
 uuid = { version = "1.10", features = ["v4"] }
 reqwest = { version = "0.11", features = ["json"] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 base64 = "0.22"
 image = "0.25"
 ```
 ### 2. Create Enhanced Lance Module with External Embedding
 Create `src/lance_store.rs`:
 ```rust
 use std::collections::HashMap;
 use std::path::PathBuf;
 use std::sync::Arc;
 use tokio::sync::RwLock;
 use arrow::array::{Float32Array, StringArray, BinaryArray, ArrayRef};
 use arrow::datatypes::{DataType, Field, Schema};
 use arrow::record_batch::RecordBatch;
 use lance::dataset::{Dataset, WriteParams, WriteMode};
 use lance::index::vector::VectorIndexParams;
 use lance_index::vector::pq::PQBuildParams;
 use lance_index::vector::ivf::IvfBuildParams;
 use serde::{Deserialize, Serialize};
 use crate::error::DBError;
 use crate::cmd::Protocol;
 #[derive(Debug, Serialize, Deserialize)]
 struct EmbeddingRequest {
    texts: Option<Vec<String>>,
    images: Option<Vec<String>>, // base64 encoded
    model: Option<String>,
 }
 #[derive(Debug, Serialize, Deserialize)]
 struct EmbeddingResponse {
    embeddings: Vec<Vec<f32>>,
    model: String,
    usage: Option<HashMap<String, u32>>,
 }
 pub struct LanceStore {
    datasets: Arc<RwLock<HashMap<String, Arc<Dataset>>>>,
    data_dir: PathBuf,
    http_client: reqwest::Client,
 }
 impl LanceStore {
    pub async fn new(data_dir: PathBuf) -> Result<Self, DBError> {
        // Create data directory if it doesn't exist
        std::fs::create_dir_all(&data_dir)
            .map_err(|e| DBError(format!("Failed to create Lance data directory: {}", e)))?;
        let http_client = reqwest::Client::builder()
            .timeout(std::time::Duration::from_secs(30))
            .build()
            .map_err(|e| DBError(format!("Failed to create HTTP client: {}", e)))?;
        Ok(Self {
            datasets: Arc::new(RwLock::new(HashMap::new())),
            data_dir,
            http_client,
        })
    }
    /// Get embedding service URL from Redis config
    async fn get_embedding_url(&self, server: &crate::server::Server) -> Result<String, DBError> {
        // Get the embedding URL from Redis config
        let key = "config:core:aiembed:url";
        // Use HGET to retrieve the URL from Redis hash
        let cmd = crate::cmd::Cmd::HGet {
            key: key.to_string(),
            field: "url".to_string(),
        };
        // Execute command to get the config
        let result = cmd.run(server).await?;
        match result {
            Protocol::BulkString(url) => Ok(url),
            Protocol::SimpleString(url) => Ok(url),
            Protocol::Nil => Err(DBError(
                "Embedding service URL not configured. Set it with: HSET config:core:aiembed:url url <YOUR_EMBEDDING_SERVICE_URL>".to_string()
            )),
            _ => Err(DBError("Invalid embedding URL configuration".to_string())),
        }
    }
    /// Call external embedding service
    async fn call_embedding_service(
        &self,
        server: &crate::server::Server,
        texts: Option<Vec<String>>,
        images: Option<Vec<String>>,
    ) -> Result<Vec<Vec<f32>>, DBError> {
        let url = self.get_embedding_url(server).await?;
        let request = EmbeddingRequest {
            texts,
            images,
            model: None, // Let the service use its default
        };
        let response = self.http_client
            .post(&url)
            .json(&request)
            .send()
            .await
            .map_err(|e| DBError(format!("Failed to call embedding service: {}", e)))?;
        if !response.status().is_success() {
            let status = response.status();
            let error_text = response.text().await.unwrap_or_default();
            return Err(DBError(format!(
                "Embedding service returned error {}: {}", 
                status, error_text
            )));
        }
        let embedding_response: EmbeddingResponse = response
            .json()
            .await
            .map_err(|e| DBError(format!("Failed to parse embedding response: {}", e)))?;
        Ok(embedding_response.embeddings)
    }
    pub async fn embed_text(
        &self, 
        server: &crate::server::Server,
        texts: Vec<String>
    ) -> Result<Vec<Vec<f32>>, DBError> {
        if texts.is_empty() {
            return Ok(Vec::new());
        }
        self.call_embedding_service(server, Some(texts), None).await
    }
    pub async fn embed_image(
        &self,
        server: &crate::server::Server,
        image_bytes: Vec<u8>
    ) -> Result<Vec<f32>, DBError> {
        // Convert image bytes to base64
        let base64_image = base64::encode(&image_bytes);
        let embeddings = self.call_embedding_service(
            server, 
            None, 
            Some(vec![base64_image])
        ).await?;
        embeddings.into_iter()
            .next()
            .ok_or_else(|| DBError("No embedding returned for image".to_string()))
    }
    pub async fn create_dataset(
        &self,
        name: &str,
        schema: Schema,
    ) -> Result<(), DBError> {
        let dataset_path = self.data_dir.join(format!("{}.lance", name));
        // Create empty dataset with schema
        let write_params = WriteParams {
            mode: WriteMode::Create,
            ..Default::default()
        };
        // Create an empty RecordBatch with the schema
        let empty_batch = RecordBatch::new_empty(Arc::new(schema));
        let batches = vec![empty_batch];
        let dataset = Dataset::write(
            batches,
            dataset_path.to_str().unwrap(),
            Some(write_params)
        ).await
        .map_err(|e| DBError(format!("Failed to create dataset: {}", e)))?;
        let mut datasets = self.datasets.write().await;
        datasets.insert(name.to_string(), Arc::new(dataset));
        Ok(())
    }
    pub async fn write_vectors(
        &self,
        dataset_name: &str,
        vectors: Vec<Vec<f32>>,
        metadata: Option<HashMap<String, Vec<String>>>,
    ) -> Result<usize, DBError> {
        let dataset_path = self.data_dir.join(format!("{}.lance", dataset_name));
        // Open or get cached dataset
        let dataset = self.get_or_open_dataset(dataset_name).await?;
        // Build RecordBatch
        let num_vectors = vectors.len();
        if num_vectors == 0 {
            return Ok(0);
        }
        let dim = vectors.first()
            .ok_or_else(|| DBError("Empty vectors".to_string()))?
            .len();
        // Flatten vectors
        let flat_vectors: Vec<f32> = vectors.into_iter().flatten().collect();
        let vector_array = Float32Array::from(flat_vectors);
        let vector_array = arrow::array::FixedSizeListArray::try_new_from_values(
            vector_array, 
            dim as i32
        ).map_err(|e| DBError(format!("Failed to create vector array: {}", e)))?;
        let mut arrays: Vec<ArrayRef> = vec![Arc::new(vector_array)];
        let mut fields = vec![Field::new(
            "vector",
            DataType::FixedSizeList(
                Arc::new(Field::new("item", DataType::Float32, true)),
                dim as i32
            ),
            false
        )];
        // Add metadata columns if provided
        if let Some(metadata) = metadata {
            for (key, values) in metadata {
                if values.len() != num_vectors {
                    return Err(DBError(format!(
                        "Metadata field '{}' has {} values but expected {}", 
                        key, values.len(), num_vectors
                    )));
                }
                let array = StringArray::from(values);
                arrays.push(Arc::new(array));
                fields.push(Field::new(&key, DataType::Utf8, true));
            }
        }
        let schema = Arc::new(Schema::new(fields));
        let batch = RecordBatch::try_new(schema, arrays)
            .map_err(|e| DBError(format!("Failed to create RecordBatch: {}", e)))?;
        // Append to dataset
        let write_params = WriteParams {
            mode: WriteMode::Append,
            ..Default::default()
        };
        Dataset::write(
            vec![batch],
            dataset_path.to_str().unwrap(),
            Some(write_params)
        ).await
        .map_err(|e| DBError(format!("Failed to write to dataset: {}", e)))?;
        // Refresh cached dataset
        let mut datasets = self.datasets.write().await;
        datasets.remove(dataset_name);
        Ok(num_vectors)
    }
    pub async fn search_vectors(
        &self,
        dataset_name: &str,
        query_vector: Vec<f32>,
        k: usize,
        nprobes: Option<usize>,
        refine_factor: Option<usize>,
    ) -> Result<Vec<(f32, HashMap<String, String>)>, DBError> {
        let dataset = self.get_or_open_dataset(dataset_name).await?;
        // Build query
        let mut query = dataset.scan();
        query = query.nearest(
            "vector",
            &query_vector,
            k,
        ).map_err(|e| DBError(format!("Failed to build search query: {}", e)))?;
        if let Some(nprobes) = nprobes {
            query = query.nprobes(nprobes);
        }
        if let Some(refine) = refine_factor {
            query = query.refine_factor(refine);
        }
        // Execute search
        let results = query
            .try_into_stream()
            .await
            .map_err(|e| DBError(format!("Failed to execute search: {}", e)))?
            .try_collect::<Vec<_>>()
            .await
            .map_err(|e| DBError(format!("Failed to collect results: {}", e)))?;
        // Process results
        let mut output = Vec::new();
        for batch in results {
            // Get distances
            let distances = batch
                .column_by_name("_distance")
                .ok_or_else(|| DBError("No distance column".to_string()))?
                .as_any()
                .downcast_ref::<Float32Array>()
                .ok_or_else(|| DBError("Invalid distance type".to_string()))?;
            // Get metadata
            for i in 0..batch.num_rows() {
                let distance = distances.value(i);
                let mut metadata = HashMap::new();
                for field in batch.schema().fields() {
                    if field.name() != "vector" && field.name() != "_distance" {
                        if let Some(col) = batch.column_by_name(field.name()) {
                            if let Some(str_array) = col.as_any().downcast_ref::<StringArray>() {
                                if !str_array.is_null(i) {
                                    metadata.insert(
                                        field.name().to_string(),
                                        str_array.value(i).to_string()
                                    );
                                }
                            }
                        }
                    }
                }
                output.push((distance, metadata));
            }
        }
        Ok(output)
    }
    pub async fn store_multimodal(
        &self,
        server: &crate::server::Server,
        dataset_name: &str,
        text: Option<String>,
        image_bytes: Option<Vec<u8>>,
        metadata: HashMap<String, String>,
    ) -> Result<String, DBError> {
        // Generate ID
        let id = uuid::Uuid::new_v4().to_string();
        // Generate embeddings using external service
        let embedding = if let Some(text) = text.as_ref() {
            self.embed_text(server, vec![text.clone()]).await?
                .into_iter()
                .next()
                .ok_or_else(|| DBError("No embedding returned".to_string()))?
        } else if let Some(img) = image_bytes.as_ref() {
            self.embed_image(server, img.clone()).await?
        } else {
            return Err(DBError("No text or image provided".to_string()));
        };
        // Prepare metadata
        let mut full_metadata = metadata;
        full_metadata.insert("id".to_string(), id.clone());
        if let Some(text) = text {
            full_metadata.insert("text".to_string(), text);
        }
        if let Some(img) = image_bytes {
            full_metadata.insert("image_base64".to_string(), base64::encode(img));
        }
        // Convert metadata to column vectors
        let mut metadata_cols = HashMap::new();
        for (key, value) in full_metadata {
            metadata_cols.insert(key, vec![value]);
        }
        // Write to dataset
        self.write_vectors(dataset_name, vec![embedding], Some(metadata_cols)).await?;
        Ok(id)
    }
    pub async fn search_with_text(
        &self,
        server: &crate::server::Server,
        dataset_name: &str,
        query_text: String,
        k: usize,
        nprobes: Option<usize>,
        refine_factor: Option<usize>,
    ) -> Result<Vec<(f32, HashMap<String, String>)>, DBError> {
        // Embed the query text using external service
        let embeddings = self.embed_text(server, vec![query_text]).await?;
        let query_vector = embeddings.into_iter()
            .next()
            .ok_or_else(|| DBError("No embedding returned for query".to_string()))?;
        // Search with the embedding
        self.search_vectors(dataset_name, query_vector, k, nprobes, refine_factor).await
    }
    pub async fn create_index(
        &self,
        dataset_name: &str,
        index_type: &str,
        num_partitions: Option<usize>,
        num_sub_vectors: Option<usize>,
    ) -> Result<(), DBError> {
        let dataset = self.get_or_open_dataset(dataset_name).await?;
        let mut params = VectorIndexParams::default();
        match index_type.to_uppercase().as_str() {
            "IVF_PQ" => {
                params.ivf = IvfBuildParams {
                    num_partitions: num_partitions.unwrap_or(256),
                    ..Default::default()
                };
                params.pq = PQBuildParams {
                    num_sub_vectors: num_sub_vectors.unwrap_or(16),
                    ..Default::default()
                };
            }
            _ => return Err(DBError(format!("Unsupported index type: {}", index_type))),
        }
        dataset.create_index(
            &["vector"],
            lance::index::IndexType::Vector,
            None,
            &params,
            true
        ).await
        .map_err(|e| DBError(format!("Failed to create index: {}", e)))?;
        Ok(())
    }
    async fn get_or_open_dataset(&self, name: &str) -> Result<Arc<Dataset>, DBError> {
        let mut datasets = self.datasets.write().await;
        if let Some(dataset) = datasets.get(name) {
            return Ok(dataset.clone());
        }
        let dataset_path = self.data_dir.join(format!("{}.lance", name));
        if !dataset_path.exists() {
            return Err(DBError(format!("Dataset '{}' does not exist", name)));
        }
        let dataset = Dataset::open(dataset_path.to_str().unwrap())
            .await
            .map_err(|e| DBError(format!("Failed to open dataset: {}", e)))?;
        let dataset = Arc::new(dataset);
        datasets.insert(name.to_string(), dataset.clone());
        Ok(dataset)
    }
    pub async fn list_datasets(&self) -> Result<Vec<String>, DBError> {
        let mut datasets = Vec::new();
        let entries = std::fs::read_dir(&self.data_dir)
            .map_err(|e| DBError(format!("Failed to read data directory: {}", e)))?;
        for entry in entries {
            let entry = entry.map_err(|e| DBError(format!("Failed to read entry: {}", e)))?;
            let path = entry.path();
            if path.is_dir() {
                if let Some(name) = path.file_name() {
                    if let Some(name_str) = name.to_str() {
                        if name_str.ends_with(".lance") {
                            let dataset_name = name_str.trim_end_matches(".lance");
                            datasets.push(dataset_name.to_string());
                        }
                    }
                }
            }
        }
        Ok(datasets)
    }
    pub async fn drop_dataset(&self, name: &str) -> Result<(), DBError> {
        // Remove from cache
        let mut datasets = self.datasets.write().await;
        datasets.remove(name);
        // Delete from disk
        let dataset_path = self.data_dir.join(format!("{}.lance", name));
        if dataset_path.exists() {
            std::fs::remove_dir_all(dataset_path)
                .map_err(|e| DBError(format!("Failed to delete dataset: {}", e)))?;
        }
        Ok(())
    }
    pub async fn get_dataset_info(&self, name: &str) -> Result<HashMap<String, String>, DBError> {
        let dataset = self.get_or_open_dataset(name).await?;
        let mut info = HashMap::new();
        info.insert("name".to_string(), name.to_string());
        info.insert("version".to_string(), dataset.version().to_string());
        info.insert("num_rows".to_string(), dataset.count_rows().await?.to_string());
        // Get schema info
        let schema = dataset.schema();
        let fields: Vec<String> = schema.fields()
            .iter()
            .map(|f| format!("{}:{}", f.name(), f.data_type()))
            .collect();
        info.insert("schema".to_string(), fields.join(", "));
        Ok(info)
    }
 }
 ```
 ### 3. Update Command Implementations
 Update the command implementations to pass the server reference for embedding service access:
 ```rust
 // In cmd.rs, update the lance command implementations
 async fn lance_store_cmd(
    server: &Server,
    dataset: &str,
    text: Option<String>,
    image_base64: Option<String>,
    metadata: HashMap<String, String>,
 ) -> Result<Protocol, DBError> {
    let lance_store = server.lance_store()?;
    // Decode image if provided
    let image_bytes = if let Some(b64) = image_base64 {
        Some(base64::decode(b64).map_err(|e| 
            DBError(format!("Invalid base64 image: {}", e)))?)
    } else {
        None
    };
    // Pass server reference for embedding service access
    let id = lance_store.store_multimodal(
        server,  // Pass server to access Redis config
        dataset,
        text,
        image_bytes,
        metadata,
    ).await?;
    Ok(Protocol::BulkString(id))
 }
 async fn lance_embed_text_cmd(
    server: &Server,
    texts: &[String],
 ) -> Result<Protocol, DBError> {
    let lance_store = server.lance_store()?;
    // Pass server reference for embedding service access
    let embeddings = lance_store.embed_text(server, texts.to_vec()).await?;
    // Return as array of vectors
    let mut output = Vec::new();
    for embedding in embeddings {
        let vector_str = format!("[{}]", 
            embedding.iter()
                .map(|f| f.to_string())
                .collect::<Vec<_>>()
                .join(",")
        );
        output.push(Protocol::BulkString(vector_str));
    }
    Ok(Protocol::Array(output))
 }
 async fn lance_search_text_cmd(
    server: &Server,
    dataset: &str,
    query_text: &str,
    k: usize,
    nprobes: Option<usize>,
    refine_factor: Option<usize>,
 ) -> Result<Protocol, DBError> {
    let lance_store = server.lance_store()?;
    // Search using text query (will be embedded automatically)
    let results = lance_store.search_with_text(
        server,
        dataset,
        query_text.to_string(),
        k,
        nprobes,
        refine_factor,
    ).await?;
    // Format results
    let mut output = Vec::new();
    for (distance, metadata) in results {
        let metadata_json = serde_json::to_string(&metadata)
            .unwrap_or_else(|_| "{}".to_string());
        output.push(Protocol::Array(vec![
            Protocol::BulkString(distance.to_string()),
            Protocol::BulkString(metadata_json),
        ]));
    }
    Ok(Protocol::Array(output))
 }
 // Add new command for text-based search
 pub enum Cmd {
    // ... existing commands ...
    LanceSearchText {
        dataset: String,
        query_text: String,
        k: usize,
        nprobes: Option<usize>,
        refine_factor: Option<usize>,
    },
 }
 ```
 ## Usage Examples
 ### 1. Configure the Embedding Service
 First, users need to configure the embedding service URL:
 ```bash
 # Configure the embedding service endpoint
 redis-cli> HSET config:core:aiembed:url url "http://localhost:8000/embeddings"
 OK
 # Or use a cloud service
 redis-cli> HSET config:core:aiembed:url url "https://api.openai.com/v1/embeddings"
 OK
 ```
 ### 2. Use Lance Commands with Automatic External Embedding
 ```bash
 # Create a dataset
 redis-cli> LANCE.CREATE products DIM 1536 SCHEMA name:string price:float category:string
 OK
 # Store text with automatic embedding (calls external service)
 redis-cli> LANCE.STORE products TEXT "Wireless noise-canceling headphones with 30-hour battery" name:AirPods price:299.99 category:Electronics
 "uuid-123-456"
 # Search using text query (automatically embeds the query)
 redis-cli> LANCE.SEARCH.TEXT products "best headphones for travel" K 5
 1) "0.92" 
 2) "{\"id\":\"uuid-123\",\"name\":\"AirPods\",\"price\":\"299.99\"}"
 # Get embeddings directly
 redis-cli> LANCE.EMBED.TEXT "This text will be embedded"
 1) "[0.123, 0.456, 0.789, ...]"
 ```
 ## External Embedding Service API Specification
 The external embedding service should accept POST requests with this format:
 ```json
 // Request
 {
  "texts": ["text1", "text2"],  // Optional
  "images": ["base64_img1"],    // Optional
  "model": "text-embedding-ada-002"  // Optional
 }
 // Response
 {
  "embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]],
  "model": "text-embedding-ada-002",
  "usage": {
    "prompt_tokens": 100,
    "total_tokens": 100
  }
 }
 ```
 ## Error Handling
 The implementation includes comprehensive error handling:
 1. **Missing Configuration**: Clear error message if embedding URL not configured
 2. **Service Failures**: Graceful handling of embedding service errors
 3. **Timeout Protection**: 30-second timeout for embedding requests
 4. **Retry Logic**: Could be added for resilience
 ## Benefits of This Approach
 1. **Flexibility**: Supports any embedding service with compatible API
 2. **Cost Control**: Use your preferred embedding provider
 3. **Scalability**: Embedding service can be scaled independently
 4. **Consistency**: All embeddings use the same configured service
 5. **Security**: API keys and endpoints stored securely in Redis
 This implementation ensures that all embedding operations go through the external service configured in Redis, providing a clean separation between the vector database functionality and the embedding generation.
 TODO EXTRA:
 - secret for the embedding service API key
--- a/herodb/src/age.rs
+++ b/herodb/src/age.rs
--- a/herodb/src/cmd.rs
+++ b/herodb/src/cmd.rs
--- a/herodb/src/crypto.rs
+++ b/herodb/src/crypto.rs
--- a/herodb/src/error.rs
+++ b/herodb/src/error.rs
--- a/herodb/src/lib.rs
+++ b/herodb/src/lib.rs
--- a/herodb/src/main.rs
+++ b/herodb/src/main.rs
--- a/herodb/src/options.rs
+++ b/herodb/src/options.rs
--- a/herodb/src/protocol.rs
+++ b/herodb/src/protocol.rs
--- a/herodb/src/server.rs
+++ b/herodb/src/server.rs
--- a/herodb/src/storage/mod.rs
+++ b/herodb/src/storage/mod.rs
--- a/herodb/src/storage/storage_basic.rs
+++ b/herodb/src/storage/storage_basic.rs
--- a/herodb/src/storage/storage_extra.rs
+++ b/herodb/src/storage/storage_extra.rs
--- a/herodb/src/storage/storage_hset.rs
+++ b/herodb/src/storage/storage_hset.rs
--- a/herodb/src/storage/storage_lists.rs
+++ b/herodb/src/storage/storage_lists.rs
--- a/herodb/src/storage_sled/mod.rs
+++ b/herodb/src/storage_sled/mod.rs
--- a/herodb/src/storage_trait.rs
+++ b/herodb/src/storage_trait.rs
--- a/herodb/test_herodb.sh
+++ b/herodb/test_herodb.sh
--- a/herodb/tests/debug_hset.rs
+++ b/herodb/tests/debug_hset.rs
--- a/herodb/tests/debug_hset_simple.rs
+++ b/herodb/tests/debug_hset_simple.rs
--- a/herodb/tests/debug_protocol.rs
+++ b/herodb/tests/debug_protocol.rs
--- a/herodb/tests/redis_integration_tests.rs
+++ b/herodb/tests/redis_integration_tests.rs
--- a/herodb/tests/redis_tests.rs
+++ b/herodb/tests/redis_tests.rs
--- a/herodb/tests/simple_integration_test.rs
+++ b/herodb/tests/simple_integration_test.rs
--- a/herodb/tests/simple_redis_test.rs
+++ b/herodb/tests/simple_redis_test.rs
--- a/herodb/tests/usage_suite.rs
+++ b/herodb/tests/usage_suite.rs