From c9e1dcdb6ce126ef484da61ee86c16494c5ce2f8 Mon Sep 17 00:00:00 2001 From: despiegk Date: Sat, 23 Aug 2025 04:57:47 +0200 Subject: [PATCH 1/4] ... --- Cargo.toml | 36 +- herodb/README.md => README.md | 0 herodb/build.sh => build.sh | 0 {herodb/docs => docs}/age.md | 0 {herodb/docs => docs}/basics.md | 0 {herodb/docs => docs}/cmds.md | 9 + docs/search.md | 397 + herodb/Cargo.toml | 29 - herodb/examples/age_bash_demo.sh | 71 - herodb/examples/age_persist_demo.rs | 83 - herodb/run_tests.sh => run_tests.sh | 0 .../specs => specs}/backgroundinfo/encrypt.md | 0 specs/backgroundinfo/lance.md | 1251 +++ specs/backgroundinfo/lancedb.md | 6847 +++++++++++++++++ .../specs => specs}/backgroundinfo/redb.md | 0 .../backgroundinfo/redis_basic_client.md | 0 .../backgroundinfo/redis_basics.md | 0 .../backgroundinfo/redis_hset_functions.md | 0 .../backgroundinfo/redis_lists.md | 0 .../specs => specs}/backgroundinfo/sled.md | 0 .../specs => specs}/backgroundinfo/tantivy.md | 0 specs/lance_implementation.md | 735 ++ {herodb/src => src}/age.rs | 0 {herodb/src => src}/cmd.rs | 0 {herodb/src => src}/crypto.rs | 0 {herodb/src => src}/error.rs | 0 {herodb/src => src}/lib.rs | 0 {herodb/src => src}/main.rs | 0 {herodb/src => src}/options.rs | 0 {herodb/src => src}/protocol.rs | 0 {herodb/src => src}/server.rs | 0 {herodb/src => src}/storage/mod.rs | 0 {herodb/src => src}/storage/storage_basic.rs | 0 {herodb/src => src}/storage/storage_extra.rs | 0 {herodb/src => src}/storage/storage_hset.rs | 0 {herodb/src => src}/storage/storage_lists.rs | 0 {herodb/src => src}/storage_sled/mod.rs | 0 {herodb/src => src}/storage_trait.rs | 0 herodb/test_herodb.sh => test_herodb.sh | 0 {herodb/tests => tests}/debug_hset.rs | 0 {herodb/tests => tests}/debug_hset_simple.rs | 0 {herodb/tests => tests}/debug_protocol.rs | 0 .../redis_integration_tests.rs | 0 {herodb/tests => tests}/redis_tests.rs | 0 .../simple_integration_test.rs | 0 {herodb/tests => tests}/simple_redis_test.rs | 0 {herodb/tests => tests}/usage_suite.rs | 0 47 files changed, 9267 insertions(+), 191 deletions(-) rename herodb/README.md => README.md (100%) rename herodb/build.sh => build.sh (100%) rename {herodb/docs => docs}/age.md (100%) rename {herodb/docs => docs}/basics.md (100%) rename {herodb/docs => docs}/cmds.md (89%) create mode 100644 docs/search.md delete mode 100644 herodb/Cargo.toml delete mode 100755 herodb/examples/age_bash_demo.sh delete mode 100644 herodb/examples/age_persist_demo.rs rename herodb/run_tests.sh => run_tests.sh (100%) rename {herodb/specs => specs}/backgroundinfo/encrypt.md (100%) create mode 100644 specs/backgroundinfo/lance.md create mode 100644 specs/backgroundinfo/lancedb.md rename {herodb/specs => specs}/backgroundinfo/redb.md (100%) rename {herodb/specs => specs}/backgroundinfo/redis_basic_client.md (100%) rename {herodb/specs => specs}/backgroundinfo/redis_basics.md (100%) rename {herodb/specs => specs}/backgroundinfo/redis_hset_functions.md (100%) rename {herodb/specs => specs}/backgroundinfo/redis_lists.md (100%) rename {herodb/specs => specs}/backgroundinfo/sled.md (100%) rename {herodb/specs => specs}/backgroundinfo/tantivy.md (100%) create mode 100644 specs/lance_implementation.md rename {herodb/src => src}/age.rs (100%) rename {herodb/src => src}/cmd.rs (100%) rename {herodb/src => src}/crypto.rs (100%) rename {herodb/src => src}/error.rs (100%) rename {herodb/src => src}/lib.rs (100%) rename {herodb/src => src}/main.rs (100%) rename {herodb/src => src}/options.rs (100%) rename {herodb/src => src}/protocol.rs (100%) rename 
{herodb/src => src}/server.rs (100%) rename {herodb/src => src}/storage/mod.rs (100%) rename {herodb/src => src}/storage/storage_basic.rs (100%) rename {herodb/src => src}/storage/storage_extra.rs (100%) rename {herodb/src => src}/storage/storage_hset.rs (100%) rename {herodb/src => src}/storage/storage_lists.rs (100%) rename {herodb/src => src}/storage_sled/mod.rs (100%) rename {herodb/src => src}/storage_trait.rs (100%) rename herodb/test_herodb.sh => test_herodb.sh (100%) rename {herodb/tests => tests}/debug_hset.rs (100%) rename {herodb/tests => tests}/debug_hset_simple.rs (100%) rename {herodb/tests => tests}/debug_protocol.rs (100%) rename {herodb/tests => tests}/redis_integration_tests.rs (100%) rename {herodb/tests => tests}/redis_tests.rs (100%) rename {herodb/tests => tests}/simple_integration_test.rs (100%) rename {herodb/tests => tests}/simple_redis_test.rs (100%) rename {herodb/tests => tests}/usage_suite.rs (100%) diff --git a/Cargo.toml b/Cargo.toml index 2eedb5f..7e952b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,9 +1,29 @@ -[workspace] -members = ["herodb"] -resolver = "2" +[package] +name = "herodb" +version = "0.0.1" +authors = ["Pin Fang "] +edition = "2021" -# You can define shared profiles for all workspace members here -[profile.release] -lto = true -codegen-units = 1 -strip = true \ No newline at end of file +[dependencies] +anyhow = "1.0.59" +bytes = "1.3.0" +thiserror = "1.0.32" +tokio = { version = "1.23.0", features = ["full"] } +clap = { version = "4.5.20", features = ["derive"] } +byteorder = "1.4.3" +futures = "0.3" +sled = "0.34" +redb = "2.1.3" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +bincode = "1.3" +chacha20poly1305 = "0.10.1" +rand = "0.8" +sha2 = "0.10" +age = "0.10" +secrecy = "0.8" +ed25519-dalek = "2" +base64 = "0.22" + +[dev-dependencies] +redis = { version = "0.24", features = ["aio", "tokio-comp"] } diff --git a/herodb/README.md b/README.md similarity index 100% rename from herodb/README.md rename to README.md diff --git a/herodb/build.sh b/build.sh similarity index 100% rename from herodb/build.sh rename to build.sh diff --git a/herodb/docs/age.md b/docs/age.md similarity index 100% rename from herodb/docs/age.md rename to docs/age.md diff --git a/herodb/docs/basics.md b/docs/basics.md similarity index 100% rename from herodb/docs/basics.md rename to docs/basics.md diff --git a/herodb/docs/cmds.md b/docs/cmds.md similarity index 89% rename from herodb/docs/cmds.md rename to docs/cmds.md index fa85ff4..78a6e78 100644 --- a/herodb/docs/cmds.md +++ b/docs/cmds.md @@ -70,6 +70,15 @@ MULTI/EXEC/DISCARD | ✅ | ❌ | Only supported in redb | **Encryption** | | | | Data-at-rest encryption | ✅ | ✅ | Both support [age](age.tech) encryption | AGE commands | ✅ | ✅ | Both support AGE crypto commands | +**Full-Text Search** | | | | +FT.CREATE | ✅ | ✅ | Create search index with schema | +FT.ADD | ✅ | ✅ | Add document to search index | +FT.SEARCH | ✅ | ✅ | Search documents with query | +FT.DEL | ✅ | ✅ | Delete document from index | +FT.INFO | ✅ | ✅ | Get index information | +FT.DROP | ✅ | ✅ | Drop search index | +FT.ALTER | ✅ | ✅ | Alter index schema | +FT.AGGREGATE | ✅ | ✅ | Aggregate search results | ### Performance Considerations diff --git a/docs/search.md b/docs/search.md new file mode 100644 index 0000000..27cdfb3 --- /dev/null +++ b/docs/search.md @@ -0,0 +1,397 @@ +# Full-Text Search with Tantivy + +HeroDB includes powerful full-text search capabilities powered by [Tantivy](https://github.com/quickwit-oss/tantivy), a fast 
full-text search engine library written in Rust. This provides Redis-compatible search commands similar to RediSearch. + +## Overview + +The search functionality allows you to: +- Create search indexes with custom schemas +- Index documents with multiple field types +- Perform complex queries with filters +- Support for text, numeric, date, and geographic data +- Real-time search with high performance + +## Search Commands + +### FT.CREATE - Create Search Index + +Create a new search index with a defined schema. + +```bash +FT.CREATE index_name SCHEMA field_name field_type [options] [field_name field_type [options] ...] +``` + +**Field Types:** +- `TEXT` - Full-text searchable text fields +- `NUMERIC` - Numeric fields (integers, floats) +- `TAG` - Tag fields for exact matching +- `GEO` - Geographic coordinates (lat,lon) +- `DATE` - Date/timestamp fields + +**Field Options:** +- `STORED` - Store field value for retrieval +- `INDEXED` - Make field searchable +- `TOKENIZED` - Enable tokenization for text fields +- `FAST` - Enable fast access for numeric fields + +**Example:** +```bash +# Create a product search index +FT.CREATE products SCHEMA + title TEXT STORED INDEXED TOKENIZED + description TEXT STORED INDEXED TOKENIZED + price NUMERIC STORED INDEXED FAST + category TAG STORED + location GEO STORED + created_date DATE STORED INDEXED +``` + +### FT.ADD - Add Document to Index + +Add a document to a search index. + +```bash +FT.ADD index_name doc_id [SCORE score] FIELDS field_name field_value [field_name field_value ...] +``` + +**Example:** +```bash +# Add a product document +FT.ADD products product:1 SCORE 1.0 FIELDS + title "Wireless Headphones" + description "High-quality wireless headphones with noise cancellation" + price 199.99 + category "electronics" + location "37.7749,-122.4194" + created_date 1640995200000 +``` + +### FT.SEARCH - Search Documents + +Search for documents in an index. + +```bash +FT.SEARCH index_name query [LIMIT offset count] [FILTER field min max] [RETURN field [field ...]] +``` + +**Query Syntax:** +- Simple terms: `wireless headphones` +- Phrase queries: `"noise cancellation"` +- Field-specific: `title:wireless` +- Boolean operators: `wireless AND headphones` +- Wildcards: `head*` + +**Examples:** +```bash +# Simple text search +FT.SEARCH products "wireless headphones" + +# Search with filters +FT.SEARCH products "headphones" FILTER price 100 300 LIMIT 0 10 + +# Field-specific search +FT.SEARCH products "title:wireless AND category:electronics" + +# Return specific fields only +FT.SEARCH products "*" RETURN title price +``` + +### FT.DEL - Delete Document + +Remove a document from the search index. + +```bash +FT.DEL index_name doc_id +``` + +**Example:** +```bash +FT.DEL products product:1 +``` + +### FT.INFO - Get Index Information + +Get information about a search index. + +```bash +FT.INFO index_name +``` + +**Returns:** +- Index name and document count +- Field definitions and types +- Index configuration + +**Example:** +```bash +FT.INFO products +``` + +### FT.DROP - Drop Index + +Delete an entire search index. + +```bash +FT.DROP index_name +``` + +**Example:** +```bash +FT.DROP products +``` + +### FT.ALTER - Alter Index Schema + +Add new fields to an existing index. + +```bash +FT.ALTER index_name SCHEMA ADD field_name field_type [options] +``` + +**Example:** +```bash +FT.ALTER products SCHEMA ADD brand TAG STORED +``` + +### FT.AGGREGATE - Aggregate Search Results + +Perform aggregations on search results. 
+ +```bash +FT.AGGREGATE index_name query [GROUPBY field] [REDUCE function field AS alias] +``` + +**Example:** +```bash +# Group products by category and count +FT.AGGREGATE products "*" GROUPBY category REDUCE COUNT 0 AS count +``` + +## Field Types in Detail + +### TEXT Fields +- **Purpose**: Full-text search on natural language content +- **Features**: Tokenization, stemming, stop-word removal +- **Options**: `STORED`, `INDEXED`, `TOKENIZED` +- **Example**: Product titles, descriptions, content + +### NUMERIC Fields +- **Purpose**: Numeric data for range queries and sorting +- **Types**: I64, U64, F64 +- **Options**: `STORED`, `INDEXED`, `FAST` +- **Example**: Prices, quantities, ratings + +### TAG Fields +- **Purpose**: Exact-match categorical data +- **Features**: No tokenization, exact string matching +- **Options**: `STORED`, case sensitivity control +- **Example**: Categories, brands, status values + +### GEO Fields +- **Purpose**: Geographic coordinates +- **Format**: "latitude,longitude" (e.g., "37.7749,-122.4194") +- **Features**: Geographic distance queries +- **Options**: `STORED` + +### DATE Fields +- **Purpose**: Timestamp and date data +- **Format**: Unix timestamp in milliseconds +- **Features**: Range queries, temporal filtering +- **Options**: `STORED`, `INDEXED`, `FAST` + +## Search Query Syntax + +### Basic Queries +```bash +# Single term +FT.SEARCH products "wireless" + +# Multiple terms (AND by default) +FT.SEARCH products "wireless headphones" + +# Phrase query +FT.SEARCH products "\"noise cancellation\"" +``` + +### Field-Specific Queries +```bash +# Search in specific field +FT.SEARCH products "title:wireless" + +# Multiple field queries +FT.SEARCH products "title:wireless AND description:bluetooth" +``` + +### Boolean Operators +```bash +# AND operator +FT.SEARCH products "wireless AND headphones" + +# OR operator +FT.SEARCH products "wireless OR bluetooth" + +# NOT operator +FT.SEARCH products "headphones NOT wired" +``` + +### Wildcards and Fuzzy Search +```bash +# Wildcard search +FT.SEARCH products "head*" + +# Fuzzy search (approximate matching) +FT.SEARCH products "%headphone%" +``` + +### Range Queries +```bash +# Numeric range in query +FT.SEARCH products "@price:[100 300]" + +# Date range +FT.SEARCH products "@created_date:[1640995200000 1672531200000]" +``` + +## Filtering and Sorting + +### FILTER Clause +```bash +# Numeric filter +FT.SEARCH products "headphones" FILTER price 100 300 + +# Multiple filters +FT.SEARCH products "*" FILTER price 100 500 FILTER rating 4 5 +``` + +### LIMIT Clause +```bash +# Pagination +FT.SEARCH products "wireless" LIMIT 0 10 # First 10 results +FT.SEARCH products "wireless" LIMIT 10 10 # Next 10 results +``` + +### RETURN Clause +```bash +# Return specific fields +FT.SEARCH products "*" RETURN title price + +# Return all stored fields (default) +FT.SEARCH products "*" +``` + +## Performance Considerations + +### Indexing Strategy +- Only index fields you need to search on +- Use `FAST` option for frequently filtered numeric fields +- Consider storage vs. 
search performance trade-offs + +### Query Optimization +- Use specific field queries when possible +- Combine filters with text queries for better performance +- Use pagination with LIMIT for large result sets + +### Memory Usage +- Tantivy indexes are memory-mapped for performance +- Index size depends on document count and field configuration +- Monitor disk space for index storage + +## Integration with Redis Commands + +Search indexes work alongside regular Redis data: + +```bash +# Store product data in Redis hash +HSET product:1 title "Wireless Headphones" price "199.99" + +# Index the same data for search +FT.ADD products product:1 FIELDS title "Wireless Headphones" price 199.99 + +# Search returns document IDs that can be used with Redis commands +FT.SEARCH products "wireless" +# Returns: product:1 + +# Retrieve full data using Redis +HGETALL product:1 +``` + +## Example Use Cases + +### E-commerce Product Search +```bash +# Create product catalog index +FT.CREATE catalog SCHEMA + name TEXT STORED INDEXED TOKENIZED + description TEXT INDEXED TOKENIZED + price NUMERIC STORED INDEXED FAST + category TAG STORED + brand TAG STORED + rating NUMERIC STORED FAST + +# Add products +FT.ADD catalog prod:1 FIELDS name "iPhone 14" price 999 category "phones" brand "apple" rating 4.5 +FT.ADD catalog prod:2 FIELDS name "Samsung Galaxy" price 899 category "phones" brand "samsung" rating 4.3 + +# Search queries +FT.SEARCH catalog "iPhone" +FT.SEARCH catalog "phones" FILTER price 800 1000 +FT.SEARCH catalog "@brand:apple" +``` + +### Content Management +```bash +# Create content index +FT.CREATE content SCHEMA + title TEXT STORED INDEXED TOKENIZED + body TEXT INDEXED TOKENIZED + author TAG STORED + published DATE STORED INDEXED + tags TAG STORED + +# Search content +FT.SEARCH content "machine learning" +FT.SEARCH content "@author:john AND @tags:ai" +FT.SEARCH content "*" FILTER published 1640995200000 1672531200000 +``` + +### Geographic Search +```bash +# Create location-based index +FT.CREATE places SCHEMA + name TEXT STORED INDEXED TOKENIZED + location GEO STORED + type TAG STORED + +# Add locations +FT.ADD places place:1 FIELDS name "Golden Gate Bridge" location "37.8199,-122.4783" type "landmark" + +# Geographic queries (future feature) +FT.SEARCH places "@location:[37.7749 -122.4194 10 km]" +``` + +## Error Handling + +Common error responses: +- `ERR index not found` - Index doesn't exist +- `ERR field not found` - Field not defined in schema +- `ERR invalid query syntax` - Malformed query +- `ERR document not found` - Document ID doesn't exist + +## Best Practices + +1. **Schema Design**: Plan your schema carefully - changes require reindexing +2. **Field Selection**: Only store and index fields you actually need +3. **Batch Operations**: Add multiple documents efficiently +4. **Query Testing**: Test queries for performance with realistic data +5. **Monitoring**: Monitor index size and query performance +6. 
**Backup**: Include search indexes in backup strategies + +## Future Enhancements + +Planned features: +- Geographic distance queries +- Advanced aggregations and faceting +- Highlighting of search results +- Synonyms and custom analyzers +- Real-time suggestions and autocomplete +- Index replication and sharding \ No newline at end of file diff --git a/herodb/Cargo.toml b/herodb/Cargo.toml deleted file mode 100644 index 7e952b6..0000000 --- a/herodb/Cargo.toml +++ /dev/null @@ -1,29 +0,0 @@ -[package] -name = "herodb" -version = "0.0.1" -authors = ["Pin Fang "] -edition = "2021" - -[dependencies] -anyhow = "1.0.59" -bytes = "1.3.0" -thiserror = "1.0.32" -tokio = { version = "1.23.0", features = ["full"] } -clap = { version = "4.5.20", features = ["derive"] } -byteorder = "1.4.3" -futures = "0.3" -sled = "0.34" -redb = "2.1.3" -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0" -bincode = "1.3" -chacha20poly1305 = "0.10.1" -rand = "0.8" -sha2 = "0.10" -age = "0.10" -secrecy = "0.8" -ed25519-dalek = "2" -base64 = "0.22" - -[dev-dependencies] -redis = { version = "0.24", features = ["aio", "tokio-comp"] } diff --git a/herodb/examples/age_bash_demo.sh b/herodb/examples/age_bash_demo.sh deleted file mode 100755 index 07b54c8..0000000 --- a/herodb/examples/age_bash_demo.sh +++ /dev/null @@ -1,71 +0,0 @@ -#!/bin/bash - -# Start the herodb server in the background -echo "Starting herodb server..." -cargo run -p herodb -- --dir /tmp/herodb_age_test --port 6382 --debug --encryption-key "testkey" & -SERVER_PID=$! -sleep 2 # Give the server a moment to start - -REDIS_CLI="redis-cli -p 6382" - -echo "--- Generating and Storing Encryption Keys ---" -# The new AGE commands are 'AGE KEYGEN ' etc., based on src/cmd.rs -# This script uses older commands like 'AGE.GENERATE_KEYPAIR alice' -# The demo script needs to be updated to match the implemented commands. -# Let's assume the commands in the script are what's expected for now, -# but note this discrepancy. The new commands are AGE KEYGEN etc. -# The script here uses a different syntax not found in src/cmd.rs like 'AGE.GENERATE_KEYPAIR'. -# For now, I will modify the script to fit the actual implementation. - -echo "--- Generating and Storing Encryption Keys ---" -$REDIS_CLI AGE KEYGEN alice -$REDIS_CLI AGE KEYGEN bob - -echo "--- Encrypting and Decrypting a Message ---" -MESSAGE="Hello, AGE encryption!" -# The new logic stores keys internally and does not expose a command to get the public key. -# We will encrypt by name. -ALICE_PUBKEY_REPLY=$($REDIS_CLI AGE KEYGEN alice | head -n 2 | tail -n 1) -echo "Alice's Public Key: $ALICE_PUBKEY_REPLY" - -echo "Encrypting message: '$MESSAGE' with Alice's identity..." -# AGE.ENCRYPT recipient message. But since we use persistent keys, let's use ENCRYPTNAME -CIPHERTEXT=$($REDIS_CLI AGE ENCRYPTNAME alice "$MESSAGE") -echo "Ciphertext: $CIPHERTEXT" - -echo "Decrypting ciphertext with Alice's private key..." -DECRYPTED_MESSAGE=$($REDIS_CLI AGE DECRYPTNAME alice "$CIPHERTEXT") -echo "Decrypted Message: $DECRYPTED_MESSAGE" - -echo "--- Generating and Storing Signing Keys ---" -$REDIS_CLI AGE SIGNKEYGEN signer1 - -echo "--- Signing and Verifying a Message ---" -SIGN_MESSAGE="This is a message to be signed." -# Similar to above, we don't have GET_SIGN_PUBKEY. We will verify by name. - -echo "Signing message: '$SIGN_MESSAGE' with signer1's private key..." -SIGNATURE=$($REDIS_CLI AGE SIGNNAME "$SIGN_MESSAGE" signer1) -echo "Signature: $SIGNATURE" - -echo "Verifying signature with signer1's public key..." 
-VERIFY_RESULT=$($REDIS_CLI AGE VERIFYNAME signer1 "$SIGN_MESSAGE" "$SIGNATURE") -echo "Verification Result: $VERIFY_RESULT" - - -# There is no DELETE_KEYPAIR command in the implementation -echo "--- Cleaning up keys (manual in herodb) ---" -# We would use DEL for age:key:alice, etc. -$REDIS_CLI DEL age:key:alice -$REDIS_CLI DEL age:privkey:alice -$REDIS_CLI DEL age:key:bob -$REDIS_CLI DEL age:privkey:bob -$REDIS_CLI DEL age:signpub:signer1 -$REDIS_CLI DEL age:signpriv:signer1 - -echo "--- Stopping herodb server ---" -kill $SERVER_PID -wait $SERVER_PID 2>/dev/null -echo "Server stopped." - -echo "Bash demo complete." \ No newline at end of file diff --git a/herodb/examples/age_persist_demo.rs b/herodb/examples/age_persist_demo.rs deleted file mode 100644 index 9caf3bd..0000000 --- a/herodb/examples/age_persist_demo.rs +++ /dev/null @@ -1,83 +0,0 @@ -use std::io::{Read, Write}; -use std::net::TcpStream; - -// Minimal RESP helpers -fn arr(parts: &[&str]) -> String { - let mut out = format!("*{}\r\n", parts.len()); - for p in parts { - out.push_str(&format!("${}\r\n{}\r\n", p.len(), p)); - } - out -} -fn read_reply(s: &mut TcpStream) -> String { - let mut buf = [0u8; 65536]; - let n = s.read(&mut buf).unwrap(); - String::from_utf8_lossy(&buf[..n]).to_string() -} -fn parse_two_bulk(reply: &str) -> Option<(String,String)> { - let mut lines = reply.split("\r\n"); - if lines.next()? != "*2" { return None; } - let _n = lines.next()?; - let a = lines.next()?.to_string(); - let _m = lines.next()?; - let b = lines.next()?.to_string(); - Some((a,b)) -} -fn parse_bulk(reply: &str) -> Option { - let mut lines = reply.split("\r\n"); - let hdr = lines.next()?; - if !hdr.starts_with('$') { return None; } - Some(lines.next()?.to_string()) -} -fn parse_simple(reply: &str) -> Option { - let mut lines = reply.split("\r\n"); - let hdr = lines.next()?; - if !hdr.starts_with('+') { return None; } - Some(hdr[1..].to_string()) -} - -fn main() { - let mut args = std::env::args().skip(1); - let host = args.next().unwrap_or_else(|| "127.0.0.1".into()); - let port = args.next().unwrap_or_else(|| "6379".into()); - let addr = format!("{host}:{port}"); - println!("Connecting to {addr}..."); - let mut s = TcpStream::connect(addr).expect("connect"); - - // Generate & persist X25519 enc keys under name "alice" - s.write_all(arr(&["age","keygen","alice"]).as_bytes()).unwrap(); - let (_alice_recip, _alice_ident) = parse_two_bulk(&read_reply(&mut s)).expect("gen enc"); - - // Generate & persist Ed25519 signing key under name "signer" - s.write_all(arr(&["age","signkeygen","signer"]).as_bytes()).unwrap(); - let (_verify, _secret) = parse_two_bulk(&read_reply(&mut s)).expect("gen sign"); - - // Encrypt by name - let msg = "hello from persistent keys"; - s.write_all(arr(&["age","encryptname","alice", msg]).as_bytes()).unwrap(); - let ct_b64 = parse_bulk(&read_reply(&mut s)).expect("ct b64"); - println!("ciphertext b64: {}", ct_b64); - - // Decrypt by name - s.write_all(arr(&["age","decryptname","alice", &ct_b64]).as_bytes()).unwrap(); - let pt = parse_bulk(&read_reply(&mut s)).expect("pt"); - assert_eq!(pt, msg); - println!("decrypted ok"); - - // Sign by name - s.write_all(arr(&["age","signname","signer", msg]).as_bytes()).unwrap(); - let sig_b64 = parse_bulk(&read_reply(&mut s)).expect("sig b64"); - - // Verify by name - s.write_all(arr(&["age","verifyname","signer", msg, &sig_b64]).as_bytes()).unwrap(); - let ok = parse_simple(&read_reply(&mut s)).expect("verify"); - assert_eq!(ok, "1"); - println!("signature verified"); - - // 
List names - s.write_all(arr(&["age","list"]).as_bytes()).unwrap(); - let list = read_reply(&mut s); - println!("LIST -> {list}"); - - println!("✔ persistent AGE workflow complete."); -} \ No newline at end of file diff --git a/herodb/run_tests.sh b/run_tests.sh similarity index 100% rename from herodb/run_tests.sh rename to run_tests.sh diff --git a/herodb/specs/backgroundinfo/encrypt.md b/specs/backgroundinfo/encrypt.md similarity index 100% rename from herodb/specs/backgroundinfo/encrypt.md rename to specs/backgroundinfo/encrypt.md diff --git a/specs/backgroundinfo/lance.md b/specs/backgroundinfo/lance.md new file mode 100644 index 0000000..08bcf2d --- /dev/null +++ b/specs/backgroundinfo/lance.md @@ -0,0 +1,1251 @@ +Based on your request, here is a copy of the provided code snippets filtered to include only those relevant to the Rust ecosystem. This includes snippets written in Rust, shell commands for building or testing Rust code (e.g., using `cargo` or `maturin`), and configurations for native development tools like `lldb`. + +======================== +CODE SNIPPETS +======================== +TITLE: Perform Python development installation +DESCRIPTION: These commands navigate into the `python` directory and perform a development installation of the Lance Python bindings. This allows developers to import and test changes to the Python wrapper directly. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +cd python +maturin develop +``` + +---------------------------------------- + +TITLE: Install Python bindings build tool +DESCRIPTION: This command installs `maturin`, a tool essential for building Python packages that integrate with Rust code. It's a prerequisite for setting up the Python development environment for Lance. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +pip install maturin +``` + +---------------------------------------- + +TITLE: Install Linux Perf Tools and Configure Kernel Parameters +DESCRIPTION: Installs necessary Linux performance tools (`perf`) on Ubuntu systems and configures the `perf_event_paranoid` kernel parameter. This setup is crucial for allowing non-root users to collect performance data using tools like `perf` and `flamegraph`. + +SOURCE: https://github.com/lancedb/lance/blob/__wiki__/Debug.md#_snippet_4 + +LANGUAGE: sh +CODE: +``` +sudo apt install linux-tools-common linux-tools-generic linux-tools-`uname -r` +sudo sh -c "echo -1 > /proc/sys/kernel/perf_event_paranoid" +``` + +---------------------------------------- + +TITLE: Run Rust unit tests +DESCRIPTION: This command executes the unit tests for the Rust core format. Running these tests verifies the correctness of the Rust implementation. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_6 + +LANGUAGE: bash +CODE: +``` +cargo test +``` + +---------------------------------------- + +TITLE: Profile a LanceDB benchmark using flamegraph +DESCRIPTION: Generates a flamegraph for a specific benchmark using `cargo-flamegraph`, aiding in performance analysis. It's recommended to run benchmarks once beforehand to avoid setup time being captured in the profile. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_14 + +LANGUAGE: shell +CODE: +``` +flamegraph -F 100 --no-inline -- $(which python) \ + -m pytest python/benchmarks \ + --benchmark-min-time=2 \ + -k test_ivf_pq_index_search +``` + +---------------------------------------- + +TITLE: Install Flamegraph Tool +DESCRIPTION: Installs the `flamegraph` profiling tool using Cargo, Rust's package manager. This tool is essential for visualizing CPU usage and call stacks as flame graphs for performance analysis. + +SOURCE: https://github.com/lancedb/lance/blob/__wiki__/Debug.md#_snippet_3 + +LANGUAGE: sh +CODE: +``` +cargo install flamegraph +``` + +---------------------------------------- + +TITLE: Install Lance Build Dependencies on Ubuntu +DESCRIPTION: This command installs necessary system-level dependencies for building Lance on Ubuntu 22.04, including protobuf, SSL development libraries, and general build tools. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/How-to-Build.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +sudo apt install protobuf-compiler libssl-dev build-essential pkg-config gfortran +``` + +---------------------------------------- + +TITLE: Build Rust core format (release) +DESCRIPTION: This command compiles the Rust core format in release mode. The release build is optimized for performance and is suitable for production deployments or benchmarking. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_5 + +LANGUAGE: bash +CODE: +``` +cargo build -r +``` + +---------------------------------------- + +TITLE: Debug Python Script with LLDB +DESCRIPTION: Demonstrates how to start an LLDB debugging session for a Python script. It involves launching LLDB with the Python interpreter from a virtual environment and then running the target script within the LLDB prompt. + +SOURCE: https://github.com/lancedb/lance/blob/__wiki__/Debug.md#_snippet_2 + +LANGUAGE: sh +CODE: +``` +$ lldb ./venv/bin/python +(lldb) r script.py +``` + +---------------------------------------- + +TITLE: Install Lance Build Dependencies on Mac +DESCRIPTION: This command installs the protobuf compiler using Homebrew, a required dependency for building Lance on macOS. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/How-to-Build.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +brew install protobuf +``` + +---------------------------------------- + +TITLE: Configure LLDB Initialization Settings +DESCRIPTION: Sets up basic LLDB initialization settings in the `~/.lldbinit` file. This includes configuring the number of source code lines to display before and after a stop, and enabling the loading of `.lldbinit` files from the current working directory. + +SOURCE: https://github.com/lancedb/lance/blob/__wiki__/Debug.md#_snippet_0 + +LANGUAGE: lldb +CODE: +``` +# ~/.lldbinit +settings set stop-line-count-before 15 +settings set stop-line-count-after 15 +settings set target.load-cwd-lldbinit true +``` + +---------------------------------------- + +TITLE: Complete Lance Dataset Write and Read Example in Rust +DESCRIPTION: This Rust `main` function provides a complete example demonstrating the usage of `write_dataset` and `read_dataset` functions. It sets up the necessary `arrow` and `lance` imports, defines a temporary data path, and orchestrates the writing and subsequent reading of a Lance dataset. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/write_read_dataset.md#_snippet_2 + +LANGUAGE: Rust +CODE: +``` +use arrow::array::UInt32Array; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::record_batch::{RecordBatch, RecordBatchIterator}; +use futures::StreamExt; +use lance::dataset::{WriteMode, WriteParams}; +use lance::Dataset; +use std::sync::Arc; + +#[tokio::main] +async fn main() { + let data_path: &str = "./temp_data.lance"; + + write_dataset(data_path).await; + read_dataset(data_path).await; +} +``` + +---------------------------------------- + +TITLE: Rust: Main Workflow for WikiText to LanceDB Ingestion +DESCRIPTION: This comprehensive example demonstrates the full data ingestion pipeline in Rust. It initializes a Tokio runtime, loads a tokenizer, sets up the Hugging Face API to download WikiText Parquet files, processes them into a `WikiTextBatchReader`, and finally writes the data to a Lance dataset. It also includes verification of the created dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/llm_dataset_creation.md#_snippet_2 + +LANGUAGE: Rust +CODE: +``` +fn main() -> Result<(), Box> { + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + // Load tokenizer + let tokenizer = load_tokenizer("gpt2")?; + + // Set up Hugging Face API + // Download from https://huggingface.co/datasets/Salesforce/wikitext/tree/main/wikitext-103-raw-v1 + let api = Api::new()?; + let repo = api.repo(Repo::with_revision( + "Salesforce/wikitext".into(), + RepoType::Dataset, + "main".into(), + )); + + // Define the parquet files we want to download + let train_files = vec![ + "wikitext-103-raw-v1/train-00000-of-00002.parquet", + "wikitext-103-raw-v1/train-00001-of-00002.parquet", + ]; + + let mut parquet_readers = Vec::new(); + for file in &train_files { + println!("Downloading file: {}", file); + let file_path = repo.get(file)?; + let data = std::fs::read(file_path)?; + + // Create a temporary file in the system temp directory and write the downloaded data to it + let mut temp_file = NamedTempFile::new()?; + temp_file.write_all(&data)?; + + // Create the parquet reader builder with a larger batch size + let builder = ParquetRecordBatchReaderBuilder::try_new(temp_file.into_file())? 
+ .with_batch_size(8192); // Increase batch size for better performance + parquet_readers.push(builder); + } + + if parquet_readers.is_empty() { + println!("No parquet files found to process."); + return Ok(()); + } + + // Create batch reader + let num_samples: u64 = 500_000; + let batch_reader = WikiTextBatchReader::new(parquet_readers, tokenizer, Some(num_samples))?; + + // Save as Lance dataset + println!("Writing to Lance dataset..."); + let lance_dataset_path = "rust_wikitext_lance_dataset.lance"; + + let write_params = WriteParams::default(); + lance::Dataset::write(batch_reader, lance_dataset_path, Some(write_params)).await?; + + // Verify the dataset + let ds = lance::Dataset::open(lance_dataset_path).await?; + let scanner = ds.scan(); + let mut stream = scanner.try_into_stream().await?; + + let mut total_rows = 0; + while let Some(batch_result) = stream.next().await { + let batch = batch_result?; + total_rows += batch.num_rows(); + } + + println!( + "Lance dataset created successfully with {} rows", + total_rows + ); + println!("Dataset location: {}", lance_dataset_path); + + Ok(()) + }) +} +``` + +---------------------------------------- + +TITLE: Build and Test Pylance Python Package +DESCRIPTION: These commands set up a Python virtual environment, install maturin for Rust-Python binding, build the Pylance package in debug mode, and then run its associated tests. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/How-to-Build.md#_snippet_3 + +LANGUAGE: bash +CODE: +``` +cd python +python3 -m venv venv +source venv/bin/activate + +pip install maturin + +# Build debug build +maturin develop --extras tests + +# Run pytest +pytest python/tests/ +``` + +---------------------------------------- + +TITLE: Install Lance using Cargo +DESCRIPTION: Installs the Lance Rust library as a command-line tool using the Cargo package manager. + +SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_0 + +LANGUAGE: shell +CODE: +``` +cargo install lance +``` + +---------------------------------------- + +TITLE: Build pylance in release mode for benchmarks +DESCRIPTION: Builds the `pylance` module in release mode with debug symbols, enabling benchmark execution and profiling. It includes benchmark-specific extras and features for data generation. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_10 + +LANGUAGE: shell +CODE: +``` +maturin develop --profile release-with-debug --extras benchmarks --features datagen +``` + +---------------------------------------- + +TITLE: Query Lance Dataset with Simple SQL in Rust DataFusion +DESCRIPTION: This Rust example demonstrates how to register a Lance dataset as a table in DataFusion using `LanceTableProvider` and execute a simple SQL `SELECT` query to retrieve the first 10 rows. It shows the basic setup for integrating Lance with DataFusion. 
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/datafusion.md#_snippet_0
+
+LANGUAGE: rust
+CODE:
+```
+use datafusion::prelude::SessionContext;
+use crate::datafusion::LanceTableProvider;
+
+let ctx = SessionContext::new();
+
+ctx.register_table("dataset",
+    Arc::new(LanceTableProvider::new(
+        Arc::new(dataset.clone()),
+        /* with_row_id */ false,
+        /* with_row_addr */ false,
+    )))?;
+
+let df = ctx.sql("SELECT * FROM dataset LIMIT 10").await?;
+let result = df.collect().await?;
+```
+
+----------------------------------------
+
+TITLE: Run LanceDB code formatters
+DESCRIPTION: Applies code formatting rules to the entire project. Specific commands like `make format-python` or `cargo fmt` can be used for language-specific formatting.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_4
+
+LANGUAGE: shell
+CODE:
+```
+make format
+```
+
+----------------------------------------
+
+TITLE: Build and Search HNSW Index for Vector Similarity in Rust
+DESCRIPTION: This Rust code provides a complete example for vector similarity search. It defines a `ground_truth` function for L2 distance calculation, `create_test_vector_dataset` to generate synthetic fixed-size list vectors, and a `main` function that orchestrates the process. The `main` function generates or loads a dataset, builds an HNSW index using `lance_index::vector::hnsw`, and then performs vector searches, measuring construction and search times, and calculating recall against ground truth.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/hnsw.md#_snippet_0
+
+LANGUAGE: Rust
+CODE:
+```
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use arrow::array::{types::Float32Type, Array, FixedSizeListArray};
+use arrow::array::{AsArray, FixedSizeListBuilder, Float32Builder};
+use arrow::datatypes::{DataType, Field, Schema};
+use arrow::record_batch::RecordBatch;
+use arrow::record_batch::RecordBatchIterator;
+use arrow_select::concat::concat;
+use futures::stream::StreamExt;
+use lance::Dataset;
+use lance_index::vector::v3::subindex::IvfSubIndex;
+use lance_index::vector::{
+    flat::storage::FlatFloatStorage,
+    hnsw::{builder::HnswBuildParams, HNSW},
+};
+use lance_linalg::distance::DistanceType;
+
+fn ground_truth(fsl: &FixedSizeListArray, query: &[f32], k: usize) -> HashSet<u32> {
+    let mut dists = vec![];
+    for i in 0..fsl.len() {
+        let dist = lance_linalg::distance::l2_distance(
+            query,
+            fsl.value(i).as_primitive::<Float32Type>().values(),
+        );
+        dists.push((dist, i as u32));
+    }
+    dists.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
+    dists.truncate(k);
+    dists.into_iter().map(|(_, i)| i).collect()
+}
+
+pub async fn create_test_vector_dataset(output: &str, num_rows: usize, dim: i32) {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "vector",
+        DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), dim),
+        false,
+    )]));
+
+    let mut batches = Vec::new();
+
+    // Create a few batches
+    for _ in 0..2 {
+        let v_builder = Float32Builder::new();
+        let mut list_builder = FixedSizeListBuilder::new(v_builder, dim);
+
+        for _ in 0..num_rows {
+            for _ in 0..dim {
+                list_builder.values().append_value(rand::random::<f32>());
+            }
+            list_builder.append(true);
+        }
+        let array = Arc::new(list_builder.finish());
+        let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap();
+        batches.push(batch);
+    }
+    let batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
+    println!("Writing dataset to {}", output);
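+    // Write the generated batches out as a new Lance dataset; passing `None` uses the default WriteParams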
+    Dataset::write(batch_reader, output, None).await.unwrap();
+}
+
+#[tokio::main]
+async fn main() {
+    let uri: Option<String> = None; // None means generate test data
+    let column = "vector";
+    let ef = 100;
+    let max_edges = 30;
+    let max_level = 7;
+
+    // 1. Generate a synthetic test data of specified dimensions
+    let dataset = if uri.is_none() {
+        println!("No uri is provided, generating test dataset...");
+        let output = "test_vectors.lance";
+        create_test_vector_dataset(output, 1000, 64).await;
+        Dataset::open(output).await.expect("Failed to open dataset")
+    } else {
+        Dataset::open(uri.as_ref().unwrap())
+            .await
+            .expect("Failed to open dataset")
+    };
+
+    println!("Dataset schema: {:#?}", dataset.schema());
+    let batches = dataset
+        .scan()
+        .project(&[column])
+        .unwrap()
+        .try_into_stream()
+        .await
+        .unwrap()
+        .then(|batch| async move { batch.unwrap().column_by_name(column).unwrap().clone() })
+        .collect::<Vec<_>>()
+        .await;
+    let arrs = batches.iter().map(|b| b.as_ref()).collect::<Vec<_>>();
+    let fsl = concat(&arrs).unwrap().as_fixed_size_list().clone();
+    println!("Loaded {:?} batches", fsl.len());
+
+    let vector_store = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2));
+
+    let q = fsl.value(0);
+    let k = 10;
+    let gt = ground_truth(&fsl, q.as_primitive::<Float32Type>().values(), k);
+
+    for ef_construction in [15, 30, 50] {
+        let now = std::time::Instant::now();
+        // 2. Build a hierarchical graph structure for efficient vector search using Lance API
+        let hnsw = HNSW::index_vectors(
+            vector_store.as_ref(),
+            HnswBuildParams::default()
+                .max_level(max_level)
+                .num_edges(max_edges)
+                .ef_construction(ef_construction),
+        )
+        .unwrap();
+        let construct_time = now.elapsed().as_secs_f32();
+        let now = std::time::Instant::now();
+        // 3. Perform vector search with different parameters and compute the ground truth using L2 distance search
+        let results: HashSet<u32> = hnsw
+            .search_basic(q.clone(), k, ef, None, vector_store.as_ref())
+            .unwrap()
+            .iter()
+            .map(|node| node.id)
+            .collect();
+        let search_time = now.elapsed().as_micros();
+        println!(
+            "level={}, ef_construct={}, ef={} recall={}: construct={:.3}s search={:.3} us",
+            max_level,
+            ef_construction,
+            ef,
+            results.intersection(&gt).count() as f32 / k as f32,
+            construct_time,
+            search_time
+        );
+    }
+}
+```
+
+----------------------------------------
+
+TITLE: Compare LanceDB benchmarks against previous version
+DESCRIPTION: Provides a sequence of commands to compare the performance of the current version against the `main` branch. This involves saving a baseline from `main` and then comparing the current branch's performance against it.
+ +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_15 + +LANGUAGE: shell +CODE: +``` +CURRENT_BRANCH=$(git branch --show-current) +``` + +LANGUAGE: shell +CODE: +``` +git checkout main +``` + +LANGUAGE: shell +CODE: +``` +maturin develop --profile release-with-debug --features datagen +``` + +LANGUAGE: shell +CODE: +``` +pytest --benchmark-save=baseline python/benchmarks -m "not slow" +``` + +LANGUAGE: shell +CODE: +``` +COMPARE_ID=$(ls .benchmarks/*/ | tail -1 | cut -c1-4) +``` + +LANGUAGE: shell +CODE: +``` +git checkout $CURRENT_BRANCH +``` + +LANGUAGE: shell +CODE: +``` +maturin develop --profile release-with-debug --features datagen +``` + +LANGUAGE: shell +CODE: +``` +pytest --benchmark-compare=$COMPARE_ID python/benchmarks -m "not slow" +``` + +---------------------------------------- + +TITLE: Build Rust core format (debug) +DESCRIPTION: This command compiles the Rust core format in debug mode. The debug build includes debugging information and is suitable for development and testing, though it is not optimized for performance. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_4 + +LANGUAGE: bash +CODE: +``` +cargo build +``` + +---------------------------------------- + +TITLE: Format and lint Rust code +DESCRIPTION: These commands are used to automatically format Rust code according to community standards (`cargo fmt`) and to perform static analysis for potential issues (`cargo clippy`). This ensures code quality and consistency. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_3 + +LANGUAGE: bash +CODE: +``` +cargo fmt --all +cargo clippy --all-features --tests --benches +``` + +---------------------------------------- + +TITLE: Run LanceDB code linters +DESCRIPTION: Executes code linters to check for style violations and potential issues. Language-specific linting can be performed with `make lint-python` or `make lint-rust`. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_5 + +LANGUAGE: shell +CODE: +``` +make lint +``` + +---------------------------------------- + +TITLE: Clean LanceDB build artifacts +DESCRIPTION: Removes all generated build artifacts and temporary files from the project directory, useful for a clean rebuild. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_9 + +LANGUAGE: shell +CODE: +``` +make clean +``` + +---------------------------------------- + +TITLE: Rust: Load Tokenizer from Hugging Face Hub +DESCRIPTION: This function provides a utility to load a tokenizer from the Hugging Face Hub. It takes a model name, creates an API client, retrieves the tokenizer file from the specified repository, and constructs a `Tokenizer` object from it. This is a common pattern for integrating Hugging Face models. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/llm_dataset_creation.md#_snippet_3 + +LANGUAGE: Rust +CODE: +``` +fn load_tokenizer(model_name: &str) -> Result> { + let api = Api::new()?; + let repo = api.repo(Repo::with_revision( + model_name.into(), + RepoType::Model, + "main".into(), + )); + + let tokenizer_path = repo.get("tokenizer.json")?; + let tokenizer = Tokenizer::from_file(tokenizer_path)?; + + Ok(tokenizer) +} +``` + +---------------------------------------- + +TITLE: Build MacOS x86_64 Wheels +DESCRIPTION: This command builds release-mode wheels specifically for x86_64 MacOS. 
It uses `maturin` to compile the project for the `x86_64-apple-darwin` target, storing the resulting wheels in the 'wheels' directory. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_26 + +LANGUAGE: Shell +CODE: +``` +maturin build --release \ + --target x86_64-apple-darwin \ + --out wheels +``` + +---------------------------------------- + +TITLE: Build and Test Lance Rust Package +DESCRIPTION: These commands clone the Lance repository, navigate to the Rust directory, and then build, test, and benchmark the core Rust components of Lance. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/How-to-Build.md#_snippet_2 + +LANGUAGE: bash +CODE: +``` +git checkout https://github.com/lancedb/lance.git + +# Build rust package +cd rust +cargo build + +# Run test +cargo test + +# Run benchmarks +cargo bench +``` + +---------------------------------------- + +TITLE: Build LanceDB in development mode +DESCRIPTION: Builds the Rust native module in place using `maturin`. This command needs to be re-run whenever Rust code changes, but is not required for Python code modifications. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_0 + +LANGUAGE: shell +CODE: +``` +maturin develop +``` + +---------------------------------------- + +TITLE: Download Lindera Language Model +DESCRIPTION: Command-line instruction to download a specific Lindera language model (e.g., ipadic, ko-dic, unidic) for LanceDB. Note that `lindera-cli` must be installed beforehand as Lindera models require compilation. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_4 + +LANGUAGE: bash +CODE: +``` +python -m lance.download lindera -l [ipadic|ko-dic|unidic] +``` + +---------------------------------------- + +TITLE: Decorate Rust Unit Test for Tracing +DESCRIPTION: To enable tracing for a Rust unit test, decorate it with the `#[lance_test_macros::test]` attribute. This macro wraps any existing test attributes, allowing tracing information to be collected during test execution. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_16 + +LANGUAGE: Rust +CODE: +``` +#[lance_test_macros::test(tokio::test)] +async fn test() { + ... +} +``` + +---------------------------------------- + +TITLE: Add Rust Toolchain Targets for Cross-Compilation +DESCRIPTION: To build manylinux wheels for different Linux architectures, you must first add the corresponding Rust toolchain targets. These commands add the x86_64 and aarch64 GNU targets, enabling cross-compilation. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_22 + +LANGUAGE: Shell +CODE: +``` +rustup target add x86_64-unknown-linux-gnu +rustup target add aarch64-unknown-linux-gnu +``` + +---------------------------------------- + +TITLE: Build MacOS ARM64 Wheels +DESCRIPTION: This command builds release-mode wheels specifically for ARM64 (aarch64) MacOS. It uses `maturin` to compile the project for the `aarch64-apple-darwin` target, storing the resulting wheels in the 'wheels' directory. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_25 + +LANGUAGE: Shell +CODE: +``` +maturin build --release \ + --target aarch64-apple-darwin \ + --out wheels +``` + +---------------------------------------- + +TITLE: Rust: WikiTextBatchReader Next Batch Logic +DESCRIPTION: This snippet shows the core logic for the `next` method of the `WikiTextBatchReader`. 
It attempts to build and retrieve the next Parquet reader from a list of available readers. If a reader is successfully built, it's used; otherwise, it handles errors or indicates that no more readers are available.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/llm_dataset_creation.md#_snippet_1
+
+LANGUAGE: Rust
+CODE:
+```
+            if let Some(builder) = self.parquet_readers[self.current_reader_idx].take() {
+                match builder.build() {
+                    Ok(reader) => {
+                        self.current_reader = Some(Box::new(reader));
+                        self.current_reader_idx += 1;
+                        continue;
+                    }
+                    Err(e) => {
+                        return Some(Err(arrow::error::ArrowError::ExternalError(Box::new(e))))
+                    }
+                }
+            }
+        }
+
+        // No more readers available
+        return None;
+    }
+```
+
+----------------------------------------
+
+TITLE: Run Rust Unit Test with Tracing Verbosity
+DESCRIPTION: Execute a Rust unit test with tracing enabled by setting the `LANCE_TESTING` environment variable to a desired verbosity level (e.g., 'debug', 'info'). This command will generate a JSON trace file in your working directory, which can be viewed in Chrome or Perfetto.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_17
+
+LANGUAGE: Bash
+CODE:
+```
+LANCE_TESTING=debug cargo test dataset::tests::test_create_dataset
+```
+
+----------------------------------------
+
+TITLE: Build Linux x86_64 Manylinux Wheels
+DESCRIPTION: This command builds release-mode manylinux wheels for x86_64 Linux. It utilizes `maturin` with `zig` for cross-compilation, targeting `manylinux2014` compatibility, and outputs the generated wheels to the 'wheels' directory.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_23
+
+LANGUAGE: Shell
+CODE:
+```
+maturin build --release --zig \
+    --target x86_64-unknown-linux-gnu \
+    --compatibility manylinux2014 \
+    --out wheels
+```
+
+----------------------------------------
+
+TITLE: Build Linux ARM64 Manylinux Wheels
+DESCRIPTION: This command builds release-mode manylinux wheels for ARM64 (aarch64) Linux. It uses `maturin` with `zig` for cross-compilation, targeting `manylinux2014` compatibility, and places the output wheels in the 'wheels' directory.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_24
+
+LANGUAGE: Shell
+CODE:
+```
+maturin build --release --zig \
+    --target aarch64-unknown-linux-gnu \
+    --compatibility manylinux2014 \
+    --out wheels
+```
+
+----------------------------------------
+
+TITLE: Join Multiple Lance Datasets with SQL in Rust DataFusion
+DESCRIPTION: This Rust example illustrates how to register multiple Lance datasets (e.g., 'orders' and 'customers') as separate tables in DataFusion. It then performs a SQL `JOIN` operation between these tables to combine data based on a common key, demonstrating more complex query capabilities.
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/datafusion.md#_snippet_1 + +LANGUAGE: rust +CODE: +``` +use datafusion::prelude::SessionContext; +use crate::datafusion::LanceTableProvider; + +let ctx = SessionContext::new(); + +ctx.register_table("orders", + Arc::new(LanceTableProvider::new( + Arc::new(orders_dataset.clone()), + /* with_row_id */ false, + /* with_row_addr */ false, + )))?; + +ctx.register_table("customers", + Arc::new(LanceTableProvider::new( + Arc::new(customers_dataset.clone()), + /* with_row_id */ false, + /* with_row_addr */ false, + )))?; + +let df = ctx.sql(" + SELECT o.order_id, o.amount, c.customer_name + FROM orders o + JOIN customers c ON o.customer_id = c.customer_id + LIMIT 10 +").await?; + +let result = df.collect().await?; +``` + +---------------------------------------- + +TITLE: Generate Flame Graph from Process ID +DESCRIPTION: Generates a flame graph for a running process using its Process ID (PID). This command is used to capture and visualize CPU profiles, helping to identify performance bottlenecks in an application. + +SOURCE: https://github.com/lancedb/lance/blob/__wiki__/Debug.md#_snippet_5 + +LANGUAGE: sh +CODE: +``` +flamegraph -p +``` + +---------------------------------------- + +TITLE: Clone LanceDB GitHub Repository +DESCRIPTION: Instructions to clone the LanceDB project repository from GitHub to a local machine. This is the first step for setting up the development environment. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_11 + +LANGUAGE: shell +CODE: +``` +git clone https://github.com/lancedb/lance.git +``` + +---------------------------------------- + +TITLE: Rust Implementation of WikiTextBatchReader +DESCRIPTION: This Rust code defines `WikiTextBatchReader`, a custom implementation of `arrow::record_batch::RecordBatchReader`. It's designed to read text data from Parquet files, tokenize it using a `Tokenizer` from the `tokenizers` crate, and transform it into Arrow `RecordBatch`es. The `process_batch` method handles tokenization, limits the number of samples, and shuffles the tokenized IDs before creating the final `RecordBatch`. 
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/llm_dataset_creation.md#_snippet_0
+
+LANGUAGE: rust
+CODE:
+```
+use arrow::array::{Array, Int64Builder, ListBuilder, UInt32Array};
+use arrow::datatypes::{DataType, Field, Schema};
+use arrow::record_batch::RecordBatch;
+use arrow::record_batch::RecordBatchReader;
+use futures::StreamExt;
+use hf_hub::{api::sync::Api, Repo, RepoType};
+use lance::dataset::WriteParams;
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use rand::seq::SliceRandom;
+use rand::SeedableRng;
+use std::error::Error;
+use std::fs::File;
+use std::io::Write;
+use std::sync::Arc;
+use tempfile::NamedTempFile;
+use tokenizers::Tokenizer;
+
+// Implement a custom stream batch reader
+struct WikiTextBatchReader {
+    schema: Arc<Schema>,
+    parquet_readers: Vec<Option<ParquetRecordBatchReaderBuilder<File>>>,
+    current_reader_idx: usize,
+    current_reader: Option<Box<dyn RecordBatchReader>>,
+    tokenizer: Tokenizer,
+    num_samples: u64,
+    cur_samples_cnt: u64,
+}
+
+impl WikiTextBatchReader {
+    fn new(
+        parquet_readers: Vec<ParquetRecordBatchReaderBuilder<File>>,
+        tokenizer: Tokenizer,
+        num_samples: Option<u64>,
+    ) -> Result<Self, Box<dyn Error>> {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "input_ids",
+            DataType::List(Arc::new(Field::new("item", DataType::Int64, true))),
+            false,
+        )]));
+
+        Ok(Self {
+            schema,
+            parquet_readers: parquet_readers.into_iter().map(Some).collect(),
+            current_reader_idx: 0,
+            current_reader: None,
+            tokenizer,
+            num_samples: num_samples.unwrap_or(100_000),
+            cur_samples_cnt: 0,
+        })
+    }
+
+    fn process_batch(
+        &mut self,
+        input_batch: &RecordBatch,
+    ) -> Result<RecordBatch, arrow::error::ArrowError> {
+        let num_rows = input_batch.num_rows();
+        let mut token_builder = ListBuilder::new(Int64Builder::with_capacity(num_rows * 1024)); // Pre-allocate space
+        let mut should_break = false;
+
+        let column = input_batch.column_by_name("text").unwrap();
+        let string_array = column
+            .as_any()
+            .downcast_ref::<arrow::array::StringArray>()
+            .unwrap();
+        for i in 0..num_rows {
+            if self.cur_samples_cnt >= self.num_samples {
+                should_break = true;
+                break;
+            }
+            if !Array::is_null(string_array, i) {
+                let text = string_array.value(i);
+                // Split paragraph into lines
+                for line in text.split('\n') {
+                    if let Ok(encoding) = self.tokenizer.encode(line, true) {
+                        let tb_values = token_builder.values();
+                        for &id in encoding.get_ids() {
+                            tb_values.append_value(id as i64);
+                        }
+                        token_builder.append(true);
+                        self.cur_samples_cnt += 1;
+                        if self.cur_samples_cnt % 5000 == 0 {
+                            println!("Processed {} rows", self.cur_samples_cnt);
+                        }
+                        if self.cur_samples_cnt >= self.num_samples {
+                            should_break = true;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        // Create array and shuffle it
+        let input_ids_array = token_builder.finish();
+
+        // Create shuffled array by randomly sampling indices
+        let mut rng = rand::rngs::StdRng::seed_from_u64(1337);
+        let len = input_ids_array.len();
+        let mut indices: Vec<u32> = (0..len as u32).collect();
+        indices.shuffle(&mut rng);
+
+        // Take values in shuffled order
+        let indices_array = UInt32Array::from(indices);
+        let shuffled = arrow::compute::take(&input_ids_array, &indices_array, None)?;
+
+        let batch = RecordBatch::try_new(self.schema.clone(), vec![Arc::new(shuffled)]);
+        if should_break {
+            println!("Stop at {} rows", self.cur_samples_cnt);
+            self.parquet_readers.clear();
+            self.current_reader = None;
+        }
+
+        batch
+    }
+}
+
+impl RecordBatchReader for WikiTextBatchReader {
+    fn schema(&self) -> Arc<Schema> {
+        self.schema.clone()
+    }
+}
+
+impl Iterator for WikiTextBatchReader {
+    type Item = Result<RecordBatch, arrow::error::ArrowError>;
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            // If we have a current reader, try to get next batch
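+            // Note: process_batch() clears parquet_readers and current_reader once num_samples is reached, which terminates this loop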
+ if let Some(reader) = &mut self.current_reader { + if let Some(batch_result) = reader.next() { + return Some(batch_result.and_then(|batch| self.process_batch(&batch))); + } + } + + // If no current reader or current reader is exhausted, try to get next reader + if self.current_reader_idx < self.parquet_readers.len() { +``` + +---------------------------------------- + +TITLE: Set DYLD_LIBRARY_PATH for Lance Python Debugging in LLDB +DESCRIPTION: Configures the `DYLD_LIBRARY_PATH` environment variable specifically for debugging Lance Python projects within LLDB. This ensures that the dynamic linker can find necessary shared libraries located in the third-party distribution directory. + +SOURCE: https://github.com/lancedb/lance/blob/__wiki__/Debug.md#_snippet_1 + +LANGUAGE: lldb +CODE: +``` +# /path/to/lance/python/.lldbinit +env DYLD_LIBRARY_PATH=/path/to/thirdparty/dist/lib:${DYLD_LIBRARY_PATH} +``` + +---------------------------------------- + +TITLE: Download and extract MeCab Ipadic model +DESCRIPTION: This snippet downloads the gzipped tarball of the MeCab Ipadic model from GitHub and then extracts its contents using tar. This is the first step in preparing the dictionary for building. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/python/tests/models/lindera/README.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +curl -L -o mecab-ipadic-2.7.0-20070801.tar.gz "https://github.com/lindera-morphology/mecab-ipadic/archive/refs/tags/2.7.0-20070801.tar.gz" +tar xvf mecab-ipadic-2.7.0-20070801.tar.gz +``` + +---------------------------------------- + +TITLE: Build user dictionary with Lindera +DESCRIPTION: This command demonstrates how to build a custom user dictionary using 'lindera build'. It takes a CSV file as input and creates a new user dictionary, which can be used to extend the base language model. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/python/tests/models/lindera/README.md#_snippet_2 + +LANGUAGE: bash +CODE: +``` +lindera build --build-user-dictionary --dictionary-kind=ipadic user_dict/userdict.csv user_dict2 +``` + +---------------------------------------- + +TITLE: Download Jieba Language Model +DESCRIPTION: Command-line instruction to download the Jieba language model for use with LanceDB. The model will be automatically stored in the default Jieba model directory within the configured language model home. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +python -m lance.download jieba +``` + +---------------------------------------- + +TITLE: Read and Inspect Lance Dataset in Rust +DESCRIPTION: This Rust function `read_dataset` shows how to open an existing Lance dataset from a given path. It uses a `scanner` to create a `batch_stream` and then iterates through each `RecordBatch`, printing its number of rows, columns, schema, and the entire batch content. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/write_read_dataset.md#_snippet_1 + +LANGUAGE: Rust +CODE: +``` +// Reads dataset from the given path and prints batch size, schema for all record batches. 
Also extracts and prints a slice from the first batch +async fn read_dataset(data_path: &str) { + let dataset = Dataset::open(data_path).await.unwrap(); + let scanner = dataset.scan(); + + let mut batch_stream = scanner.try_into_stream().await.unwrap().map(|b| b.unwrap()); + + while let Some(batch) = batch_stream.next().await { + println!("Batch size: {}, {}", batch.num_rows(), batch.num_columns()); // print size of batch + println!("Schema: {:?}", batch.schema()); // print schema of recordbatch + + println!("Batch: {:?}", batch); // print the entire recordbatch (schema and data) + } +} // End read dataset +``` + +---------------------------------------- + +TITLE: Create a Lance Dataset from Arrow RecordBatches in Rust +DESCRIPTION: Demonstrates how to write a collection of Arrow RecordBatches and an Arrow Schema into a new Lance Dataset. It uses default write parameters and an iterator for the batches. + +SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_1 + +LANGUAGE: rust +CODE: +``` +use lance::{dataset::WriteParams, Dataset}; + +let write_params = WriteParams::default(); +let mut reader = RecordBatchIterator::new( + batches.into_iter().map(Ok), + schema +); +Dataset::write(reader, &uri, Some(write_params)).await.unwrap(); +``` + +---------------------------------------- + +TITLE: Build Ipadic language model with Lindera +DESCRIPTION: This command uses the 'lindera build' tool to compile the Ipadic dictionary. It specifies the dictionary kind as 'ipadic' and points to the extracted model directory to create the main dictionary. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/python/tests/models/lindera/README.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +lindera build --dictionary-kind=ipadic mecab-ipadic-2.7.0-20070801 main +``` + +---------------------------------------- + +TITLE: Write Lance Dataset in Rust +DESCRIPTION: This Rust function `write_dataset` demonstrates how to create and write a Lance dataset to a specified path. It defines a schema with `UInt32` fields, creates a `RecordBatch` with sample data, and uses `WriteParams` to set the write mode to `Overwrite` before writing the dataset to disk. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/write_read_dataset.md#_snippet_0 + +LANGUAGE: Rust +CODE: +``` +// Writes sample dataset to the given path +async fn write_dataset(data_path: &str) { + // Define new schema + let schema = Arc::new(Schema::new(vec![ + Field::new("key", DataType::UInt32, false), + Field::new("value", DataType::UInt32, false), + ])); + + // Create new record batches + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 5, 6])), + Arc::new(UInt32Array::from(vec![6, 7, 8, 9, 10, 11])), + ], + ) + .unwrap(); + + let batches = RecordBatchIterator::new([Ok(batch)], schema.clone()); + + // Define write parameters (e.g. overwrite dataset) + let write_params = WriteParams { + mode: WriteMode::Overwrite, + ..Default::default() + }; + + Dataset::write(batches, data_path, Some(write_params)) + .await + .unwrap(); +} // End write dataset +``` + +---------------------------------------- + +TITLE: Build LanceDB Rust JNI Module +DESCRIPTION: Specifies the command to build only the Rust-based JNI (Java Native Interface) module of LanceDB. This is useful for developers focusing on the native components without rebuilding the entire Java project. 
+
+SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_10
+
+LANGUAGE: shell
+CODE:
+```
+cargo build
+```
+
+----------------------------------------
+
+TITLE: Read a Lance Dataset and Collect RecordBatches in Rust
+DESCRIPTION: Opens an existing Lance Dataset from a specified path, scans its content, and collects all resulting RecordBatches into a vector. Error handling is included.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_2
+
+LANGUAGE: rust
+CODE:
+```
+let dataset = Dataset::open(path).await.unwrap();
+let mut scanner = dataset.scan();
+let batches: Vec<RecordBatch> = scanner
+    .try_into_stream()
+    .await
+    .unwrap()
+    .map(|b| b.unwrap())
+    .collect::<Vec<_>>()
+    .await;
+```
+
+----------------------------------------
+
+TITLE: Create a Vector Index on a Lance Dataset in Rust
+DESCRIPTION: Demonstrates how to create a vector index on a specified column (e.g., 'embeddings') within a Lance Dataset. It configures vector index parameters like the number of partitions and sub-vectors, noting potential alignment requirements.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_4
+
+LANGUAGE: rust
+CODE:
+```
+use ::lance::index::vector::VectorIndexParams;
+
+let mut params = VectorIndexParams::default();
+params.num_partitions = 256;
+params.num_sub_vectors = 16;
+
+// this will Err if list_size(embeddings) / num_sub_vectors does not meet simd alignment
+dataset.create_index(&["embeddings"], IndexType::Vector, None, &params, true).await;
+```
+
+----------------------------------------
+
+TITLE: Retrieve Specific Records from a Lance Dataset in Rust
+DESCRIPTION: Retrieves specific records from a Lance Dataset based on their indices and a projection. The result is a RecordBatch containing the requested data.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_3
+
+LANGUAGE: rust
+CODE:
+```
+let values: Result<RecordBatch> = dataset.take(&[200, 199, 39, 40, 100], &projection).await;
+```
\ No newline at end of file
diff --git a/specs/backgroundinfo/lancedb.md b/specs/backgroundinfo/lancedb.md
new file mode 100644
index 0000000..58a9bfc
--- /dev/null
+++ b/specs/backgroundinfo/lancedb.md
@@ -0,0 +1,6847 @@
+========================
+CODE SNIPPETS
+========================
+TITLE: Run LanceDB documentation examples tests
+DESCRIPTION: Checks the documentation examples for correctness and consistency, ensuring they function as expected.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_3
+
+LANGUAGE: shell
+CODE:
+```
+make doctest
+```
+
+----------------------------------------
+
+TITLE: Install documentation website requirements
+DESCRIPTION: This command installs the necessary Python packages for building the main documentation website, which is powered by `mkdocs-material`. It ensures all dependencies are met before serving the docs.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_7
+
+LANGUAGE: bash
+CODE:
+```
+pip install -r docs/requirements.txt
+```
+
+----------------------------------------
+
+TITLE: Build and serve documentation website locally
+DESCRIPTION: These commands navigate to the `docs` directory and start a local development server for the documentation website. This allows contributors to preview changes to the documentation in real-time.
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_8 + +LANGUAGE: bash +CODE: +``` +cd docs +mkdocs serve +``` + +---------------------------------------- + +TITLE: Perform Python development installation +DESCRIPTION: These commands navigate into the `python` directory and perform a development installation of the Lance Python bindings. This allows developers to import and test changes to the Python wrapper directly. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +cd python +maturin develop +``` + +---------------------------------------- + +TITLE: Example output of git commit with pre-commit hooks +DESCRIPTION: Demonstrates the console output when committing changes after pre-commit hooks are installed, showing the execution and status of linters like black, isort, and ruff. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_8 + +LANGUAGE: shell +CODE: +``` +git commit -m"Changed some python files" +black....................................................................Passed +isort (python)...........................................................Passed +ruff.....................................................................Passed +[main daf91ed] Changed some python files + 1 file changed, 1 insertion(+), 1 deletion(-) +``` + +---------------------------------------- + +TITLE: Install LanceDB test dependencies +DESCRIPTION: Installs the necessary Python packages for running tests, including optional test dependencies specified in the project's setup. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_1 + +LANGUAGE: shell +CODE: +``` +pip install '.[tests]' +``` + +---------------------------------------- + +TITLE: Install pre-commit tool for LanceDB +DESCRIPTION: Installs the `pre-commit` tool, which enables running formatters and linters automatically before each Git commit. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_6 + +LANGUAGE: shell +CODE: +``` +pip install pre-commit +``` + +---------------------------------------- + +TITLE: Download and Extract SIFT 1M Dataset +DESCRIPTION: This snippet provides shell commands to download and extract the SIFT 1M dataset, which is used as a large-scale example for vector search demonstrations. It includes commands to clean up previous downloads and extract the compressed archive. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_11 + +LANGUAGE: bash +CODE: +``` +rm -rf sift* vec_data.lance +wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz +tar -xzf sift.tar.gz +``` + +---------------------------------------- + +TITLE: Create Pandas DataFrame +DESCRIPTION: This code demonstrates how to create a simple Pandas DataFrame. This DataFrame serves as a basic example for subsequent operations, such as writing data to a Lance dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +df = pd.DataFrame({"a": [5]}) +df +``` + +---------------------------------------- + +TITLE: TPCH Benchmark Setup and Execution +DESCRIPTION: This snippet outlines the steps to set up the dataset and run the TPCH Q1 benchmark comparing LanceDB and Parquet. 
It includes navigating to the benchmark directory, creating a dataset folder, downloading and renaming the necessary Parquet file, and executing the benchmark script. Note: The step to 'generate lance file' is a conceptual action within the benchmark process, not an explicit command provided. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/tpch/README.md#_snippet_0 + +LANGUAGE: Shell +CODE: +``` +cd lance/benchmarks/tpch +mkdir dataset && cd dataset +wget https://github.com/cwida/duckdb-data/releases/download/v1.0/lineitemsf1.snappy.parquet -O lineitem_sf1.parquet +cd .. +``` + +LANGUAGE: Shell +CODE: +``` +python3 benchmark.py q1 +``` + +---------------------------------------- + +TITLE: Install LanceDB pre-commit hooks +DESCRIPTION: Installs the pre-commit hooks defined in the project's configuration, activating automatic linting and formatting on commit attempts. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_7 + +LANGUAGE: shell +CODE: +``` +pre-commit install +``` + +---------------------------------------- + +TITLE: Install Python bindings build tool +DESCRIPTION: This command installs `maturin`, a tool essential for building Python packages that integrate with Rust code. It's a prerequisite for setting up the Python development environment for Lance. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +pip install maturin +``` + +---------------------------------------- + +TITLE: Start Local Services for S3 Integration Tests +DESCRIPTION: Before running S3 integration tests, you need to start local Minio and DynamoDB services. This command uses Docker Compose to bring up these required services, ensuring the test environment is ready. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_20 + +LANGUAGE: Shell +CODE: +``` +docker compose up +``` + +---------------------------------------- + +TITLE: Install preview pylance Python SDK via pip +DESCRIPTION: Install the preview version of the pylance Python SDK to access the latest features and bug fixes. This uses a specific extra index URL for LanceDB's PyPI. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/install.md#_snippet_1 + +LANGUAGE: Bash +CODE: +``` +pip install --pre --extra-index-url https://pypi.fury.io/lancedb/ pylance +``` + +---------------------------------------- + +TITLE: Access Specific Lance Dataset Version +DESCRIPTION: This example demonstrates how to load and query a specific historical version of a Lance dataset. By specifying the `version` parameter, users can access data as it existed at a particular point in time, enabling historical analysis or rollbacks. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_8 + +LANGUAGE: python +CODE: +``` +# Version 1 +lance.dataset('/tmp/test.lance', version=1).to_table().to_pandas() + +# Version 2 +lance.dataset('/tmp/test.lance', version=2).to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Install stable pylance Python SDK via pip +DESCRIPTION: Install the stable and recommended version of the pylance Python SDK using the pip package manager. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/install.md#_snippet_0 + +LANGUAGE: Bash +CODE: +``` +pip install pylance +``` + +---------------------------------------- + +TITLE: Run all LanceDB tests +DESCRIPTION: Executes the full test suite for the LanceDB project using the `make test` command. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_2 + +LANGUAGE: shell +CODE: +``` +make test +``` + +---------------------------------------- + +TITLE: Install Linux Perf Tools and Configure Kernel Parameters +DESCRIPTION: Installs necessary Linux performance tools (`perf`) on Ubuntu systems and configures the `perf_event_paranoid` kernel parameter. This setup is crucial for allowing non-root users to collect performance data using tools like `perf` and `flamegraph`. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/Debug.md#_snippet_4 + +LANGUAGE: sh +CODE: +``` +sudo apt install linux-tools-common linux-tools-generic linux-tools-`uname -r` +sudo sh -c "echo -1 > /proc/sys/kernel/perf_event_paranoid" +``` + +---------------------------------------- + +TITLE: Load Lance Vector Dataset +DESCRIPTION: This snippet shows how to load a previously created Lance vector dataset. This step is essential before performing any vector search queries or other operations on the dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_13 + +LANGUAGE: python +CODE: +``` +uri = "vec_data.lance" +sift1m = lance.dataset(uri) +``` + +---------------------------------------- + +TITLE: Prepare Parquet File from Pandas DataFrame +DESCRIPTION: This code prepares a Parquet file from a Pandas DataFrame using PyArrow. It cleans up any existing Parquet or Lance files to ensure a fresh start, then converts the DataFrame to a PyArrow Table and writes it as a Parquet dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +shutil.rmtree("/tmp/test.parquet", ignore_errors=True) +shutil.rmtree("/tmp/test.lance", ignore_errors=True) + +tbl = pa.Table.from_pandas(df) +pa.dataset.write_dataset(tbl, "/tmp/test.parquet", format='parquet') + +parquet = pa.dataset.dataset("/tmp/test.parquet") +parquet.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Install required Python libraries +DESCRIPTION: Installs necessary Python packages for data handling, OpenAI API interaction, rate limiting, and LanceDB. The `--quiet` flag suppresses verbose output during installation. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_0 + +LANGUAGE: python +CODE: +``` +pip install --quiet openai tqdm ratelimiter retry datasets pylance +``` + +---------------------------------------- + +TITLE: Run Rust unit tests +DESCRIPTION: This command executes the unit tests for the Rust core format. Running these tests verifies the correctness of the Rust implementation. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_6 + +LANGUAGE: bash +CODE: +``` +cargo test +``` + +---------------------------------------- + +TITLE: Profile a LanceDB benchmark using flamegraph +DESCRIPTION: Generates a flamegraph for a specific benchmark using `cargo-flamegraph`, aiding in performance analysis. It's recommended to run benchmarks once beforehand to avoid setup time being captured in the profile. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_14 + +LANGUAGE: shell +CODE: +``` +flamegraph -F 100 --no-inline -- $(which python) \ + -m pytest python/benchmarks \ + --benchmark-min-time=2 \ + -k test_ivf_pq_index_search +``` + +---------------------------------------- + +TITLE: Install Flamegraph Tool +DESCRIPTION: Installs the `flamegraph` profiling tool using Cargo, Rust's package manager. This tool is essential for visualizing CPU usage and call stacks as flame graphs for performance analysis. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/Debug.md#_snippet_3 + +LANGUAGE: sh +CODE: +``` +cargo install flamegraph +``` + +---------------------------------------- + +TITLE: Set up BigANN Benchmark Environment +DESCRIPTION: This snippet provides commands to set up a Python virtual environment, clone the 'big-ann-benchmarks' repository, and install its required dependencies. It prepares the system for running BigANN benchmarks by ensuring all necessary tools and libraries are in place. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/bigann/README.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +python -m venv venv +. ./venv/bin/activate +git clone https://github.com/harsha-simhadri/big-ann-benchmarks.git +cd big-ann-benchmarks +pip install -r requirements_py3.10.txt +``` + +---------------------------------------- + +TITLE: List Lance Dataset Versions +DESCRIPTION: This code shows how to retrieve a list of all available versions for a Lance dataset. This functionality is crucial for understanding the history of changes and for accessing specific historical states of the data. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_7 + +LANGUAGE: python +CODE: +``` +dataset.versions() +``` + +---------------------------------------- + +TITLE: Install Lance Build Dependencies on Ubuntu +DESCRIPTION: This command installs necessary system-level dependencies for building Lance on Ubuntu 22.04, including protobuf, SSL development libraries, and general build tools. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/How-to-Build.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +sudo apt install protobuf-compiler libssl-dev build-essential pkg-config gfortran +``` + +---------------------------------------- + +TITLE: Build Rust core format (release) +DESCRIPTION: This command compiles the Rust core format in release mode. The release build is optimized for performance and is suitable for production deployments or benchmarking. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_5 + +LANGUAGE: bash +CODE: +``` +cargo build -r +``` + +---------------------------------------- + +TITLE: Debug Python Script with LLDB +DESCRIPTION: Demonstrates how to start an LLDB debugging session for a Python script. It involves launching LLDB with the Python interpreter from a virtual environment and then running the target script within the LLDB prompt. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/Debug.md#_snippet_2 + +LANGUAGE: sh +CODE: +``` +$ lldb ./venv/bin/python +(lldb) r script.py +``` + +---------------------------------------- + +TITLE: Install Lance Build Dependencies on Mac +DESCRIPTION: This command installs the protobuf compiler using Homebrew, a required dependency for building Lance on macOS. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/How-to-Build.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +brew install protobuf +``` + +---------------------------------------- + +TITLE: Configure LLDB Initialization Settings +DESCRIPTION: Sets up basic LLDB initialization settings in the `~/.lldbinit` file. This includes configuring the number of source code lines to display before and after a stop, and enabling the loading of `.lldbinit` files from the current working directory. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/Debug.md#_snippet_0 + +LANGUAGE: lldb +CODE: +``` +# ~/.lldbinit +settings set stop-line-count-before 15 +settings set stop-line-count-after 15 +settings set target.load-cwd-lldbinit true +``` + +---------------------------------------- + +TITLE: List all versions of a Lance dataset +DESCRIPTION: Retrieves and displays the version history of the Lance dataset, showing all previous and current states of the data. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_9 + +LANGUAGE: Python +CODE: +``` +dataset.versions() +``` + +---------------------------------------- + +TITLE: Load Lance Dataset +DESCRIPTION: Initializes a Lance dataset object from a specified URI, preparing it for subsequent operations like nearest neighbor searches. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_20 + +LANGUAGE: python +CODE: +``` +sift1m = lance.dataset(uri) +``` + +---------------------------------------- + +TITLE: Complete Lance Dataset Write and Read Example in Rust +DESCRIPTION: This Rust `main` function provides a complete example demonstrating the usage of `write_dataset` and `read_dataset` functions. It sets up the necessary `arrow` and `lance` imports, defines a temporary data path, and orchestrates the writing and subsequent reading of a Lance dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/write_read_dataset.md#_snippet_2 + +LANGUAGE: Rust +CODE: +``` +use arrow::array::UInt32Array; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::record_batch::{RecordBatch, RecordBatchIterator}; +use futures::StreamExt; +use lance::dataset::{WriteMode, WriteParams}; +use lance::Dataset; +use std::sync::Arc; + +#[tokio::main] +async fn main() { + let data_path: &str = "./temp_data.lance"; + + write_dataset(data_path).await; + read_dataset(data_path).await; +} +``` + +---------------------------------------- + +TITLE: Rust: Main Workflow for WikiText to LanceDB Ingestion +DESCRIPTION: This comprehensive example demonstrates the full data ingestion pipeline in Rust. It initializes a Tokio runtime, loads a tokenizer, sets up the Hugging Face API to download WikiText Parquet files, processes them into a `WikiTextBatchReader`, and finally writes the data to a Lance dataset. It also includes verification of the created dataset. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/llm_dataset_creation.md#_snippet_2 + +LANGUAGE: Rust +CODE: +``` +fn main() -> Result<(), Box> { + let rt = tokio::runtime::Runtime::new()?; + rt.block_on(async { + // Load tokenizer + let tokenizer = load_tokenizer("gpt2")?; + + // Set up Hugging Face API + // Download from https://huggingface.co/datasets/Salesforce/wikitext/tree/main/wikitext-103-raw-v1 + let api = Api::new()?; + let repo = api.repo(Repo::with_revision( + "Salesforce/wikitext".into(), + RepoType::Dataset, + "main".into(), + )); + + // Define the parquet files we want to download + let train_files = vec![ + "wikitext-103-raw-v1/train-00000-of-00002.parquet", + "wikitext-103-raw-v1/train-00001-of-00002.parquet", + ]; + + let mut parquet_readers = Vec::new(); + for file in &train_files { + println!("Downloading file: {}", file); + let file_path = repo.get(file)?; + let data = std::fs::read(file_path)?; + + // Create a temporary file in the system temp directory and write the downloaded data to it + let mut temp_file = NamedTempFile::new()?; + temp_file.write_all(&data)?; + + // Create the parquet reader builder with a larger batch size + let builder = ParquetRecordBatchReaderBuilder::try_new(temp_file.into_file())? + .with_batch_size(8192); // Increase batch size for better performance + parquet_readers.push(builder); + } + + if parquet_readers.is_empty() { + println!("No parquet files found to process."); + return Ok(()); + } + + // Create batch reader + let num_samples: u64 = 500_000; + let batch_reader = WikiTextBatchReader::new(parquet_readers, tokenizer, Some(num_samples))?; + + // Save as Lance dataset + println!("Writing to Lance dataset..."); + let lance_dataset_path = "rust_wikitext_lance_dataset.lance"; + + let write_params = WriteParams::default(); + lance::Dataset::write(batch_reader, lance_dataset_path, Some(write_params)).await?; + + // Verify the dataset + let ds = lance::Dataset::open(lance_dataset_path).await?; + let scanner = ds.scan(); + let mut stream = scanner.try_into_stream().await?; + + let mut total_rows = 0; + while let Some(batch_result) = stream.next().await { + let batch = batch_result?; + total_rows += batch.num_rows(); + } + + println!( + "Lance dataset created successfully with {} rows", + total_rows + ); + println!("Dataset location: {}", lance_dataset_path); + + Ok(()) + }) +} +``` + +---------------------------------------- + +TITLE: Build and Test Pylance Python Package +DESCRIPTION: These commands set up a Python virtual environment, install maturin for Rust-Python binding, build the Pylance package in debug mode, and then run its associated tests. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/How-to-Build.md#_snippet_3 + +LANGUAGE: bash +CODE: +``` +cd python +python3 -m venv venv +source venv/bin/activate + +pip install maturin + +# Build debug build +maturin develop --extras tests + +# Run pytest +pytest python/tests/ +``` + +---------------------------------------- + +TITLE: Install Lance using Cargo +DESCRIPTION: Installs the Lance Rust library as a command-line tool using the Cargo package manager. + +SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_0 + +LANGUAGE: shell +CODE: +``` +cargo install lance +``` + +---------------------------------------- + +TITLE: Append Data to Lance Dataset +DESCRIPTION: This example illustrates how to append new data to an existing Lance dataset. 
It creates a new Pandas DataFrame, converts it to a PyArrow Table, and then uses `lance.write_dataset` with `mode="append"` to add the new rows, creating a new version of the dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +df = pd.DataFrame({"a": [10]}) +tbl = pa.Table.from_pandas(df) +dataset = lance.write_dataset(tbl, "/tmp/test.lance", mode="append") + +dataset.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Access Lance Dataset by Tag +DESCRIPTION: This code demonstrates how to load a Lance dataset using a previously defined tag instead of a numerical version. This allows for more intuitive access to specific, meaningful versions of the data, improving readability and maintainability. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_10 + +LANGUAGE: python +CODE: +``` +lance.dataset('/tmp/test.lance', version="stable").to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Build pylance in release mode for benchmarks +DESCRIPTION: Builds the `pylance` module in release mode with debug symbols, enabling benchmark execution and profiling. It includes benchmark-specific extras and features for data generation. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_10 + +LANGUAGE: shell +CODE: +``` +maturin develop --profile release-with-debug --extras benchmarks --features datagen +``` + +---------------------------------------- + +TITLE: Query Lance Dataset with Simple SQL in Rust DataFusion +DESCRIPTION: This Rust example demonstrates how to register a Lance dataset as a table in DataFusion using `LanceTableProvider` and execute a simple SQL `SELECT` query to retrieve the first 10 rows. It shows the basic setup for integrating Lance with DataFusion. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/datafusion.md#_snippet_0 + +LANGUAGE: rust +CODE: +``` +use datafusion::prelude::SessionContext; +use crate::datafusion::LanceTableProvider; + +let ctx = SessionContext::new(); + +ctx.register_table("dataset", + Arc::new(LanceTableProvider::new( + Arc::new(dataset.clone()), + /* with_row_id */ false, + /* with_row_addr */ false, + )))?; + +let df = ctx.sql("SELECT * FROM dataset LIMIT 10").await?; +let result = df.collect().await?; +``` + +---------------------------------------- + +TITLE: Install Lance Preview Release +DESCRIPTION: Installs a preview release of the `pylance` library, which includes the latest features and bug fixes. Preview releases are published more frequently and offer early access to new developments. + +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_1 + +LANGUAGE: shell +CODE: +``` +pip install --pre --extra-index-url https://pypi.fury.io/lancedb/ pylance +``` + +---------------------------------------- + +TITLE: Install LanceDB and Python Dependencies +DESCRIPTION: Installs specific versions of LanceDB, pandas, and duckdb required for running the benchmarks. This ensures compatibility and reproducibility of the benchmark results. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_0 + +LANGUAGE: sh +CODE: +``` +pip lancedb==0.3.6 +pip install pandas~=2.1.0 +pip duckdb~=0.9.0 +``` + +---------------------------------------- + +TITLE: Prepare HD-Vila Dataset with Python venv +DESCRIPTION: This snippet outlines the steps to set up a Python virtual environment, activate it, and install necessary dependencies from `requirements.txt` for the HD-Vila dataset. It ensures a clean and isolated environment for project dependencies. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/hd-vila/README.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +python3 -m venv venv +. ./venv/bin/activate +pip install -r requirements.txt +``` + +---------------------------------------- + +TITLE: Run Python unit and integration tests +DESCRIPTION: These commands execute the unit tests and integration tests for the Python components of the Lance project. Running these tests is crucial to ensure code changes do not introduce regressions. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_2 + +LANGUAGE: bash +CODE: +``` +make test +make integtest +``` + +---------------------------------------- + +TITLE: Import necessary libraries for LanceDB operations +DESCRIPTION: This snippet imports `shutil`, `lance`, `numpy`, `pandas`, and `pyarrow` for file system operations, LanceDB interactions, numerical computing, data manipulation, and Arrow table handling, respectively. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_0 + +LANGUAGE: Python +CODE: +``` +import shutil + +import lance +import numpy as np +import pandas as pd +import pyarrow as pa +``` + +---------------------------------------- + +TITLE: Create a Pandas DataFrame for LanceDB +DESCRIPTION: Initializes a simple Pandas DataFrame with a single column 'a' and a value of 5. This DataFrame will be used as input for creating a Lance dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_1 + +LANGUAGE: Python +CODE: +``` +df = pd.DataFrame({"a": [5]}) +df +``` + +---------------------------------------- + +TITLE: Sample Query Vectors from Lance Dataset +DESCRIPTION: This code demonstrates how to sample a subset of vectors from the loaded Lance dataset to be used as query vectors for nearest neighbor search. It leverages DuckDB for efficient sampling of the vector column. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_14 + +LANGUAGE: python +CODE: +``` +import duckdb +# Make sure DuckDB v0.7+ is installed +samples = duckdb.query("SELECT vector FROM sift1m USING SAMPLE 100").to_df().vector +``` + +---------------------------------------- + +TITLE: Execute Tunable Nearest Neighbor Search +DESCRIPTION: Demonstrates how to perform a nearest neighbor search with tunable parameters like 'nprobes' and 'refine_factor' to balance latency and recall. The result is converted to a Pandas DataFrame. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_22 + +LANGUAGE: python +CODE: +``` +%%time + +sift1m.to_table( + nearest={ + "column": "vector", + "q": samples[0], + "k": 10, + "nprobes": 10, + "refine_factor": 5 + } +).to_pandas() +``` + +---------------------------------------- + +TITLE: Load SIFT vector dataset from Lance file +DESCRIPTION: Defines the URI for the Lance vector dataset and then loads it using `lance.dataset()`, making the SIFT 1M vector data accessible for further operations. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_16 + +LANGUAGE: Python +CODE: +``` +uri = "vec_data.lance" +sift1m = lance.dataset(uri) +``` + +---------------------------------------- + +TITLE: Import LanceDB Libraries +DESCRIPTION: This snippet imports the necessary Python libraries for working with LanceDB, including `shutil` for file operations, `lance` for core LanceDB functionalities, `numpy` for numerical operations, `pandas` for data manipulation, and `pyarrow` for data interchange. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import shutil +import lance +import numpy as np +import pandas as pd +import pyarrow as pa +``` + +---------------------------------------- + +TITLE: Run all LanceDB benchmarks (including slow tests) +DESCRIPTION: Executes all performance benchmarks, including those marked as 'slow', which may take a longer time to complete. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_12 + +LANGUAGE: shell +CODE: +``` +pytest python/benchmarks +``` + +---------------------------------------- + +TITLE: Prepare Python Virtual Environment for Benchmarks +DESCRIPTION: Creates and activates a Python virtual environment, then installs required packages from `requirements.txt`. This isolates project dependencies and ensures a clean execution environment for the benchmark scripts. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_2 + +LANGUAGE: sh +CODE: +``` +python3 -m venv venv +. ./venv/bin/activate +pip install -r requirements.txt +``` + +---------------------------------------- + +TITLE: Create Tags for Lance Dataset Versions +DESCRIPTION: This snippet illustrates how to create human-readable tags for specific versions of a Lance dataset. Tags provide a convenient way to mark and reference important dataset states, such as 'stable' or 'nightly' builds, simplifying version management. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_9 + +LANGUAGE: python +CODE: +``` +dataset.tags.create("stable", 2) +dataset.tags.create("nightly", 3) +dataset.tags.list() +``` + +---------------------------------------- + +TITLE: Run LanceDB code formatters +DESCRIPTION: Applies code formatting rules to the entire project. Specific commands like `make format-python` or `cargo fmt` can be used for language-specific formatting. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_4 + +LANGUAGE: shell +CODE: +``` +make format +``` + +---------------------------------------- + +TITLE: Build and Search HNSW Index for Vector Similarity in Rust +DESCRIPTION: This Rust code provides a complete example for vector similarity search. 
It defines a `ground_truth` function for L2 distance calculation, `create_test_vector_dataset` to generate synthetic fixed-size list vectors, and a `main` function that orchestrates the process. The `main` function generates or loads a dataset, builds an HNSW index using `lance_index::vector::hnsw`, and then performs vector searches, measuring construction and search times, and calculating recall against ground truth.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/hnsw.md#_snippet_0
+
+LANGUAGE: Rust
+CODE:
+```
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use arrow::array::{types::Float32Type, Array, FixedSizeListArray};
+use arrow::array::{AsArray, FixedSizeListBuilder, Float32Builder};
+use arrow::datatypes::{DataType, Field, Schema};
+use arrow::record_batch::RecordBatch;
+use arrow::record_batch::RecordBatchIterator;
+use arrow_select::concat::concat;
+use futures::stream::StreamExt;
+use lance::Dataset;
+use lance_index::vector::v3::subindex::IvfSubIndex;
+use lance_index::vector::{
+    flat::storage::FlatFloatStorage,
+    hnsw::{builder::HnswBuildParams, HNSW},
+};
+use lance_linalg::distance::DistanceType;
+
+fn ground_truth(fsl: &FixedSizeListArray, query: &[f32], k: usize) -> HashSet<u32> {
+    let mut dists = vec![];
+    for i in 0..fsl.len() {
+        let dist = lance_linalg::distance::l2_distance(
+            query,
+            fsl.value(i).as_primitive::<Float32Type>().values(),
+        );
+        dists.push((dist, i as u32));
+    }
+    dists.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
+    dists.truncate(k);
+    dists.into_iter().map(|(_, i)| i).collect()
+}
+
+pub async fn create_test_vector_dataset(output: &str, num_rows: usize, dim: i32) {
+    let schema = Arc::new(Schema::new(vec![Field::new(
+        "vector",
+        DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), dim),
+        false,
+    )]));
+
+    let mut batches = Vec::new();
+
+    // Create a few batches
+    for _ in 0..2 {
+        let v_builder = Float32Builder::new();
+        let mut list_builder = FixedSizeListBuilder::new(v_builder, dim);
+
+        for _ in 0..num_rows {
+            for _ in 0..dim {
+                list_builder.values().append_value(rand::random::<f32>());
+            }
+            list_builder.append(true);
+        }
+        let array = Arc::new(list_builder.finish());
+        let batch = RecordBatch::try_new(schema.clone(), vec![array]).unwrap();
+        batches.push(batch);
+    }
+    let batch_reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema.clone());
+    println!("Writing dataset to {}", output);
+    Dataset::write(batch_reader, output, None).await.unwrap();
+}
+
+#[tokio::main]
+async fn main() {
+    let uri: Option<String> = None; // None means generate test data
+    let column = "vector";
+    let ef = 100;
+    let max_edges = 30;
+    let max_level = 7;
+
+    // 1. Generate a synthetic test data of specified dimensions
+    let dataset = if uri.is_none() {
+        println!("No uri is provided, generating test dataset...");
+        let output = "test_vectors.lance";
+        create_test_vector_dataset(output, 1000, 64).await;
+        Dataset::open(output).await.expect("Failed to open dataset")
+    } else {
+        Dataset::open(uri.as_ref().unwrap())
+            .await
+            .expect("Failed to open dataset")
+    };
+
+    println!("Dataset schema: {:#?}", dataset.schema());
+    let batches = dataset
+        .scan()
+        .project(&[column])
+        .unwrap()
+        .try_into_stream()
+        .await
+        .unwrap()
+        .then(|batch| async move { batch.unwrap().column_by_name(column).unwrap().clone() })
+        .collect::<Vec<_>>()
+        .await;
+    let arrs = batches.iter().map(|b| b.as_ref()).collect::<Vec<_>>();
+    let fsl = concat(&arrs).unwrap().as_fixed_size_list().clone();
+    println!("Loaded {:?} batches", fsl.len());
+
+    let vector_store = Arc::new(FlatFloatStorage::new(fsl.clone(), DistanceType::L2));
+
+    let q = fsl.value(0);
+    let k = 10;
+    let gt = ground_truth(&fsl, q.as_primitive::<Float32Type>().values(), k);
+
+    for ef_construction in [15, 30, 50] {
+        let now = std::time::Instant::now();
+        // 2. Build a hierarchical graph structure for efficient vector search using Lance API
+        let hnsw = HNSW::index_vectors(
+            vector_store.as_ref(),
+            HnswBuildParams::default()
+                .max_level(max_level)
+                .num_edges(max_edges)
+                .ef_construction(ef_construction),
+        )
+        .unwrap();
+        let construct_time = now.elapsed().as_secs_f32();
+        let now = std::time::Instant::now();
+        // 3. Perform vector search with different parameters and compute the ground truth using L2 distance search
+        let results: HashSet<u32> = hnsw
+            .search_basic(q.clone(), k, ef, None, vector_store.as_ref())
+            .unwrap()
+            .iter()
+            .map(|node| node.id)
+            .collect();
+        let search_time = now.elapsed().as_micros();
+        println!(
+            "level={}, ef_construct={}, ef={} recall={}: construct={:.3}s search={:.3} us",
+            max_level,
+            ef_construction,
+            ef,
+            results.intersection(&gt).count() as f32 / k as f32,
+            construct_time,
+            search_time
+        );
+    }
+}
+```
+
+----------------------------------------
+
+TITLE: LanceDB Nearest Neighbor Search Parameters
+DESCRIPTION: This section details the parameters available for tuning nearest neighbor searches in LanceDB, including 'q', 'k', 'nprobes', and 'refine_factor'.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_19
+
+LANGUAGE: APIDOC
+CODE:
+```
+"nearest": {
+  "column": "string", // Name of the vector column
+  "q": "vector", // The query vector for nearest neighbor search
+  "k": "integer", // The number of nearest neighbors to return
+  "nprobes": "integer", // How many IVF partitions to search
+  "refine_factor": "integer" // Controls re-ranking: if k=10 and refine_factor=5, retrieves 50 nearest neighbors by ANN and re-sorts using actual distances, then returns top 10. Improves recall without sacrificing performance too much.
+}
+```
+
+----------------------------------------
+
+TITLE: Install Lance Python Library
+DESCRIPTION: Installs the stable release of the `pylance` library using pip, providing access to Lance's functionalities in Python.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_0
+
+LANGUAGE: shell
+CODE:
+```
+pip install pylance
+```
+
+----------------------------------------
+
+TITLE: Convert Parquet Dataset to Lance
+DESCRIPTION: This snippet demonstrates the straightforward conversion of an existing PyArrow Parquet dataset into a Lance dataset.
It uses `lance.write_dataset` to perform the conversion and then verifies the content of the newly created Lance dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +dataset = lance.write_dataset(parquet, "/tmp/test.lance") + +# Make sure it's the same +dataset.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Convert Parquet dataset to Lance dataset +DESCRIPTION: Converts an existing PyArrow Parquet dataset directly into a Lance dataset in a single line of code, demonstrating seamless integration. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_4 + +LANGUAGE: Python +CODE: +``` +dataset = lance.write_dataset(parquet, "/tmp/test.lance") +``` + +---------------------------------------- + +TITLE: Compare LanceDB benchmarks against previous version +DESCRIPTION: Provides a sequence of commands to compare the performance of the current version against the `main` branch. This involves saving a baseline from `main` and then comparing the current branch's performance against it. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_15 + +LANGUAGE: shell +CODE: +``` +CURRENT_BRANCH=$(git branch --show-current) +``` + +LANGUAGE: shell +CODE: +``` +git checkout main +``` + +LANGUAGE: shell +CODE: +``` +maturin develop --profile release-with-debug --features datagen +``` + +LANGUAGE: shell +CODE: +``` +pytest --benchmark-save=baseline python/benchmarks -m "not slow" +``` + +LANGUAGE: shell +CODE: +``` +COMPARE_ID=$(ls .benchmarks/*/ | tail -1 | cut -c1-4) +``` + +LANGUAGE: shell +CODE: +``` +git checkout $CURRENT_BRANCH +``` + +LANGUAGE: shell +CODE: +``` +maturin develop --profile release-with-debug --features datagen +``` + +LANGUAGE: shell +CODE: +``` +pytest --benchmark-compare=$COMPARE_ID python/benchmarks -m "not slow" +``` + +---------------------------------------- + +TITLE: Build Rust core format (debug) +DESCRIPTION: This command compiles the Rust core format in debug mode. The debug build includes debugging information and is suitable for development and testing, though it is not optimized for performance. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_4 + +LANGUAGE: bash +CODE: +``` +cargo build +``` + +---------------------------------------- + +TITLE: Download and extract SIFT 1M dataset for vector operations +DESCRIPTION: Removes any existing SIFT files and then downloads the `sift.tar.gz` archive from the specified FTP server. Finally, it extracts the contents of the tarball, preparing the SIFT 1M dataset for vector processing. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_14 + +LANGUAGE: Bash +CODE: +``` +!rm -rf sift* vec_data.lance +!wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz +!tar -xzf sift.tar.gz +``` + +---------------------------------------- + +TITLE: Format and lint Rust code +DESCRIPTION: These commands are used to automatically format Rust code according to community standards (`cargo fmt`) and to perform static analysis for potential issues (`cargo clippy`). This ensures code quality and consistency. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/community/contributing.md#_snippet_3 + +LANGUAGE: bash +CODE: +``` +cargo fmt --all +cargo clippy --all-features --tests --benches +``` + +---------------------------------------- + +TITLE: Run a specific LanceDB benchmark by name +DESCRIPTION: Filters and runs a particular benchmark using pytest's `-k` flag, allowing substring matching for the benchmark name. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_13 + +LANGUAGE: shell +CODE: +``` +pytest python/benchmarks -k test_ivf_pq_index_search +``` + +---------------------------------------- + +TITLE: Run LanceDB code linters +DESCRIPTION: Executes code linters to check for style violations and potential issues. Language-specific linting can be performed with `make lint-python` or `make lint-rust`. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_5 + +LANGUAGE: shell +CODE: +``` +make lint +``` + +---------------------------------------- + +TITLE: Verify converted Lance dataset content +DESCRIPTION: Reads the newly created Lance dataset and converts it back to a Pandas DataFrame to confirm that the data was correctly written and matches the original content. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_5 + +LANGUAGE: Python +CODE: +``` +# make sure it's the same +dataset.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Prepare Dbpedia-entities-openai Dataset +DESCRIPTION: This snippet provides shell commands to set up a Python virtual environment, install necessary dependencies from 'requirements.txt', and then generate the Dbpedia-entities-openai dataset in Lance format using 'datagen.py'. It requires Python 3.10 or newer. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/dbpedia-openai/README.md#_snippet_0 + +LANGUAGE: sh +CODE: +``` +# Python 3.10+ +python3 -m venv venv +. ./venv/bin/activate + +# install dependencies +pip install -r requirements.txt + +# Generate dataset in lance format. +./datagen.py +``` + +---------------------------------------- + +TITLE: Clean LanceDB build artifacts +DESCRIPTION: Removes all generated build artifacts and temporary files from the project directory, useful for a clean rebuild. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_9 + +LANGUAGE: shell +CODE: +``` +make clean +``` + +---------------------------------------- + +TITLE: Query Nearest Neighbors with Specific Features +DESCRIPTION: Performs a nearest neighbor search while simultaneously retrieving specific feature columns ('revenue') alongside the vector results. This demonstrates fetching combined data in a single call. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_25 + +LANGUAGE: python +CODE: +``` +sift1m.to_table(columns=["revenue"], nearest={"column": "vector", "q": samples[0], "k": 10}).to_pandas() +``` + +---------------------------------------- + +TITLE: Create named tags for Lance dataset versions +DESCRIPTION: Assigns human-readable tags ('stable', 'nightly') to specific versions (2 and 3) of the Lance dataset. Then, it lists all defined tags, providing aliases for version numbers. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_12 + +LANGUAGE: Python +CODE: +``` +dataset.tags.create("stable", 2) +dataset.tags.create("nightly", 3) +dataset.tags.list() +``` + +---------------------------------------- + +TITLE: Access Lance dataset using a named tag +DESCRIPTION: Loads the Lance dataset by referencing a previously created tag ('stable') instead of a version number, and converts it to a Pandas DataFrame, showcasing tag-based version access. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_13 + +LANGUAGE: Python +CODE: +``` +lance.dataset('/tmp/test.lance', version="stable").to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Run LanceDB benchmarks (excluding slow tests) +DESCRIPTION: Executes the performance benchmarks located in `python/benchmarks`, skipping tests explicitly marked as 'slow'. These benchmarks are designed for quick iteration and regression catching. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_11 + +LANGUAGE: shell +CODE: +``` +pytest python/benchmarks -m "not slow" +``` + +---------------------------------------- + +TITLE: Verify overwritten Lance dataset content +DESCRIPTION: Reads the current state of the Lance dataset and converts it to a Pandas DataFrame to confirm that the overwrite operation was successful and the dataset now contains the new data. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_8 + +LANGUAGE: Python +CODE: +``` +dataset.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Rust: Load Tokenizer from Hugging Face Hub +DESCRIPTION: This function provides a utility to load a tokenizer from the Hugging Face Hub. It takes a model name, creates an API client, retrieves the tokenizer file from the specified repository, and constructs a `Tokenizer` object from it. This is a common pattern for integrating Hugging Face models. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/llm_dataset_creation.md#_snippet_3 + +LANGUAGE: Rust +CODE: +``` +fn load_tokenizer(model_name: &str) -> Result> { + let api = Api::new()?; + let repo = api.repo(Repo::with_revision( + model_name.into(), + RepoType::Model, + "main".into(), + )); + + let tokenizer_path = repo.get("tokenizer.json")?; + let tokenizer = Tokenizer::from_file(tokenizer_path)?; + + Ok(tokenizer) +} +``` + +---------------------------------------- + +TITLE: Sample query vectors from Lance dataset using DuckDB +DESCRIPTION: Imports `duckdb` and queries the `sift1m` Lance dataset to sample 100 vectors from the 'vector' column. The sampled vectors are converted to a Pandas DataFrame column, to be used as query inputs for KNN search. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_17 + +LANGUAGE: Python +CODE: +``` +import duckdb +# if this segfaults make sure duckdb v0.7+ is installed +samples = duckdb.query("SELECT vector FROM sift1m USING SAMPLE 100").to_df().vector +samples +``` + +---------------------------------------- + +TITLE: Prepare Parquet file for conversion to Lance +DESCRIPTION: Cleans up previous test files. Converts the Pandas DataFrame `df` to a PyArrow Table, then writes it to a Parquet file. Finally, it reads the Parquet file back into a PyArrow dataset and converts it to a Pandas DataFrame for display. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_3 + +LANGUAGE: Python +CODE: +``` +shutil.rmtree("/tmp/test.parquet", ignore_errors=True) +shutil.rmtree("/tmp/test.lance", ignore_errors=True) + +tbl = pa.Table.from_pandas(df) +pa.dataset.write_dataset(tbl, "/tmp/test.parquet", format='parquet') + +parquet = pa.dataset.dataset("/tmp/test.parquet") +parquet.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Access a specific historical version of Lance dataset (Version 2) +DESCRIPTION: Loads another specific historical version (version 2) of the Lance dataset and converts it to a Pandas DataFrame, further illustrating the versioning capabilities. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_11 + +LANGUAGE: Python +CODE: +``` +lance.dataset('/tmp/test.lance', version=2).to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Lance I/O Trace Events +DESCRIPTION: Describes events emitted during significant I/O operations, particularly those related to indices, useful for debugging cache utilization. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/performance.md#_snippet_1 + +LANGUAGE: APIDOC +CODE: +``` +Event: lance::io_events + Parameter: type + Description: The type of I/O operation (open_scalar_index, open_vector_index, load_vector_part, load_scalar_part) +``` + +---------------------------------------- + +TITLE: Import libraries and define dataset paths for Flickr8k +DESCRIPTION: This snippet imports essential Python libraries such as `os`, `cv2`, `lance`, `pyarrow`, `matplotlib`, and `tqdm`. It also defines the file paths for the Flickr8k captions file and the image dataset folder, which are crucial for subsequent data processing. It assumes the dataset and required libraries like pyarrow, pylance, opencv, and tqdm are already installed and present. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/flickr8k_dataset_creation.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import os +import cv2 +import random + +import lance +import pyarrow as pa + +import matplotlib.pyplot as plt + +from tqdm.auto import tqdm + +captions = "Flickr8k.token.txt" +image_folder = "Flicker8k_Dataset/" +``` + +---------------------------------------- + +TITLE: Build IVF_PQ index on Lance vector dataset +DESCRIPTION: Builds an IVF_PQ (Inverted File Index with Product Quantization) index on the 'vector' column of the `sift1m` dataset. It configures the index with 256 partitions and 16 sub-vectors for efficient approximate nearest neighbor search, significantly speeding up vector queries. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_19 + +LANGUAGE: Python +CODE: +``` +%%time + +sift1m.create_index( + "vector", + index_type="IVF_PQ", # IVF_PQ, IVF_HNSW_PQ and IVF_HNSW_SQ are supported + num_partitions=256, # IVF + num_sub_vectors=16 # PQ +) +``` + +---------------------------------------- + +TITLE: Python Environment Setup for LanceDB Testing +DESCRIPTION: Sets up the Python environment by ensuring the project's root directory is added to sys.path and preventing bytecode generation. This is crucial for module imports within the project structure. 
+
+SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/full_report/report.ipynb#_snippet_0
+
+LANGUAGE: python
+CODE:
+```
+import sys
+sys.dont_write_bytecode = True
+
+import os
+
+module_path = os.path.abspath(os.path.join('.'))
+if module_path not in sys.path:
+    sys.path.append(module_path)
+```
+
+----------------------------------------
+
+TITLE: Add Metadata Columns to Lance Table
+DESCRIPTION: Appends new feature columns, 'item_id' and 'revenue', to an existing Lance table. This illustrates how to enrich dataset entries with additional metadata before writing them back.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_23
+
+LANGUAGE: python
+CODE:
+```
+tbl = sift1m.to_table()
+tbl = tbl.append_column("item_id", pa.array(range(len(tbl))))
+tbl = tbl.append_column("revenue", pa.array((np.random.randn(len(tbl))+5)*1000))
+tbl.to_pandas()
+```
+
+----------------------------------------
+
+TITLE: Build MacOS x86_64 Wheels
+DESCRIPTION: This command builds release-mode wheels specifically for x86_64 MacOS. It uses `maturin` to compile the project for the `x86_64-apple-darwin` target, storing the resulting wheels in the 'wheels' directory.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_26
+
+LANGUAGE: Shell
+CODE:
+```
+maturin build --release \
+    --target x86_64-apple-darwin \
+    --out wheels
+```
+
+----------------------------------------
+
+TITLE: Overwrite Lance dataset to create new version
+DESCRIPTION: Creates a new Pandas DataFrame with different data. Converts it to a PyArrow Table and overwrites the existing Lance dataset at `/tmp/test.lance` using `mode="overwrite"`, effectively creating a new version of the dataset.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_7
+
+LANGUAGE: Python
+CODE:
+```
+df = pd.DataFrame({"a": [50, 100]})
+tbl = pa.Table.from_pandas(df)
+dataset = lance.write_dataset(tbl, "/tmp/test.lance", mode="overwrite")
+```
+
+----------------------------------------
+
+TITLE: Run Dbpedia-entities-openai Benchmark
+DESCRIPTION: This command executes the 'benchmarks.py' script to run top-k vector queries. The script tests various combinations of IVF and PQ values, as well as 'refine_factor', to evaluate performance. The example specifies a top-k value of 20.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/dbpedia-openai/README.md#_snippet_1
+
+LANGUAGE: sh
+CODE:
+```
+./benchmarks.py -k 20
+```
+
+----------------------------------------
+
+TITLE: Build and Test Lance Rust Package
+DESCRIPTION: These commands clone the Lance repository, navigate to the Rust directory, and then build, test, and benchmark the core Rust components of Lance.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/How-to-Build.md#_snippet_2
+
+LANGUAGE: bash
+CODE:
+```
+git clone https://github.com/lancedb/lance.git
+
+# Build rust package
+cd lance/rust
+cargo build
+
+# Run test
+cargo test
+
+# Run benchmarks
+cargo bench
+```
+
+----------------------------------------
+
+TITLE: Query Lance Dataset with Simple SQL in Python DataFusion
+DESCRIPTION: This Python example shows how to integrate Lance datasets with DataFusion using `FFILanceTableProvider` from `pylance`. It demonstrates registering a Lance dataset as a table and executing a basic SQL `SELECT` query to fetch the first 10 rows, highlighting the Python FFI integration.
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/datafusion.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +from datafusion import SessionContext # pip install datafusion +from lance import FFILanceTableProvider + +ctx = SessionContext() + +table1 = FFILanceTableProvider( + my_lance_dataset, with_row_id=True, with_row_addr=True +) +ctx.register_table_provider("table1", table1) +ctx.table("table1") +ctx.sql("SELECT * FROM table1 LIMIT 10") +``` + +---------------------------------------- + +TITLE: Open a LanceDB Dataset +DESCRIPTION: Provides a basic example of how to open an existing Lance dataset using the `lance.dataset` function. This function can be used to access datasets stored locally or in cloud storage like S3. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_11 + +LANGUAGE: python +CODE: +``` +import lance +ds = lance.dataset("s3://bucket/path/imagenet.lance") +``` + +---------------------------------------- + +TITLE: Build LanceDB in development mode +DESCRIPTION: Builds the Rust native module in place using `maturin`. This command needs to be re-run whenever Rust code changes, but is not required for Python code modifications. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_0 + +LANGUAGE: shell +CODE: +``` +maturin develop +``` + +---------------------------------------- + +TITLE: Lance File Audit Trace Events +DESCRIPTION: Details the events emitted when significant files are created or deleted in Lance, including the mode of I/O operation and the type of file affected. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/performance.md#_snippet_0 + +LANGUAGE: APIDOC +CODE: +``` +Event: lance::file_audit + Parameter: mode + Description: The mode of I/O operation (create, delete, delete_unverified) + Parameter: type + Description: The type of file affected (manifest, data file, index file, deletion file) +``` + +---------------------------------------- + +TITLE: Download Lindera Language Model +DESCRIPTION: Command-line instruction to download a specific Lindera language model (e.g., ipadic, ko-dic, unidic) for LanceDB. Note that `lindera-cli` must be installed beforehand as Lindera models require compilation. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_4 + +LANGUAGE: bash +CODE: +``` +python -m lance.download lindera -l [ipadic|ko-dic|unidic] +``` + +---------------------------------------- + +TITLE: Access a specific historical version of Lance dataset (Version 1) +DESCRIPTION: Loads a specific historical version (version 1) of the Lance dataset and converts it to a Pandas DataFrame, demonstrating the ability to revert to or inspect past states of the data. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_10 + +LANGUAGE: Python +CODE: +``` +lance.dataset('/tmp/test.lance', version=1).to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Decorate Rust Unit Test for Tracing +DESCRIPTION: To enable tracing for a Rust unit test, decorate it with the `#[lance_test_macros::test]` attribute. This macro wraps any existing test attributes, allowing tracing information to be collected during test execution. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_16 + +LANGUAGE: Rust +CODE: +``` +#[lance_test_macros::test(tokio::test)] +async fn test() { + ... 
+} +``` + +---------------------------------------- + +TITLE: Add Rust Toolchain Targets for Cross-Compilation +DESCRIPTION: To build manylinux wheels for different Linux architectures, you must first add the corresponding Rust toolchain targets. These commands add the x86_64 and aarch64 GNU targets, enabling cross-compilation. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_22 + +LANGUAGE: Shell +CODE: +``` +rustup target add x86_64-unknown-linux-gnu +rustup target add aarch64-unknown-linux-gnu +``` + +---------------------------------------- + +TITLE: Query Vectors and Metadata Together in LanceDB +DESCRIPTION: This code demonstrates how to perform a nearest neighbor search in LanceDB while simultaneously retrieving specified metadata columns. It allows users to fetch both vector embeddings and associated feature data ('item_id', 'revenue') in a single query, streamlining data retrieval for applications requiring both. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_21 + +LANGUAGE: python +CODE: +``` +result = sift1m.to_table( + columns=["item_id", "revenue"], + nearest={"column": "vector", "q": samples[0], "k": 10} +) +print(result.to_pandas()) +``` + +---------------------------------------- + +TITLE: Build MacOS ARM64 Wheels +DESCRIPTION: This command builds release-mode wheels specifically for ARM64 (aarch64) MacOS. It uses `maturin` to compile the project for the `aarch64-apple-darwin` target, storing the resulting wheels in the 'wheels' directory. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_25 + +LANGUAGE: Shell +CODE: +``` +maturin build --release \ + --target aarch64-apple-darwin \ + --out wheels +``` + +---------------------------------------- + +TITLE: Rust: WikiTextBatchReader Next Batch Logic +DESCRIPTION: This snippet shows the core logic for the `next` method of the `WikiTextBatchReader`. It attempts to build and retrieve the next Parquet reader from a list of available readers. If a reader is successfully built, it's used; otherwise, it handles errors or indicates that no more readers are available. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/llm_dataset_creation.md#_snippet_1 + +LANGUAGE: Rust +CODE: +``` + if let Some(builder) = self.parquet_readers[self.current_reader_idx].take() { + match builder.build() { + Ok(reader) => { + self.current_reader = Some(Box::new(reader)); + self.current_reader_idx += 1; + continue; + } + Err(e) => { + return Some(Err(arrow::error::ArrowError::ExternalError(Box::new(e)))) + } + } + } + } + + // No more readers available + return None; + } +``` + +---------------------------------------- + +TITLE: Download and Extract SIFT1M Dataset +DESCRIPTION: Downloads the SIFT1M dataset, a common benchmark for vector search, and extracts its contents. This is a prerequisite step for running the subsequent vector search examples. + +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_6 + +LANGUAGE: shell +CODE: +``` +wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz +tar -xzf sift.tar.gz +``` + +---------------------------------------- + +TITLE: Measure Nearest Neighbor Query Performance +DESCRIPTION: Performs multiple nearest neighbor queries on the Lance dataset using a list of sample vectors and measures the average query time. It also prints the resulting table for the last query. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_21 + +LANGUAGE: python +CODE: +``` +import time + +tot = 0 +for q in samples: + start = time.time() + tbl = sift1m.to_table(nearest={"column": "vector", "q": q, "k": 10}) + end = time.time() + tot += (end - start) + +print(f"Avg(sec): {tot / len(samples)}") +print(tbl.to_pandas()) +``` + +---------------------------------------- + +TITLE: Run Rust Unit Test with Tracing Verbosity +DESCRIPTION: Execute a Rust unit test with tracing enabled by setting the `LANCE_TESTING` environment variable to a desired verbosity level (e.g., 'debug', 'info'). This command will generate a JSON trace file in your working directory, which can be viewed in Chrome or Perfetto. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_17 + +LANGUAGE: Bash +CODE: +``` +LANCE_TESTING=debug cargo test dataset::tests::test_create_dataset +``` + +---------------------------------------- + +TITLE: Build Linux x86_64 Manylinux Wheels +DESCRIPTION: This command builds release-mode manylinux wheels for x86_64 Linux. It utilizes `maturin` with `zig` for cross-compilation, targeting `manylinux2014` compatibility, and outputs the generated wheels to the 'wheels' directory. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_23 + +LANGUAGE: Shell +CODE: +``` +maturin build --release --zig \ + --target x86_64-unknown-linux-gnu \ + --compatibility manylinux2014 \ + --out wheels +``` + +---------------------------------------- + +TITLE: Append new rows to an existing Lance dataset +DESCRIPTION: Creates a new Pandas DataFrame with a single row. Converts it to a PyArrow Table and appends it to the existing Lance dataset at `/tmp/test.lance` using `mode="append"`. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_6 + +LANGUAGE: Python +CODE: +``` +df = pd.DataFrame({"a": [10]}) +tbl = pa.Table.from_pandas(df) +dataset = lance.write_dataset(tbl, "/tmp/test.lance", mode="append") + +dataset.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Overwrite Lance Dataset +DESCRIPTION: This snippet demonstrates how to completely overwrite the data in a Lance dataset, effectively creating a new version. A new Pandas DataFrame is prepared and written to the dataset using `mode="overwrite"`, replacing the previous content while preserving the old version for historical access. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_6 + +LANGUAGE: python +CODE: +``` +df = pd.DataFrame({"a": [50, 100]}) +tbl = pa.Table.from_pandas(df) +dataset = lance.write_dataset(tbl, "/tmp/test.lance", mode="overwrite") + +dataset.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Lance Execution Trace Events +DESCRIPTION: Outlines events emitted when an execution plan is run, providing insights into query performance, including output rows, I/O operations, bytes read, and index statistics. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/performance.md#_snippet_2 + +LANGUAGE: APIDOC +CODE: +``` +Event: lance::execution + Parameter: type + Description: The type of execution event (plan_run is the only type today) + Parameter: output_rows + Description: The number of rows in the output of the plan + Parameter: iops + Description: The number of I/O operations performed by the plan + Parameter: bytes_read + Description: The number of bytes read by the plan + Parameter: indices_loaded + Description: The number of indices loaded by the plan + Parameter: parts_loaded + Description: The number of index partitions loaded by the plan + Parameter: index_comparisons + Description: The number of comparisons performed inside the various indices +``` + +---------------------------------------- + +TITLE: Example Console Output of CLIP Model Training Progress +DESCRIPTION: This snippet shows a typical console output during the training of the CLIP model. It displays the epoch number, the progress bar indicating batch processing, and the reported loss value for each epoch, demonstrating the training's progression and convergence. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_10 + +LANGUAGE: console +CODE: +``` +==================== Epoch: 1 / 2 ==================== +loss: 2.0799: 100%|██████████| 253/253 [02:14<00:00, 1.88it/s] + +==================== Epoch: 2 / 2 ==================== +loss: 1.3064: 100%|██████████| 253/253 [02:10<00:00, 1.94it/s] +``` + +---------------------------------------- + +TITLE: Convert SIFT Data to Lance Vector Dataset +DESCRIPTION: This code demonstrates how to convert the raw SIFT 1M dataset, stored in a binary format, into a Lance vector dataset. It involves reading the binary data, reshaping it into a NumPy array, and then using `vec_to_table` and `lance.write_dataset` to store it efficiently for vector search. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_12 + +LANGUAGE: python +CODE: +``` +from lance.vector import vec_to_table +import struct + +uri = "vec_data.lance" + +with open("sift/sift_base.fvecs", mode="rb") as fobj: + buf = fobj.read() + data = np.array(struct.unpack("<128000000f", buf[4 : 4 + 4 * 1000000 * 128])).reshape((1000000, 128)) + dd = dict(zip(range(1000000), data)) + +table = vec_to_table(dd) +lance.write_dataset(table, uri, max_rows_per_group=8192, max_rows_per_file=1024*1024) +``` + +---------------------------------------- + +TITLE: Perform KNN Search on Lance Dataset (No Index) +DESCRIPTION: This snippet illustrates how to perform a K-Nearest Neighbors (KNN) search on a Lance dataset without utilizing an index. It measures the execution time to highlight the performance implications of a full dataset scan, demonstrating the need for ANN indexes in real-time scenarios. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_15 + +LANGUAGE: python +CODE: +``` +import time + +start = time.time() +tbl = sift1m.to_table(columns=["id"], nearest={"column": "vector", "q": samples[0], "k": 10}) +end = time.time() + +print(f"Time(sec): {end-start}") +print(tbl.to_pandas()) +``` + +---------------------------------------- + +TITLE: Build Linux ARM64 Manylinux Wheels +DESCRIPTION: This command builds release-mode manylinux wheels for ARM64 (aarch64) Linux. 
It uses `maturin` with `zig` for cross-compilation, targeting `manylinux2014` compatibility, and places the output wheels in the 'wheels' directory.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_24
+
+LANGUAGE: Shell
+CODE:
+```
+maturin build --release --zig \
+    --target aarch64-unknown-linux-gnu \
+    --compatibility manylinux2014 \
+    --out wheels
+```
+
+----------------------------------------
+
+TITLE: Overwrite Lance Dataset with New Features
+DESCRIPTION: Writes the modified table, including newly added feature columns, back to the Lance dataset URI, overwriting the existing dataset. This updates the dataset with enriched data.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_24
+
+LANGUAGE: python
+CODE:
+```
+sift1m = lance.write_dataset(tbl, uri, mode="overwrite")
+```
+
+----------------------------------------
+
+TITLE: Append Metadata Columns to LanceDB Dataset
+DESCRIPTION: This Python snippet illustrates how to append additional metadata columns, such as 'item_id' and 'revenue', to an existing LanceDB dataset. This allows for storing and managing feature data alongside vector embeddings within the same dataset, simplifying data management.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_20
+
+LANGUAGE: python
+CODE:
+```
+tbl = sift1m.to_table()
+tbl = tbl.append_column("item_id", pa.array(range(len(tbl))))
+tbl = tbl.append_column("revenue", pa.array((np.random.randn(len(tbl))+5)*1000))
+```
+
+----------------------------------------
+
+TITLE: Create Vector Index in LanceDB (IVF_PQ)
+DESCRIPTION: This code demonstrates how to create a vector index on a LanceDB dataset. It specifies the vector column, index type (IVF_PQ, IVF_HNSW_PQ, IVF_HNSW_SQ are supported), number of partitions for IVF, and number of sub-vectors for PQ. This improves the efficiency of Approximate Nearest Neighbor (ANN) searches.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_16
+
+LANGUAGE: python
+CODE:
+```
+sift1m.create_index(
+    "vector",
+    index_type="IVF_PQ",  # IVF_PQ, IVF_HNSW_PQ and IVF_HNSW_SQ are supported
+    num_partitions=256,  # IVF
+    num_sub_vectors=16,  # PQ
+)
+```
+
+----------------------------------------
+
+TITLE: Convert SIFT FVECS data to Lance vector dataset
+DESCRIPTION: Imports `vec_to_table` from `lance.vector` and `struct`. Reads the SIFT base vectors from `sift_base.fvecs`, unpacks the binary data into a NumPy array, and converts it into a PyArrow Table using `vec_to_table`. Finally, it writes this table to a Lance dataset named `vec_data.lance`, optimizing for vector storage.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_15
+
+LANGUAGE: Python
+CODE:
+```
+from lance.vector import vec_to_table
+import struct
+
+uri = "vec_data.lance"
+
+with open("sift/sift_base.fvecs", mode="rb") as fobj:
+    buf = fobj.read()
+    data = np.array(struct.unpack("<128000000f", buf[4 : 4 + 4 * 1000000 * 128])).reshape((1000000, 128))
+    dd = dict(zip(range(1000000), data))
+
+table = vec_to_table(dd)
+lance.write_dataset(table, uri, max_rows_per_group=8192, max_rows_per_file=1024*1024)
+```
+
+----------------------------------------
+
+TITLE: Read Lance Dataset in Java
+DESCRIPTION: This Java snippet demonstrates how to open and access an existing Lance dataset. It uses `Dataset.open` with the dataset's path and a `BufferAllocator` to load the dataset.
Once opened, it shows how to retrieve basic information such as row count, schema, and version details, providing a starting point for data querying and manipulation. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_3 + +LANGUAGE: Java +CODE: +``` +void readDataset() { + String datasetPath = ""; // specify a path point to a dataset + try (BufferAllocator allocator = new RootAllocator()) { + try (Dataset dataset = Dataset.open(datasetPath, allocator)) { + dataset.countRows(); + dataset.getSchema(); + dataset.version(); + dataset.latestVersion(); + // access more information + } + } +} +``` + +---------------------------------------- + +TITLE: Execute Python S3 Integration Tests +DESCRIPTION: Once local S3 services are running, this command executes the Python S3 integration tests using `pytest`. The `--run-integration` flag ensures that tests requiring external services are included, specifically targeting the `test_s3_ddb.py` file. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_21 + +LANGUAGE: Shell +CODE: +``` +pytest --run-integration python/tests/test_s3_ddb.py +``` + +---------------------------------------- + +TITLE: Perform Random Access on Lance Dataset in Java +DESCRIPTION: This Java example demonstrates how to perform random access queries on a Lance dataset, retrieving specific rows and columns. It opens an existing dataset, specifies a list of row indices and desired column names, and then uses `dataset.take` to fetch the corresponding data. The results are processed using an `ArrowReader` to iterate through batches and access individual field values. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_5 + +LANGUAGE: Java +CODE: +``` +void randomAccess() { + String datasetPath = ""; // specify a path point to a dataset + try (BufferAllocator allocator = new RootAllocator()) { + try (Dataset dataset = Dataset.open(datasetPath, allocator)) { + List indices = Arrays.asList(1L, 4L); + List columns = Arrays.asList("id", "name"); + try (ArrowReader reader = dataset.take(indices, columns)) { + while (reader.loadNextBatch()) { + VectorSchemaRoot result = reader.getVectorSchemaRoot(); + result.getRowCount(); + + for (int i = 0; i < indices.size(); i++) { + result.getVector("id").getObject(i); + result.getVector("name").getObject(i); + } + } + } + } + } +} +``` + +---------------------------------------- + +TITLE: Load Subset of Lance Dataset with Projection and Predicates +DESCRIPTION: This Python example illustrates how to efficiently load a subset of a Lance dataset into memory. It utilizes column projection (`columns`), filter push-down (`filter`), and pagination (`limit`, `offset`) to optimize data retrieval for large datasets by reducing I/O. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_14 + +LANGUAGE: python +CODE: +``` +table = ds.to_table( + columns=["image", "label"], + filter="label = 2 AND text IS NOT NULL", + limit=1000, + offset=3000) +``` + +---------------------------------------- + +TITLE: Create PyTorch DataLoader from LanceDataset (Unsafe) +DESCRIPTION: This example shows how to load a Lance dataset into a PyTorch `IterableDataset` using `lance.torch.data.LanceDataset` and then create a standard PyTorch `DataLoader`. It highlights an inference loop, but notes that this approach is not fork-safe for multiprocessing. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/pytorch.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +import torch +import lance.torch.data + +# Load lance dataset into a PyTorch IterableDataset. +# with only columns "image" and "prompt". +dataset = lance.torch.data.LanceDataset( + "diffusiondb_train.lance", + columns=["image", "prompt"], + batch_size=128, + batch_readahead=8, # Control multi-threading reads. +) + +# Create a PyTorch DataLoader +dataloader = torch.utils.data.DataLoader(dataset) + +# Inference loop +for batch in dataloader: + inputs, targets = batch["prompt"], batch["image"] + outputs = model(inputs) + ... +``` + +---------------------------------------- + +TITLE: Manage LanceDB Dataset Tags (Create, Update, Delete, List) +DESCRIPTION: This Python example demonstrates how to interact with `LanceDataset.tags` to manage dataset versions. It covers creating a tag for a specific version, updating its associated version, listing all tags, and finally deleting a tag. It also shows how `list_ordered()` can be used to retrieve tags in the order they were created or last updated. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tags.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import lance +ds = lance.dataset("./tags.lance") +print(len(ds.versions())) +# 2 +print(ds.tags.list()) +# {} +ds.tags.create("v1-prod", 1) +print(ds.tags.list()) +# {'v1-prod': {'version': 1, 'manifest_size': ...}} +ds.tags.update("v1-prod", 2) +print(ds.tags.list()) +# {'v1-prod': {'version': 2, 'manifest_size': ...}} +ds.tags.delete("v1-prod") +print(ds.tags.list()) +# {} +print(ds.tags.list_ordered()) +# [] +ds.tags.create("v1-prod", 1) +print(ds.tags.list_ordered()) +# [('v1-prod', {'version': 1, 'manifest_size': ...})] +ds.tags.update("v1-prod", 2) +print(ds.tags.list_ordered()) +# [('v1-prod', {'version': 2, 'manifest_size': ...})] +ds.tags.delete("v1-prod") +print(ds.tags.list_ordered()) +# [] +``` + +---------------------------------------- + +TITLE: Write Pandas DataFrame to Lance Dataset +DESCRIPTION: Removes any existing Lance dataset at `/tmp/test.lance` to ensure a clean write. Then, it writes the Pandas DataFrame `df` to a new Lance dataset and converts the resulting dataset back to a Pandas DataFrame for verification. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_2 + +LANGUAGE: Python +CODE: +``` +shutil.rmtree("/tmp/test.lance", ignore_errors=True) + +dataset = lance.write_dataset(df, "/tmp/test.lance") +dataset.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Perform K-Nearest Neighbors search without an index +DESCRIPTION: Measures the time taken to perform a K-Nearest Neighbors (KNN) search on the `sift1m` dataset. It queries for the 10 nearest neighbors to the first sampled vector (`samples[0]`) based on the 'vector' column, demonstrating a full scan approach. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/quickstart.ipynb#_snippet_18 + +LANGUAGE: Python +CODE: +``` +import time + +start = time.time() +tbl = sift1m.to_table(columns=["id"], nearest={"column": "vector", "q": samples[0], "k": 10}) +end = time.time() + +print(f"Time(sec): {end-start}") +print(tbl.to_pandas()) +``` + +---------------------------------------- + +TITLE: Write Pandas DataFrame to Lance Dataset +DESCRIPTION: This snippet shows how to persist a Pandas DataFrame into a Lance dataset. 
It first ensures a clean state by removing any existing file and then uses `lance.write_dataset` to save the DataFrame, followed by reading it back to confirm the write operation. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +shutil.rmtree("/tmp/test.lance", ignore_errors=True) + +dataset = lance.write_dataset(df, "/tmp/test.lance") +dataset.to_table().to_pandas() +``` + +---------------------------------------- + +TITLE: Join Multiple Lance Datasets with SQL in Rust DataFusion +DESCRIPTION: This Rust example illustrates how to register multiple Lance datasets (e.g., 'orders' and 'customers') as separate tables in DataFusion. It then performs a SQL `JOIN` operation between these tables to combine data based on a common key, demonstrating more complex query capabilities. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/datafusion.md#_snippet_1 + +LANGUAGE: rust +CODE: +``` +use datafusion::prelude::SessionContext; +use crate::datafusion::LanceTableProvider; + +let ctx = SessionContext::new(); + +ctx.register_table("orders", + Arc::new(LanceTableProvider::new( + Arc::new(orders_dataset.clone()), + /* with_row_id */ false, + /* with_row_addr */ false, + )))?; + +ctx.register_table("customers", + Arc::new(LanceTableProvider::new( + Arc::new(customers_dataset.clone()), + /* with_row_id */ false, + /* with_row_addr */ false, + )))?; + +let df = ctx.sql(" + SELECT o.order_id, o.amount, c.customer_name + FROM orders o + JOIN customers c ON o.customer_id = c.customer_id + LIMIT 10 +").await?; + +let result = df.collect().await?; +``` + +---------------------------------------- + +TITLE: Read ImageURIs into Lance EncodedImageArray +DESCRIPTION: This example shows how to use `ImageURIArray.read_uris()` to load images referenced by URIs into memory. The method returns an `EncodedImageArray` containing the binary data of the images, enabling direct processing of image content. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/arrays.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +from lance.arrow import ImageURIArray + +relative_path = "images/1.png" +uris = [os.path.join(os.path.dirname(__file__), relative_path)] +ImageURIArray.from_uris(uris).read_uris() +# +# [b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00...'] +``` + +---------------------------------------- + +TITLE: Create and Write Lance Dataset from Arrow Stream in Java +DESCRIPTION: This Java example illustrates how to create a Lance dataset and populate it with data from an existing Arrow file. It reads bytes from a source path, converts them into an `ArrowArrayStream`, and then uses `Dataset.create` with `WriteParams` to configure writing options like `maxRowsPerFile`, `maxRowsPerGroup`, and `WriteMode`. This method is suitable for ingesting data from Arrow-formatted sources. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_2 + +LANGUAGE: Java +CODE: +``` +void createAndWriteDataset() throws IOException, URISyntaxException { + Path path = ""; // the original source path + String datasetPath = ""; // specify a path point to a dataset + try (BufferAllocator allocator = new RootAllocator(); + ArrowFileReader reader = + new ArrowFileReader( + new SeekableReadChannel( + new ByteArrayReadableSeekableByteChannel(Files.readAllBytes(path))), allocator); + ArrowArrayStream arrowStream = ArrowArrayStream.allocateNew(allocator)) { + Data.exportArrayStream(allocator, reader, arrowStream); + try (Dataset dataset = + Dataset.create( + allocator, + arrowStream, + datasetPath, + new WriteParams.Builder() + .withMaxRowsPerFile(10) + .withMaxRowsPerGroup(20) + .withMode(WriteParams.WriteMode.CREATE) + .withStorageOptions(new HashMap<>()) + .build())) { + // access dataset + } + } +} +``` + +---------------------------------------- + +TITLE: Generate Flame Graph from Process ID +DESCRIPTION: Generates a flame graph for a running process using its Process ID (PID). This command is used to capture and visualize CPU profiles, helping to identify performance bottlenecks in an application. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/Debug.md#_snippet_5 + +LANGUAGE: sh +CODE: +``` +flamegraph -p +``` + +---------------------------------------- + +TITLE: Create Lance BFloat16 Arrow Array +DESCRIPTION: This example illustrates how to construct a `BFloat16Array` directly using the `lance.arrow.bfloat16_array` function. It takes a list of floating-point numbers and converts them into an Arrow array with BFloat16 precision, suitable for Arrow-based data processing. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/arrays.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +from lance.arrow import bfloat16_array + +bfloat16_array([1.1, 2.1, 3.4]) +# +# [ +# 1.1015625, +# 2.09375, +# 3.40625 +# ] +``` + +---------------------------------------- + +TITLE: Clone LanceDB GitHub Repository +DESCRIPTION: Instructions to clone the LanceDB project repository from GitHub to a local machine. This is the first step for setting up the development environment. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_11 + +LANGUAGE: shell +CODE: +``` +git clone https://github.com/lancedb/lance.git +``` + +---------------------------------------- + +TITLE: Rust Implementation of WikiTextBatchReader +DESCRIPTION: This Rust code defines `WikiTextBatchReader`, a custom implementation of `arrow::record_batch::RecordBatchReader`. It's designed to read text data from Parquet files, tokenize it using a `Tokenizer` from the `tokenizers` crate, and transform it into Arrow `RecordBatch`es. The `process_batch` method handles tokenization, limits the number of samples, and shuffles the tokenized IDs before creating the final `RecordBatch`. 
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/llm_dataset_creation.md#_snippet_0
+
+LANGUAGE: rust
+CODE:
+```
+use arrow::array::{Array, Int64Builder, ListBuilder, StringArray, UInt32Array};
+use arrow::datatypes::{DataType, Field, Schema};
+use arrow::record_batch::RecordBatch;
+use arrow::record_batch::RecordBatchReader;
+use futures::StreamExt;
+use hf_hub::{api::sync::Api, Repo, RepoType};
+use lance::dataset::WriteParams;
+use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+use rand::seq::SliceRandom;
+use rand::SeedableRng;
+use std::error::Error;
+use std::fs::File;
+use std::io::Write;
+use std::sync::Arc;
+use tempfile::NamedTempFile;
+use tokenizers::Tokenizer;
+
+// Implement a custom stream batch reader
+struct WikiTextBatchReader {
+    schema: Arc<Schema>,
+    parquet_readers: Vec<Option<ParquetRecordBatchReaderBuilder<File>>>,
+    current_reader_idx: usize,
+    current_reader: Option<Box<dyn RecordBatchReader>>,
+    tokenizer: Tokenizer,
+    num_samples: u64,
+    cur_samples_cnt: u64,
+}
+
+impl WikiTextBatchReader {
+    fn new(
+        parquet_readers: Vec<ParquetRecordBatchReaderBuilder<File>>,
+        tokenizer: Tokenizer,
+        num_samples: Option<u64>,
+    ) -> Result<Self, Box<dyn Error>> {
+        let schema = Arc::new(Schema::new(vec![Field::new(
+            "input_ids",
+            DataType::List(Arc::new(Field::new("item", DataType::Int64, true))),
+            false,
+        )]));
+
+        Ok(Self {
+            schema,
+            parquet_readers: parquet_readers.into_iter().map(Some).collect(),
+            current_reader_idx: 0,
+            current_reader: None,
+            tokenizer,
+            num_samples: num_samples.unwrap_or(100_000),
+            cur_samples_cnt: 0,
+        })
+    }
+
+    fn process_batch(
+        &mut self,
+        input_batch: &RecordBatch,
+    ) -> Result<RecordBatch, arrow::error::ArrowError> {
+        let num_rows = input_batch.num_rows();
+        let mut token_builder = ListBuilder::new(Int64Builder::with_capacity(num_rows * 1024)); // Pre-allocate space
+        let mut should_break = false;
+
+        let column = input_batch.column_by_name("text").unwrap();
+        let string_array = column
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .unwrap();
+        for i in 0..num_rows {
+            if self.cur_samples_cnt >= self.num_samples {
+                should_break = true;
+                break;
+            }
+            if !Array::is_null(string_array, i) {
+                let text = string_array.value(i);
+                // Split paragraph into lines
+                for line in text.split('\n') {
+                    if let Ok(encoding) = self.tokenizer.encode(line, true) {
+                        let tb_values = token_builder.values();
+                        for &id in encoding.get_ids() {
+                            tb_values.append_value(id as i64);
+                        }
+                        token_builder.append(true);
+                        self.cur_samples_cnt += 1;
+                        if self.cur_samples_cnt % 5000 == 0 {
+                            println!("Processed {} rows", self.cur_samples_cnt);
+                        }
+                        if self.cur_samples_cnt >= self.num_samples {
+                            should_break = true;
+                            break;
+                        }
+                    }
+                }
+            }
+        }
+
+        // Create array and shuffle it
+        let input_ids_array = token_builder.finish();
+
+        // Create shuffled array by randomly sampling indices
+        let mut rng = rand::rngs::StdRng::seed_from_u64(1337);
+        let len = input_ids_array.len();
+        let mut indices: Vec<u32> = (0..len as u32).collect();
+        indices.shuffle(&mut rng);
+
+        // Take values in shuffled order
+        let indices_array = UInt32Array::from(indices);
+        let shuffled = arrow::compute::take(&input_ids_array, &indices_array, None)?;
+
+        let batch = RecordBatch::try_new(self.schema.clone(), vec![Arc::new(shuffled)]);
+        if should_break {
+            println!("Stop at {} rows", self.cur_samples_cnt);
+            self.parquet_readers.clear();
+            self.current_reader = None;
+        }
+
+        batch
+    }
+}
+
+impl RecordBatchReader for WikiTextBatchReader {
+    fn schema(&self) -> Arc<Schema> {
+        self.schema.clone()
+    }
+}
+
+impl Iterator for WikiTextBatchReader {
+    type Item = Result<RecordBatch, arrow::error::ArrowError>;
+    fn next(&mut self) -> Option<Self::Item> {
+        loop {
+            // If we have a current reader, try to get next batch
+ if let Some(reader) = &mut self.current_reader { + if let Some(batch_result) = reader.next() { + return Some(batch_result.and_then(|batch| self.process_batch(&batch))); + } + } + + // If no current reader or current reader is exhausted, try to get next reader + if self.current_reader_idx < self.parquet_readers.len() { +``` + +---------------------------------------- + +TITLE: Inefficient Row Update by Iteration +DESCRIPTION: Provides an example of an inefficient way to update multiple individual rows by iterating through a table and calling `update` for each row. It notes that a merge insert operation is generally more efficient for bulk updates. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_6 + +LANGUAGE: python +CODE: +``` +import lance + +# Change the ages of both Alice and Bob +new_table = pa.Table.from_pylist([{"name": "Alice", "age": 30}, + {"name": "Bob", "age": 20}]) + +# This works, but is inefficient, see below for a better approach +dataset = lance.dataset("./alice_and_bob.lance") +for idx in range(new_table.num_rows): + name = new_table[0][idx].as_py() + new_age = new_table[1][idx].as_py() + dataset.update({"age": new_age}, where=f"name='{name}'") +``` + +---------------------------------------- + +TITLE: Generate and Merge Columns in Parallel with Ray and Lance +DESCRIPTION: This example illustrates how to generate new columns in parallel using Ray and Lance. It defines an Arrow schema, creates an initial dataset with 'id', 'height', and 'weight' columns, and then uses a custom Python function (`generate_labels`) to add a new 'size_labels' column based on existing 'height' data, demonstrating Lance's `add_columns` functionality for parallel processing. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/ray.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +import pyarrow as pa +from pathlib import Path +import lance + +# Define schema +schema = pa.schema([ + pa.field("id", pa.int64()), + pa.field("height", pa.int64()), + pa.field("weight", pa.int64()), +]) + +# Generate initial dataset +ds = ( + ray.data.range(10) # Create 0-9 IDs + .map(lambda x: { + "id": x["id"], + "height": x["id"] + 5, # height = id + 5 + "weight": x["id"] * 2 # weight = id * 2 + }) + .write_lance(str(output_path), schema=schema) +) + +# Define label generation logic +def generate_labels(batch: pa.RecordBatch) -> pa.RecordBatch: + heights = batch.column("height").to_pylist() + size_labels = ["tall" if h > 8 else "medium" if h > 6 else "short" for h in heights] + return pa.RecordBatch.from_arrays([ + pa.array(size_labels) + ], names=["size_labels"]) + +# Add new columns in parallel +lance_ds = lance.dataset(output_path) +add_columns( + lance_ds, + generate_labels, + source_columns=["height"], # Input columns needed +) + +# Display final results +final_df = lance_ds.to_table().to_pandas() +print("\nEnhanced dataset with size labels:\n") +print(final_df.sort_values("id").to_string(index=False)) +``` + +---------------------------------------- + +TITLE: Configure Python Benchmark for Single Iteration Tracing +DESCRIPTION: When tracing Python benchmarks, it's often useful to force them to run only once for sensible results. This snippet demonstrates how to use the `pedantic` API to limit a benchmark to a single iteration and round, ensuring a focused trace. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_19 + +LANGUAGE: Python +CODE: +``` +def run(): + "Put code to benchmark here" + ... 
+benchmark.pedantic(run, iterations=1, rounds=1) +``` + +---------------------------------------- + +TITLE: Enable Tracing for Python Script +DESCRIPTION: To trace a Python script, import the `trace_to_chrome` function from `lance.tracing` and call it at the beginning of your script, specifying the desired tracing level. A single JSON trace file will be generated upon the script's exit, suitable for Chrome's trace viewer. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/DEVELOPMENT.md#_snippet_18 + +LANGUAGE: Python +CODE: +``` +from lance.tracing import trace_to_chrome + +trace_to_chrome(level="debug") + +# rest of script +``` + +---------------------------------------- + +TITLE: LanceDB Encoding Metadata Key Specifications +DESCRIPTION: This section provides a detailed specification of the metadata keys used in LanceDB for column-level encoding. It describes each key's type, purpose, example values, and how it's used in Python to configure data storage and optimization. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_8 + +LANGUAGE: APIDOC +CODE: +``` +Metadata Key Specifications: +- lance-encoding:compression + Type: Compression + Description: Specifies compression algorithm + Example Values: zstd + Example Usage (Python): metadata={"lance-encoding:compression": "zstd"} +- lance-encoding:compression-level + Type: Compression + Description: Zstd compression level (1-22) + Example Values: 3 + Example Usage (Python): metadata={"lance-encoding:compression-level": "3"} +- lance-encoding:blob + Type: Storage + Description: Marks binary data (>4MB) for chunked storage + Example Values: true/false + Example Usage (Python): metadata={"lance-encoding:blob": "true"} +- lance-encoding:packed + Type: Optimization + Description: Struct memory layout optimization + Example Values: true/false + Example Usage (Python): metadata={"lance-encoding:packed": "true"} +- lance-encoding:structural-encoding + Type: Nested Data + Description: Encoding strategy for nested structures + Example Values: miniblock/fullzip + Example Usage (Python): metadata={"lance-encoding:structural-encoding": "miniblock"} +``` + +---------------------------------------- + +TITLE: Initialize Tokenizer and Load Wikitext Dataset (Python) +DESCRIPTION: This snippet initializes a Hugging Face tokenizer (gpt2) and loads the wikitext-103-raw-v1 dataset in streaming mode. The 'streaming=True' argument is crucial for processing large datasets without downloading the entire dataset upfront, allowing samples to be downloaded as needed. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_dataset_creation.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import lance +import pyarrow as pa + +from datasets import load_dataset +from transformers import AutoTokenizer +from tqdm.auto import tqdm # optional for progress tracking + +tokenizer = AutoTokenizer.from_pretrained('gpt2') + +dataset = load_dataset('wikitext', 'wikitext-103-raw-v1', streaming=True)['train'] +dataset = dataset.shuffle(seed=1337) +``` + +---------------------------------------- + +TITLE: Example of Hierarchical Schema Definition +DESCRIPTION: This snippet demonstrates a sample schema definition within the LanceDB data format, showcasing primitive types, nested structs, and lists. It illustrates how complex data structures are defined before being flattened into a field list for metadata representation. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_6 + +LANGUAGE: APIDOC +CODE: +``` +a: i32 +b: struct { + c: list + d: i32 +} +``` + +---------------------------------------- + +TITLE: Define Custom PyTorch Dataset for Lance Data +DESCRIPTION: The `LanceDataset` class extends PyTorch's `Dataset` to provide an interface for loading data from a Lance dataset. It initializes by loading the specified Lance dataset and setting a `block_size` for token windows. The `__len__` method calculates the total number of possible starting indices, while `__getitem__` generates a window of indices and uses the `from_indices` utility to load and return corresponding 'input_ids' and 'labels' as PyTorch tensors, forming a causal sample for LLM training. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_training.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +class LanceDataset(Dataset): + def __init__( + self, + dataset_path, + block_size, + ): + # Load the lance dataset from the saved path + self.ds = lance.dataset(dataset_path) + self.block_size = block_size + + # Doing this so the sampler never asks for an index at the end of text + self.length = self.ds.count_rows() - block_size + + def __len__(self): + return self.length + + def __getitem__(self, idx): + """ + Generate a window of indices starting from the current idx to idx+block_size + and return the tokens at those indices + """ + window = np.arange(idx, idx + self.block_size) + sample = from_indices(self.ds, window) + + return {"input_ids": torch.tensor(sample), "labels": torch.tensor(sample)} +``` + +---------------------------------------- + +TITLE: Complex SQL Filter Expression for Lance Dataset +DESCRIPTION: This SQL snippet provides an example of a complex filter expression that can be pushed down to the Lance storage system. It demonstrates the use of `IN`, `AND`, `OR`, `NOT`, and nested field access for filtering data efficiently at the storage layer. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_16 + +LANGUAGE: sql +CODE: +``` +((label IN [10, 20]) AND (note['email'] IS NOT NULL)) + OR NOT note['created'] +``` + +---------------------------------------- + +TITLE: Tune ANN Search Parameters in LanceDB (nprobes, refine_factor) +DESCRIPTION: This code demonstrates how to tune the performance of an Approximate Nearest Neighbor (ANN) search in LanceDB by adjusting 'nprobes' and 'refine_factor'. 'nprobes' controls the number of IVF partitions to search, while 'refine_factor' determines how many vectors are retrieved for re-ranking, balancing latency and recall. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_18 + +LANGUAGE: python +CODE: +``` +%%time + +sift1m.to_table( + nearest={ + "column": "vector", + "q": samples[0], + "k": 10, + "nprobes": 10, + "refine_factor": 5, + } +).to_pandas() +``` + +---------------------------------------- + +TITLE: Querying Lance Datasets with DuckDB in Python +DESCRIPTION: This snippet demonstrates how to perform SQL queries on a Lance dataset using DuckDB in Python. It shows examples of selecting all data and calculating the mean of a column, illustrating DuckDB's direct access to Lance datasets via Arrow compatibility. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/duckdb.md#_snippet_0 + +LANGUAGE: Python +CODE: +``` +import duckdb # pip install duckdb + +duckdb.query("SELECT * FROM my_lance_dataset") +# ┌─────────────┬─────────┬────────┐ +# │ vector │ item │ price │ +# │ float[] │ varchar │ double │ +# ├─────────────┼─────────┼────────┤ +# │ [3.1, 4.1] │ foo │ 10.0 │ +# │ [5.9, 26.5] │ bar │ 20.0 │ +# └─────────────┴─────────┴────────┘ + +duckdb.query("SELECT mean(price) FROM my_lance_dataset") +# ┌─────────────┐ +# │ mean(price) │ +# │ double │ +# ├─────────────┤ +# │ 15.0 │ +# └─────────────┘ +``` + +---------------------------------------- + +TITLE: Use Sharded Sampler with LanceDataset for Distributed Training +DESCRIPTION: This example illustrates how to integrate `lance.sampler.ShardedFragmentSampler` with `LanceDataset` to control the data sampling strategy for distributed training environments. It shows how to configure the sampler with the current process's rank and the total number of processes (world size) for sharded data access. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/pytorch.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +from lance.sampler import ShardedFragmentSampler +from lance.torch.data import LanceDataset + +# Load lance dataset into a PyTorch IterableDataset. +# with only columns "image" and "prompt". +dataset = LanceDataset( + "diffusiondb_train.lance", + columns=["image", "prompt"], + batch_size=128, + batch_readahead=8, # Control multi-threading reads. + sampler=ShardedFragmentSampler( + rank=1, # Rank of the current process + world_size=8, # Total number of processes + ), +) +``` + +---------------------------------------- + +TITLE: Filter and Select Columns from Lance Dataset in TensorFlow +DESCRIPTION: This example illustrates efficient data loading from Lance into TensorFlow by specifying desired columns and applying filter conditions. It leverages Lance's columnar format for optimized data retrieval, reducing memory and processing overhead. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/tensorflow.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +ds = lance.tf.data.from_lance( + "s3://my-bucket/my-dataset", + columns=["image", "label"], + filter="split = 'train' AND collected_time > timestamp '2020-01-01'", + batch_size=256) +``` + +---------------------------------------- + +TITLE: Python: Decode EncodedImageArray to FixedShapeImageTensorArray +DESCRIPTION: This Python example demonstrates how to load images from URIs into an `ImageURIArray`, read them into an `EncodedImageArray`, and then decode them into a `FixedShapeImageTensorArray`. It also illustrates how to provide a custom TensorFlow-based decoder function for the `to_tensor` method, allowing for flexible image processing. 
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/arrays.md#_snippet_5
+
+LANGUAGE: python
+CODE:
+```
+import os
+
+from lance.arrow import ImageURIArray
+
+uris = [os.path.join(os.path.dirname(__file__), "images/1.png")]
+encoded_images = ImageURIArray.from_uris(uris).read_uris()
+print(encoded_images.to_tensor())
+
+def tensorflow_decoder(images):
+    import tensorflow as tf
+    import numpy as np
+
+    return np.stack([tf.io.decode_png(img.as_py(), channels=3) for img in images.storage])
+
+print(encoded_images.to_tensor(tensorflow_decoder))
+```
+
+----------------------------------------
+
+TITLE: Add and Populate Columns with Python UDF in Lance
+DESCRIPTION: Shows how to add and populate new columns in a Lance dataset using a custom Python function (UDF). The UDF processes data in batches, and the example includes using `lance.batch_udf` with checkpointing for robust, expensive computations.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/data_evolution.md#_snippet_2
+
+LANGUAGE: python
+CODE:
+```
+import lance
+import pyarrow as pa
+import pandas as pd
+import numpy as np
+
+table = pa.table({"id": pa.array([1, 2, 3])})
+dataset = lance.write_dataset(table, "ids")
+
+@lance.batch_udf(checkpoint_file="embedding_checkpoint.sqlite")
+def add_random_vector(batch):
+    embeddings = np.random.rand(batch.num_rows, 128).astype("float32")
+    return pd.DataFrame({"embedding": list(embeddings)})
+dataset.add_columns(add_random_vector)
+```
+
+----------------------------------------
+
+TITLE: Construct OpenAI prompt with context
+DESCRIPTION: Defines a function `create_prompt` that takes a query and contextual information to build a structured prompt for a large language model. It dynamically appends context, ensuring the total prompt length stays within a specified token limit for the LLM.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_10
+
+LANGUAGE: python
+CODE:
+```
+def create_prompt(query, context):
+    limit = 3750
+
+    prompt_start = (
+        "Answer the question based on the context below.\n\n"+
+        "Context:\n"
+    )
+    prompt_end = (
+        f"\n\nQuestion: {query}\nAnswer:"
+    )
+    # append contexts until hitting limit
+    for i in range(1, len(context)):
+        if len("\n\n---\n\n".join(context.text[:i])) >= limit:
+            prompt = (
+                prompt_start +
+                "\n\n---\n\n".join(context.text[:i-1]) +
+                prompt_end
+            )
+            break
+        elif i == len(context)-1:
+            prompt = (
+                prompt_start +
+                "\n\n---\n\n".join(context.text) +
+                prompt_end
+            )
+    return prompt
+```
+
+----------------------------------------
+
+TITLE: Set DYLD_LIBRARY_PATH for Lance Python Debugging in LLDB
+DESCRIPTION: Configures the `DYLD_LIBRARY_PATH` environment variable specifically for debugging Lance Python projects within LLDB. This ensures that the dynamic linker can find necessary shared libraries located in the third-party distribution directory.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/Debug.md#_snippet_1
+
+LANGUAGE: lldb
+CODE:
+```
+# /path/to/lance/python/.lldbinit
+env DYLD_LIBRARY_PATH=/path/to/thirdparty/dist/lib:${DYLD_LIBRARY_PATH}
+```
+
+----------------------------------------
+
+TITLE: Rename Top-Level Columns in LanceDB Dataset
+DESCRIPTION: This snippet illustrates how to rename top-level columns in a LanceDB dataset using the `lance.LanceDataset.alter_columns` method. It shows a simple example of changing a column name and verifying the change by printing the dataset as a Pandas DataFrame.
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/data_evolution.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +table = pa.table({"id": pa.array([1, 2, 3])}) +dataset = lance.write_dataset(table, "ids") +dataset.alter_columns({"path": "id", "name": "new_id"}) +print(dataset.to_table().to_pandas()) +# new_id +# 0 1 +# 1 2 +# 2 3 +``` + +---------------------------------------- + +TITLE: Python: Encode FixedShapeImageTensorArray to EncodedImageArray +DESCRIPTION: This Python example shows how to convert a `FixedShapeImageTensorArray` back into an `EncodedImageArray`. It first obtains a tensor array by decoding an `EncodedImageArray` (which was read from URIs) and then calls the `to_encoded()` method. This process is useful for saving processed images back into a compressed format. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/arrays.md#_snippet_6 + +LANGUAGE: python +CODE: +``` +from lance.arrow import ImageURIArray + +uris = [image_uri] +tensor_images = ImageURIArray.from_uris(uris).read_uris().to_tensor() +tensor_images.to_encoded() +``` + +---------------------------------------- + +TITLE: Initialize LLM Training Environment with GPT2 and Lance +DESCRIPTION: This snippet imports essential libraries for LLM training, including Lance, PyTorch, and Hugging Face Transformers. It initializes the GPT2 tokenizer and model from pre-trained weights. Key hyperparameters such as learning rate, epochs, block size, batch size, device, and the Lance dataset path are defined, preparing the environment for subsequent data loading and model training. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_training.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import numpy as np +import lance + +import torch +from torch.utils.data import Dataset, DataLoader, Sampler + +from transformers import AutoTokenizer, AutoModelForCausalLM +from tqdm.auto import tqdm + +# We'll be training the pre-trained GPT2 model in this example +model_name = 'gpt2' +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained(model_name) + +# Also define some hyperparameters +lr = 3e-4 +nb_epochs = 10 +block_size = 1024 +batch_size = 8 +device = 'cuda:0' +dataset_path = 'wikitext_500K.lance' +``` + +---------------------------------------- + +TITLE: Define context window and stride parameters +DESCRIPTION: Initializes `window` and `stride` variables for creating rolling contextual windows from text data. These parameters define the size of each context (number of sentences) and the step size for generating subsequent contexts, respectively. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_3 + +LANGUAGE: python +CODE: +``` +import numpy as np +import pandas as pd + +window = 20 +stride = 4 +``` + +---------------------------------------- + +TITLE: Append New Fragments to an Existing Lance Dataset +DESCRIPTION: This example illustrates how to append new data to an existing Lance dataset. It retrieves the current dataset version, uses `lance.LanceOperation.Append` with the collected fragments, and commits them, ensuring the `read_version` is correctly set to maintain data consistency during the append operation. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/distributed_write.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +import lance + +ds = lance.dataset(data_uri) +read_version = ds.version # record the read version + +op = lance.LanceOperation.Append(schema, all_fragments) +lance.LanceDataset.commit( + data_uri, + op, + read_version=read_version, +) +``` + +---------------------------------------- + +TITLE: Extract Video Frames from Lance Blob Data in Python +DESCRIPTION: This Python example illustrates how to fetch and process large binary video data stored as blobs in a Lance dataset. It uses `lance.dataset.LanceDataset.take_blobs` to retrieve a `BlobFile` object, then leverages the `av` library to open the video and extract frames within a specified time range without loading the entire video into memory. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/blob.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +import av # pip install av +import lance + +ds = lance.dataset("./youtube.lance") +start_time, end_time = 500, 1000 +blobs = ds.take_blobs([5], "video") +with av.open(blobs[0]) as container: + stream = container.streams.video[0] + stream.codec_context.skip_frame = "NONKEY" + + start_time = start_time / stream.time_base + start_time = start_time.as_integer_ratio()[0] + end_time = end_time / stream.time_base + container.seek(start_time, stream=stream) + + for frame in container.decode(stream): + if frame.time > end_time: + break + display(frame.to_image()) + clear_output(wait=True) +``` + +---------------------------------------- + +TITLE: Perform Approximate Nearest Neighbor (ANN) Search in LanceDB +DESCRIPTION: This Python snippet shows how to perform an Approximate Nearest Neighbor (ANN) search on a LanceDB dataset with an existing index. It queries a specified vector column for the 'k' nearest neighbors to a given query vector 'q', measuring the average query time. The result is converted to a Pandas DataFrame for display. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/start/quickstart.md#_snippet_17 + +LANGUAGE: python +CODE: +``` +sift1m = lance.dataset(uri) + +import time + +tot = 0 +for q in samples: + start = time.time() + tbl = sift1m.to_table(nearest={"column": "vector", "q": q, "k": 10}) + end = time.time() + tot += (end - start) + +print(f"Avg(sec): {tot / len(samples)}") +print(tbl.to_pandas()) +``` + +---------------------------------------- + +TITLE: Cast Column Data Types in LanceDB Dataset +DESCRIPTION: This snippet explains how to change the data type of a column in a LanceDB dataset using `lance.LanceDataset.alter_columns`. It notes that this operation rewrites only the affected column's data files and that any existing index on the column will be dropped. An example is provided for converting a float32 embedding column to float16 to save disk space. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/data_evolution.md#_snippet_7 + +LANGUAGE: python +CODE: +``` +table = pa.table({ + "id": pa.array([1, 2, 3]), + "embedding": pa.FixedShapeTensorArray.from_numpy_ndarray( + np.random.rand(3, 128).astype("float32")) +}) +dataset = lance.write_dataset(table, "embeddings") +dataset.alter_columns({"path": "embedding", + "data_type": pa.list_(pa.float16(), 128)}) +print(dataset.schema) +# id: int64 +# embedding: fixed_size_list[128] +# child 0, item: halffloat +``` + +---------------------------------------- + +TITLE: Call OpenAI Completion API for text generation +DESCRIPTION: Defines the `complete` function to interact with OpenAI's `text-davinci-003` model. It sends a given prompt and retrieves the generated text completion, configuring parameters like temperature, max tokens, and presence/frequency penalties for desired output characteristics. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_11 + +LANGUAGE: python +CODE: +``` +def complete(prompt): + # query text-davinci-003 + res = openai.Completion.create( + engine='text-davinci-003', + prompt=prompt, + temperature=0, + max_tokens=400, + top_p=1, + frequency_penalty=0, + presence_penalty=0, + stop=None + ) + return res['choices'][0]['text'].strip() + +# check that it works +query = "who was the 12th person on the moon and when did they land?" +complete(query) +``` + +---------------------------------------- + +TITLE: Build LanceDB Java Project with Maven +DESCRIPTION: Provides the Maven command to clean and package the entire LanceDB Java project, including its dependencies and sub-modules. This command compiles the Java code and prepares it for deployment. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_9 + +LANGUAGE: shell +CODE: +``` +mvn clean package +``` + +---------------------------------------- + +TITLE: Import IPython.display for multimedia output +DESCRIPTION: Imports the `YouTubeVideo` class from `IPython.display`. This class is essential for embedding and displaying YouTube videos directly within an IPython or Jupyter environment, allowing for rich multimedia output. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_13 + +LANGUAGE: python +CODE: +``` +from IPython.display import YouTubeVideo +``` + +---------------------------------------- + +TITLE: Initialize CLIP Model Instances, Tokenizer, and PyTorch Optimizer +DESCRIPTION: This snippet initializes instances of the `ImageEncoder`, `TextEncoder`, and `Head` modules, along with a Hugging Face `AutoTokenizer`. It then sets up a PyTorch `Adam` optimizer, explicitly defining separate learning rates for the image encoder, text encoder, and the combined head modules, preparing the model for training. 
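+
+The `ImageEncoder`, `TextEncoder`, and `Head` classes used below are defined in earlier snippets of the same walkthrough and are not reproduced here. Purely as an illustration of the shapes involved, a hypothetical projection head mapping an encoder's output dimension to the shared embedding space might look like this sketch:
+
+```python
+import torch.nn as nn
+
+class Head(nn.Module):
+    """Illustrative projection head: embed_dim -> projection_dim."""
+    def __init__(self, embed_dim: int, projection_dim: int):
+        super().__init__()
+        self.proj = nn.Sequential(
+            nn.Linear(embed_dim, projection_dim),
+            nn.GELU(),
+            nn.Linear(projection_dim, projection_dim),
+        )
+
+    def forward(self, x):
+        return self.proj(x)
+```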
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_7 + +LANGUAGE: python +CODE: +``` +# Define image encoder, image head, text encoder, text head and a tokenizer for tokenizing the caption +img_encoder = ImageEncoder(model_name=Config.img_encoder_model).to('cuda') +img_head = Head(Config.img_embed_dim, Config.projection_dim).to('cuda') + +tokenizer = AutoTokenizer.from_pretrained(Config.text_encoder_model) +text_encoder = TextEncoder(model_name=Config.text_encoder_model).to('cuda') +text_head = Head(Config.text_embed_dim, Config.projection_dim).to('cuda') + +# Since we are optimizing two different models together, we will define parameters manually +parameters = [ + {"params": img_encoder.parameters(), "lr": Config.img_enc_lr}, + {"params": text_encoder.parameters(), "lr": Config.text_enc_lr}, + { + "params": itertools.chain( + img_head.parameters(), + text_head.parameters(), + ), + "lr": Config.head_lr, + }, +] + +optimizer = torch.optim.Adam(parameters) +``` + +---------------------------------------- + +TITLE: Build vector index for LanceDB dataset +DESCRIPTION: Creates an IVF_PQ (Inverted File Index with Product Quantization) index on the 'vector' column of the LanceDB dataset. This indexing significantly speeds up similarity search queries, making the retrieval of relevant contexts much faster. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_9 + +LANGUAGE: python +CODE: +``` +ds = ds.create_index("vector", + index_type="IVF_PQ", + num_partitions=64, # IVF + num_sub_vectors=96) # PQ +``` + +---------------------------------------- + +TITLE: Import necessary modules for Lance and PyTorch deep learning artifact management +DESCRIPTION: This snippet imports essential Python libraries required for deep learning artifact management using Lance. It includes `os` and `shutil` for file system operations, `lance` for data storage, `pyarrow` for schema definition, `torch` for PyTorch model handling, and `collections.OrderedDict` for managing model state dictionaries. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/artifact_management.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import os +import shutil +import lance +import pyarrow as pa +import torch +from collections import OrderedDict +``` + +---------------------------------------- + +TITLE: Download and extract MeCab Ipadic model +DESCRIPTION: This snippet downloads the gzipped tarball of the MeCab Ipadic model from GitHub and then extracts its contents using tar. This is the first step in preparing the dictionary for building. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/python/tests/models/lindera/README.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +curl -L -o mecab-ipadic-2.7.0-20070801.tar.gz "https://github.com/lindera-morphology/mecab-ipadic/archive/refs/tags/2.7.0-20070801.tar.gz" +tar xvf mecab-ipadic-2.7.0-20070801.tar.gz +``` + +---------------------------------------- + +TITLE: Process Image Captions and Images for Lance Dataset in Python +DESCRIPTION: This Python function `process` takes a list of image captions, reads corresponding image files, converts them to binary, and yields PyArrow RecordBatches. Each batch contains `image_id`, binary `image` data, and a list of `captions`, preparing data for a Lance dataset. It handles `FileNotFoundError` for missing images and uses `tqdm` for progress indication. 
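+
+The `process()` generator below expects `captions` to be an iterable of `(image_id, [caption, ...])` pairs and an `image_folder` pointing at the extracted Flickr8k images. A hypothetical bridging step (the walkthrough performs this grouping in an earlier snippet) could look like:
+
+```python
+from collections import defaultdict
+
+image_folder = "Flickr8k_Dataset"  # assumed path to the extracted image files
+
+# `annotations` is the list of (file_name, caption_number, caption) tuples
+# produced by the annotation-parsing snippet elsewhere in this document.
+grouped = defaultdict(list)
+for file_name, _caption_no, caption in annotations:
+    grouped[file_name].append(caption.strip())
+
+captions = list(grouped.items())  # [(image_id, [caption, ...]), ...]
+```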
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/flickr8k_dataset_creation.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +def process(captions): + for img_id, img_captions in tqdm(captions): + try: + with open(os.path.join(image_folder, img_id), 'rb') as im: + binary_im = im.read() + + except FileNotFoundError: + print(f"img_id '{img_id}' not found in the folder, skipping.") + continue + + img_id = pa.array([img_id], type=pa.string()) + img = pa.array([binary_im], type=pa.binary()) + capt = pa.array([img_captions], pa.list_(pa.string(), -1)) + + yield pa.RecordBatch.from_arrays( + [img_id, img, capt], + ["image_id", "image", "captions"] + ) +``` + +---------------------------------------- + +TITLE: Create Empty Lance Dataset in Java +DESCRIPTION: This Java code demonstrates how to create a new, empty Lance dataset at a specified path. It defines the dataset's schema with 'id' (Int32) and 'name' (Utf8) fields, initializes a `BufferAllocator`, and uses `Dataset.create` to persist the schema. The snippet also shows how to access dataset version information immediately after creation. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_1 + +LANGUAGE: Java +CODE: +``` +void createDataset() throws IOException, URISyntaxException { + String datasetPath = tempDir.resolve("write_stream").toString(); + Schema schema = + new Schema( + Arrays.asList( + Field.nullable("id", new ArrowType.Int(32, true)), + Field.nullable("name", new ArrowType.Utf8())), + null); + try (BufferAllocator allocator = new RootAllocator();) { + Dataset.create(allocator, datasetPath, schema, new WriteParams.Builder().build()); + try (Dataset dataset = Dataset.create(allocator, datasetPath, schema, new WriteParams.Builder().build());) { + dataset.version(); + dataset.latestVersion(); + } + } +} +``` + +---------------------------------------- + +TITLE: Generate contextual text windows from video transcripts +DESCRIPTION: Defines the `contextualize` function to create overlapping text contexts from video transcripts. It processes each video, combining sentences into windows based on `window` and `stride` parameters, and returns a new DataFrame with these generated contexts. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_4 + +LANGUAGE: python +CODE: +``` +def contextualize(raw_df, window, stride): + def process_video(vid): + # For each video, create the text rolling window + text = vid.text.values + time_end = vid["end"].values + contexts = vid.iloc[:-window:stride, :].copy() + contexts["text"] = [' '.join(text[start_i:start_i+window]) + for start_i in range(0, len(vid)-window, stride)] + contexts["end"] = [time_end[start_i+window-1] + for start_i in range(0, len(vid)-window, stride)] + return contexts + # concat result from all videos + return pd.concat([process_video(vid) for _, vid in raw_df.groupby("title")]) + +df = contextualize(data.to_pandas(), 20, 4) +``` + +---------------------------------------- + +TITLE: Display answer and relevant YouTube video segment +DESCRIPTION: Executes the full Q&A pipeline: poses a query, retrieves the answer and relevant context, prints the generated answer, and then displays the most relevant YouTube video segment using `YouTubeVideo` at the precise timestamp where the context was found. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_14 + +LANGUAGE: python +CODE: +``` +query = ("Which training method should I use for sentence transformers " + "when I only have pairs of related sentences?") +completion, context = answer(query) + +print(completion) +top_match = context.iloc[0] +YouTubeVideo(top_match["url"].split("/")[-1], start=top_match["start"]) +``` + +---------------------------------------- + +TITLE: Create LanceDB dataset from embeddings and contexts +DESCRIPTION: Converts the generated embeddings into a LanceDB vector table and combines it with the original contextualized DataFrame. This process creates a new LanceDB dataset named 'chatbot.lance' on disk, ready for efficient vector search operations. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_8 + +LANGUAGE: python +CODE: +``` +import lance +import pyarrow as pa +from lance.vector import vec_to_table + +table = vec_to_table(np.array(embeds)) +combined = pa.Table.from_pandas(df).append_column("vector", table["vector"]) +ds = lance.write_dataset(combined, "chatbot.lance") +``` + +---------------------------------------- + +TITLE: Create LanceDB Index for GIST-1M Dataset +DESCRIPTION: Builds an index on the GIST-1M Lance dataset using `index.py`. The specified parameters for IVF partitions (-i) and PQ subvectors (-p) are crucial for optimizing query performance. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_11 + +LANGUAGE: sh +CODE: +``` +./index.py ./.lancedb/gist1m.lance -i 256 -p 120 +``` + +---------------------------------------- + +TITLE: Generate Lance Dataset +DESCRIPTION: This command executes the `datagen.py` script to create the Lance dataset required for the Cohere wiki text embedding benchmark. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/wiki/README.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +python datagen.py +``` + +---------------------------------------- + +TITLE: Generate answer using vector search and LLM +DESCRIPTION: Combines embedding generation, LanceDB vector search, and prompt creation to answer a question. It first embeds the query, then finds the most relevant contexts using vector similarity in LanceDB, and finally uses an LLM to formulate an answer based on those retrieved contexts. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_12 + +LANGUAGE: python +CODE: +``` +def answer(question): + emb = embed_func(query)[0] + context = ds.to_table( + nearest={ + "column": "vector", + "k": 3, + "q": emb, + "nprobes": 20, + "refine_factor": 100 + }).to_pandas() + prompt = create_prompt(question, context) + return complete(prompt), context.reset_index() +``` + +---------------------------------------- + +TITLE: Create LanceDB Index for SIFT-1M Dataset +DESCRIPTION: Builds an index on the SIFT-1M Lance dataset using `index.py`. The specified parameters for IVF partitions (-i) and PQ subvectors (-p) are crucial for optimizing query performance. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_6 + +LANGUAGE: sh +CODE: +``` +./index.py ./.lancedb/sift1m.lance -i 256 -p 16 +``` + +---------------------------------------- + +TITLE: LanceDB Manifest Naming Schemes +DESCRIPTION: Describes the V1 (legacy) and V2 (new) naming conventions for manifest files in LanceDB, emphasizing the V2 scheme's zero-padded, descending-sortable versioning for efficient latest manifest retrieval. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_10 + +LANGUAGE: APIDOC +CODE: +``` +Manifest Naming Schemes: + V1: _versions/{version}.manifest + V2: _versions/{u64::MAX - version:020}.manifest +``` + +---------------------------------------- + +TITLE: Initialize LanceDB Dataset and PyTorch DataLoader +DESCRIPTION: This snippet demonstrates how to initialize a CLIPLanceDataset using a LanceDB file (flickr8k.lance) and then wrap it with a PyTorch DataLoader. It configures the dataset with tokenization and augmentations, and the dataloader for efficient batch processing during training. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_8 + +LANGUAGE: python +CODE: +``` +dataset = CLIPLanceDataset( + lance_path="flickr8k.lance", + max_len=Config.max_len, + tokenizer=tokenizer, + transforms=train_augments +) + +dataloader = DataLoader( + dataset, + shuffle=False, + batch_size=Config.bs, + pin_memory=True +) +``` + +---------------------------------------- + +TITLE: Run GIST-1M Benchmark and Store Results +DESCRIPTION: Executes the benchmark for GIST-1M using `metrics.py`, querying the indexed dataset with specified parameters like number of results to fetch (-k) and query vectors (-q). The results, including mean query time and recall@1, are saved to a CSV file. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_12 + +LANGUAGE: sh +CODE: +``` +./metrics.py ./.lancedb/gist1m.lance results-gist.csv -i 256 -p 120 -q ./.lancedb/gist_query.lance -k 1 +``` + +---------------------------------------- + +TITLE: Run SIFT-1M Benchmark and Store Results +DESCRIPTION: Executes the benchmark for SIFT-1M using `metrics.py`, querying the indexed dataset with specified parameters like number of results to fetch (-k) and query vectors (-q). The results, including mean query time and recall@1, are saved to a CSV file. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_7 + +LANGUAGE: sh +CODE: +``` +./metrics.py ./.lancedb/sift1m.lance results-sift.csv -i 256 -p 16 -q ./.lancedb/sift_query.lance -k 1 +``` + +---------------------------------------- + +TITLE: Object Store General Configuration Options +DESCRIPTION: Details configuration parameters applicable to all object stores, including network, security, and retry settings. These options control connection behavior, certificate validation, timeouts, user agents, proxy usage, and client-side retry logic. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_2 + +LANGUAGE: APIDOC +CODE: +``` +Key: allow_http +Description: Allow non-TLS, i.e. non-HTTPS connections. Default, False. + +Key: download_retry_count +Description: Number of times to retry a download. Default, 3. This limit is applied when the HTTP request succeeds but the response is not fully downloaded, typically due to a violation of request_timeout. 
+ +Key: allow_invalid_certificates +Description: Skip certificate validation on https connections. Default, False. Warning: This is insecure and should only be used for testing. + +Key: connect_timeout +Description: Timeout for only the connect phase of a Client. Default, 5s. + +Key: request_timeout +Description: Timeout for the entire request, from connection until the response body has finished. Default, 30s. + +Key: user_agent +Description: User agent string to use in requests. + +Key: proxy_url +Description: URL of a proxy server to use for requests. Default, None. + +Key: proxy_ca_certificate +Description: PEM-formatted CA certificate for proxy connections + +Key: proxy_excludes +Description: List of hosts that bypass proxy. This is a comma separated list of domains and IP masks. Any subdomain of the provided domain will be bypassed. For example, example.com, 192.168.1.0/24 would bypass https://api.example.com, https://www.example.com, and any IP in the range 192.168.1.0/24. + +Key: client_max_retries +Description: Number of times for a s3 client to retry the request. Default, 10. + +Key: client_retry_timeout +Description: Timeout for a s3 client to retry the request in seconds. Default, 180. +``` + +---------------------------------------- + +TITLE: Import necessary libraries for CLIP model training +DESCRIPTION: This snippet imports essential Python libraries like cv2, lance, numpy, torch, timm, and transformers, which are required for building and training a multi-modal CLIP model. It also includes utility libraries such as itertools and tqdm, and a warning filter. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import cv2 +import lance + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader +from torchvision import transforms + +import timm +from transformers import AutoModel, AutoTokenizer + +import itertools +from tqdm import tqdm + +import warnings +warnings.simplefilter('ignore') +``` + +---------------------------------------- + +TITLE: Build user dictionary with Lindera +DESCRIPTION: This command demonstrates how to build a custom user dictionary using 'lindera build'. It takes a CSV file as input and creates a new user dictionary, which can be used to extend the base language model. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/python/tests/models/lindera/README.md#_snippet_2 + +LANGUAGE: bash +CODE: +``` +lindera build --build-user-dictionary --dictionary-kind=ipadic user_dict/userdict.csv user_dict2 +``` + +---------------------------------------- + +TITLE: Google Cloud Storage Configuration Keys +DESCRIPTION: Reference for configuration keys available for Google Cloud Storage when used with LanceDB. These keys can be set as environment variables or within the `storage_options` parameter. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_8 + +LANGUAGE: APIDOC +CODE: +``` +Key / Environment Variable | Description +--------------------------|------------ +google_service_account / service_account | Path to the service account JSON file. +google_service_account_key / service_account_key | The serialized service account key. +google_application_credentials / application_credentials | Path to the application credentials. 
+``` + +---------------------------------------- + +TITLE: Load YouTube transcription dataset +DESCRIPTION: Downloads and loads the 'jamescalam/youtube-transcriptions' dataset from Hugging Face datasets. The 'train' split is specified to retrieve the main training portion of the data. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_1 + +LANGUAGE: python +CODE: +``` +from datasets import load_dataset + +data = load_dataset('jamescalam/youtube-transcriptions', split='train') +data +``` + +---------------------------------------- + +TITLE: Index Lance Data for Benchmarking +DESCRIPTION: This command runs the `index.py` script to build an index on the generated Lance dataset. It configures the index with an L2 metric, 2048 partitions, and 96 sub-vectors for optimized benchmarking. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/wiki/README.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +python index.py --metric L2 --num-partitions 2048 --num-sub-vectors 96 +``` + +---------------------------------------- + +TITLE: Jieba User Dictionary Configuration File (config.json) +DESCRIPTION: JSON configuration for Jieba user dictionaries. This file, named `config.json`, specifies an optional 'main' dictionary and an array of paths to additional 'users' dictionary files. It should be placed in the model's root directory. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_3 + +LANGUAGE: json +CODE: +``` +{ + "main": "dict.txt", + "users": ["path/to/user/dict.txt"] +} +``` + +---------------------------------------- + +TITLE: Batch and generate embeddings using OpenAI API +DESCRIPTION: Configures the OpenAI API key and defines a `to_batches` helper function for processing data in chunks. It then uses this to generate embeddings for the contextualized text in batches, improving efficiency and adhering to API best practices by reducing individual API calls. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_7 + +LANGUAGE: python +CODE: +``` +from tqdm.auto import tqdm +import math + +openai.api_key = "sk-..." + +# We request in batches rather than 1 embedding at a time +def to_batches(arr, batch_size): + length = len(arr) + def _chunker(arr): + for start_i in range(0, len(df), batch_size): + yield arr[start_i:start_i+batch_size] + # add progress meter + yield from tqdm(_chunker(arr), total=math.ceil(length / batch_size)) + +batch_size = 1000 +batches = to_batches(df.text.values.tolist(), batch_size) +embeds = [emb for c in batches for emb in rate_limited(c)] +``` + +---------------------------------------- + +TITLE: Download Jieba Language Model +DESCRIPTION: Command-line instruction to download the Jieba language model for use with LanceDB. The model will be automatically stored in the default Jieba model directory within the configured language model home. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +python -m lance.download jieba +``` + +---------------------------------------- + +TITLE: Read and Inspect Lance Dataset in Rust +DESCRIPTION: This Rust function `read_dataset` shows how to open an existing Lance dataset from a given path. It uses a `scanner` to create a `batch_stream` and then iterates through each `RecordBatch`, printing its number of rows, columns, schema, and the entire batch content. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/write_read_dataset.md#_snippet_1 + +LANGUAGE: Rust +CODE: +``` +// Reads dataset from the given path and prints batch size, schema for all record batches. Also extracts and prints a slice from the first batch +async fn read_dataset(data_path: &str) { + let dataset = Dataset::open(data_path).await.unwrap(); + let scanner = dataset.scan(); + + let mut batch_stream = scanner.try_into_stream().await.unwrap().map(|b| b.unwrap()); + + while let Some(batch) = batch_stream.next().await { + println!("Batch size: {}, {}", batch.num_rows(), batch.num_columns()); // print size of batch + println!("Schema: {:?}", batch.schema()); // print schema of recordbatch + + println!("Batch: {:?}", batch); // print the entire recordbatch (schema and data) + } +} // End read dataset +``` + +---------------------------------------- + +TITLE: Define configuration class for CLIP model hyperparameters +DESCRIPTION: This Python class, `Config`, centralizes all hyperparameters necessary for training the CLIP model. It includes image and text dimensions, learning rates for different components, batch size, maximum sequence length, projection dimensions, temperature, number of epochs, and the names of the image and text encoder models. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +class Config: + img_size = (128, 128) + bs = 32 + head_lr = 1e-3 + img_enc_lr = 1e-4 + text_enc_lr = 1e-5 + max_len = 18 + img_embed_dim = 2048 + text_embed_dim = 768 + projection_dim = 256 + temperature = 1.0 + num_epochs = 2 + img_encoder_model = 'resnet50' + text_encoder_model = 'bert-base-cased' +``` + +---------------------------------------- + +TITLE: LanceDB External Manifest Store Reader Operations +DESCRIPTION: Explains the reader's load process when an external manifest store is in use, including retrieving the manifest path, reattempting synchronization if needed, and ensuring the dataset remains portable. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_13 + +LANGUAGE: APIDOC +CODE: +``` +External Store Reader Load Process: + 1. GET_EXTERNAL_STORE base_uri, version, path + - Action: Retrieve manifest path from external store. + - Condition: If path does not end in UUID, return path. + 2. COPY_OBJECT_STORE mydataset.lance/_versions/{version}.manifest-{uuid} mydataset.lance/_versions/{version}.manifest + - Action: Reattempt synchronization (copy staged to final). + 3. PUT_EXTERNAL_STORE base_uri, version, mydataset.lance/_versions/{version}.manifest + - Action: Update external store to point to final manifest. + 4. RETURN mydataset.lance/_versions/{version}.manifest + - Action: Always return the finalized path. + - Error: Return error if synchronization fails. +``` + +---------------------------------------- + +TITLE: Generate Text-to-Image 10M Dataset in Lance Format +DESCRIPTION: This snippet demonstrates how to create the 'text2image-10m' dataset in Lance format using scripts from the 'big-ann-benchmarks' repository. Upon execution, it generates two Lance datasets: a base dataset and a corresponding queries/ground truth dataset, essential for benchmarking. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/bigann/README.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +python ./big-ann-benchmarks/create_dataset.py --dataset yfcc-10M +./dataset.py -t text2image-10m data/text2image1B +``` + +---------------------------------------- + +TITLE: Run Flat Index Search Benchmark +DESCRIPTION: Executes the benchmark script to measure performance of flat index search. This command generates `benchmark.csv` for raw data and `benchmark.html` for latency plots. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/flat/README.md#_snippet_0 + +LANGUAGE: Shell +CODE: +``` +./benchmark.py +``` + +---------------------------------------- + +TITLE: PyTorch Model Training Loop with LanceDB DataLoader +DESCRIPTION: This snippet illustrates a complete PyTorch training loop. It begins by defining a `LanceDataset` and `LanceSampler` to efficiently load data, then sets up a `DataLoader`. The code proceeds to initialize a PyTorch model and an AdamW optimizer. The core of the snippet is the epoch-based training loop, which includes iterating through batches, performing forward and backward passes, calculating loss, updating model parameters, and reporting training perplexity. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_training.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +dataset = LanceDataset(dataset_path, block_size) +sampler = LanceSampler(dataset, block_size) +dataloader = DataLoader( + dataset, + shuffle=False, + batch_size=batch_size, + sampler=sampler, + pin_memory=True +) + +# Define the optimizer, training loop and train the model! +model = model.to(device) +model.train() +optimizer = torch.optim.AdamW(model.parameters(), lr=lr) + +for epoch in range(nb_epochs): + print(f"========= Epoch: {epoch+1} / {nb_epochs} ========") + epoch_loss = [] + prog_bar = tqdm(dataloader, total=len(dataloader)) + for batch in prog_bar: + optimizer.zero_grad(set_to_none=True) + + # Put both input_ids and labels to the device + for k, v in batch.items(): + batch[k] = v.to(device) + + # Perform one forward pass and get the loss + outputs = model(**batch) + loss = outputs.loss + + # Perform backward pass + loss.backward() + optimizer.step() + + prog_bar.set_description(f"loss: {loss.item():.4f}") + + epoch_loss.append(loss.item()) + + # Calculate training perplexity for this epoch + try: + perplexity = np.exp(np.mean(epoch_loss)) + except OverflowError: + perplexity = float("-inf") + + print(f"train_perplexity: {perplexity}") +``` + +---------------------------------------- + +TITLE: Create PyArrow RecordBatchReader from Processed Samples (Python) +DESCRIPTION: This code creates a PyArrow RecordBatchReader, which acts as an iterator over the data generated by the 'process_samples' function. It uses the defined schema to ensure data consistency and prepares the stream of record batches for writing to a Lance dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_dataset_creation.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +reader = pa.RecordBatchReader.from_batches( + schema, + process_samples(dataset, num_samples=500_000, field='text') # For 500K samples +) +``` + +---------------------------------------- + +TITLE: Download and Extract GIST-1M Dataset +DESCRIPTION: Downloads the GIST-1M dataset archive from the specified FTP server and extracts its contents. This is a prerequisite for generating Lance datasets for GIST-1M. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_8 + +LANGUAGE: sh +CODE: +``` +wget ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz +tar -xzf gist.tar.gz +``` + +---------------------------------------- + +TITLE: Create a Lance Dataset from Arrow RecordBatches in Rust +DESCRIPTION: Demonstrates how to write a collection of Arrow RecordBatches and an Arrow Schema into a new Lance Dataset. It uses default write parameters and an iterator for the batches. + +SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_1 + +LANGUAGE: rust +CODE: +``` +use lance::{dataset::WriteParams, Dataset}; + +let write_params = WriteParams::default(); +let mut reader = RecordBatchIterator::new( + batches.into_iter().map(Ok), + schema +); +Dataset::write(reader, &uri, Some(write_params)).await.unwrap(); +``` + +---------------------------------------- + +TITLE: Create TensorFlow Dataset from Lance URI +DESCRIPTION: This snippet demonstrates how to initialize a `tf.data.Dataset` directly from a Lance dataset URI using `lance.tf.data.from_lance`. It also shows how to chain standard TensorFlow dataset operations like shuffling and mapping for data preprocessing. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/tensorflow.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import tensorflow as tf +import lance + +# Create tf dataset +ds = lance.tf.data.from_lance("s3://my-bucket/my-dataset") + +# Chain tf dataset with other tf primitives + +for batch in ds.shuffling(32).map(lambda x: tf.io.decode_png(x["image"])): + print(batch) +``` + +---------------------------------------- + +TITLE: Write PyArrow Record Batches to Lance Dataset in Python +DESCRIPTION: This Python code demonstrates how to write PyArrow Record Batches to a Lance dataset. It creates a `RecordBatchReader` from the defined schema and the output of the `process` function, then uses `lance.write_dataset` to efficiently persist the data to a file named 'flickr8k.lance' on disk. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/flickr8k_dataset_creation.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +reader = pa.RecordBatchReader.from_batches(schema, process(captions)) +lance.write_dataset(reader, "flickr8k.lance", schema) +``` + +---------------------------------------- + +TITLE: Implement PyTorch CLIP Model Training Loop +DESCRIPTION: This code defines the core training loop for a CLIP model. It sets all model components to training mode, iterates through epochs and batches from the DataLoader, performs forward and backward passes, calculates loss, and updates model weights using an optimizer. A progress bar provides real-time feedback on the training process. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_9 + +LANGUAGE: python +CODE: +``` +img_encoder.train() +img_head.train() +text_encoder.train() +text_head.train() + +for epoch in range(Config.num_epochs): + print(f"{'='*20} Epoch: {epoch+1} / {Config.num_epochs} {'='*20}") + + prog_bar = tqdm(dataloader) + for img, caption in prog_bar: + optimizer.zero_grad(set_to_none=True); + + img_embed, text_embed = forward(img, caption) + loss = loss_fn(img_embed, text_embed, temperature=Config.temperature).mean() + + loss.backward() + optimizer.step() + + prog_bar.set_description(f"loss: {loss.item():.4f}") + print() +``` + +---------------------------------------- + +TITLE: Build Ipadic language model with Lindera +DESCRIPTION: This command uses the 'lindera build' tool to compile the Ipadic dictionary. It specifies the dictionary kind as 'ipadic' and points to the extracted model directory to create the main dictionary. + +SOURCE: https://github.com/lancedb/lance/blob/main/python/python/tests/models/lindera/README.md#_snippet_1 + +LANGUAGE: bash +CODE: +``` +lindera build --dictionary-kind=ipadic mecab-ipadic-2.7.0-20070801 main +``` + +---------------------------------------- + +TITLE: Write Lance Dataset in Rust +DESCRIPTION: This Rust function `write_dataset` demonstrates how to create and write a Lance dataset to a specified path. It defines a schema with `UInt32` fields, creates a `RecordBatch` with sample data, and uses `WriteParams` to set the write mode to `Overwrite` before writing the dataset to disk. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/rust/write_read_dataset.md#_snippet_0 + +LANGUAGE: Rust +CODE: +``` +// Writes sample dataset to the given path +async fn write_dataset(data_path: &str) { + // Define new schema + let schema = Arc::new(Schema::new(vec![ + Field::new("key", DataType::UInt32, false), + Field::new("value", DataType::UInt32, false), + ])); + + // Create new record batches + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt32Array::from(vec![1, 2, 3, 4, 5, 6])), + Arc::new(UInt32Array::from(vec![6, 7, 8, 9, 10, 11])), + ], + ) + .unwrap(); + + let batches = RecordBatchIterator::new([Ok(batch)], schema.clone()); + + // Define write parameters (e.g. overwrite dataset) + let write_params = WriteParams { + mode: WriteMode::Overwrite, + ..Default::default() + }; + + Dataset::write(batches, data_path, Some(write_params)) + .await + .unwrap(); +} // End write dataset +``` + +---------------------------------------- + +TITLE: Download and Extract SIFT-1M Dataset +DESCRIPTION: Downloads the SIFT-1M dataset archive from the specified FTP server and extracts its contents. This is a prerequisite for generating Lance datasets for SIFT-1M. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_1 + +LANGUAGE: sh +CODE: +``` +wget ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz +tar -xzf sift.tar.gz +``` + +---------------------------------------- + +TITLE: Query Lance Dataset with DuckDB +DESCRIPTION: Demonstrates querying a Lance dataset directly using DuckDB. It highlights the integration with DuckDB for SQL-based data exploration and retrieval, enabling powerful analytical queries. 
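+
+The query below refers to a Python variable named `dataset`; DuckDB's replacement scans pick up Arrow-compatible datasets that are in scope by name. A minimal setup sketch (the path is assumed):
+
+```python
+import duckdb  # pip install duckdb
+import lance
+
+dataset = lance.dataset("/tmp/test.lance")  # any existing Lance dataset
+duckdb.query("SELECT COUNT(*) FROM dataset").to_df()
+```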
+ +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +import duckdb + +# If this segfaults, make sure you have duckdb v0.7+ installed +duckdb.query("SELECT * FROM dataset LIMIT 10").to_df() +``` + +---------------------------------------- + +TITLE: Build LanceDB Rust JNI Module +DESCRIPTION: Specifies the command to build only the Rust-based JNI (Java Native Interface) module of LanceDB. This is useful for developers focusing on the native components without rebuilding the entire Java project. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_10 + +LANGUAGE: shell +CODE: +``` +cargo build +``` + +---------------------------------------- + +TITLE: Initialize Lance Dataset from Local Path +DESCRIPTION: This Python snippet demonstrates how to initialize a Lance dataset object from a local file path. It sets up the dataset for subsequent read operations, enabling access to the data stored in the specified Lance file. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_12 + +LANGUAGE: python +CODE: +``` +ds = lance.dataset("./imagenet.lance") +``` + +---------------------------------------- + +TITLE: Implement custom PyTorch Dataset for Lance-based CLIP training +DESCRIPTION: This `CLIPLanceDataset` class extends PyTorch's `Dataset` to handle Lance datasets for CLIP model training. It initializes with a Lance dataset path, an optional tokenizer, and image transformations, providing methods to retrieve pre-processed images and tokenized captions for use in a DataLoader. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +class CLIPLanceDataset(Dataset): + """Custom Dataset to load images and their corresponding captions""" + def __init__(self, lance_path, max_len=18, tokenizer=None, transforms=None): + self.ds = lance.dataset(lance_path) + self.max_len = max_len + # Init a new tokenizer if not specified already + self.tokenizer = AutoTokenizer.from_pretrained('bert-base-cased') if not tokenizer else tokenizer + self.transforms = transforms + + def __len__(self): + return self.ds.count_rows() + + def __getitem__(self, idx): + # Load the image and caption + img = load_image(self.ds, idx) + caption = load_caption(self.ds, idx) + + # Apply transformations to the images + if self.transforms: + img = self.transforms(img) + + # Tokenize the caption + caption = self.tokenizer( + caption, + truncation=True, + padding='max_length', + max_length=self.max_len, + return_tensors='pt' + ) + # Flatten each component of tokenized caption otherwise they will cause size mismatch errors during training + caption = {k: v.flatten() for k, v in caption.items()} + + return img, caption +``` + +---------------------------------------- + +TITLE: Azure Blob Storage Configuration Keys +DESCRIPTION: Reference for configuration keys available for Azure Blob Storage when used with LanceDB. These keys can be set as environment variables or within the `storage_options` parameter. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_10 + +LANGUAGE: APIDOC +CODE: +``` +Key / Environment Variable | Description +--------------------------|------------ +azure_storage_account_name / account_name | The name of the azure storage account. +azure_storage_account_key / account_key | The serialized service account key. 
+azure_client_id / client_id | Service principal client id for authorizing requests.
+azure_client_secret / client_secret | Service principal client secret for authorizing requests.
+azure_tenant_id / tenant_id | Tenant id used in oauth flows.
+azure_storage_sas_key / azure_storage_sas_token / sas_key / sas_token | Shared access signature. The signature is expected to be percent-encoded, much like they are provided in the azure storage explorer or azure portal.
+azure_storage_token / bearer_token / token | Bearer token.
+azure_storage_use_emulator / object_store_use_emulator / use_emulator | Use object store with azurite storage emulator.
+azure_endpoint / endpoint | Override the endpoint used to communicate with blob storage.
+azure_use_fabric_endpoint / use_fabric_endpoint | Use object store with url scheme account.dfs.fabric.microsoft.com.
+azure_msi_endpoint / azure_identity_endpoint / identity_endpoint / msi_endpoint | Endpoint to request a imds managed identity token.
+azure_object_id / object_id | Object id for use with managed identity authentication.
+azure_msi_resource_id / msi_resource_id | Msi resource id for use with managed identity authentication.
+azure_federated_token_file / federated_token_file | File containing token for Azure AD workload identity federation.
+azure_use_azure_cli / use_azure_cli | Use azure cli for acquiring access token.
+azure_disable_tagging / disable_tagging | Disables tagging objects. This can be desirable if not supported by the backing store.
+```
+
+----------------------------------------
+
+TITLE: Define Function to Process and Tokenize Samples for Lance (Python)
+DESCRIPTION: This function iterates over a dataset, tokenizes individual samples using the 'tokenize' function, and yields PyArrow RecordBatches. It processes a specified number of samples, skipping empty ones, and is designed to efficiently prepare data for writing to a Lance dataset.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_dataset_creation.md#_snippet_2
+
+LANGUAGE: python
+CODE:
+```
+def process_samples(dataset, num_samples=100_000, field='text'):
+    current_sample = 0
+    for sample in tqdm(dataset, total=num_samples):
+        # If we have added all 5M samples, stop
+        if current_sample == num_samples:
+            break
+        if not sample[field]:
+            continue
+        # Tokenize the current sample
+        tokenized_sample = tokenize(sample, field)
+        # Increment the counter
+        current_sample += 1
+        # Yield a PyArrow RecordBatch
+        yield pa.RecordBatch.from_arrays(
+            [tokenized_sample],
+            names=["input_ids"]
+        )
+```
+
+----------------------------------------
+
+TITLE: Read a Lance Dataset and Collect RecordBatches in Rust
+DESCRIPTION: Opens an existing Lance Dataset from a specified path, scans its content, and collects all resulting RecordBatches into a vector. Error handling is included.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_2
+
+LANGUAGE: rust
+CODE:
+```
+let dataset = Dataset::open(path).await.unwrap();
+let mut scanner = dataset.scan();
+let batches: Vec<RecordBatch> = scanner
+    .try_into_stream()
+    .await
+    .unwrap()
+    .map(|b| b.unwrap())
+    .collect::<Vec<_>>()
+    .await;
+```
+
+----------------------------------------
+
+TITLE: Visualize Latency vs. NProbes with IVF and PQ
+DESCRIPTION: This snippet generates a scatter plot using seaborn to visualize the relationship between 'nprobes' and '50%' (median response time). It uses 'ivf' for color encoding and 'pq' for marker style, allowing for a multi-dimensional analysis of performance.
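+
+The plot below assumes `seaborn` has been imported and that `df` holds per-configuration latency percentiles (for example, a "50%" column produced by `DataFrame.describe()` over the benchmark's query timings, alongside the swept `ivf`, `pq`, and `nprobes` parameters). A minimal, assumed setup:
+
+```python
+import pandas as pd
+import seaborn as sns
+
+df = pd.read_csv("query.csv")  # benchmark output with ivf/pq/nprobes and latency percentiles
+```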
+ +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/Results.ipynb#_snippet_7 + +LANGUAGE: python +CODE: +``` +sns.scatterplot(data=df, x="nprobes", y="50%", hue="ivf", style="pq") +``` + +---------------------------------------- + +TITLE: Write HuggingFace Dataset to Lance Format +DESCRIPTION: This Python code snippet demonstrates how to load a HuggingFace dataset and write it into the Lance format. It uses the `datasets` library to load a specific split of a dataset and then `lance.write_dataset` to save it as a Lance file. Dependencies include `datasets` and `lance`. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/huggingface.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import datasets # pip install datasets +import lance + +lance.write_dataset(datasets.load_dataset( + "poloclub/diffusiondb", split="train[:10]" +), "diffusiondb_train.lance") +``` + +---------------------------------------- + +TITLE: Describe Median Latency by PQ Configuration +DESCRIPTION: This snippet groups the DataFrame by the 'pq' column and calculates descriptive statistics for the '50%' (median response time) column. This provides insights into latency performance based on different Product Quantization (PQ) configurations. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/Results.ipynb#_snippet_4 + +LANGUAGE: python +CODE: +``` +df.groupby("pq")["50%"].describe() +``` + +---------------------------------------- + +TITLE: Check number of generated contexts +DESCRIPTION: Prints the total number of contextualized entries created after processing the dataset. This helps verify the output of the `contextualize` function and understand the volume of data prepared for embedding. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_5 + +LANGUAGE: python +CODE: +``` +len(df) +``` + +---------------------------------------- + +TITLE: Convert HuggingFace Dataset to LanceDB +DESCRIPTION: This snippet demonstrates how to load a dataset from HuggingFace and convert it into a Lance dataset using `lance.write_dataset`. This is a foundational step for preparing data to be used with LanceDB's PyTorch integration. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/pytorch.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import datasets # pip install datasets +import lance + +hf_ds = datasets.load_dataset( + "poloclub/diffusiondb", + split="train", + # name="2m_first_1k", # for a smaller subset of the dataset +) +lance.write_dataset(hf_ds, "diffusiondb_train.lance") +``` + +---------------------------------------- + +TITLE: Build IVF_PQ Vector Index on Lance Dataset +DESCRIPTION: Creates an IVF_PQ (Inverted File Index with Product Quantization) index on the 'vector' column of the Lance dataset. This index significantly speeds up nearest neighbor searches by efficiently partitioning and quantizing the vector space. + +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_8 + +LANGUAGE: python +CODE: +``` +sift1m.create_index("vector", + index_type="IVF_PQ", + num_partitions=256, # IVF + num_sub_vectors=16) # PQ +``` + +---------------------------------------- + +TITLE: LanceDB S3 Storage Options Reference +DESCRIPTION: Reference for available keys in the `storage_options` parameter for S3 and S3-compatible storage configurations in LanceDB. 
These options can be set via environment variables or directly in the `storage_options` dictionary, controlling aspects like region, endpoint, and encryption.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_4
+
+LANGUAGE: APIDOC
+CODE:
+```
+S3 Storage Options:
+- aws_region / region: The AWS region the bucket is in. This can be automatically detected when using AWS S3, but must be specified for S3-compatible stores.
+- aws_access_key_id / access_key_id: The AWS access key ID to use.
+- aws_secret_access_key / secret_access_key: The AWS secret access key to use.
+- aws_session_token / session_token: The AWS session token to use.
+- aws_endpoint / endpoint: The endpoint to use for S3-compatible stores.
+- aws_virtual_hosted_style_request / virtual_hosted_style_request: Whether to use virtual hosted-style requests, where bucket name is part of the endpoint. Meant to be used with `aws_endpoint`. Default, `False`.
+- aws_s3_express / s3_express: Whether to use S3 Express One Zone endpoints. Default, `False`. See more details below.
+- aws_server_side_encryption: The server-side encryption algorithm to use. Must be one of `"AES256"`, `"aws:kms"`, or `"aws:kms:dsse"`. Default, `None`.
+- aws_sse_kms_key_id: The KMS key ID to use for server-side encryption. If set, `aws_server_side_encryption` must be `"aws:kms"` or `"aws:kms:dsse"`.
+- aws_sse_bucket_key_enabled: Whether to use bucket keys for server-side encryption.
+```
+
+----------------------------------------
+
+TITLE: Define OpenAI embedding function with rate limiting and retry
+DESCRIPTION: Sets up an embedding function using OpenAI's `text-embedding-ada-002` model. It incorporates `ratelimiter` to respect API rate limits and `retry` for robust API calls, ensuring successful embedding generation even with transient network issues.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_6
+
+LANGUAGE: python
+CODE:
+```
+import functools
+import openai
+import ratelimiter
+from retry import retry
+
+embed_model = "text-embedding-ada-002"
+
+# API limit at 60/min == 1/sec
+limiter = ratelimiter.RateLimiter(max_calls=0.9, period=1.0)
+
+# Get the embedding with retry
+@retry(tries=10, delay=1, max_delay=30, backoff=3, jitter=1)
+def embed_func(c):
+    rs = openai.Embedding.create(input=c, engine=embed_model)
+    return [record["embedding"] for record in rs["data"]]
+
+rate_limited = limiter(embed_func)
+```
+
+----------------------------------------
+
+TITLE: Add Lance SDK Java Maven Dependency
+DESCRIPTION: This snippet provides the Maven XML configuration required to include the LanceDB Java SDK as a dependency in your project. It specifies the `groupId`, `artifactId`, and `version` for the `lance-core` library, enabling access to LanceDB functionalities.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_0
+
+LANGUAGE: XML
+CODE:
+```
+<dependency>
+    <groupId>com.lancedb</groupId>
+    <artifactId>lance-core</artifactId>
+    <version>0.18.0</version>
+</dependency>
+```
+
+----------------------------------------
+
+TITLE: Define PyArrow Schema for Lance Dataset (Python)
+DESCRIPTION: This snippet defines a PyArrow schema required by Lance to understand the structure of the data being written. It specifies that the dataset will contain a single field named 'input_ids', which will store tokenized data as 64-bit integers.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_dataset_creation.md#_snippet_3
+
+LANGUAGE: python
+CODE:
+```
+schema = pa.schema([
+    pa.field("input_ids", pa.int64())
+])
+```
+
+----------------------------------------
+
+TITLE: Add Columns to LanceDB Dataset in Java
+DESCRIPTION: Demonstrates how to add new columns to a LanceDB dataset. This can be done either by providing SQL expressions to derive new column values or by defining a new Arrow Schema for the dataset, allowing for flexible schema evolution.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_6
+
+LANGUAGE: java
+CODE:
+```
+void addColumnsByExpressions() {
+    String datasetPath = ""; // specify a path point to a dataset
+    try (BufferAllocator allocator = new RootAllocator()) {
+        try (Dataset dataset = Dataset.open(datasetPath, allocator)) {
+            SqlExpressions sqlExpressions = new SqlExpressions.Builder().withExpression("double_id", "id * 2").build();
+            dataset.addColumns(sqlExpressions, Optional.empty());
+        }
+    }
+}
+```
+
+LANGUAGE: java
+CODE:
+```
+void addColumnsBySchema() {
+    String datasetPath = ""; // specify a path point to a dataset
+    try (BufferAllocator allocator = new RootAllocator()) {
+        try (Dataset dataset = Dataset.open(datasetPath, allocator)) {
+            SqlExpressions sqlExpressions = new SqlExpressions.Builder().withExpression("double_id", "id * 2").build();
+            dataset.addColumns(new Schema(
+                Arrays.asList(
+                    Field.nullable("id", new ArrowType.Int(32, true)),
+                    Field.nullable("name", new ArrowType.Utf8()),
+                    Field.nullable("age", new ArrowType.Int(32, true)))), Optional.empty());
+        }
+    }
+}
+```
+
+----------------------------------------
+
+TITLE: Write Processed Data to Lance Dataset (Python)
+DESCRIPTION: This final step uses the 'lance.write_dataset' function to persist the processed and tokenized data to disk as a Lance dataset. It takes the RecordBatchReader, the desired output file path, and the defined schema as arguments, completing the dataset creation process.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_dataset_creation.md#_snippet_5
+
+LANGUAGE: python
+CODE:
+```
+# Write the dataset to disk
+lance.write_dataset(
+    reader,
+    "wikitext_500K.lance",
+    schema
+)
+```
+
+----------------------------------------
+
+TITLE: Create a Vector Index on a Lance Dataset in Rust
+DESCRIPTION: Demonstrates how to create a vector index on a specified column (e.g., 'embeddings') within a Lance Dataset. It configures vector index parameters like the number of partitions and sub-vectors, noting potential alignment requirements.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_4
+
+LANGUAGE: rust
+CODE:
+```
+use ::lance::index::vector::VectorIndexParams;
+
+let mut params = VectorIndexParams::default();
+params.num_partitions = 256;
+params.num_sub_vectors = 16;
+
+// this will Err if list_size(embeddings) / num_sub_vectors does not meet simd alignment
+dataset.create_index(&["embeddings"], IndexType::Vector, None, &params, true).await;
+```
+
+----------------------------------------
+
+TITLE: Load Query Data with Pandas
+DESCRIPTION: This snippet imports the pandas library and loads query performance data from a CSV file named 'query.csv' into a DataFrame. This DataFrame will be used for subsequent analysis and visualization.
+ +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/Results.ipynb#_snippet_0 + +LANGUAGE: python +CODE: +``` +import pandas as pd +df = pd.read_csv("query.csv") +``` + +---------------------------------------- + +TITLE: Query Lance Dataset with Pandas +DESCRIPTION: Illustrates how to convert a Lance dataset to a PyArrow Table and then to a Pandas DataFrame for easy data manipulation and analysis using familiar Pandas operations. + +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +df = dataset.to_table().to_pandas() +df +``` + +---------------------------------------- + +TITLE: Lance Manifest Protobuf Message Reference +DESCRIPTION: References the Protobuf message definition for the Manifest file, which encapsulates the metadata for a specific version of a Lance dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_1 + +LANGUAGE: APIDOC +CODE: +``` +proto.message.Manifest +``` + +---------------------------------------- + +TITLE: Define Tokenization Function (Python) +DESCRIPTION: This function takes a single sample from a Hugging Face dataset and a specified field name (e.g., 'text'). It uses the pre-initialized tokenizer to convert the text content of that field into 'input_ids', which are numerical representations of tokens. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_dataset_creation.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +def tokenize(sample, field='text'): + return tokenizer(sample[field])['input_ids'] +``` + +---------------------------------------- + +TITLE: Implement CLIP Loss Function and Forward Pass Utilities +DESCRIPTION: This snippet provides utility functions for training a CLIP model. The `loss_fn` calculates the contrastive loss between image and text embeddings based on the CLIP paper, using logits, image similarity, and text similarity. The `forward` function performs a single forward pass, moving inputs to the GPU, and obtaining image and text embeddings using the defined encoder and head modules. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_6 + +LANGUAGE: python +CODE: +``` +def loss_fn(img_embed, text_embed, temperature=0.2): + """ + https://arxiv.org/abs/2103.00020/ + """ + # Calculate logits, image similarity and text similarity + logits = (text_embed @ img_embed.T) / temperature + img_sim = img_embed @ img_embed.T + text_sim = text_embed @ text_embed.T + # Calculate targets by taking the softmax of the similarities + targets = F.softmax( + (img_sim + text_sim) / 2 * temperature, dim=-1 + ) + img_loss = (-targets.T * nn.LogSoftmax(dim=-1)(logits.T)).sum(1) + text_loss = (-targets * nn.LogSoftmax(dim=-1)(logits)).sum(1) + return (img_loss + text_loss) / 2.0 + +def forward(img, caption): + # Transfer to device + img = img.to('cuda') + for k, v in caption.items(): + caption[k] = v.to('cuda') + + # Get embeddings for both img and caption + img_embed = img_head(img_encoder(img)) + text_embed = text_head(text_encoder(caption)) + + return img_embed, text_embed +``` + +---------------------------------------- + +TITLE: Read Data from Lance Dataset +DESCRIPTION: Shows how to open and read a Lance dataset from a specified URI. It asserts that the returned object is a PyArrow Dataset, confirming seamless integration with the Apache Arrow ecosystem. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +dataset = lance.dataset("/tmp/test.lance") +assert isinstance(dataset, pa.dataset.Dataset) +``` + +---------------------------------------- + +TITLE: Globally Set Object Store Timeout (Bash) +DESCRIPTION: Demonstrates how to set a global timeout for object store operations using an environment variable. This configuration applies to all subsequent Lance operations that interact with object storage. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +export TIMEOUT=60s +``` + +---------------------------------------- + +TITLE: Lance File Format Version Details +DESCRIPTION: This table provides a comprehensive overview of the Lance file format versions, including their compatibility, features, and stability status. It details the breaking changes, new functionalities introduced in each version, and aliases for common use cases. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_5 + +LANGUAGE: APIDOC +CODE: +``` +Version: 0.1 + Minimal Lance Version: Any + Maximum Lance Version: Any + Description: This is the initial Lance format. + +Version: 2.0 + Minimal Lance Version: 0.16.0 + Maximum Lance Version: Any + Description: Rework of the Lance file format that removed row groups and introduced null support for lists, fixed size lists, and primitives + +Version: 2.1 (unstable) + Minimal Lance Version: None + Maximum Lance Version: Any + Description: Enhances integer and string compression, adds support for nulls in struct fields, and improves random access performance with nested fields. + +Version: legacy + Minimal Lance Version: N/A + Maximum Lance Version: N/A + Description: Alias for 0.1 + +Version: stable + Minimal Lance Version: N/A + Maximum Lance Version: N/A + Description: Alias for the latest stable version (currently 2.0) + +Version: next + Minimal Lance Version: N/A + Maximum Lance Version: N/A + Description: Alias for the latest unstable version (currently 2.1) +``` + +---------------------------------------- + +TITLE: Connect LanceDB to S3-Compatible Stores (e.g., MinIO) +DESCRIPTION: Illustrates how to configure LanceDB to connect to S3-compatible storage solutions like MinIO. This requires specifying both the `region` and `endpoint` within the `storage_options` parameter to direct LanceDB to the custom S3 endpoint, enabling connectivity beyond AWS S3. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +import lance +ds = lance.dataset( + "s3://bucket/path", + storage_options={ + "region": "us-east-1", + "endpoint": "http://minio:9000", + } +) +``` + +---------------------------------------- + +TITLE: Load and parse Flickr8k token file annotations +DESCRIPTION: This code reads the 'Flickr8k.token.txt' file, which contains image annotations. It then processes each line to extract the image file name, a unique caption number, and the caption text itself, storing them as structured tuples for further processing. 
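+
+The snippet below reads from a `captions` variable holding the path to the annotation file; a one-line, assumed setup:
+
+```python
+captions = "Flickr8k.token.txt"  # hypothetical path to the Flickr8k annotation file
+```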
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/flickr8k_dataset_creation.md#_snippet_1
+
+LANGUAGE: python
+CODE:
+```
+with open(captions, "r") as fl:
+    annotations = fl.readlines()
+
+# Converts the annotations where each element of this list is a tuple consisting of image file name, caption number and caption itself
+annotations = list(map(lambda x: tuple([*x.split('\t')[0].split('#'), x.split('\t')[1]]), annotations))
+```
+
+----------------------------------------
+
+TITLE: Lance File Footer and Overall Layout Specification
+DESCRIPTION: Provides a detailed byte-level specification of the .lance file format, including the arrangement of data pages, column metadata, offset tables, and the final footer. It outlines alignment requirements and the structure of various fields.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_4
+
+LANGUAGE: APIDOC
+CODE:
+```
+// Note: the number of buffers (BN) is independent of the number of columns (CN)
+// and pages.
+//
+// Buffers often need to be aligned. 64-byte alignment is common when
+// working with SIMD operations. 4096-byte alignment is common when
+// working with direct I/O. In order to ensure these buffers are aligned
+// writers may need to insert padding before the buffers.
+//
+// If direct I/O is required then most (but not all) fields described
+// below must be sector aligned. We have marked these fields with an
+// asterisk for clarity. Readers should assume there will be optional
+// padding inserted before these fields.
+//
+// All footer fields are unsigned integers written with little endian
+// byte order.
+//
+// [byte-layout diagram truncated in this excerpt: data pages, column
+// metadatas, column metadata offset table, global buffers offset table,
+// and the footer fields referenced above]
+```
+
+----------------------------------------
+
+TITLE: LanceDB Conflict Resolution Process
+DESCRIPTION: Outlines the commit process in LanceDB, detailing how writers handle concurrent modifications, create transaction files for conflict detection, and retry commits after checking for compatibility with successful writes.
+
+SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_11
+
+LANGUAGE: APIDOC
+CODE:
+```
+Commit Process:
+  1. Writer finishes writing all data files.
+  2. Writer creates a transaction file in _transactions directory.
+     - Purpose: detect conflicts, re-build manifest during retries.
+  3. Check for new commits since writer started.
+     - If conflicts detected (via transaction files), abort commit.
+  4. Build manifest and attempt to commit to next version.
+     - If commit fails due to concurrent write, go back to step 3.
+ +Conflict Detection: + - Conservative approach: assume conflict if transaction file is missing or has unknown operation. +``` + +---------------------------------------- + +TITLE: Lance Dataset Directory Structure +DESCRIPTION: Illustrates the typical organization of a Lance dataset within a directory, detailing the location of data files, version manifests, secondary indices, and deletion files. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_0 + +LANGUAGE: plaintext +CODE: +``` +/path/to/dataset: + data/*.lance -- Data directory + _versions/*.manifest -- Manifest file for each dataset version. + _indices/{UUID-*}/index.idx -- Secondary index, each index per directory. + _deletions/*.{arrow,bin} -- Deletion files, which contain ids of rows + that have been deleted. +``` + +---------------------------------------- + +TITLE: Define PyArrow Schema for Lance Dataset in Python +DESCRIPTION: This Python code defines a PyArrow schema for the Lance dataset. It specifies the data types for `image_id` (string), `image` (binary), and `captions` (list of strings), ensuring proper data structure and type consistency for the dataset when written to Lance. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/flickr8k_dataset_creation.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +schema = pa.schema([ + pa.field("image_id", pa.string()), + pa.field("image", pa.binary()), + pa.field("captions", pa.list_(pa.string(), -1)), +]) +``` + +---------------------------------------- + +TITLE: Define image augmentations for CLIP model training +DESCRIPTION: This snippet defines a `torchvision.transforms.Compose` object for basic image augmentations applied during CLIP model training. It includes converting images to tensors, resizing them to a consistent shape, and normalizing pixel values to stabilize the training process. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +train_augments = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Resize(Config.img_size), + transforms.Normalize([0.5], [0.5]), + ] +) +``` + +---------------------------------------- + +TITLE: Generate GIST-1M Database Vectors Lance Dataset +DESCRIPTION: Uses the `datagen.py` script to convert GIST-1M base vectors into a Lance dataset. This dataset will serve as the primary data source for indexing and querying in the GIST-1M benchmark. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_9 + +LANGUAGE: sh +CODE: +``` +./datagen.py ./gist/gist_base.fvecs ./.lancedb/gist1m.lance -g 1024 -m 50000 -d 960 +``` + +---------------------------------------- + +TITLE: Set Object Store Timeout for a Single Dataset (Python) +DESCRIPTION: Shows how to specify storage options, such as a timeout, for a specific Lance dataset using the `storage_options` parameter in `lance.dataset`. This allows fine-grained control over individual dataset configurations without affecting global settings. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +import lance +ds = lance.dataset("s3://path", storage_options={"timeout": "60s"}) +``` + +---------------------------------------- + +TITLE: Connect LanceDB to Google Cloud Storage +DESCRIPTION: This Python snippet demonstrates how to connect a LanceDB dataset to Google Cloud Storage using `storage_options` to specify service account credentials. It provides an alternative to setting the `GOOGLE_SERVICE_ACCOUNT` environment variable. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_7 + +LANGUAGE: python +CODE: +``` +import lance +ds = lance.dataset( + "gs://my-bucket/my-dataset", + storage_options={ + "service_account": "path/to/service-account.json", + } +) +``` + +---------------------------------------- + +TITLE: Read and Write Lance Data with Ray and Pandas +DESCRIPTION: This snippet demonstrates how to write data to the Lance format using Ray's data sink (`ray.data.Dataset.write_lance`) and subsequently read it back using both the native Lance API (`lance.dataset`) and Ray's data source (`ray.data.read_lance`). It includes assertions to verify data integrity after read/write operations. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/ray.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import ray +import pandas as pd + +ray.init() + +data = [ + {"id": 1, "name": "alice"}, + {"id": 2, "name": "bob"}, + {"id": 3, "name": "charlie"} +] +ray.data.from_items(data).write_lance("./alice_bob_and_charlie.lance") + +# It can be read via lance directly +df = ( + lance. + dataset("./alice_bob_and_charlie.lance") + .to_table() + .to_pandas() + .sort_values(by=["id"]) + .reset_index(drop=True) +) +assert df.equals(pd.DataFrame(data)), "{} != {}".format( + df, pd.DataFrame(data) +) + +# Or via Ray.data.read_lance +ray_df = ( + ray.data.read_lance("./alice_bob_and_charlie.lance") + .to_pandas() + .sort_values(by=["id"]) + .reset_index(drop=True) +) +assert df.equals(ray_df) +``` + +---------------------------------------- + +TITLE: Load PyTorch Model State Dictionary from Lance Dataset (Python) +DESCRIPTION: This function reads all model weights from a specified Lance dataset file and constructs an OrderedDict suitable as a PyTorch model state dictionary. It iterates through each weight, converting it using _load_weight, and places it on the specified device. This function assumes all weights can fit into memory; large models may cause memory errors. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/artifact_management.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +def _load_state_dict(file_name: str, version: int = 1, map_location=None) -> OrderedDict: + """Reads the model weights from lance file and returns a model state dict + If the model weights are too large, this function will fail with a memory error. 
+ + Args: + file_name (str): Lance model name + version (int): Version of the model to load + map_location (str): Device to load the model on + + Returns: + OrderedDict: Model state dict + """ + ds = lance.dataset(file_name, version=version) + weights = ds.take([x for x in range(ds.count_rows())]).to_pylist() + state_dict = OrderedDict() + + for weight in weights: + state_dict[weight["name"]] = _load_weight(weight).to(map_location) + + return state_dict +``` + +---------------------------------------- + +TITLE: Load Data Chunk from Lance Dataset by Indices +DESCRIPTION: This utility function, `from_indices`, efficiently loads specific elements from a Lance dataset based on a list of provided indices. It takes a Lance dataset object and a list of integer indices, then retrieves the corresponding rows. The function processes these rows to extract only the 'input_ids' from each, returning them as a list of token IDs, which is crucial for preparing data chunks. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_training.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +def from_indices(dataset, indices): + """Load the elements on given indices from the dataset""" + chunk = dataset.take(indices).to_pylist() + chunk = list(map(lambda x: x['input_ids'], chunk)) + return chunk +``` + +---------------------------------------- + +TITLE: Run LanceDB Vector Search Recall Test +DESCRIPTION: Defines run_test, a comprehensive function for evaluating LanceDB's vector search recall. It generates ground truth, writes data to a temporary LanceDB dataset, creates an IVF_PQ index, and performs nearest neighbor queries with varying nprobes and refine_factor to calculate recall for both in-sample and out-of-sample queries. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/full_report/report.ipynb#_snippet_2 + +LANGUAGE: python +CODE: +``` +def run_test( + data, + query, + metric, + num_partitions=256, + num_sub_vectors=8, + nprobes_list=[1, 2, 5, 10, 16], + refine_factor_list=[1, 2, 5, 10, 20], +): + in_sample = data[random.sample(range(data.shape[0]), 1000), :] + # ground truth + print("generating gt") + + gt = knn(query, data, metric, 10) + gt_in_sample = knn(in_sample, data, metric, 10) + + print("generated gt") + + with tempfile.TemporaryDirectory() as d: + write_lance(d, data) + ds = lance.dataset(d) + + for q, target in zip(tqdm(in_sample, desc="checking brute force"), gt_in_sample): + res = ds.to_table(nearest={ + "column": "vec", + "q": q, + "k": 10, + "metric": metric, + }, columns=["id"]) + assert len(np.intersect1d(res["id"].to_numpy(), target)) == 10 + + ds = ds.create_index("vec", "IVF_PQ", metric=metric, num_partitions=num_partitions, num_sub_vectors=num_sub_vectors) + + recall_data = [] + for nprobes in nprobes_list: + for refine_factor in refine_factor_list: + hits = 0 + # check that brute force impl is correct + for q, target in zip(tqdm(query, desc=f"out of sample, nprobes={nprobes}, refine={refine_factor}"), gt): + res = ds.to_table(nearest={ + "column": "vec", + "q": q, + "k": 10, + "nprobes": nprobes, + "refine_factor": refine_factor, + }, columns=["id"])["id"].to_numpy() + hits += len(np.intersect1d(res, target)) + recall_data.append([ + "out_of_sample", + nprobes, + refine_factor, + hits / 10 / len(gt), + ]) + # check that brute force impl is correct + for q, target in zip(tqdm(in_sample, desc=f"in sample nprobes={nprobes}, refine={refine_factor}"), gt_in_sample): + res = ds.to_table(nearest={ + "column": "vec", + "q": q, + "k": 10, + "nprobes": 
nprobes, + "refine_factor": refine_factor, + }, columns=["id"])["id"].to_numpy() + hits += len(np.intersect1d(res, target)) + recall_data.append([ + "in_sample", + nprobes, + refine_factor, + hits / 10 / len(gt_in_sample), + ]) + return recall_data +``` + +---------------------------------------- + +TITLE: Stream PyArrow RecordBatches to Lance Dataset +DESCRIPTION: Shows how to write a Lance dataset from an iterator of `pyarrow.RecordBatch` objects. This method is ideal for large datasets that cannot be fully loaded into memory, requiring a `pyarrow.Schema` to be provided. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +def producer() -> Iterator[pa.RecordBatch]: + """An iterator of RecordBatches.""" + yield pa.RecordBatch.from_pylist([{"name": "Alice", "age": 20}]) + yield pa.RecordBatch.from_pylist([{"name": "Bob", "age": 30}]) + +schema = pa.schema([ + ("name", pa.string()), + ("age", pa.int32()), +]) + +ds = lance.write_dataset(producer(), + "./alice_and_bob.lance", + schema=schema, mode="overwrite") +print(ds.count_rows()) # Output: 2 +``` + +---------------------------------------- + +TITLE: LanceDB External Manifest Store Commit Operations +DESCRIPTION: Details the four-step commit process when using an external manifest store for concurrent writes in LanceDB, involving staging manifests, committing paths to the external store, and finalizing the manifest in object storage. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_12 + +LANGUAGE: APIDOC +CODE: +``` +External Store Commit Process: + 1. PUT_OBJECT_STORE mydataset.lance/_versions/{version}.manifest-{uuid} + - Action: Stage new manifest in object store under unique path. + 2. PUT_EXTERNAL_STORE base_uri, version, mydataset.lance/_versions/{version}.manifest-{uuid} + - Action: Commit staged manifest path to external KV store. + - Note: Commit is effectively complete after this step. + 3. COPY_OBJECT_STORE mydataset.lance/_versions/{version}.manifest-{uuid} mydataset.lance/_versions/{version}.manifest + - Action: Copy staged manifest to final path. + 4. PUT_EXTERNAL_STORE base_uri, version, mydataset.lance/_versions/{version}.manifest + - Action: Update external store to point to final manifest. +``` + +---------------------------------------- + +TITLE: Write PyArrow Table to Lance Dataset +DESCRIPTION: Demonstrates how to write a `pyarrow.Table` to a Lance dataset using the `lance.write_dataset` function. This is suitable for datasets that can be fully loaded into memory. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import lance +import pyarrow as pa + +table = pa.Table.from_pylist([{"name": "Alice", "age": 20}, + {"name": "Bob", "age": 30}]) +ds = lance.write_dataset(table, "./alice_and_bob.lance") +``` + +---------------------------------------- + +TITLE: Lance DataFragment Protobuf Message Reference +DESCRIPTION: References the Protobuf message definition for DataFragment, which represents a logical chunk of data within a Lance dataset. It can include one or more DataFiles and an optional DeletionFile. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_2 + +LANGUAGE: APIDOC +CODE: +``` +proto.message.DataFragment +``` + +---------------------------------------- + +TITLE: Import Libraries for LanceDB Vector Search Testing +DESCRIPTION: Imports necessary Python libraries for numerical operations (numpy), temporary file handling (tempfile), data manipulation (pandas), plotting (seaborn, matplotlib), and LanceDB specific functionalities (lance, _lib). These imports provide the foundational tools for the vector search tests. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/full_report/report.ipynb#_snippet_1 + +LANGUAGE: python +CODE: +``` +from _lib import knn, write_lance, _get_nyt_vectors + +import numpy as np +import tempfile +import random +import lance +import pandas as pd +import seaborn as sns + +from matplotlib import pyplot as plt +from tqdm.auto import tqdm +``` + +---------------------------------------- + +TITLE: Generate SIFT-1M Database Vectors Lance Dataset +DESCRIPTION: Uses the `datagen.py` script to convert SIFT-1M base vectors into a Lance dataset. This dataset will serve as the primary data source for indexing and querying in the SIFT-1M benchmark. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_3 + +LANGUAGE: sh +CODE: +``` +./datagen.py ./sift/sift_base.fvecs ./.lancedb/sift1m.lance -d 128 +``` + +---------------------------------------- + +TITLE: Compact LanceDB Dataset Files with Python +DESCRIPTION: This Python code demonstrates how to compact data files within a LanceDB dataset using the `compact_files` method. It specifies a `target_rows_per_fragment` to optimize file count and can remove soft-deleted rows, improving query performance. Note that compaction creates a new table version and invalidates old row addresses for indexing. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_21 + +LANGUAGE: python +CODE: +``` +import lance + +dataset = lance.dataset("./alice_and_bob.lance") +dataset.optimize.compact_files(target_rows_per_fragment=1024 * 1024) +``` + +---------------------------------------- + +TITLE: Prepare PyTorch Model State Dict for LanceDB Saving +DESCRIPTION: This utility function processes a PyTorch model's `state_dict`, iterating through each parameter. It flattens the parameter's tensor, extracts its name and original shape, and then packages this information into a PyArrow `RecordBatch`. This prepares the model weights for efficient storage in a LanceDB dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/artifact_management.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +def _save_model_writer(state_dict): + """Yields a RecordBatch for each parameter in the model state dict""" + for param_name, param in state_dict.items(): + param_shape = list(param.size()) + param_value = param.flatten().tolist() + yield pa.RecordBatch.from_arrays( + [ + pa.array( + [param_name], + pa.string(), + ), + pa.array( + [param_value], + pa.list_(pa.float64(), -1), + ), + pa.array( + [param_shape], + pa.list_(pa.int64(), -1), + ), + ], + ["name", "value", "shape"], + ) +``` + +---------------------------------------- + +TITLE: Create PyTorch DataLoader from LanceDataset (Safe) +DESCRIPTION: This snippet demonstrates how to create a multiprocessing-safe PyTorch DataLoader using `SafeLanceDataset` and `get_safe_loader`. 
It explicitly uses the 'spawn' method to avoid fork-safety issues that can arise when LanceDB's internal multithreading interacts with Python's multiprocessing. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/pytorch.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +from lance.torch.data import SafeLanceDataset, get_safe_loader + +dataset = SafeLanceDataset(temp_lance_dataset) +# use spawn method to avoid fork-safe issue +loader = get_safe_loader( + dataset, + num_workers=2, + batch_size=16, + drop_last=False, +) + +total_samples = 0 +for batch in loader: + total_samples += batch["id"].shape[0] +``` + +---------------------------------------- + +TITLE: Generate SIFT-1M Ground Truth Lance Dataset +DESCRIPTION: Generates a ground truth Lance dataset for SIFT-1M using the `gt.py` script. This dataset is essential for evaluating the recall of the benchmark queries against known correct results. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_4 + +LANGUAGE: sh +CODE: +``` +./gt.py ./.lancedb/sift1m.lance -o ./.lancedb/ground_truth.lance +``` + +---------------------------------------- + +TITLE: Lindera User Dictionary Configuration File (config.yml) +DESCRIPTION: YAML configuration for Lindera, defining the segmenter mode and the path to the dictionary. This file, typically named `config.yml`, can be placed in the model's root directory or specified via the `LINDERA_CONFIG_PATH` environment variable. The `kind` field is not supported in LanceDB's context. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_6 + +LANGUAGE: yaml +CODE: +``` +segmenter: + mode: "normal" + dictionary: + # Note: in lance, the `kind` field is not supported. You need to specify the model path using the `path` field instead. + path: /path/to/lindera/ipadic/main +``` + +---------------------------------------- + +TITLE: Test LanceDB Vector Search with Random Data (L2 Metric) +DESCRIPTION: Demonstrates running the run_test function with randomly generated data (100,000 vectors, 64 dimensions) and queries, using the L2 (Euclidean) distance metric. It then visualizes the recall results using make_plot. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/full_report/report.ipynb#_snippet_4 + +LANGUAGE: python +CODE: +``` +# test randomly generated data +data = np.random.standard_normal((100000, 64)) +query = np.random.standard_normal((1000, 64)) + +recall_data = run_test( + data, + query, + "L2", +) + +make_plot(recall_data) +``` + +---------------------------------------- + +TITLE: Lance ColumnMetadata Protobuf Message Reference +DESCRIPTION: References the Protobuf message definition for ColumnMetadata, which is used to describe the encoding and properties of individual columns within a .lance file. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_3 + +LANGUAGE: APIDOC +CODE: +``` +proto.message.ColumnMetadata +``` + +---------------------------------------- + +TITLE: Generate GIST-1M Query Vectors Lance Dataset +DESCRIPTION: Converts GIST-1M query vectors into a Lance dataset using `datagen.py`. These vectors will be used to perform similarity searches against the indexed database during the benchmark. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_10 + +LANGUAGE: sh +CODE: +``` +./datagen.py ./gist/gist_query.fvecs ./.lancedb/gist_query.lance -g 1024 -m 50000 -d 960 -n 1000 +``` + +---------------------------------------- + +TITLE: Test LanceDB Vector Search with Random Data (Cosine Metric) +DESCRIPTION: Shows how to execute the run_test function using randomly generated data and queries, but this time employing the cosine similarity metric. The recall performance is subsequently plotted using make_plot. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/full_report/report.ipynb#_snippet_5 + +LANGUAGE: python +CODE: +``` +# test randomly generated data -- cosine +data = np.random.standard_normal((100000, 64)) +query = np.random.standard_normal((1000, 64)) + +recall_data = run_test( + data, + query, + "cosine", +) + +make_plot(recall_data) +``` + +---------------------------------------- + +TITLE: Load PyTorch Model with Weights from Lance Dataset (Python) +DESCRIPTION: This high-level function facilitates loading weights directly into a given PyTorch model from a Lance dataset. It internally calls _load_state_dict to retrieve the complete state dictionary and then applies it to the provided model instance. This simplifies the process of restoring a model's state from a Lance-backed storage. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/artifact_management.md#_snippet_6 + +LANGUAGE: python +CODE: +``` +def load_model( + model: torch.nn.Module, file_name: str, version: int = 1, map_location=None +): + """Loads the model weights from lance file and sets them to the model + + Args: + model (torch.nn.Module): PyTorch model + file_name (str): Lance model name + version (int): Version of the model to load + map_location (str): Device to load the model on + """ + state_dict = _load_state_dict(file_name, version=version, map_location=map_location) + model.load_state_dict(state_dict) +``` + +---------------------------------------- + +TITLE: Connect LanceDB to Azure Blob Storage +DESCRIPTION: This Python snippet illustrates how to connect a LanceDB dataset to Azure Blob Storage. It shows how to pass `account_name` and `account_key` directly via `storage_options`, offering an alternative to environment variables. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_9 + +LANGUAGE: python +CODE: +``` +import lance +ds = lance.dataset( + "az://my-container/my-dataset", + storage_options={ + "account_name": "some-account", + "account_key": "some-key", + } +) +``` + +---------------------------------------- + +TITLE: Default Lance Language Model Home Directory +DESCRIPTION: This snippet illustrates the default directory path where LanceDB stores language models if the LANCE_LANGUAGE_MODEL_HOME environment variable is not explicitly set by the user. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_0 + +LANGUAGE: bash +CODE: +``` +${system data directory}/lance/language_models +``` + +---------------------------------------- + +TITLE: Perform Random Row Access in Lance Dataset +DESCRIPTION: This Python snippet demonstrates Lance's capability for fast random access to individual rows using the `take()` method. This feature is crucial for workflows like random sampling, shuffling in ML training, and building secondary indices for enhanced query performance. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_20 + +LANGUAGE: python +CODE: +``` +data = ds.take([1, 100, 500], columns=["image", "label"]) +``` + +---------------------------------------- + +TITLE: Configure AWS Credentials for LanceDB S3 Dataset +DESCRIPTION: Demonstrates how to pass AWS access key ID, secret access key, and session token directly to the `storage_options` parameter when initializing a LanceDB dataset from an S3 path. This method provides explicit credential management for S3 access, overriding environment variables if set. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +import lance +ds = lance.dataset( + "s3://bucket/path", + storage_options={ + "access_key_id": "my-access-key", + "secret_access_key": "my-secret-key", + "session_token": "my-session-token", + } +) +``` + +---------------------------------------- + +TITLE: Create Scalar Index with Jieba Tokenizer in Python +DESCRIPTION: Python code demonstrating how to create a scalar index on a 'text' field using the 'INVERTED' index type, specifying 'jieba/default' as the base tokenizer for text processing within LanceDB. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +ds.create_scalar_index("text", "INVERTED", base_tokenizer="jieba/default") +``` + +---------------------------------------- + +TITLE: Add and Populate Columns with SQL Expressions in Lance +DESCRIPTION: Illustrates adding and populating new columns in a Lance dataset using SQL expressions. This method allows defining column values based on existing columns or literal values, enabling data backfill within a single operation. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/data_evolution.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +table = pa.table({"name": pa.array(["Alice", "Bob", "Carla"])}) +dataset = lance.write_dataset(table, "names") +dataset.add_columns({ + "hash": "sha256(name)", + "status": "'active'", +}) +print(dataset.to_table().to_pandas()) +``` + +---------------------------------------- + +TITLE: Perform Nearest Neighbor Vector Search on Lance Dataset +DESCRIPTION: Demonstrates how to perform nearest neighbor searches on a Lance dataset with a vector index. It samples query vectors using DuckDB and then retrieves the top 10 similar vectors for each query using Lance's `nearest` functionality, showcasing its vector search capabilities. + +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_9 + +LANGUAGE: python +CODE: +``` +# Get top 10 similar vectors +import duckdb + +dataset = lance.dataset(uri) + +# Sample 100 query vectors. If this segfaults, make sure you have duckdb v0.7+ installed +sample = duckdb.query("SELECT vector FROM dataset USING SAMPLE 100").to_df() +query_vectors = np.array([np.array(x) for x in sample.vector]) + +# Get nearest neighbors for all of them +rs = [dataset.to_table(nearest={"column": "vector", "k": 10, "q": q}) + for q in query_vectors] +``` + +---------------------------------------- + +TITLE: Convert Parquet to Lance Dataset +DESCRIPTION: Demonstrates how to convert a Pandas DataFrame to a PyArrow Table, save it as a Parquet file, and then convert the Parquet dataset into a Lance dataset. This showcases Lance's compatibility with existing data formats and its ease of use for data migration. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +import lance + +import pandas as pd +import pyarrow as pa +import pyarrow.dataset + +df = pd.DataFrame({"a": [5], "b": [10]}) +uri = "/tmp/test.parquet" +tbl = pa.Table.from_pandas(df) +pa.dataset.write_dataset(tbl, uri, format='parquet') + +parquet = pa.dataset.dataset(uri, format='parquet') +lance.write_dataset(parquet, "/tmp/test.lance") +``` + +---------------------------------------- + +TITLE: Define PyArrow Schema with Lance Encoding Metadata +DESCRIPTION: This Python snippet demonstrates how to define a PyArrow schema for a LanceDB table, applying column-level encoding configurations. It shows how to use PyArrow field metadata to specify compression algorithms, compression levels, structural encoding strategies, and packed memory layout for string columns. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_7 + +LANGUAGE: python +CODE: +``` +import pyarrow as pa + +schema = pa.schema([ + pa.field( + "compressible_strings", + pa.string(), + metadata={ + "lance-encoding:compression": "zstd", + "lance-encoding:compression-level": "3", + "lance-encoding:structural-encoding": "miniblock", + "lance-encoding:packed": "true" + } + ) +]) +``` + +---------------------------------------- + +TITLE: Configure Seaborn Plot Style +DESCRIPTION: This snippet imports the seaborn library and sets the default plot style to 'darkgrid'. This improves the visual aesthetics of subsequent plots generated using seaborn. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/Results.ipynb#_snippet_1 + +LANGUAGE: python +CODE: +``` +import seaborn as sns +sns.set_style("darkgrid") +``` + +---------------------------------------- + +TITLE: Generate SIFT-1M Query Vectors Lance Dataset +DESCRIPTION: Converts SIFT-1M query vectors into a Lance dataset using `datagen.py`. These vectors will be used to perform similarity searches against the indexed database during the benchmark. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/README.md#_snippet_5 + +LANGUAGE: sh +CODE: +``` +./datagen.py ./sift/sift_query.fvecs ./.lancedb/sift_query.lance -d 128 -n 1000 +``` + +---------------------------------------- + +TITLE: Convert SIFT1M Dataset to Lance for Vector Search +DESCRIPTION: Loads the SIFT1M dataset from a binary file, converts its raw vector data into a NumPy array, and then transforms it into a Lance table using `vec_to_table`. The dataset is then written to a Lance file, optimized for vector search with specific row group and file size settings. + +SOURCE: https://github.com/lancedb/lance/blob/main/README.md#_snippet_7 + +LANGUAGE: python +CODE: +``` +import lance +from lance.vector import vec_to_table +import numpy as np +import struct + +nvecs = 1000000 +ndims = 128 +with open("sift/sift_base.fvecs", mode="rb") as fobj: + buf = fobj.read() + data = np.array(struct.unpack("<128000000f", buf[4 : 4 + 4 * nvecs * ndims])).reshape((nvecs, ndims)) + dd = dict(zip(range(nvecs), data)) + +table = vec_to_table(dd) +uri = "vec_data.lance" +sift1m = lance.write_dataset(table, uri, max_rows_per_group=8192, max_rows_per_file=1024*1024) +``` + +---------------------------------------- + +TITLE: Load Entire Lance Dataset into Memory +DESCRIPTION: This Python snippet shows how to load an entire Lance dataset into an in-memory table using the `to_table()` method. 
This approach is straightforward and suitable for datasets that can comfortably fit within available memory. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_13 + +LANGUAGE: python +CODE: +``` +table = ds.to_table() +``` + +---------------------------------------- + +TITLE: Lance SQL Type to Apache Arrow Type Mapping +DESCRIPTION: This table provides a comprehensive mapping between SQL data types supported by Lance and their corresponding Apache Arrow data types. It details the internal storage format for various data representations, crucial for understanding data compatibility and performance. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_19 + +LANGUAGE: APIDOC +CODE: +``` +| SQL type | Arrow type | +|----------|------------| +| `boolean` | `Boolean` | +| `tinyint` / `tinyint unsigned` | `Int8` / `UInt8` | +| `smallint` / `smallint unsigned` | `Int16` / `UInt16` | +| `int` or `integer` / `int unsigned` or `integer unsigned` | `Int32` / `UInt32` | +| `bigint` / `bigint unsigned` | `Int64` / `UInt64` | +| `float` | `Float32` | +| `double` | `Float64` | +| `decimal(precision, scale)` | `Decimal128` | +| `date` | `Date32` | +| `timestamp` | `Timestamp` (1) | +| `string` | `Utf8` | +| `binary` | `Binary` | +``` + +---------------------------------------- + +TITLE: Visualize LanceDB Vector Search Recall Heatmap +DESCRIPTION: Defines make_plot, a utility function to visualize the recall data generated by run_test. It takes the recall data (a list of lists) and converts it into a pandas DataFrame, then uses seaborn to generate heatmaps showing recall across different nprobes and refine_factor values for various test cases. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/full_report/report.ipynb#_snippet_3 + +LANGUAGE: python +CODE: +``` +def make_plot(recall_data): + df = pd.DataFrame(recall_data, columns=["case", "nprobes", "refine_factor", "recall"]) + + num_cases = len(df["case"].unique()) + (fig, axs) = plt.subplots(1, 2, figsize=(16, 8)) + + for case, ax in zip(df["case"].unique(), axs): + current_case = df[df["case"] == case] + sns.heatmap( + current_case.drop(columns=["case"]).set_index(["nprobes", "refine_factor"])["recall"].unstack(), + annot=True, + ax=ax, + ).set(title=f"Recall -- {case}") +``` + +---------------------------------------- + +TITLE: Count unique video titles in dataset +DESCRIPTION: Converts the loaded dataset to a Pandas DataFrame and counts the number of unique video titles. This provides an overview of the diversity and scope of the video content within the dataset. + +SOURCE: https://github.com/lancedb/lance/blob/main/notebooks/youtube_transcript_search.ipynb#_snippet_2 + +LANGUAGE: python +CODE: +``` +data.to_pandas().title.nunique() +``` + +---------------------------------------- + +TITLE: Describe Median Latency by Refine Factor +DESCRIPTION: This snippet groups the DataFrame by the 'refine_factor' column and calculates descriptive statistics for the '50%' (median response time) column. This provides an understanding of latency variations across different refinement factors. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/Results.ipynb#_snippet_6 + +LANGUAGE: python +CODE: +``` +df.groupby("refine_factor")["50%"].describe() +``` + +---------------------------------------- + +TITLE: Utility functions to load images and captions from Lance dataset +DESCRIPTION: These two Python functions, `load_image` and `load_caption`, facilitate loading data from a Lance dataset. `load_image` converts byte-formatted images to a usable image format using numpy and OpenCV, while `load_caption` extracts the longest caption associated with an image, assuming it contains the most descriptive information. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +def load_image(ds, idx): + # Utility function to load an image at an index and convert it from bytes format to img format + raw_img = ds.take([idx], columns=['image']).to_pydict() + raw_img = np.frombuffer(b''.join(raw_img['image']), dtype=np.uint8) + img = cv2.imdecode(raw_img, cv2.IMREAD_COLOR) + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + return img + +def load_caption(ds, idx): + # Utility function to load an image's caption. Currently we return the longest caption of all + captions = ds.take([idx], columns=['captions']).to_pydict()['captions'][0] + return max(captions, key=len) +``` + +---------------------------------------- + +TITLE: Save PyTorch Model Weights to LanceDB with Versioning +DESCRIPTION: This function saves a PyTorch model's `state_dict` to a LanceDB file. It utilizes the `_save_model_writer` utility to format the data. The function supports both overwriting existing model weights or saving them as a new version within the Lance dataset, providing flexibility for model checkpoint management. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/artifact_management.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +def save_model(state_dict: OrderedDict, file_name: str, version=False): + """Saves a PyTorch model in lance file format + + Args: + state_dict (OrderedDict): Model state dict + file_name (str): Lance model name + version (bool): Whether to save as a new version or overwrite the existing versions, + if the lance file already exists + """ + # Create a reader + reader = pa.RecordBatchReader.from_batches( + GLOBAL_SCHEMA, _save_model_writer(state_dict) + ) + + if os.path.exists(file_name): + if version: + # If we want versioning, we use the overwrite mode to create a new version + lance.write_dataset( + reader, file_name, schema=GLOBAL_SCHEMA, mode="overwrite" + ) + else: + # If we don't want versioning, we delete the existing file and write a new one + shutil.rmtree(file_name) + lance.write_dataset(reader, file_name, schema=GLOBAL_SCHEMA) + else: + # If the file doesn't exist, we write a new one + lance.write_dataset(reader, file_name, schema=GLOBAL_SCHEMA) +``` + +---------------------------------------- + +TITLE: Protobuf Definition for Row ID Sequence Storage +DESCRIPTION: This protobuf oneof field defines how row ID sequences are stored. Small sequences are stored directly as `inline_sequence` bytes to avoid I/O overhead, while large sequences are referenced via an `external_file` path to optimize storage and retrieval. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_16 + +LANGUAGE: Protobuf +CODE: +``` +oneof row_id_sequence { + // Inline sequence + bytes inline_sequence = 1; + // External file reference + string external_file = 2; +} // row_id_sequence +``` + +---------------------------------------- + +TITLE: Drop Columns in LanceDB Dataset +DESCRIPTION: This snippet demonstrates how to drop columns from a LanceDB dataset using the `lance.LanceDataset.drop_columns` method. This is a metadata-only operation, making it very fast. It also explains that physical data removal requires `lance.dataset.DatasetOptimizer.compact_files()` followed by `lance.LanceDataset.cleanup_old_versions()`. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/data_evolution.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +table = pa.table({"id": pa.array([1, 2, 3]), + "name": pa.array(["Alice", "Bob", "Carla"])}) +dataset = lance.write_dataset(table, "names", mode="overwrite") +dataset.drop_columns(["name"]) +print(dataset.schema) +# id: int64 +``` + +---------------------------------------- + +TITLE: Define CLIP Model Components (ImageEncoder, TextEncoder, Head) in PyTorch +DESCRIPTION: This snippet defines the core neural network modules for a CLIP-like model. ImageEncoder uses a pre-trained vision model (e.g., ResNet) to convert images to feature vectors. TextEncoder uses a pre-trained language model (e.g., BERT) for text embeddings. The Head module projects these features into a common embedding space using linear layers, GELU activation, dropout, and layer normalization. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/clip_training.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +class ImageEncoder(nn.Module): + """Encodes the Image""" + def __init__(self, model_name, pretrained = True): + super().__init__() + self.backbone = timm.create_model( + model_name, + pretrained=pretrained, + num_classes=0, + global_pool="avg" + ) + + for param in self.backbone.parameters(): + param.requires_grad = True + + def forward(self, img): + return self.backbone(img) + +class TextEncoder(nn.Module): + """Encodes the Caption""" + def __init__(self, model_name): + super().__init__() + + self.backbone = AutoModel.from_pretrained(model_name) + + for param in self.backbone.parameters(): + param.requires_grad = True + + def forward(self, captions): + output = self.backbone(**captions) + return output.last_hidden_state[:, 0, :] + +class Head(nn.Module): + """Projects both into Embedding space""" + def __init__(self, embedding_dim, projection_dim): + super().__init__() + self.projection = nn.Linear(embedding_dim, projection_dim) + self.gelu = nn.GELU() + self.fc = nn.Linear(projection_dim, projection_dim) + + self.dropout = nn.Dropout(0.3) + self.layer_norm = nn.LayerNorm(projection_dim) + + def forward(self, x): + projected = self.projection(x) + x = self.gelu(projected) + x = self.fc(x) + x = self.dropout(x) + x += projected + + return self.layer_norm(x) +``` + +---------------------------------------- + +TITLE: Retrieve Specific Records from a Lance Dataset in Rust +DESCRIPTION: Retrieves specific records from a Lance Dataset based on their indices and a projection. The result is a RecordBatch containing the requested data. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/rust/lance/README.md#_snippet_3 + +LANGUAGE: rust +CODE: +``` +let values: Result = dataset.take(&[200, 199, 39, 40, 100], &projection).await; +``` + +---------------------------------------- + +TITLE: Define PyArrow schema for storing PyTorch model weights in Lance +DESCRIPTION: This snippet defines a `pyarrow.Schema` named `GLOBAL_SCHEMA` specifically designed for storing PyTorch model weights within the Lance file format. The schema includes three fields: 'name' (string) for the weight's identifier, 'value' (list of float64) for the flattened weight tensor, and 'shape' (list of int64) to preserve the original dimensions for reconstruction. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/artifact_management.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +GLOBAL_SCHEMA = pa.schema( + [ + pa.field("name", pa.string()), + pa.field("value", pa.list_(pa.float64(), -1)), + pa.field("shape", pa.list_(pa.int64(), -1)) # Is a list with variable shape because weights can have any number of dims + ] +) +``` + +---------------------------------------- + +TITLE: Create Lance ImageURIArray from URI List +DESCRIPTION: This snippet demonstrates how to initialize a `lance.arrow.ImageURIArray` from a list of image URIs. This array type is designed to store references to images in various storage systems (local, file, S3) for lazy loading, without validating or loading the images into memory immediately. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/arrays.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +from lance.arrow import ImageURIArray + +ImageURIArray.from_uris([ + "/tmp/image1.jpg", + "file:///tmp/image2.jpg", + "s3://example/image3.jpg" +]) +# +# ['/tmp/image1.jpg', 'file:///tmp/image2.jpg', 's3://example/image3.jpg'] +``` + +---------------------------------------- + +TITLE: Lance Execution Node Contract Definition +DESCRIPTION: Defines the contract for various execution nodes within Lance's I/O execution plan, detailing their parameters, input schemas, and output schemas. + +SOURCE: https://github.com/lancedb/lance/blob/main/__wiki__/I-O-Execution.md#_snippet_0 + +LANGUAGE: APIDOC +CODE: +``` +Execution Nodes: + Scan: + Parameters: dataset, projected columns + Input Schema: N/A + Output Schema: projected columns + Filter: + Parameters: input node, filter + Input Schema: any + Output Schema: input schema + columns in filters + Take: + Parameters: input node + Input Schema: any, must have a "_rowid" column + Output Schema: input schema minus _rowid + KNNFlatExec: + Parameters: input node, query + Input Schema: any + Output Schema: input schema + {"scores"} + KNNIndexExec: + Parameters: dataset + Input Schema: N/A + Output Schema: {"score", "_rowid"} +``` + +---------------------------------------- + +TITLE: Drop Columns from LanceDB Dataset in Java +DESCRIPTION: Shows how to remove specified columns from a LanceDB dataset. This operation simplifies the dataset's schema by eliminating unnecessary fields. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_8 + +LANGUAGE: java +CODE: +``` +void dropColumns() { + String datasetPath = ""; // specify a path point to a dataset + try (BufferAllocator allocator = new RootAllocator()) { + try (Dataset dataset = Dataset.open(datasetPath, allocator)) { + dataset.dropColumns(Collections.singletonList("name")); + } + } +} +``` + +---------------------------------------- + +TITLE: Describe Median Latency by IVF Index +DESCRIPTION: This snippet groups the DataFrame by the 'ivf' column and calculates descriptive statistics (count, mean, std, min, max, quartiles) for the '50%' (median response time) column. This helps understand latency distribution across different IVF index configurations. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/Results.ipynb#_snippet_3 + +LANGUAGE: python +CODE: +``` +df.groupby("ivf")["50%"].describe() +``` + +---------------------------------------- + +TITLE: Update Rows with Complex SQL Expressions +DESCRIPTION: Shows how to update column values using complex SQL expressions that can reference existing columns, such as incrementing an age column by a fixed value. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +import lance + +dataset = lance.dataset("./alice_and_bob.lance") +dataset.update({"age": "age + 2"}) +``` + +---------------------------------------- + +TITLE: Add Rows to Lance Dataset +DESCRIPTION: Illustrates two methods for adding new rows to an existing Lance dataset: using the `LanceDataset.insert` method for direct insertion and using `lance.write_dataset` with `mode="append"` to append new data. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +import lance +import pyarrow as pa + +table = pa.Table.from_pylist([{"name": "Alice", "age": 20}, + {"name": "Bob", "age": 30}]) +ds = lance.write_dataset(table, "./insert_example.lance") + +new_table = pa.Table.from_pylist([{"name": "Carla", "age": 37}]) +ds.insert(new_table) +print(ds.to_table().to_pandas()) +# name age +# 0 Alice 20 +# 1 Bob 30 +# 2 Carla 37 + +new_table2 = pa.Table.from_pylist([{"name": "David", "age": 42}]) +ds = lance.write_dataset(new_table2, ds, mode="append") +print(ds.to_table().to_pandas()) +# name age +# 0 Alice 20 +# 1 Bob 30 +# 2 Carla 37 +# 3 David 42 +``` + +---------------------------------------- + +TITLE: Bulk Update Rows in LanceDB Dataset using Merge Insert +DESCRIPTION: Demonstrates how to efficiently replace existing rows in a LanceDB dataset with new data using `merge_insert` and `when_matched_update_all()`. This operation uses a key for matching rows, typically a unique identifier. Note that modified rows are re-inserted, changing their position to the end of the table. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_7 + +LANGUAGE: python +CODE: +``` +import lance + +dataset = lance.dataset("./alice_and_bob.lance") +print(dataset.to_table().to_pandas()) +# name age +# 0 Alice 20 +# 1 Bob 30 + +# Change the ages of both Alice and Bob +new_table = pa.Table.from_pylist([{"name": "Alice", "age": 2}, + {"name": "Bob", "age": 3}]) +# This will use `name` as the key for matching rows. Merge insert +# uses a JOIN internally and so you typically want this column to +# be a unique key or id of some kind. 
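+# Note: with only when_matched_update_all(), rows in new_table that have no
+# matching `name` in the dataset are ignored rather than inserted; adding a
+# when_not_matched_insert_all() clause (see the upsert example later in this
+# document) is what turns this into an insert-or-update operation.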
+rst = dataset.merge_insert("name") \ + .when_matched_update_all() \ + .execute(new_table) +print(dataset.to_table().to_pandas()) +# name age +# 0 Alice 2 +# 1 Bob 3 +``` + +---------------------------------------- + +TITLE: Load Single Weight Tensor from Lance Dataset (Python) +DESCRIPTION: This function converts a single weight entry, retrieved as a dictionary from a Lance dataset, into a PyTorch tensor. It reshapes the flattened 'value' array using the 'shape' information stored within the weight dictionary. The output is a torch.Tensor ready for further processing. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/artifact_management.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +def _load_weight(weight: dict) -> torch.Tensor: + """Converts a weight dict to a torch tensor""" + return torch.tensor(weight["value"], dtype=torch.float64).reshape(weight["shape"]) +``` + +---------------------------------------- + +TITLE: Perform Parallel Writes with lance.fragment.write_fragments +DESCRIPTION: This code demonstrates how to write new data fragments in parallel across multiple workers using `lance.fragment.write_fragments`. Each worker generates its own set of fragments, which are then printed for verification. This is the first phase of a distributed write operation, preparing data for a later commit. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/distributed_write.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import json +from lance.fragment import write_fragments + +# Run on each worker +data_uri = "./dist_write" +schema = pa.schema([ + ("a", pa.int32()), + ("b", pa.string()), +]) + +# Run on worker 1 +data1 = { + "a": [1, 2, 3], + "b": ["x", "y", "z"], +} +fragments_1 = write_fragments(data1, data_uri, schema=schema) +print("Worker 1: ", fragments_1) + +# Run on worker 2 +data2 = { + "a": [4, 5, 6], + "b": ["u", "v", "w"], +} +fragments_2 = write_fragments(data2, data_uri, schema=schema) +print("Worker 2: ", fragments_2) +``` + +---------------------------------------- + +TITLE: Drop Lance Dataset in Java +DESCRIPTION: This Java code illustrates how to permanently delete a Lance dataset from the file system. It takes the dataset's path and uses the static `Dataset.drop` method to remove all associated files and metadata. This operation is irreversible and should be used with caution. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_4 + +LANGUAGE: Java +CODE: +``` +void dropDataset() { + String datasetPath = tempDir.resolve("drop_stream").toString(); + Dataset.drop(datasetPath, new HashMap<>()); +} +``` + +---------------------------------------- + +TITLE: LanceDB Statistics Storage +DESCRIPTION: Describes how statistics (null count, min, max) are stored within Lance files in a columnar format, enabling selective reading for query optimization. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_14 + +LANGUAGE: APIDOC +CODE: +``` +Statistics Storage: + - Location: Stored within Lance files. + - Purpose: Determine which pages to skip during queries. + - Data Points: null count, lower bound (min), upper bound (max). + - Format: Lance's columnar format. + - Benefit: Allows selective reading of relevant stats columns. +``` + +---------------------------------------- + +TITLE: Alter Columns in LanceDB Dataset in Java +DESCRIPTION: Illustrates how to modify existing columns within a LanceDB dataset. 
This includes renaming a column, changing its nullability, or casting its data type to a new ArrowType, facilitating schema adjustments. + +SOURCE: https://github.com/lancedb/lance/blob/main/java/README.md#_snippet_7 + +LANGUAGE: java +CODE: +``` +void alterColumns() { + String datasetPath = ""; // specify a path point to a dataset + try (BufferAllocator allocator = new RootAllocator()) { + try (Dataset dataset = Dataset.open(datasetPath, allocator)) { + ColumnAlteration nameColumnAlteration = + new ColumnAlteration.Builder("name") + .rename("new_name") + .nullable(true) + .castTo(new ArrowType.Utf8()) + .build(); + + dataset.alterColumns(Collections.singletonList(nameColumnAlteration)); + } + } +} +``` + +---------------------------------------- + +TITLE: Group and sort captions by image ID +DESCRIPTION: This section iterates through all unique image IDs found in the annotations. For each image, it collects all associated captions and sorts them based on their original annotation number, ensuring the correct order of captions for each image. The result is a list of tuples, each containing an image ID and a tuple of its ordered captions. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/flickr8k_dataset_creation.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +captions = [] +image_ids = set(ann[0] for ann in annotations) +for img_id in tqdm(image_ids): + current_img_captions = [] + for ann_img_id, num, caption in annotations: + if img_id == ann_img_id: + current_img_captions.append((num, caption)) + + # Sort by the annotation number + current_img_captions.sort(key=lambda x: x[0]) + captions.append((img_id, tuple([x[1] for x in current_img_captions]))) +``` + +---------------------------------------- + +TITLE: Create Scalar Index with Lindera Tokenizer in Python +DESCRIPTION: Python code demonstrating how to create a scalar index on a 'text' field using the 'INVERTED' index type, specifying 'lindera/ipadic' as the base tokenizer for text processing within LanceDB. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/tokenizer.md#_snippet_5 + +LANGUAGE: python +CODE: +``` +ds.create_scalar_index("text", "INVERTED", base_tokenizer="lindera/ipadic") +``` + +---------------------------------------- + +TITLE: Create Pandas Series with Lance BFloat16 Dtype +DESCRIPTION: This snippet demonstrates how to create a Pandas Series using the `lance.bfloat16` custom dtype. It shows the initialization of a Series with floating-point numbers, which are then converted to the BFloat16 format, suitable for machine learning applications. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/arrays.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import lance.arrow + +pd.Series([1.1, 2.1, 3.4], dtype="lance.bfloat16") +# 0 1.1015625 +# 1 2.09375 +# 2 3.40625 +# dtype: lance.bfloat16 +``` + +---------------------------------------- + +TITLE: Define Lance Schema with Blob Column in Python +DESCRIPTION: This Python code demonstrates how to define a PyArrow schema for a Lance dataset, marking a `large_binary` column as a blob column by setting the `lance-encoding:blob` metadata to `true`. This configuration enables Lance to efficiently store and retrieve large binary objects. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/blob.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +import pyarrow as pa + +schema = pa.schema( + [ + pa.field("id", pa.int64()), + pa.field("video", + pa.large_binary(), + metadata={"lance-encoding:blob": "true"} + ), + ] +) +``` + +---------------------------------------- + +TITLE: Describe Median Latency by NProbes +DESCRIPTION: This snippet groups the DataFrame by the 'nprobes' column and calculates descriptive statistics for the '50%' (median response time) column. This helps analyze how the number of probes affects median query latency. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/Results.ipynb#_snippet_5 + +LANGUAGE: python +CODE: +``` +df.groupby("nprobes")["50%"].describe() +``` + +---------------------------------------- + +TITLE: Test LanceDB Vector Search with NYT TF-IDF Vectors (Cosine Metric) +DESCRIPTION: Illustrates testing LanceDB's vector search with real-world data: sparse TF-IDF vectors from the New York Times dataset, projected to 256 dimensions. It uses the cosine similarity metric and custom index parameters (num_partitions=256, num_sub_vectors=32). + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/full_report/report.ipynb#_snippet_6 + +LANGUAGE: python +CODE: +``` +# test NYT -- TF-IDF sparse vectors projected on to 256D dense -- cosine +data = _get_nyt_vectors() +data = data[np.linalg.norm(data, axis=1) != 0] +data = np.unique(data, axis=0) +query = np.random.standard_normal((100, 256)) + +recall_data = run_test( + data, + query, + "cosine", + num_partitions=256, + num_sub_vectors=32, +) + +make_plot(recall_data) +``` + +---------------------------------------- + +TITLE: Test LanceDB Vector Search with NYT TF-IDF Vectors (Normalized L2 Metric) +DESCRIPTION: Presents a test case using the same NYT TF-IDF vectors, but normalized for L2 distance, effectively making it equivalent to cosine similarity on normalized vectors. It uses the L2 metric with specific index parameters (num_partitions=512, num_sub_vectors=32) and visualizes the recall. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/full_report/report.ipynb#_snippet_7 + +LANGUAGE: python +CODE: +``` +# test NYT -- TF-IDF sparse vectors projected on to 256D dense -- normalized L2 +data = _get_nyt_vectors() +data = data[np.linalg.norm(data, axis=1) != 0] +data = np.unique(data, axis=0) +data /= np.linalg.norm(data, axis=1)[:, None] + +# use the same out of sample query + + +recall_data = run_test( + data, + query, + "L2", + num_partitions=512, + num_sub_vectors=32, +) + +make_plot(recall_data) +``` + +---------------------------------------- + +TITLE: Update Rows in Lance Dataset by SQL Expression +DESCRIPTION: Demonstrates how to update specific columns of rows in a Lance dataset using the `lance.LanceDataset.update` method. The update values are SQL expressions, allowing for direct value assignment. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_4 + +LANGUAGE: python +CODE: +``` +import lance + +dataset = lance.dataset("./alice_and_bob.lance") +dataset.update({"name": "'Bob'"}, where="name = 'Blob'") +``` + +---------------------------------------- + +TITLE: Iteratively Read Large Lance Dataset in Batches +DESCRIPTION: This Python snippet demonstrates how to read a Lance dataset in batches, which is ideal for datasets too large to fit into memory. 
It uses `to_batches()` with column projection and filter push-down, allowing processing of data chunks iteratively. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_15 + +LANGUAGE: python +CODE: +``` +for batch in ds.to_batches(columns=["image"], filter="label = 10"): + # do something with batch + compute_on_batch(batch) +``` + +---------------------------------------- + +TITLE: Perform Upsert Operation (Update or Insert) in LanceDB +DESCRIPTION: Shows how to combine `when_matched_update_all()` and `when_not_matched_insert_all()` within `merge_insert` to achieve an 'upsert' behavior. This operation updates rows if they exist and inserts them if they do not, providing a flexible way to synchronize data. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_9 + +LANGUAGE: python +CODE: +``` +import lance +import pyarrow as pa + +# Change Carla's age and insert David +new_table = pa.Table.from_pylist([{"name": "Carla", "age": 27}, + {"name": "David", "age": 42}]) + +dataset = lance.dataset("./alice_and_bob.lance") + +# This will update Carla and insert David +_ = dataset.merge_insert("name") \ + .when_matched_update_all() \ + .when_not_matched_insert_all() \ + .execute(new_table) +# Verify the results +print(dataset.to_table().to_pandas()) +# name age +# 0 Alice 20 +# 1 Bob 30 +# 2 Carla 27 +# 3 David 42 +``` + +---------------------------------------- + +TITLE: Configure LanceDB for S3 Express One Zone Buckets +DESCRIPTION: Shows how to explicitly configure LanceDB to access S3 Express One Zone (directory) buckets, especially when the bucket name is hidden by an access point or private link. This involves setting the `region` and `s3_express` flag in `storage_options` for direct access. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/object_store.md#_snippet_6 + +LANGUAGE: python +CODE: +``` +import lance +ds = lance.dataset( + "s3://my-bucket--use1-az4--x-s3/path/imagenet.lance", + storage_options={ + "region": "us-east-1", + "s3_express": "true", + } +) +``` + +---------------------------------------- + +TITLE: Add Schema-Only Columns to Lance Dataset +DESCRIPTION: Demonstrates how to add new columns to a Lance dataset without populating them, using `pyarrow.Field` or `pyarrow.Schema`. This operation is metadata-only and very efficient, useful for lazy population. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/data_evolution.md#_snippet_0 + +LANGUAGE: python +CODE: +``` +table = pa.table({"id": pa.array([1, 2, 3])}) +dataset = lance.write_dataset(table, "null_columns") + +# With pyarrow Field +dataset.add_columns(pa.field("embedding", pa.list_(pa.float32(), 128))) +assert dataset.schema == pa.schema([ + ("id", pa.int64()), + ("embedding", pa.list_(pa.float32(), 128)), +]) + +# With pyarrow Schema +dataset.add_columns(pa.schema([ + ("label", pa.string()), + ("score", pa.float32()), +])) +assert dataset.schema == pa.schema([ + ("id", pa.int64()), + ("embedding", pa.list_(pa.float32(), 128)), + ("label", pa.string()), + ("score", pa.float32()), +]) +``` + +---------------------------------------- + +TITLE: Commit Collected Fragments to a Lance Dataset +DESCRIPTION: After parallel writes, this snippet shows how to serialize fragment metadata from all workers, collect them on a single worker, and then commit them to a Lance dataset using `lance.LanceOperation.Overwrite`. 
It verifies the commit by reading the dataset and asserting its properties, demonstrating the final step of a distributed write. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/distributed_write.md#_snippet_1 + +LANGUAGE: python +CODE: +``` +import json +from lance import FragmentMetadata, LanceOperation + +# Serialize Fragments into JSON data +fragments_json1 = [json.dumps(fragment.to_json()) for fragment in fragments_1] +fragments_json2 = [json.dumps(fragment.to_json()) for fragment in fragments_2] + +# On one worker, collect all fragments +all_fragments = [FragmentMetadata.from_json(f) for f in \ + fragments_json1 + fragments_json2] + +# Commit the fragments into a single dataset +# Use LanceOperation.Overwrite to overwrite the dataset or create new dataset. +op = lance.LanceOperation.Overwrite(schema, all_fragments) +read_version = 0 # Because it is empty at the time. +lance.LanceDataset.commit( + data_uri, + op, + read_version=read_version, +) + +# We can read the dataset using the Lance API: +dataset = lance.dataset(data_uri) +assert len(dataset.get_fragments()) == 2 +assert dataset.version == 1 +print(dataset.to_table().to_pandas()) +``` + +---------------------------------------- + +TITLE: Merge Pre-computed Columns into Lance Dataset +DESCRIPTION: Explains how to integrate pre-computed columns into an existing Lance dataset using the `merge` method. This approach avoids rewriting the entire dataset by joining new data based on a specified column, as demonstrated with an 'id' column. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/data_evolution.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +table = pa.table({ + "id": pa.array([1, 2, 3]), + "embedding": pa.array([np.array([1, 2, 3]), np.array([4, 5, 6]), + np.array([7, 8, 9])]) +}) +dataset = lance.write_dataset(table, "embeddings", mode="overwrite") + +new_data = pa.table({ + "id": pa.array([1, 2, 3]), + "label": pa.array(["horse", "rabbit", "cat"]) +}) +dataset.merge(new_data, "id") +print(dataset.to_table().to_pandas()) +``` + +---------------------------------------- + +TITLE: SQL Filter Expression with Escaped Column Names +DESCRIPTION: This SQL snippet shows how to handle column names that are SQL keywords or contain special characters (like spaces) by escaping them with backticks. It also demonstrates accessing nested fields with escaped names to ensure correct parsing. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_17 + +LANGUAGE: sql +CODE: +``` +`CUBE` = 10 AND `column name with space` IS NOT NULL + AND `nested with space`.`inner with space` < 2 +``` + +---------------------------------------- + +TITLE: LanceDB Page-level Statistics Schema Definition +DESCRIPTION: This schema defines the structure for storing page-level statistics for each field (column) within a Lance file. It includes the null count, minimum value, and maximum value for each field, typed according to the field's original data type. The schema is flexible, allowing for missing fields and future extensions. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_15 + +LANGUAGE: APIDOC +CODE: +``` +: struct + null_count: i64 + min_value: + max_value: +... 
+: struct + null_count: i64 + min_value: + max_value: +``` + +---------------------------------------- + +TITLE: Define Custom TensorSpec for Lance TensorFlow Dataset Output +DESCRIPTION: This code shows how to explicitly define the `tf.TensorSpec` for the output signature of a `tf.data.Dataset` created from Lance. This is crucial for precise type and shape control, especially when automatic inference is insufficient or for complex data structures like ragged tensors. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/tensorflow.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +batch_size = 256 +ds = lance.tf.data.from_lance( + "s3://my-bucket/my-dataset", + columns=["image", "labels"], + batch_size=batch_size, + output_signature={ + "image": tf.TensorSpec(shape=(), dtype=tf.string), + "labels": tf.RaggedTensorSpec( + dtype=tf.int32, shape=(batch_size, None), ragged_rank=1), + }, +``` + +---------------------------------------- + +TITLE: SQL Literals for Date, Timestamp, and Decimal Types +DESCRIPTION: This SQL snippet illustrates how to specify literals for date, timestamp, and decimal columns in Lance filter expressions. It shows the syntax for casting string values to specific data types, ensuring correct interpretation during query execution. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_18 + +LANGUAGE: sql +CODE: +``` +date_col = date '2021-01-01' +and timestamp_col = timestamp '2021-01-01 00:00:00' +and decimal_col = decimal(8,3) '1.000' +``` + +---------------------------------------- + +TITLE: Add New Columns to a Lance Dataset in a Distributed Manner +DESCRIPTION: This snippet demonstrates adding new columns to a Lance dataset efficiently without copying existing data. It shows how to merge columns on individual fragments across workers using `frag.merge_columns` and then commit the changes using `lance.LanceOperation.Merge` on a single worker. This leverages Lance's two-dimensional layout for metadata-only operations, making column additions highly efficient. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/distributed_write.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +import lance +from pyarrow import RecordBatch +import pyarrow.compute as pc + +dataset = lance.dataset("./add_columns_example") +assert len(dataset.get_fragments()) == 2 +assert dataset.to_table().combine_chunks() == pa.Table.from_pydict({ + "name": ["alice", "bob", "charlie", "craig", "dave", "eve"], + "age": [25, 33, 44, 55, 66, 77], +}, schema=schema) + + +def name_len(names: RecordBatch) -> RecordBatch: + return RecordBatch.from_arrays( + [pc.utf8_length(names["name"])], + ["name_len"], + ) + +# On Worker 1 +frag1 = dataset.get_fragments()[0] +new_fragment1, new_schema = frag1.merge_columns(name_len, ["name"]) + +# On Worker 2 +frag2 = dataset.get_fragments()[1] +new_fragment2, _ = frag2.merge_columns(name_len, ["name"]) + +# On Worker 3 - Commit +all_fragments = [new_fragment1, new_fragment2] +op = lance.LanceOperation.Merge(all_fragments, schema=new_schema) +lance.LanceDataset.commit( + "./add_columns_example", + op, + read_version=dataset.version, +) + +# Verify dataset +dataset = lance.dataset("./add_columns_example") +print(dataset.to_table().to_pandas()) +``` + +---------------------------------------- + +TITLE: Plot Median Query Latency Histogram +DESCRIPTION: This snippet generates a histogram of the median query latency using seaborn's `displot` function. 
It visualizes the distribution of the '50%' column (median response time) from the DataFrame and sets appropriate x and y axis labels. + +SOURCE: https://github.com/lancedb/lance/blob/main/benchmarks/sift/Results.ipynb#_snippet_2 + +LANGUAGE: python +CODE: +``` +ax = sns.displot(df, x="50%") +ax.set(xlabel="Median response time seconds", ylabel="Number of configurations") +``` + +---------------------------------------- + +TITLE: Implement Custom PyTorch Sampler for Non-Overlapping Data +DESCRIPTION: The `LanceSampler` class is a custom PyTorch `Sampler` designed to prevent overlapping samples during LLM training, which can lead to overfitting. It ensures that the indices returned are `block_size` apart, guaranteeing that each sample processed by the model is unique and non-redundant. The sampler pre-calculates and shuffles available indices, yielding them during iteration to provide distinct data chunks for each batch. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/examples/python/llm_training.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +class LanceSampler(Sampler): + r"""Samples tokens randomly but `block_size` indices apart. + + Args: + data_source (Dataset): dataset to sample from + block_size (int): minimum index distance between each random sample + """ + + def __init__(self, data_source, block_size=512): + self.data_source = data_source + self.num_samples = len(self.data_source) + self.available_indices = list(range(0, self.num_samples, block_size)) + np.random.shuffle(self.available_indices) + + def __iter__(self): + yield from self.available_indices + + def __len__(self) -> int: + return len(self.available_indices) +``` + +---------------------------------------- + +TITLE: Insert New Rows Only in LanceDB Dataset +DESCRIPTION: Illustrates how to use `merge_insert` with `when_not_matched_insert_all()` to insert data only if it doesn't already exist in the dataset. This is useful for preventing duplicate entries when processing batches of data where some records might have been added previously. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_8 + +LANGUAGE: python +CODE: +``` +# Bob is already in the table, but Carla is new +new_table = pa.Table.from_pylist([{"name": "Bob", "age": 30}, + {"name": "Carla", "age": 37}]) + +dataset = lance.dataset("./alice_and_bob.lance") + +# This will insert Carla but leave Bob unchanged +_ = dataset.merge_insert("name") \ + .when_not_matched_insert_all() \ + .execute(new_table) +# Verify that Carla was added but Bob remains unchanged +print(dataset.to_table().to_pandas()) +# name age +# 0 Alice 20 +# 1 Bob 30 +# 2 Carla 37 +``` + +---------------------------------------- + +TITLE: Replace Filtered Data with New Rows in LanceDB +DESCRIPTION: Explains a less common but powerful use case of `merge_insert` to replace a specific region of existing rows (defined by a filter) with new data. This effectively acts as a combined delete and insert operation within a single transaction, using `when_not_matched_by_source_delete()`. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_10 + +LANGUAGE: python +CODE: +``` +import lance +import pyarrow as pa + +new_table = pa.Table.from_pylist([{"name": "Edgar", "age": 46}, + {"name": "Francene", "age": 44}]) + +dataset = lance.dataset("./alice_and_bob.lance") +print(dataset.to_table().to_pandas()) +# name age +# 0 Alice 20 +# 1 Bob 30 +# 2 Charlie 45 +# 3 Donna 50 + +# This will remove anyone above 40 and insert our new data +_ = dataset.merge_insert("name") \ + .when_not_matched_insert_all() \ + .when_not_matched_by_source_delete("age >= 40") \ + .execute(new_table) +# Verify the results - people over 40 replaced with new data +print(dataset.to_table().to_pandas()) +# name age +# 0 Alice 20 +# 1 Bob 30 +# 2 Edgar 46 +# 3 Francene 44 +``` + +---------------------------------------- + +TITLE: Distributed Training with Shuffled Lance Fragments in TensorFlow +DESCRIPTION: This snippet outlines a strategy for distributed training by sharding and shuffling Lance fragments across multiple workers. It uses `lance_fragments` to manage the distribution of data, ensuring each worker processes a unique subset of the dataset for efficient parallel training. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/integrations/tensorflow.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +import tensorflow as tf +from lance.tf.data import from_lance, lance_fragments + +world_size = 32 +rank = 10 +seed = 123 # +epoch = 100 + +dataset_uri = "s3://my-bucket/my-dataset" + +# Shuffle fragments distributedly. +fragments = + lance_fragments("s3://my-bucket/my-dataset") + .shuffling(32, seed=seed) + .repeat(epoch) + .enumerate() + .filter(lambda i, _: i % world_size == rank) + .map(lambda _, fid: fid) + +ds = from_lance( + uri, + columns=["image", "label"], + fragments=fragments, + batch_size=32 + ) +for batch in ds: + print(batch) +``` + +---------------------------------------- + +TITLE: LanceDB Deletion File Naming Convention +DESCRIPTION: This snippet specifies the naming convention for deletion files in LanceDB, which are used to mark rows for deletion. It details the components of the filename, including fragment ID, read version, and a random ID, along with the file type suffix (Arrow or Roaring Bitmap). + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/format/format.md#_snippet_9 + +LANGUAGE: text +CODE: +``` +_deletions/{fragment_id}-{read_version}-{random_id}.{arrow|bin} +``` + +---------------------------------------- + +TITLE: Convert NumPy BFloat16 Array to Lance Extension Arrays +DESCRIPTION: This snippet demonstrates how to convert an existing NumPy array of `bfloat16` dtype into Lance's `PandasBFloat16Array` or `BFloat16Array`. It showcases the interoperability between NumPy's `ml_dtypes` and Lance's extension arrays, facilitating data integration. 
+ +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/arrays.md#_snippet_2 + +LANGUAGE: python +CODE: +``` +import numpy as np +from ml_dtypes import bfloat16 +from lance.arrow import PandasBFloat16Array, BFloat16Array + +np_array = np.array([1.1, 2.1, 3.4], dtype=bfloat16) +PandasBFloat16Array.from_numpy(np_array) +# +# [1.1015625, 2.09375, 3.40625] +# Length: 3, dtype: lance.bfloat16 +BFloat16Array.from_numpy(np_array) +# +# [ +# 1.1015625, +# 2.09375, +# 3.40625 +# ] +``` + +---------------------------------------- + +TITLE: Rename Nested Columns in LanceDB Dataset +DESCRIPTION: This snippet demonstrates how to rename nested columns within a LanceDB dataset using `lance.LanceDataset.alter_columns`. It shows how to specify nested paths using dot notation (e.g., 'meta.id') and verifies the renaming by printing the dataset's content. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/data_evolution.md#_snippet_6 + +LANGUAGE: python +CODE: +``` +data = [ + {"meta": {"id": 1, "name": "Alice"}}, + {"meta": {"id": 2, "name": "Bob"}} +] +schema = pa.schema([ + ("meta", pa.struct([ + ("id", pa.int32()), + ("name", pa.string()), + ])) +]) +dataset = lance.write_dataset(data, "nested_rename") +dataset.alter_columns({"path": "meta.id", "name": "new_id"}) +print(dataset.to_table().to_pandas()) +# meta +# 0 {'new_id': 1, 'name': 'Alice'} +# 1 {'new_id': 2, 'name': 'Bob'} +``` + +---------------------------------------- + +TITLE: Delete Rows from Lance Dataset by SQL Filter +DESCRIPTION: Explains how to delete rows from a Lance dataset using a SQL-like filter expression with the `LanceDataset.delete` method. Note that this operation creates a new version of the dataset, requiring it to be reopened to see changes. + +SOURCE: https://github.com/lancedb/lance/blob/main/docs/src/guide/read_and_write.md#_snippet_3 + +LANGUAGE: python +CODE: +``` +import lance + +dataset = lance.dataset("./alice_and_bob.lance") +dataset.delete("name = 'Bob'") +dataset2 = lance.dataset("./alice_and_bob.lance") +print(dataset2.to_table().to_pandas()) +# name age +# 0 Alice 20 +``` \ No newline at end of file diff --git a/herodb/specs/backgroundinfo/redb.md b/specs/backgroundinfo/redb.md similarity index 100% rename from herodb/specs/backgroundinfo/redb.md rename to specs/backgroundinfo/redb.md diff --git a/herodb/specs/backgroundinfo/redis_basic_client.md b/specs/backgroundinfo/redis_basic_client.md similarity index 100% rename from herodb/specs/backgroundinfo/redis_basic_client.md rename to specs/backgroundinfo/redis_basic_client.md diff --git a/herodb/specs/backgroundinfo/redis_basics.md b/specs/backgroundinfo/redis_basics.md similarity index 100% rename from herodb/specs/backgroundinfo/redis_basics.md rename to specs/backgroundinfo/redis_basics.md diff --git a/herodb/specs/backgroundinfo/redis_hset_functions.md b/specs/backgroundinfo/redis_hset_functions.md similarity index 100% rename from herodb/specs/backgroundinfo/redis_hset_functions.md rename to specs/backgroundinfo/redis_hset_functions.md diff --git a/herodb/specs/backgroundinfo/redis_lists.md b/specs/backgroundinfo/redis_lists.md similarity index 100% rename from herodb/specs/backgroundinfo/redis_lists.md rename to specs/backgroundinfo/redis_lists.md diff --git a/herodb/specs/backgroundinfo/sled.md b/specs/backgroundinfo/sled.md similarity index 100% rename from herodb/specs/backgroundinfo/sled.md rename to specs/backgroundinfo/sled.md diff --git a/herodb/specs/backgroundinfo/tantivy.md b/specs/backgroundinfo/tantivy.md similarity 
index 100% rename from herodb/specs/backgroundinfo/tantivy.md rename to specs/backgroundinfo/tantivy.md diff --git a/specs/lance_implementation.md b/specs/lance_implementation.md new file mode 100644 index 0000000..ba1e881 --- /dev/null +++ b/specs/lance_implementation.md @@ -0,0 +1,735 @@ +Looking at your request to ensure embedding happens with an external embedding provider configured in Redis at `config:core:aiembed:url`, I'll modify the implementation to use an external embedding service instead of local models. + +## Updated Implementation with External Embedding Provider + +### 1. Update Dependencies in `Cargo.toml` + +```toml +[dependencies] +# ... existing dependencies ... +lance = "0.18" +lance-index = "0.18" +lance-linalg = "0.18" +arrow = "52" +arrow-array = "52" +arrow-schema = "52" +parquet = "52" +uuid = { version = "1.10", features = ["v4"] } +reqwest = { version = "0.11", features = ["json"] } +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +base64 = "0.22" +image = "0.25" +``` + +### 2. Create Enhanced Lance Module with External Embedding + +Create `src/lance_store.rs`: + +```rust +use std::collections::HashMap; +use std::path::PathBuf; +use std::sync::Arc; +use tokio::sync::RwLock; + +use arrow::array::{Float32Array, StringArray, BinaryArray, ArrayRef}; +use arrow::datatypes::{DataType, Field, Schema}; +use arrow::record_batch::RecordBatch; +use lance::dataset::{Dataset, WriteParams, WriteMode}; +use lance::index::vector::VectorIndexParams; +use lance_index::vector::pq::PQBuildParams; +use lance_index::vector::ivf::IvfBuildParams; + +use serde::{Deserialize, Serialize}; +use crate::error::DBError; +use crate::cmd::Protocol; + +#[derive(Debug, Serialize, Deserialize)] +struct EmbeddingRequest { + texts: Option>, + images: Option>, // base64 encoded + model: Option, +} + +#[derive(Debug, Serialize, Deserialize)] +struct EmbeddingResponse { + embeddings: Vec>, + model: String, + usage: Option>, +} + +pub struct LanceStore { + datasets: Arc>>>, + data_dir: PathBuf, + http_client: reqwest::Client, +} + +impl LanceStore { + pub async fn new(data_dir: PathBuf) -> Result { + // Create data directory if it doesn't exist + std::fs::create_dir_all(&data_dir) + .map_err(|e| DBError(format!("Failed to create Lance data directory: {}", e)))?; + + let http_client = reqwest::Client::builder() + .timeout(std::time::Duration::from_secs(30)) + .build() + .map_err(|e| DBError(format!("Failed to create HTTP client: {}", e)))?; + + Ok(Self { + datasets: Arc::new(RwLock::new(HashMap::new())), + data_dir, + http_client, + }) + } + + /// Get embedding service URL from Redis config + async fn get_embedding_url(&self, server: &crate::server::Server) -> Result { + // Get the embedding URL from Redis config + let key = "config:core:aiembed:url"; + + // Use HGET to retrieve the URL from Redis hash + let cmd = crate::cmd::Cmd::HGet { + key: key.to_string(), + field: "url".to_string(), + }; + + // Execute command to get the config + let result = cmd.run(server).await?; + + match result { + Protocol::BulkString(url) => Ok(url), + Protocol::SimpleString(url) => Ok(url), + Protocol::Nil => Err(DBError( + "Embedding service URL not configured. 
Set it with: HSET config:core:aiembed:url url ".to_string() + )), + _ => Err(DBError("Invalid embedding URL configuration".to_string())), + } + } + + /// Call external embedding service + async fn call_embedding_service( + &self, + server: &crate::server::Server, + texts: Option>, + images: Option>, + ) -> Result>, DBError> { + let url = self.get_embedding_url(server).await?; + + let request = EmbeddingRequest { + texts, + images, + model: None, // Let the service use its default + }; + + let response = self.http_client + .post(&url) + .json(&request) + .send() + .await + .map_err(|e| DBError(format!("Failed to call embedding service: {}", e)))?; + + if !response.status().is_success() { + let status = response.status(); + let error_text = response.text().await.unwrap_or_default(); + return Err(DBError(format!( + "Embedding service returned error {}: {}", + status, error_text + ))); + } + + let embedding_response: EmbeddingResponse = response + .json() + .await + .map_err(|e| DBError(format!("Failed to parse embedding response: {}", e)))?; + + Ok(embedding_response.embeddings) + } + + pub async fn embed_text( + &self, + server: &crate::server::Server, + texts: Vec + ) -> Result>, DBError> { + if texts.is_empty() { + return Ok(Vec::new()); + } + + self.call_embedding_service(server, Some(texts), None).await + } + + pub async fn embed_image( + &self, + server: &crate::server::Server, + image_bytes: Vec + ) -> Result, DBError> { + // Convert image bytes to base64 + let base64_image = base64::encode(&image_bytes); + + let embeddings = self.call_embedding_service( + server, + None, + Some(vec![base64_image]) + ).await?; + + embeddings.into_iter() + .next() + .ok_or_else(|| DBError("No embedding returned for image".to_string())) + } + + pub async fn create_dataset( + &self, + name: &str, + schema: Schema, + ) -> Result<(), DBError> { + let dataset_path = self.data_dir.join(format!("{}.lance", name)); + + // Create empty dataset with schema + let write_params = WriteParams { + mode: WriteMode::Create, + ..Default::default() + }; + + // Create an empty RecordBatch with the schema + let empty_batch = RecordBatch::new_empty(Arc::new(schema)); + let batches = vec![empty_batch]; + + let dataset = Dataset::write( + batches, + dataset_path.to_str().unwrap(), + Some(write_params) + ).await + .map_err(|e| DBError(format!("Failed to create dataset: {}", e)))?; + + let mut datasets = self.datasets.write().await; + datasets.insert(name.to_string(), Arc::new(dataset)); + + Ok(()) + } + + pub async fn write_vectors( + &self, + dataset_name: &str, + vectors: Vec>, + metadata: Option>>, + ) -> Result { + let dataset_path = self.data_dir.join(format!("{}.lance", dataset_name)); + + // Open or get cached dataset + let dataset = self.get_or_open_dataset(dataset_name).await?; + + // Build RecordBatch + let num_vectors = vectors.len(); + if num_vectors == 0 { + return Ok(0); + } + + let dim = vectors.first() + .ok_or_else(|| DBError("Empty vectors".to_string()))? 
+ .len(); + + // Flatten vectors + let flat_vectors: Vec = vectors.into_iter().flatten().collect(); + let vector_array = Float32Array::from(flat_vectors); + let vector_array = arrow::array::FixedSizeListArray::try_new_from_values( + vector_array, + dim as i32 + ).map_err(|e| DBError(format!("Failed to create vector array: {}", e)))?; + + let mut arrays: Vec = vec![Arc::new(vector_array)]; + let mut fields = vec![Field::new( + "vector", + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::Float32, true)), + dim as i32 + ), + false + )]; + + // Add metadata columns if provided + if let Some(metadata) = metadata { + for (key, values) in metadata { + if values.len() != num_vectors { + return Err(DBError(format!( + "Metadata field '{}' has {} values but expected {}", + key, values.len(), num_vectors + ))); + } + let array = StringArray::from(values); + arrays.push(Arc::new(array)); + fields.push(Field::new(&key, DataType::Utf8, true)); + } + } + + let schema = Arc::new(Schema::new(fields)); + let batch = RecordBatch::try_new(schema, arrays) + .map_err(|e| DBError(format!("Failed to create RecordBatch: {}", e)))?; + + // Append to dataset + let write_params = WriteParams { + mode: WriteMode::Append, + ..Default::default() + }; + + Dataset::write( + vec![batch], + dataset_path.to_str().unwrap(), + Some(write_params) + ).await + .map_err(|e| DBError(format!("Failed to write to dataset: {}", e)))?; + + // Refresh cached dataset + let mut datasets = self.datasets.write().await; + datasets.remove(dataset_name); + + Ok(num_vectors) + } + + pub async fn search_vectors( + &self, + dataset_name: &str, + query_vector: Vec, + k: usize, + nprobes: Option, + refine_factor: Option, + ) -> Result)>, DBError> { + let dataset = self.get_or_open_dataset(dataset_name).await?; + + // Build query + let mut query = dataset.scan(); + query = query.nearest( + "vector", + &query_vector, + k, + ).map_err(|e| DBError(format!("Failed to build search query: {}", e)))?; + + if let Some(nprobes) = nprobes { + query = query.nprobes(nprobes); + } + + if let Some(refine) = refine_factor { + query = query.refine_factor(refine); + } + + // Execute search + let results = query + .try_into_stream() + .await + .map_err(|e| DBError(format!("Failed to execute search: {}", e)))? + .try_collect::>() + .await + .map_err(|e| DBError(format!("Failed to collect results: {}", e)))?; + + // Process results + let mut output = Vec::new(); + for batch in results { + // Get distances + let distances = batch + .column_by_name("_distance") + .ok_or_else(|| DBError("No distance column".to_string()))? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| DBError("Invalid distance type".to_string()))?; + + // Get metadata + for i in 0..batch.num_rows() { + let distance = distances.value(i); + let mut metadata = HashMap::new(); + + for field in batch.schema().fields() { + if field.name() != "vector" && field.name() != "_distance" { + if let Some(col) = batch.column_by_name(field.name()) { + if let Some(str_array) = col.as_any().downcast_ref::() { + if !str_array.is_null(i) { + metadata.insert( + field.name().to_string(), + str_array.value(i).to_string() + ); + } + } + } + } + } + + output.push((distance, metadata)); + } + } + + Ok(output) + } + + pub async fn store_multimodal( + &self, + server: &crate::server::Server, + dataset_name: &str, + text: Option, + image_bytes: Option>, + metadata: HashMap, + ) -> Result { + // Generate ID + let id = uuid::Uuid::new_v4().to_string(); + + // Generate embeddings using external service + let embedding = if let Some(text) = text.as_ref() { + self.embed_text(server, vec![text.clone()]).await? + .into_iter() + .next() + .ok_or_else(|| DBError("No embedding returned".to_string()))? + } else if let Some(img) = image_bytes.as_ref() { + self.embed_image(server, img.clone()).await? + } else { + return Err(DBError("No text or image provided".to_string())); + }; + + // Prepare metadata + let mut full_metadata = metadata; + full_metadata.insert("id".to_string(), id.clone()); + if let Some(text) = text { + full_metadata.insert("text".to_string(), text); + } + if let Some(img) = image_bytes { + full_metadata.insert("image_base64".to_string(), base64::encode(img)); + } + + // Convert metadata to column vectors + let mut metadata_cols = HashMap::new(); + for (key, value) in full_metadata { + metadata_cols.insert(key, vec![value]); + } + + // Write to dataset + self.write_vectors(dataset_name, vec![embedding], Some(metadata_cols)).await?; + + Ok(id) + } + + pub async fn search_with_text( + &self, + server: &crate::server::Server, + dataset_name: &str, + query_text: String, + k: usize, + nprobes: Option, + refine_factor: Option, + ) -> Result)>, DBError> { + // Embed the query text using external service + let embeddings = self.embed_text(server, vec![query_text]).await?; + let query_vector = embeddings.into_iter() + .next() + .ok_or_else(|| DBError("No embedding returned for query".to_string()))?; + + // Search with the embedding + self.search_vectors(dataset_name, query_vector, k, nprobes, refine_factor).await + } + + pub async fn create_index( + &self, + dataset_name: &str, + index_type: &str, + num_partitions: Option, + num_sub_vectors: Option, + ) -> Result<(), DBError> { + let dataset = self.get_or_open_dataset(dataset_name).await?; + + let mut params = VectorIndexParams::default(); + + match index_type.to_uppercase().as_str() { + "IVF_PQ" => { + params.ivf = IvfBuildParams { + num_partitions: num_partitions.unwrap_or(256), + ..Default::default() + }; + params.pq = PQBuildParams { + num_sub_vectors: num_sub_vectors.unwrap_or(16), + ..Default::default() + }; + } + _ => return Err(DBError(format!("Unsupported index type: {}", index_type))), + } + + dataset.create_index( + &["vector"], + lance::index::IndexType::Vector, + None, + ¶ms, + true + ).await + .map_err(|e| DBError(format!("Failed to create index: {}", e)))?; + + Ok(()) + } + + async fn get_or_open_dataset(&self, name: &str) -> Result, DBError> { + let mut datasets = self.datasets.write().await; + + if let Some(dataset) = datasets.get(name) { + return Ok(dataset.clone()); + } + + let dataset_path = 
self.data_dir.join(format!("{}.lance", name)); + if !dataset_path.exists() { + return Err(DBError(format!("Dataset '{}' does not exist", name))); + } + + let dataset = Dataset::open(dataset_path.to_str().unwrap()) + .await + .map_err(|e| DBError(format!("Failed to open dataset: {}", e)))?; + + let dataset = Arc::new(dataset); + datasets.insert(name.to_string(), dataset.clone()); + + Ok(dataset) + } + + pub async fn list_datasets(&self) -> Result, DBError> { + let mut datasets = Vec::new(); + + let entries = std::fs::read_dir(&self.data_dir) + .map_err(|e| DBError(format!("Failed to read data directory: {}", e)))?; + + for entry in entries { + let entry = entry.map_err(|e| DBError(format!("Failed to read entry: {}", e)))?; + let path = entry.path(); + + if path.is_dir() { + if let Some(name) = path.file_name() { + if let Some(name_str) = name.to_str() { + if name_str.ends_with(".lance") { + let dataset_name = name_str.trim_end_matches(".lance"); + datasets.push(dataset_name.to_string()); + } + } + } + } + } + + Ok(datasets) + } + + pub async fn drop_dataset(&self, name: &str) -> Result<(), DBError> { + // Remove from cache + let mut datasets = self.datasets.write().await; + datasets.remove(name); + + // Delete from disk + let dataset_path = self.data_dir.join(format!("{}.lance", name)); + if dataset_path.exists() { + std::fs::remove_dir_all(dataset_path) + .map_err(|e| DBError(format!("Failed to delete dataset: {}", e)))?; + } + + Ok(()) + } + + pub async fn get_dataset_info(&self, name: &str) -> Result, DBError> { + let dataset = self.get_or_open_dataset(name).await?; + + let mut info = HashMap::new(); + info.insert("name".to_string(), name.to_string()); + info.insert("version".to_string(), dataset.version().to_string()); + info.insert("num_rows".to_string(), dataset.count_rows().await?.to_string()); + + // Get schema info + let schema = dataset.schema(); + let fields: Vec = schema.fields() + .iter() + .map(|f| format!("{}:{}", f.name(), f.data_type())) + .collect(); + info.insert("schema".to_string(), fields.join(", ")); + + Ok(info) + } +} +``` + +### 3. Update Command Implementations + +Update the command implementations to pass the server reference for embedding service access: + +```rust +// In cmd.rs, update the lance command implementations + +async fn lance_store_cmd( + server: &Server, + dataset: &str, + text: Option, + image_base64: Option, + metadata: HashMap, +) -> Result { + let lance_store = server.lance_store()?; + + // Decode image if provided + let image_bytes = if let Some(b64) = image_base64 { + Some(base64::decode(b64).map_err(|e| + DBError(format!("Invalid base64 image: {}", e)))?) 
+ } else { + None + }; + + // Pass server reference for embedding service access + let id = lance_store.store_multimodal( + server, // Pass server to access Redis config + dataset, + text, + image_bytes, + metadata, + ).await?; + + Ok(Protocol::BulkString(id)) +} + +async fn lance_embed_text_cmd( + server: &Server, + texts: &[String], +) -> Result { + let lance_store = server.lance_store()?; + + // Pass server reference for embedding service access + let embeddings = lance_store.embed_text(server, texts.to_vec()).await?; + + // Return as array of vectors + let mut output = Vec::new(); + for embedding in embeddings { + let vector_str = format!("[{}]", + embedding.iter() + .map(|f| f.to_string()) + .collect::>() + .join(",") + ); + output.push(Protocol::BulkString(vector_str)); + } + + Ok(Protocol::Array(output)) +} + +async fn lance_search_text_cmd( + server: &Server, + dataset: &str, + query_text: &str, + k: usize, + nprobes: Option, + refine_factor: Option, +) -> Result { + let lance_store = server.lance_store()?; + + // Search using text query (will be embedded automatically) + let results = lance_store.search_with_text( + server, + dataset, + query_text.to_string(), + k, + nprobes, + refine_factor, + ).await?; + + // Format results + let mut output = Vec::new(); + for (distance, metadata) in results { + let metadata_json = serde_json::to_string(&metadata) + .unwrap_or_else(|_| "{}".to_string()); + + output.push(Protocol::Array(vec![ + Protocol::BulkString(distance.to_string()), + Protocol::BulkString(metadata_json), + ])); + } + + Ok(Protocol::Array(output)) +} + +// Add new command for text-based search +pub enum Cmd { + // ... existing commands ... + LanceSearchText { + dataset: String, + query_text: String, + k: usize, + nprobes: Option, + refine_factor: Option, + }, +} +``` + +## Usage Examples + +### 1. Configure the Embedding Service + +First, users need to configure the embedding service URL: + +```bash +# Configure the embedding service endpoint +redis-cli> HSET config:core:aiembed:url url "http://localhost:8000/embeddings" +OK + +# Or use a cloud service +redis-cli> HSET config:core:aiembed:url url "https://api.openai.com/v1/embeddings" +OK +``` + +### 2. Use Lance Commands with Automatic External Embedding + +```bash +# Create a dataset +redis-cli> LANCE.CREATE products DIM 1536 SCHEMA name:string price:float category:string +OK + +# Store text with automatic embedding (calls external service) +redis-cli> LANCE.STORE products TEXT "Wireless noise-canceling headphones with 30-hour battery" name:AirPods price:299.99 category:Electronics +"uuid-123-456" + +# Search using text query (automatically embeds the query) +redis-cli> LANCE.SEARCH.TEXT products "best headphones for travel" K 5 +1) "0.92" +2) "{\"id\":\"uuid-123\",\"name\":\"AirPods\",\"price\":\"299.99\"}" + +# Get embeddings directly +redis-cli> LANCE.EMBED.TEXT "This text will be embedded" +1) "[0.123, 0.456, 0.789, ...]" +``` + +## External Embedding Service API Specification + +The external embedding service should accept POST requests with this format: + +```json +// Request +{ + "texts": ["text1", "text2"], // Optional + "images": ["base64_img1"], // Optional + "model": "text-embedding-ada-002" // Optional +} + +// Response +{ + "embeddings": [[0.1, 0.2, ...], [0.3, 0.4, ...]], + "model": "text-embedding-ada-002", + "usage": { + "prompt_tokens": 100, + "total_tokens": 100 + } +} +``` + +## Error Handling + +The implementation includes comprehensive error handling: + +1. 
**Missing Configuration**: Clear error message if embedding URL not configured +2. **Service Failures**: Graceful handling of embedding service errors +3. **Timeout Protection**: 30-second timeout for embedding requests +4. **Retry Logic**: Could be added for resilience + +## Benefits of This Approach + +1. **Flexibility**: Supports any embedding service with compatible API +2. **Cost Control**: Use your preferred embedding provider +3. **Scalability**: Embedding service can be scaled independently +4. **Consistency**: All embeddings use the same configured service +5. **Security**: API keys and endpoints stored securely in Redis + +This implementation ensures that all embedding operations go through the external service configured in Redis, providing a clean separation between the vector database functionality and the embedding generation. + + +TODO EXTRA: + +- secret for the embedding service API key + diff --git a/herodb/src/age.rs b/src/age.rs similarity index 100% rename from herodb/src/age.rs rename to src/age.rs diff --git a/herodb/src/cmd.rs b/src/cmd.rs similarity index 100% rename from herodb/src/cmd.rs rename to src/cmd.rs diff --git a/herodb/src/crypto.rs b/src/crypto.rs similarity index 100% rename from herodb/src/crypto.rs rename to src/crypto.rs diff --git a/herodb/src/error.rs b/src/error.rs similarity index 100% rename from herodb/src/error.rs rename to src/error.rs diff --git a/herodb/src/lib.rs b/src/lib.rs similarity index 100% rename from herodb/src/lib.rs rename to src/lib.rs diff --git a/herodb/src/main.rs b/src/main.rs similarity index 100% rename from herodb/src/main.rs rename to src/main.rs diff --git a/herodb/src/options.rs b/src/options.rs similarity index 100% rename from herodb/src/options.rs rename to src/options.rs diff --git a/herodb/src/protocol.rs b/src/protocol.rs similarity index 100% rename from herodb/src/protocol.rs rename to src/protocol.rs diff --git a/herodb/src/server.rs b/src/server.rs similarity index 100% rename from herodb/src/server.rs rename to src/server.rs diff --git a/herodb/src/storage/mod.rs b/src/storage/mod.rs similarity index 100% rename from herodb/src/storage/mod.rs rename to src/storage/mod.rs diff --git a/herodb/src/storage/storage_basic.rs b/src/storage/storage_basic.rs similarity index 100% rename from herodb/src/storage/storage_basic.rs rename to src/storage/storage_basic.rs diff --git a/herodb/src/storage/storage_extra.rs b/src/storage/storage_extra.rs similarity index 100% rename from herodb/src/storage/storage_extra.rs rename to src/storage/storage_extra.rs diff --git a/herodb/src/storage/storage_hset.rs b/src/storage/storage_hset.rs similarity index 100% rename from herodb/src/storage/storage_hset.rs rename to src/storage/storage_hset.rs diff --git a/herodb/src/storage/storage_lists.rs b/src/storage/storage_lists.rs similarity index 100% rename from herodb/src/storage/storage_lists.rs rename to src/storage/storage_lists.rs diff --git a/herodb/src/storage_sled/mod.rs b/src/storage_sled/mod.rs similarity index 100% rename from herodb/src/storage_sled/mod.rs rename to src/storage_sled/mod.rs diff --git a/herodb/src/storage_trait.rs b/src/storage_trait.rs similarity index 100% rename from herodb/src/storage_trait.rs rename to src/storage_trait.rs diff --git a/herodb/test_herodb.sh b/test_herodb.sh similarity index 100% rename from herodb/test_herodb.sh rename to test_herodb.sh diff --git a/herodb/tests/debug_hset.rs b/tests/debug_hset.rs similarity index 100% rename from herodb/tests/debug_hset.rs rename to 
tests/debug_hset.rs diff --git a/herodb/tests/debug_hset_simple.rs b/tests/debug_hset_simple.rs similarity index 100% rename from herodb/tests/debug_hset_simple.rs rename to tests/debug_hset_simple.rs diff --git a/herodb/tests/debug_protocol.rs b/tests/debug_protocol.rs similarity index 100% rename from herodb/tests/debug_protocol.rs rename to tests/debug_protocol.rs diff --git a/herodb/tests/redis_integration_tests.rs b/tests/redis_integration_tests.rs similarity index 100% rename from herodb/tests/redis_integration_tests.rs rename to tests/redis_integration_tests.rs diff --git a/herodb/tests/redis_tests.rs b/tests/redis_tests.rs similarity index 100% rename from herodb/tests/redis_tests.rs rename to tests/redis_tests.rs diff --git a/herodb/tests/simple_integration_test.rs b/tests/simple_integration_test.rs similarity index 100% rename from herodb/tests/simple_integration_test.rs rename to tests/simple_integration_test.rs diff --git a/herodb/tests/simple_redis_test.rs b/tests/simple_redis_test.rs similarity index 100% rename from herodb/tests/simple_redis_test.rs rename to tests/simple_redis_test.rs diff --git a/herodb/tests/usage_suite.rs b/tests/usage_suite.rs similarity index 100% rename from herodb/tests/usage_suite.rs rename to tests/usage_suite.rs From ff4ea1d844acd82862c8bce4df4424cf331c458e Mon Sep 17 00:00:00 2001 From: despiegk Date: Sat, 23 Aug 2025 05:04:37 +0200 Subject: [PATCH 2/4] ... --- examples/README.md | 171 ++++++++++++++++++++++++++++++++ examples/age_bash_demo.sh | 71 +++++++++++++ examples/age_persist_demo.rs | 83 ++++++++++++++++ examples/simple_demo.sh | 186 +++++++++++++++++++++++++++++++++++ 4 files changed, 511 insertions(+) create mode 100644 examples/README.md create mode 100755 examples/age_bash_demo.sh create mode 100644 examples/age_persist_demo.rs create mode 100644 examples/simple_demo.sh diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..a36b993 --- /dev/null +++ b/examples/README.md @@ -0,0 +1,171 @@ +# HeroDB Tantivy Search Examples + +This directory contains examples demonstrating HeroDB's full-text search capabilities powered by Tantivy. + +## Tantivy Search Demo (Bash Script) + +### Overview +The `tantivy_search_demo.sh` script provides a comprehensive demonstration of HeroDB's search functionality using Redis commands. It showcases various search scenarios including basic text search, filtering, sorting, geographic queries, and more. + +### Prerequisites +1. **HeroDB Server**: The server must be running on port 6381 +2. **Redis CLI**: The `redis-cli` tool must be installed and available in your PATH + +### Running the Demo + +#### Step 1: Start HeroDB Server +```bash +# From the project root directory +cargo run -- --port 6381 +``` + +#### Step 2: Run the Demo (in a new terminal) +```bash +# From the project root directory +./examples/tantivy_search_demo.sh +``` + +### What the Demo Covers + +The script demonstrates 15 different search scenarios: + +1. **Index Creation** - Creating a search index with various field types +2. **Data Insertion** - Adding sample products to the index +3. **Basic Text Search** - Simple keyword searches +4. **Filtered Search** - Combining text search with category filters +5. **Numeric Range Search** - Finding products within price ranges +6. **Sorting Results** - Ordering results by different fields +7. **Limited Results** - Pagination and result limiting +8. **Complex Queries** - Multi-field searches with sorting +9. **Geographic Search** - Location-based queries +10. 
**Index Information** - Getting statistics about the search index +11. **Search Comparison** - Tantivy vs simple pattern matching +12. **Fuzzy Search** - Typo tolerance and approximate matching +13. **Phrase Search** - Exact phrase matching +14. **Boolean Queries** - AND, OR, NOT operators +15. **Cleanup** - Removing test data + +### Sample Data + +The demo uses a product catalog with the following fields: +- **title** (TEXT) - Product name with higher search weight +- **description** (TEXT) - Detailed product description +- **category** (TAG) - Comma-separated categories +- **price** (NUMERIC) - Product price for range queries +- **rating** (NUMERIC) - Customer rating for sorting +- **location** (GEO) - Geographic coordinates for location searches + +### Key Redis Commands Demonstrated + +#### Index Management +```bash +# Create search index +FT.CREATE product_catalog ON HASH PREFIX 1 product: SCHEMA title TEXT WEIGHT 2.0 SORTABLE description TEXT category TAG SEPARATOR , price NUMERIC SORTABLE rating NUMERIC SORTABLE location GEO + +# Get index information +FT.INFO product_catalog + +# Drop index +FT.DROPINDEX product_catalog +``` + +#### Search Queries +```bash +# Basic text search +FT.SEARCH product_catalog wireless + +# Filtered search +FT.SEARCH product_catalog 'organic @category:{food}' + +# Numeric range +FT.SEARCH product_catalog '@price:[50 150]' + +# Sorted results +FT.SEARCH product_catalog '@category:{electronics}' SORTBY price ASC + +# Geographic search +FT.SEARCH product_catalog '@location:[37.7749 -122.4194 50 km]' + +# Boolean queries +FT.SEARCH product_catalog 'wireless AND audio' +FT.SEARCH product_catalog 'coffee OR tea' + +# Phrase search +FT.SEARCH product_catalog '"noise canceling"' +``` + +### Interactive Features + +The demo script includes: +- **Colored output** for better readability +- **Pause between steps** to review results +- **Error handling** with clear error messages +- **Automatic cleanup** of test data +- **Progress indicators** showing what each step demonstrates + +### Troubleshooting + +#### HeroDB Not Running +``` +✗ HeroDB is not running on port 6381 +ℹ Please start HeroDB with: cargo run -- --port 6381 +``` +**Solution**: Start the HeroDB server in a separate terminal. + +#### Redis CLI Not Found +``` +redis-cli: command not found +``` +**Solution**: Install Redis tools or use an alternative Redis client. + +#### Connection Refused +``` +Could not connect to Redis at localhost:6381: Connection refused +``` +**Solution**: Ensure HeroDB is running and listening on the correct port. 
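+
+#### Quick Connectivity Check
+If it is still unclear whether the server is reachable, a minimal end-to-end check (assuming the demo port 6381 used throughout these examples) is:
+
+```bash
+# A PONG reply confirms HeroDB is up and answering commands on the demo port
+redis-cli -h localhost -p 6381 ping
+```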
+ +### Manual Testing + +You can also run individual commands manually: + +```bash +# Connect to HeroDB +redis-cli -h localhost -p 6381 + +# Create a simple index +FT.CREATE myindex ON HASH SCHEMA title TEXT description TEXT + +# Add a document +HSET doc:1 title "Hello World" description "This is a test document" + +# Search +FT.SEARCH myindex hello +``` + +### Performance Notes + +- **Indexing**: Documents are indexed in real-time as they're added +- **Search Speed**: Full-text search is much faster than pattern matching on large datasets +- **Memory Usage**: Tantivy indexes are memory-efficient and disk-backed +- **Scalability**: Supports millions of documents with sub-second search times + +### Advanced Features + +The demo showcases advanced Tantivy features: +- **Relevance Scoring** - Results ranked by relevance +- **Fuzzy Matching** - Handles typos and approximate matches +- **Field Weighting** - Title field has higher search weight +- **Multi-field Search** - Search across multiple fields simultaneously +- **Geographic Queries** - Distance-based location searches +- **Numeric Ranges** - Efficient range queries on numeric fields +- **Tag Filtering** - Fast categorical filtering + +### Next Steps + +After running the demo, explore: +1. **Custom Schemas** - Define your own field types and configurations +2. **Large Datasets** - Test with thousands or millions of documents +3. **Real Applications** - Integrate search into your applications +4. **Performance Tuning** - Optimize for your specific use case + +For more information, see the [search documentation](../herodb/docs/search.md). \ No newline at end of file diff --git a/examples/age_bash_demo.sh b/examples/age_bash_demo.sh new file mode 100755 index 0000000..07b54c8 --- /dev/null +++ b/examples/age_bash_demo.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +# Start the herodb server in the background +echo "Starting herodb server..." +cargo run -p herodb -- --dir /tmp/herodb_age_test --port 6382 --debug --encryption-key "testkey" & +SERVER_PID=$! +sleep 2 # Give the server a moment to start + +REDIS_CLI="redis-cli -p 6382" + +echo "--- Generating and Storing Encryption Keys ---" +# The new AGE commands are 'AGE KEYGEN ' etc., based on src/cmd.rs +# This script uses older commands like 'AGE.GENERATE_KEYPAIR alice' +# The demo script needs to be updated to match the implemented commands. +# Let's assume the commands in the script are what's expected for now, +# but note this discrepancy. The new commands are AGE KEYGEN etc. +# The script here uses a different syntax not found in src/cmd.rs like 'AGE.GENERATE_KEYPAIR'. +# For now, I will modify the script to fit the actual implementation. + +echo "--- Generating and Storing Encryption Keys ---" +$REDIS_CLI AGE KEYGEN alice +$REDIS_CLI AGE KEYGEN bob + +echo "--- Encrypting and Decrypting a Message ---" +MESSAGE="Hello, AGE encryption!" +# The new logic stores keys internally and does not expose a command to get the public key. +# We will encrypt by name. +ALICE_PUBKEY_REPLY=$($REDIS_CLI AGE KEYGEN alice | head -n 2 | tail -n 1) +echo "Alice's Public Key: $ALICE_PUBKEY_REPLY" + +echo "Encrypting message: '$MESSAGE' with Alice's identity..." +# AGE.ENCRYPT recipient message. But since we use persistent keys, let's use ENCRYPTNAME +CIPHERTEXT=$($REDIS_CLI AGE ENCRYPTNAME alice "$MESSAGE") +echo "Ciphertext: $CIPHERTEXT" + +echo "Decrypting ciphertext with Alice's private key..." 
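+# DECRYPTNAME looks up the identity persisted under the name "alice" on the
+# server side, so only the key name and the ciphertext string are passed here.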
+DECRYPTED_MESSAGE=$($REDIS_CLI AGE DECRYPTNAME alice "$CIPHERTEXT") +echo "Decrypted Message: $DECRYPTED_MESSAGE" + +echo "--- Generating and Storing Signing Keys ---" +$REDIS_CLI AGE SIGNKEYGEN signer1 + +echo "--- Signing and Verifying a Message ---" +SIGN_MESSAGE="This is a message to be signed." +# Similar to above, we don't have GET_SIGN_PUBKEY. We will verify by name. + +echo "Signing message: '$SIGN_MESSAGE' with signer1's private key..." +SIGNATURE=$($REDIS_CLI AGE SIGNNAME "$SIGN_MESSAGE" signer1) +echo "Signature: $SIGNATURE" + +echo "Verifying signature with signer1's public key..." +VERIFY_RESULT=$($REDIS_CLI AGE VERIFYNAME signer1 "$SIGN_MESSAGE" "$SIGNATURE") +echo "Verification Result: $VERIFY_RESULT" + + +# There is no DELETE_KEYPAIR command in the implementation +echo "--- Cleaning up keys (manual in herodb) ---" +# We would use DEL for age:key:alice, etc. +$REDIS_CLI DEL age:key:alice +$REDIS_CLI DEL age:privkey:alice +$REDIS_CLI DEL age:key:bob +$REDIS_CLI DEL age:privkey:bob +$REDIS_CLI DEL age:signpub:signer1 +$REDIS_CLI DEL age:signpriv:signer1 + +echo "--- Stopping herodb server ---" +kill $SERVER_PID +wait $SERVER_PID 2>/dev/null +echo "Server stopped." + +echo "Bash demo complete." \ No newline at end of file diff --git a/examples/age_persist_demo.rs b/examples/age_persist_demo.rs new file mode 100644 index 0000000..9caf3bd --- /dev/null +++ b/examples/age_persist_demo.rs @@ -0,0 +1,83 @@ +use std::io::{Read, Write}; +use std::net::TcpStream; + +// Minimal RESP helpers +fn arr(parts: &[&str]) -> String { + let mut out = format!("*{}\r\n", parts.len()); + for p in parts { + out.push_str(&format!("${}\r\n{}\r\n", p.len(), p)); + } + out +} +fn read_reply(s: &mut TcpStream) -> String { + let mut buf = [0u8; 65536]; + let n = s.read(&mut buf).unwrap(); + String::from_utf8_lossy(&buf[..n]).to_string() +} +fn parse_two_bulk(reply: &str) -> Option<(String,String)> { + let mut lines = reply.split("\r\n"); + if lines.next()? 
!= "*2" { return None; } + let _n = lines.next()?; + let a = lines.next()?.to_string(); + let _m = lines.next()?; + let b = lines.next()?.to_string(); + Some((a,b)) +} +fn parse_bulk(reply: &str) -> Option { + let mut lines = reply.split("\r\n"); + let hdr = lines.next()?; + if !hdr.starts_with('$') { return None; } + Some(lines.next()?.to_string()) +} +fn parse_simple(reply: &str) -> Option { + let mut lines = reply.split("\r\n"); + let hdr = lines.next()?; + if !hdr.starts_with('+') { return None; } + Some(hdr[1..].to_string()) +} + +fn main() { + let mut args = std::env::args().skip(1); + let host = args.next().unwrap_or_else(|| "127.0.0.1".into()); + let port = args.next().unwrap_or_else(|| "6379".into()); + let addr = format!("{host}:{port}"); + println!("Connecting to {addr}..."); + let mut s = TcpStream::connect(addr).expect("connect"); + + // Generate & persist X25519 enc keys under name "alice" + s.write_all(arr(&["age","keygen","alice"]).as_bytes()).unwrap(); + let (_alice_recip, _alice_ident) = parse_two_bulk(&read_reply(&mut s)).expect("gen enc"); + + // Generate & persist Ed25519 signing key under name "signer" + s.write_all(arr(&["age","signkeygen","signer"]).as_bytes()).unwrap(); + let (_verify, _secret) = parse_two_bulk(&read_reply(&mut s)).expect("gen sign"); + + // Encrypt by name + let msg = "hello from persistent keys"; + s.write_all(arr(&["age","encryptname","alice", msg]).as_bytes()).unwrap(); + let ct_b64 = parse_bulk(&read_reply(&mut s)).expect("ct b64"); + println!("ciphertext b64: {}", ct_b64); + + // Decrypt by name + s.write_all(arr(&["age","decryptname","alice", &ct_b64]).as_bytes()).unwrap(); + let pt = parse_bulk(&read_reply(&mut s)).expect("pt"); + assert_eq!(pt, msg); + println!("decrypted ok"); + + // Sign by name + s.write_all(arr(&["age","signname","signer", msg]).as_bytes()).unwrap(); + let sig_b64 = parse_bulk(&read_reply(&mut s)).expect("sig b64"); + + // Verify by name + s.write_all(arr(&["age","verifyname","signer", msg, &sig_b64]).as_bytes()).unwrap(); + let ok = parse_simple(&read_reply(&mut s)).expect("verify"); + assert_eq!(ok, "1"); + println!("signature verified"); + + // List names + s.write_all(arr(&["age","list"]).as_bytes()).unwrap(); + let list = read_reply(&mut s); + println!("LIST -> {list}"); + + println!("✔ persistent AGE workflow complete."); +} \ No newline at end of file diff --git a/examples/simple_demo.sh b/examples/simple_demo.sh new file mode 100644 index 0000000..801f29e --- /dev/null +++ b/examples/simple_demo.sh @@ -0,0 +1,186 @@ +#!/bin/bash + +# Simple HeroDB Demo - Basic Redis Commands +# This script demonstrates basic Redis functionality that's currently implemented + +set -e # Exit on any error + +# Configuration +REDIS_HOST="localhost" +REDIS_PORT="6381" +REDIS_CLI="redis-cli -h $REDIS_HOST -p $REDIS_PORT" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +BLUE='\033[0;34m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Function to print colored output +print_header() { + echo -e "${BLUE}=== $1 ===${NC}" +} + +print_success() { + echo -e "${GREEN}✓ $1${NC}" +} + +print_info() { + echo -e "${YELLOW}ℹ $1${NC}" +} + +print_error() { + echo -e "${RED}✗ $1${NC}" +} + +# Function to check if HeroDB is running +check_herodb() { + print_info "Checking if HeroDB is running on port $REDIS_PORT..." + if ! 
$REDIS_CLI ping > /dev/null 2>&1; then + print_error "HeroDB is not running on port $REDIS_PORT" + print_info "Please start HeroDB with: cargo run -- --port $REDIS_PORT" + exit 1 + fi + print_success "HeroDB is running and responding" +} + +# Function to execute Redis command with error handling +execute_cmd() { + local cmd="$1" + local description="$2" + + echo -e "${YELLOW}Command:${NC} $cmd" + if result=$($REDIS_CLI $cmd 2>&1); then + echo -e "${GREEN}Result:${NC} $result" + return 0 + else + print_error "Failed: $description" + echo "Error: $result" + return 1 + fi +} + +# Main demo function +main() { + clear + print_header "HeroDB Basic Functionality Demo" + echo "This demo shows basic Redis commands that are currently implemented" + echo "HeroDB runs on port $REDIS_PORT (instead of Redis default 6379)" + echo + + # Check if HeroDB is running + check_herodb + echo + + print_header "Step 1: Basic Key-Value Operations" + + execute_cmd "SET greeting 'Hello HeroDB!'" "Setting a simple key-value pair" + echo + execute_cmd "GET greeting" "Getting the value" + echo + execute_cmd "SET counter 42" "Setting a numeric value" + echo + execute_cmd "INCR counter" "Incrementing the counter" + echo + execute_cmd "GET counter" "Getting the incremented value" + echo + + print_header "Step 2: Hash Operations" + + execute_cmd "HSET user:1 name 'John Doe' email 'john@example.com' age 30" "Setting hash fields" + echo + execute_cmd "HGET user:1 name" "Getting a specific field" + echo + execute_cmd "HGETALL user:1" "Getting all fields" + echo + execute_cmd "HLEN user:1" "Getting hash length" + echo + + print_header "Step 3: List Operations" + + execute_cmd "LPUSH tasks 'Write code' 'Test code' 'Deploy code'" "Adding items to list" + echo + execute_cmd "LLEN tasks" "Getting list length" + echo + execute_cmd "LRANGE tasks 0 -1" "Getting all list items" + echo + execute_cmd "LPOP tasks" "Popping from left" + echo + execute_cmd "LRANGE tasks 0 -1" "Checking remaining items" + echo + + print_header "Step 4: Key Management" + + execute_cmd "KEYS *" "Listing all keys" + echo + execute_cmd "EXISTS greeting" "Checking if key exists" + echo + execute_cmd "TYPE user:1" "Getting key type" + echo + execute_cmd "DBSIZE" "Getting database size" + echo + + print_header "Step 5: Expiration" + + execute_cmd "SET temp_key 'temporary value'" "Setting temporary key" + echo + execute_cmd "EXPIRE temp_key 5" "Setting 5 second expiration" + echo + execute_cmd "TTL temp_key" "Checking time to live" + echo + print_info "Waiting 2 seconds..." 
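+    # TTL reports the remaining lifetime in seconds and counts down in real time;
+    # after 2 of the 5 seconds have elapsed it should show roughly 3, and a reply
+    # of -2 would mean the key has already expired and been removed.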
+ sleep 2 + execute_cmd "TTL temp_key" "Checking TTL again" + echo + + print_header "Step 6: Multiple Operations" + + execute_cmd "MSET key1 'value1' key2 'value2' key3 'value3'" "Setting multiple keys" + echo + execute_cmd "MGET key1 key2 key3" "Getting multiple values" + echo + execute_cmd "DEL key1 key2" "Deleting multiple keys" + echo + execute_cmd "EXISTS key1 key2 key3" "Checking existence of multiple keys" + echo + + print_header "Step 7: Search Commands (Placeholder)" + print_info "Testing FT.CREATE command (currently returns placeholder response)" + + execute_cmd "FT.CREATE test_index SCHEMA title TEXT description TEXT" "Creating search index" + echo + + print_header "Step 8: Server Information" + + execute_cmd "INFO" "Getting server information" + echo + execute_cmd "CONFIG GET dir" "Getting configuration" + echo + + print_header "Step 9: Cleanup" + + execute_cmd "FLUSHDB" "Clearing database" + echo + execute_cmd "DBSIZE" "Confirming database is empty" + echo + + print_header "Demo Summary" + echo "This demonstration showed:" + echo "• Basic key-value operations (GET, SET, INCR)" + echo "• Hash operations (HSET, HGET, HGETALL)" + echo "• List operations (LPUSH, LPOP, LRANGE)" + echo "• Key management (KEYS, EXISTS, TYPE, DEL)" + echo "• Expiration handling (EXPIRE, TTL)" + echo "• Multiple key operations (MSET, MGET)" + echo "• Server information commands" + echo + print_success "HeroDB basic functionality demo completed successfully!" + echo + print_info "Note: Full-text search (FT.*) commands are defined but not yet fully implemented" + print_info "To run HeroDB server: cargo run -- --port 6381" + print_info "To connect with redis-cli: redis-cli -h localhost -p 6381" +} + +# Run the demo +main "$@" \ No newline at end of file From f17b441ca12ebd76cf997e42ce55fc152b8d0504 Mon Sep 17 00:00:00 2001 From: despiegk Date: Sat, 23 Aug 2025 05:07:45 +0200 Subject: [PATCH 3/4] ... --- Cargo.lock | 2166 ---------------------------------------------------- 1 file changed, 2166 deletions(-) delete mode 100644 Cargo.lock diff --git a/Cargo.lock b/Cargo.lock deleted file mode 100644 index fa0053c..0000000 --- a/Cargo.lock +++ /dev/null @@ -1,2166 +0,0 @@ -# This file is automatically @generated by Cargo. -# It is not intended for manual editing. 
-version = 4 - -[[package]] -name = "addr2line" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" -dependencies = [ - "gimli", -] - -[[package]] -name = "adler2" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" - -[[package]] -name = "aead" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" -dependencies = [ - "crypto-common", - "generic-array", -] - -[[package]] -name = "age" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77de71da1ca673855aacea507a7aed363beb8934cf61b62364fc4b479d2e8cda" -dependencies = [ - "age-core", - "base64 0.21.7", - "bech32", - "chacha20poly1305", - "cookie-factory", - "hmac", - "i18n-embed", - "i18n-embed-fl", - "lazy_static", - "nom", - "pin-project", - "rand", - "rust-embed", - "scrypt", - "sha2", - "subtle", - "x25519-dalek", - "zeroize", -] - -[[package]] -name = "age-core" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5f11899bc2bbddd135edbc30c36b1924fa59d0746bb45beb5933fafe3fe509b" -dependencies = [ - "base64 0.21.7", - "chacha20poly1305", - "cookie-factory", - "hkdf", - "io_tee", - "nom", - "rand", - "secrecy", - "sha2", -] - -[[package]] -name = "anstream" -version = "0.6.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" -dependencies = [ - "anstyle", - "anstyle-parse", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "is_terminal_polyfill", - "utf8parse", -] - -[[package]] -name = "anstyle" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" - -[[package]] -name = "anstyle-parse" -version = "0.2.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" -dependencies = [ - "utf8parse", -] - -[[package]] -name = "anstyle-query" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" -dependencies = [ - "windows-sys 0.60.2", -] - -[[package]] -name = "anstyle-wincon" -version = "3.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" -dependencies = [ - "anstyle", - "once_cell_polyfill", - "windows-sys 0.60.2", -] - -[[package]] -name = "anyhow" -version = "1.0.99" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" - -[[package]] -name = "arc-swap" -version = "1.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" - -[[package]] -name = "async-trait" -version = "0.1.89" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "autocfg" -version = 
"1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" - -[[package]] -name = "backtrace" -version = "0.3.75" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" -dependencies = [ - "addr2line", - "cfg-if", - "libc", - "miniz_oxide", - "object", - "rustc-demangle", - "windows-targets 0.52.6", -] - -[[package]] -name = "base64" -version = "0.21.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" - -[[package]] -name = "base64" -version = "0.22.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" - -[[package]] -name = "base64ct" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba" - -[[package]] -name = "basic-toml" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba62675e8242a4c4e806d12f11d136e626e6c8361d6b829310732241652a178a" -dependencies = [ - "serde", -] - -[[package]] -name = "bech32" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d86b93f97252c47b41663388e6d155714a9d0c398b99f1005cbc5f978b29f445" - -[[package]] -name = "bincode" -version = "1.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" -dependencies = [ - "serde", -] - -[[package]] -name = "bitflags" -version = "1.3.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" - -[[package]] -name = "bitflags" -version = "2.9.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a65b545ab31d687cff52899d4890855fec459eb6afe0da6417b8a18da87aa29" - -[[package]] -name = "block-buffer" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" -dependencies = [ - "generic-array", -] - -[[package]] -name = "byteorder" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" - -[[package]] -name = "bytes" -version = "1.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" - -[[package]] -name = "cfg-if" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9555578bc9e57714c812a1f84e4fc5b4d21fcb063490c624de019f7464c91268" - -[[package]] -name = "chacha20" -version = "0.9.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3613f74bd2eac03dad61bd53dbe620703d4371614fe0bc3b9f04dd36fe4e818" -dependencies = [ - "cfg-if", - "cipher", - "cpufeatures", -] - -[[package]] -name = "chacha20poly1305" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "10cd79432192d1c0f4e1a0fef9527696cc039165d729fb41b3f4f4f354c2dc35" -dependencies = [ - "aead", - "chacha20", - "cipher", - "poly1305", - "zeroize", -] - -[[package]] -name = "cipher" 
-version = "0.4.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" -dependencies = [ - "crypto-common", - "inout", - "zeroize", -] - -[[package]] -name = "clap" -version = "4.5.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fc0e74a703892159f5ae7d3aac52c8e6c392f5ae5f359c70b5881d60aaac318" -dependencies = [ - "clap_builder", - "clap_derive", -] - -[[package]] -name = "clap_builder" -version = "4.5.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3e7f4214277f3c7aa526a59dd3fbe306a370daee1f8b7b8c987069cd8e888a8" -dependencies = [ - "anstream", - "anstyle", - "clap_lex", - "strsim 0.11.1", -] - -[[package]] -name = "clap_derive" -version = "4.5.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14cb31bb0a7d536caef2639baa7fad459e15c3144efefa6dbd1c84562c4739f6" -dependencies = [ - "heck", - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "clap_lex" -version = "0.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" - -[[package]] -name = "colorchoice" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" - -[[package]] -name = "combine" -version = "4.6.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" -dependencies = [ - "bytes", - "futures-core", - "memchr", - "pin-project-lite", - "tokio", - "tokio-util", -] - -[[package]] -name = "const-oid" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" - -[[package]] -name = "cookie-factory" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9885fa71e26b8ab7855e2ec7cae6e9b380edff76cd052e07c683a0319d51b3a2" -dependencies = [ - "futures", -] - -[[package]] -name = "cpufeatures" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" -dependencies = [ - "libc", -] - -[[package]] -name = "crc32fast" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "crossbeam-epoch" -version = "0.9.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" -dependencies = [ - "crossbeam-utils", -] - -[[package]] -name = "crossbeam-utils" -version = "0.8.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" - -[[package]] -name = "crypto-common" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" -dependencies = [ - "generic-array", - "rand_core", - "typenum", -] - -[[package]] -name = "curve25519-dalek" -version = "4.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" -dependencies = [ - "cfg-if", - "cpufeatures", - "curve25519-dalek-derive", - "digest", - "fiat-crypto", - "rustc_version", - "subtle", - "zeroize", -] - -[[package]] -name = "curve25519-dalek-derive" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "dashmap" -version = "5.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" -dependencies = [ - "cfg-if", - "hashbrown", - "lock_api", - "once_cell", - "parking_lot_core 0.9.11", -] - -[[package]] -name = "der" -version = "0.7.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" -dependencies = [ - "const-oid", - "zeroize", -] - -[[package]] -name = "digest" -version = "0.10.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" -dependencies = [ - "block-buffer", - "crypto-common", - "subtle", -] - -[[package]] -name = "displaydoc" -version = "0.2.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "ed25519" -version = "2.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" -dependencies = [ - "pkcs8", - "signature", -] - -[[package]] -name = "ed25519-dalek" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9" -dependencies = [ - "curve25519-dalek", - "ed25519", - "serde", - "sha2", - "subtle", - "zeroize", -] - -[[package]] -name = "fiat-crypto" -version = "0.2.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" - -[[package]] -name = "find-crate" -version = "0.6.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59a98bbaacea1c0eb6a0876280051b892eb73594fd90cf3b20e9c817029c57d2" -dependencies = [ - "toml", -] - -[[package]] -name = "fluent" -version = "0.16.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb74634707bebd0ce645a981148e8fb8c7bccd4c33c652aeffd28bf2f96d555a" -dependencies = [ - "fluent-bundle", - "unic-langid", -] - -[[package]] -name = "fluent-bundle" -version = "0.15.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fe0a21ee80050c678013f82edf4b705fe2f26f1f9877593d13198612503f493" -dependencies = [ - "fluent-langneg", - "fluent-syntax", - "intl-memoizer", - "intl_pluralrules", - "rustc-hash 1.1.0", - "self_cell 0.10.3", - "smallvec", - "unic-langid", -] - -[[package]] -name = "fluent-langneg" -version = "0.13.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c4ad0989667548f06ccd0e306ed56b61bd4d35458d54df5ec7587c0e8ed5e94" -dependencies = [ - "unic-langid", -] - -[[package]] -name = "fluent-syntax" -version = "0.11.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a530c4694a6a8d528794ee9bbd8ba0122e779629ac908d15ad5a7ae7763a33d" -dependencies = [ - "thiserror", -] - -[[package]] -name = "form_urlencoded" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" -dependencies = [ - "percent-encoding", -] - -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "futures" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" -dependencies = [ - "futures-channel", - "futures-core", - "futures-executor", - "futures-io", - "futures-sink", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-channel" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" -dependencies = [ - "futures-core", - "futures-sink", -] - -[[package]] -name = "futures-core" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" - -[[package]] -name = "futures-executor" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" -dependencies = [ - "futures-core", - "futures-task", - "futures-util", -] - -[[package]] -name = "futures-io" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" - -[[package]] -name = "futures-macro" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "futures-sink" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" - -[[package]] -name = "futures-task" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" - -[[package]] -name = "futures-util" -version = "0.3.31" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" -dependencies = [ - "futures-channel", - "futures-core", - "futures-io", - "futures-macro", - "futures-sink", - "futures-task", - "memchr", - "pin-project-lite", - "pin-utils", - "slab", -] - -[[package]] -name = "fxhash" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" -dependencies = [ - "byteorder", -] - -[[package]] -name = "generic-array" -version = "0.14.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" -dependencies = [ - "typenum", - "version_check", -] - -[[package]] -name = "getrandom" 
-version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" -dependencies = [ - "cfg-if", - "libc", - "wasi", -] - -[[package]] -name = "gimli" -version = "0.31.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" - -[[package]] -name = "hashbrown" -version = "0.14.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" - -[[package]] -name = "heck" -version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" - -[[package]] -name = "herodb" -version = "0.0.1" -dependencies = [ - "age", - "anyhow", - "base64 0.22.1", - "bincode", - "byteorder", - "bytes", - "chacha20poly1305", - "clap", - "ed25519-dalek", - "futures", - "rand", - "redb", - "redis", - "secrecy", - "serde", - "serde_json", - "sha2", - "sled", - "thiserror", - "tokio", -] - -[[package]] -name = "hkdf" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" -dependencies = [ - "hmac", -] - -[[package]] -name = "hmac" -version = "0.12.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" -dependencies = [ - "digest", -] - -[[package]] -name = "i18n-config" -version = "0.4.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3e06b90c8a0d252e203c94344b21e35a30f3a3a85dc7db5af8f8df9f3e0c63ef" -dependencies = [ - "basic-toml", - "log", - "serde", - "serde_derive", - "thiserror", - "unic-langid", -] - -[[package]] -name = "i18n-embed" -version = "0.14.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94205d95764f5bb9db9ea98fa77f89653365ca748e27161f5bbea2ffd50e459c" -dependencies = [ - "arc-swap", - "fluent", - "fluent-langneg", - "fluent-syntax", - "i18n-embed-impl", - "intl-memoizer", - "lazy_static", - "log", - "parking_lot 0.12.4", - "rust-embed", - "thiserror", - "unic-langid", - "walkdir", -] - -[[package]] -name = "i18n-embed-fl" -version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc1f8715195dffc4caddcf1cf3128da15fe5d8a137606ea8856c9300047d5a2" -dependencies = [ - "dashmap", - "find-crate", - "fluent", - "fluent-syntax", - "i18n-config", - "i18n-embed", - "lazy_static", - "proc-macro-error", - "proc-macro2", - "quote", - "strsim 0.10.0", - "syn 2.0.106", - "unic-langid", -] - -[[package]] -name = "i18n-embed-impl" -version = "0.8.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f2cc0e0523d1fe6fc2c6f66e5038624ea8091b3e7748b5e8e0c84b1698db6c2" -dependencies = [ - "find-crate", - "i18n-config", - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "icu_collections" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" -dependencies = [ - "displaydoc", - "potential_utf", - "yoke", - "zerofrom", - "zerovec", -] - -[[package]] -name = "icu_locale_core" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" -dependencies = [ - "displaydoc", - "litemap", - "tinystr", - "writeable", - "zerovec", -] - -[[package]] -name = "icu_normalizer" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_normalizer_data", - "icu_properties", - "icu_provider", - "smallvec", - "zerovec", -] - -[[package]] -name = "icu_normalizer_data" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" - -[[package]] -name = "icu_properties" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" -dependencies = [ - "displaydoc", - "icu_collections", - "icu_locale_core", - "icu_properties_data", - "icu_provider", - "potential_utf", - "zerotrie", - "zerovec", -] - -[[package]] -name = "icu_properties_data" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" - -[[package]] -name = "icu_provider" -version = "2.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" -dependencies = [ - "displaydoc", - "icu_locale_core", - "stable_deref_trait", - "tinystr", - "writeable", - "yoke", - "zerofrom", - "zerotrie", - "zerovec", -] - -[[package]] -name = "idna" -version = "1.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" -dependencies = [ - "idna_adapter", - "smallvec", - "utf8_iter", -] - -[[package]] -name = "idna_adapter" -version = "1.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" -dependencies = [ - "icu_normalizer", - "icu_properties", -] - -[[package]] -name = "inout" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" -dependencies = [ - "generic-array", -] - -[[package]] -name = "instant" -version = "0.1.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" -dependencies = [ - "cfg-if", -] - -[[package]] -name = "intl-memoizer" -version = "0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "310da2e345f5eb861e7a07ee182262e94975051db9e4223e909ba90f392f163f" -dependencies = [ - "type-map", - "unic-langid", -] - -[[package]] -name = "intl_pluralrules" -version = "7.0.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "078ea7b7c29a2b4df841a7f6ac8775ff6074020c6776d48491ce2268e068f972" -dependencies = [ - "unic-langid", -] - -[[package]] -name = "io-uring" -version = "0.7.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d93587f37623a1a17d94ef2bc9ada592f5465fe7732084ab7beefabe5c77c0c4" -dependencies = [ - "bitflags 2.9.2", - "cfg-if", - "libc", -] - -[[package]] -name = "io_tee" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4b3f7cef34251886990511df1c61443aa928499d598a9473929ab5a90a527304" - -[[package]] -name = "is_terminal_polyfill" -version = "1.70.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" - -[[package]] -name = "itoa" -version = "1.0.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" - -[[package]] -name = "lazy_static" -version = "1.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" - -[[package]] -name = "libc" -version = "0.2.175" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" - -[[package]] -name = "litemap" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" - -[[package]] -name = "lock_api" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" -dependencies = [ - "autocfg", - "scopeguard", -] - -[[package]] -name = "log" -version = "0.4.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" - -[[package]] -name = "memchr" -version = "2.7.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" - -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - -[[package]] -name = "miniz_oxide" -version = "0.8.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" -dependencies = [ - "adler2", -] - -[[package]] -name = "mio" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" -dependencies = [ - "libc", - "wasi", - "windows-sys 0.59.0", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - -[[package]] -name = "object" -version = "0.36.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" -dependencies = [ - "memchr", -] - -[[package]] -name = "once_cell" -version = "1.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" - -[[package]] -name = "once_cell_polyfill" -version = "1.70.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" - -[[package]] -name = "opaque-debug" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" - -[[package]] -name = "parking_lot" -version = "0.11.2" 
-source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" -dependencies = [ - "instant", - "lock_api", - "parking_lot_core 0.8.6", -] - -[[package]] -name = "parking_lot" -version = "0.12.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" -dependencies = [ - "lock_api", - "parking_lot_core 0.9.11", -] - -[[package]] -name = "parking_lot_core" -version = "0.8.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" -dependencies = [ - "cfg-if", - "instant", - "libc", - "redox_syscall 0.2.16", - "smallvec", - "winapi", -] - -[[package]] -name = "parking_lot_core" -version = "0.9.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.5.17", - "smallvec", - "windows-targets 0.52.6", -] - -[[package]] -name = "pbkdf2" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" -dependencies = [ - "digest", - "hmac", -] - -[[package]] -name = "percent-encoding" -version = "2.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" - -[[package]] -name = "pin-project" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "pin-project-lite" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" - -[[package]] -name = "pin-utils" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" - -[[package]] -name = "pkcs8" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" -dependencies = [ - "der", - "spki", -] - -[[package]] -name = "poly1305" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8159bd90725d2df49889a078b54f4f79e87f1f8a8444194cdca81d38f5393abf" -dependencies = [ - "cpufeatures", - "opaque-debug", - "universal-hash", -] - -[[package]] -name = "potential_utf" -version = "0.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" -dependencies = [ - "zerovec", -] - -[[package]] -name = "ppv-lite86" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" -dependencies = [ - "zerocopy", -] - -[[package]] -name = "proc-macro-error" -version = "1.0.4" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" -dependencies = [ - "proc-macro-error-attr", - "proc-macro2", - "quote", - "syn 1.0.109", - "version_check", -] - -[[package]] -name = "proc-macro-error-attr" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" -dependencies = [ - "proc-macro2", - "quote", - "version_check", -] - -[[package]] -name = "proc-macro2" -version = "1.0.97" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61789d7719defeb74ea5fe81f2fdfdbd28a803847077cecce2ff14e1472f6f1" -dependencies = [ - "unicode-ident", -] - -[[package]] -name = "quote" -version = "1.0.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" -dependencies = [ - "proc-macro2", -] - -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - -[[package]] -name = "redb" -version = "2.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59b38b05028f398f08bea4691640503ec25fcb60b82fb61ce1f8fd1f4fccd3f7" -dependencies = [ - "libc", -] - -[[package]] -name = "redis" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd" -dependencies = [ - "async-trait", - "bytes", - "combine", - "futures-util", - "itoa", - "percent-encoding", - "pin-project-lite", - "ryu", - "sha1_smol", - "socket2 0.4.10", - "tokio", - "tokio-util", - "url", -] - -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - -[[package]] -name = "redox_syscall" -version = "0.5.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" -dependencies = [ - "bitflags 2.9.2", -] - -[[package]] -name = "rust-embed" -version = "8.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025908b8682a26ba8d12f6f2d66b987584a4a87bc024abc5bbc12553a8cd178a" -dependencies = [ - "rust-embed-impl", - "rust-embed-utils", - "walkdir", -] - -[[package]] -name = "rust-embed-impl" -version = "8.7.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6065f1a4392b71819ec1ea1df1120673418bf386f50de1d6f54204d836d4349c" -dependencies = [ - "proc-macro2", - "quote", - "rust-embed-utils", - "syn 2.0.106", - "walkdir", -] - -[[package]] -name = "rust-embed-utils" -version = "8.7.2" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f6cc0c81648b20b70c491ff8cce00c1c3b223bb8ed2b5d41f0e54c6c4c0a3594" -dependencies = [ - "sha2", - "walkdir", -] - -[[package]] -name = "rustc-demangle" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" - -[[package]] -name = "rustc-hash" -version = "1.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" - -[[package]] -name = "rustc-hash" -version = "2.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" - -[[package]] -name = "rustc_version" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" -dependencies = [ - "semver", -] - -[[package]] -name = "ryu" -version = "1.0.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" - -[[package]] -name = "salsa20" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213" -dependencies = [ - "cipher", -] - -[[package]] -name = "same-file" -version = "1.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" -dependencies = [ - "winapi-util", -] - -[[package]] -name = "scopeguard" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" - -[[package]] -name = "scrypt" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f" -dependencies = [ - "pbkdf2", - "salsa20", - "sha2", -] - -[[package]] -name = "secrecy" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bd1c54ea06cfd2f6b63219704de0b9b4f72dcc2b8fdef820be6cd799780e91e" -dependencies = [ - "zeroize", -] - -[[package]] -name = "self_cell" -version = "0.10.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14e4d63b804dc0c7ec4a1e52bcb63f02c7ac94476755aa579edac21e01f915d" -dependencies = [ - "self_cell 1.2.0", -] - -[[package]] -name = "self_cell" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f7d95a54511e0c7be3f51e8867aa8cf35148d7b9445d44de2f943e2b206e749" - -[[package]] -name = "semver" -version = "1.0.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" - -[[package]] -name = "serde" -version = "1.0.219" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" -dependencies = [ - "serde_derive", -] - -[[package]] -name = "serde_derive" -version = "1.0.219" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "serde_json" -version 
= "1.0.142" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "030fedb782600dcbd6f02d479bf0d817ac3bb40d644745b769d6a96bc3afc5a7" -dependencies = [ - "itoa", - "memchr", - "ryu", - "serde", -] - -[[package]] -name = "sha1_smol" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" - -[[package]] -name = "sha2" -version = "0.10.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" -dependencies = [ - "cfg-if", - "cpufeatures", - "digest", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" -dependencies = [ - "libc", -] - -[[package]] -name = "signature" -version = "2.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" -dependencies = [ - "rand_core", -] - -[[package]] -name = "slab" -version = "0.4.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" - -[[package]] -name = "sled" -version = "0.34.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935" -dependencies = [ - "crc32fast", - "crossbeam-epoch", - "crossbeam-utils", - "fs2", - "fxhash", - "libc", - "log", - "parking_lot 0.11.2", -] - -[[package]] -name = "smallvec" -version = "1.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" - -[[package]] -name = "socket2" -version = "0.4.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9f7916fc008ca5542385b89a3d3ce689953c143e9304a9bf8beec1de48994c0d" -dependencies = [ - "libc", - "winapi", -] - -[[package]] -name = "socket2" -version = "0.6.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" -dependencies = [ - "libc", - "windows-sys 0.59.0", -] - -[[package]] -name = "spki" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" -dependencies = [ - "base64ct", - "der", -] - -[[package]] -name = "stable_deref_trait" -version = "1.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" - -[[package]] -name = "strsim" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" - -[[package]] -name = "strsim" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" - -[[package]] -name = "subtle" -version = "2.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" - -[[package]] -name = "syn" -version = "1.0.109" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" -dependencies = [ - "proc-macro2", - "unicode-ident", -] - -[[package]] -name = "syn" -version = "2.0.106" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" -dependencies = [ - "proc-macro2", - "quote", - "unicode-ident", -] - -[[package]] -name = "synstructure" -version = "0.13.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "thiserror" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" -dependencies = [ - "thiserror-impl", -] - -[[package]] -name = "thiserror-impl" -version = "1.0.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "tinystr" -version = "0.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" -dependencies = [ - "displaydoc", - "zerovec", -] - -[[package]] -name = "tokio" -version = "1.47.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" -dependencies = [ - "backtrace", - "bytes", - "io-uring", - "libc", - "mio", - "parking_lot 0.12.4", - "pin-project-lite", - "signal-hook-registry", - "slab", - "socket2 0.6.0", - "tokio-macros", - "windows-sys 0.59.0", -] - -[[package]] -name = "tokio-macros" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "tokio-util" -version = "0.7.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" -dependencies = [ - "bytes", - "futures-core", - "futures-sink", - "pin-project-lite", - "tokio", -] - -[[package]] -name = "toml" -version = "0.5.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" -dependencies = [ - "serde", -] - -[[package]] -name = "type-map" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cb30dbbd9036155e74adad6812e9898d03ec374946234fbcebd5dfc7b9187b90" -dependencies = [ - "rustc-hash 2.1.1", -] - -[[package]] -name = "typenum" -version = "1.18.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" - -[[package]] -name = "unic-langid" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a28ba52c9b05311f4f6e62d5d9d46f094bd6e84cb8df7b3ef952748d752a7d05" -dependencies = [ - "unic-langid-impl", -] - -[[package]] -name = "unic-langid-impl" -version = "0.9.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dce1bf08044d4b7a94028c93786f8566047edc11110595914de93362559bc658" -dependencies = [ - "serde", - 
"tinystr", -] - -[[package]] -name = "unicode-ident" -version = "1.0.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" - -[[package]] -name = "universal-hash" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" -dependencies = [ - "crypto-common", - "subtle", -] - -[[package]] -name = "url" -version = "2.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" -dependencies = [ - "form_urlencoded", - "idna", - "percent-encoding", -] - -[[package]] -name = "utf8_iter" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" - -[[package]] -name = "utf8parse" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" - -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - -[[package]] -name = "walkdir" -version = "2.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" -dependencies = [ - "same-file", - "winapi-util", -] - -[[package]] -name = "wasi" -version = "0.11.1+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-util" -version = "0.1.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" -dependencies = [ - "windows-sys 0.59.0", -] - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" - -[[package]] -name = "windows-link" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" - -[[package]] -name = "windows-sys" -version = "0.59.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" -dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-sys" -version = "0.60.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" -dependencies = [ - "windows-targets 0.53.3", -] - -[[package]] -name = "windows-targets" -version = "0.52.6" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" -dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", - "windows_i686_gnullvm 0.52.6", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", -] - -[[package]] -name = "windows-targets" -version = "0.53.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" -dependencies = [ - "windows-link", - "windows_aarch64_gnullvm 0.53.0", - "windows_aarch64_msvc 0.53.0", - "windows_i686_gnu 0.53.0", - "windows_i686_gnullvm 0.53.0", - "windows_i686_msvc 0.53.0", - "windows_x86_64_gnu 0.53.0", - "windows_x86_64_gnullvm 0.53.0", - "windows_x86_64_msvc 0.53.0", -] - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" - -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" - -[[package]] -name = "windows_aarch64_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" - -[[package]] -name = "windows_i686_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" - -[[package]] -name = "windows_i686_gnu" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" - -[[package]] -name = "windows_i686_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" - -[[package]] -name = "windows_i686_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" - -[[package]] -name = "windows_i686_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" - -[[package]] -name = "windows_x86_64_gnu" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.52.6" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" - -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.52.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" - -[[package]] -name = "windows_x86_64_msvc" -version = "0.53.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" - -[[package]] -name = "writeable" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" - -[[package]] -name = "x25519-dalek" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7e468321c81fb07fa7f4c636c3972b9100f0346e5b6a9f2bd0603a52f7ed277" -dependencies = [ - "curve25519-dalek", - "rand_core", - "serde", - "zeroize", -] - -[[package]] -name = "yoke" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" -dependencies = [ - "serde", - "stable_deref_trait", - "yoke-derive", - "zerofrom", -] - -[[package]] -name = "yoke-derive" -version = "0.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", - "synstructure", -] - -[[package]] -name = "zerocopy" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" -dependencies = [ - "zerocopy-derive", -] - -[[package]] -name = "zerocopy-derive" -version = "0.8.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "zerofrom" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" -dependencies = [ - "zerofrom-derive", -] - -[[package]] -name = "zerofrom-derive" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", - "synstructure", -] - -[[package]] -name = "zeroize" -version = "1.8.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" -dependencies = [ - "zeroize_derive", -] - -[[package]] -name = "zeroize_derive" -version = "1.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - -[[package]] -name = "zerotrie" -version = "0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" -dependencies = [ - "displaydoc", - "yoke", - "zerofrom", -] - -[[package]] -name = "zerovec" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" -dependencies = [ - "yoke", - "zerofrom", - "zerovec-derive", -] - -[[package]] -name = "zerovec-derive" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] From 45195d403eb2f42a4ed03d1a6d539c3e44d77aab Mon Sep 17 00:00:00 2001 From: despiegk Date: Sat, 23 Aug 2025 05:12:17 +0200 Subject: [PATCH 4/4] ... --- Cargo.lock | 2166 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2166 insertions(+) create mode 100644 Cargo.lock diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..e0ec8c8 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,2166 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "addr2line" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfbe277e56a376000877090da837660b4427aad530e3028d44e0bffe4f89a1c1" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "aead" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d122413f284cf2d62fb1b7db97e02edb8cda96d769b16e443a4f6195e35662b0" +dependencies = [ + "crypto-common", + "generic-array", +] + +[[package]] +name = "age" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77de71da1ca673855aacea507a7aed363beb8934cf61b62364fc4b479d2e8cda" +dependencies = [ + "age-core", + "base64 0.21.7", + "bech32", + "chacha20poly1305", + "cookie-factory", + "hmac", + "i18n-embed", + "i18n-embed-fl", + "lazy_static", + "nom", + "pin-project", + "rand", + "rust-embed", + "scrypt", + "sha2", + "subtle", + "x25519-dalek", + "zeroize", +] + +[[package]] +name = "age-core" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5f11899bc2bbddd135edbc30c36b1924fa59d0746bb45beb5933fafe3fe509b" +dependencies = [ + "base64 0.21.7", + "chacha20poly1305", + "cookie-factory", + "hkdf", + "io_tee", + "nom", + "rand", + "secrecy", + "sha2", +] + +[[package]] +name = "anstream" +version = "0.6.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3ae563653d1938f79b1ab1b5e668c87c76a9930414574a6583a7b7e11a8e6192" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "862ed96ca487e809f1c8e5a8447f6ee2cf102f846893800b20cebdf541fc6bbd" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e231f6134f61b71076a3eab506c379d4f36122f2af15a9ff04415ea4c3339e2" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e0633414522a32ffaac8ac6cc8f748e090c5717661fddeea04219e2344f5f2a" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.60.2", +] + +[[package]] +name = "anyhow" +version = "1.0.99" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0674a1ddeecb70197781e945de4b3b8ffb61fa939a5597bcf48503737663100" + +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "backtrace" +version = "0.3.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6806a6321ec58106fea15becdad98371e28d92ccbc7c8f1b3b6dd724fe8f1002" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-targets 0.52.6", +] + +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "base64ct" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55248b47b0caf0546f7988906588779981c43bb1bc9d0c44087278f80cdb44ba" + +[[package]] +name = "basic-toml" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba62675e8242a4c4e806d12f11d136e626e6c8361d6b829310732241652a178a" +dependencies = [ + "serde", +] + +[[package]] +name = "bech32" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d86b93f97252c47b41663388e6d155714a9d0c398b99f1005cbc5f978b29f445" + +[[package]] +name = "bincode" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1f45e9417d87227c7a56d22e471c6206462cba514c7590c09aff4cf6d1ddcad" +dependencies = [ + "serde", +] + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34efbcccd345379ca2868b2b2c9d3782e9cc58ba87bc7d79d5b53d9c9ae6f25d" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + 
+[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71b6127be86fdcfddb610f7182ac57211d4b18a3e9c82eb2d17662f2227ad6a" + +[[package]] +name = "cfg-if" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fd1289c04a9ea8cb22300a459a72a385d7c73d3259e2ed7dcb2af674838cfa9" + +[[package]] +name = "chacha20" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3613f74bd2eac03dad61bd53dbe620703d4371614fe0bc3b9f04dd36fe4e818" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "chacha20poly1305" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10cd79432192d1c0f4e1a0fef9527696cc039165d729fb41b3f4f4f354c2dc35" +dependencies = [ + "aead", + "chacha20", + "cipher", + "poly1305", + "zeroize", +] + +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", + "zeroize", +] + +[[package]] +name = "clap" +version = "4.5.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc0e74a703892159f5ae7d3aac52c8e6c392f5ae5f359c70b5881d60aaac318" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3e7f4214277f3c7aa526a59dd3fbe306a370daee1f8b7b8c987069cd8e888a8" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim 0.11.1", +] + +[[package]] +name = "clap_derive" +version = "4.5.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14cb31bb0a7d536caef2639baa7fad459e15c3144efefa6dbd1c84562c4739f6" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "clap_lex" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b94f61472cee1439c0b966b47e3aca9ae07e45d070759512cd390ea2bebc6675" + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "combine" +version = "4.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba5a308b75df32fe02788e748662718f03fde005016435c444eea572398219fd" +dependencies = [ + "bytes", + "futures-core", + "memchr", + "pin-project-lite", + "tokio", + "tokio-util", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "cookie-factory" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9885fa71e26b8ab7855e2ec7cae6e9b380edff76cd052e07c683a0319d51b3a2" +dependencies = [ + "futures", +] + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "rand_core", + "typenum", +] + +[[package]] +name = "curve25519-dalek" +version = "4.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fb8b7c4503de7d6ae7b42ab72a5a59857b4c937ec27a3d4539dba95b5ab2be" +dependencies = [ + "cfg-if", + "cpufeatures", + "curve25519-dalek-derive", + "digest", + "fiat-crypto", + "rustc_version", + "subtle", + "zeroize", +] + +[[package]] +name = "curve25519-dalek-derive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f46882e17999c6cc590af592290432be3bce0428cb0d5f8b6715e4dc7b383eb3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "dashmap" +version = "5.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "978747c1d849a7d2ee5e8adc0159961c48fb7e5db2f06af6723b80123bb53856" +dependencies = [ + "cfg-if", + "hashbrown", + "lock_api", + "once_cell", + "parking_lot_core 0.9.11", +] + +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "zeroize", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", + "subtle", +] + +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "ed25519" +version = "2.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "115531babc129696a58c64a4fef0a8bf9e9698629fb97e9e40767d235cfbcd53" +dependencies = [ + "pkcs8", + "signature", +] + +[[package]] +name = "ed25519-dalek" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70e796c081cee67dc755e1a36a0a172b897fab85fc3f6bc48307991f64e4eca9" +dependencies = [ + "curve25519-dalek", + "ed25519", + "serde", + "sha2", + "subtle", + "zeroize", +] + +[[package]] +name = "fiat-crypto" +version = "0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28dea519a9695b9977216879a3ebfddf92f1c08c05d984f8996aecd6ecdc811d" + +[[package]] +name = 
"find-crate" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59a98bbaacea1c0eb6a0876280051b892eb73594fd90cf3b20e9c817029c57d2" +dependencies = [ + "toml", +] + +[[package]] +name = "fluent" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb74634707bebd0ce645a981148e8fb8c7bccd4c33c652aeffd28bf2f96d555a" +dependencies = [ + "fluent-bundle", + "unic-langid", +] + +[[package]] +name = "fluent-bundle" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe0a21ee80050c678013f82edf4b705fe2f26f1f9877593d13198612503f493" +dependencies = [ + "fluent-langneg", + "fluent-syntax", + "intl-memoizer", + "intl_pluralrules", + "rustc-hash 1.1.0", + "self_cell 0.10.3", + "smallvec", + "unic-langid", +] + +[[package]] +name = "fluent-langneg" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c4ad0989667548f06ccd0e306ed56b61bd4d35458d54df5ec7587c0e8ed5e94" +dependencies = [ + "unic-langid", +] + +[[package]] +name = "fluent-syntax" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a530c4694a6a8d528794ee9bbd8ba0122e779629ac908d15ad5a7ae7763a33d" +dependencies = [ + "thiserror", +] + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs2" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "futures" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65bc07b1a8bc7c85c5f2e110c476c7389b4554ba72af57d8445ea63a576b0876" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" + +[[package]] +name = "futures-executor" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e28d1d997f585e54aebc3f97d39e72338912123a67330d723fdbb564d646c9f" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-macro" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "futures-sink" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" + +[[package]] +name = "futures-task" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" + +[[package]] +name = "futures-util" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "pin-utils", + "slab", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "335ff9f135e4384c8150d6f27c6daed433577f86b4750418338c01a1a2528592" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + +[[package]] +name = "gimli" +version = "0.31.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "herodb" +version = "0.0.1" +dependencies = [ + "age", + "anyhow", + "base64 0.22.1", + "bincode", + "byteorder", + "bytes", + "chacha20poly1305", + "clap", + "ed25519-dalek", + "futures", + "rand", + "redb", + "redis", + "secrecy", + "serde", + "serde_json", + "sha2", + "sled", + "thiserror", + "tokio", +] + +[[package]] +name = "hkdf" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b5f8eb2ad728638ea2c7d47a21db23b7b58a72ed6a38256b8a1849f15fbbdf7" +dependencies = [ + "hmac", +] + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "i18n-config" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e06b90c8a0d252e203c94344b21e35a30f3a3a85dc7db5af8f8df9f3e0c63ef" +dependencies = [ + "basic-toml", + "log", + "serde", + "serde_derive", + "thiserror", + "unic-langid", +] + +[[package]] +name = "i18n-embed" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94205d95764f5bb9db9ea98fa77f89653365ca748e27161f5bbea2ffd50e459c" +dependencies = [ + "arc-swap", + "fluent", + "fluent-langneg", + "fluent-syntax", + "i18n-embed-impl", + "intl-memoizer", + "lazy_static", + "log", + "parking_lot 0.12.4", + "rust-embed", + "thiserror", + "unic-langid", + "walkdir", +] + +[[package]] +name = 
"i18n-embed-fl" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fc1f8715195dffc4caddcf1cf3128da15fe5d8a137606ea8856c9300047d5a2" +dependencies = [ + "dashmap", + "find-crate", + "fluent", + "fluent-syntax", + "i18n-config", + "i18n-embed", + "lazy_static", + "proc-macro-error", + "proc-macro2", + "quote", + "strsim 0.10.0", + "syn 2.0.106", + "unic-langid", +] + +[[package]] +name = "i18n-embed-impl" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f2cc0e0523d1fe6fc2c6f66e5038624ea8091b3e7748b5e8e0c84b1698db6c2" +dependencies = [ + "find-crate", + "i18n-config", + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "icu_collections" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "200072f5d0e3614556f94a9930d5dc3e0662a652823904c3a75dc3b0af7fee47" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde2700ccaed3872079a65fb1a78f6c0a36c91570f28755dda67bc8f7d9f00a" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "436880e8e18df4d7bbc06d58432329d6458cc84531f7ac5f024e93deadb37979" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00210d6893afc98edb752b664b8890f0ef174c8adbb8d0be9710fa66fbbf72d3" + +[[package]] +name = "icu_properties" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "016c619c1eeb94efb86809b015c58f479963de65bdb6253345c1a1276f22e32b" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "potential_utf", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "298459143998310acd25ffe6810ed544932242d3f07083eee1084d83a71bd632" + +[[package]] +name = "icu_provider" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03c80da27b5f4187909049ee2d72f276f0d9f99a42c306bd0131ecfe04d8e5af" +dependencies = [ + "displaydoc", + "icu_locale_core", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ 
+ "generic-array", +] + +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "intl-memoizer" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "310da2e345f5eb861e7a07ee182262e94975051db9e4223e909ba90f392f163f" +dependencies = [ + "type-map", + "unic-langid", +] + +[[package]] +name = "intl_pluralrules" +version = "7.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "078ea7b7c29a2b4df841a7f6ac8775ff6074020c6776d48491ce2268e068f972" +dependencies = [ + "unic-langid", +] + +[[package]] +name = "io-uring" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "046fa2d4d00aea763528b4950358d0ead425372445dc8ff86312b3c69ff7727b" +dependencies = [ + "bitflags 2.9.3", + "cfg-if", + "libc", +] + +[[package]] +name = "io_tee" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b3f7cef34251886990511df1c61443aa928499d598a9473929ab5a90a527304" + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libc" +version = "0.2.175" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a82ae493e598baaea5209805c49bbf2ea7de956d50d7da0da1164f9c6d28543" + +[[package]] +name = "litemap" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "241eaef5fd12c88705a01fc1066c48c4b36e0dd4377dcdc7ec3942cea7a69956" + +[[package]] +name = "lock_api" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "memchr" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a282da65faaf38286cf3be983213fcf1d2e2a58700e808f83f4ea9a4804bc0" + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", +] + +[[package]] +name = "mio" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78bed444cc8a2160f01cbcf811ef18cac863ad68ae8ca62092e8db51d51c761c" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.59.0", +] + 
+[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "object" +version = "0.36.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62948e14d923ea95ea2c7c86c71013138b66525b86bdc08d2dcc262bdb497b87" +dependencies = [ + "memchr", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4895175b425cb1f87721b59f0f286c2092bd4af812243672510e1ac53e2e0ad" + +[[package]] +name = "opaque-debug" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" + +[[package]] +name = "parking_lot" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d17b78036a60663b797adeaee46f5c9dfebb86948d1255007a1d6be0271ff99" +dependencies = [ + "instant", + "lock_api", + "parking_lot_core 0.8.6", +] + +[[package]] +name = "parking_lot" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70d58bf43669b5795d1576d0641cfb6fbb2057bf629506267a92807158584a13" +dependencies = [ + "lock_api", + "parking_lot_core 0.9.11", +] + +[[package]] +name = "parking_lot_core" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60a2cfe6f0ad2bfc16aefa463b497d5c7a5ecd44a23efa72aa342d90177356dc" +dependencies = [ + "cfg-if", + "instant", + "libc", + "redox_syscall 0.2.16", + "smallvec", + "winapi", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc838d2a56b5b1a6c25f55575dfc605fabb63bb2365f6c2353ef9159aa69e4a5" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall 0.5.17", + "smallvec", + "windows-targets 0.52.6", +] + +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "pin-project" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + "spki", +] + +[[package]] +name = "poly1305" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8159bd90725d2df49889a078b54f4f79e87f1f8a8444194cdca81d38f5393abf" +dependencies = [ + "cpufeatures", + "opaque-debug", + "universal-hash", +] + +[[package]] +name = "potential_utf" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5a7c30837279ca13e7c867e9e40053bc68740f988cb07f7ca6df43cc734b585" +dependencies = [ + "zerovec", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro-error" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" +dependencies = [ + "proc-macro-error-attr", + "proc-macro2", + "quote", + "syn 1.0.109", + "version_check", +] + +[[package]] +name = "proc-macro-error-attr" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" +dependencies = [ + "proc-macro2", + "quote", + "version_check", +] + +[[package]] +name = "proc-macro2" +version = "1.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89ae43fd86e4158d6db51ad8e2b80f313af9cc74f5c0e03ccb87de09998732de" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom", +] + +[[package]] +name = "redb" +version = "2.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59b38b05028f398f08bea4691640503ec25fcb60b82fb61ce1f8fd1f4fccd3f7" +dependencies = [ + "libc", +] + +[[package]] +name = "redis" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd" +dependencies = [ + "async-trait", + "bytes", + "combine", + "futures-util", + "itoa", + "percent-encoding", + "pin-project-lite", + "ryu", + "sha1_smol", + "socket2 0.4.10", + "tokio", + 
"tokio-util", + "url", +] + +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags 1.3.2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5407465600fb0548f1442edf71dd20683c6ed326200ace4b1ef0763521bb3b77" +dependencies = [ + "bitflags 2.9.3", +] + +[[package]] +name = "rust-embed" +version = "8.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "025908b8682a26ba8d12f6f2d66b987584a4a87bc024abc5bbc12553a8cd178a" +dependencies = [ + "rust-embed-impl", + "rust-embed-utils", + "walkdir", +] + +[[package]] +name = "rust-embed-impl" +version = "8.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6065f1a4392b71819ec1ea1df1120673418bf386f50de1d6f54204d836d4349c" +dependencies = [ + "proc-macro2", + "quote", + "rust-embed-utils", + "syn 2.0.106", + "walkdir", +] + +[[package]] +name = "rust-embed-utils" +version = "8.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6cc0c81648b20b70c491ff8cce00c1c3b223bb8ed2b5d41f0e54c6c4c0a3594" +dependencies = [ + "sha2", + "walkdir", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" + +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "ryu" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" + +[[package]] +name = "salsa20" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213" +dependencies = [ + "cipher", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "scrypt" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f" +dependencies = [ + "pbkdf2", + "salsa20", + "sha2", +] + +[[package]] +name = "secrecy" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bd1c54ea06cfd2f6b63219704de0b9b4f72dcc2b8fdef820be6cd799780e91e" +dependencies = [ + "zeroize", +] + +[[package]] 
+name = "self_cell" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e14e4d63b804dc0c7ec4a1e52bcb63f02c7ac94476755aa579edac21e01f915d" +dependencies = [ + "self_cell 1.2.0", +] + +[[package]] +name = "self_cell" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f7d95a54511e0c7be3f51e8867aa8cf35148d7b9445d44de2f943e2b206e749" + +[[package]] +name = "semver" +version = "1.0.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6fa9c48d24d85fb3de5ad847117517440f6beceb7798af16b4a87d616b8d0" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "serde_json" +version = "1.0.143" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d401abef1d108fbd9cbaebc3e46611f4b1021f714a0597a71f41ee463f5f4a5a" +dependencies = [ + "itoa", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "sha1_smol" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbfa15b3dddfee50a0fff136974b3e1bde555604ba463834a7eb7deb6417705d" + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a4719bff48cee6b39d12c020eeb490953ad2443b7055bd0b21fca26bd8c28b" +dependencies = [ + "libc", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "rand_core", +] + +[[package]] +name = "slab" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" + +[[package]] +name = "sled" +version = "0.34.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f96b4737c2ce5987354855aed3797279def4ebf734436c6aa4552cf8e169935" +dependencies = [ + "crc32fast", + "crossbeam-epoch", + "crossbeam-utils", + "fs2", + "fxhash", + "libc", + "log", + "parking_lot 0.11.2", +] + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "socket2" +version = "0.4.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9f7916fc008ca5542385b89a3d3ce689953c143e9304a9bf8beec1de48994c0d" +dependencies = [ + "libc", + "winapi", +] + +[[package]] +name = "socket2" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "233504af464074f9d066d7b5416c5f9b894a5862a6506e306f7b816cdd6f1807" +dependencies = [ + "libc", + "windows-sys 0.59.0", +] + 
+[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "strsim" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ede7c438028d4436d71104916910f5bb611972c5cfd7f89b8300a8186e6fada6" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "thiserror" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.69" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "tinystr" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d4f6d1145dcb577acf783d4e601bc1d76a13337bb54e6233add580b07344c8b" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tokio" +version = "1.47.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89e49afdadebb872d3145a5638b59eb0691ea23e46ca484037cfab3b76b95038" +dependencies = [ + "backtrace", + "bytes", + "io-uring", + "libc", + "mio", + "parking_lot 0.12.4", + "pin-project-lite", + "signal-hook-registry", + "slab", + "socket2 0.6.0", + "tokio-macros", + "windows-sys 0.59.0", +] + +[[package]] +name = "tokio-macros" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e06d43f1345a3bcd39f6a56dbb7dcab2ba47e68e8ac134855e7e2bdbaf8cab8" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "tokio-util" +version = "0.7.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14307c986784f72ef81c89db7d9e28d6ac26d16213b109ea501696195e6e3ce5" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + 
"tokio", +] + +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + +[[package]] +name = "type-map" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb30dbbd9036155e74adad6812e9898d03ec374946234fbcebd5dfc7b9187b90" +dependencies = [ + "rustc-hash 2.1.1", +] + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + +[[package]] +name = "unic-langid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ba52c9b05311f4f6e62d5d9d46f094bd6e84cb8df7b3ef952748d752a7d05" +dependencies = [ + "unic-langid-impl", +] + +[[package]] +name = "unic-langid-impl" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce1bf08044d4b7a94028c93786f8566047edc11110595914de93362559bc658" +dependencies = [ + "serde", + "tinystr", +] + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "universal-hash" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc1de2c688dc15305988b563c3854064043356019f97a4b46276fe734c4f07ea" +dependencies = [ + "crypto-common", + "subtle", +] + +[[package]] +name = "url" +version = "2.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "137a3c834eaf7139b73688502f3f1141a0337c5d8e4d9b536f9b8c796e26a7c4" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "0978bf7171b3d90bac376700cb56d606feb40f251a475a5d6634613564460b22" +dependencies = [ + "windows-sys 0.60.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-link" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e6ad25900d524eaabdbbb96d20b4311e1e7ae1699af4fb28c17ae66c80d798a" + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.3", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.0", + "windows_aarch64_msvc 0.53.0", + "windows_i686_gnu 0.53.0", + "windows_i686_gnullvm 0.53.0", + "windows_i686_msvc 0.53.0", + "windows_x86_64_gnu 0.53.0", + "windows_x86_64_gnullvm 0.53.0", + "windows_x86_64_msvc 0.53.0", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86b8d5f90ddd19cb4a147a5fa63ca848db3df085e25fee3cc10b39b6eebae764" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7651a1f62a11b8cbd5e0d42526e55f2c99886c77e007179efff86c2b137e66c" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1dc67659d35f387f5f6c479dc4e28f1d4bb90ddd1a5d3da2e5d97b42d6272c3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ce6ccbdedbf6d6354471319e781c0dfef054c81fbc7cf83f338a4296c0cae11" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "581fee95406bb13382d2f65cd4a908ca7b1e4c2f1917f143ba16efe98a589b5d" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e55b5ac9ea33f2fc1716d1742db15574fd6fc8dadc51caab1c16a3d3b4190ba" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a6e035dd0599267ce1ee132e51c27dd29437f63325753051e71dd9e42406c57" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271414315aff87387382ec3d271b52d7ae78726f5d44ac98b4f4030c91880486" + +[[package]] +name = "writeable" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea2f10b9bb0928dfb1b42b65e1f9e36f7f54dbdf08457afefb38afcdec4fa2bb" + +[[package]] +name = "x25519-dalek" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7e468321c81fb07fa7f4c636c3972b9100f0346e5b6a9f2bd0603a52f7ed277" +dependencies = [ + "curve25519-dalek", + "rand_core", + "serde", + "zeroize", +] + +[[package]] +name = "yoke" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f41bb01b8226ef4bfd589436a297c53d118f65921786300e427be8d487695cc" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38da3c9736e16c5d3c8c597a9aaa5d1fa565d0532ae05e27c24aa62fb32c0ab6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1039dd0d3c310cf05de012d8a39ff557cb0d23087fd44cad61df08fc31907a2f" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ecf5b4cc5364572d7f4c329661bcc82724222973f2cab6f050a4e5c22f75181" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "zerofrom" 
+version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ced3678a2879b30306d323f4542626697a464a97c0a07c9aebf7ebca65cd4dde" +dependencies = [ + "zeroize_derive", +] + +[[package]] +name = "zeroize_derive" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +] + +[[package]] +name = "zerotrie" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "36f0bbd478583f79edad978b407914f61b2972f5af6fa089686016be8f9af595" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7aa2bd55086f1ab526693ecbe444205da57e25f4489879da80635a46d90e73b" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b96237efa0c878c64bd89c436f661be4e46b2f3eff1ebb976f7ef2321d2f58f" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.106", +]