33 Commits

Author SHA1 Message Date
Maxime Van Hees
a8720c06db prevent unauthorized access to administrative db 0 when connecting via redis-cli 2025-10-07 10:52:30 +02:00
Maxime Van Hees
2139deb85d WIP6: implementing image embedding as first step towards multi-modal support 2025-09-30 14:53:01 +02:00
Maxime Van Hees
7d07b57d32 WIP 5 add image embedding provider (local only for now) 2025-09-29 16:14:34 +02:00
Maxime Van Hees
4aa49e0d5c WIP4 implementation lanceDB: removed blocking Tokio runtime usage during embeddings and isolated all embedding work off the async runtime 2025-09-29 15:54:12 +02:00
Maxime Van Hees
644946f1ca WIP3 implementing lancedb 2025-09-29 14:55:41 +02:00
Maxime Van Hees
cf66f4c304 WIP2: implementing lancedb: created embedding abstraction, server-side per-dataset embedding config + updates RPC endpoints 2025-09-29 13:17:34 +02:00
Maxime Van Hees
6a4e2819bf WIP 1: implement lancedb vector 2025-09-29 11:24:31 +02:00
Maxime Van Hees
77a53bae86 don't use strings for paths 2025-09-25 16:25:08 +02:00
7f689ae29b Merge pull request 'tantivy_impl' (#14) from tantivy_impl into main
Reviewed-on: #14
2025-09-25 14:08:50 +00:00
Maxime Van Hees
7f92001b89 fixed key-based access control for Tantivy backends 2025-09-25 16:06:08 +02:00
Maxime Van Hees
e7248b84e8 key-based access control for tantivy backend 2025-09-25 13:36:23 +02:00
Maxime Van Hees
22ac4c9ed6 implementation of tantivy datastore + updated RPC calls to deal with tantivy + docs 2025-09-23 17:15:40 +02:00
Maxime Van Hees
c470772a13 Merge branch 'management_rpc_server' 2025-09-22 16:26:53 +02:00
Maxime Van Hees
bd34fd092a Persist backend per database id in admin metadata so restarts and lazy opens always use the correct engine (Sled/Redb) 2025-09-22 15:29:58 +02:00
Maxime Van Hees
8e044a64b7 fix incorrect keycount displayed in database info over RPC calls 2025-09-19 14:04:03 +02:00
Maxime Van Hees
87177f4a07 update documentation about 0.db admin db + symmetric encryption + include RPC examples + asymmetric transparent named key instances for encryption and signatures 2025-09-19 11:55:28 +02:00
Maxime Van Hees
151a6ffbfa fixed test 2025-09-19 10:35:08 +02:00
Maxime Van Hees
8ab841f68c Key generation now automatically derives X25519 keys from Ed25519 keys, which allows users to transparently use their key name for encrypting/decrypting and signing/verifying 2025-09-18 22:37:19 +02:00
Maxime Van Hees
8808c0e9d9 Implemented symmetric encryption; new commands are SYM KEYGEN; SYM ENCRYPT; SYM DECRYPT 2025-09-18 11:59:44 +02:00
Maxime Van Hees
c6b277cc9c fixed DEL showing wrong deletion amount + AGE LIST now returns a list of managed key names without nested arrays or labels 2025-09-18 00:19:40 +02:00
8331ed032b ... 2025-09-17 07:02:44 +02:00
Maxime Van Hees
b8ca73397d implemented 0.db as admin database architecture + updated test file 2025-09-16 16:06:47 +02:00
Maxime Van Hees
1b15806a85 fix invalid values in RPC response about database instance details 2025-09-15 13:45:37 +02:00
Maxime Van Hees
da325a9659 fix bug where meta files were not auto-created upon starting + fix bug where meta json files were actually binary + improved access control to database instances 2025-09-15 10:34:03 +02:00
Maxime Van Hees
bdf363016a WIP: adding access management control to db instances 2025-09-12 17:11:50 +02:00
Maxime Van Hees
8798bc202e Restore working code 2025-09-11 18:33:09 +02:00
Maxime Van Hees
9fa9832605 combined current main (with sled) and RPC server 2025-09-11 17:23:46 +02:00
Maxime Van Hees
4bb24b38dd fix typo in README 2025-09-11 15:34:03 +02:00
Maxime Van Hees
f3da14b957 Merge branch 'append' 2025-09-11 15:31:47 +02:00
Maxime Van Hees
5ea34b4445 update variable name as 'gen' is a reserved keyword since Rust 2024 edition 2025-09-11 15:25:26 +02:00
Maxime Van Hees
d9a3b711d1 Update to Rust 2024 edition + update Cargo.toml file 2025-09-11 15:24:28 +02:00
Maxime Van Hees
d931770e90 Fix test suite + update Cargo.toml file 2025-09-09 16:04:31 +02:00
Timur Gordon
a87ec4dbb5 add readme 2025-08-27 15:39:59 +02:00
39 changed files with 14481 additions and 354 deletions

Cargo.lock (generated, 5982 lines)

File diff suppressed because it is too large

Cargo.toml

@@ -1,8 +1,8 @@
[package]
name = "herodb"
version = "0.0.1"
authors = ["Pin Fang <fpfangpin@hotmail.com>"]
edition = "2021"
authors = ["ThreeFold Tech NV"]
edition = "2024"
[dependencies]
anyhow = "1.0.59"
@@ -23,7 +23,18 @@ sha2 = "0.10"
age = "0.10"
secrecy = "0.8"
ed25519-dalek = "2"
x25519-dalek = "2"
base64 = "0.22"
jsonrpsee = { version = "0.26.0", features = ["http-client", "ws-client", "server", "macros"] }
tantivy = "0.25.0"
arrow-schema = "55.2.0"
arrow-array = "55.2.0"
lance = "0.37.0"
lance-index = "0.37.0"
arrow = "55.2.0"
lancedb = "0.22.1"
uuid = "1.18.1"
ureq = { version = "2.10.0", features = ["json", "tls"] }
[dev-dependencies]
redis = { version = "0.24", features = ["aio", "tokio-comp"] }

README.md

@@ -17,6 +17,8 @@ The main purpose of HeroDB is to offer a lightweight, embeddable, and Redis-comp
- **Expiration**: Time-to-live (TTL) functionality for keys.
- **Scanning**: Cursor-based iteration for keys and hash fields (`SCAN`, `HSCAN`).
- **AGE Cryptography Commands**: HeroDB-specific extensions for cryptographic operations.
- **Symmetric Encryption**: Stateless symmetric encryption using XChaCha20-Poly1305.
- **Admin Database 0**: Centralized control for database management, access control, and per-database encryption.
## Quick Start
@@ -30,31 +32,14 @@ cargo build --release
### Running HeroDB
You can start HeroDB with different backends and encryption options:
#### Default `redb` Backend
Launch HeroDB with the required `--admin-secret` flag, which encrypts the admin database (DB 0) and authorizes admin access. Optional flags include `--dir` for the database directory, `--port` for the TCP port (default 6379), `--sled` for the sled backend, and `--enable-rpc` to start the JSON-RPC management server on port 8080.
Example:
```bash
./target/release/herodb --dir /tmp/herodb_redb --port 6379
./target/release/herodb --dir /tmp/herodb --admin-secret myadminsecret --port 6379 --enable-rpc
```
#### `sled` Backend
```bash
./target/release/herodb --dir /tmp/herodb_sled --port 6379 --sled
```
#### `redb` with Encryption
```bash
./target/release/herodb --dir /tmp/herodb_encrypted --port 6379 --encrypt --key mysecretkey
```
#### `sled` with Encryption
```bash
./target/release/herodb --dir /tmp/herodb_sled_encrypted --port 6379 --sled --encrypt --key mysecretkey
```
For detailed launch options, see [Basics](docs/basics.md).
## Usage with Redis Clients
@@ -62,20 +47,38 @@ HeroDB can be interacted with using any standard Redis client, such as `redis-cl
### Example with `redis-cli`
Connections start with no database selected. You must SELECT a database first.
- To work in the admin database (DB 0), authenticate with the admin secret:
```bash
redis-cli -p 6379
SELECT 0 KEY myadminsecret
# → OK (SELECT is per-connection, so run the rest in the same session)
SET mykey "Hello from HeroDB!"
GET mykey
# → "Hello from HeroDB!"
```
- To use a user database, first create one via the JSON-RPC API (see docs/rpc_examples.md), then select it:
```bash
# Suppose RPC created database id 1
redis-cli -p 6379
SELECT 1
HSET user:1 name "Alice" age "30"
HGET user:1 name
# → "Alice"
SCAN 0 MATCH user:* COUNT 10
# → 1) "0"
# 2) 1) "user:1"
```
## Cryptography
HeroDB supports asymmetric encryption/signatures via AGE commands (X25519 for encryption, Ed25519 for signatures) in stateless or key-managed modes, and symmetric encryption via SYM commands. Keys are persisted in the admin database (DB 0) for managed modes.
For details, see [AGE Cryptography](docs/age.md) and [Basics](docs/basics.md).
## Database Management
Databases are managed via JSON-RPC API, with metadata stored in the encrypted admin database (DB 0). Databases are public by default upon creation; use RPC to set them private, requiring access keys for SELECT operations (read or readwrite based on permissions). This includes per-database encryption keys, access control, and lifecycle management.
For examples, see [JSON-RPC Examples](docs/rpc_examples.md) and [Admin DB 0 Model](docs/admin.md).
## Documentation
For more detailed information on commands, features, and advanced usage, please refer to the documentation:
@@ -83,3 +86,5 @@ For more detailed information on commands, features, and advanced usage, please
- [Basics](docs/basics.md)
- [Supported Commands](docs/cmds.md)
- [AGE Cryptography](docs/age.md)
- [Admin DB 0 Model (access control, per-db encryption)](docs/admin.md)
- [JSON-RPC Examples (management API)](docs/rpc_examples.md)

docs/admin.md (new file, 182 lines)

@@ -0,0 +1,182 @@
# Admin Database 0 (`0.db`)
This page explains what the Admin Database `DB 0` is, why HeroDB uses it, and how to work with it as a developer and end-user. It's a practical guide covering how databases are created, listed, secured with access keys, and encrypted using per-database secrets.
## What is `DB 0`?
`DB 0` is the control-plane for a HeroDB instance. It stores metadata for all user databases (`db_id >= 1`) so the server can:
- Know which databases exist (without scanning the filesystem)
- Enforce access control (public/private with access keys)
- Enforce per-database encryption (whether a given database must be opened encrypted and with which write-only key)
`DB 0` itself is always encrypted with the admin secret (the process-level secret provided at startup).
## How `DB 0` is created and secured
- `DB 0` lives at `<base_dir>/0.db`
- It is always encrypted using the `admin secret` provided at process startup (using the `--admin-secret <secret>` CLI flag)
- Only clients that provide the correct admin secret can `SELECT 0` (see “`SELECT` + `KEY`” below)
At startup, the server bootstraps `DB 0` (initializes counters and structures) if it's missing.
## Metadata stored in `DB 0`
Keys in `DB 0` (internal layout, but useful to understand how things work):
- `admin:next_id`
- String counter holding the next id to allocate (initialized to `"1"`)
- `admin:dbs`
- A hash acting as a set of existing database ids
- field = id (as string), value = `"1"`
- `meta:db:<id>`
- A hash holding db-level metadata
- field `public` = `"true"` or `"false"` (defaults to `true` if missing)
- `meta:db:<id>:keys`
- A hash mapping access-key hashes to the string `Permission:created_at_seconds`
- Examples: `Read:1713456789` or `ReadWrite:1713456789`
- The plaintext access keys are never stored; only their `SHA-256` hashes are kept
- `meta:db:<id>:enc`
- A string holding the per-database encryption key used to open `<id>.db` encrypted
- This value is write-only from the perspective of the management APIs (it's set at creation and never returned)
- `age:key:<name>`
- Base64-encoded X25519 recipient (public encryption key) for named AGE keys
- `age:privkey:<name>`
- Base64-encoded X25519 identity (secret encryption key) for named AGE keys
- `age:signpub:<name>`
- Base64-encoded Ed25519 verify public key for named AGE keys
- `age:signpriv:<name>`
- Base64-encoded Ed25519 signing secret key for named AGE keys
> You don't need to manipulate these keys directly; they're listed to clarify the model. AGE keys are managed via AGE commands.
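Although you normally won't touch this layout directly, it can be inspected over RESP after authenticating to DB 0. A minimal sketch, assuming standard string/hash reads are permitted in DB 0 (output values are illustrative):
```bash
redis-cli -p 6379
SELECT 0 KEY myadminsecret
# → OK
GET admin:next_id
# → "3"
HGETALL admin:dbs
# → 1) "1"  2) "1"  3) "2"  4) "1"
HGETALL meta:db:1
# → 1) "public"  2) "true"
```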
## Database lifecycle
1) Create a database (via JSON-RPC)
- The server allocates an id from `admin:next_id`, registers it in `admin:dbs`, and defaults the database to `public=true`
- If you pass an optional `encryption_key` during creation, the server persists it in `meta:db:<id>:enc`. That database will be opened in encrypted mode from then on
2) Open and use a database
- Clients select a database over RESP using `SELECT`
- Authorization and encryption state are enforced using `DB 0` metadata
3) Delete database files
- Removing `<id>.db` removes the physical storage
- `DB 0` remains the source of truth for existence and may be updated by future management methods as the system evolves
## Access control model
- Public database (default)
- Anyone can `SELECT <id>` with no key, and will get `ReadWrite` permission
- Private database
- You must provide an access key when selecting the database
- The server hashes the provided key with `SHA-256` and checks membership in `meta:db:<id>:keys` (see the example below)
- Permissions are `Read` or `ReadWrite` depending on how the key was added
- Admin `DB 0`
- Requires the exact admin secret as the `KEY` argument to `SELECT 0`
- Permission is `ReadWrite` when the secret matches
Connections start with no database selected. Any command that requires storage (GET, SET, H*, L*, SCAN, etc.) will return an error until you issue a SELECT to choose a database. Admin DB 0 is never accessible without authenticating via `SELECT 0 KEY <admin_secret>`.
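To match entries returned by `listAccessKeys` against a plaintext key you hold, you can recompute the hash client-side. A sketch, assuming the server stores the lowercase hex SHA-256 of the plaintext:
```bash
echo -n 'my-db2-access-key' | sha256sum | awk '{print $1}'
# → 0123abcd...  (compare against the hashes in meta:db:<id>:keys)
```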
### How to select databases with optional `KEY`
- Public DB (no key required)
- `SELECT <id>`
- Private DB (access key required)
- `SELECT <id> KEY <plaintext_key>`
- Admin `DB 0` (admin secret required)
- `SELECT 0 KEY <admin_secret>`
Examples (using `redis-cli`):
```bash
# Public database
redis-cli -p $PORT SELECT 1
# → OK
# Private database
redis-cli -p $PORT SELECT 2 KEY my-db2-access-key
# → OK
# Admin DB 0
redis-cli -p $PORT SELECT 0 KEY my-admin-secret
# → OK
```
## Per-database encryption
- At database creation, you can provide an optional per-db encryption key
- If provided, the server persists that key in `DB 0` as `meta:db:<id>:enc`
- When you later open the database, the engine checks whether `meta:db:<id>:enc` exists to decide if it must open `<id>.db` in encrypted mode
- The per-db key is not returned by RPC—it is considered write-only configuration data
Operationally:
- Create with encryption: pass a non-null `encryption_key` to the `createDatabase` RPC
- Open later: simply `SELECT` the database; encryption is transparent to clients
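For example, a creation payload carrying a per-db key (same shape as in the [RPC examples](./rpc_examples.md), where the method prefix is `hero_`):
```json
{
  "jsonrpc": "2.0",
  "id": 1,
  "method": "hero_createDatabase",
  "params": [
    "Redb",
    { "name": "secure-db", "storage_path": null, "max_size": null, "redis_version": null },
    "my-per-db-encryption-key"
  ]
}
```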
## Management via JSON-RPC
You can manage databases using the management RPC (namespaced `herodb.*`). Typical operations:
- `createDatabase(backend, config, encryption_key?)`
- Allocates a new id, sets optional encryption key
- `listDatabases()`
- Lists database ids and info (including whether storage is currently encrypted)
- `getDatabaseInfo(db_id)`
- Returns details: backend, encrypted flag, size on disk, `key_count`, timestamps, etc.
- `addAccessKey(db_id, key, permissions)`
- Adds a `Read` or `ReadWrite` access key (permissions = `"read"` | `"readwrite"`)
- `listAccessKeys(db_id)`
- Returns hashes and permissions; you can use these hashes to delete keys
- `deleteAccessKey(db_id, key_hash)`
- Removes a key by its hash
- `setDatabasePublic(db_id, public)`
- Toggles public/private
Copyable JSON examples are provided in the [RPC examples documentation](./rpc_examples.md).
## Typical flows
1) Public, unencrypted database
- Create a new database without an encryption key
- Clients can immediately `SELECT <id>` without a key
- You can later make it private and add keys if needed
2) Private, encrypted database
- Create passing an `encryption_key`
- Mark it private (`setDatabasePublic false`) and add access keys
- Clients must use `SELECT <id> KEY <plaintext_access_key>`
- Storage opens in encrypted mode automatically
## Security notes
- Only `SHA-256` hashes of access keys are stored in `DB 0`; keep plaintext keys safe on the client side
- The per-db encryption key is never exposed via the API after it is set
- The admin secret must be kept secure; anyone with it can `SELECT 0` and perform administrative actions
## Troubleshooting
- `ERR invalid access key` when selecting a private db
- Ensure you passed the `KEY` argument: `SELECT <id> KEY <plaintext_key>`
- If you recently added the key, confirm the permissions and that you used the exact plaintext (hash must match)
- `Database X not found`
- The id isn't registered in `DB 0` (`admin:dbs`). Use the management APIs to create or list databases
- Cannot `SELECT 0`
- The `KEY` must be the exact admin secret passed at server startup
## Reference
- Admin metadata lives in `DB 0` (`0.db`) and controls:
- Existence: `admin:dbs`
- Access: `meta:db:<id>` (field `public`) and `meta:db:<id>:keys`
- Encryption: `meta:db:<id>:enc`
For command examples and management payloads:
- RESP command basics: `docs/basics.md`
- Supported commands: `docs/cmds.md`
- JSON-RPC examples: `docs/rpc_examples.md`

docs/age.md

@@ -1,188 +1,96 @@
# HeroDB AGE usage: Stateless vs KeyManaged
# HeroDB AGE Cryptography
This document explains how to use the AGE cryptography commands exposed by HeroDB over the Redis protocol in two modes:
- Stateless (ephemeral keys; nothing stored on the server)
- Key-managed (server-persisted, named keys)
HeroDB provides AGE-based asymmetric encryption and digital signatures over the Redis protocol using X25519 for encryption and Ed25519 for signatures. Keys can be used in stateless (ephemeral) or key-managed (persistent, named) modes.
If you are new to the codebase, the exact tests that exercise these behaviors are:
- [rust.test_07_age_stateless_suite()](herodb/tests/usage_suite.rs:495)
- [rust.test_08_age_persistent_named_suite()](herodb/tests/usage_suite.rs:555)
In key-managed mode, HeroDB uses a unified keypair concept: a single Ed25519 signing key is deterministically derived into X25519 keys for encryption, allowing one keypair to handle both encryption and signatures transparently.
Implementation entry points:
- [herodb/src/age.rs](herodb/src/age.rs)
- Dispatch from [herodb/src/cmd.rs](herodb/src/cmd.rs)
## Cryptographic Algorithms
Note: Database-at-rest encryption flags in the test harness are unrelated to AGE commands; those flags control storage-level encryption of DB files. See the harness near [rust.start_test_server()](herodb/tests/usage_suite.rs:10).
### X25519 (Encryption)
- Elliptic-curve Diffie-Hellman key exchange for symmetric key derivation.
- Used for encrypting/decrypting messages.
## Quick start
### Ed25519 (Signatures)
- EdDSA digital signatures for message authentication.
- Used for signing/verifying messages.
Assuming the server is running on localhost on some $PORT:
### Key Derivation
Ed25519 signing keys are deterministically converted to X25519 keys for encryption. This enables a single keypair to support both operations without additional keys. Derivation uses the Ed25519 secret scalar clamped for X25519.
In named keypairs, Ed25519 keys are stored, and X25519 keys are derived on-demand and cached.
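A minimal sketch of that conversion, using the conventional RFC 8032 expand-then-clamp construction with the `sha2` and `x25519-dalek` crates already in the dependency tree (illustrative only; HeroDB's actual implementation lives in [herodb/src/age.rs](herodb/src/age.rs) and may differ in detail):
```rust
use sha2::{Digest, Sha512};
use x25519_dalek::{PublicKey, StaticSecret};

// Derive an X25519 keypair from an Ed25519 seed: hash the seed as in
// RFC 8032 key expansion, keep the low 32 bytes, clamp per RFC 7748.
fn x25519_from_ed25519_seed(seed: &[u8; 32]) -> (StaticSecret, PublicKey) {
    let h = Sha512::digest(seed);     // 64-byte expansion
    let mut scalar = [0u8; 32];
    scalar.copy_from_slice(&h[..32]); // low half becomes the secret scalar
    scalar[0] &= 248;                 // clamping
    scalar[31] &= 127;
    scalar[31] |= 64;
    let secret = StaticSecret::from(scalar);
    let public = PublicKey::from(&secret);
    (secret, public)
}
```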
## Stateless Mode (Ephemeral Keys)
No server-side storage; keys are provided with each command.
Available commands:
- `AGE GENENC`: Generate ephemeral X25519 keypair. Returns `[recipient, identity]`.
- `AGE GENSIGN`: Generate ephemeral Ed25519 keypair. Returns `[verify_pub, sign_secret]`.
- `AGE ENCRYPT <recipient> <message>`: Encrypt message. Returns base64 ciphertext.
- `AGE DECRYPT <identity> <ciphertext_b64>`: Decrypt ciphertext. Returns plaintext.
- `AGE SIGN <sign_secret> <message>`: Sign message. Returns base64 signature.
- `AGE VERIFY <verify_pub> <message> <signature_b64>`: Verify signature. Returns 1 (valid) or 0 (invalid).
Example:
```bash
~/code/git.ourworld.tf/herocode/herodb/herodb/build.sh
~/code/git.ourworld.tf/herocode/herodb/target/release/herodb --dir /tmp/data --debug --port 6381 --encryption-key 1234 --encrypt
```
redis-cli AGE GENENC
# → 1) "age1qz..." # recipient (X25519 public)
# 2) "AGE-SECRET-KEY-1..." # identity (X25519 secret)
redis-cli AGE ENCRYPT "age1qz..." "hello"
# → base64_ciphertext
```bash
export PORT=6381
# Generate an ephemeral keypair and encrypt/decrypt a message (stateless mode)
redis-cli -p $PORT AGE GENENC
# → returns an array: [recipient, identity]
redis-cli -p $PORT AGE ENCRYPT <recipient> "hello world"
# → returns ciphertext (base64 in a bulk string)
redis-cli -p $PORT AGE DECRYPT <identity> <ciphertext_b64>
# → returns "hello world"
```
For key-managed mode, generate a named key once and reference it by name afterwards:
```bash
redis-cli -p $PORT AGE KEYGEN app1
# → persists encryption keypair under name "app1"
redis-cli -p $PORT AGE ENCRYPTNAME app1 "hello"
redis-cli -p $PORT AGE DECRYPTNAME app1 <ciphertext_b64>
```
## Stateless AGE (ephemeral)
Characteristics
- No server-side storage of keys.
- You pass the actual key material with every call.
- Not listable via AGE LIST.
Commands and examples
1) Ephemeral encryption keys
```bash
# Generate an ephemeral encryption keypair
redis-cli -p $PORT AGE GENENC
# Example output (abridged):
# 1) "age1qz..." # recipient (public key) = can be used by others e.g. to verify what I sign
# 2) "AGE-SECRET-KEY-1..." # identity (secret) = is like my private, cannot lose this one
# Encrypt with the recipient public key
redis-cli -p $PORT AGE ENCRYPT "age1qz..." "hello world"
# → returns bulk string payload: base64 ciphertext (encrypted content)
# Decrypt with the identity (secret), i.e. your private key
redis-cli -p $PORT AGE DECRYPT "AGE-SECRET-KEY-1..." "<ciphertext_b64>"
# → "hello world"
```
2) Ephemeral signing keys
> ? is this same as my private key
```bash
# Generate an ephemeral signing keypair
redis-cli -p $PORT AGE GENSIGN
# Example output:
# 1) "<verify_pub_b64>"
# 2) "<sign_secret_b64>"
# Sign a message with the secret
redis-cli -p $PORT AGE SIGN "<sign_secret_b64>" "msg"
# → returns "<signature_b64>"
# Verify with the public key
redis-cli -p $PORT AGE VERIFY "<verify_pub_b64>" "msg" "<signature_b64>"
# → 1 (valid) or 0 (invalid)
```
When to use
- You do not want the server to store private keys.
- You already manage key material on the client side.
- You need ad-hoc operations without persistence.
Reference test: [rust.test_07_age_stateless_suite()](herodb/tests/usage_suite.rs:495)
## Key-managed AGE (persistent, named)
Characteristics
- Server generates and persists keypairs under a chosen name.
- Clients refer to keys by name; raw secrets are not supplied on each call.
- Keys are discoverable via AGE LIST.
Commands and examples
1) Named encryption keys
```bash
# Create/persist a named encryption keypair
redis-cli -p $PORT AGE KEYGEN app1
# → returns [recipient, identity] but also stores them under name "app1"
> TODO: should not return identity (security, but there can be separate function to export it e.g. AGE EXPORTKEY app1)
# Encrypt using the stored public key
redis-cli -p $PORT AGE ENCRYPTNAME app1 "hello"
# → returns bulk string payload: base64 ciphertext
# Decrypt using the stored secret
redis-cli -p $PORT AGE DECRYPTNAME app1 "<ciphertext_b64>"
redis-cli AGE DECRYPT "AGE-SECRET-KEY-1..." base64_ciphertext
# → "hello"
```
2) Named signing keys
## Key-Managed Mode (Persistent Named Keys)
Keys are stored server-side under names. Supports unified keypairs for both encryption and signatures.
Available commands:
- `AGE KEYGEN <name>`: Generate and store unified keypair. Returns `[recipient, identity]` in age format.
- `AGE SIGNKEYGEN <name>`: Generate and store Ed25519 signing keypair. Returns `[verify_pub, sign_secret]`.
- `AGE ENCRYPTNAME <name> <message>`: Encrypt with named key. Returns base64 ciphertext.
- `AGE DECRYPTNAME <name> <ciphertext_b64>`: Decrypt with named key. Returns plaintext.
- `AGE SIGNNAME <name> <message>`: Sign with named key. Returns base64 signature.
- `AGE VERIFYNAME <name> <message> <signature_b64>`: Verify with named key. Returns 1 or 0.
- `AGE LIST`: List all stored key names. Returns sorted array of names.
### AGE LIST Output
Returns a flat, deduplicated, sorted array of key names (strings). Each name corresponds to a stored keypair, which may include encryption keys (X25519), signing keys (Ed25519), or both.
Output format: `["name1", "name2", ...]`
Example:
```bash
# Create/persist a named signing keypair
redis-cli -p $PORT AGE SIGNKEYGEN app1
# → returns [verify_pub_b64, sign_secret_b64] and stores under name "app1"
> TODO: should not return sign_secret_b64 (for security, but there can be separate function to export it e.g. AGE EXPORTSIGNKEY app1)
# Sign using the stored secret
redis-cli -p $PORT AGE SIGNNAME app1 "msg"
# → returns "<signature_b64>"
# Verify using the stored public key
redis-cli -p $PORT AGE VERIFYNAME app1 "msg" "<signature_b64>"
# → 1 (valid) or 0 (invalid)
redis-cli AGE LIST
# → 1) "<named_keypair_1>"
# 2) "<named_keypair_2>"
```
3) List stored AGE keys
For unified keypairs (from `AGE KEYGEN`), the name handles both encryption (derived X25519) and signatures (stored Ed25519) transparently.
Example with named keys:
```bash
redis-cli -p $PORT AGE LIST
# Example output includes labels such as "encpub" and your key names (e.g., "app1")
redis-cli AGE KEYGEN app1
# → 1) "age1..." # recipient
# 2) "AGE-SECRET-KEY-1..." # identity
redis-cli AGE ENCRYPTNAME app1 "secret message"
# → base64_ciphertext
redis-cli AGE DECRYPTNAME app1 base64_ciphertext
# → "secret message"
redis-cli AGE SIGNNAME app1 "message"
# → base64_signature
redis-cli AGE VERIFYNAME app1 "message" base64_signature
# → 1
```
When to use
- You want centralized key storage/rotation and fewer secrets on the client.
- You need names/labels for workflows and can trust the server with secrets.
- You want discoverability (AGE LIST) and simpler client commands.
## Choosing a Mode
- **Stateless**: For ad-hoc operations without persistence; client manages keys.
- **Key-managed**: For centralized key lifecycle; server stores keys for convenience and discoverability.
Reference test: [rust.test_08_age_persistent_named_suite()](herodb/tests/usage_suite.rs:555)
## Choosing a mode
- Prefer Stateless when:
- Minimizing server trust for secret material is the priority.
- Clients already have a secure mechanism to store/distribute keys.
- Prefer Key-managed when:
- Centralized lifecycle, naming, and discoverability are beneficial.
- You plan to integrate rotation, ACLs, or auditability on the server side.
## Security notes
- Treat identities and signing secrets as sensitive; avoid logging them.
- For key-managed mode, ensure server storage (and backups) are protected.
- AGE operations here are application-level crypto and are distinct from database-at-rest encryption configured in the test harness.
## Repository pointers
- Stateless examples in tests: [rust.test_07_age_stateless_suite()](herodb/tests/usage_suite.rs:495)
- Key-managed examples in tests: [rust.test_08_age_persistent_named_suite()](herodb/tests/usage_suite.rs:555)
- AGE implementation: [herodb/src/age.rs](herodb/src/age.rs)
- Command dispatch: [herodb/src/cmd.rs](herodb/src/cmd.rs)
- Bash demo: [herodb/examples/age_bash_demo.sh](herodb/examples/age_bash_demo.sh)
- Rust persistent demo: [herodb/examples/age_persist_demo.rs](herodb/examples/age_persist_demo.rs)
- Additional notes: [herodb/instructions/encrypt.md](herodb/instructions/encrypt.md)
- Implementation: [herodb/src/age.rs](herodb/src/age.rs)
- Tests: [herodb/tests/usage_suite.rs](herodb/tests/usage_suite.rs)

docs/cmds.md

@@ -1,4 +1,58 @@
Here's an expanded version of the cmds.md documentation to include the list commands:
# HeroDB Basics
## Launching HeroDB
To launch HeroDB, use the binary with required and optional flags. The `--admin-secret` flag is mandatory, encrypting the admin database (DB 0) and authorizing admin access.
### Launch Flags
- `--dir <path>`: Directory for database files (default: current directory).
- `--port <port>`: TCP port for Redis protocol (default: 6379).
- `--debug`: Enable debug logging.
- `--sled`: Use Sled backend (default: Redb).
- `--enable-rpc`: Start JSON-RPC management server on port 8080.
- `--rpc-port <port>`: Custom RPC port (default: 8080).
- `--admin-secret <secret>`: Required secret for DB 0 encryption and admin access.
Example:
```bash
./target/release/herodb --dir /tmp/herodb --admin-secret mysecret --port 6379 --enable-rpc
```
Deprecated flags (`--encrypt`, `--encryption-key`) are ignored for data DBs; per-database encryption is managed via RPC.
## Admin Database (DB 0)
DB 0 acts as the administrative database instance, storing metadata for all user databases (IDs >= 1). It controls existence, access control, and per-database encryption. DB 0 is always encrypted with the `--admin-secret`.
When creating a new database, DB 0 allocates an ID, registers it, and optionally stores a per-database encryption key (write-only). Databases are public by default; use RPC to set them private, requiring access keys for SELECT (read or readwrite based on permissions). Keys are persisted in DB 0 for managed AGE operations.
Access DB 0 with `SELECT 0 KEY <admin-secret>`.
## Symmetric Encryption
HeroDB supports stateless symmetric encryption via SYM commands, using XChaCha20-Poly1305 AEAD.
Commands:
- `SYM KEYGEN`: Generate 32-byte key. Returns base64-encoded key.
- `SYM ENCRYPT <key_b64> <message>`: Encrypt message. Returns base64 ciphertext.
- `SYM DECRYPT <key_b64> <ciphertext_b64>`: Decrypt. Returns plaintext.
Example:
```bash
redis-cli SYM KEYGEN
# → base64_key
redis-cli SYM ENCRYPT base64_key "secret"
# → base64_ciphertext
redis-cli SYM DECRYPT base64_key base64_ciphertext
# → "secret"
```
## RPC Options
Enable the JSON-RPC server with `--enable-rpc` for database management. Methods include creating databases, managing access keys, and setting encryption. See [JSON-RPC Examples](./rpc_examples.md) for payloads.
# HeroDB Commands
HeroDB implements a subset of Redis commands over the Redis protocol. This document describes the available commands and their usage.
@@ -575,6 +629,29 @@ redis-cli -p $PORT AGE LIST
# 2) "keyname2"
```
## SYM Commands
### SYM KEYGEN
Generate a symmetric encryption key.
```bash
redis-cli -p $PORT SYM KEYGEN
# → base64_encoded_32byte_key
```
### SYM ENCRYPT
Encrypt a message with a symmetric key.
```bash
redis-cli -p $PORT SYM ENCRYPT <key_b64> "message"
# → base64_encoded_ciphertext
```
### SYM DECRYPT
Decrypt a ciphertext with a symmetric key.
```bash
redis-cli -p $PORT SYM DECRYPT <key_b64> <ciphertext_b64>
# → decrypted_message
```
## Server Information Commands
### INFO
@@ -621,3 +698,27 @@ This expanded documentation includes all the list commands that were implemented
10. LINDEX - get element by index
11. LRANGE - get range of elements
## Updated Database Selection and Access Keys
HeroDB uses an `Admin DB 0` to control database existence, access, and encryption. Access to data DBs can be public (no key) or private (requires a key). See detailed model in `docs/admin.md`.
Examples:
```bash
# Public database (no key required)
redis-cli -p $PORT SELECT 1
# → OK
```
```bash
# Private database (requires access key)
redis-cli -p $PORT SELECT 2 KEY my-db2-access-key
# → OK
```
```bash
# Admin DB 0 (requires admin secret)
redis-cli -p $PORT SELECT 0 KEY my-admin-secret
# → OK
```

docs/basics.md

@@ -123,3 +123,34 @@ redis-cli -p 6379 --rdb dump.rdb
# Import to sled
redis-cli -p 6381 --pipe < dump.rdb
```
## Authentication and Database Selection
Connections start with no database selected. Any storage-backed command (GET, SET, H*, L*, SCAN, etc.) will return an error until you issue a SELECT to choose a database.
HeroDB uses an `Admin DB 0` to govern database existence, access and per-db encryption. Access control is enforced via `Admin DB 0` metadata. See the full model in (docs/admin.md:1).
Examples:
```bash
# Public database (no key required)
redis-cli -p $PORT SELECT 1
# → OK
```
```bash
# Private database (requires access key)
redis-cli -p $PORT SELECT 2 KEY my-db2-access-key
# → OK
```
```bash
# Admin DB 0 (requires admin secret)
redis-cli -p $PORT SELECT 0 KEY my-admin-secret
# → OK
```
```bash
# Before selecting a DB, storage commands will fail
redis-cli -p $PORT GET key
# → -ERR No database selected. Use SELECT <id> [KEY <key>] first
```

docs/lance.md (new file, 444 lines)

@@ -0,0 +1,444 @@
# Lance Vector Backend (RESP + JSON-RPC)
This document explains how to use HeroDB's Lance-backed vector store. It is text-first: users provide text, and HeroDB computes embeddings server-side (no manual vectors). It includes copy-pasteable RESP (redis-cli) and JSON-RPC examples for:
- Creating a Lance database
- Embedding provider configuration (OpenAI, Azure OpenAI, or deterministic test provider)
- Dataset lifecycle: CREATE, LIST, INFO, DROP
- Ingestion: STORE text (+ optional metadata)
- Search: QUERY with K, optional FILTER and RETURN
- Delete by id
- Index creation (currently a placeholder/no-op)
References:
- Implementation: [src/lance_store.rs](src/lance_store.rs), [src/cmd.rs](src/cmd.rs), [src/rpc.rs](src/rpc.rs), [src/server.rs](src/server.rs), [src/embedding.rs](src/embedding.rs)
Notes:
- Admin DB 0 cannot be Lance (or Tantivy). Only databases with id >= 1 can use Lance.
- Permissions:
- Read operations (SEARCH, LIST, INFO) require read permission.
- Mutating operations (CREATE, STORE, CREATEINDEX, DEL, DROP, EMBEDDING CONFIG SET) require readwrite permission.
- Backend gating:
- If a DB is Lance, only LANCE.* and basic control commands (PING, ECHO, SELECT, INFO, CLIENT, etc.) are permitted.
- If a DB is not Lance, LANCE.* commands return an error.
Storage layout and schema:
- Files live at: <base_dir>/lance/<db_id>/<dataset>.lance
- Records schema (see the arrow-schema sketch after this list):
- id: Utf8 (non-null)
- vector: FixedSizeList<Float32, dim> (non-null)
- text: Utf8 (nullable)
- meta: Utf8 JSON (nullable)
- Search is an L2 KNN brute-force scan for now (lower score = better). Index creation is a no-op placeholder to be implemented later.
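Expressed with `arrow-schema` types (already a dependency), the records schema looks roughly like this; a sketch, not HeroDB's actual constructor:
```rust
use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema};

// Illustrative: the documented record layout for a given vector dimension.
fn lance_records_schema(dim: i32) -> Schema {
    let item = Arc::new(Field::new("item", DataType::Float32, false));
    Schema::new(vec![
        Field::new("id", DataType::Utf8, false),
        Field::new("vector", DataType::FixedSizeList(item, dim), false),
        Field::new("text", DataType::Utf8, true),
        Field::new("meta", DataType::Utf8, true), // JSON serialized as a string
    ])
}
```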
Prerequisites:
- Start HeroDB with RPC enabled (for management calls):
- See [docs/basics.md](./basics.md) for flags. Example:
```bash
./target/release/herodb --dir /tmp/herodb --admin-secret mysecret --port 6379 --enable-rpc
```
## 0) Create a Lance-backed database (JSON-RPC)
Use the management API to create a database with backend "Lance". DB 0 is reserved for admin and cannot be Lance.
Request:
```json
{
"jsonrpc": "2.0",
"id": 1,
"method": "herodb_createDatabase",
"params": [
"Lance",
{ "name": "vectors-db", "storage_path": null, "max_size": null, "redis_version": null },
null
]
}
```
- Response contains the allocated db_id (>= 1). Use that id below (replace 1 with your actual id).
Select the database over RESP:
```bash
redis-cli -p 6379 SELECT 1
# → OK
```
## 1) Configure embedding provider (server-side embeddings)
HeroDB embeds text internally at STORE/SEARCH time using a per-dataset EmbeddingConfig sidecar. Configure provider before creating a dataset to choose dimensions and provider.
Supported providers:
- openai (standard OpenAI or Azure OpenAI)
- testhash (deterministic, CI-friendly; no network)
Environment variables for OpenAI:
- Standard OpenAI: export OPENAI_API_KEY=sk-...
- Azure OpenAI: export AZURE_OPENAI_API_KEY=...
RESP examples:
```bash
# Standard OpenAI with default dims (model-dependent, e.g. 1536)
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET myset PROVIDER openai MODEL text-embedding-3-small
# OpenAI with reduced output dimension (e.g., 512) when supported
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET myset PROVIDER openai MODEL text-embedding-3-small PARAM dim 512
# Azure OpenAI (set env: AZURE_OPENAI_API_KEY)
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET myset PROVIDER openai MODEL text-embedding-3-small \
PARAM use_azure true \
PARAM azure_endpoint https://myresource.openai.azure.com \
PARAM azure_deployment my-embed-deploy \
PARAM azure_api_version 2024-02-15 \
PARAM dim 512
# Deterministic test provider (no network, stable vectors)
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET myset PROVIDER testhash MODEL any
```
Read config:
```bash
redis-cli -p 6379 LANCE.EMBEDDING CONFIG GET myset
# → JSON blob describing provider/model/params
```
JSON-RPC examples:
```json
{
"jsonrpc": "2.0",
"id": 2,
"method": "herodb_lanceSetEmbeddingConfig",
"params": [
1,
"myset",
"openai",
"text-embedding-3-small",
{ "dim": "512" }
]
}
```
```json
{
"jsonrpc": "2.0",
"id": 3,
"method": "herodb_lanceGetEmbeddingConfig",
"params": [1, "myset"]
}
```
## 2) Create a dataset
Choose a dimension that matches your embedding configuration. For OpenAI text-embedding-3-small without dimension override, typical dimension is 1536; when `dim` is set (e.g., 512), use that. The current API requires an explicit DIM.
RESP:
```bash
redis-cli -p 6379 LANCE.CREATE myset DIM 512
# → OK
```
JSON-RPC:
```json
{
"jsonrpc": "2.0",
"id": 4,
"method": "herodb_lanceCreate",
"params": [1, "myset", 512]
}
```
## 3) Store text documents (server-side embedding)
Provide your id, the text to embed, and optional META fields. The server computes the embedding using the configured provider and stores id/vector/text/meta in the Lance dataset. Upserts by id are supported via delete-then-append semantics.
RESP:
```bash
redis-cli -p 6379 LANCE.STORE myset ID doc-1 TEXT "Hello vector world" META title "Hello" category "demo"
# → OK
```
JSON-RPC:
```json
{
"jsonrpc": "2.0",
"id": 5,
"method": "herodb_lanceStoreText",
"params": [
1,
"myset",
"doc-1",
"Hello vector world",
{ "title": "Hello", "category": "demo" }
]
}
```
## 4) Search with a text query
Provide a query string; the server embeds it and performs KNN search. Optional: FILTER expression and RETURN subset of fields.
RESP:
```bash
# K nearest neighbors for the query text
redis-cli -p 6379 LANCE.SEARCH myset K 5 QUERY "greetings to vectors"
# → Array of hits: [id, score, [k,v, ...]] pairs, lower score = closer
# With a filter on meta fields and return only title
redis-cli -p 6379 LANCE.SEARCH myset K 3 QUERY "greetings to vectors" FILTER "category = 'demo'" RETURN 1 title
```
JSON-RPC:
```json
{
"jsonrpc": "2.0",
"id": 6,
"method": "herodb_lanceSearchText",
"params": [1, "myset", "greetings to vectors", 5, null, null]
}
```
With filter and selected fields:
```json
{
"jsonrpc": "2.0",
"id": 7,
"method": "herodb_lanceSearchText",
"params": [1, "myset", "greetings to vectors", 3, "category = 'demo'", ["title"]]
}
```
Response shape:
- RESP over redis-cli: an array of hits [id, score, [k, v, ...]].
- JSON-RPC returns an object containing the RESP-encoded wire format string or a structured result depending on implementation. See [src/rpc.rs](src/rpc.rs) for details.
## 5) Create an index (placeholder)
Index creation currently returns OK but is a no-op. It will integrate Lance vector indices in a future update.
RESP:
```bash
redis-cli -p 6379 LANCE.CREATEINDEX myset TYPE "ivf_pq" PARAM nlist 100 PARAM pq_m 16
# → OK (no-op for now)
```
JSON-RPC:
```json
{
"jsonrpc": "2.0",
"id": 8,
"method": "herodb_lanceCreateIndex",
"params": [1, "myset", "ivf_pq", { "nlist": "100", "pq_m": "16" }]
}
```
## 6) Inspect datasets
RESP:
```bash
# List datasets in current Lance DB
redis-cli -p 6379 LANCE.LIST
# Get dataset info
redis-cli -p 6379 LANCE.INFO myset
```
JSON-RPC:
```json
{
"jsonrpc": "2.0",
"id": 9,
"method": "herodb_lanceList",
"params": [1]
}
```
```json
{
"jsonrpc": "2.0",
"id": 10,
"method": "herodb_lanceInfo",
"params": [1, "myset"]
}
```
## 7) Delete and drop
RESP:
```bash
# Delete by id
redis-cli -p 6379 LANCE.DEL myset doc-1
# → OK
# Drop the entire dataset
redis-cli -p 6379 LANCE.DROP myset
# → OK
```
JSON-RPC:
```json
{
"jsonrpc": "2.0",
"id": 11,
"method": "herodb_lanceDel",
"params": [1, "myset", "doc-1"]
}
```
```json
{
"jsonrpc": "2.0",
"id": 12,
"method": "herodb_lanceDrop",
"params": [1, "myset"]
}
```
## 8) End-to-end example (RESP)
```bash
# 1. Select Lance DB (assume db_id=1 created via RPC)
redis-cli -p 6379 SELECT 1
# 2. Configure embedding provider (OpenAI small model at 512 dims)
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET myset PROVIDER openai MODEL text-embedding-3-small PARAM dim 512
# 3. Create dataset
redis-cli -p 6379 LANCE.CREATE myset DIM 512
# 4. Store documents
redis-cli -p 6379 LANCE.STORE myset ID doc-1 TEXT "The quick brown fox jumps over the lazy dog" META title "Fox" category "animal"
redis-cli -p 6379 LANCE.STORE myset ID doc-2 TEXT "A fast auburn fox vaulted a sleepy canine" META title "Fox paraphrase" category "animal"
# 5. Search
redis-cli -p 6379 LANCE.SEARCH myset K 2 QUERY "quick brown fox" RETURN 1 title
# 6. Dataset info and listing
redis-cli -p 6379 LANCE.INFO myset
redis-cli -p 6379 LANCE.LIST
# 7. Delete and drop
redis-cli -p 6379 LANCE.DEL myset doc-2
redis-cli -p 6379 LANCE.DROP myset
```
## 9) End-to-end example (JSON-RPC)
Assume RPC server on port 8080. Replace ids and ports as needed.
1) Create Lance DB:
```json
{
"jsonrpc": "2.0",
"id": 100,
"method": "herodb_createDatabase",
"params": ["Lance", { "name": "vectors-db", "storage_path": null, "max_size": null, "redis_version": null }, null]
}
```
2) Set embedding config:
```json
{
"jsonrpc": "2.0",
"id": 101,
"method": "herodb_lanceSetEmbeddingConfig",
"params": [1, "myset", "openai", "text-embedding-3-small", { "dim": "512" }]
}
```
3) Create dataset:
```json
{
"jsonrpc": "2.0",
"id": 102,
"method": "herodb_lanceCreate",
"params": [1, "myset", 512]
}
```
4) Store text:
```json
{
"jsonrpc": "2.0",
"id": 103,
"method": "herodb_lanceStoreText",
"params": [1, "myset", "doc-1", "The quick brown fox jumps over the lazy dog", { "title": "Fox", "category": "animal" }]
}
```
5) Search text:
```json
{
"jsonrpc": "2.0",
"id": 104,
"method": "herodb_lanceSearchText",
"params": [1, "myset", "quick brown fox", 2, null, ["title"]]
}
```
6) Info/list:
```json
{
"jsonrpc": "2.0",
"id": 105,
"method": "herodb_lanceInfo",
"params": [1, "myset"]
}
```
```json
{
"jsonrpc": "2.0",
"id": 106,
"method": "herodb_lanceList",
"params": [1]
}
```
7) Delete/drop:
```json
{
"jsonrpc": "2.0",
"id": 107,
"method": "herodb_lanceDel",
"params": [1, "myset", "doc-1"]
}
```
```json
{
"jsonrpc": "2.0",
"id": 108,
"method": "herodb_lanceDrop",
"params": [1, "myset"]
}
```
## 10) Operational notes and troubleshooting
- If using OpenAI and you see “missing API key env”, set:
- Standard: `export OPENAI_API_KEY=sk-...`
- Azure: `export AZURE_OPENAI_API_KEY=...` and pass `use_azure true`, `azure_endpoint`, `azure_deployment`, `azure_api_version`.
- Dimensions mismatch:
- Ensure the dataset DIM equals the provider's embedding dim. For OpenAI text-embedding-3 models, set `PARAM dim 512` (or another supported size) and use that same DIM for `LANCE.CREATE`.
- DB 0 restriction:
- Lance is not allowed on DB 0. Use db_id >= 1.
- Permissions:
- Read operations (SEARCH, LIST, INFO) require read permission.
- Mutations (CREATE, STORE, CREATEINDEX, DEL, DROP, EMBEDDING CONFIG SET) require readwrite permission.
- Backend gating:
- On Lance DBs, only LANCE.* commands are accepted (plus basic control).
- Current index behavior:
- `LANCE.CREATEINDEX` returns OK but is a no-op. Future versions will integrate Lance vector indices.
- Implementation files for reference:
- [src/lance_store.rs](src/lance_store.rs), [src/cmd.rs](src/cmd.rs), [src/rpc.rs](src/rpc.rs), [src/server.rs](src/server.rs), [src/embedding.rs](src/embedding.rs)


@@ -0,0 +1,138 @@
# LanceDB Text and Images: End-to-End Example
This guide demonstrates creating a Lance backend database, ingesting two text documents and two images, performing searches over both, and cleaning up the datasets.
Prerequisites
- Build HeroDB and start the server with JSON-RPC enabled.
Commands:
```bash
cargo build --release
./target/release/herodb --dir /tmp/herodb --admin-secret mysecret --port 6379 --enable-rpc
```
We'll use:
- redis-cli for RESP commands against port 6379
- curl for JSON-RPC against 8080 if desired
- Deterministic local embedders to avoid external dependencies: testhash (text, dim 64) and testimagehash (image, dim 512)
0) Create a Lance-backed database (JSON-RPC)
Request:
```json
{ "jsonrpc": "2.0", "id": 1, "method": "herodb_createDatabase", "params": ["Lance", { "name": "media-db", "storage_path": null, "max_size": null, "redis_version": null }, null] }
```
Response returns db_id (assume 1). Select DB over RESP:
```bash
redis-cli -p 6379 SELECT 1
# → OK
```
1) Configure embedding providers
We'll create two datasets with independent embedding configs:
- textset → provider testhash, dim 64
- imageset → provider testimagehash, dim 512
Text config:
```bash
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET textset PROVIDER testhash MODEL any PARAM dim 64
# → OK
```
Image config:
```bash
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET imageset PROVIDER testimagehash MODEL any PARAM dim 512
# → OK
```
2) Create datasets
```bash
redis-cli -p 6379 LANCE.CREATE textset DIM 64
# → OK
redis-cli -p 6379 LANCE.CREATE imageset DIM 512
# → OK
```
3) Ingest two text documents (server-side embedding)
```bash
redis-cli -p 6379 LANCE.STORE textset ID doc-1 TEXT "The quick brown fox jumps over the lazy dog" META title "Fox" category "animal"
# → OK
redis-cli -p 6379 LANCE.STORE textset ID doc-2 TEXT "A fast auburn fox vaulted a sleepy canine" META title "Paraphrase" category "animal"
# → OK
```
4) Ingest two images
You can provide a URI or base64 bytes. Use URI for URIs, BYTES for base64 data.
Example using free placeholder images:
```bash
# Store via URI
redis-cli -p 6379 LANCE.STOREIMAGE imageset ID img-1 URI "https://picsum.photos/seed/1/256/256" META title "Seed1" group "demo"
# → OK
redis-cli -p 6379 LANCE.STOREIMAGE imageset ID img-2 URI "https://picsum.photos/seed/2/256/256" META title "Seed2" group "demo"
# → OK
```
If your environment blocks outbound HTTP, you can embed image bytes:
```bash
# Example: read a local file and base64 it (replace path)
b64=$(base64 -w0 ./image1.png)
redis-cli -p 6379 LANCE.STOREIMAGE imageset ID img-b64-1 BYTES "$b64" META title "Local1" group "demo"
```
5) Search text
```bash
# Top-2 nearest neighbors for a query
redis-cli -p 6379 LANCE.SEARCH textset K 2 QUERY "quick brown fox" RETURN 1 title
# → 1) [id, score, [k1,v1,...]]
```
With a filter (supports equality on schema or meta keys):
```bash
redis-cli -p 6379 LANCE.SEARCH textset K 2 QUERY "fox jumps" FILTER "category = 'animal'" RETURN 1 title
```
6) Search images
```bash
# Provide a URI as the query
redis-cli -p 6379 LANCE.SEARCHIMAGE imageset K 2 QUERYURI "https://picsum.photos/seed/1/256/256" RETURN 1 title
# Or provide base64 bytes as the query
qb64=$(curl -s https://picsum.photos/seed/3/256/256 | base64 -w0)
redis-cli -p 6379 LANCE.SEARCHIMAGE imageset K 2 QUERYBYTES "$qb64" RETURN 1 title
```
7) Inspect datasets
```bash
redis-cli -p 6379 LANCE.LIST
redis-cli -p 6379 LANCE.INFO textset
redis-cli -p 6379 LANCE.INFO imageset
```
8) Delete by id and drop datasets
```bash
# Delete one record
redis-cli -p 6379 LANCE.DEL textset doc-2
# → OK
# Drop entire datasets
redis-cli -p 6379 LANCE.DROP textset
redis-cli -p 6379 LANCE.DROP imageset
# → OK
```
Appendix: Using OpenAI embeddings instead of test providers
Text:
```bash
export OPENAI_API_KEY=sk-...
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET textset PROVIDER openai MODEL text-embedding-3-small PARAM dim 512
redis-cli -p 6379 LANCE.CREATE textset DIM 512
```
Azure OpenAI:
```bash
export AZURE_OPENAI_API_KEY=...
redis-cli -p 6379 LANCE.EMBEDDING CONFIG SET textset PROVIDER openai MODEL text-embedding-3-small \
PARAM use_azure true \
PARAM azure_endpoint https://myresource.openai.azure.com \
PARAM azure_deployment my-embed-deploy \
PARAM azure_api_version 2024-02-15 \
PARAM dim 512
```
Notes:
- Ensure dataset DIM matches the configured embedding dimension.
- Lance is only available for non-admin databases (db_id >= 1).
- On Lance DBs, only LANCE.* and basic control commands are allowed.

docs/rpc_examples.md (new file, 141 lines)

@@ -0,0 +1,141 @@
# HeroDB JSON-RPC Examples
These examples show full JSON-RPC 2.0 payloads for managing HeroDB via the RPC API (enable with `--enable-rpc`). Methods are named as `hero_<function>`. Params are positional arrays; enum values are strings (e.g., `"Redb"`). Copy-paste into Postman or similar clients.
## Database Management
### Create Database
Creates a new database with optional per-database encryption key (stored write-only in Admin DB 0).
```json
{
"jsonrpc": "2.0",
"id": 1,
"method": "hero_createDatabase",
"params": [
"Redb",
{ "name": null, "storage_path": null, "max_size": null, "redis_version": null },
null
]
}
```
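To send any of these payloads from a shell, POST it to the RPC endpoint. A sketch, assuming the server was started with `--enable-rpc` on the default port 8080 and serves JSON-RPC at the root path:
```bash
curl -s http://localhost:8080 \
  -H 'Content-Type: application/json' \
  -d '{"jsonrpc":"2.0","id":1,"method":"hero_createDatabase","params":["Redb",{"name":null,"storage_path":null,"max_size":null,"redis_version":null},null]}'
```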
With encryption:
```json
{
"jsonrpc": "2.0",
"id": 2,
"method": "hero_createDatabase",
"params": [
"Sled",
{ "name": "secure-db", "storage_path": null, "max_size": null, "redis_version": null },
"my-per-db-encryption-key"
]
}
```
### List Databases
Returns array of database infos (id, backend, encrypted status, size, etc.).
```json
{
"jsonrpc": "2.0",
"id": 3,
"method": "hero_listDatabases",
"params": []
}
```
### Get Database Info
Retrieves detailed info for a specific database.
```json
{
"jsonrpc": "2.0",
"id": 4,
"method": "hero_getDatabaseInfo",
"params": [1]
}
```
### Delete Database
Removes physical database file; metadata remains in Admin DB 0.
```json
{
"jsonrpc": "2.0",
"id": 5,
"method": "hero_deleteDatabase",
"params": [1]
}
```
## Access Control
### Add Access Key
Adds a hashed access key for private databases. Permissions: `"read"` or `"readwrite"`.
```json
{
"jsonrpc": "2.0",
"id": 6,
"method": "hero_addAccessKey",
"params": [2, "my-access-key", "readwrite"]
}
```
### List Access Keys
Returns array of key hashes, permissions, and creation timestamps.
```json
{
"jsonrpc": "2.0",
"id": 7,
"method": "hero_listAccessKeys",
"params": [2]
}
```
### Delete Access Key
Removes key by its SHA-256 hash.
```json
{
"jsonrpc": "2.0",
"id": 8,
"method": "hero_deleteAccessKey",
"params": [2, "0123abcd...keyhash..."]
}
```
### Set Database Public/Private
Toggles public access (default true). Private databases require access keys.
```json
{
"jsonrpc": "2.0",
"id": 9,
"method": "hero_setDatabasePublic",
"params": [2, false]
}
```
## Server Info
### Get Server Stats
Returns stats like total databases and uptime.
```json
{
"jsonrpc": "2.0",
"id": 10,
"method": "hero_getServerStats",
"params": []
}
```
## Notes
- Per-database encryption keys are write-only; set at creation and used transparently.
- Access keys are hashed (SHA-256) for storage; provide plaintext in requests.
- Backend options: `"Redb"` (default) or `"Sled"`.
- Config object fields (name, storage_path, etc.) are optional and currently ignored, but the config object itself must still be passed positionally.

docs/tantivy.md (new file, 253 lines)

@@ -0,0 +1,253 @@
# Tantivy Full-Text Backend (JSON-RPC)
This document explains how to use HeroDB's Tantivy-backed full-text search as a dedicated database backend and provides copy-pasteable JSON-RPC requests. Tantivy is available only for non-admin databases (db_id >= 1). Admin DB 0 always uses Redb/Sled and rejects FT operations.
Important characteristics:
- Tantivy is a third backend alongside Redb and Sled. It provides search indexes only; there is no KV store backing it.
- On Tantivy databases, Redis KV/list/hash commands are rejected; only FT commands and basic control (SELECT, CLIENT, INFO, etc.) are allowed.
- FT JSON-RPC is namespaced as "herodb" and methods are named with underscore: herodb_ftCreate, herodb_ftAdd, herodb_ftSearch, herodb_ftDel, herodb_ftInfo, herodb_ftDrop.
Reference to server implementation:
- RPC methods are defined in [rust.trait Rpc()](src/rpc.rs:70):
- [rust.fn ft_create()](src/rpc.rs:121)
- [rust.fn ft_add()](src/rpc.rs:130)
- [rust.fn ft_search()](src/rpc.rs:141)
- [rust.fn ft_del()](src/rpc.rs:154)
- [rust.fn ft_info()](src/rpc.rs:158)
- [rust.fn ft_drop()](src/rpc.rs:162)
Notes on responses:
- ftCreate/ftAdd/ftDel/ftDrop return a JSON boolean: true on success.
- ftSearch/ftInfo return a JSON object with a single key "resp" containing a RESP-encoded string (wire format used by Redis). You can display or parse it on the client side as needed.
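For reference, RESP frames an array as `*<count>` followed by `$<byte-length>`-prefixed bulk strings, each terminated by CRLF. An illustrative two-element array (not the exact ftSearch layout):
```
*2\r\n$9\r\nproduct:1\r\n$4\r\n0.95\r\n
```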
RESP usage (redis-cli):
- For RESP clients, you must SELECT the Tantivy database first. SELECT now succeeds for Tantivy DBs without opening KV storage.
- After SELECT, you can run FT.* commands within that DB context.
Example with redis-cli:
```bash
# Connect to server
redis-cli -p 6379
# Select Tantivy DB 1 (public by default)
SELECT 1
# → OK
# Create index
FT.CREATE product_catalog SCHEMA title TEXT description TEXT category TAG price NUMERIC rating NUMERIC location GEO
# → OK
# Add a document
FT.ADD product_catalog product:1 1.0 title "Wireless Bluetooth Headphones" description "Premium noise-canceling headphones with 30-hour battery life" category "electronics,audio" price 299.99 rating 4.5 location "-122.4194,37.7749"
# → OK
# Search
FT.SEARCH product_catalog wireless LIMIT 0 3
# → RESP array with hits
```
Storage layout (on disk):
- Indices are stored per database under:
- <base_dir>/search_indexes/<db_id>/<index_name>
- Example: /tmp/test/search_indexes/1/product_catalog
0) Create a new Tantivy database
Use herodb_createDatabase with backend "Tantivy". DB 0 cannot be Tantivy.
```json
{
"jsonrpc": "2.0",
"id": 1,
"method": "herodb_createDatabase",
"params": [
"Tantivy",
{ "name": "search-db", "storage_path": null, "max_size": null, "redis_version": null },
null
]
}
```
The response contains the allocated db_id (>= 1). Use that id in the calls below.
1) FT.CREATE — create an index with schema
Method: herodb_ftCreate → [rust.fn ft_create()](src/rpc.rs:121)
Schema format is an array of tuples: [ [field_name, field_type, [options...] ], ... ]
Supported field types: "TEXT", "NUMERIC" (defaults to F64), "TAG", "GEO"
Supported options (subset): "WEIGHT", "SORTABLE", "NOINDEX", "SEPARATOR", "CASESENSITIVE"
```json
{
"jsonrpc": "2.0",
"id": 2,
"method": "herodb_ftCreate",
"params": [
1,
"product_catalog",
[
["title", "TEXT", ["SORTABLE"]],
["description", "TEXT", []],
["category", "TAG", ["SEPARATOR", ","]],
["price", "NUMERIC", ["SORTABLE"]],
["rating", "NUMERIC", []],
["location", "GEO", []]
]
]
}
```
Returns: true on success.
2) FT.ADD — add or replace a document
Method: herodb_ftAdd → [rust.fn ft_add()](src/rpc.rs:130)
Fields is an object (map) of field_name → value (all values are sent as strings). GEO expects "lat,lon".
```json
{
"jsonrpc": "2.0",
"id": 3,
"method": "herodb_ftAdd",
"params": [
1,
"product_catalog",
"product:1",
1.0,
{
"title": "Wireless Bluetooth Headphones",
"description": "Premium noise-canceling headphones with 30-hour battery life",
"category": "electronics,audio",
"price": "299.99",
"rating": "4.5",
"location": "-122.4194,37.7749"
}
]
}
```
Returns: true on success.
3) FT.SEARCH — query an index
Method: herodb_ftSearch → [rust.fn ft_search()](src/rpc.rs:141)
Parameters: (db_id, index_name, query, filters?, limit?, offset?, return_fields?)
- filters: array of [field, value] pairs (Equals filter)
- limit/offset: numbers (defaults: limit=10, offset=0)
- return_fields: array of field names to include (optional)
Simple query:
```json
{
"jsonrpc": "2.0",
"id": 4,
"method": "herodb_ftSearch",
"params": [1, "product_catalog", "wireless", null, 10, 0, null]
}
```
Pagination + filters + selected fields:
```json
{
"jsonrpc": "2.0",
"id": 5,
"method": "herodb_ftSearch",
"params": [
1,
"product_catalog",
"mouse",
[["category", "electronics"]],
5,
0,
["title", "price", "rating"]
]
}
```
Response shape:
```json
{
"jsonrpc": "2.0",
"id": 5,
"result": { "resp": "*...RESP encoded array..." }
}
```
4) FT.INFO — index metadata
Method: herodb_ftInfo → [rust.fn ft_info()](src/rpc.rs:158)
```json
{
"jsonrpc": "2.0",
"id": 6,
"method": "herodb_ftInfo",
"params": [1, "product_catalog"]
}
```
Response shape:
```json
{
"jsonrpc": "2.0",
"id": 6,
"result": { "resp": "*...RESP encoded array with fields and counts..." }
}
```
5) FT.DEL — delete by doc id
Method: herodb_ftDel → [rust.fn ft_del()](src/rpc.rs:154)
```json
{
"jsonrpc": "2.0",
"id": 7,
"method": "herodb_ftDel",
"params": [1, "product_catalog", "product:1"]
}
```
Returns: true on success. Note: current implementation logs and returns success; physical delete may be a no-op until delete is finalized in the engine.
6) FT.DROP — drop an index
Method: herodb_ftDrop → [rust.fn ft_drop()](src/rpc.rs:162)
```json
{
"jsonrpc": "2.0",
"id": 8,
"method": "herodb_ftDrop",
"params": [1, "product_catalog"]
}
```
Returns: true on success.
Field types and options
- TEXT: stored/indexed/tokenized text. "SORTABLE" marks it fast (stored + fast path in our wrapper).
- NUMERIC: stored/indexed numeric; default precision F64. "SORTABLE" enables fast column.
- TAG: exact matching terms. Options: "SEPARATOR" (default ","), "CASESENSITIVE" (default false).
- GEO: "lat,lon" string; stored as two numeric fields internally.
Backend and permission gating
- FT methods are rejected on DB 0.
- FT methods require the database backend to be Tantivy; otherwise RPC returns an error.
- Write-like FT methods (create/add/del/drop) follow the same permission model as Redis writes on selected databases.
Troubleshooting
- "DB backend is not Tantivy": ensure the database was created with backend "Tantivy".
- “FT not allowed on DB 0”: use a non-admin database id (>= 1).
- Empty search results: confirm that the queried fields are tokenized/indexed (TEXT) and that documents were added successfully.
Related docs
- Command-level search overview: [docs/search.md](docs/search.md:1)
- RPC definitions: [src/rpc.rs](src/rpc.rs:1)

run.sh (new executable file, 143 lines)

@@ -0,0 +1,143 @@
#!/bin/bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# Test script for HeroDB - Redis-compatible database with redb backend
# This script builds the project, starts the server, and provides a helper for running command tests
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Configuration
DB_DIR="/tmp/test_db"
PORT=6381
SERVER_PID=""
# Function to print colored output
print_status() {
echo -e "${BLUE}[INFO]${NC} $1"
}
print_success() {
echo -e "${GREEN}[SUCCESS]${NC} $1"
}
print_error() {
echo -e "${RED}[ERROR]${NC} $1"
}
print_warning() {
echo -e "${YELLOW}[WARNING]${NC} $1"
}
# Function to cleanup on exit
cleanup() {
if [ ! -z "$SERVER_PID" ]; then
print_status "Stopping HeroDB server (PID: $SERVER_PID)..."
kill $SERVER_PID 2>/dev/null || true
wait $SERVER_PID 2>/dev/null || true
fi
# Clean up test database
if [ -d "$DB_DIR" ]; then
print_status "Cleaning up test database directory..."
rm -rf "$DB_DIR"
fi
}
# Set trap to cleanup on script exit
trap cleanup EXIT
# Function to wait for server to start
wait_for_server() {
local max_attempts=30
local attempt=1
print_status "Waiting for server to start on port $PORT..."
while [ $attempt -le $max_attempts ]; do
if nc -z localhost $PORT 2>/dev/null; then
print_success "Server is ready!"
return 0
fi
echo -n "."
sleep 1
attempt=$((attempt + 1))
done
print_error "Server failed to start within $max_attempts seconds"
return 1
}
# Function to send Redis command and get response
redis_cmd() {
local cmd="$1"
local expected="$2"
print_status "Testing: $cmd"
local result=$(echo "$cmd" | redis-cli -p $PORT --raw 2>/dev/null || echo "ERROR")
if [ "$expected" != "" ] && [ "$result" != "$expected" ]; then
print_error "Expected: '$expected', Got: '$result'"
return 1
else
print_success "$cmd -> $result"
return 0
fi
}
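# Example usage (hypothetical expectations; pass "" to skip the comparison):
#   redis_cmd "PING" "PONG"
#   redis_cmd "SET foo bar" "OK"
#   redis_cmd "GET foo" "bar"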
# Main execution
main() {
print_status "Starting HeroDB"
# Build the project
print_status "Building HeroDB..."
if ! cargo build -p herodb --release; then
print_error "Failed to build HeroDB"
exit 1
fi
# Create test database directory
mkdir -p "$DB_DIR"
# Start the server
print_status "Starting HeroDB server..."
${SCRIPT_DIR}/target/release/herodb --dir "$DB_DIR" --port $PORT &
SERVER_PID=$!
# Wait for server to start
if ! wait_for_server; then
print_error "Failed to start server"
exit 1
fi
}
# Check dependencies
check_dependencies() {
if ! command -v cargo &> /dev/null; then
print_error "cargo is required but not installed"
exit 1
fi
if ! command -v nc &> /dev/null; then
print_warning "netcat (nc) not found - some tests may not work properly"
fi
if ! command -v redis-cli &> /dev/null; then
print_warning "redis-cli not found - using netcat fallback"
fi
}
# Run dependency check and main function
check_dependencies
main "$@"
tail -f /dev/null # keep the script (and the spawned server) running until interrupted

(script diff; filename not shown)

@@ -1,4 +1,7 @@
#!/bin/bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
echo "🧪 Running HeroDB Redis Compatibility Tests"
echo "=========================================="

src/admin_meta.rs (new file)

@@ -0,0 +1,501 @@
use std::path::{Path, PathBuf};
use std::sync::{Arc, OnceLock, Mutex, RwLock};
use std::collections::HashMap;
use crate::error::DBError;
use crate::options;
use crate::rpc::Permissions;
use crate::storage::Storage;
use crate::storage_sled::SledStorage;
use crate::storage_trait::StorageBackend;
// Key builders
fn k_admin_next_id() -> &'static str {
"admin:next_id"
}
fn k_admin_dbs() -> &'static str {
"admin:dbs"
}
fn k_meta_db(id: u64) -> String {
format!("meta:db:{}", id)
}
fn k_meta_db_keys(id: u64) -> String {
format!("meta:db:{}:keys", id)
}
fn k_meta_db_enc(id: u64) -> String {
format!("meta:db:{}:enc", id)
}
// Global cache of admin DB 0 handles per base_dir to avoid sled/redb file-lock contention
// and to correctly isolate different test instances with distinct directories.
static ADMIN_STORAGES: OnceLock<RwLock<HashMap<String, Arc<dyn StorageBackend>>>> = OnceLock::new();
// Global registry for data DB storages to avoid double-open across the process.
static DATA_STORAGES: OnceLock<RwLock<HashMap<u64, Arc<dyn StorageBackend>>>> = OnceLock::new();
static DATA_INIT_LOCK: Mutex<()> = Mutex::new(());
fn init_admin_storage(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
) -> Result<Arc<dyn StorageBackend>, DBError> {
let db_file = base_dir.join("0.db");
if let Some(parent_dir) = db_file.parent() {
std::fs::create_dir_all(parent_dir).map_err(|e| {
DBError(format!("Failed to create directory {}: {}", parent_dir.display(), e))
})?;
}
let storage: Arc<dyn StorageBackend> = match backend {
options::BackendType::Redb => Arc::new(Storage::new(&db_file, true, Some(admin_secret))?),
options::BackendType::Sled => Arc::new(SledStorage::new(&db_file, true, Some(admin_secret))?),
options::BackendType::Tantivy | options::BackendType::Lance => {
return Err(DBError("Admin DB 0 cannot use search-only backends (Tantivy/Lance)".to_string()))
}
};
Ok(storage)
}
// Get or initialize a cached handle to admin DB 0 per base_dir (thread-safe, no double-open race)
pub fn open_admin_storage(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
) -> Result<Arc<dyn StorageBackend>, DBError> {
let map = ADMIN_STORAGES.get_or_init(|| RwLock::new(HashMap::new()));
let key = base_dir.display().to_string();
// Fast path
if let Some(st) = map.read().unwrap().get(&key) {
return Ok(st.clone());
}
// Slow path with write lock
{
let mut w = map.write().unwrap();
if let Some(st) = w.get(&key) {
return Ok(st.clone());
}
// Detect existing 0.db backend by filesystem, if present.
let admin_path = base_dir.join("0.db");
let detected = if admin_path.exists() {
if admin_path.is_file() {
Some(options::BackendType::Redb)
} else if admin_path.is_dir() {
Some(options::BackendType::Sled)
} else {
None
}
} else {
None
};
let effective_backend = match detected {
Some(d) if d != backend => {
eprintln!(
"warning: Admin DB 0 at {} appears to be {:?}, but process default is {:?}. Using detected backend.",
admin_path.display(),
d,
backend
);
d
}
Some(d) => d,
None => backend, // First boot: use requested backend to initialize 0.db
};
let st = init_admin_storage(base_dir, effective_backend, admin_secret)?;
w.insert(key, st.clone());
Ok(st)
}
}
// Ensure admin structures exist in encrypted DB 0
pub fn ensure_bootstrap(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
) -> Result<(), DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
// Initialize next id if missing
if !admin.exists(k_admin_next_id())? {
admin.set(k_admin_next_id().to_string(), "1".to_string())?;
}
// admin:dbs is a hash; it's fine if it doesn't exist (hlen -> 0)
Ok(())
}
// Get or initialize a shared handle to a data DB (> 0), avoiding double-open across subsystems
pub fn open_data_storage(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
) -> Result<Arc<dyn StorageBackend>, DBError> {
if id == 0 {
return open_admin_storage(base_dir, backend, admin_secret);
}
// Validate existence in admin metadata
if !db_exists(base_dir, backend.clone(), admin_secret, id)? {
return Err(DBError(format!(
"Cannot open database instance {}, as that database instance does not exist.",
id
)));
}
let map = DATA_STORAGES.get_or_init(|| RwLock::new(HashMap::new()));
// Fast path
if let Some(st) = map.read().unwrap().get(&id) {
return Ok(st.clone());
}
// Slow path with init lock
let _guard = DATA_INIT_LOCK.lock().unwrap();
if let Some(st) = map.read().unwrap().get(&id) {
return Ok(st.clone());
}
// Resolve effective backend for this db id:
// 1) Try admin meta "backend" field
// 2) If missing, sniff filesystem (file => Redb, dir => Sled), then persist into admin meta
// 3) Fallback to requested 'backend' (startup default) if nothing else is known
let meta_backend = get_database_backend(base_dir, backend.clone(), admin_secret, id).ok().flatten();
let db_path = base_dir.join(format!("{}.db", id));
let sniffed_backend = if db_path.exists() {
if db_path.is_file() {
Some(options::BackendType::Redb)
} else if db_path.is_dir() {
Some(options::BackendType::Sled)
} else {
None
}
} else {
None
};
let effective_backend = meta_backend.clone().or(sniffed_backend).unwrap_or(backend.clone());
// If we had to sniff (i.e., meta missing), persist it for future robustness
if meta_backend.is_none() {
let _ = set_database_backend(base_dir, backend.clone(), admin_secret, id, effective_backend.clone());
}
// Warn if caller-provided backend differs from effective
if effective_backend != backend {
eprintln!(
"notice: Database {} backend resolved to {:?} (caller requested {:?}). Using resolved backend.",
id, effective_backend, backend
);
}
// Determine per-db encryption (from admin meta)
let enc = get_enc_key(base_dir, backend.clone(), admin_secret, id)?;
let should_encrypt = enc.is_some();
// Build database file path and ensure parent dir exists
let db_file = PathBuf::from(base_dir).join(format!("{}.db", id));
if let Some(parent_dir) = db_file.parent() {
std::fs::create_dir_all(parent_dir).map_err(|e| {
DBError(format!("Failed to create directory {}: {}", parent_dir.display(), e))
})?;
}
// Open storage using the effective backend
let storage: Arc<dyn StorageBackend> = match effective_backend {
options::BackendType::Redb => Arc::new(Storage::new(&db_file, should_encrypt, enc.as_deref())?),
options::BackendType::Sled => Arc::new(SledStorage::new(&db_file, should_encrypt, enc.as_deref())?),
options::BackendType::Tantivy => {
return Err(DBError("Tantivy backend has no KV storage; use FT.* commands only".to_string()))
}
options::BackendType::Lance => {
return Err(DBError("Lance backend has no KV storage; use LANCE.* commands only".to_string()))
}
};
// Publish to registry
map.write().unwrap().insert(id, storage.clone());
Ok(storage)
}
// Allocate the next DB id and persist new pointer
pub fn allocate_next_id(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
) -> Result<u64, DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let cur = admin
.get(k_admin_next_id())?
.unwrap_or_else(|| "1".to_string());
let id: u64 = cur.parse().unwrap_or(1);
let next = id.checked_add(1).ok_or_else(|| DBError("next_id overflow".into()))?;
admin.set(k_admin_next_id().to_string(), next.to_string())?;
// Register into admin:dbs set/hash
let _ = admin.hset(k_admin_dbs(), vec![(id.to_string(), "1".to_string())])?;
// Default meta for the new db: public true
let meta_key = k_meta_db(id);
let _ = admin.hset(&meta_key, vec![("public".to_string(), "true".to_string())])?;
Ok(id)
}
// Check existence of a db id in admin:dbs
pub fn db_exists(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
) -> Result<bool, DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
Ok(admin.hexists(k_admin_dbs(), &id.to_string())?)
}
// Get per-db encryption key, if any
pub fn get_enc_key(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
) -> Result<Option<String>, DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
admin.get(&k_meta_db_enc(id))
}
// Set per-db encryption key (called during create)
pub fn set_enc_key(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
key: &str,
) -> Result<(), DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
admin.set(k_meta_db_enc(id), key.to_string())
}
// Set database public flag
pub fn set_database_public(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
public: bool,
) -> Result<(), DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let mk = k_meta_db(id);
let _ = admin.hset(&mk, vec![("public".to_string(), public.to_string())])?;
Ok(())
}
// Persist per-db backend type in admin metadata (module-scope)
pub fn set_database_backend(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
db_backend: options::BackendType,
) -> Result<(), DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let mk = k_meta_db(id);
let val = match db_backend {
options::BackendType::Redb => "Redb",
options::BackendType::Sled => "Sled",
options::BackendType::Tantivy => "Tantivy",
options::BackendType::Lance => "Lance",
};
let _ = admin.hset(&mk, vec![("backend".to_string(), val.to_string())])?;
Ok(())
}
pub fn get_database_backend(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
) -> Result<Option<options::BackendType>, DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let mk = k_meta_db(id);
match admin.hget(&mk, "backend")? {
Some(s) if s == "Redb" => Ok(Some(options::BackendType::Redb)),
Some(s) if s == "Sled" => Ok(Some(options::BackendType::Sled)),
Some(s) if s == "Tantivy" => Ok(Some(options::BackendType::Tantivy)),
Some(s) if s == "Lance" => Ok(Some(options::BackendType::Lance)),
_ => Ok(None),
}
}
// Set database name
pub fn set_database_name(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
name: &str,
) -> Result<(), DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let mk = k_meta_db(id);
let _ = admin.hset(&mk, vec![("name".to_string(), name.to_string())])?;
Ok(())
}
// Get database name
pub fn get_database_name(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
) -> Result<Option<String>, DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let mk = k_meta_db(id);
admin.hget(&mk, "name")
}
// Internal: load public flag; default to true when meta missing
fn load_public(
admin: &Arc<dyn StorageBackend>,
id: u64,
) -> Result<bool, DBError> {
let mk = k_meta_db(id);
match admin.hget(&mk, "public")? {
Some(v) => Ok(v == "true"),
None => Ok(true),
}
}
// Add access key for db (value format: "Read:ts" or "ReadWrite:ts")
pub fn add_access_key(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
key_plain: &str,
perms: Permissions,
) -> Result<(), DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let hash = crate::rpc::hash_key(key_plain);
let v = match perms {
Permissions::Read => format!("Read:{}", now_secs()),
Permissions::ReadWrite => format!("ReadWrite:{}", now_secs()),
};
let _ = admin.hset(&k_meta_db_keys(id), vec![(hash, v)])?;
Ok(())
}
// Delete access key by hash
pub fn delete_access_key(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
key_hash: &str,
) -> Result<bool, DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let n = admin.hdel(&k_meta_db_keys(id), vec![key_hash.to_string()])?;
Ok(n > 0)
}
// List access keys, returning (hash, perms, created_at_secs)
pub fn list_access_keys(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
) -> Result<Vec<(String, Permissions, u64)>, DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let pairs = admin.hgetall(&k_meta_db_keys(id))?;
let mut out = Vec::new();
for (hash, val) in pairs {
let (perm, ts) = parse_perm_value(&val);
out.push((hash, perm, ts));
}
Ok(out)
}
// Verify access permission for db id with optional key
// Returns:
// - Ok(Some(Permissions)) when access is allowed
// - Ok(None) when not allowed or db missing (caller can distinguish by calling db_exists)
pub fn verify_access(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
id: u64,
key_opt: Option<&str>,
) -> Result<Option<Permissions>, DBError> {
// Admin DB 0: require exact admin_secret
if id == 0 {
if let Some(k) = key_opt {
if k == admin_secret {
return Ok(Some(Permissions::ReadWrite));
}
}
return Ok(None);
}
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
if !admin.hexists(k_admin_dbs(), &id.to_string())? {
return Ok(None);
}
let is_public = load_public(&admin, id)?;
// If a key is explicitly provided, enforce its validity strictly.
// Do NOT fall back to public when an invalid key is supplied.
if let Some(k) = key_opt {
let hash = crate::rpc::hash_key(k);
if let Some(v) = admin.hget(&k_meta_db_keys(id), &hash)? {
let (perm, _ts) = parse_perm_value(&v);
return Ok(Some(perm));
}
// Invalid key
return Ok(None);
}
// No key provided: allow access if DB is public, otherwise deny
if is_public {
Ok(Some(Permissions::ReadWrite))
} else {
Ok(None)
}
}
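// Example (hypothetical caller): gate access to db 2 with an optional key.
//   match verify_access(base_dir, backend, admin_secret, 2, Some("s3cret"))? {
//       Some(Permissions::ReadWrite) => { /* allow reads and writes */ }
//       Some(Permissions::Read)      => { /* allow reads only */ }
//       None => { /* deny: invalid key, private db, or missing db */ }
//   }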
// Enumerate all db ids
pub fn list_dbs(
base_dir: &Path,
backend: options::BackendType,
admin_secret: &str,
) -> Result<Vec<u64>, DBError> {
let admin = open_admin_storage(base_dir, backend, admin_secret)?;
let ids = admin.hkeys(k_admin_dbs())?;
let mut out = Vec::new();
for s in ids {
if let Ok(v) = s.parse() {
out.push(v);
}
}
Ok(out)
}
// Helper: parse permission value "Read:ts" or "ReadWrite:ts"
fn parse_perm_value(v: &str) -> (Permissions, u64) {
let mut parts = v.split(':');
let p = parts.next().unwrap_or("Read");
let ts = parts
.next()
.and_then(|s| s.parse().ok())
.unwrap_or(0u64);
let perm = match p {
"ReadWrite" => Permissions::ReadWrite,
_ => Permissions::Read,
};
(perm, ts)
}
fn now_secs() -> u64 {
use std::time::{SystemTime, UNIX_EPOCH};
SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default()
.as_secs()
}

(diff; filename not shown)

@@ -19,6 +19,8 @@ use age::x25519;
use ed25519_dalek::{Signature, Signer, Verifier, SigningKey, VerifyingKey};
use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
use std::collections::HashSet;
use std::convert::TryInto;
use crate::protocol::Protocol;
use crate::server::Server;
@@ -74,6 +76,125 @@ fn parse_ed25519_verifying_key(s: &str) -> Result<VerifyingKey, AgeWireError> {
VerifyingKey::from_bytes(&key_bytes).map_err(|_| AgeWireError::ParseKey)
}
// ---------- Derivation + Raw X25519 (Ed25519 -> X25519) ----------
//
// We deterministically derive an X25519 keypair from an Ed25519 SigningKey.
// We persist the X25519 public/secret as base64-encoded 32-byte raw values
// (no "age1..."/"AGE-SECRET-KEY-1..." formatting). Name-based encrypt/decrypt
// uses these raw values directly via x25519-dalek + ChaCha20Poly1305.
use chacha20poly1305::{aead::{Aead, KeyInit}, ChaCha20Poly1305, Key, Nonce};
use sha2::{Digest, Sha256};
use x25519_dalek::{PublicKey as XPublicKey, StaticSecret as XStaticSecret};
fn derive_x25519_raw_from_ed25519(sk: &SigningKey) -> ([u8; 32], [u8; 32]) {
// X25519 secret scalar (clamped) from Ed25519 secret
let scalar: [u8; 32] = sk.to_scalar_bytes();
// Build X25519 secret/public using dalek
let xsec = XStaticSecret::from(scalar);
let xpub = XPublicKey::from(&xsec);
(xpub.to_bytes(), xsec.to_bytes())
}
fn derive_x25519_raw_b64_from_ed25519(sk: &SigningKey) -> (String, String) {
let (xpub, xsec) = derive_x25519_raw_from_ed25519(sk);
(B64.encode(xpub), B64.encode(xsec))
}
// Helper: detect whether a stored key looks like an age-formatted string
fn looks_like_age_format(s: &str) -> bool {
s.starts_with("age1") || s.starts_with("AGE-SECRET-KEY-1")
}
// Our container format for name-based raw X25519 encryption:
// bytes = "HDBX1" (5) || eph_pub(32) || nonce(12) || ciphertext(..)
// Entire blob is base64-encoded for transport.
const HDBX1_MAGIC: &[u8; 5] = b"HDBX1";
fn encrypt_b64_with_x25519_raw(recip_pub_b64: &str, msg: &str) -> Result<String, AgeWireError> {
use rand::RngCore;
use rand::rngs::OsRng;
// Parse recipient public key (raw 32 bytes, base64)
let recip_pub_bytes = B64.decode(recip_pub_b64).map_err(|_| AgeWireError::ParseKey)?;
if recip_pub_bytes.len() != 32 { return Err(AgeWireError::ParseKey); }
let recip_pub_arr: [u8; 32] = recip_pub_bytes.as_slice().try_into().map_err(|_| AgeWireError::ParseKey)?;
let recip_pub: XPublicKey = XPublicKey::from(recip_pub_arr);
// Generate ephemeral X25519 keypair
let mut eph_sec_bytes = [0u8; 32];
OsRng.fill_bytes(&mut eph_sec_bytes);
let eph_sec = XStaticSecret::from(eph_sec_bytes);
let eph_pub = XPublicKey::from(&eph_sec);
// ECDH
let shared = eph_sec.diffie_hellman(&recip_pub);
// Derive symmetric key via SHA-256 over context + shared + parties
let mut hasher = Sha256::default();
hasher.update(b"herodb-x25519-v1");
hasher.update(shared.as_bytes());
hasher.update(eph_pub.as_bytes());
hasher.update(recip_pub.as_bytes());
let key_bytes = hasher.finalize();
let key = Key::from_slice(&key_bytes[..32]);
// Nonce (12 bytes)
let mut nonce_bytes = [0u8; 12];
OsRng.fill_bytes(&mut nonce_bytes);
let nonce = Nonce::from_slice(&nonce_bytes);
// Encrypt
let cipher = ChaCha20Poly1305::new(key);
let ct = cipher.encrypt(nonce, msg.as_bytes())
.map_err(|e| AgeWireError::Crypto(format!("encrypt: {e}")))?;
// Assemble container
let mut out = Vec::with_capacity(5 + 32 + 12 + ct.len());
out.extend_from_slice(HDBX1_MAGIC);
out.extend_from_slice(eph_pub.as_bytes());
out.extend_from_slice(&nonce_bytes);
out.extend_from_slice(&ct);
Ok(B64.encode(out))
}
fn decrypt_b64_with_x25519_raw(identity_sec_b64: &str, ct_b64: &str) -> Result<String, AgeWireError> {
// Parse X25519 secret (raw 32 bytes, base64)
let sec_bytes = B64.decode(identity_sec_b64).map_err(|_| AgeWireError::ParseKey)?;
if sec_bytes.len() != 32 { return Err(AgeWireError::ParseKey); }
let sec_arr: [u8; 32] = sec_bytes.as_slice().try_into().map_err(|_| AgeWireError::ParseKey)?;
let xsec = XStaticSecret::from(sec_arr);
let xpub = XPublicKey::from(&xsec); // self public
// Decode container
let blob = B64.decode(ct_b64.as_bytes()).map_err(|e| AgeWireError::Crypto(e.to_string()))?;
if blob.len() < 5 + 32 + 12 { return Err(AgeWireError::Crypto("ciphertext too short".to_string())); }
if &blob[..5] != HDBX1_MAGIC { return Err(AgeWireError::Crypto("bad header".to_string())); }
let eph_pub_arr: [u8; 32] = blob[5..5+32].try_into().map_err(|_| AgeWireError::Crypto("bad eph pub".to_string()))?;
let eph_pub = XPublicKey::from(eph_pub_arr);
let nonce_bytes: [u8; 12] = blob[5+32..5+32+12].try_into().unwrap();
let ct = &blob[5+32+12..];
// Recompute shared + key
let shared = xsec.diffie_hellman(&eph_pub);
let mut hasher = Sha256::default();
hasher.update(b"herodb-x25519-v1");
hasher.update(shared.as_bytes());
hasher.update(eph_pub.as_bytes());
hasher.update(xpub.as_bytes());
let key_bytes = hasher.finalize();
let key = Key::from_slice(&key_bytes[..32]);
// Decrypt
let cipher = ChaCha20Poly1305::new(key);
let nonce = Nonce::from_slice(&nonce_bytes);
let pt = cipher.decrypt(nonce, ct)
.map_err(|e| AgeWireError::Crypto(format!("decrypt: {e}")))?;
String::from_utf8(pt).map_err(|_| AgeWireError::Utf8)
}
// ---------- Stateless crypto helpers (string in/out) ----------
pub fn gen_enc_keypair() -> (String, String) {
@@ -210,13 +331,72 @@ pub async fn cmd_age_verify(verify_pub: &str, message: &str, sig_b64: &str) -> Protocol {
}
}
// ---------- NEW: unified stateless generator (Ed25519 + derived X25519 raw) ----------
//
// Returns 4-tuple:
// [ verify_pub_b64 (32B), signpriv_b64 (32B), x25519_pub_b64 (32B), x25519_sec_b64 (32B) ]
// No persistence (stateless).
pub async fn cmd_age_genkey() -> Protocol {
use rand::RngCore;
use rand::rngs::OsRng;
let mut secret_bytes = [0u8; 32];
OsRng.fill_bytes(&mut secret_bytes);
let signing_key = SigningKey::from_bytes(&secret_bytes);
let verifying_key = signing_key.verifying_key();
let verify_b64 = B64.encode(verifying_key.to_bytes());
let sign_b64 = B64.encode(signing_key.to_bytes());
let (xpub_b64, xsec_b64) = derive_x25519_raw_b64_from_ed25519(&signing_key);
Protocol::Array(vec![
Protocol::BulkString(verify_b64),
Protocol::BulkString(sign_b64),
Protocol::BulkString(xpub_b64),
Protocol::BulkString(xsec_b64),
])
}
// ---------- NEW: Persistent, named-key commands ----------
pub async fn cmd_age_keygen(server: &Server, name: &str) -> Protocol {
-    let (recip, ident) = gen_enc_keypair();
-    if let Err(e) = sset(server, &enc_pub_key_key(name), &recip) { return e.to_protocol(); }
-    if let Err(e) = sset(server, &enc_priv_key_key(name), &ident) { return e.to_protocol(); }
-    Protocol::Array(vec![Protocol::BulkString(recip), Protocol::BulkString(ident)])
+    use rand::RngCore;
+    use rand::rngs::OsRng;
+    // Generate Ed25519 keypair
+    let mut secret_bytes = [0u8; 32];
+    OsRng.fill_bytes(&mut secret_bytes);
+    let signing_key = SigningKey::from_bytes(&secret_bytes);
+    let verifying_key = signing_key.verifying_key();
+    // Encode Ed25519 as base64 (32 bytes)
+    let verify_b64 = B64.encode(verifying_key.to_bytes());
+    let sign_b64 = B64.encode(signing_key.to_bytes());
+    // Derive X25519 raw (32-byte) keys and encode as base64
+    let (xpub_b64, xsec_b64) = derive_x25519_raw_b64_from_ed25519(&signing_key);
+    // Decode to create age-formatted strings
+    let xpub_bytes = B64.decode(&xpub_b64).unwrap();
+    let xsec_bytes = B64.decode(&xsec_b64).unwrap();
+    let xpub_arr: [u8; 32] = xpub_bytes.as_slice().try_into().unwrap();
+    let xsec_arr: [u8; 32] = xsec_bytes.as_slice().try_into().unwrap();
+    let recip_str = format!("age1{}", B64.encode(xpub_arr));
+    let ident_str = format!("AGE-SECRET-KEY-1{}", B64.encode(xsec_arr));
+    // Persist Ed25519 and derived X25519 (key-managed mode)
+    if let Err(e) = sset(server, &sign_pub_key_key(name), &verify_b64) { return e.to_protocol(); }
+    if let Err(e) = sset(server, &sign_priv_key_key(name), &sign_b64) { return e.to_protocol(); }
+    if let Err(e) = sset(server, &enc_pub_key_key(name), &xpub_b64) { return e.to_protocol(); }
+    if let Err(e) = sset(server, &enc_priv_key_key(name), &xsec_b64) { return e.to_protocol(); }
+    // Return [recipient, identity] in age format
+    Protocol::Array(vec![
+        Protocol::BulkString(recip_str),
+        Protocol::BulkString(ident_str),
+    ])
}
pub async fn cmd_age_signkeygen(server: &Server, name: &str) -> Protocol {
@@ -227,26 +407,76 @@ pub async fn cmd_age_signkeygen(server: &Server, name: &str) -> Protocol {
}
pub async fn cmd_age_encrypt_name(server: &Server, name: &str, message: &str) -> Protocol {
-    let recip = match sget(server, &enc_pub_key_key(name)) {
+    // Load stored recipient (could be raw b64 32-byte or "age1..." from legacy)
+    let recip_or_b64 = match sget(server, &enc_pub_key_key(name)) {
         Ok(Some(v)) => v,
-        Ok(None) => return AgeWireError::NotFound("recipient (age:key:{name})").to_protocol(),
+        Ok(None) => {
+            // Derive from stored Ed25519 if present, then persist
+            match sget(server, &sign_priv_key_key(name)) {
+                Ok(Some(sign_b64)) => {
+                    let sk = match parse_ed25519_signing_key(&sign_b64) {
+                        Ok(k) => k,
+                        Err(e) => return e.to_protocol(),
+                    };
+                    let (xpub_b64, xsec_b64) = derive_x25519_raw_b64_from_ed25519(&sk);
+                    if let Err(e) = sset(server, &enc_pub_key_key(name), &xpub_b64) { return e.to_protocol(); }
+                    if let Err(e) = sset(server, &enc_priv_key_key(name), &xsec_b64) { return e.to_protocol(); }
+                    xpub_b64
+                }
+                Ok(None) => return AgeWireError::NotFound("recipient (age:key:{name})").to_protocol(),
+                Err(e) => return e.to_protocol(),
+            }
+        }
         Err(e) => return e.to_protocol(),
     };
-    match encrypt_b64(&recip, message) {
-        Ok(ct) => Protocol::BulkString(ct),
-        Err(e) => e.to_protocol(),
+    if looks_like_age_format(&recip_or_b64) {
+        match encrypt_b64(&recip_or_b64, message) {
+            Ok(ct) => Protocol::BulkString(ct),
+            Err(e) => e.to_protocol(),
+        }
+    } else {
+        match encrypt_b64_with_x25519_raw(&recip_or_b64, message) {
+            Ok(ct) => Protocol::BulkString(ct),
+            Err(e) => e.to_protocol(),
+        }
}
}
pub async fn cmd_age_decrypt_name(server: &Server, name: &str, ct_b64: &str) -> Protocol {
-    let ident = match sget(server, &enc_priv_key_key(name)) {
+    // Load stored identity (could be raw b64 32-byte or "AGE-SECRET-KEY-1..." from legacy)
+    let ident_or_b64 = match sget(server, &enc_priv_key_key(name)) {
         Ok(Some(v)) => v,
-        Ok(None) => return AgeWireError::NotFound("identity (age:privkey:{name})").to_protocol(),
+        Ok(None) => {
+            // Derive from stored Ed25519 if present, then persist
+            match sget(server, &sign_priv_key_key(name)) {
+                Ok(Some(sign_b64)) => {
+                    let sk = match parse_ed25519_signing_key(&sign_b64) {
+                        Ok(k) => k,
+                        Err(e) => return e.to_protocol(),
+                    };
+                    let (xpub_b64, xsec_b64) = derive_x25519_raw_b64_from_ed25519(&sk);
+                    if let Err(e) = sset(server, &enc_pub_key_key(name), &xpub_b64) { return e.to_protocol(); }
+                    if let Err(e) = sset(server, &enc_priv_key_key(name), &xsec_b64) { return e.to_protocol(); }
+                    xsec_b64
+                }
+                Ok(None) => return AgeWireError::NotFound("identity (age:privkey:{name})").to_protocol(),
+                Err(e) => return e.to_protocol(),
+            }
+        }
         Err(e) => return e.to_protocol(),
     };
-    match decrypt_b64(&ident, ct_b64) {
-        Ok(pt) => Protocol::BulkString(pt),
-        Err(e) => e.to_protocol(),
+    if looks_like_age_format(&ident_or_b64) {
+        match decrypt_b64(&ident_or_b64, ct_b64) {
+            Ok(pt) => Protocol::BulkString(pt),
+            Err(e) => e.to_protocol(),
+        }
+    } else {
+        match decrypt_b64_with_x25519_raw(&ident_or_b64, ct_b64) {
+            Ok(pt) => Protocol::BulkString(pt),
+            Err(e) => e.to_protocol(),
+        }
}
}
@@ -276,33 +506,31 @@ pub async fn cmd_age_verify_name(server: &Server, name: &str, message: &str, sig_b64: &str) -> Protocol {
}
pub async fn cmd_age_list(server: &Server) -> Protocol {
-    // Returns 4 arrays: ["encpub", <names...>], ["encpriv", ...], ["signpub", ...], ["signpriv", ...]
+    // Return a flat, deduplicated, sorted list of managed key names (no labels)
     let st = match server.current_storage() { Ok(s) => s, Err(e) => return Protocol::err(&e.0) };
     let pull = |pat: &str, prefix: &str| -> Result<Vec<String>, DBError> {
         let keys = st.keys(pat)?;
-        let mut names: Vec<String> = keys.into_iter()
+        let mut names: Vec<String> = keys
+            .into_iter()
             .filter_map(|k| k.strip_prefix(prefix).map(|x| x.to_string()))
             .collect();
         names.sort();
         Ok(names)
     };
-    let encpub = match pull("age:key:*", "age:key:") { Ok(v) => v, Err(e)=> return Protocol::err(&e.0) };
-    let encpriv = match pull("age:privkey:*", "age:privkey:") { Ok(v) => v, Err(e)=> return Protocol::err(&e.0) };
-    let signpub = match pull("age:signpub:*", "age:signpub:") { Ok(v) => v, Err(e)=> return Protocol::err(&e.0) };
-    let signpriv= match pull("age:signpriv:*", "age:signpriv:") { Ok(v) => v, Err(e)=> return Protocol::err(&e.0) };
+    let encpub = match pull("age:key:*", "age:key:") { Ok(v) => v, Err(e)=> return Protocol::err(&e.0) };
+    let encpriv = match pull("age:privkey:*", "age:privkey:") { Ok(v) => v, Err(e)=> return Protocol::err(&e.0) };
+    let signpub = match pull("age:signpub:*", "age:signpub:") { Ok(v) => v, Err(e)=> return Protocol::err(&e.0) };
+    let signpriv = match pull("age:signpriv:*", "age:signpriv:") { Ok(v) => v, Err(e)=> return Protocol::err(&e.0) };
-    let to_arr = |label: &str, v: Vec<String>| {
-        let mut out = vec![Protocol::BulkString(label.to_string())];
-        out.push(Protocol::Array(v.into_iter().map(Protocol::BulkString).collect()));
-        Protocol::Array(out)
-    };
+    let mut set: HashSet<String> = HashSet::new();
+    for n in encpub.into_iter().chain(encpriv).chain(signpub).chain(signpriv) {
+        set.insert(n);
+    }
-    Protocol::Array(vec![
-        to_arr("encpub", encpub),
-        to_arr("encpriv", encpriv),
-        to_arr("signpub", signpub),
-        to_arr("signpriv", signpriv),
-    ])
+    let mut names: Vec<String> = set.into_iter().collect();
+    names.sort();
+    Protocol::Array(names.into_iter().map(Protocol::BulkString).collect())
}

src/cmd.rs (1148 lines changed): diff suppressed because it is too large

(diff; filename not shown)

@@ -1,8 +1,8 @@
 use chacha20poly1305::{
-    aead::{Aead, KeyInit, OsRng},
+    aead::{Aead, KeyInit},
     XChaCha20Poly1305, XNonce,
 };
-use rand::RngCore;
+use rand::{rngs::OsRng, RngCore};
 use sha2::{Digest, Sha256};

 const VERSION: u8 = 1;
@@ -31,7 +31,7 @@ pub struct CryptoFactory {
 impl CryptoFactory {
     /// Accepts any secret bytes; turns them into a 32-byte key (SHA-256).
     pub fn new<S: AsRef<[u8]>>(secret: S) -> Self {
-        let mut h = Sha256::new();
+        let mut h = Sha256::default();
         h.update(b"xchacha20poly1305-factory:v1"); // domain separation
         h.update(secret.as_ref());
         let digest = h.finalize(); // 32 bytes

src/embedding.rs (new file)

@@ -0,0 +1,405 @@
// Embedding abstraction and minimal providers.
use std::collections::HashMap;
use std::sync::Arc;
use serde::{Deserialize, Serialize};
use crate::error::DBError;
// Networking for OpenAI/Azure
use std::time::Duration;
use ureq::{Agent, AgentBuilder};
use serde_json::json;
/// Provider identifiers. Extend as needed to mirror LanceDB-supported providers.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum EmbeddingProvider {
// Deterministic, local-only embedder for CI and offline development (text).
TestHash,
// Deterministic, local-only embedder for CI and offline development (image).
ImageTestHash,
// Placeholders for LanceDB-supported providers; implementers can add concrete backends later.
LanceFastEmbed,
LanceOpenAI,
LanceOther(String),
}
/// Serializable embedding configuration.
/// params: arbitrary key-value map for provider-specific knobs (e.g., "dim", "api_key_env", etc.)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingConfig {
pub provider: EmbeddingProvider,
pub model: String,
#[serde(default)]
pub params: HashMap<String, String>,
}
impl EmbeddingConfig {
pub fn get_param_usize(&self, key: &str) -> Option<usize> {
self.params.get(key).and_then(|v| v.parse::<usize>().ok())
}
pub fn get_param_string(&self, key: &str) -> Option<String> {
self.params.get(key).cloned()
}
}
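// Example (hypothetical values): select the deterministic test embedder with a
// 64-dim output via the "dim" param.
//   let cfg = EmbeddingConfig {
//       provider: EmbeddingProvider::TestHash,
//       model: "demo".to_string(),
//       params: HashMap::from([("dim".to_string(), "64".to_string())]),
//   };
//   assert_eq!(cfg.get_param_usize("dim"), Some(64));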
/// A provider-agnostic text embedding interface.
pub trait Embedder: Send + Sync {
/// Human-readable provider/model name
fn name(&self) -> String;
/// Embedding dimension
fn dim(&self) -> usize;
/// Embed a single text string into a fixed-length vector
fn embed(&self, text: &str) -> Result<Vec<f32>, DBError>;
/// Embed many texts; default maps embed() over inputs
fn embed_many(&self, texts: &[String]) -> Result<Vec<Vec<f32>>, DBError> {
texts.iter().map(|t| self.embed(t)).collect()
}
}
//// ----------------------------- TEXT: deterministic test embedder -----------------------------
/// Deterministic, no-deps, no-network embedder for CI and offline dev.
/// Algorithm:
/// - Fold bytes of UTF-8 into 'dim' buckets with a simple rolling hash
/// - Apply tanh-like scaling and L2-normalize to unit length
pub struct TestHashEmbedder {
dim: usize,
model_name: String,
}
impl TestHashEmbedder {
pub fn new(dim: usize, model_name: impl Into<String>) -> Self {
Self { dim, model_name: model_name.into() }
}
fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> {
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm > 0.0 {
for x in &mut v {
*x /= norm;
}
}
v
}
}
impl Embedder for TestHashEmbedder {
fn name(&self) -> String {
format!("test-hash:{}", self.model_name)
}
fn dim(&self) -> usize {
self.dim
}
fn embed(&self, text: &str) -> Result<Vec<f32>, DBError> {
let mut acc = vec![0f32; self.dim];
// A simple, deterministic folding hash over bytes
let mut h1: u32 = 2166136261u32; // FNV-like seed
let mut h2: u32 = 0x9e3779b9u32; // golden ratio
for (i, b) in text.as_bytes().iter().enumerate() {
h1 ^= *b as u32;
h1 = h1.wrapping_mul(16777619u32);
h2 = h2.wrapping_add(((*b as u32) << (i % 13)) ^ (h1.rotate_left((i % 7) as u32)));
let idx = (h1 ^ h2) as usize % self.dim;
// Map byte to [-1, 1] and accumulate with mild decay by position
let val = ((*b as f32) / 127.5 - 1.0) * (1.0 / (1.0 + (i as f32 / 32.0)));
acc[idx] += val;
}
// Non-linear squashing to stabilize + normalize
for x in &mut acc {
*x = x.tanh();
}
Ok(Self::l2_normalize(acc))
}
}
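// Example (hypothetical): output is deterministic and L2-normalized.
//   let e = TestHashEmbedder::new(8, "demo");
//   let (v1, v2) = (e.embed("hello").unwrap(), e.embed("hello").unwrap());
//   assert_eq!(v1, v2); // same input -> same vector
//   let norm: f32 = v1.iter().map(|x| x * x).sum::<f32>().sqrt();
//   assert!((norm - 1.0).abs() < 1e-5);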
//// ----------------------------- IMAGE: trait + deterministic test embedder -----------------------------
/// Image embedding interface (separate from text to keep modality-specific inputs).
pub trait ImageEmbedder: Send + Sync {
/// Human-readable provider/model name
fn name(&self) -> String;
/// Embedding dimension
fn dim(&self) -> usize;
/// Embed a single image (raw bytes)
fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError>;
/// Embed many images; default maps embed_image() over inputs
fn embed_many_images(&self, images: &[Vec<u8>]) -> Result<Vec<Vec<f32>>, DBError> {
images.iter().map(|b| self.embed_image(b)).collect()
}
}
/// Deterministic image embedder that folds bytes into buckets, applies tanh-like nonlinearity,
/// and L2-normalizes. Suitable for CI and offline development.
/// NOTE: This is NOT semantic; it is a stable hash-like representation.
pub struct TestImageHashEmbedder {
dim: usize,
model_name: String,
}
impl TestImageHashEmbedder {
pub fn new(dim: usize, model_name: impl Into<String>) -> Self {
Self { dim, model_name: model_name.into() }
}
fn l2_normalize(mut v: Vec<f32>) -> Vec<f32> {
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm > 0.0 {
for x in &mut v {
*x /= norm;
}
}
v
}
}
impl ImageEmbedder for TestImageHashEmbedder {
fn name(&self) -> String {
format!("test-image-hash:{}", self.model_name)
}
fn dim(&self) -> usize {
self.dim
}
fn embed_image(&self, bytes: &[u8]) -> Result<Vec<f32>, DBError> {
// Deterministic fold across bytes with two rolling accumulators.
let mut acc = vec![0f32; self.dim];
let mut h1: u32 = 0x811C9DC5; // FNV-like
let mut h2: u32 = 0x9E3779B9; // golden ratio
for (i, b) in bytes.iter().enumerate() {
h1 ^= *b as u32;
h1 = h1.wrapping_mul(16777619u32);
// combine with position and h2
h2 = h2.wrapping_add(((i as u32).rotate_left((i % 13) as u32)) ^ h1.rotate_left((i % 7) as u32));
let idx = (h1 ^ h2) as usize % self.dim;
// Map to [-1,1] and decay with position
let val = ((*b as f32) / 127.5 - 1.0) * (1.0 / (1.0 + (i as f32 / 128.0)));
acc[idx] += val;
}
for x in &mut acc {
*x = x.tanh();
}
Ok(Self::l2_normalize(acc))
}
}
//// OpenAI embedder (supports OpenAI and Azure OpenAI via REST)
struct OpenAIEmbedder {
model: String,
dim: usize,
agent: Agent,
endpoint: String,
headers: Vec<(String, String)>,
use_azure: bool,
}
impl OpenAIEmbedder {
fn new_from_config(cfg: &EmbeddingConfig) -> Result<Self, DBError> {
// Whether to use Azure OpenAI
let use_azure = cfg
.get_param_string("use_azure")
.map(|s| s.eq_ignore_ascii_case("true"))
.unwrap_or(false);
// Resolve API key (OPENAI_API_KEY or AZURE_OPENAI_API_KEY by default)
let api_key_env = cfg
.get_param_string("api_key_env")
.unwrap_or_else(|| {
if use_azure {
"AZURE_OPENAI_API_KEY".to_string()
} else {
"OPENAI_API_KEY".to_string()
}
});
let api_key = std::env::var(&api_key_env)
.map_err(|_| DBError(format!("Missing API key in env '{}'", api_key_env)))?;
// Resolve endpoint
// - Standard OpenAI: https://api.openai.com/v1/embeddings (default) or params["base_url"]
// - Azure OpenAI: {azure_endpoint}/openai/deployments/{deployment}/embeddings?api-version=...
let endpoint = if use_azure {
let base = cfg
.get_param_string("azure_endpoint")
.ok_or_else(|| DBError("Missing 'azure_endpoint' for Azure OpenAI".into()))?;
let deployment = cfg
.get_param_string("azure_deployment")
.unwrap_or_else(|| cfg.model.clone());
let api_version = cfg
.get_param_string("azure_api_version")
.unwrap_or_else(|| "2023-05-15".to_string());
format!(
"{}/openai/deployments/{}/embeddings?api-version={}",
base.trim_end_matches('/'),
deployment,
api_version
)
} else {
cfg.get_param_string("base_url")
.unwrap_or_else(|| "https://api.openai.com/v1/embeddings".to_string())
};
// Determine expected dimension (default 1536 for text-embedding-3-small; callers should override if needed)
let dim = cfg
.get_param_usize("dim")
.or_else(|| cfg.get_param_usize("dimensions"))
.unwrap_or(1536);
// Build an HTTP agent with timeouts (blocking; no tokio runtime involved)
let agent = AgentBuilder::new()
.timeout_read(Duration::from_secs(30))
.timeout_write(Duration::from_secs(30))
.build();
// Headers
let mut headers: Vec<(String, String)> = Vec::new();
headers.push(("Content-Type".to_string(), "application/json".to_string()));
if use_azure {
headers.push(("api-key".to_string(), api_key));
} else {
headers.push(("Authorization".to_string(), format!("Bearer {}", api_key)));
}
Ok(Self {
model: cfg.model.clone(),
dim,
agent,
endpoint,
headers,
use_azure,
})
}
fn request_many(&self, inputs: &[String]) -> Result<Vec<Vec<f32>>, DBError> {
// Compose request body:
// - Standard OpenAI: { "model": ..., "input": [...], "dimensions": dim? }
// - Azure: { "input": [...], "dimensions": dim? } (model from deployment)
let mut body = if self.use_azure {
json!({ "input": inputs })
} else {
json!({ "model": self.model, "input": inputs })
};
if self.dim > 0 {
body.as_object_mut()
.unwrap()
.insert("dimensions".to_string(), json!(self.dim));
}
// Build request
let mut req = self.agent.post(&self.endpoint);
for (k, v) in &self.headers {
req = req.set(k, v);
}
// Send and handle errors
let resp = req.send_json(body);
let text = match resp {
Ok(r) => r
.into_string()
.map_err(|e| DBError(format!("Failed to read embeddings response: {}", e)))?,
Err(ureq::Error::Status(code, r)) => {
let body = r.into_string().unwrap_or_default();
return Err(DBError(format!("Embeddings API error {}: {}", code, body)));
}
Err(e) => return Err(DBError(format!("HTTP request failed: {}", e))),
};
let val: serde_json::Value = serde_json::from_str(&text)
.map_err(|e| DBError(format!("Invalid JSON from embeddings API: {}", e)))?;
let data = val
.get("data")
.and_then(|d| d.as_array())
.ok_or_else(|| DBError("Embeddings API response missing 'data' array".into()))?;
let mut out: Vec<Vec<f32>> = Vec::with_capacity(data.len());
for item in data {
let emb = item
.get("embedding")
.and_then(|e| e.as_array())
.ok_or_else(|| DBError("Embeddings API item missing 'embedding'".into()))?;
let mut v: Vec<f32> = Vec::with_capacity(emb.len());
for n in emb {
let f = n
.as_f64()
.ok_or_else(|| DBError("Embedding element is not a number".into()))?;
v.push(f as f32);
}
if self.dim > 0 && v.len() != self.dim {
return Err(DBError(format!(
"Embedding dimension mismatch: expected {}, got {}. Configure 'dim' or 'dimensions' to match output.",
self.dim, v.len()
)));
}
out.push(v);
}
Ok(out)
}
}
impl Embedder for OpenAIEmbedder {
fn name(&self) -> String {
if self.use_azure {
format!("azure-openai:{}", self.model)
} else {
format!("openai:{}", self.model)
}
}
fn dim(&self) -> usize {
self.dim
}
fn embed(&self, text: &str) -> Result<Vec<f32>, DBError> {
let v = self.request_many(&[text.to_string()])?;
Ok(v.into_iter().next().unwrap_or_else(|| vec![0.0; self.dim]))
}
fn embed_many(&self, texts: &[String]) -> Result<Vec<Vec<f32>>, DBError> {
if texts.is_empty() {
return Ok(vec![]);
}
self.request_many(texts)
}
}
/// Create an embedder instance from a config.
/// - TestHash: uses params["dim"] or defaults to 64
/// - LanceOpenAI: uses OpenAI (or Azure OpenAI) embeddings REST API
/// - Other Lance providers can be added similarly
pub fn create_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn Embedder>, DBError> {
match &config.provider {
EmbeddingProvider::TestHash => {
let dim = config.get_param_usize("dim").unwrap_or(64);
Ok(Arc::new(TestHashEmbedder::new(dim, config.model.clone())))
}
EmbeddingProvider::LanceOpenAI => {
let inner = OpenAIEmbedder::new_from_config(config)?;
Ok(Arc::new(inner))
}
EmbeddingProvider::ImageTestHash => {
Err(DBError("Use create_image_embedder() for image providers".into()))
}
EmbeddingProvider::LanceFastEmbed => Err(DBError("LanceFastEmbed provider not yet implemented in Rust embedding layer; configure 'test-hash' or use 'openai'".into())),
EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Lance provider '{}' not implemented; configure 'openai' or 'test-hash'", p))),
}
}
/// Create an image embedder instance from a config.
pub fn create_image_embedder(config: &EmbeddingConfig) -> Result<Arc<dyn ImageEmbedder>, DBError> {
match &config.provider {
EmbeddingProvider::ImageTestHash => {
let dim = config.get_param_usize("dim").unwrap_or(512);
Ok(Arc::new(TestImageHashEmbedder::new(dim, config.model.clone())))
}
EmbeddingProvider::TestHash | EmbeddingProvider::LanceOpenAI => {
Err(DBError("Configured text provider; dataset expects image provider (e.g., 'testimagehash')".into()))
}
EmbeddingProvider::LanceFastEmbed => Err(DBError("Image provider 'lancefastembed' not yet implemented".into())),
EmbeddingProvider::LanceOther(p) => Err(DBError(format!("Image provider '{}' not implemented; use 'testimagehash' for now", p))),
}
}
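// Example (hypothetical): factory dispatch for text vs. image providers.
//   let text_cfg = EmbeddingConfig { provider: EmbeddingProvider::TestHash, model: "t".into(), params: HashMap::new() };
//   let text_emb = create_embedder(&text_cfg)?;       // Arc<dyn Embedder>, dim() == 64 (default)
//   let img_cfg = EmbeddingConfig { provider: EmbeddingProvider::ImageTestHash, model: "i".into(), params: HashMap::new() };
//   let img_emb = create_image_embedder(&img_cfg)?;   // Arc<dyn ImageEmbedder>, dim() == 512 (default)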

src/lance_store.rs (new file)

@@ -0,0 +1,663 @@
// LanceDB store abstraction (per database instance)
// This module encapsulates all Lance/LanceDB operations for a given DB id.
// Notes:
// - We persist each dataset (aka "table") under <base_dir>/lance/<db_id>/<name>.lance
// - Schema convention: id: Utf8 (non-null), vector: FixedSizeList<Float32, dim> (non-null), meta: Utf8 (nullable JSON string)
// - We implement naive KNN (L2) scan in Rust for search to avoid tight coupling to lancedb search builder API.
// Index creation uses lance::Dataset vector index; future optimization can route to index-aware search.
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use crate::error::DBError;
use arrow_array::{Array, RecordBatch, RecordBatchIterator, StringArray};
use arrow_array::builder::{FixedSizeListBuilder, Float32Builder, StringBuilder};
use arrow_array::cast::AsArray;
use arrow_schema::{DataType, Field, Schema};
use futures::StreamExt;
use serde_json::Value as JsonValue;
// Low-level Lance core
use lance::dataset::{WriteMode, WriteParams};
use lance::Dataset;
// Vector index (IVF_PQ etc.)
// High-level LanceDB (for deletes where available)
use lancedb::connection::Connection;
use arrow_array::types::Float32Type;
#[derive(Clone)]
pub struct LanceStore {
base_dir: PathBuf,
db_id: u64,
}
impl LanceStore {
// Create a new LanceStore rooted at <base_dir>/lance/<db_id>
pub fn new(base_dir: &Path, db_id: u64) -> Result<Self, DBError> {
let p = base_dir.join("lance").join(db_id.to_string());
std::fs::create_dir_all(&p)
.map_err(|e| DBError(format!("Failed to create Lance dir {}: {}", p.display(), e)))?;
Ok(Self { base_dir: p, db_id })
}
fn dataset_path(&self, name: &str) -> PathBuf {
// Store datasets as directories or files with .lance suffix
// We accept both "<name>" and "<name>.lance" as logical name; normalize on ".lance"
let has_ext = name.ends_with(".lance");
if has_ext {
self.base_dir.join(name)
} else {
self.base_dir.join(format!("{name}.lance"))
}
}
fn file_uri(path: &Path) -> String {
// lancedb can use filesystem path directly; keep it simple
// Avoid file:// scheme since local paths are supported.
path.to_string_lossy().to_string()
}
async fn connect_db(&self) -> Result<Connection, DBError> {
let uri = Self::file_uri(&self.base_dir);
lancedb::connect(&uri)
.execute()
.await
.map_err(|e| DBError(format!("LanceDB connect failed at {}: {}", uri, e)))
}
fn vector_field(dim: i32) -> Field {
Field::new(
"vector",
DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), dim),
false,
)
}
async fn read_existing_dim(&self, name: &str) -> Result<Option<i32>, DBError> {
let path = self.dataset_path(name);
if !path.exists() {
return Ok(None);
}
let ds = Dataset::open(path.to_string_lossy().as_ref())
.await
.map_err(|e| DBError(format!("Open dataset failed: {}: {}", path.display(), e)))?;
// Scan a single batch to infer vector dimension from the 'vector' column type
let mut scan = ds.scan();
if let Err(e) = scan.project(&["vector"]) {
return Err(DBError(format!("Project failed while inferring dim: {}", e)));
}
let mut stream = scan
.try_into_stream()
.await
.map_err(|e| DBError(format!("Scan stream failed while inferring dim: {}", e)))?;
if let Some(batch_res) = stream.next().await {
let batch = batch_res.map_err(|e| DBError(format!("Batch error: {}", e)))?;
let vec_col = batch
.column_by_name("vector")
.ok_or_else(|| DBError("Column 'vector' missing".into()))?;
let fsl = vec_col.as_fixed_size_list();
let dim = fsl.value_length();
return Ok(Some(dim));
}
Ok(None)
}
fn build_schema(dim: i32) -> Arc<Schema> {
Arc::new(Schema::new(vec![
Field::new("id", DataType::Utf8, false),
Self::vector_field(dim),
Field::new("text", DataType::Utf8, true),
Field::new("media_type", DataType::Utf8, true),
Field::new("media_uri", DataType::Utf8, true),
Field::new("meta", DataType::Utf8, true),
]))
}
fn build_one_row_batch(
id: &str,
vector: &[f32],
meta: &HashMap<String, String>,
text: Option<&str>,
media_type: Option<&str>,
media_uri: Option<&str>,
dim: i32,
) -> Result<(Arc<Schema>, RecordBatch), DBError> {
if vector.len() as i32 != dim {
return Err(DBError(format!(
"Vector length mismatch: expected {}, got {}",
dim,
vector.len()
)));
}
let schema = Self::build_schema(dim);
// id column
let mut id_builder = StringBuilder::new();
id_builder.append_value(id);
let id_arr = Arc::new(id_builder.finish()) as Arc<dyn Array>;
// vector column (FixedSizeList<Float32, dim>)
let v_builder = Float32Builder::with_capacity(vector.len());
let mut list_builder = FixedSizeListBuilder::new(v_builder, dim);
for v in vector {
list_builder.values().append_value(*v);
}
list_builder.append(true);
let vec_arr = Arc::new(list_builder.finish()) as Arc<dyn Array>;
// text column (optional)
let mut text_builder = StringBuilder::new();
if let Some(t) = text {
text_builder.append_value(t);
} else {
text_builder.append_null();
}
let text_arr = Arc::new(text_builder.finish()) as Arc<dyn Array>;
// media_type column (optional)
let mut mt_builder = StringBuilder::new();
if let Some(mt) = media_type {
mt_builder.append_value(mt);
} else {
mt_builder.append_null();
}
let mt_arr = Arc::new(mt_builder.finish()) as Arc<dyn Array>;
// media_uri column (optional)
let mut mu_builder = StringBuilder::new();
if let Some(mu) = media_uri {
mu_builder.append_value(mu);
} else {
mu_builder.append_null();
}
let mu_arr = Arc::new(mu_builder.finish()) as Arc<dyn Array>;
// meta column (JSON string)
let meta_json = if meta.is_empty() {
None
} else {
Some(serde_json::to_string(meta).map_err(|e| DBError(format!("Serialize meta error: {e}")))?)
};
let mut meta_builder = StringBuilder::new();
if let Some(s) = meta_json {
meta_builder.append_value(&s);
} else {
meta_builder.append_null();
}
let meta_arr = Arc::new(meta_builder.finish()) as Arc<dyn Array>;
let batch =
RecordBatch::try_new(schema.clone(), vec![id_arr, vec_arr, text_arr, mt_arr, mu_arr, meta_arr]).map_err(|e| {
DBError(format!("RecordBatch build failed: {e}"))
})?;
Ok((schema, batch))
}
// Create a new dataset (vector collection) with dimension `dim`.
pub async fn create_dataset(&self, name: &str, dim: usize) -> Result<(), DBError> {
let dim_i32: i32 = dim
.try_into()
.map_err(|_| DBError("Dimension too large".into()))?;
let path = self.dataset_path(name);
if path.exists() {
// Validate dimension if present
if let Some(existing_dim) = self.read_existing_dim(name).await? {
if existing_dim != dim_i32 {
return Err(DBError(format!(
"Dataset '{}' already exists with dim {}, requested {}",
name, existing_dim, dim_i32
)));
}
// No-op
return Ok(());
}
}
// Create an empty dataset by writing an empty batch
let schema = Self::build_schema(dim_i32);
let empty_id = Arc::new(StringArray::new_null(0));
// Build an empty FixedSizeListArray
let v_builder = Float32Builder::new();
let mut list_builder = FixedSizeListBuilder::new(v_builder, dim_i32);
let empty_vec = Arc::new(list_builder.finish()) as Arc<dyn Array>;
let empty_text = Arc::new(StringArray::new_null(0));
let empty_media_type = Arc::new(StringArray::new_null(0));
let empty_media_uri = Arc::new(StringArray::new_null(0));
let empty_meta = Arc::new(StringArray::new_null(0));
let empty_batch =
RecordBatch::try_new(schema.clone(), vec![empty_id, empty_vec, empty_text, empty_media_type, empty_media_uri, empty_meta])
.map_err(|e| DBError(format!("Build empty batch failed: {e}")))?;
let write_params = WriteParams {
mode: WriteMode::Create,
..Default::default()
};
let reader = RecordBatchIterator::new([Ok(empty_batch)], schema.clone());
Dataset::write(reader, path.to_string_lossy().as_ref(), Some(write_params))
.await
.map_err(|e| DBError(format!("Create dataset failed at {}: {}", path.display(), e)))?;
Ok(())
}
// Store/Upsert a single vector with ID and optional metadata (append; duplicate IDs are possible for now)
pub async fn store_vector(
&self,
name: &str,
id: &str,
vector: Vec<f32>,
meta: HashMap<String, String>,
text: Option<String>,
) -> Result<(), DBError> {
// Delegate to media-aware path with no media fields
self.store_vector_with_media(name, id, vector, meta, text, None, None).await
}
/// Store/Upsert a single vector with optional text and media fields (media_type/media_uri).
pub async fn store_vector_with_media(
&self,
name: &str,
id: &str,
vector: Vec<f32>,
meta: HashMap<String, String>,
text: Option<String>,
media_type: Option<String>,
media_uri: Option<String>,
) -> Result<(), DBError> {
let path = self.dataset_path(name);
// Determine dimension: use existing or infer from vector
let dim_i32 = if let Some(d) = self.read_existing_dim(name).await? {
d
} else {
vector
.len()
.try_into()
.map_err(|_| DBError("Vector length too large".into()))?
};
let (schema, batch) = Self::build_one_row_batch(
id,
&vector,
&meta,
text.as_deref(),
media_type.as_deref(),
media_uri.as_deref(),
dim_i32,
)?;
// If LanceDB table exists and provides delete, we can upsert by deleting same id
// Try best-effort delete; ignore errors to keep operation append-only on failure
if path.exists() {
if let Ok(conn) = self.connect_db().await {
if let Ok(mut tbl) = conn.open_table(name).execute().await {
let _ = tbl
.delete(&format!("id = '{}'", id.replace('\'', "''")))
.await;
}
}
}
let write_params = WriteParams {
mode: if path.exists() {
WriteMode::Append
} else {
WriteMode::Create
},
..Default::default()
};
let reader = RecordBatchIterator::new([Ok(batch)], schema.clone());
Dataset::write(reader, path.to_string_lossy().as_ref(), Some(write_params))
.await
.map_err(|e| DBError(format!("Write (append/create) failed: {}", e)))?;
Ok(())
}
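// Example (hypothetical): create a dataset, then upsert one row with media fields.
//   store.create_dataset("imgs", 512).await?;
//   store.store_vector_with_media("imgs", "img1", vec![0.0; 512], HashMap::new(),
//       None, Some("image/png".into()), Some("file:///tmp/a.png".into())).await?;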
// Delete a record by ID (best-effort; returns true if delete likely removed rows)
pub async fn delete_by_id(&self, name: &str, id: &str) -> Result<bool, DBError> {
let path = self.dataset_path(name);
if !path.exists() {
return Ok(false);
}
let conn = self.connect_db().await?;
let mut tbl = conn
.open_table(name)
.execute()
.await
.map_err(|e| DBError(format!("Open table '{}' failed: {}", name, e)))?;
// SQL-like predicate quoting
let pred = format!("id = '{}'", id.replace('\'', "''"));
// lancedb returns count or () depending on version; treat Ok as success
match tbl.delete(&pred).await {
Ok(_) => Ok(true),
Err(e) => Err(DBError(format!("Delete failed: {}", e))),
}
}
// Drop the entire dataset
pub async fn drop_dataset(&self, name: &str) -> Result<bool, DBError> {
let path = self.dataset_path(name);
// Try LanceDB drop first
// Best-effort logical drop via lancedb if available; ignore failures.
// Note: we rely on filesystem removal below for final cleanup.
if let Ok(conn) = self.connect_db().await {
if let Ok(mut t) = conn.open_table(name).execute().await {
// Best-effort delete-all to reduce footprint prior to fs removal
let _ = t.delete("true").await;
}
}
if path.exists() {
if path.is_dir() {
std::fs::remove_dir_all(&path)
.map_err(|e| DBError(format!("Failed to drop dataset '{}': {}", name, e)))?;
} else {
std::fs::remove_file(&path)
.map_err(|e| DBError(format!("Failed to drop dataset '{}': {}", name, e)))?;
}
return Ok(true);
}
Ok(false)
}
// Search top-k nearest with optional filter; returns tuple of (id, score (lower=L2), meta)
pub async fn search_vectors(
&self,
name: &str,
query: Vec<f32>,
k: usize,
filter: Option<String>,
return_fields: Option<Vec<String>>,
) -> Result<Vec<(String, f32, HashMap<String, String>)>, DBError> {
let path = self.dataset_path(name);
if !path.exists() {
return Err(DBError(format!("Dataset '{}' not found", name)));
}
// Determine dim and validate query length
let dim_i32 = self
.read_existing_dim(name)
.await?
.ok_or_else(|| DBError("Vector column not found".into()))?;
if query.len() as i32 != dim_i32 {
return Err(DBError(format!(
"Query vector length mismatch: expected {}, got {}",
dim_i32,
query.len()
)));
}
let ds = Dataset::open(path.to_string_lossy().as_ref())
.await
.map_err(|e| DBError(format!("Open dataset failed: {}", e)))?;
// Build scanner with projection; we project needed fields and filter client-side to support meta keys
let mut scan = ds.scan();
if let Err(e) = scan.project(&["id", "vector", "meta", "text", "media_type", "media_uri"]) {
return Err(DBError(format!("Project failed: {}", e)));
}
// Note: we no longer push down filter to Lance to allow filtering on meta fields client-side.
let mut stream = scan
.try_into_stream()
.await
.map_err(|e| DBError(format!("Scan stream failed: {}", e)))?;
// Parse simple equality clause from filter for client-side filtering (supports one `key = 'value'`)
let clause = filter.as_ref().and_then(|s| {
fn parse_eq(s: &str) -> Option<(String, String)> {
let s = s.trim();
let pos = s.find('=')?; // a plain '=' search also covers " = "
let (k, vraw) = s.split_at(pos);
let mut v = vraw.trim_start_matches('=').trim();
if (v.starts_with('\'') && v.ends_with('\'')) || (v.starts_with('"') && v.ends_with('"')) {
if v.len() >= 2 {
v = &v[1..v.len()-1];
}
}
let key = k.trim().trim_matches('"').trim_matches('\'').to_string();
if key.is_empty() { return None; }
Some((key, v.to_string()))
}
parse_eq(s)
});
// Maintain a max-heap keyed by distance: the farthest hit stays on top so it can be evicted, keeping the k smallest
#[derive(Debug)]
struct Hit {
dist: f32,
id: String,
meta: HashMap<String, String>,
}
impl PartialEq for Hit {
fn eq(&self, other: &Self) -> bool {
self.dist.eq(&other.dist)
}
}
impl Eq for Hit {}
impl PartialOrd for Hit {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        // Natural order by distance: BinaryHeap is a max-heap, so the hit
        // with the largest distance sits on top and is evicted first.
        self.dist.partial_cmp(&other.dist)
    }
}
impl Ord for Hit {
fn cmp(&self, other: &Self) -> Ordering {
self.partial_cmp(other).unwrap_or(Ordering::Equal)
}
}
let mut heap: BinaryHeap<Hit> = BinaryHeap::with_capacity(k);
while let Some(batch_res) = stream.next().await {
let batch = batch_res.map_err(|e| DBError(format!("Stream batch error: {}", e)))?;
let id_arr = batch
.column_by_name("id")
.ok_or_else(|| DBError("Column 'id' missing".into()))?
.as_string::<i32>();
let vec_arr = batch
.column_by_name("vector")
.ok_or_else(|| DBError("Column 'vector' missing".into()))?
.as_fixed_size_list();
let meta_arr = batch
.column_by_name("meta")
.map(|a| a.as_string::<i32>().clone());
let text_arr = batch
.column_by_name("text")
.map(|a| a.as_string::<i32>().clone());
let mt_arr = batch
.column_by_name("media_type")
.map(|a| a.as_string::<i32>().clone());
let mu_arr = batch
.column_by_name("media_uri")
.map(|a| a.as_string::<i32>().clone());
for i in 0..batch.num_rows() {
// Extract id
let id_val = id_arr.value(i).to_string();
// Parse meta JSON if present
let mut meta: HashMap<String, String> = HashMap::new();
if let Some(meta_col) = &meta_arr {
if !meta_col.is_null(i) {
let s = meta_col.value(i);
if let Ok(JsonValue::Object(map)) = serde_json::from_str::<JsonValue>(s) {
for (k, v) in map {
if let Some(vs) = v.as_str() {
meta.insert(k, vs.to_string());
} else if v.is_number() || v.is_boolean() {
meta.insert(k, v.to_string());
}
}
}
}
}
// Evaluate simple equality filter if provided (supports one clause)
let passes = if let Some((ref key, ref val)) = clause {
let candidate = match key.as_str() {
"id" => Some(id_val.clone()),
"text" => text_arr.as_ref().and_then(|col| if col.is_null(i) { None } else { Some(col.value(i).to_string()) }),
"media_type" => mt_arr.as_ref().and_then(|col| if col.is_null(i) { None } else { Some(col.value(i).to_string()) }),
"media_uri" => mu_arr.as_ref().and_then(|col| if col.is_null(i) { None } else { Some(col.value(i).to_string()) }),
_ => meta.get(key).cloned(),
};
match candidate {
Some(cv) => cv == *val,
None => false,
}
} else { true };
if !passes {
continue;
}
// Compute squared L2 distance (no sqrt; ranking is order-preserving)
let val = vec_arr.value(i);
let prim = val.as_primitive::<Float32Type>();
let mut dist: f32 = 0.0;
let plen = prim.len();
for j in 0..plen {
let r = prim.value(j);
let d = query[j] - r;
dist += d * d;
}
// Apply return_fields on meta
let mut meta_out = meta;
if let Some(fields) = &return_fields {
let mut filtered = HashMap::new();
for f in fields {
if let Some(val) = meta_out.get(f) {
filtered.insert(f.clone(), val.clone());
}
}
meta_out = filtered;
}
let hit = Hit { dist, id: id_val, meta: meta_out };
if heap.len() < k {
heap.push(hit);
} else if let Some(top) = heap.peek() {
if hit.dist < top.dist {
heap.pop();
heap.push(hit);
}
}
}
}
// Extract and sort ascending by distance
let mut hits: Vec<Hit> = heap.into_sorted_vec(); // already ascending by dist due to Ord
let out = hits
.drain(..)
.map(|h| (h.id, h.dist, h.meta))
.collect::<Vec<_>>();
Ok(out)
}
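// Illustrative call (hypothetical caller; assumes a 4-dim dataset "docs"
// and a meta key "category" stored on its rows):
//
//   let hits = store
//       .search_vectors("docs", vec![0.1, 0.2, 0.3, 0.4], 5,
//           Some("category = 'books'".to_string()), None)
//       .await?;
//   for (id, dist, meta) in hits {
//       println!("{} dist={} meta={:?}", id, dist, meta);
//   }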
// Create an ANN index on the vector column (IVF_PQ or similar)
pub async fn create_index(
&self,
name: &str,
index_type: &str,
params: HashMap<String, String>,
) -> Result<(), DBError> {
let path = self.dataset_path(name);
if !path.exists() {
return Err(DBError(format!("Dataset '{}' not found", name)));
}
// Attempt to create a vector index using lance low-level API if available.
// Some crate versions hide IndexType; to ensure build stability, we fall back to a no-op if the API is not accessible.
let _ = (index_type, params); // currently unused; reserved for future tuning
// TODO: Implement using lance::Dataset::create_index when public API is stable across versions.
// For now, succeed as a no-op to keep flows working; search will operate as brute-force scan.
Ok(())
}
// List datasets (tables) under this DB, returning user-level logical names without the .lance suffix
pub async fn list_datasets(&self) -> Result<Vec<String>, DBError> {
let mut out = Vec::new();
if self.base_dir.exists() {
if let Ok(rd) = std::fs::read_dir(&self.base_dir) {
for entry in rd.flatten() {
let p = entry.path();
if let Some(name) = p.file_name().and_then(|s| s.to_str()) {
// Only list .lance datasets; strip exactly one ".lance" suffix
if let Some(stem) = name.strip_suffix(".lance") {
out.push(stem.to_string());
}
}
}
}
}
Ok(out)
}
// Return basic dataset info map
pub async fn get_dataset_info(&self, name: &str) -> Result<HashMap<String, String>, DBError> {
let path = self.dataset_path(name);
let mut m = HashMap::new();
m.insert("name".to_string(), name.to_string());
m.insert("path".to_string(), path.display().to_string());
if !path.exists() {
return Err(DBError(format!("Dataset '{}' not found", name)));
}
let ds = Dataset::open(path.to_string_lossy().as_ref())
.await
.map_err(|e| DBError(format!("Open dataset failed: {}", e)))?;
// dim: infer by scanning first batch
let mut dim_str = "unknown".to_string();
{
let mut scan = ds.scan();
if scan.project(&["vector"]).is_ok() {
if let Ok(mut stream) = scan.try_into_stream().await {
if let Some(batch_res) = stream.next().await {
if let Ok(batch) = batch_res {
if let Some(col) = batch.column_by_name("vector") {
let fsl = col.as_fixed_size_list();
dim_str = fsl.value_length().to_string();
}
}
}
}
}
}
m.insert("dimension".to_string(), dim_str);
// row_count: computed by a full scan over the id column
let mut scan = ds.scan();
if let Err(e) = scan.project(&["id"]) {
return Err(DBError(format!("Project failed: {e}")));
}
let mut stream = scan
.try_into_stream()
.await
.map_err(|e| DBError(format!("Scan failed: {e}")))?;
let mut rows: usize = 0;
while let Some(batch_res) = stream.next().await {
let batch = batch_res.map_err(|e| DBError(format!("Scan batch error: {}", e)))?;
rows += batch.num_rows();
}
m.insert("row_count".to_string(), rows.to_string());
// indexes: we can't easily enumerate them yet; set to "unknown" (future: read index metadata)
m.insert("indexes".to_string(), "unknown".to_string());
Ok(m)
}
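// Illustrative result for a populated dataset (values are examples only):
//   {"name": "docs", "path": "<base>/lance/1/docs.lance",
//    "dimension": "768", "row_count": "1024", "indexes": "unknown"}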
}

View File

@@ -1,10 +1,18 @@
pub mod age; // NEW
pub mod age;
pub mod sym;
pub mod cmd;
pub mod crypto;
pub mod error;
pub mod options;
pub mod protocol;
pub mod rpc;
pub mod rpc_server;
pub mod server;
pub mod storage;
pub mod storage_trait; // Add this
pub mod storage_sled; // Add this
pub mod storage_trait;
pub mod storage_sled;
pub mod admin_meta;
pub mod tantivy_search;
pub mod search_cmd;
pub mod lance_store;
pub mod embedding;

View File

@@ -1,8 +1,10 @@
// #![allow(unused_imports)]
use std::path::PathBuf;
use tokio::net::TcpListener;
use herodb::server;
use herodb::rpc_server;
use clap::Parser;
@@ -12,7 +14,7 @@ use clap::Parser;
struct Args {
/// The directory of Redis DB file
#[arg(long)]
dir: String,
dir: PathBuf,
/// The port of the Redis server, default is 6379 if not specified
#[arg(long)]
@@ -22,18 +24,29 @@ struct Args {
#[arg(long)]
debug: bool,
/// Master encryption key for encrypted databases
/// Master encryption key for encrypted databases (deprecated; ignored for data DBs)
#[arg(long)]
encryption_key: Option<String>,
/// Encrypt the database
/// Encrypt the database (deprecated; ignored for data DBs)
#[arg(long)]
encrypt: bool,
/// Enable RPC management server
#[arg(long)]
enable_rpc: bool,
/// RPC server port (default: 8080)
#[arg(long, default_value = "8080")]
rpc_port: u16,
/// Use the sled backend
#[arg(long)]
sled: bool,
/// Admin secret used to encrypt DB 0 and authorize admin access (required)
#[arg(long)]
admin_secret: String,
}
#[tokio::main]
@@ -48,9 +61,19 @@ async fn main() {
.await
.unwrap();
// deprecation warnings for legacy flags
if args.encrypt || args.encryption_key.is_some() {
eprintln!("warning: --encrypt and --encryption-key are deprecated and ignored for data DBs. Admin DB 0 is always encrypted with --admin-secret.");
}
// basic validation for admin secret
if args.admin_secret.trim().is_empty() {
eprintln!("error: --admin-secret must not be empty");
std::process::exit(2);
}
// new DB option
let option = herodb::options::DBOption {
dir: args.dir,
dir: args.dir.clone(),
port,
debug: args.debug,
encryption_key: args.encryption_key,
@@ -60,14 +83,42 @@ async fn main() {
} else {
herodb::options::BackendType::Redb
},
admin_secret: args.admin_secret.clone(),
};
let backend = option.backend.clone();
// Bootstrap admin DB 0 before opening any server storage
if let Err(e) = herodb::admin_meta::ensure_bootstrap(&args.dir, backend.clone(), &args.admin_secret) {
eprintln!("Failed to bootstrap admin DB 0: {}", e.0);
std::process::exit(2);
}
// new server
let server = server::Server::new(option).await;
// Add a small delay to ensure the port is ready
tokio::time::sleep(std::time::Duration::from_millis(100)).await;
// Start RPC server if enabled
let _rpc_handle = if args.enable_rpc {
let rpc_addr = format!("127.0.0.1:{}", args.rpc_port).parse().unwrap();
let base_dir = args.dir.clone();
match rpc_server::start_rpc_server(rpc_addr, base_dir, backend, args.admin_secret.clone()).await {
Ok(handle) => {
println!("RPC management server started on port {}", args.rpc_port);
Some(handle)
}
Err(e) => {
eprintln!("Failed to start RPC server: {}", e);
None
}
}
} else {
None
};
// accept new connections
loop {
let stream = listener.accept().await;
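// Illustrative launch using the flags declared above (`cargo run --` mirrors the test harness):
//   cargo run -- --dir ./data --port 6379 --enable-rpc --rpc-port 8080 \
//     --admin-secret my-admin-secret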

View File

@@ -1,15 +1,23 @@
#[derive(Debug, Clone)]
use std::path::PathBuf;
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum BackendType {
Redb,
Sled,
Tantivy, // Full-text search backend (no KV storage)
Lance, // Vector database backend (no KV storage)
}
#[derive(Debug, Clone)]
pub struct DBOption {
pub dir: String,
pub dir: PathBuf,
pub port: u16,
pub debug: bool,
// Deprecated for data DBs; retained for backward-compat on CLI parsing
pub encrypt: bool,
// Deprecated for data DBs; retained for backward-compat on CLI parsing
pub encryption_key: Option<String>,
pub backend: BackendType,
// New: required admin secret, used to encrypt DB 0 and authorize admin operations
pub admin_secret: String,
}

1362
src/rpc.rs Normal file

File diff suppressed because it is too large

50
src/rpc_server.rs Normal file
View File

@@ -0,0 +1,50 @@
use std::net::SocketAddr;
use std::path::PathBuf;
use jsonrpsee::server::{ServerBuilder, ServerHandle};
use jsonrpsee::RpcModule;
use crate::rpc::{RpcServer, RpcServerImpl};
/// Start the RPC server on the specified address
pub async fn start_rpc_server(addr: SocketAddr, base_dir: PathBuf, backend: crate::options::BackendType, admin_secret: String) -> Result<ServerHandle, Box<dyn std::error::Error + Send + Sync>> {
// Create the RPC server implementation
let rpc_impl = RpcServerImpl::new(base_dir, backend, admin_secret);
// Create the RPC module
let mut module = RpcModule::new(());
module.merge(RpcServer::into_rpc(rpc_impl))?;
// Build the server with both HTTP and WebSocket support
let server = ServerBuilder::default()
.build(addr)
.await?;
// Start the server
let handle = server.start(module);
println!("RPC server started on {}", addr);
Ok(handle)
}
#[cfg(test)]
mod tests {
use super::*;
use std::time::Duration;
#[tokio::test]
async fn test_rpc_server_startup() {
let addr = "127.0.0.1:0".parse().unwrap(); // Use port 0 for auto-assignment
let base_dir = PathBuf::from("/tmp/test_rpc");
let backend = crate::options::BackendType::Redb; // Default for test
let handle = start_rpc_server(addr, base_dir, backend, "test-admin".to_string()).await.unwrap();
// Give the server a moment to start
tokio::time::sleep(Duration::from_millis(100)).await;
// Stop the server
handle.stop().unwrap();
handle.stopped().await;
}
}

378
src/search_cmd.rs Normal file
View File

@@ -0,0 +1,378 @@
use crate::{
error::DBError,
protocol::Protocol,
server::Server,
tantivy_search::{
FieldDef, Filter, FilterType, IndexConfig, NumericType, SearchOptions, TantivySearch,
},
};
use std::collections::HashMap;
use std::sync::Arc;
pub async fn ft_create_cmd(
server: &Server,
index_name: String,
schema: Vec<(String, String, Vec<String>)>,
) -> Result<Protocol, DBError> {
if server.selected_db == 0 {
return Ok(Protocol::err("FT commands are not allowed on DB 0"));
}
// Enforce Tantivy backend for selected DB
let is_tantivy = crate::admin_meta::get_database_backend(
&server.option.dir,
server.option.backend.clone(),
&server.option.admin_secret,
server.selected_db,
)
.ok()
.flatten()
.map(|b| matches!(b, crate::options::BackendType::Tantivy))
.unwrap_or(false);
if !is_tantivy {
return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed"));
}
if !server.has_write_permission() {
return Ok(Protocol::err("ERR write permission denied"));
}
// Parse schema into field definitions
let mut field_definitions = Vec::new();
for (field_name, field_type, options) in schema {
let field_def = match field_type.to_uppercase().as_str() {
"TEXT" => {
let mut sortable = false;
let mut no_index = false;
// Weight is not used in current implementation
let mut _weight = 1.0f32;
let mut i = 0;
while i < options.len() {
match options[i].to_uppercase().as_str() {
"WEIGHT" => {
if i + 1 < options.len() {
_weight = options[i + 1].parse::<f32>().unwrap_or(1.0);
i += 2;
continue;
}
}
"SORTABLE" => {
sortable = true;
}
"NOINDEX" => {
no_index = true;
}
_ => {}
}
i += 1;
}
FieldDef::Text {
stored: true,
indexed: !no_index,
tokenized: true,
fast: sortable,
}
}
"NUMERIC" => {
// default to F64
let mut sortable = false;
for opt in &options {
if opt.to_uppercase() == "SORTABLE" {
sortable = true;
}
}
FieldDef::Numeric {
stored: true,
indexed: true,
fast: sortable,
precision: NumericType::F64,
}
}
"TAG" => {
let mut separator = ",".to_string();
let mut case_sensitive = false;
let mut i = 0;
while i < options.len() {
match options[i].to_uppercase().as_str() {
"SEPARATOR" => {
if i + 1 < options.len() {
separator = options[i + 1].clone();
i += 2;
continue;
}
}
"CASESENSITIVE" => {
case_sensitive = true;
}
_ => {}
}
i += 1;
}
FieldDef::Tag {
stored: true,
separator,
case_sensitive,
}
}
"GEO" => FieldDef::Geo { stored: true },
_ => {
return Err(DBError(format!("Unknown field type: {}", field_type)));
}
};
field_definitions.push((field_name, field_def));
}
// Create the search index
let search_path = server.search_index_path();
let config = IndexConfig::default();
let search_index = TantivySearch::new_with_schema(
search_path,
index_name.clone(),
field_definitions,
Some(config),
)?;
// Store in registry
let mut indexes = server.search_indexes.write().unwrap();
indexes.insert(index_name, Arc::new(search_index));
Ok(Protocol::SimpleString("OK".to_string()))
}
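// Schema tuples accepted by the parser above (illustrative):
//   ("title", "TEXT",    ["WEIGHT", "2.0", "SORTABLE"])
//   ("year",  "NUMERIC", ["SORTABLE"])
//   ("tags",  "TAG",     ["SEPARATOR", ";"])
//   ("loc",   "GEO",     [])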
pub async fn ft_add_cmd(
server: &Server,
index_name: String,
doc_id: String,
_score: f64,
fields: HashMap<String, String>,
) -> Result<Protocol, DBError> {
if server.selected_db == 0 {
return Ok(Protocol::err("FT commands are not allowed on DB 0"));
}
// Enforce Tantivy backend for selected DB
let is_tantivy = crate::admin_meta::get_database_backend(
&server.option.dir,
server.option.backend.clone(),
&server.option.admin_secret,
server.selected_db,
)
.ok()
.flatten()
.map(|b| matches!(b, crate::options::BackendType::Tantivy))
.unwrap_or(false);
if !is_tantivy {
return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed"));
}
if !server.has_write_permission() {
return Ok(Protocol::err("ERR write permission denied"));
}
let indexes = server.search_indexes.read().unwrap();
let search_index = indexes
.get(&index_name)
.ok_or_else(|| DBError(format!("Index '{}' not found", index_name)))?;
search_index.add_document_with_fields(&doc_id, fields)?;
Ok(Protocol::SimpleString("OK".to_string()))
}
pub async fn ft_search_cmd(
server: &Server,
index_name: String,
query: String,
filters: Vec<(String, String)>,
limit: Option<usize>,
offset: Option<usize>,
return_fields: Option<Vec<String>>,
) -> Result<Protocol, DBError> {
if server.selected_db == 0 {
return Ok(Protocol::err("FT commands are not allowed on DB 0"));
}
// Enforce Tantivy backend for selected DB
let is_tantivy = crate::admin_meta::get_database_backend(
&server.option.dir,
server.option.backend.clone(),
&server.option.admin_secret,
server.selected_db,
)
.ok()
.flatten()
.map(|b| matches!(b, crate::options::BackendType::Tantivy))
.unwrap_or(false);
if !is_tantivy {
return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed"));
}
if !server.has_read_permission() {
return Ok(Protocol::err("ERR read permission denied"));
}
let indexes = server.search_indexes.read().unwrap();
let search_index = indexes
.get(&index_name)
.ok_or_else(|| DBError(format!("Index '{}' not found", index_name)))?;
let search_filters = filters
.into_iter()
.map(|(field, value)| Filter {
field,
filter_type: FilterType::Equals(value),
})
.collect();
let options = SearchOptions {
limit: limit.unwrap_or(10),
offset: offset.unwrap_or(0),
filters: search_filters,
sort_by: None,
return_fields,
highlight: false,
};
let results = search_index.search_with_options(&query, options)?;
// Format results as a flattened Redis protocol array to match client expectations:
// [ total, doc_id, score, field, value, field, value, ... , doc_id, score, ... ]
let mut response = Vec::new();
// First element is the total count
response.push(Protocol::BulkString(results.total.to_string()));
// Then each document flattened
for mut doc in results.documents {
// Add document ID if it exists
if let Some(id) = doc.fields.get("_id") {
response.push(Protocol::BulkString(id.clone()));
}
// Add score
response.push(Protocol::BulkString(doc.score.to_string()));
// Add fields as key-value pairs
for (field_name, field_value) in std::mem::take(&mut doc.fields) {
if field_name != "_id" {
response.push(Protocol::BulkString(field_name));
response.push(Protocol::BulkString(field_value));
}
}
}
Ok(Protocol::Array(response))
}
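// Illustrative flattened reply for two hits (shape only; values are examples):
//   ["2", "doc1", "3.21", "title", "Dune", "doc2", "1.87", "title", "Messiah"]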
pub async fn ft_del_cmd(
server: &Server,
index_name: String,
doc_id: String,
) -> Result<Protocol, DBError> {
if server.selected_db == 0 {
return Ok(Protocol::err("FT commands are not allowed on DB 0"));
}
// Enforce Tantivy backend for selected DB
let is_tantivy = crate::admin_meta::get_database_backend(
&server.option.dir,
server.option.backend.clone(),
&server.option.admin_secret,
server.selected_db,
)
.ok()
.flatten()
.map(|b| matches!(b, crate::options::BackendType::Tantivy))
.unwrap_or(false);
if !is_tantivy {
return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed"));
}
if !server.has_write_permission() {
return Ok(Protocol::err("ERR write permission denied"));
}
let indexes = server.search_indexes.read().unwrap();
let search_index = indexes
.get(&index_name)
.ok_or_else(|| DBError(format!("Index '{}' not found", index_name)))?;
let existed = search_index.delete_document_by_id(&doc_id)?;
Ok(Protocol::SimpleString(if existed { "1".to_string() } else { "0".to_string() }))
}
pub async fn ft_info_cmd(server: &Server, index_name: String) -> Result<Protocol, DBError> {
if server.selected_db == 0 {
return Ok(Protocol::err("FT commands are not allowed on DB 0"));
}
// Enforce Tantivy backend for selected DB
let is_tantivy = crate::admin_meta::get_database_backend(
&server.option.dir,
server.option.backend.clone(),
&server.option.admin_secret,
server.selected_db,
)
.ok()
.flatten()
.map(|b| matches!(b, crate::options::BackendType::Tantivy))
.unwrap_or(false);
if !is_tantivy {
return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed"));
}
if !server.has_read_permission() {
return Ok(Protocol::err("ERR read permission denied"));
}
let indexes = server.search_indexes.read().unwrap();
let search_index = indexes
.get(&index_name)
.ok_or_else(|| DBError(format!("Index '{}' not found", index_name)))?;
let info = search_index.get_info()?;
// Format info as Redis protocol
let mut response = Vec::new();
response.push(Protocol::BulkString("index_name".to_string()));
response.push(Protocol::BulkString(info.name));
response.push(Protocol::BulkString("num_docs".to_string()));
response.push(Protocol::BulkString(info.num_docs.to_string()));
response.push(Protocol::BulkString("num_fields".to_string()));
response.push(Protocol::BulkString(info.fields.len().to_string()));
response.push(Protocol::BulkString("fields".to_string()));
let fields_str = info
.fields
.iter()
.map(|f| format!("{}:{}", f.name, f.field_type))
.collect::<Vec<_>>()
.join(", ");
response.push(Protocol::BulkString(fields_str));
Ok(Protocol::Array(response))
}
pub async fn ft_drop_cmd(server: &Server, index_name: String) -> Result<Protocol, DBError> {
if server.selected_db == 0 {
return Ok(Protocol::err("FT commands are not allowed on DB 0"));
}
// Enforce Tantivy backend for selected DB
let is_tantivy = crate::admin_meta::get_database_backend(
&server.option.dir,
server.option.backend.clone(),
&server.option.admin_secret,
server.selected_db,
)
.ok()
.flatten()
.map(|b| matches!(b, crate::options::BackendType::Tantivy))
.unwrap_or(false);
if !is_tantivy {
return Ok(Protocol::err("ERR DB backend is not Tantivy; FT.* commands are not allowed"));
}
if !server.has_write_permission() {
return Ok(Protocol::err("ERR write permission denied"));
}
// Remove from registry and files; report error if nothing to drop
let mut existed = false;
{
let mut indexes = server.search_indexes.write().unwrap();
if indexes.remove(&index_name).is_some() {
existed = true;
}
}
// Remove the index files from disk
let index_path = server.search_index_path().join(&index_name);
if index_path.exists() {
std::fs::remove_dir_all(&index_path)
.map_err(|e| DBError(format!("Failed to remove index files: {}", e)))?;
existed = true;
}
if !existed {
return Ok(Protocol::err(&format!("ERR Index '{}' not found", index_name)));
}
Ok(Protocol::SimpleString("OK".to_string()))
}

View File

@@ -11,9 +11,17 @@ use crate::cmd::Cmd;
use crate::error::DBError;
use crate::options;
use crate::protocol::Protocol;
use crate::storage::Storage;
use crate::storage_sled::SledStorage;
use crate::storage_trait::StorageBackend;
use crate::admin_meta;
// Embeddings: config and cache
use crate::embedding::{EmbeddingConfig, create_embedder, Embedder, create_image_embedder, ImageEmbedder};
use serde_json;
use ureq::{Agent, AgentBuilder};
use std::time::Duration;
use std::io::Read;
const NO_DB_SELECTED: u64 = u64::MAX;
#[derive(Clone)]
pub struct Server {
@@ -22,6 +30,19 @@ pub struct Server {
pub client_name: Option<String>,
pub selected_db: u64, // Changed from usize to u64
pub queued_cmd: Option<Vec<(Cmd, Protocol)>>,
pub current_permissions: Option<crate::rpc::Permissions>,
// In-memory registry of Tantivy search indexes for this server
pub search_indexes: Arc<std::sync::RwLock<HashMap<String, Arc<crate::tantivy_search::TantivySearch>>>>,
// Per-DB Lance stores (vector DB), keyed by db_id
pub lance_stores: Arc<std::sync::RwLock<HashMap<u64, Arc<crate::lance_store::LanceStore>>>>,
// Per-(db_id, dataset) embedder cache (text)
pub embedders: Arc<std::sync::RwLock<HashMap<(u64, String), Arc<dyn Embedder>>>>,
// Per-(db_id, dataset) image embedder cache (image)
pub image_embedders: Arc<std::sync::RwLock<HashMap<(u64, String), Arc<dyn ImageEmbedder>>>>,
// BLPOP waiter registry: per (db_index, key) FIFO of waiters
pub list_waiters: Arc<Mutex<HashMap<u64, HashMap<String, Vec<Waiter>>>>>,
@@ -46,59 +67,322 @@ impl Server {
db_cache: Arc::new(std::sync::RwLock::new(HashMap::new())),
option,
client_name: None,
selected_db: 0,
selected_db: NO_DB_SELECTED,
queued_cmd: None,
current_permissions: None,
search_indexes: Arc::new(std::sync::RwLock::new(HashMap::new())),
lance_stores: Arc::new(std::sync::RwLock::new(HashMap::new())),
embedders: Arc::new(std::sync::RwLock::new(HashMap::new())),
image_embedders: Arc::new(std::sync::RwLock::new(HashMap::new())),
list_waiters: Arc::new(Mutex::new(HashMap::new())),
waiter_seq: Arc::new(AtomicU64::new(1)),
}
}
// Path where search indexes are stored, namespaced per selected DB:
// <base_dir>/search_indexes/<db_id>
pub fn search_index_path(&self) -> std::path::PathBuf {
let base = std::path::PathBuf::from(&self.option.dir)
.join("search_indexes")
.join(self.selected_db.to_string());
if !base.exists() {
let _ = std::fs::create_dir_all(&base);
}
base
}
// Path where Lance datasets are stored, namespaced per selected DB:
// <base_dir>/lance/<db_id>
pub fn lance_data_path(&self) -> std::path::PathBuf {
let base = std::path::PathBuf::from(&self.option.dir)
.join("lance")
.join(self.selected_db.to_string());
if !base.exists() {
let _ = std::fs::create_dir_all(&base);
}
base
}
pub fn current_storage(&self) -> Result<Arc<dyn StorageBackend>, DBError> {
// Require explicit SELECT before any storage access
if self.selected_db == NO_DB_SELECTED {
return Err(DBError("No database selected. Use SELECT <id> [KEY <key>] first".to_string()));
}
// Admin DB 0 access must be authenticated with SELECT 0 KEY <admin_secret>
if self.selected_db == 0 {
if !matches!(self.current_permissions, Some(crate::rpc::Permissions::ReadWrite)) {
return Err(DBError("Admin DB 0 requires SELECT 0 KEY <admin_secret>".to_string()));
}
}
let mut cache = self.db_cache.write().unwrap();
if let Some(storage) = cache.get(&self.selected_db) {
return Ok(storage.clone());
}
// Create new database file
let db_file_path = std::path::PathBuf::from(self.option.dir.clone())
.join(format!("{}.db", self.selected_db));
// Ensure the directory exists before creating the database file
if let Some(parent_dir) = db_file_path.parent() {
std::fs::create_dir_all(parent_dir).map_err(|e| {
DBError(format!("Failed to create directory {}: {}", parent_dir.display(), e))
})?;
}
println!("Creating new db file: {}", db_file_path.display());
let storage: Arc<dyn StorageBackend> = match self.option.backend {
options::BackendType::Redb => {
Arc::new(Storage::new(
db_file_path,
self.should_encrypt_db(self.selected_db),
self.option.encryption_key.as_deref()
)?)
}
options::BackendType::Sled => {
Arc::new(SledStorage::new(
db_file_path,
self.should_encrypt_db(self.selected_db),
self.option.encryption_key.as_deref()
)?)
}
// Use process-wide shared handles to avoid Sled/Redb double-open lock contention.
let storage = if self.selected_db == 0 {
// Admin DB 0: always via singleton
admin_meta::open_admin_storage(
&self.option.dir,
self.option.backend.clone(),
&self.option.admin_secret,
)?
} else {
// Data DBs: via global registry keyed by id
admin_meta::open_data_storage(
&self.option.dir,
self.option.backend.clone(),
&self.option.admin_secret,
self.selected_db,
)?
};
cache.insert(self.selected_db, storage.clone());
Ok(storage)
}
fn should_encrypt_db(&self, db_index: u64) -> bool {
// DB 0-9 are non-encrypted, DB 10+ are encrypted
self.option.encrypt && db_index >= 10
/// Get or create the LanceStore for the currently selected DB.
/// Only valid for non-zero DBs and when the backend is Lance.
pub fn lance_store(&self) -> Result<Arc<crate::lance_store::LanceStore>, DBError> {
if self.selected_db == 0 {
return Err(DBError("Lance not available on admin DB 0".to_string()));
}
// Resolve backend for selected_db
let backend_opt = crate::admin_meta::get_database_backend(
&self.option.dir,
self.option.backend.clone(),
&self.option.admin_secret,
self.selected_db,
)
.ok()
.flatten();
if !matches!(backend_opt, Some(crate::options::BackendType::Lance)) {
return Err(DBError("ERR DB backend is not Lance; LANCE.* commands are not allowed".to_string()));
}
// Fast path: read lock
{
let map = self.lance_stores.read().unwrap();
if let Some(store) = map.get(&self.selected_db) {
return Ok(store.clone());
}
}
// Slow path: create and insert
let store = Arc::new(crate::lance_store::LanceStore::new(&self.option.dir, self.selected_db)?);
{
let mut map = self.lance_stores.write().unwrap();
map.insert(self.selected_db, store.clone());
}
Ok(store)
}
// ----- Embedding configuration and resolution -----
// Sidecar embedding config path: <base_dir>/lance/<db_id>/<dataset>.lance.embedding.json
fn dataset_embedding_config_path(&self, dataset: &str) -> std::path::PathBuf {
let mut base = self.lance_data_path();
// Ensure parent dir exists
if !base.exists() {
let _ = std::fs::create_dir_all(&base);
}
base.push(format!("{}.lance.embedding.json", dataset));
base
}
/// Persist per-dataset embedding config as JSON sidecar.
pub fn set_dataset_embedding_config(&self, dataset: &str, cfg: &EmbeddingConfig) -> Result<(), DBError> {
if self.selected_db == 0 {
return Err(DBError("Lance not available on admin DB 0".to_string()));
}
let p = self.dataset_embedding_config_path(dataset);
let data = serde_json::to_vec_pretty(cfg)
.map_err(|e| DBError(format!("Failed to serialize embedding config: {}", e)))?;
std::fs::write(&p, data)
.map_err(|e| DBError(format!("Failed to write embedding config {}: {}", p.display(), e)))?;
// Invalidate embedder cache entry for this dataset
{
let mut map = self.embedders.write().unwrap();
map.remove(&(self.selected_db, dataset.to_string()));
}
{
let mut map_img = self.image_embedders.write().unwrap();
map_img.remove(&(self.selected_db, dataset.to_string()));
}
Ok(())
}
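// Sidecar location (illustrative): for db 2 and dataset "docs" this writes
//   <base_dir>/lance/2/docs.lance.embedding.json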
/// Load per-dataset embedding config.
pub fn get_dataset_embedding_config(&self, dataset: &str) -> Result<EmbeddingConfig, DBError> {
if self.selected_db == 0 {
return Err(DBError("Lance not available on admin DB 0".to_string()));
}
let p = self.dataset_embedding_config_path(dataset);
if !p.exists() {
return Err(DBError(format!(
"Embedding config not set for dataset '{}'. Use LANCE.EMBEDDING CONFIG SET ... or RPC to configure.",
dataset
)));
}
let data = std::fs::read(&p)
.map_err(|e| DBError(format!("Failed to read embedding config {}: {}", p.display(), e)))?;
let cfg: EmbeddingConfig = serde_json::from_slice(&data)
.map_err(|e| DBError(format!("Failed to parse embedding config {}: {}", p.display(), e)))?;
Ok(cfg)
}
/// Resolve or build an embedder for (db_id, dataset). Caches instance.
pub fn get_embedder_for(&self, dataset: &str) -> Result<Arc<dyn Embedder>, DBError> {
if self.selected_db == 0 {
return Err(DBError("Lance not available on admin DB 0".to_string()));
}
// Fast path
{
let map = self.embedders.read().unwrap();
if let Some(e) = map.get(&(self.selected_db, dataset.to_string())) {
return Ok(e.clone());
}
}
// Load config and instantiate
let cfg = self.get_dataset_embedding_config(dataset)?;
let emb = create_embedder(&cfg)?;
{
let mut map = self.embedders.write().unwrap();
map.insert((self.selected_db, dataset.to_string()), emb.clone());
}
Ok(emb)
}
/// Resolve or build an IMAGE embedder for (db_id, dataset). Caches instance.
pub fn get_image_embedder_for(&self, dataset: &str) -> Result<Arc<dyn ImageEmbedder>, DBError> {
if self.selected_db == 0 {
return Err(DBError("Lance not available on admin DB 0".to_string()));
}
// Fast path
{
let map = self.image_embedders.read().unwrap();
if let Some(e) = map.get(&(self.selected_db, dataset.to_string())) {
return Ok(e.clone());
}
}
// Load config and instantiate
let cfg = self.get_dataset_embedding_config(dataset)?;
let emb = create_image_embedder(&cfg)?;
{
let mut map = self.image_embedders.write().unwrap();
map.insert((self.selected_db, dataset.to_string()), emb.clone());
}
Ok(emb)
}
/// Download image bytes from a URI with safety checks (size, timeout, content-type, optional host allowlist).
/// Env overrides:
/// - HERODB_IMAGE_MAX_BYTES (u64, default 10485760)
/// - HERODB_IMAGE_FETCH_TIMEOUT_SECS (u64, default 30)
/// - HERODB_IMAGE_ALLOWED_HOSTS (comma-separated, optional)
pub fn fetch_image_bytes_from_uri(&self, uri: &str) -> Result<Vec<u8>, DBError> {
// Basic scheme validation
if !(uri.starts_with("http://") || uri.starts_with("https://")) {
return Err(DBError("Only http(s) URIs are supported for image fetch".into()));
}
// Parse host (naive) for allowlist check
let host = {
let after_scheme = match uri.find("://") {
Some(i) => &uri[i + 3..],
None => uri,
};
let end = after_scheme.find('/').unwrap_or(after_scheme.len());
let host_port = &after_scheme[..end];
host_port.split('@').last().unwrap_or(host_port).split(':').next().unwrap_or(host_port).to_string()
};
let max_bytes: u64 = std::env::var("HERODB_IMAGE_MAX_BYTES").ok().and_then(|s| s.parse::<u64>().ok()).unwrap_or(10 * 1024 * 1024);
let timeout_secs: u64 = std::env::var("HERODB_IMAGE_FETCH_TIMEOUT_SECS").ok().and_then(|s| s.parse::<u64>().ok()).unwrap_or(30);
let allowed_hosts_env = std::env::var("HERODB_IMAGE_ALLOWED_HOSTS").ok();
if let Some(allow) = allowed_hosts_env {
if !allow.split(',').map(|s| s.trim()).filter(|s| !s.is_empty()).any(|h| h.eq_ignore_ascii_case(&host)) {
return Err(DBError(format!("Host '{}' not allowed for image fetch (HERODB_IMAGE_ALLOWED_HOSTS)", host)));
}
}
let agent: Agent = AgentBuilder::new()
.timeout_read(Duration::from_secs(timeout_secs))
.timeout_write(Duration::from_secs(timeout_secs))
.build();
let resp = agent.get(uri).call().map_err(|e| DBError(format!("HTTP GET failed: {}", e)))?;
// Validate content-type
let ctype = resp.header("Content-Type").unwrap_or("");
let ctype_main = ctype.split(';').next().unwrap_or("").trim().to_ascii_lowercase();
if !ctype_main.starts_with("image/") {
return Err(DBError(format!("Remote content-type '{}' is not image/*", ctype)));
}
// Read with cap
let mut reader = resp.into_reader();
let mut buf: Vec<u8> = Vec::with_capacity(8192);
let mut tmp = [0u8; 8192];
let mut total: u64 = 0;
loop {
let n = reader.read(&mut tmp).map_err(|e| DBError(format!("Read error: {}", e)))?;
if n == 0 { break; }
total += n as u64;
if total > max_bytes {
return Err(DBError(format!("Image exceeds max allowed bytes {}", max_bytes)));
}
buf.extend_from_slice(&tmp[..n]);
}
Ok(buf)
}
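// Illustrative hardening via the env overrides documented above (hypothetical values;
// allow a single host and cap downloads at 2 MiB with a 10 s timeout):
//   HERODB_IMAGE_ALLOWED_HOSTS="images.example.com" \
//   HERODB_IMAGE_MAX_BYTES=2097152 \
//   HERODB_IMAGE_FETCH_TIMEOUT_SECS=10 \
//   cargo run -- --dir ./data --admin-secret my-admin-secret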
/// Check if current permissions allow read operations
pub fn has_read_permission(&self) -> bool {
// No DB selected -> no permissions
if self.selected_db == NO_DB_SELECTED {
return false;
}
// If an explicit permission is set for this connection, honor it.
if let Some(perms) = self.current_permissions.as_ref() {
return matches!(*perms, crate::rpc::Permissions::Read | crate::rpc::Permissions::ReadWrite);
}
// Fallback ONLY when no explicit permission context (e.g., JSON-RPC flows without SELECT).
match crate::admin_meta::verify_access(
&self.option.dir,
self.option.backend.clone(),
&self.option.admin_secret,
self.selected_db,
None,
) {
Ok(Some(crate::rpc::Permissions::Read)) | Ok(Some(crate::rpc::Permissions::ReadWrite)) => true,
_ => false,
}
}
/// Check if current permissions allow write operations
pub fn has_write_permission(&self) -> bool {
// No DB selected -> no permissions
if self.selected_db == NO_DB_SELECTED {
return false;
}
// If an explicit permission is set for this connection, honor it.
if let Some(perms) = self.current_permissions.as_ref() {
return matches!(*perms, crate::rpc::Permissions::ReadWrite);
}
// Fallback ONLY when no explicit permission context (e.g., JSON-RPC flows without SELECT).
match crate::admin_meta::verify_access(
&self.option.dir,
self.option.backend.clone(),
&self.option.admin_secret,
self.selected_db,
None,
) {
Ok(Some(crate::rpc::Permissions::ReadWrite)) => true,
_ => false,
}
}
// ----- BLPOP waiter helpers -----

123
src/sym.rs Normal file
View File

@@ -0,0 +1,123 @@
//! sym.rs — Stateless symmetric encryption (Phase 1)
//!
//! Commands implemented (RESP):
//! - SYM KEYGEN
//! - SYM ENCRYPT <key_b64> <message>
//! - SYM DECRYPT <key_b64> <ciphertext_b64>
//!
//! Notes:
//! - Raw key: exactly 32 bytes, provided as Base64 in commands.
//! - Cipher: XChaCha20-Poly1305 (AEAD) without AAD in Phase 1
//! - Ciphertext binary layout: [version:1][nonce:24][ciphertext||tag]
//! - Encoding for wire I/O: Base64
use base64::{engine::general_purpose::STANDARD as B64, Engine as _};
use chacha20poly1305::{
aead::{Aead, KeyInit, OsRng},
XChaCha20Poly1305, XNonce,
};
use rand::RngCore;
use crate::protocol::Protocol;
const VERSION: u8 = 1;
const NONCE_LEN: usize = 24;
const TAG_LEN: usize = 16;
#[derive(Debug)]
pub enum SymWireError {
InvalidKey,
BadEncoding,
BadFormat,
BadVersion(u8),
Crypto,
}
impl SymWireError {
fn to_protocol(self) -> Protocol {
match self {
SymWireError::InvalidKey => Protocol::err("ERR sym: invalid key"),
SymWireError::BadEncoding => Protocol::err("ERR sym: bad encoding"),
SymWireError::BadFormat => Protocol::err("ERR sym: bad format"),
SymWireError::BadVersion(v) => Protocol::err(&format!("ERR sym: unsupported version {}", v)),
SymWireError::Crypto => Protocol::err("ERR sym: auth failed"),
}
}
}
fn decode_key_b64(s: &str) -> Result<chacha20poly1305::Key, SymWireError> {
let bytes = B64.decode(s.as_bytes()).map_err(|_| SymWireError::BadEncoding)?;
if bytes.len() != 32 {
return Err(SymWireError::InvalidKey);
}
Ok(chacha20poly1305::Key::from_slice(&bytes).to_owned())
}
fn encrypt_blob(key: &chacha20poly1305::Key, plaintext: &[u8]) -> Result<Vec<u8>, SymWireError> {
let cipher = XChaCha20Poly1305::new(key);
let mut nonce_bytes = [0u8; NONCE_LEN];
OsRng.fill_bytes(&mut nonce_bytes);
let nonce = XNonce::from_slice(&nonce_bytes);
let mut out = Vec::with_capacity(1 + NONCE_LEN + plaintext.len() + TAG_LEN);
out.push(VERSION);
out.extend_from_slice(&nonce_bytes);
let ct = cipher.encrypt(nonce, plaintext).map_err(|_| SymWireError::Crypto)?;
out.extend_from_slice(&ct);
Ok(out)
}
fn decrypt_blob(key: &chacha20poly1305::Key, blob: &[u8]) -> Result<Vec<u8>, SymWireError> {
if blob.len() < 1 + NONCE_LEN + TAG_LEN {
return Err(SymWireError::BadFormat);
}
let ver = blob[0];
if ver != VERSION {
return Err(SymWireError::BadVersion(ver));
}
let nonce = XNonce::from_slice(&blob[1..1 + NONCE_LEN]);
let ct = &blob[1 + NONCE_LEN..];
let cipher = XChaCha20Poly1305::new(key);
cipher.decrypt(nonce, ct).map_err(|_| SymWireError::Crypto)
}
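// Size check (illustrative): a 5-byte plaintext yields
// 1 (version) + 24 (nonce) + 5 + 16 (tag) = 46 bytes, i.e. 64 Base64 chars on the wire.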
// ---------- Command handlers (RESP) ----------
pub async fn cmd_sym_keygen() -> Protocol {
let mut key_bytes = [0u8; 32];
OsRng.fill_bytes(&mut key_bytes);
let key_b64 = B64.encode(key_bytes);
Protocol::BulkString(key_b64)
}
pub async fn cmd_sym_encrypt(key_b64: &str, message: &str) -> Protocol {
let key = match decode_key_b64(key_b64) {
Ok(k) => k,
Err(e) => return e.to_protocol(),
};
match encrypt_blob(&key, message.as_bytes()) {
Ok(blob) => Protocol::BulkString(B64.encode(blob)),
Err(e) => e.to_protocol(),
}
}
pub async fn cmd_sym_decrypt(key_b64: &str, ct_b64: &str) -> Protocol {
let key = match decode_key_b64(key_b64) {
Ok(k) => k,
Err(e) => return e.to_protocol(),
};
let blob = match B64.decode(ct_b64.as_bytes()) {
Ok(b) => b,
Err(_) => return SymWireError::BadEncoding.to_protocol(),
};
match decrypt_blob(&key, &blob) {
Ok(pt) => match String::from_utf8(pt) {
Ok(s) => Protocol::BulkString(s),
Err(_) => Protocol::err("ERR sym: invalid UTF-8 plaintext"),
},
Err(e) => e.to_protocol(),
}
}
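// Illustrative round-trip over redis-cli (key and ciphertext values are placeholders):
//   SYM KEYGEN                              -> "<key_b64>"
//   SYM ENCRYPT <key_b64> "hello"           -> "<ciphertext_b64>"
//   SYM DECRYPT <key_b64> <ciphertext_b64>  -> "hello"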

709
src/tantivy_search.rs Normal file
View File

@@ -0,0 +1,709 @@
use crate::error::DBError;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::path::PathBuf;
use std::sync::{Arc, RwLock};
use tantivy::{
collector::TopDocs,
directory::MmapDirectory,
query::{BooleanQuery, Occur, Query, QueryParser, TermQuery},
schema::{
DateOptions, Field, IndexRecordOption, NumericOptions, Schema, TextFieldIndexing, TextOptions, STORED, STRING,
},
tokenizer::TokenizerManager,
DateTime, Index, IndexReader, IndexWriter, TantivyDocument, Term,
};
use tantivy::schema::Value;
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FieldDef {
Text {
stored: bool,
indexed: bool,
tokenized: bool,
fast: bool,
},
Numeric {
stored: bool,
indexed: bool,
fast: bool,
precision: NumericType,
},
Tag {
stored: bool,
separator: String,
case_sensitive: bool,
},
Geo {
stored: bool,
},
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum NumericType {
I64,
U64,
F64,
Date,
}
pub struct IndexSchema {
schema: Schema,
fields: HashMap<String, (Field, FieldDef)>,
default_search_fields: Vec<Field>,
}
pub struct TantivySearch {
index: Index,
writer: Arc<RwLock<IndexWriter>>,
reader: IndexReader,
index_schema: IndexSchema,
name: String,
config: IndexConfig,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexConfig {
pub language: String,
pub stopwords: Vec<String>,
pub stemming: bool,
pub max_doc_count: Option<usize>,
pub default_score: f64,
}
impl Default for IndexConfig {
fn default() -> Self {
IndexConfig {
language: "english".to_string(),
stopwords: vec![],
stemming: true,
max_doc_count: None,
default_score: 1.0,
}
}
}
impl TantivySearch {
pub fn new_with_schema(
base_path: PathBuf,
name: String,
field_definitions: Vec<(String, FieldDef)>,
config: Option<IndexConfig>,
) -> Result<Self, DBError> {
let index_path = base_path.join(&name);
std::fs::create_dir_all(&index_path)
.map_err(|e| DBError(format!("Failed to create index dir: {}", e)))?;
// Build schema from field definitions
let mut schema_builder = Schema::builder();
let mut fields = HashMap::new();
let mut default_search_fields = Vec::new();
// Always add a document ID field
let id_field = schema_builder.add_text_field("_id", STRING | STORED);
fields.insert(
"_id".to_string(),
(
id_field,
FieldDef::Text {
stored: true,
indexed: true,
tokenized: false,
fast: false,
},
),
);
// Add user-defined fields
for (field_name, field_def) in field_definitions {
let field = match &field_def {
FieldDef::Text {
stored,
indexed,
tokenized,
fast: _fast,
} => {
let mut text_options = TextOptions::default();
if *stored {
text_options = text_options.set_stored();
}
if *indexed {
let indexing_options = if *tokenized {
TextFieldIndexing::default()
.set_tokenizer("default")
.set_index_option(IndexRecordOption::WithFreqsAndPositions)
} else {
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic)
};
text_options = text_options.set_indexing_options(indexing_options);
let f = schema_builder.add_text_field(&field_name, text_options);
if *tokenized {
default_search_fields.push(f);
}
f
} else {
schema_builder.add_text_field(&field_name, text_options)
}
}
FieldDef::Numeric {
stored,
indexed,
fast,
precision,
} => match precision {
NumericType::I64 => {
let mut opts = NumericOptions::default();
if *stored {
opts = opts.set_stored();
}
if *indexed {
opts = opts.set_indexed();
}
if *fast {
opts = opts.set_fast();
}
schema_builder.add_i64_field(&field_name, opts)
}
NumericType::U64 => {
let mut opts = NumericOptions::default();
if *stored {
opts = opts.set_stored();
}
if *indexed {
opts = opts.set_indexed();
}
if *fast {
opts = opts.set_fast();
}
schema_builder.add_u64_field(&field_name, opts)
}
NumericType::F64 => {
let mut opts = NumericOptions::default();
if *stored {
opts = opts.set_stored();
}
if *indexed {
opts = opts.set_indexed();
}
if *fast {
opts = opts.set_fast();
}
schema_builder.add_f64_field(&field_name, opts)
}
NumericType::Date => {
let mut opts = DateOptions::default();
if *stored {
opts = opts.set_stored();
}
if *indexed {
opts = opts.set_indexed();
}
if *fast {
opts = opts.set_fast();
}
schema_builder.add_date_field(&field_name, opts)
}
},
FieldDef::Tag {
stored,
separator: _,
case_sensitive: _,
} => {
let mut text_options = TextOptions::default();
if *stored {
text_options = text_options.set_stored();
}
text_options = text_options.set_indexing_options(
TextFieldIndexing::default()
.set_tokenizer("raw")
.set_index_option(IndexRecordOption::Basic),
);
schema_builder.add_text_field(&field_name, text_options)
}
FieldDef::Geo { stored } => {
// For now, store as two f64 fields for lat/lon
let mut opts = NumericOptions::default();
if *stored {
opts = opts.set_stored();
}
opts = opts.set_indexed().set_fast();
let lat_field =
schema_builder.add_f64_field(&format!("{}_lat", field_name), opts.clone());
let lon_field =
schema_builder.add_f64_field(&format!("{}_lon", field_name), opts);
fields.insert(
format!("{}_lat", field_name),
(
lat_field,
FieldDef::Numeric {
stored: *stored,
indexed: true,
fast: true,
precision: NumericType::F64,
},
),
);
fields.insert(
format!("{}_lon", field_name),
(
lon_field,
FieldDef::Numeric {
stored: *stored,
indexed: true,
fast: true,
precision: NumericType::F64,
},
),
);
continue; // Skip adding the geo field itself
}
};
fields.insert(field_name.clone(), (field, field_def));
}
let schema = schema_builder.build();
let index_schema = IndexSchema {
schema: schema.clone(),
fields,
default_search_fields,
};
// Create or open index
let dir = MmapDirectory::open(&index_path)
.map_err(|e| DBError(format!("Failed to open index directory: {}", e)))?;
let mut index =
Index::open_or_create(dir, schema).map_err(|e| DBError(format!("Failed to create index: {}", e)))?;
// Configure tokenizers
let tokenizer_manager = TokenizerManager::default();
index.set_tokenizers(tokenizer_manager);
let writer = index
.writer(15_000_000)
.map_err(|e| DBError(format!("Failed to create index writer: {}", e)))?;
let reader = index
.reader()
.map_err(|e| DBError(format!("Failed to create reader: {}", e)))?;
let config = config.unwrap_or_default();
Ok(TantivySearch {
index,
writer: Arc::new(RwLock::new(writer)),
reader,
index_schema,
name,
config,
})
}
pub fn add_document_with_fields(
&self,
doc_id: &str,
fields: HashMap<String, String>,
) -> Result<(), DBError> {
let mut writer = self
.writer
.write()
.map_err(|e| DBError(format!("Failed to acquire writer lock: {}", e)))?;
// Delete existing document with same ID
if let Some((id_field, _)) = self.index_schema.fields.get("_id") {
writer.delete_term(Term::from_field_text(*id_field, doc_id));
}
// Create new document
let mut doc = tantivy::doc!();
// Add document ID
if let Some((id_field, _)) = self.index_schema.fields.get("_id") {
doc.add_text(*id_field, doc_id);
}
// Add other fields based on schema
for (field_name, field_value) in fields {
if let Some((field, field_def)) = self.index_schema.fields.get(&field_name) {
match field_def {
FieldDef::Text { .. } => {
doc.add_text(*field, &field_value);
}
FieldDef::Numeric { precision, .. } => match precision {
NumericType::I64 => {
if let Ok(v) = field_value.parse::<i64>() {
doc.add_i64(*field, v);
}
}
NumericType::U64 => {
if let Ok(v) = field_value.parse::<u64>() {
doc.add_u64(*field, v);
}
}
NumericType::F64 => {
if let Ok(v) = field_value.parse::<f64>() {
doc.add_f64(*field, v);
}
}
NumericType::Date => {
if let Ok(v) = field_value.parse::<i64>() {
doc.add_date(*field, DateTime::from_timestamp_millis(v));
}
}
},
FieldDef::Tag {
separator,
case_sensitive,
..
} => {
let tags = if !case_sensitive {
field_value.to_lowercase()
} else {
field_value.clone()
};
for tag in tags.split(separator.as_str()) {
doc.add_text(*field, tag.trim());
}
}
FieldDef::Geo { .. } => {
let parts: Vec<&str> = field_value.split(',').collect();
if parts.len() == 2 {
if let (Ok(lat), Ok(lon)) =
(parts[0].parse::<f64>(), parts[1].parse::<f64>())
{
if let Some((lat_field, _)) =
self.index_schema.fields.get(&format!("{}_lat", field_name))
{
doc.add_f64(*lat_field, lat);
}
if let Some((lon_field, _)) =
self.index_schema.fields.get(&format!("{}_lon", field_name))
{
doc.add_f64(*lon_field, lon);
}
}
}
}
}
}
}
writer
.add_document(doc)
.map_err(|e| DBError(format!("Failed to add document: {}", e)))?;
writer
.commit()
.map_err(|e| DBError(format!("Failed to commit: {}", e)))?;
// Make new documents visible to searches
self.reader
.reload()
.map_err(|e| DBError(format!("Failed to reload reader: {}", e)))?;
Ok(())
}
pub fn search_with_options(
&self,
query_str: &str,
options: SearchOptions,
) -> Result<SearchResults, DBError> {
// Ensure reader is up to date with latest commits
self.reader
.reload()
.map_err(|e| DBError(format!("Failed to reload reader: {}", e)))?;
let searcher = self.reader.searcher();
// Ensure we have searchable fields
if self.index_schema.default_search_fields.is_empty() {
return Err(DBError("No searchable fields defined in schema".to_string()));
}
// Parse query based on search fields
let query_parser = QueryParser::for_index(
&self.index,
self.index_schema.default_search_fields.clone(),
);
let parsed_query = query_parser
.parse_query(query_str)
.map_err(|e| DBError(format!("Failed to parse query: {}", e)))?;
let mut clauses: Vec<(Occur, Box<dyn Query>)> = vec![(Occur::Must, parsed_query)];
// Apply filters if any
for filter in options.filters {
if let Some((field, field_def)) = self.index_schema.fields.get(&filter.field) {
match filter.filter_type {
FilterType::Equals(value) => {
match field_def {
FieldDef::Text { .. } | FieldDef::Tag { .. } => {
let term_query =
TermQuery::new(Term::from_field_text(*field, &value), IndexRecordOption::Basic);
clauses.push((Occur::Must, Box::new(term_query)));
}
FieldDef::Numeric { precision, .. } => {
// Equals on numeric fields: parse to the right numeric type and use term query
match precision {
NumericType::I64 => {
if let Ok(v) = value.parse::<i64>() {
let term = Term::from_field_i64(*field, v);
let tq = TermQuery::new(term, IndexRecordOption::Basic);
clauses.push((Occur::Must, Box::new(tq)));
}
}
NumericType::U64 => {
if let Ok(v) = value.parse::<u64>() {
let term = Term::from_field_u64(*field, v);
let tq = TermQuery::new(term, IndexRecordOption::Basic);
clauses.push((Occur::Must, Box::new(tq)));
}
}
NumericType::F64 => {
if let Ok(v) = value.parse::<f64>() {
let term = Term::from_field_f64(*field, v);
let tq = TermQuery::new(term, IndexRecordOption::Basic);
clauses.push((Occur::Must, Box::new(tq)));
}
}
NumericType::Date => {
if let Ok(v) = value.parse::<i64>() {
let dt = DateTime::from_timestamp_millis(v);
let term = Term::from_field_date(*field, dt);
let tq = TermQuery::new(term, IndexRecordOption::Basic);
clauses.push((Occur::Must, Box::new(tq)));
}
}
}
}
FieldDef::Geo { .. } => {
// Geo equals isn't supported in this simplified version
}
}
}
FilterType::Range { .. } => {
// TODO: Implement numeric range queries by building a RangeQuery per type
}
FilterType::InSet(values) => {
// OR across values
let mut sub_clauses: Vec<(Occur, Box<dyn Query>)> = vec![];
for value in values {
let term_query = TermQuery::new(
Term::from_field_text(*field, &value),
IndexRecordOption::Basic,
);
sub_clauses.push((Occur::Should, Box::new(term_query)));
}
clauses.push((Occur::Must, Box::new(BooleanQuery::new(sub_clauses))));
}
}
}
}
let final_query: Box<dyn Query> = if clauses.len() == 1 {
clauses.pop().unwrap().1
} else {
Box::new(BooleanQuery::new(clauses))
};
// Execute search
let top_docs = searcher
.search(&*final_query, &TopDocs::with_limit(options.limit + options.offset))
.map_err(|e| DBError(format!("Search failed: {}", e)))?;
let total_hits = top_docs.len(); // capped at limit + offset, not the full match count
let mut documents = Vec::new();
for (score, doc_address) in top_docs.into_iter().skip(options.offset).take(options.limit) {
let retrieved_doc: TantivyDocument = searcher
.doc(doc_address)
.map_err(|e| DBError(format!("Failed to retrieve doc: {}", e)))?;
let mut doc_fields = HashMap::new();
// Extract stored fields (or synthesize)
for (field_name, (field, field_def)) in &self.index_schema.fields {
match field_def {
FieldDef::Text { stored, .. } | FieldDef::Tag { stored, .. } => {
if *stored {
if let Some(value) = retrieved_doc.get_first(*field) {
if let Some(text) = value.as_str() {
doc_fields.insert(field_name.clone(), text.to_string());
}
}
}
}
FieldDef::Numeric {
stored, precision, ..
} => {
if *stored {
let value_str = match precision {
NumericType::I64 => retrieved_doc
.get_first(*field)
.and_then(|v| v.as_i64())
.map(|v| v.to_string()),
NumericType::U64 => retrieved_doc
.get_first(*field)
.and_then(|v| v.as_u64())
.map(|v| v.to_string()),
NumericType::F64 => retrieved_doc
.get_first(*field)
.and_then(|v| v.as_f64())
.map(|v| v.to_string()),
NumericType::Date => retrieved_doc
.get_first(*field)
.and_then(|v| v.as_datetime())
.map(|v| v.into_timestamp_millis().to_string()),
};
if let Some(v) = value_str {
doc_fields.insert(field_name.clone(), v);
}
}
}
FieldDef::Geo { stored } => {
if *stored {
let lat_field = self
.index_schema
.fields
.get(&format!("{}_lat", field_name))
.unwrap()
.0;
let lon_field = self
.index_schema
.fields
.get(&format!("{}_lon", field_name))
.unwrap()
.0;
let lat = retrieved_doc.get_first(lat_field).and_then(|v| v.as_f64());
let lon = retrieved_doc.get_first(lon_field).and_then(|v| v.as_f64());
if let (Some(lat), Some(lon)) = (lat, lon) {
doc_fields.insert(field_name.clone(), format!("{},{}", lat, lon));
}
}
}
}
}
documents.push(SearchDocument {
fields: doc_fields,
score,
});
}
Ok(SearchResults {
total: total_hits,
documents,
})
}
pub fn get_info(&self) -> Result<IndexInfo, DBError> {
let searcher = self.reader.searcher();
let num_docs = searcher.num_docs();
let fields_info: Vec<FieldInfo> = self
.index_schema
.fields
.iter()
.map(|(name, (_, def))| FieldInfo {
name: name.clone(),
field_type: format!("{:?}", def),
})
.collect();
Ok(IndexInfo {
name: self.name.clone(),
num_docs,
fields: fields_info,
config: self.config.clone(),
})
}
/// Delete a document by its _id term. Returns true if the document existed before deletion.
pub fn delete_document_by_id(&self, doc_id: &str) -> Result<bool, DBError> {
// Determine existence by running a tiny term query
let existed = if let Some((id_field, _)) = self.index_schema.fields.get("_id") {
let term = Term::from_field_text(*id_field, doc_id);
let searcher = self.reader.searcher();
let tq = TermQuery::new(term.clone(), IndexRecordOption::Basic);
let hits = searcher
.search(&tq, &TopDocs::with_limit(1))
.map_err(|e| DBError(format!("Failed to search for existing doc: {}", e)))?;
!hits.is_empty()
} else {
false
};
// Perform deletion and commit
let mut writer = self
.writer
.write()
.map_err(|e| DBError(format!("Failed to acquire writer lock: {}", e)))?;
if let Some((id_field, _)) = self.index_schema.fields.get("_id") {
writer.delete_term(Term::from_field_text(*id_field, doc_id));
}
writer
.commit()
.map_err(|e| DBError(format!("Failed to commit delete: {}", e)))?;
// Refresh reader to observe deletion
self.reader
.reload()
.map_err(|e| DBError(format!("Failed to reload reader: {}", e)))?;
Ok(existed)
}
}
#[derive(Debug, Clone)]
pub struct SearchOptions {
pub limit: usize,
pub offset: usize,
pub filters: Vec<Filter>,
pub sort_by: Option<String>,
pub return_fields: Option<Vec<String>>,
pub highlight: bool,
}
impl Default for SearchOptions {
fn default() -> Self {
SearchOptions {
limit: 10,
offset: 0,
filters: vec![],
sort_by: None,
return_fields: None,
highlight: false,
}
}
}
#[derive(Debug, Clone)]
pub struct Filter {
pub field: String,
pub filter_type: FilterType,
}
#[derive(Debug, Clone)]
pub enum FilterType {
Equals(String),
Range { min: String, max: String },
InSet(Vec<String>),
}
#[derive(Debug)]
pub struct SearchResults {
pub total: usize,
pub documents: Vec<SearchDocument>,
}
#[derive(Debug)]
pub struct SearchDocument {
pub fields: HashMap<String, String>,
pub score: f32,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct IndexInfo {
pub name: String,
pub num_docs: u64,
pub fields: Vec<FieldInfo>,
pub config: IndexConfig,
}
#[derive(Debug, Serialize, Deserialize)]
pub struct FieldInfo {
pub name: String,
pub field_type: String,
}
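// Minimal usage sketch (illustrative; error handling elided):
//   let ts = TantivySearch::new_with_schema(
//       PathBuf::from("/tmp/idx"),
//       "books".to_string(),
//       vec![("title".to_string(),
//             FieldDef::Text { stored: true, indexed: true, tokenized: true, fast: false })],
//       None,
//   )?;
//   ts.add_document_with_fields("1", HashMap::from([("title".to_string(), "Dune".to_string())]))?;
//   let res = ts.search_with_options("dune", SearchOptions::default())?;
//   assert_eq!(res.total, 1);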

View File

@@ -1,10 +1,11 @@
#!/bin/bash
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# Test script for HeroDB - Redis-compatible database with redb backend
# This script starts the server and runs comprehensive tests
set -e
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'

View File

@@ -1,4 +1,5 @@
use herodb::{server::Server, options::DBOption};
use std::path::PathBuf;
use std::time::Duration;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::TcpStream;
@@ -22,12 +23,13 @@ async fn debug_hset_simple() {
let port = 16500;
let option = DBOption {
dir: test_dir.to_string(),
dir: PathBuf::from(test_dir),
port,
debug: false,
encrypt: false,
encryption_key: None,
backend: herodb::options::BackendType::Redb,
admin_secret: "test-admin".to_string(),
};
let mut server = Server::new(option).await;
@@ -48,6 +50,12 @@ async fn debug_hset_simple() {
sleep(Duration::from_millis(200)).await;
let mut stream = TcpStream::connect(format!("127.0.0.1:{}", port)).await.unwrap();
// Acquire ReadWrite permissions on this connection
let resp = send_command(
&mut stream,
"*4\r\n$6\r\nSELECT\r\n$1\r\n0\r\n$3\r\nKEY\r\n$10\r\ntest-admin\r\n",
).await;
assert!(resp.contains("OK"), "Failed SELECT handshake: {}", resp);
// Test simple HSET
println!("Testing HSET...");

View File

@@ -1,4 +1,5 @@
use herodb::{server::Server, options::DBOption};
use std::path::PathBuf;
use std::time::Duration;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::TcpStream;
@@ -13,12 +14,13 @@ async fn debug_hset_return_value() {
std::fs::create_dir_all(&test_dir).unwrap();
let option = DBOption {
dir: test_dir.to_string(),
dir: PathBuf::from(test_dir),
port: 16390,
debug: false,
encrypt: false,
encryption_key: None,
backend: herodb::options::BackendType::Redb,
admin_secret: "test-admin".to_string(),
};
let mut server = Server::new(option).await;
@@ -41,11 +43,18 @@ async fn debug_hset_return_value() {
// Connect and test HSET
let mut stream = TcpStream::connect("127.0.0.1:16390").await.unwrap();
// Acquire ReadWrite permissions for this new connection
let handshake = "*4\r\n$6\r\nSELECT\r\n$1\r\n0\r\n$3\r\nKEY\r\n$10\r\ntest-admin\r\n";
stream.write_all(handshake.as_bytes()).await.unwrap();
let mut buffer = [0; 1024];
let n = stream.read(&mut buffer).await.unwrap();
let resp = String::from_utf8_lossy(&buffer[..n]);
assert!(resp.contains("OK"), "Failed SELECT handshake: {}", resp);
// Send HSET command
let cmd = "*4\r\n$4\r\nHSET\r\n$4\r\nhash\r\n$6\r\nfield1\r\n$6\r\nvalue1\r\n";
stream.write_all(cmd.as_bytes()).await.unwrap();
let mut buffer = [0; 1024];
let n = stream.read(&mut buffer).await.unwrap();
let response = String::from_utf8_lossy(&buffer[..n]);

View File

@@ -0,0 +1,484 @@
use redis::{Client, Connection, RedisResult, Value};
use std::process::{Child, Command};
use std::time::Duration;
use jsonrpsee::http_client::{HttpClient, HttpClientBuilder};
use herodb::rpc::{BackendType, DatabaseConfig, RpcClient};
use base64::Engine;
use tokio::time::sleep;
// ------------------------
// Helpers
// ------------------------
fn get_redis_connection(port: u16) -> Connection {
let connection_info = format!("redis://127.0.0.1:{}", port);
let client = Client::open(connection_info).unwrap();
let mut attempts = 0;
loop {
        match client.get_connection() {
            Ok(mut conn) => {
                if redis::cmd("PING").query::<String>(&mut conn).is_ok() {
                    return conn;
                }
            }
            Err(e) => {
                if attempts >= 3600 {
                    panic!("Failed to connect to Redis server after 3600 attempts: {}", e);
                }
            }
        }
        // Count failed PINGs as well as failed connects so a half-started
        // server cannot keep this loop spinning forever.
        attempts += 1;
        if attempts >= 3600 {
            panic!("Redis server did not answer PING after 3600 attempts");
        }
        std::thread::sleep(Duration::from_millis(500));
}
}
async fn get_rpc_client(port: u16) -> HttpClient {
let url = format!("http://127.0.0.1:{}", port + 1); // RPC port = Redis port + 1
HttpClientBuilder::default().build(url).unwrap()
}
/// Wait until the RPC server is responsive (getServerStats succeeds), or panic once the retries are exhausted.
async fn wait_for_rpc_ready(client: &HttpClient, max_attempts: u32, delay: Duration) {
for _ in 0..max_attempts {
match client.get_server_stats().await {
Ok(_) => return,
Err(_) => {
sleep(delay).await;
}
}
}
panic!("RPC server did not become ready in time");
}
// A guard that kills the server process when it goes out of scope and cleans up the test directory.
struct ServerProcessGuard {
process: Child,
test_dir: String,
}
impl Drop for ServerProcessGuard {
fn drop(&mut self) {
eprintln!("Killing server process (pid: {})...", self.process.id());
if let Err(e) = self.process.kill() {
eprintln!("Failed to kill server process: {}", e);
}
match self.process.wait() {
Ok(status) => eprintln!("Server process exited with: {}", status),
Err(e) => eprintln!("Failed to wait on server process: {}", e),
}
// Clean up the specific test directory
eprintln!("Cleaning up test directory: {}", self.test_dir);
if let Err(e) = std::fs::remove_dir_all(&self.test_dir) {
eprintln!("Failed to clean up test directory: {}", e);
}
}
}
// Helper to set up the server and return guard + ports
async fn setup_server() -> (ServerProcessGuard, u16) {
use std::sync::atomic::{AtomicU16, Ordering};
static PORT_COUNTER: AtomicU16 = AtomicU16::new(17500);
let port = PORT_COUNTER.fetch_add(1, Ordering::SeqCst);
let test_dir = format!("/tmp/herodb_lance_test_{}", port);
// Clean up previous test data
if std::path::Path::new(&test_dir).exists() {
let _ = std::fs::remove_dir_all(&test_dir);
}
std::fs::create_dir_all(&test_dir).unwrap();
// Start the server in a subprocess with RPC enabled (follows tantivy test pattern)
let child = Command::new("cargo")
.args(&[
"run",
"--",
"--dir",
&test_dir,
"--port",
&port.to_string(),
"--rpc-port",
&(port + 1).to_string(),
"--enable-rpc",
"--debug",
"--admin-secret",
"test-admin",
])
.spawn()
.expect("Failed to start server process");
let guard = ServerProcessGuard {
process: child,
test_dir,
};
// Give the server time to build and start (cargo run may compile first)
// Increase significantly to accommodate first-time dependency compilation in CI.
std::thread::sleep(Duration::from_millis(60000));
(guard, port)
}
// Convenient helpers for assertions on redis::Value
fn value_is_ok(v: &Value) -> bool {
match v {
Value::Okay => true,
Value::Status(s) if s == "OK" => true,
Value::Data(d) if d == b"OK" => true,
_ => false,
}
}
fn value_is_int_eq(v: &Value, expected: i64) -> bool {
matches!(v, Value::Int(n) if *n == expected)
}
fn value_is_str_eq(v: &Value, expected: &str) -> bool {
match v {
Value::Status(s) => s == expected,
Value::Data(d) => String::from_utf8_lossy(d) == expected,
_ => false,
}
}
fn to_string_lossy(v: &Value) -> String {
match v {
Value::Nil => "Nil".to_string(),
Value::Int(n) => n.to_string(),
Value::Status(s) => s.clone(),
Value::Okay => "OK".to_string(),
Value::Data(d) => String::from_utf8_lossy(d).to_string(),
Value::Bulk(items) => {
let inner: Vec<String> = items.iter().map(to_string_lossy).collect();
format!("[{}]", inner.join(", "))
}
}
}
// Extract ids from a LANCE.SEARCH / LANCE.SEARCHIMAGE reply, which is:
// Array of elements: [ [id, score, [k,v,...]], [id, score, ...], ... ]
fn extract_hit_ids(v: &Value) -> Vec<String> {
let mut ids = Vec::new();
if let Value::Bulk(items) = v {
for item in items {
if let Value::Bulk(row) = item {
if !row.is_empty() {
// first element is id (Data or Status)
let id = match &row[0] {
Value::Data(d) => String::from_utf8_lossy(d).to_string(),
Value::Status(s) => s.clone(),
Value::Int(n) => n.to_string(),
_ => continue,
};
ids.push(id);
}
}
}
}
ids
}
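// Shape check for the parser above (a sketch mirroring the reply layout
// described in the comment): a single hit [id, score, meta] yields its id.
#[allow(dead_code)]
fn extract_hit_ids_shape_check() {
    let reply = Value::Bulk(vec![Value::Bulk(vec![
        Value::Data(b"doc-1".to_vec()),
        Value::Status("0.42".to_string()),
        Value::Bulk(vec![]),
    ])]);
    assert_eq!(extract_hit_ids(&reply), vec!["doc-1".to_string()]);
}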
// Check whether a Bulk array (RESP array) contains a given string element.
fn bulk_contains_string(v: &Value, needle: &str) -> bool {
match v {
Value::Bulk(items) => items.iter().any(|it| match it {
Value::Data(d) => String::from_utf8_lossy(d).contains(needle),
Value::Status(s) => s.contains(needle),
Value::Bulk(_) => bulk_contains_string(it, needle),
_ => false,
}),
_ => false,
}
}
// ------------------------
// Test: Lance end-to-end (RESP) using only local embedders
// ------------------------
#[tokio::test]
async fn test_lance_end_to_end() {
let (_guard, port) = setup_server().await;
    // First, wait for RESP to be available; this also gives the cargo-run child ample time to finish building.
// Reuse the helper that retries PING until success.
{
let _conn_ready = get_redis_connection(port);
// Drop immediately; we only needed readiness.
}
// Build RPC client and create a Lance DB
let rpc_client = get_rpc_client(port).await;
    // Ensure the RPC server is listening before we issue createDatabase (allow a longer warm-up to absorb first-build costs)
wait_for_rpc_ready(&rpc_client, 3600, Duration::from_millis(250)).await;
let db_config = DatabaseConfig {
name: Some("media-db".to_string()),
storage_path: None,
max_size: None,
redis_version: None,
};
let db_id = rpc_client
.create_database(BackendType::Lance, db_config, None)
.await
.expect("create_database Lance failed");
assert_eq!(db_id, 1, "Expected first Lance DB id to be 1");
// Add access keys
let _ = rpc_client
.add_access_key(db_id, "readwrite_key".to_string(), "readwrite".to_string())
.await
.expect("add_access_key readwrite failed");
let _ = rpc_client
.add_access_key(db_id, "read_key".to_string(), "read".to_string())
.await
.expect("add_access_key read failed");
// Connect to Redis and SELECT DB with readwrite key
let mut conn = get_redis_connection(port);
let sel_ok: RedisResult<String> = redis::cmd("SELECT")
.arg(db_id)
.arg("KEY")
.arg("readwrite_key")
.query(&mut conn);
assert!(sel_ok.is_ok(), "SELECT db with key failed: {:?}", sel_ok);
assert_eq!(sel_ok.unwrap(), "OK");
// 1) Configure embedding providers: textset -> testhash dim 64, imageset -> testimagehash dim 512
let v = redis::cmd("LANCE.EMBEDDING")
.arg("CONFIG")
.arg("SET")
.arg("textset")
.arg("PROVIDER")
.arg("testhash")
.arg("MODEL")
.arg("any")
.arg("PARAM")
.arg("dim")
.arg("64")
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "Embedding config set (text) not OK: {}", to_string_lossy(&v));
let v = redis::cmd("LANCE.EMBEDDING")
.arg("CONFIG")
.arg("SET")
.arg("imageset")
.arg("PROVIDER")
.arg("testimagehash")
.arg("MODEL")
.arg("any")
.arg("PARAM")
.arg("dim")
.arg("512")
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "Embedding config set (image) not OK: {}", to_string_lossy(&v));
// 2) Create datasets
let v = redis::cmd("LANCE.CREATE")
.arg("textset")
.arg("DIM")
.arg(64)
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "LANCE.CREATE textset failed: {}", to_string_lossy(&v));
let v = redis::cmd("LANCE.CREATE")
.arg("imageset")
.arg("DIM")
.arg(512)
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "LANCE.CREATE imageset failed: {}", to_string_lossy(&v));
// 3) Store two text documents
let v = redis::cmd("LANCE.STORE")
.arg("textset")
.arg("ID")
.arg("doc-1")
.arg("TEXT")
.arg("The quick brown fox jumps over the lazy dog")
.arg("META")
.arg("title")
.arg("Fox")
.arg("category")
.arg("animal")
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "LANCE.STORE doc-1 failed: {}", to_string_lossy(&v));
let v = redis::cmd("LANCE.STORE")
.arg("textset")
.arg("ID")
.arg("doc-2")
.arg("TEXT")
.arg("A fast auburn fox vaulted a sleepy canine")
.arg("META")
.arg("title")
.arg("Paraphrase")
.arg("category")
.arg("animal")
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "LANCE.STORE doc-2 failed: {}", to_string_lossy(&v));
    // 4) Store two images via BYTES (local fake bytes; the test embedder hashes the raw bytes and never decodes them as images)
let img1: Vec<u8> = b"local-image-bytes-1-abcdefghijklmnopqrstuvwxyz".to_vec();
let img2: Vec<u8> = b"local-image-bytes-2-ABCDEFGHIJKLMNOPQRSTUVWXYZ".to_vec();
let img1_b64 = base64::engine::general_purpose::STANDARD.encode(&img1);
let img2_b64 = base64::engine::general_purpose::STANDARD.encode(&img2);
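    // Round-trip sanity check (sketch): decoding the base64 argument must
    // reproduce the exact bytes the server is expected to embed.
    assert_eq!(
        base64::engine::general_purpose::STANDARD.decode(&img1_b64).unwrap(),
        img1
    );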
let v = redis::cmd("LANCE.STOREIMAGE")
.arg("imageset")
.arg("ID")
.arg("img-1")
.arg("BYTES")
.arg(&img1_b64)
.arg("META")
.arg("title")
.arg("Local1")
.arg("group")
.arg("demo")
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "LANCE.STOREIMAGE img-1 failed: {}", to_string_lossy(&v));
let v = redis::cmd("LANCE.STOREIMAGE")
.arg("imageset")
.arg("ID")
.arg("img-2")
.arg("BYTES")
.arg(&img2_b64)
.arg("META")
.arg("title")
.arg("Local2")
.arg("group")
.arg("demo")
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "LANCE.STOREIMAGE img-2 failed: {}", to_string_lossy(&v));
// 5) Search text: K 2 QUERY "quick brown fox" RETURN 1 title
let v = redis::cmd("LANCE.SEARCH")
.arg("textset")
.arg("K")
.arg(2)
.arg("QUERY")
.arg("quick brown fox")
.arg("RETURN")
.arg(1)
.arg("title")
.query::<Value>(&mut conn)
.unwrap();
// Should be an array of hits
let ids = extract_hit_ids(&v);
assert!(
ids.contains(&"doc-1".to_string()) || ids.contains(&"doc-2".to_string()),
"LANCE.SEARCH should return doc-1/doc-2; got: {}",
to_string_lossy(&v)
);
// With FILTER on category
let v = redis::cmd("LANCE.SEARCH")
.arg("textset")
.arg("K")
.arg(2)
.arg("QUERY")
.arg("fox jumps")
.arg("FILTER")
.arg("category = 'animal'")
.arg("RETURN")
.arg(1)
.arg("title")
.query::<Value>(&mut conn)
.unwrap();
let ids_f = extract_hit_ids(&v);
assert!(
!ids_f.is_empty(),
"Filtered LANCE.SEARCH should return at least one document; got: {}",
to_string_lossy(&v)
);
// 6) Search images with QUERYBYTES
let query_img: Vec<u8> = b"local-image-query-3-1234567890".to_vec();
let query_img_b64 = base64::engine::general_purpose::STANDARD.encode(&query_img);
let v = redis::cmd("LANCE.SEARCHIMAGE")
.arg("imageset")
.arg("K")
.arg(2)
.arg("QUERYBYTES")
.arg(&query_img_b64)
.arg("RETURN")
.arg(1)
.arg("title")
.query::<Value>(&mut conn)
.unwrap();
// Should get 2 hits (img-1 and img-2) in some order; assert array non-empty
let img_ids = extract_hit_ids(&v);
assert!(
!img_ids.is_empty(),
"LANCE.SEARCHIMAGE should return non-empty results; got: {}",
to_string_lossy(&v)
);
// 7) Inspect datasets
let v = redis::cmd("LANCE.LIST").query::<Value>(&mut conn).unwrap();
assert!(
bulk_contains_string(&v, "textset"),
"LANCE.LIST missing textset: {}",
to_string_lossy(&v)
);
assert!(
bulk_contains_string(&v, "imageset"),
"LANCE.LIST missing imageset: {}",
to_string_lossy(&v)
);
// INFO textset
let info_text = redis::cmd("LANCE.INFO")
.arg("textset")
.query::<Value>(&mut conn)
.unwrap();
// INFO returns Array [k,v,k,v,...] including "dimension" "64" and "row_count" "...".
let info_str = to_string_lossy(&info_text);
assert!(
info_str.contains("dimension") && info_str.contains("64"),
"LANCE.INFO textset should include dimension 64; got: {}",
info_str
);
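    // A less brittle alternative to substring checks (sketch; assumes the
    // reply alternates key/value as described above): flatten it into a map.
    fn info_pairs(v: &Value) -> std::collections::HashMap<String, String> {
        let mut map = std::collections::HashMap::new();
        if let Value::Bulk(items) = v {
            for pair in items.chunks(2) {
                if let [k, val] = pair {
                    map.insert(to_string_lossy(k), to_string_lossy(val));
                }
            }
        }
        map
    }
    assert_eq!(
        info_pairs(&info_text).get("dimension").map(String::as_str),
        Some("64")
    );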
// 8) Delete by id and drop datasets
let v = redis::cmd("LANCE.DEL")
.arg("textset")
.arg("doc-2")
.query::<Value>(&mut conn)
.unwrap();
// Returns SimpleString "1" or Int 1 depending on encoding path; accept either
assert!(
value_is_int_eq(&v, 1) || value_is_str_eq(&v, "1"),
"LANCE.DEL doc-2 expected 1; got {}",
to_string_lossy(&v)
);
let v = redis::cmd("LANCE.DROP")
.arg("textset")
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "LANCE.DROP textset failed: {}", to_string_lossy(&v));
let v = redis::cmd("LANCE.DROP")
.arg("imageset")
.query::<Value>(&mut conn)
.unwrap();
assert!(value_is_ok(&v), "LANCE.DROP imageset failed: {}", to_string_lossy(&v));
}

View File

@@ -12,7 +12,15 @@ fn get_redis_connection(port: u16) -> Connection {
match client.get_connection() {
Ok(mut conn) => {
if redis::cmd("PING").query::<String>(&mut conn).is_ok() {
return conn;
// Acquire ReadWrite permissions on this connection
let sel: RedisResult<String> = redis::cmd("SELECT")
.arg(0)
.arg("KEY")
.arg("test-admin")
.query(&mut conn);
if sel.is_ok() {
return conn;
}
}
}
Err(e) => {
@@ -78,6 +86,8 @@ fn setup_server() -> (ServerProcessGuard, u16) {
"--port",
&port.to_string(),
"--debug",
"--admin-secret",
"test-admin",
])
.spawn()
.expect("Failed to start server process");

View File

@@ -1,4 +1,5 @@
use herodb::{server::Server, options::DBOption};
use std::path::PathBuf;
use std::time::Duration;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::TcpStream;
@@ -17,24 +18,35 @@ async fn start_test_server(test_name: &str) -> (Server, u16) {
std::fs::create_dir_all(&test_dir).unwrap();
let option = DBOption {
dir: test_dir,
dir: PathBuf::from(test_dir),
port,
debug: true,
encrypt: false,
encryption_key: None,
backend: herodb::options::BackendType::Redb,
admin_secret: "test-admin".to_string(),
};
let server = Server::new(option).await;
(server, port)
}
// Helper function to connect to the test server
async fn connect_to_server(port: u16) -> TcpStream {
let mut attempts = 0;
loop {
match TcpStream::connect(format!("127.0.0.1:{}", port)).await {
Ok(stream) => return stream,
Ok(mut stream) => {
// Obtain ReadWrite permissions for this connection by selecting DB 0 with admin key
let resp = send_command(
&mut stream,
"*4\r\n$6\r\nSELECT\r\n$1\r\n0\r\n$3\r\nKEY\r\n$10\r\ntest-admin\r\n",
).await;
if !resp.contains("OK") {
panic!("Failed to acquire write permissions via SELECT 0 KEY test-admin: {}", resp);
}
return stream;
}
Err(_) if attempts < 10 => {
attempts += 1;
sleep(Duration::from_millis(100)).await;

tests/rpc_tests.rs
View File

@@ -0,0 +1,86 @@
use herodb::rpc::{BackendType, DatabaseConfig};
use herodb::admin_meta;
use herodb::options::BackendType as OptionsBackendType;
use std::path::Path;
#[tokio::test]
async fn test_rpc_server_basic() {
// A full test would require starting the RPC server in a separate thread;
// for now, just check that the types serialize correctly
// Test serialization of types
let backend = BackendType::Redb;
let config = DatabaseConfig {
name: Some("test_db".to_string()),
storage_path: Some("/tmp/test".to_string()),
max_size: Some(1024 * 1024),
redis_version: Some("7.0".to_string()),
};
let backend_json = serde_json::to_string(&backend).unwrap();
let config_json = serde_json::to_string(&config).unwrap();
assert_eq!(backend_json, "\"Redb\"");
assert!(config_json.contains("test_db"));
}
#[tokio::test]
async fn test_database_config_serialization() {
let config = DatabaseConfig {
name: Some("my_db".to_string()),
storage_path: None,
max_size: Some(1000000),
redis_version: Some("7.0".to_string()),
};
let json = serde_json::to_value(&config).unwrap();
assert_eq!(json["name"], "my_db");
assert_eq!(json["max_size"], 1000000);
assert_eq!(json["redis_version"], "7.0");
}
#[tokio::test]
async fn test_backend_type_serialization() {
// Test that both Redb and Sled backends serialize correctly
let redb_backend = BackendType::Redb;
let sled_backend = BackendType::Sled;
let redb_json = serde_json::to_string(&redb_backend).unwrap();
let sled_json = serde_json::to_string(&sled_backend).unwrap();
assert_eq!(redb_json, "\"Redb\"");
assert_eq!(sled_json, "\"Sled\"");
// Test deserialization
let redb_deserialized: BackendType = serde_json::from_str(&redb_json).unwrap();
let sled_deserialized: BackendType = serde_json::from_str(&sled_json).unwrap();
assert!(matches!(redb_deserialized, BackendType::Redb));
assert!(matches!(sled_deserialized, BackendType::Sled));
}
#[tokio::test]
async fn test_database_name_persistence() {
let base_dir = "/tmp/test_db_name_persistence";
let admin_secret = "test-admin-secret";
let backend = OptionsBackendType::Redb;
let db_id = 1;
let test_name = "test-database-name";
// Clean up any existing test data
let _ = std::fs::remove_dir_all(base_dir);
// Set the database name
admin_meta::set_database_name(Path::new(base_dir), backend.clone(), admin_secret, db_id, test_name)
.expect("Failed to set database name");
// Retrieve the database name
let retrieved_name = admin_meta::get_database_name(Path::new(base_dir), backend, admin_secret, db_id)
.expect("Failed to get database name");
// Verify the name matches
assert_eq!(retrieved_name, Some(test_name.to_string()));
// Clean up
let _ = std::fs::remove_dir_all(base_dir);
}

View File

@@ -1,4 +1,5 @@
use herodb::{server::Server, options::DBOption};
use std::path::PathBuf;
use std::time::Duration;
use tokio::time::sleep;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
@@ -19,12 +20,13 @@ async fn start_test_server(test_name: &str) -> (Server, u16) {
std::fs::create_dir_all(&test_dir).unwrap();
let option = DBOption {
dir: test_dir,
dir: PathBuf::from(test_dir),
port,
debug: true,
encrypt: false,
encryption_key: None,
backend: herodb::options::BackendType::Redb,
admin_secret: "test-admin".to_string(),
};
let server = Server::new(option).await;
@@ -34,9 +36,16 @@ async fn start_test_server(test_name: &str) -> (Server, u16) {
// Helper function to send Redis command and get response
async fn send_redis_command(port: u16, command: &str) -> String {
let mut stream = TcpStream::connect(format!("127.0.0.1:{}", port)).await.unwrap();
// Acquire ReadWrite permissions on this new connection
let handshake = "*4\r\n$6\r\nSELECT\r\n$1\r\n0\r\n$3\r\nKEY\r\n$10\r\ntest-admin\r\n";
stream.write_all(handshake.as_bytes()).await.unwrap();
let mut buffer = [0; 1024];
let _ = stream.read(&mut buffer).await.unwrap(); // Read and ignore the OK for handshake
// Now send the intended command
stream.write_all(command.as_bytes()).await.unwrap();
let mut buffer = [0; 1024];
let n = stream.read(&mut buffer).await.unwrap();
String::from_utf8_lossy(&buffer[..n]).to_string()
}
@@ -184,12 +193,19 @@ async fn test_transaction_operations() {
sleep(Duration::from_millis(100)).await;
// Use a single connection for the transaction
let mut stream = TcpStream::connect(format!("127.0.0.1:{}", port)).await.unwrap();
// Acquire write permissions for this connection
let handshake = "*4\r\n$6\r\nSELECT\r\n$1\r\n0\r\n$3\r\nKEY\r\n$10\r\ntest-admin\r\n";
stream.write_all(handshake.as_bytes()).await.unwrap();
let mut buffer = [0; 1024];
let n = stream.read(&mut buffer).await.unwrap();
let resp = String::from_utf8_lossy(&buffer[..n]);
assert!(resp.contains("OK"));
// Test MULTI
stream.write_all("*1\r\n$5\r\nMULTI\r\n".as_bytes()).await.unwrap();
let mut buffer = [0; 1024];
let n = stream.read(&mut buffer).await.unwrap();
let response = String::from_utf8_lossy(&buffer[..n]);
assert!(response.contains("OK"));

View File

@@ -1,4 +1,5 @@
use herodb::{server::Server, options::DBOption};
use std::path::PathBuf;
use std::time::Duration;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::TcpStream;
@@ -17,12 +18,13 @@ async fn start_test_server(test_name: &str) -> (Server, u16) {
std::fs::create_dir_all(&test_dir).unwrap();
let option = DBOption {
dir: test_dir,
dir: PathBuf::from(test_dir),
port,
debug: false,
encrypt: false,
encryption_key: None,
backend: herodb::options::BackendType::Redb,
admin_secret: "test-admin".to_string(),
};
let server = Server::new(option).await;
@@ -38,12 +40,22 @@ async fn send_command(stream: &mut TcpStream, command: &str) -> String {
String::from_utf8_lossy(&buffer[..n]).to_string()
}
// Helper function to connect to the test server
async fn connect_to_server(port: u16) -> TcpStream {
let mut attempts = 0;
loop {
match TcpStream::connect(format!("127.0.0.1:{}", port)).await {
Ok(stream) => return stream,
Ok(mut stream) => {
// Acquire ReadWrite permissions for this connection
let resp = send_command(
&mut stream,
"*4\r\n$6\r\nSELECT\r\n$1\r\n0\r\n$3\r\nKEY\r\n$10\r\ntest-admin\r\n",
).await;
if !resp.contains("OK") {
panic!("Failed to acquire write permissions via SELECT 0 KEY test-admin: {}", resp);
}
return stream;
}
Err(_) if attempts < 10 => {
attempts += 1;
sleep(Duration::from_millis(100)).await;
@@ -98,13 +110,20 @@ async fn test_hset_clean_db() {
let mut stream = connect_to_server(port).await;
// Test HSET - should return 1 for new field
let response = send_command(&mut stream, "*4\r\n$4\r\nHSET\r\n$4\r\nhash\r\n$6\r\nfield1\r\n$6\r\nvalue1\r\n").await;
// Ensure clean DB state (admin DB 0 may be shared due to global singleton)
let flush = send_command(&mut stream, "*1\r\n$7\r\nFLUSHDB\r\n").await;
assert!(flush.contains("OK"), "Failed to FLUSHDB: {}", flush);
// Test HSET - should return 1 for new field (use a unique key name to avoid collisions)
let key = "hash_clean";
let hset_cmd = format!("*4\r\n$4\r\nHSET\r\n${}\r\n{}\r\n$6\r\nfield1\r\n$6\r\nvalue1\r\n", key.len(), key);
let response = send_command(&mut stream, &hset_cmd).await;
println!("HSET response: {}", response);
assert!(response.contains("1"), "Expected HSET to return 1, got: {}", response);
// Test HGET
let response = send_command(&mut stream, "*3\r\n$4\r\nHGET\r\n$4\r\nhash\r\n$6\r\nfield1\r\n").await;
let hget_cmd = format!("*3\r\n$4\r\nHGET\r\n${}\r\n{}\r\n$6\r\nfield1\r\n", key.len(), key);
let response = send_command(&mut stream, &hget_cmd).await;
println!("HGET response: {}", response);
assert!(response.contains("value1"));
}

View File

@@ -0,0 +1,294 @@
use redis::{Client, Connection, RedisResult};
use std::process::{Child, Command};
use std::time::Duration;
use jsonrpsee::http_client::{HttpClientBuilder, HttpClient};
use herodb::rpc::{RpcClient, BackendType, DatabaseConfig};
// Helper function to get Redis connection, retrying until successful
fn get_redis_connection(port: u16) -> Connection {
let connection_info = format!("redis://127.0.0.1:{}", port);
let client = Client::open(connection_info).unwrap();
let mut attempts = 0;
loop {
        match client.get_connection() {
            Ok(mut conn) => {
                if redis::cmd("PING").query::<String>(&mut conn).is_ok() {
                    return conn;
                }
            }
            Err(e) => {
                if attempts >= 120 {
                    panic!(
                        "Failed to connect to Redis server after 120 attempts: {}",
                        e
                    );
                }
            }
        }
        // Count failed PINGs as well as failed connects so a half-started
        // server cannot keep this loop spinning forever.
        attempts += 1;
        if attempts >= 120 {
            panic!("Redis server did not answer PING after 120 attempts");
        }
        std::thread::sleep(Duration::from_millis(100));
}
}
// Helper function to get RPC client
async fn get_rpc_client(port: u16) -> HttpClient {
let url = format!("http://127.0.0.1:{}", port + 1); // RPC port is Redis port + 1
    HttpClientBuilder::default().build(url).unwrap()
}
// A guard to ensure the server process is killed when it goes out of scope
struct ServerProcessGuard {
process: Child,
test_dir: String,
}
impl Drop for ServerProcessGuard {
fn drop(&mut self) {
println!("Killing server process (pid: {})...", self.process.id());
if let Err(e) = self.process.kill() {
eprintln!("Failed to kill server process: {}", e);
}
match self.process.wait() {
Ok(status) => println!("Server process exited with: {}", status),
Err(e) => eprintln!("Failed to wait on server process: {}", e),
}
// Clean up the specific test directory
println!("Cleaning up test directory: {}", self.test_dir);
if let Err(e) = std::fs::remove_dir_all(&self.test_dir) {
eprintln!("Failed to clean up test directory: {}", e);
}
}
}
// Helper to set up the server and return connections
async fn setup_server() -> (ServerProcessGuard, u16, Connection, HttpClient) {
use std::sync::atomic::{AtomicU16, Ordering};
static PORT_COUNTER: AtomicU16 = AtomicU16::new(16500);
let port = PORT_COUNTER.fetch_add(1, Ordering::SeqCst);
let test_dir = format!("/tmp/herodb_tantivy_test_{}", port);
// Clean up previous test data
if std::path::Path::new(&test_dir).exists() {
let _ = std::fs::remove_dir_all(&test_dir);
}
std::fs::create_dir_all(&test_dir).unwrap();
// Start the server in a subprocess
let child = Command::new("cargo")
.args(&[
"run",
"--",
"--dir",
&test_dir,
"--port",
&port.to_string(),
"--rpc-port",
&(port + 1).to_string(),
"--enable-rpc",
"--debug",
"--admin-secret",
"test-admin",
])
.spawn()
.expect("Failed to start server process");
// Create a new guard that also owns the test directory path
let guard = ServerProcessGuard {
process: child,
test_dir,
};
// Give the server time to build and start (cargo run may compile first)
std::thread::sleep(Duration::from_millis(3000));
let conn = get_redis_connection(port);
let rpc_client = get_rpc_client(port).await;
(guard, port, conn, rpc_client)
}
#[tokio::test]
async fn test_tantivy_full_text_search() {
let (_server_guard, _port, mut conn, rpc_client) = setup_server().await;
// Create a Tantivy database via RPC
let db_config = DatabaseConfig {
name: Some("test_tantivy_db".to_string()),
storage_path: None,
max_size: None,
redis_version: None,
};
let db_id = rpc_client.create_database(BackendType::Tantivy, db_config, None).await.unwrap();
assert_eq!(db_id, 1);
// Add readwrite access key
let _ = rpc_client.add_access_key(db_id, "readwrite_key".to_string(), "readwrite".to_string()).await.unwrap();
// Add read-only access key
let _ = rpc_client.add_access_key(db_id, "read_key".to_string(), "read".to_string()).await.unwrap();
// Test with readwrite permissions
test_tantivy_with_readwrite_permissions(&mut conn, db_id).await;
// Test with read-only permissions
test_tantivy_with_read_permissions(&mut conn, db_id).await;
// Test access denied for invalid key
test_tantivy_access_denied(&mut conn, db_id).await;
}
async fn test_tantivy_with_readwrite_permissions(conn: &mut Connection, db_id: u64) {
// Select database with readwrite key
let result: RedisResult<String> = redis::cmd("SELECT")
.arg(db_id)
.arg("KEY")
.arg("readwrite_key")
.query(conn);
assert!(result.is_ok());
assert_eq!(result.unwrap(), "OK");
// Test FT.CREATE
let result: RedisResult<String> = redis::cmd("FT.CREATE")
.arg("test_index")
.arg("SCHEMA")
.arg("title")
.arg("TEXT")
.arg("content")
.arg("TEXT")
.arg("tags")
.arg("TAG")
.query(conn);
assert!(result.is_ok());
assert_eq!(result.unwrap(), "OK");
// Test FT.ADD
let result: RedisResult<String> = redis::cmd("FT.ADD")
.arg("test_index")
.arg("doc1")
.arg("1.0")
.arg("title")
.arg("Hello World")
.arg("content")
.arg("This is a test document")
.arg("tags")
.arg("test,example")
.query(conn);
assert!(result.is_ok());
assert_eq!(result.unwrap(), "OK");
// Add another document
let result: RedisResult<String> = redis::cmd("FT.ADD")
.arg("test_index")
.arg("doc2")
.arg("1.0")
.arg("title")
.arg("Goodbye World")
.arg("content")
.arg("Another test document")
.arg("tags")
.arg("test,another")
.query(conn);
assert!(result.is_ok());
assert_eq!(result.unwrap(), "OK");
// Test FT.SEARCH
let result: RedisResult<Vec<String>> = redis::cmd("FT.SEARCH")
.arg("test_index")
.arg("test")
.query(conn);
assert!(result.is_ok());
let results = result.unwrap();
assert!(results.len() >= 3); // At least total count + 2 documents
assert_eq!(results[0], "2"); // Total matches
// Test FT.INFO
let result: RedisResult<Vec<String>> = redis::cmd("FT.INFO")
.arg("test_index")
.query(conn);
assert!(result.is_ok());
let info = result.unwrap();
assert!(info.contains(&"index_name".to_string()));
assert!(info.contains(&"test_index".to_string()));
// Test FT.DEL
let result: RedisResult<String> = redis::cmd("FT.DEL")
.arg("test_index")
.arg("doc1")
.query(conn);
assert!(result.is_ok());
assert_eq!(result.unwrap(), "1");
// Verify document was deleted
let result: RedisResult<Vec<String>> = redis::cmd("FT.SEARCH")
.arg("test_index")
.arg("Hello")
.query(conn);
assert!(result.is_ok());
let results = result.unwrap();
assert_eq!(results[0], "0"); // No matches
// Test FT.DROP
let result: RedisResult<String> = redis::cmd("FT.DROP")
.arg("test_index")
.query(conn);
assert!(result.is_ok());
assert_eq!(result.unwrap(), "OK");
// Verify index was dropped
let result: RedisResult<String> = redis::cmd("FT.INFO")
.arg("test_index")
.query(conn);
assert!(result.is_err()); // Should fail
}
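// The FT.SEARCH replies above are flat arrays: the total match count first,
// then the per-hit entries. A minimal split helper relying on that layout
// only (sketch, not part of the original test file):
#[allow(dead_code)]
fn split_search_reply(results: &[String]) -> (usize, &[String]) {
    let total = results.first().and_then(|s| s.parse().ok()).unwrap_or(0);
    (total, results.get(1..).unwrap_or(&[]))
}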
async fn test_tantivy_with_read_permissions(conn: &mut Connection, db_id: u64) {
// Select database with read-only key
let result: RedisResult<String> = redis::cmd("SELECT")
.arg(db_id)
.arg("KEY")
.arg("read_key")
.query(conn);
assert!(result.is_ok());
assert_eq!(result.unwrap(), "OK");
// Recreate index for testing
let result: RedisResult<String> = redis::cmd("FT.CREATE")
.arg("test_index_read")
.arg("SCHEMA")
.arg("title")
.arg("TEXT")
.query(conn);
assert!(result.is_err()); // Should fail due to read-only permissions
assert!(result.unwrap_err().to_string().contains("write permission denied"));
// Add document should fail
let result: RedisResult<String> = redis::cmd("FT.ADD")
.arg("test_index_read")
.arg("doc1")
.arg("1.0")
.arg("title")
.arg("Test")
.query(conn);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("write permission denied"));
    // Searching is permitted under the read-only key, but only against an index
    // that already exists; creating one would require switching back to the
    // readwrite key. Since no index exists at this point, a search here would
    // fail because the index is missing, not because of permissions.
}
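// A fuller version of the read-only check above (sketch; assumes the same
// SELECT/KEY semantics used throughout this file): create the index under the
// readwrite key, then confirm the read key can search but still cannot write.
#[allow(dead_code)]
fn read_key_can_search(conn: &mut Connection, db_id: u64) {
    let _: String = redis::cmd("SELECT").arg(db_id).arg("KEY").arg("readwrite_key").query(conn).unwrap();
    let _: String = redis::cmd("FT.CREATE").arg("idx").arg("SCHEMA").arg("title").arg("TEXT").query(conn).unwrap();
    let _: String = redis::cmd("SELECT").arg(db_id).arg("KEY").arg("read_key").query(conn).unwrap();
    // Reads are allowed under the read key...
    assert!(redis::cmd("FT.SEARCH").arg("idx").arg("hello").query::<Vec<String>>(conn).is_ok());
    // ...but writes are still denied.
    assert!(redis::cmd("FT.ADD").arg("idx").arg("d1").arg("1.0").arg("title").arg("x").query::<String>(conn).is_err());
}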
async fn test_tantivy_access_denied(conn: &mut Connection, db_id: u64) {
// Try to select with invalid key
let result: RedisResult<String> = redis::cmd("SELECT")
.arg(db_id)
.arg("KEY")
.arg("invalid_key")
.query(conn);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("invalid access key"));
}

View File

@@ -1,4 +1,5 @@
use herodb::{options::DBOption, server::Server};
use std::path::PathBuf;
use tokio::io::{AsyncReadExt, AsyncWriteExt};
use tokio::net::TcpStream;
use tokio::time::{sleep, Duration};
@@ -17,12 +18,13 @@ async fn start_test_server(test_name: &str) -> (Server, u16) {
std::fs::create_dir_all(&test_dir).unwrap();
let option = DBOption {
dir: test_dir,
dir: PathBuf::from(test_dir),
port,
debug: false,
encrypt: false,
encryption_key: None,
backend: herodb::options::BackendType::Redb,
admin_secret: "test-admin".to_string(),
};
let server = Server::new(option).await;
@@ -61,7 +63,17 @@ async fn connect(port: u16) -> TcpStream {
let mut attempts = 0;
loop {
match TcpStream::connect(format!("127.0.0.1:{}", port)).await {
Ok(s) => return s,
Ok(mut s) => {
// Acquire ReadWrite permissions for this connection using admin DB 0
let resp = send_cmd(&mut s, &["SELECT", "0", "KEY", "test-admin"]).await;
assert_contains(&resp, "OK", "SELECT 0 KEY test-admin handshake");
// Ensure clean slate per test on DB 0
let fl = send_cmd(&mut s, &["FLUSHDB"]).await;
assert_contains(&fl, "OK", "FLUSHDB after handshake");
return s;
}
Err(_) if attempts < 30 => {
attempts += 1;
sleep(Duration::from_millis(100)).await;
@@ -246,9 +258,9 @@ async fn test_01_connection_and_info() {
let getname = send_cmd(&mut s, &["CLIENT", "GETNAME"]).await;
assert_contains(&getname, "myapp", "CLIENT GETNAME");
// SELECT db
let sel = send_cmd(&mut s, &["SELECT", "0"]).await;
assert_contains(&sel, "OK", "SELECT 0");
// SELECT db (requires key on DB 0)
let sel = send_cmd(&mut s, &["SELECT", "0", "KEY", "test-admin"]).await;
assert_contains(&sel, "OK", "SELECT 0 with key");
// QUIT should close connection after sending OK
let quit = send_cmd(&mut s, &["QUIT"]).await;
@@ -280,6 +292,10 @@ async fn test_02_strings_and_expiry() {
let ex0 = send_cmd(&mut s, &["EXISTS", "user:1"]).await;
assert_contains(&ex0, "0", "EXISTS after DEL");
// DEL non-existent should return 0
let del0 = send_cmd(&mut s, &["DEL", "user:1"]).await;
assert_contains(&del0, "0", "DEL user:1 when not exists -> 0");
// INCR behavior
let i1 = send_cmd(&mut s, &["INCR", "count"]).await;
assert_contains(&i1, "1", "INCR new key -> 1");
@@ -501,11 +517,11 @@ async fn test_07_age_stateless_suite() {
let mut s = connect(port).await;
// GENENC -> [recipient, identity]
let gen = send_cmd(&mut s, &["AGE", "GENENC"]).await;
let genenc = send_cmd(&mut s, &["AGE", "GENENC"]).await;
assert!(
gen.starts_with("*2\r\n$"),
genenc.starts_with("*2\r\n$"),
"AGE GENENC should return array [recipient, identity], got:\n{}",
gen
genenc
);
// Parse simple RESP array of two bulk strings to extract keys
@@ -520,7 +536,7 @@ async fn test_07_age_stateless_suite() {
let ident = lines.next().unwrap_or("").to_string();
(recip, ident)
}
let (recipient, identity) = parse_two_bulk_array(&gen);
let (recipient, identity) = parse_two_bulk_array(&genenc);
assert!(
recipient.starts_with("age1") && identity.starts_with("AGE-SECRET-KEY-1"),
"Unexpected AGE key formats.\nrecipient: {}\nidentity: {}",
@@ -591,7 +607,7 @@ async fn test_08_age_persistent_named_suite() {
// AGE LIST
let lst = send_cmd(&mut s, &["AGE", "LIST"]).await;
assert_contains(&lst, "encpub", "AGE LIST label encpub");
// After flattening, LIST returns a flat array of managed key names
assert_contains(&lst, "app1", "AGE LIST includes app1");
}