move in radixtree and dedupstore

This commit is contained in:
timurgordon
2025-02-26 02:38:38 +03:00
parent 59efa18bce
commit 68d25d3622
4 changed files with 309 additions and 8 deletions

View File

@@ -0,0 +1,94 @@
# DedupeStore
DedupeStore is a content-addressable key-value store with built-in deduplication. It uses blake2b-160 content hashing to identify and deduplicate data, making it ideal for storing files or data blocks where the same content might appear multiple times.
## Features
- Content-based deduplication using blake2b-160 hashing
- Efficient storage using RadixTree for hash lookups
- Persistent storage using OurDB
- Maximum value size limit of 1MB
- Fast retrieval of data using content hash
- Automatic deduplication of identical content
## Usage
```v
import freeflowuniverse.herolib.data.dedupestor

fn main() ! {
	// Create a new dedupestore
	mut ds := dedupestor.new(
		path: 'path/to/store'
		reset: false // Set to true to reset existing data
	)!

	// Store some data
	data := 'Hello, World!'.bytes()
	hash := ds.store(data)!
	println('Stored data with hash: ${hash}')

	// Retrieve data using hash
	retrieved := ds.get(hash)!
	println('Retrieved data: ${retrieved.bytestr()}')

	// Check if data exists
	exists := ds.exists(hash)
	println('Data exists: ${exists}')

	// Attempting to store the same data again returns the same hash
	same_hash := ds.store(data)!
	assert hash == same_hash // True, data was deduplicated
}
```
## Implementation Details
DedupeStore uses two main components for storage:
1. **RadixTree**: Stores mappings from content hashes to data location IDs
2. **OurDB**: Stores the actual data blocks
When storing data:
1. The data is hashed using blake2b-160
2. If the hash exists in the RadixTree, the existing data location is returned
3. If the hash is new:
- Data is stored in OurDB, getting a new location ID
- Hash -> ID mapping is stored in RadixTree
- The hash is returned
When retrieving data:
1. The RadixTree is queried with the hash to get the data location ID
2. The data is retrieved from OurDB using the ID
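Condensed from `dedupestor.v` (included in this commit), the two paths reduce to a handful of calls:
```v
// store(): hash the value, return early on a dedupe hit, otherwise write
hash := blake2b.sum160(value).hex()
if _ := ds.radix.search(hash) {
	return hash // already stored, deduplicated
}
id := ds.data.set(data: value)!
ds.radix.insert(hash, u32_to_bytes(id))!

// get(): hash -> id bytes -> u32 id -> data block
id_bytes := ds.radix.search(hash)!
data := ds.data.get(bytes_to_u32(id_bytes))!
```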
## Size Limits
- Maximum value size: 1MB
- Attempting to store larger values will result in an error
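The limit is exported as `dedupestor.max_value_size`, so callers can check sizes up front. A minimal sketch, reusing `ds` from the usage example above:
```v
payload := []u8{len: 2 * 1024 * 1024} // 2MB, over the 1MB limit
if payload.len > dedupestor.max_value_size {
	println('payload too large, not storing')
} else {
	hash := ds.store(payload)!
	println('stored as ${hash}')
}
```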
## Error Handling
The store methods return results that should be handled with V's error handling:
```v
// Handle potential errors
if hash := ds.store(large_data) {
	// Success
	println('Stored with hash: ${hash}')
} else {
	// Error occurred
	println('Error: ${err}')
}
```
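When a failure should simply abort, V's `or` block handles it inline:
```v
hash := ds.store(data) or { panic('store failed: ${err}') }
```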
## Testing
The module includes comprehensive tests covering:
- Basic store/retrieve operations
- Deduplication functionality
- Size limit enforcement
- Edge cases
Run tests with:
```bash
v test lib/data/dedupestor/
```

View File

@@ -0,0 +1,99 @@
module dedupestor

import crypto.blake2b
import freeflowuniverse.herolib.data.radixtree
import freeflowuniverse.herolib.data.ourdb

pub const max_value_size = 1024 * 1024 // 1MB

// DedupeStore provides a key-value store with deduplication based on content hashing
pub struct DedupeStore {
mut:
	radix &radixtree.RadixTree // For storing hash -> id mappings
	data  &ourdb.OurDB // For storing the actual data
}

@[params]
pub struct NewArgs {
pub mut:
	path  string // Base path for the store
	reset bool // Whether to reset existing data
}

// new creates a new deduplication store
pub fn new(args NewArgs) !&DedupeStore {
	// Create the radixtree for hash -> id mapping
	mut rt := radixtree.new(
		path: '${args.path}/radixtree'
		reset: args.reset
	)!

	// Create the ourdb for actual data storage
	mut db := ourdb.new(
		path: '${args.path}/data'
		record_size_max: max_value_size
		incremental_mode: true // We want auto-incrementing IDs
		reset: args.reset
	)!

	return &DedupeStore{
		radix: rt
		data: &db
	}
}

// store stores a value and returns its hash
// If the value already exists (same hash), returns the existing hash without storing again
pub fn (mut ds DedupeStore) store(value []u8) !string {
	// Check size limit
	if value.len > max_value_size {
		return error('value size exceeds maximum allowed size of 1MB')
	}

	// Calculate the blake2b-160 hash of the value
	hash := blake2b.sum160(value).hex()

	// Check if this hash already exists
	if _ := ds.radix.search(hash) {
		// Value already exists, return the hash
		return hash
	}

	// Store the actual data in ourdb
	id := ds.data.set(data: value)!

	// Convert the id to bytes for storage in the radixtree
	id_bytes := u32_to_bytes(id)

	// Store the hash -> id mapping in the radixtree
	ds.radix.insert(hash, id_bytes)!

	return hash
}

// get retrieves a value by its hash
pub fn (mut ds DedupeStore) get(hash string) ![]u8 {
	// Get the ID from the radixtree
	id_bytes := ds.radix.search(hash)!

	// Convert the bytes back to a u32 id
	id := bytes_to_u32(id_bytes)

	// Get the actual data from ourdb
	return ds.data.get(id)!
}

// exists checks if a value with the given hash exists
pub fn (mut ds DedupeStore) exists(hash string) bool {
	return if _ := ds.radix.search(hash) { true } else { false }
}

// u32_to_bytes converts a u32 id to 4 bytes in little-endian order
fn u32_to_bytes(n u32) []u8 {
	return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)]
}

// bytes_to_u32 converts 4 little-endian bytes back to a u32 id
fn bytes_to_u32(b []u8) u32 {
	return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24)
}

View File

@@ -0,0 +1,108 @@
module dedupestor

import os

fn testsuite_begin() ! {
	// Ensure test directories exist and are clean
	test_dirs := [
		'/tmp/dedupestor_test',
		'/tmp/dedupestor_test_size',
		'/tmp/dedupestor_test_exists',
		'/tmp/dedupestor_test_multiple',
	]
	for dir in test_dirs {
		if os.exists(dir) {
			os.rmdir_all(dir) or {}
		}
		os.mkdir_all(dir) or {}
	}
}

fn test_basic_operations() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test'
		reset: true
	)!

	// Test storing and retrieving data
	value1 := 'test data 1'.bytes()
	hash1 := ds.store(value1)!
	retrieved1 := ds.get(hash1)!
	assert retrieved1 == value1

	// Test deduplication
	hash2 := ds.store(value1)!
	assert hash1 == hash2 // Should return same hash for same data

	// Test different data gets different hash
	value2 := 'test data 2'.bytes()
	hash3 := ds.store(value2)!
	assert hash1 != hash3 // Should be different hash for different data

	retrieved2 := ds.get(hash3)!
	assert retrieved2 == value2
}

fn test_size_limit() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_size'
		reset: true
	)!

	// Test data under size limit (1KB)
	small_data := []u8{len: 1024, init: u8(index)}
	small_hash := ds.store(small_data)!
	retrieved := ds.get(small_hash)!
	assert retrieved == small_data

	// Test data over size limit (2MB)
	large_data := []u8{len: 2 * 1024 * 1024, init: u8(index)}
	if _ := ds.store(large_data) {
		assert false, 'Expected error for data exceeding size limit'
	}
}

fn test_exists() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_exists'
		reset: true
	)!

	value := 'test data'.bytes()
	hash := ds.store(value)!

	assert ds.exists(hash) == true
	assert ds.exists('nonexistent') == false
}

fn test_multiple_operations() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_multiple'
		reset: true
	)!

	// Store multiple values
	mut values := [][]u8{}
	mut hashes := []string{}
	for i in 0 .. 5 {
		value := 'test data ${i}'.bytes()
		values << value
		hash := ds.store(value)!
		hashes << hash
	}

	// Verify all values can be retrieved
	for i, hash in hashes {
		retrieved := ds.get(hash)!
		assert retrieved == values[i]
	}

	// Test deduplication by storing same values again
	for i, value in values {
		hash := ds.store(value)!
		assert hash == hashes[i] // Should get same hash for same data
	}
}

View File

@@ -36,27 +36,27 @@ fn deserialize_node(data []u8) !Node {
 	mut d := encoder.decoder_new(data)

 	// Read and verify version
-	version_byte := d.get_u8()
+	version_byte := d.get_u8()!
 	if version_byte != version {
 		return error('Invalid version byte: expected ${version}, got ${version_byte}')
 	}

 	// Read key segment
-	key_segment := d.get_string()
+	key_segment := d.get_string()!

 	// Read value as []u8
-	value_len := d.get_u16()
+	value_len := d.get_u16()!
 	mut value := []u8{len: int(value_len)}
 	for i in 0 .. int(value_len) {
-		value[i] = d.get_u8()
+		value[i] = d.get_u8()!
 	}

 	// Read children
-	children_len := d.get_u16()
+	children_len := d.get_u16()!
 	mut children := []NodeRef{cap: int(children_len)}
 	for _ in 0 .. children_len {
-		key_part := d.get_string()
-		node_id := d.get_u32()
+		key_part := d.get_string()!
+		node_id := d.get_u32()!
 		children << NodeRef{
 			key_part: key_part
 			node_id: node_id
@@ -64,7 +64,7 @@ fn deserialize_node(data []u8) !Node {
 	}

 	// Read leaf flag
-	is_leaf := d.get_u8() == 1
+	is_leaf := d.get_u8()! == 1
 	return Node{
 		key_segment: key_segment