move in radixtree and dedupestor

lib/data/dedupestor/README.md (new file)
@@ -0,0 +1,94 @@
# DedupeStore

DedupeStore is a content-addressable key-value store with built-in deduplication. It uses blake2b-160 content hashing to identify and deduplicate data, making it ideal for storing files or data blocks where the same content might appear multiple times.

## Features

- Content-based deduplication using blake2b-160 hashing
- Efficient storage using RadixTree for hash lookups
- Persistent storage using OurDB
- Maximum value size limit of 1MB
- Fast retrieval of data using content hash
- Automatic deduplication of identical content

## Usage

```v
import freeflowuniverse.herolib.data.dedupestor

fn main() ! {
	// Create a new dedupestore
	mut ds := dedupestor.new(
		path: 'path/to/store'
		reset: false // Set to true to reset existing data
	)!

	// Store some data
	data := 'Hello, World!'.bytes()
	hash := ds.store(data)!
	println('Stored data with hash: ${hash}')

	// Retrieve data using the hash
	retrieved := ds.get(hash)!
	println('Retrieved data: ${retrieved.bytestr()}')

	// Check if data exists
	exists := ds.exists(hash)
	println('Data exists: ${exists}')

	// Storing the same data again returns the same hash
	same_hash := ds.store(data)!
	assert hash == same_hash // True, the data was deduplicated
}
```

## Implementation Details

DedupeStore uses two main components for storage:

1. **RadixTree**: stores mappings from content hashes to data location IDs
2. **OurDB**: stores the actual data blocks

When storing data:

1. The data is hashed using blake2b-160.
2. If the hash already exists in the RadixTree, the hash is returned immediately; nothing new is stored.
3. If the hash is new:
   - the data is stored in OurDB, yielding a new location ID
   - the hash -> ID mapping is inserted into the RadixTree
   - the hash is returned

When retrieving data:

1. The RadixTree is queried with the hash to get the data location ID.
2. The data is retrieved from OurDB using that ID.
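
As a minimal sketch of the hashing step (using `crypto.blake2b`, the same module the implementation imports), identical content always yields the identical 40-character hex key:

```v
import crypto.blake2b

fn main() {
	// blake2b-160 digests are 20 bytes, so .hex() gives a 40-character key
	a := blake2b.sum160('Hello, World!'.bytes()).hex()
	b := blake2b.sum160('Hello, World!'.bytes()).hex()
	assert a == b // same content, same key: this is what drives deduplication
	println(a)
}
```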

## Size Limits

- Maximum value size: 1MB
- Attempting to store a larger value results in an error

## Error Handling

The store methods return results that should be handled with V's error handling:

```v
// Handle potential errors
if hash := ds.store(large_data) {
	// Success
	println('Stored with hash: ${hash}')
} else {
	// An error occurred
	println('Error: ${err}')
}
```

## Testing

The module includes comprehensive tests covering:

- Basic store/retrieve operations
- Deduplication functionality
- Size limit enforcement
- Edge cases

Run tests with:

```bash
v test lib/data/dedupestor/
```

lib/data/dedupestor/dedupestor.v (new file)
@@ -0,0 +1,99 @@
module dedupestor

import crypto.blake2b
import freeflowuniverse.herolib.data.radixtree
import freeflowuniverse.herolib.data.ourdb

pub const max_value_size = 1024 * 1024 // 1MB

// DedupeStore provides a key-value store with deduplication based on content hashing
pub struct DedupeStore {
mut:
	radix &radixtree.RadixTree // For storing hash -> id mappings
	data  &ourdb.OurDB         // For storing the actual data
}

@[params]
pub struct NewArgs {
pub mut:
	path  string // Base path for the store
	reset bool   // Whether to reset existing data
}

// new creates a new deduplication store
pub fn new(args NewArgs) !&DedupeStore {
	// Create the radixtree for hash -> id mapping
	mut rt := radixtree.new(
		path: '${args.path}/radixtree'
		reset: args.reset
	)!

	// Create the ourdb for actual data storage
	mut db := ourdb.new(
		path: '${args.path}/data'
		record_size_max: max_value_size
		incremental_mode: true // We want auto-incrementing IDs
		reset: args.reset
	)!

	return &DedupeStore{
		radix: rt
		data: &db
	}
}

// store stores a value and returns its hash
// If the value already exists (same hash), the existing hash is returned without storing again
pub fn (mut ds DedupeStore) store(value []u8) !string {
	// Check size limit
	if value.len > max_value_size {
		return error('value size exceeds maximum allowed size of 1MB')
	}

	// Calculate the blake2b-160 hash of the value
	hash := blake2b.sum160(value).hex()

	// Check if this hash already exists
	if _ := ds.radix.search(hash) {
		// Value already exists, return the hash
		return hash
	}

	// Store the actual data in ourdb
	id := ds.data.set(data: value)!

	// Convert id to bytes for storage in radixtree
	id_bytes := u32_to_bytes(id)

	// Store the mapping of hash -> id in radixtree
	ds.radix.insert(hash, id_bytes)!

	return hash
}

// get retrieves a value by its hash
pub fn (mut ds DedupeStore) get(hash string) ![]u8 {
	// Get the ID from radixtree
	id_bytes := ds.radix.search(hash)!

	// Convert bytes back to u32 id
	id := bytes_to_u32(id_bytes)

	// Get the actual data from ourdb
	return ds.data.get(id)!
}

// exists checks if a value with the given hash exists
pub fn (mut ds DedupeStore) exists(hash string) bool {
	return if _ := ds.radix.search(hash) { true } else { false }
}

// Helper function to convert u32 to []u8
fn u32_to_bytes(n u32) []u8 {
	return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)]
}
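
// Note: both helpers use little-endian byte order; bytes_to_u32 must mirror
// u32_to_bytes exactly so that stored IDs round-trip unchanged.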

// Helper function to convert []u8 to u32
fn bytes_to_u32(b []u8) u32 {
	return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24)
}

lib/data/dedupestor/dedupestor_test.v (new file)
@@ -0,0 +1,108 @@
module dedupestor

import os

fn testsuite_begin() ! {
	// Ensure test directories exist and are clean
	test_dirs := [
		'/tmp/dedupestor_test',
		'/tmp/dedupestor_test_size',
		'/tmp/dedupestor_test_exists',
		'/tmp/dedupestor_test_multiple',
	]

	for dir in test_dirs {
		if os.exists(dir) {
			os.rmdir_all(dir) or {}
		}
		os.mkdir_all(dir) or {}
	}
}

fn test_basic_operations() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test'
		reset: true
	)!

	// Test storing and retrieving data
	value1 := 'test data 1'.bytes()
	hash1 := ds.store(value1)!

	retrieved1 := ds.get(hash1)!
	assert retrieved1 == value1

	// Test deduplication
	hash2 := ds.store(value1)!
	assert hash1 == hash2 // should return the same hash for the same data

	// Test that different data gets a different hash
	value2 := 'test data 2'.bytes()
	hash3 := ds.store(value2)!
	assert hash1 != hash3

	retrieved2 := ds.get(hash3)!
	assert retrieved2 == value2
}

fn test_size_limit() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_size'
		reset: true
	)!

	// Data under the size limit (1KB) stores and retrieves fine
	small_data := []u8{len: 1024, init: u8(index)}
	small_hash := ds.store(small_data)!
	retrieved := ds.get(small_hash)!
	assert retrieved == small_data

	// Data over the size limit (2MB) must be rejected
	large_data := []u8{len: 2 * 1024 * 1024, init: u8(index)}
	if _ := ds.store(large_data) {
		assert false, 'Expected error for data exceeding size limit'
	}
}

fn test_exists() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_exists'
		reset: true
	)!

	value := 'test data'.bytes()
	hash := ds.store(value)!

	assert ds.exists(hash) == true
	assert ds.exists('nonexistent') == false
}

fn test_multiple_operations() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_multiple'
		reset: true
	)!

	// Store multiple values
	mut values := [][]u8{}
	mut hashes := []string{}

	for i in 0 .. 5 {
		value := 'test data ${i}'.bytes()
		values << value
		hash := ds.store(value)!
		hashes << hash
	}

	// Verify all values can be retrieved
	for i, hash in hashes {
		retrieved := ds.get(hash)!
		assert retrieved == values[i]
	}

	// Test deduplication by storing the same values again
	for i, value in values {
		hash := ds.store(value)!
		assert hash == hashes[i] // should get the same hash for the same data
	}
}

@@ -36,27 +36,27 @@ fn deserialize_node(data []u8) !Node {
 	mut d := encoder.decoder_new(data)
 
 	// Read and verify version
-	version_byte := d.get_u8()
+	version_byte := d.get_u8()!
 	if version_byte != version {
 		return error('Invalid version byte: expected ${version}, got ${version_byte}')
 	}
 
 	// Read key segment
-	key_segment := d.get_string()
+	key_segment := d.get_string()!
 
 	// Read value as []u8
-	value_len := d.get_u16()
+	value_len := d.get_u16()!
 	mut value := []u8{len: int(value_len)}
 	for i in 0 .. int(value_len) {
-		value[i] = d.get_u8()
+		value[i] = d.get_u8()!
 	}
 
 	// Read children
-	children_len := d.get_u16()
+	children_len := d.get_u16()!
 	mut children := []NodeRef{cap: int(children_len)}
 	for _ in 0 .. children_len {
-		key_part := d.get_string()
-		node_id := d.get_u32()
+		key_part := d.get_string()!
+		node_id := d.get_u32()!
 		children << NodeRef{
 			key_part: key_part
 			node_id: node_id
@@ -64,7 +64,7 @@ fn deserialize_node(data []u8) !Node {
 	}
 
 	// Read leaf flag
-	is_leaf := d.get_u8() == 1
+	is_leaf := d.get_u8()! == 1
 
 	return Node{
 		key_segment: key_segment
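
The hunk above adapts `deserialize_node` to `encoder` decoder getters that now return results. As a minimal sketch of the new call-site pattern (the function below is illustrative, not part of the commit, and assumes only the decoder API visible in the hunk):

```v
import freeflowuniverse.herolib.data.encoder

// Illustrative: each decoder read now either yields a value or
// propagates a decode failure via `!`.
fn read_node_version(data []u8) !u8 {
	mut d := encoder.decoder_new(data)
	version := d.get_u8()! // fails cleanly on truncated input
	if version != 1 {
		return error('unsupported version ${version}')
	}
	return version
}
```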