move in radixtree and dedupestor
lib/data/dedupestor/README.md (new file, 94 lines)
@@ -0,0 +1,94 @@
# DedupeStore

DedupeStore is a content-addressable key-value store with built-in deduplication. It uses blake2b-160 content hashing to identify and deduplicate data, making it ideal for storing files or data blocks where the same content might appear multiple times.
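
Identical content always hashes to the same 160-bit (40 hex character) digest, which is what makes the deduplication possible. A minimal sketch of the hashing step, using the same `crypto.blake2b` module the implementation imports:

```v
import crypto.blake2b

fn main() {
	a := blake2b.sum160('Hello, World!'.bytes()).hex()
	b := blake2b.sum160('Hello, World!'.bytes()).hex()
	assert a == b // identical content yields an identical hash
	println(a) // 40 hex characters encoding the 160-bit digest
}
```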

## Features

- Content-based deduplication using blake2b-160 hashing
- Efficient storage using RadixTree for hash lookups
- Persistent storage using OurDB
- Maximum value size limit of 1MB
- Fast retrieval of data using the content hash
- Automatic deduplication of identical content

## Usage

```v
import freeflowuniverse.herolib.data.dedupestor

fn main() ! {
	// Create a new dedupestore
	mut ds := dedupestor.new(
		path:  'path/to/store'
		reset: false // Set to true to reset existing data
	)!

	// Store some data
	data := 'Hello, World!'.bytes()
	hash := ds.store(data)!
	println('Stored data with hash: ${hash}')

	// Retrieve data using the hash
	retrieved := ds.get(hash)!
	println('Retrieved data: ${retrieved.bytestr()}')

	// Check if data exists
	exists := ds.exists(hash)
	println('Data exists: ${exists}')

	// Storing the same data again returns the same hash
	same_hash := ds.store(data)!
	assert hash == same_hash // True, the data was deduplicated
}
```

## Implementation Details

DedupeStore uses two main components for storage:

1. **RadixTree**: stores mappings from content hashes to data location IDs
2. **OurDB**: stores the actual data blocks

When storing data:

1. The data is hashed using blake2b-160.
2. If the hash already exists in the RadixTree, the existing hash is returned without storing the data again.
3. If the hash is new:
   - the data is stored in OurDB, which returns a new location ID
   - the hash -> ID mapping is inserted into the RadixTree
   - the hash is returned

When retrieving data:

1. The RadixTree is queried with the hash to get the data location ID.
2. The data is retrieved from OurDB using that ID.
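
The location ID returned by OurDB is a `u32`; it is serialized to 4 little-endian bytes before being inserted as the RadixTree value (see `u32_to_bytes`/`bytes_to_u32` in `dedupestor.v` below). A worked example of that encoding:

```v
id := u32(0x12345678)
bytes := [u8(id), u8(id >> 8), u8(id >> 16), u8(id >> 24)]
assert bytes == [u8(0x78), 0x56, 0x34, 0x12] // least-significant byte first
```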

## Size Limits

- Maximum value size: 1MB
- Attempting to store larger values will result in an error

## Error Handling

The store methods return results that should be handled with V's error handling:

```v
// Handle potential errors
if hash := ds.store(large_data) {
	// Success
	println('Stored with hash: ${hash}')
} else {
	// Error occurred
	println('Error: ${err}')
}
```

## Testing

The module includes comprehensive tests covering:

- Basic store/retrieve operations
- Deduplication functionality
- Size limit enforcement
- Edge cases

Run the tests with:

```bash
v test lib/data/dedupestor/
```

lib/data/dedupestor/dedupestor.v (new file, 99 lines)
@@ -0,0 +1,99 @@
module dedupestor

import crypto.blake2b
import freeflowuniverse.herolib.data.radixtree
import freeflowuniverse.herolib.data.ourdb

pub const max_value_size = 1024 * 1024 // 1MB

// DedupeStore provides a key-value store with deduplication based on content hashing
pub struct DedupeStore {
mut:
	radix &radixtree.RadixTree // For storing hash -> id mappings
	data  &ourdb.OurDB         // For storing the actual data
}

@[params]
pub struct NewArgs {
pub mut:
	path  string // Base path for the store
	reset bool   // Whether to reset existing data
}

// new creates a new deduplication store
pub fn new(args NewArgs) !&DedupeStore {
	// Create the radixtree for the hash -> id mapping
	mut rt := radixtree.new(
		path:  '${args.path}/radixtree'
		reset: args.reset
	)!

	// Create the ourdb for the actual data storage
	mut db := ourdb.new(
		path:             '${args.path}/data'
		record_size_max:  max_value_size
		incremental_mode: true // We want auto-incrementing IDs
		reset:            args.reset
	)!

	return &DedupeStore{
		radix: rt
		data:  &db
	}
}

// store stores a value and returns its hash
// If the value already exists (same hash), returns the existing hash without storing again
pub fn (mut ds DedupeStore) store(value []u8) !string {
	// Check size limit
	if value.len > max_value_size {
		return error('value size exceeds maximum allowed size of 1MB')
	}

	// Calculate the blake2b-160 hash of the value
	hash := blake2b.sum160(value).hex()

	// Check if this hash already exists
	if _ := ds.radix.search(hash) {
		// Value already exists, return the hash
		return hash
	}

	// Store the actual data in ourdb
	id := ds.data.set(data: value)!

	// Convert the id to bytes for storage in the radixtree
	id_bytes := u32_to_bytes(id)

	// Store the mapping of hash -> id in the radixtree
	ds.radix.insert(hash, id_bytes)!

	return hash
}

// get retrieves a value by its hash
pub fn (mut ds DedupeStore) get(hash string) ![]u8 {
	// Get the ID from the radixtree
	id_bytes := ds.radix.search(hash)!

	// Convert the bytes back to a u32 id
	id := bytes_to_u32(id_bytes)

	// Get the actual data from ourdb
	return ds.data.get(id)!
}

// exists checks if a value with the given hash exists
pub fn (mut ds DedupeStore) exists(hash string) bool {
	return if _ := ds.radix.search(hash) { true } else { false }
}

// Helper function to convert a u32 to 4 bytes, least-significant byte first (little-endian)
fn u32_to_bytes(n u32) []u8 {
	return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)]
}

// Helper function to convert 4 little-endian bytes back to a u32
fn bytes_to_u32(b []u8) u32 {
	return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24)
}

lib/data/dedupestor/dedupestor_test.v (new file, 108 lines)
@@ -0,0 +1,108 @@
module dedupestor

import os

fn testsuite_begin() ! {
	// Ensure test directories exist and are clean
	test_dirs := [
		'/tmp/dedupestor_test',
		'/tmp/dedupestor_test_size',
		'/tmp/dedupestor_test_exists',
		'/tmp/dedupestor_test_multiple',
	]

	for dir in test_dirs {
		if os.exists(dir) {
			os.rmdir_all(dir) or {}
		}
		os.mkdir_all(dir) or {}
	}
}

fn test_basic_operations() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test'
		reset: true
	)!

	// Test storing and retrieving data
	value1 := 'test data 1'.bytes()
	hash1 := ds.store(value1)!

	retrieved1 := ds.get(hash1)!
	assert retrieved1 == value1

	// Test deduplication
	hash2 := ds.store(value1)!
	assert hash1 == hash2 // Should return same hash for same data

	// Test different data gets different hash
	value2 := 'test data 2'.bytes()
	hash3 := ds.store(value2)!
	assert hash1 != hash3 // Should be different hash for different data

	retrieved2 := ds.get(hash3)!
	assert retrieved2 == value2
}

fn test_size_limit() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test_size'
		reset: true
	)!

	// Test data under size limit (1KB)
	small_data := []u8{len: 1024, init: u8(index)}
	small_hash := ds.store(small_data)!
	retrieved := ds.get(small_hash)!
	assert retrieved == small_data

	// Test data over size limit (2MB)
	large_data := []u8{len: 2 * 1024 * 1024, init: u8(index)}
	if _ := ds.store(large_data) {
		assert false, 'Expected error for data exceeding size limit'
	}
}

fn test_exists() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test_exists'
		reset: true
	)!

	value := 'test data'.bytes()
	hash := ds.store(value)!

	assert ds.exists(hash) == true
	assert ds.exists('nonexistent') == false
}

fn test_multiple_operations() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test_multiple'
		reset: true
	)!

	// Store multiple values
	mut values := [][]u8{}
	mut hashes := []string{}

	for i in 0 .. 5 {
		value := 'test data ${i}'.bytes()
		values << value
		hash := ds.store(value)!
		hashes << hash
	}

	// Verify all values can be retrieved
	for i, hash in hashes {
		retrieved := ds.get(hash)!
		assert retrieved == values[i]
	}

	// Test deduplication by storing same values again
	for i, value in values {
		hash := ds.store(value)!
		assert hash == hashes[i] // Should get same hash for same data
	}
}

@@ -36,27 +36,27 @@ fn deserialize_node(data []u8) !Node {
 	mut d := encoder.decoder_new(data)
 
 	// Read and verify version
-	version_byte := d.get_u8()
+	version_byte := d.get_u8()!
 	if version_byte != version {
 		return error('Invalid version byte: expected ${version}, got ${version_byte}')
 	}
 
 	// Read key segment
-	key_segment := d.get_string()
+	key_segment := d.get_string()!
 
 	// Read value as []u8
-	value_len := d.get_u16()
+	value_len := d.get_u16()!
 	mut value := []u8{len: int(value_len)}
 	for i in 0 .. int(value_len) {
-		value[i] = d.get_u8()
+		value[i] = d.get_u8()!
 	}
 
 	// Read children
-	children_len := d.get_u16()
+	children_len := d.get_u16()!
 	mut children := []NodeRef{cap: int(children_len)}
 	for _ in 0 .. children_len {
-		key_part := d.get_string()
-		node_id := d.get_u32()
+		key_part := d.get_string()!
+		node_id := d.get_u32()!
 		children << NodeRef{
 			key_part: key_part
 			node_id: node_id
@@ -64,7 +64,7 @@ fn deserialize_node(data []u8) !Node {
 	}
 
 	// Read leaf flag
-	is_leaf := d.get_u8() == 1
+	is_leaf := d.get_u8()! == 1
 
 	return Node{
 		key_segment: key_segment
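
The change in this hunk is that the encoder's getters now return results, so every call site gains a trailing `!` to propagate a decode error (for example on truncated input) up to the caller of `deserialize_node`. A minimal sketch of the pattern, using a hypothetical `Reader` type rather than the real herolib encoder:

```v
module main

// Reader is a hypothetical stand-in for the encoder's decoder
struct Reader {
mut:
	data []u8
	pos  int
}

// get_u8 returns a result: the next byte, or an error when the input is exhausted
fn (mut r Reader) get_u8() !u8 {
	if r.pos >= r.data.len {
		return error('unexpected end of data')
	}
	b := r.data[r.pos]
	r.pos++
	return b
}

fn parse_version(mut r Reader) !u8 {
	// the trailing `!` propagates the error upward, like d.get_u8()! above
	return r.get_u8()!
}

fn main() {
	mut r := Reader{
		data: []u8{}
	}
	if v := parse_version(mut r) {
		println('version: ${v}')
	} else {
		println('decode failed: ${err}')
	}
}
```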