move in radixtree and dedupestor
lib/data/dedupestor/README.md (new file, 94 lines)
@@ -0,0 +1,94 @@
# DedupeStore

DedupeStore is a content-addressable key-value store with built-in deduplication. It uses blake2b-160 content hashing to identify and deduplicate data, making it ideal for storing files or data blocks where the same content might appear multiple times.
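
Identical content always hashes to the same 160-bit (40 hex character) digest, which is what makes the deduplication possible. A minimal sketch of the hashing step, using the same `crypto.blake2b` module the implementation imports:

```v
import crypto.blake2b

fn main() {
	a := blake2b.sum160('Hello, World!'.bytes()).hex()
	b := blake2b.sum160('Hello, World!'.bytes()).hex()
	assert a == b // identical content yields an identical hash
	println(a) // 40 hex characters encoding the 160-bit digest
}
```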

## Features

- Content-based deduplication using blake2b-160 hashing
- Efficient storage using RadixTree for hash lookups
- Persistent storage using OurDB
- Maximum value size limit of 1MB
- Fast retrieval of data using the content hash
- Automatic deduplication of identical content

## Usage

```v
import freeflowuniverse.herolib.data.dedupestor

fn main() ! {
	// Create a new dedupestore
	mut ds := dedupestor.new(
		path:  'path/to/store'
		reset: false // Set to true to reset existing data
	)!

	// Store some data
	data := 'Hello, World!'.bytes()
	hash := ds.store(data)!
	println('Stored data with hash: ${hash}')

	// Retrieve data using the hash
	retrieved := ds.get(hash)!
	println('Retrieved data: ${retrieved.bytestr()}')

	// Check if data exists
	exists := ds.exists(hash)
	println('Data exists: ${exists}')

	// Storing the same data again returns the same hash
	same_hash := ds.store(data)!
	assert hash == same_hash // True, the data was deduplicated
}
```

## Implementation Details

DedupeStore uses two main components for storage:

1. **RadixTree**: stores mappings from content hashes to data location IDs
2. **OurDB**: stores the actual data blocks

When storing data:

1. The data is hashed using blake2b-160.
2. If the hash already exists in the RadixTree, the existing hash is returned without storing the data again.
3. If the hash is new:
   - the data is stored in OurDB, which returns a new location ID
   - the hash -> ID mapping is inserted into the RadixTree
   - the hash is returned

When retrieving data:

1. The RadixTree is queried with the hash to get the data location ID.
2. The data is retrieved from OurDB using that ID.
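
The location ID returned by OurDB is a `u32`; it is serialized to 4 little-endian bytes before being inserted as the RadixTree value (see `u32_to_bytes`/`bytes_to_u32` in `dedupestor.v` below). A worked example of that encoding:

```v
id := u32(0x12345678)
bytes := [u8(id), u8(id >> 8), u8(id >> 16), u8(id >> 24)]
assert bytes == [u8(0x78), 0x56, 0x34, 0x12] // least-significant byte first
```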

## Size Limits

- Maximum value size: 1MB
- Attempting to store larger values will result in an error

## Error Handling

The store methods return results that should be handled with V's error handling:

```v
// Handle potential errors
if hash := ds.store(large_data) {
	// Success
	println('Stored with hash: ${hash}')
} else {
	// Error occurred
	println('Error: ${err}')
}
```

## Testing

The module includes comprehensive tests covering:

- Basic store/retrieve operations
- Deduplication functionality
- Size limit enforcement
- Edge cases

Run the tests with:

```bash
v test lib/data/dedupestor/
```

lib/data/dedupestor/dedupestor.v (new file, 99 lines)
@@ -0,0 +1,99 @@
module dedupestor

import crypto.blake2b
import freeflowuniverse.herolib.data.radixtree
import freeflowuniverse.herolib.data.ourdb

pub const max_value_size = 1024 * 1024 // 1MB

// DedupeStore provides a key-value store with deduplication based on content hashing
pub struct DedupeStore {
mut:
	radix &radixtree.RadixTree // For storing hash -> id mappings
	data  &ourdb.OurDB         // For storing the actual data
}

@[params]
pub struct NewArgs {
pub mut:
	path  string // Base path for the store
	reset bool   // Whether to reset existing data
}

// new creates a new deduplication store
pub fn new(args NewArgs) !&DedupeStore {
	// Create the radixtree for the hash -> id mapping
	mut rt := radixtree.new(
		path:  '${args.path}/radixtree'
		reset: args.reset
	)!

	// Create the ourdb for the actual data storage
	mut db := ourdb.new(
		path:             '${args.path}/data'
		record_size_max:  max_value_size
		incremental_mode: true // We want auto-incrementing IDs
		reset:            args.reset
	)!

	return &DedupeStore{
		radix: rt
		data:  &db
	}
}

// store stores a value and returns its hash
// If the value already exists (same hash), returns the existing hash without storing again
pub fn (mut ds DedupeStore) store(value []u8) !string {
	// Check size limit
	if value.len > max_value_size {
		return error('value size exceeds maximum allowed size of 1MB')
	}

	// Calculate the blake2b-160 hash of the value
	hash := blake2b.sum160(value).hex()

	// Check if this hash already exists
	if _ := ds.radix.search(hash) {
		// Value already exists, return the hash
		return hash
	}

	// Store the actual data in ourdb
	id := ds.data.set(data: value)!

	// Convert the id to bytes for storage in the radixtree
	id_bytes := u32_to_bytes(id)

	// Store the mapping of hash -> id in the radixtree
	ds.radix.insert(hash, id_bytes)!

	return hash
}

// get retrieves a value by its hash
pub fn (mut ds DedupeStore) get(hash string) ![]u8 {
	// Get the ID from the radixtree
	id_bytes := ds.radix.search(hash)!

	// Convert the bytes back to a u32 id
	id := bytes_to_u32(id_bytes)

	// Get the actual data from ourdb
	return ds.data.get(id)!
}

// exists checks if a value with the given hash exists
pub fn (mut ds DedupeStore) exists(hash string) bool {
	return if _ := ds.radix.search(hash) { true } else { false }
}

// Helper function to convert a u32 to 4 bytes, least-significant byte first (little-endian)
fn u32_to_bytes(n u32) []u8 {
	return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)]
}

// Helper function to convert 4 little-endian bytes back to a u32
fn bytes_to_u32(b []u8) u32 {
	return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24)
}

lib/data/dedupestor/dedupestor_test.v (new file, 108 lines)
@@ -0,0 +1,108 @@
module dedupestor

import os

fn testsuite_begin() ! {
	// Ensure test directories exist and are clean
	test_dirs := [
		'/tmp/dedupestor_test',
		'/tmp/dedupestor_test_size',
		'/tmp/dedupestor_test_exists',
		'/tmp/dedupestor_test_multiple',
	]

	for dir in test_dirs {
		if os.exists(dir) {
			os.rmdir_all(dir) or {}
		}
		os.mkdir_all(dir) or {}
	}
}

fn test_basic_operations() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test'
		reset: true
	)!

	// Test storing and retrieving data
	value1 := 'test data 1'.bytes()
	hash1 := ds.store(value1)!

	retrieved1 := ds.get(hash1)!
	assert retrieved1 == value1

	// Test deduplication
	hash2 := ds.store(value1)!
	assert hash1 == hash2 // Should return same hash for same data

	// Test different data gets different hash
	value2 := 'test data 2'.bytes()
	hash3 := ds.store(value2)!
	assert hash1 != hash3 // Should be different hash for different data

	retrieved2 := ds.get(hash3)!
	assert retrieved2 == value2
}

fn test_size_limit() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test_size'
		reset: true
	)!

	// Test data under size limit (1KB)
	small_data := []u8{len: 1024, init: u8(index)}
	small_hash := ds.store(small_data)!
	retrieved := ds.get(small_hash)!
	assert retrieved == small_data

	// Test data over size limit (2MB)
	large_data := []u8{len: 2 * 1024 * 1024, init: u8(index)}
	if _ := ds.store(large_data) {
		assert false, 'Expected error for data exceeding size limit'
	}
}

fn test_exists() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test_exists'
		reset: true
	)!

	value := 'test data'.bytes()
	hash := ds.store(value)!

	assert ds.exists(hash) == true
	assert ds.exists('nonexistent') == false
}

fn test_multiple_operations() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test_multiple'
		reset: true
	)!

	// Store multiple values
	mut values := [][]u8{}
	mut hashes := []string{}

	for i in 0 .. 5 {
		value := 'test data ${i}'.bytes()
		values << value
		hash := ds.store(value)!
		hashes << hash
	}

	// Verify all values can be retrieved
	for i, hash in hashes {
		retrieved := ds.get(hash)!
		assert retrieved == values[i]
	}

	// Test deduplication by storing same values again
	for i, value in values {
		hash := ds.store(value)!
		assert hash == hashes[i] // Should get same hash for same data
	}
}

@@ -36,27 +36,27 @@ fn deserialize_node(data []u8) !Node {
 	mut d := encoder.decoder_new(data)
 
 	// Read and verify version
-	version_byte := d.get_u8()
+	version_byte := d.get_u8()!
 	if version_byte != version {
 		return error('Invalid version byte: expected ${version}, got ${version_byte}')
 	}
 
 	// Read key segment
-	key_segment := d.get_string()
+	key_segment := d.get_string()!
 
 	// Read value as []u8
-	value_len := d.get_u16()
+	value_len := d.get_u16()!
 	mut value := []u8{len: int(value_len)}
 	for i in 0 .. int(value_len) {
-		value[i] = d.get_u8()
+		value[i] = d.get_u8()!
 	}
 
 	// Read children
-	children_len := d.get_u16()
+	children_len := d.get_u16()!
 	mut children := []NodeRef{cap: int(children_len)}
 	for _ in 0 .. children_len {
-		key_part := d.get_string()
-		node_id := d.get_u32()
+		key_part := d.get_string()!
+		node_id := d.get_u32()!
 		children << NodeRef{
 			key_part: key_part
 			node_id: node_id
@@ -64,7 +64,7 @@ fn deserialize_node(data []u8) !Node {
 	}
 
 	// Read leaf flag
-	is_leaf := d.get_u8() == 1
+	is_leaf := d.get_u8()! == 1
 
 	return Node{
 		key_segment: key_segment
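
The change in this hunk is that the encoder's getters now return results, so every call site gains a trailing `!` to propagate a decode error (for example on truncated input) up to the caller of `deserialize_node`. A minimal sketch of the pattern, using a hypothetical `Reader` type rather than the real herolib encoder:

```v
module main

// Reader is a hypothetical stand-in for the encoder's decoder
struct Reader {
mut:
	data []u8
	pos  int
}

// get_u8 returns a result: the next byte, or an error when the input is exhausted
fn (mut r Reader) get_u8() !u8 {
	if r.pos >= r.data.len {
		return error('unexpected end of data')
	}
	b := r.data[r.pos]
	r.pos++
	return b
}

fn parse_version(mut r Reader) !u8 {
	// the trailing `!` propagates the error upward, like d.get_u8()! above
	return r.get_u8()!
}

fn main() {
	mut r := Reader{
		data: []u8{}
	}
	if v := parse_version(mut r) {
		println('version: ${v}')
	} else {
		println('decode failed: ${err}')
	}
}
```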