move in radixtree and dedupestor

lib/data/dedupestor/README.md (new file)
@@ -0,0 +1,94 @@
# DedupeStore

DedupeStore is a content-addressable key-value store with built-in deduplication. It uses blake2b-160 content hashing to identify and deduplicate data, making it ideal for storing files or data blocks where the same content might appear multiple times.

## Features

- Content-based deduplication using blake2b-160 hashing
- Efficient storage using RadixTree for hash lookups
- Persistent storage using OurDB
- Maximum value size limit of 1MB
- Fast retrieval of data using content hash
- Automatic deduplication of identical content

## Usage

```v
import freeflowuniverse.herolib.data.dedupestor

fn main() ! {
	// Create a new dedupestore
	mut ds := dedupestor.new(
		path: 'path/to/store'
		reset: false // Set to true to reset existing data
	)!

	// Store some data
	data := 'Hello, World!'.bytes()
	hash := ds.store(data)!
	println('Stored data with hash: ${hash}')

	// Retrieve data using the hash
	retrieved := ds.get(hash)!
	println('Retrieved data: ${retrieved.bytestr()}')

	// Check if data exists
	exists := ds.exists(hash)
	println('Data exists: ${exists}')

	// Storing the same data again returns the same hash
	same_hash := ds.store(data)!
	assert hash == same_hash // True, the data was deduplicated
}
```

## Implementation Details

DedupeStore uses two main components for storage:

1. **RadixTree**: stores mappings from content hashes to data location IDs
2. **OurDB**: stores the actual data blocks

When storing data:

1. The data is hashed using blake2b-160.
2. If the hash already exists in the RadixTree, the hash is returned immediately; nothing new is stored.
3. If the hash is new:
   - the data is stored in OurDB, yielding a new location ID
   - the hash -> ID mapping is inserted into the RadixTree
   - the hash is returned

When retrieving data:

1. The RadixTree is queried with the hash to get the data location ID.
2. The data is retrieved from OurDB using that ID.
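
As a minimal sketch of the hashing step (using `crypto.blake2b`, the same module the implementation imports), identical content always yields the identical 40-character hex key:

```v
import crypto.blake2b

fn main() {
	// blake2b-160 digests are 20 bytes, so .hex() gives a 40-character key
	a := blake2b.sum160('Hello, World!'.bytes()).hex()
	b := blake2b.sum160('Hello, World!'.bytes()).hex()
	assert a == b // same content, same key: this is what drives deduplication
	println(a)
}
```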

## Size Limits

- Maximum value size: 1MB
- Attempting to store a larger value results in an error

## Error Handling

The store methods return results that should be handled with V's error handling:

```v
// Handle potential errors
if hash := ds.store(large_data) {
	// Success
	println('Stored with hash: ${hash}')
} else {
	// An error occurred
	println('Error: ${err}')
}
```

## Testing

The module includes comprehensive tests covering:

- Basic store/retrieve operations
- Deduplication functionality
- Size limit enforcement
- Edge cases

Run tests with:

```bash
v test lib/data/dedupestor/
```

lib/data/dedupestor/dedupestor.v (new file)
@@ -0,0 +1,99 @@
module dedupestor

import crypto.blake2b
import freeflowuniverse.herolib.data.radixtree
import freeflowuniverse.herolib.data.ourdb

pub const max_value_size = 1024 * 1024 // 1MB

// DedupeStore provides a key-value store with deduplication based on content hashing
pub struct DedupeStore {
mut:
	radix &radixtree.RadixTree // For storing hash -> id mappings
	data  &ourdb.OurDB         // For storing the actual data
}

@[params]
pub struct NewArgs {
pub mut:
	path  string // Base path for the store
	reset bool   // Whether to reset existing data
}

// new creates a new deduplication store
pub fn new(args NewArgs) !&DedupeStore {
	// Create the radixtree for hash -> id mapping
	mut rt := radixtree.new(
		path: '${args.path}/radixtree'
		reset: args.reset
	)!

	// Create the ourdb for actual data storage
	mut db := ourdb.new(
		path: '${args.path}/data'
		record_size_max: max_value_size
		incremental_mode: true // We want auto-incrementing IDs
		reset: args.reset
	)!

	return &DedupeStore{
		radix: rt
		data: &db
	}
}

// store stores a value and returns its hash
// If the value already exists (same hash), the existing hash is returned without storing again
pub fn (mut ds DedupeStore) store(value []u8) !string {
	// Check size limit
	if value.len > max_value_size {
		return error('value size exceeds maximum allowed size of 1MB')
	}

	// Calculate the blake2b-160 hash of the value
	hash := blake2b.sum160(value).hex()

	// Check if this hash already exists
	if _ := ds.radix.search(hash) {
		// Value already exists, return the hash
		return hash
	}

	// Store the actual data in ourdb
	id := ds.data.set(data: value)!

	// Convert id to bytes for storage in radixtree
	id_bytes := u32_to_bytes(id)

	// Store the mapping of hash -> id in radixtree
	ds.radix.insert(hash, id_bytes)!

	return hash
}

// get retrieves a value by its hash
pub fn (mut ds DedupeStore) get(hash string) ![]u8 {
	// Get the ID from radixtree
	id_bytes := ds.radix.search(hash)!

	// Convert bytes back to u32 id
	id := bytes_to_u32(id_bytes)

	// Get the actual data from ourdb
	return ds.data.get(id)!
}

// exists checks if a value with the given hash exists
pub fn (mut ds DedupeStore) exists(hash string) bool {
	return if _ := ds.radix.search(hash) { true } else { false }
}

// Helper function to convert u32 to []u8
fn u32_to_bytes(n u32) []u8 {
	return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)]
}
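
// Note: both helpers use little-endian byte order; bytes_to_u32 must mirror
// u32_to_bytes exactly so that stored IDs round-trip unchanged.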

// Helper function to convert []u8 to u32
fn bytes_to_u32(b []u8) u32 {
	return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24)
}

lib/data/dedupestor/dedupestor_test.v (new file)
@@ -0,0 +1,108 @@
module dedupestor

import os

fn testsuite_begin() ! {
	// Ensure test directories exist and are clean
	test_dirs := [
		'/tmp/dedupestor_test',
		'/tmp/dedupestor_test_size',
		'/tmp/dedupestor_test_exists',
		'/tmp/dedupestor_test_multiple',
	]

	for dir in test_dirs {
		if os.exists(dir) {
			os.rmdir_all(dir) or {}
		}
		os.mkdir_all(dir) or {}
	}
}

fn test_basic_operations() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test'
		reset: true
	)!

	// Test storing and retrieving data
	value1 := 'test data 1'.bytes()
	hash1 := ds.store(value1)!

	retrieved1 := ds.get(hash1)!
	assert retrieved1 == value1

	// Test deduplication
	hash2 := ds.store(value1)!
	assert hash1 == hash2 // should return the same hash for the same data

	// Test that different data gets a different hash
	value2 := 'test data 2'.bytes()
	hash3 := ds.store(value2)!
	assert hash1 != hash3

	retrieved2 := ds.get(hash3)!
	assert retrieved2 == value2
}

fn test_size_limit() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_size'
		reset: true
	)!

	// Data under the size limit (1KB) stores and retrieves fine
	small_data := []u8{len: 1024, init: u8(index)}
	small_hash := ds.store(small_data)!
	retrieved := ds.get(small_hash)!
	assert retrieved == small_data

	// Data over the size limit (2MB) must be rejected
	large_data := []u8{len: 2 * 1024 * 1024, init: u8(index)}
	if _ := ds.store(large_data) {
		assert false, 'Expected error for data exceeding size limit'
	}
}

fn test_exists() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_exists'
		reset: true
	)!

	value := 'test data'.bytes()
	hash := ds.store(value)!

	assert ds.exists(hash) == true
	assert ds.exists('nonexistent') == false
}

fn test_multiple_operations() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_multiple'
		reset: true
	)!

	// Store multiple values
	mut values := [][]u8{}
	mut hashes := []string{}

	for i in 0 .. 5 {
		value := 'test data ${i}'.bytes()
		values << value
		hash := ds.store(value)!
		hashes << hash
	}

	// Verify all values can be retrieved
	for i, hash in hashes {
		retrieved := ds.get(hash)!
		assert retrieved == values[i]
	}

	// Test deduplication by storing the same values again
	for i, value in values {
		hash := ds.store(value)!
		assert hash == hashes[i] // should get the same hash for the same data
	}
}

@@ -36,27 +36,27 @@ fn deserialize_node(data []u8) !Node {
 	mut d := encoder.decoder_new(data)
 
 	// Read and verify version
-	version_byte := d.get_u8()
+	version_byte := d.get_u8()!
 	if version_byte != version {
 		return error('Invalid version byte: expected ${version}, got ${version_byte}')
 	}
 
 	// Read key segment
-	key_segment := d.get_string()
+	key_segment := d.get_string()!
 
 	// Read value as []u8
-	value_len := d.get_u16()
+	value_len := d.get_u16()!
 	mut value := []u8{len: int(value_len)}
 	for i in 0 .. int(value_len) {
-		value[i] = d.get_u8()
+		value[i] = d.get_u8()!
 	}
 
 	// Read children
-	children_len := d.get_u16()
+	children_len := d.get_u16()!
 	mut children := []NodeRef{cap: int(children_len)}
 	for _ in 0 .. children_len {
-		key_part := d.get_string()
-		node_id := d.get_u32()
+		key_part := d.get_string()!
+		node_id := d.get_u32()!
 		children << NodeRef{
 			key_part: key_part
 			node_id: node_id
@@ -64,7 +64,7 @@ fn deserialize_node(data []u8) !Node {
 	}
 
 	// Read leaf flag
-	is_leaf := d.get_u8() == 1
+	is_leaf := d.get_u8()! == 1
 
 	return Node{
 		key_segment: key_segment
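
The hunk above adapts `deserialize_node` to `encoder` decoder getters that now return results. As a minimal sketch of the new call-site pattern (the function below is illustrative, not part of the commit, and assumes only the decoder API visible in the hunk):

```v
import freeflowuniverse.herolib.data.encoder

// Illustrative: each decoder read now either yields a value or
// propagates a decode failure via `!`.
fn read_node_version(data []u8) !u8 {
	mut d := encoder.decoder_new(data)
	version := d.get_u8()! // fails cleanly on truncated input
	if version != 1 {
		return error('unsupported version ${version}')
	}
	return version
}
```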