// Patch summary: DedupeStore.store now takes a Reference and returns the u32 id of the stored data instead of the hash string.
// The radix tree value for each hash becomes a serialized Metadata (id + list of References); delete drops one reference and removes the data once none remain.
// Adds get_from_hash, id_exists, hash_exists and delete, plus metadata.v (Metadata/Reference byte conversion) and tests.
diff --git a/lib/data/dedupestor/dedupestor.v b/lib/data/dedupestor/dedupestor.v index d0b779f1..5651c10e 100644 --- a/lib/data/dedupestor/dedupestor.v +++ b/lib/data/dedupestor/dedupestor.v @@ -42,58 +42,87 @@ pub fn new(args NewArgs) !&DedupeStore { } } -// store stores a value and returns its hash -// If the value already exists (same hash), returns the existing hash without storing again -pub fn (mut ds DedupeStore) store(value []u8) !string { +// store stores data with its reference and returns its id +// If the data already exists (same hash), returns the existing id without storing again +// appends reference to the radix tree entry of the hash to track references +pub fn (mut ds DedupeStore) store(data []u8, ref Reference) !u32 { // Check size limit - if value.len > max_value_size { + if data.len > max_value_size { return error('value size exceeds maximum allowed size of 1MB') } // Calculate blake160 hash of the value - hash := blake2b.sum160(value).hex() + hash := blake2b.sum160(data).hex() // Check if this hash already exists - if _ := ds.radix.search(hash) { - // Value already exists, return the hash - return hash + if metadata_bytes := ds.radix.search(hash) { + // Value already exists, add new ref & return the id + mut metadata := bytes_to_metadata(metadata_bytes) + metadata = metadata.add_reference(ref)! + ds.radix.update(hash, metadata.to_bytes())! + return metadata.id } // Store the actual data in ourdb - id := ds.data.set(data: value)! - - // Convert id to bytes for storage in radixtree - id_bytes := u32_to_bytes(id) + id := ds.data.set(data: data)! + metadata := Metadata{ + id: id + references: [ref] + } // Store the mapping of hash -> id in radixtree - ds.radix.insert(hash, id_bytes)! + ds.radix.insert(hash, metadata.to_bytes())! - return hash + return metadata.id } // get retrieves a value by its hash -pub fn (mut ds DedupeStore) get(hash string) ![]u8 { - // Get the ID from radixtree - id_bytes := ds.radix.search(hash)! 
- - // Convert bytes back to u32 id - id := bytes_to_u32(id_bytes) - - // Get the actual data from ourdb +pub fn (mut ds DedupeStore) get(id u32) ![]u8 { return ds.data.get(id)! } +// get_from_hash retrieves a value by its hash +pub fn (mut ds DedupeStore) get_from_hash(hash string) ![]u8 { + // Get the metadata from radixtree + metadata_bytes := ds.radix.search(hash)! + + // Convert bytes back to metadata + metadata := bytes_to_metadata(metadata_bytes) + + // Get the actual data from ourdb + return ds.data.get(metadata.id)! +} + // exists checks if a value with the given hash exists -pub fn (mut ds DedupeStore) exists(hash string) bool { +pub fn (mut ds DedupeStore) id_exists(id u32) bool { + if _ := ds.data.get(id) { return true } else {return false} +} + +// hash_exists checks if a value with the given hash exists +pub fn (mut ds DedupeStore) hash_exists(hash string) bool { return if _ := ds.radix.search(hash) { true } else { false } } -// Helper function to convert u32 to []u8 -fn u32_to_bytes(n u32) []u8 { - return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)] -} +// delete removes a reference from the hash entry +// If it's the last reference, removes the hash entry and its data +pub fn (mut ds DedupeStore) delete(id u32, ref Reference) ! { + // Load the data by id and recompute its blake160 hash to find the radix tree entry + data := ds.data.get(id)! + hash := blake2b.sum160(data).hex() -// Helper function to convert []u8 to u32 -fn bytes_to_u32(b []u8) u32 { - return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24) + // Get the current entry from radixtree + metadata_bytes := ds.radix.search(hash)! + mut metadata := bytes_to_metadata(metadata_bytes) + metadata = metadata.remove_reference(ref)! + + if metadata.references.len == 0 { + // Delete from radixtree + ds.radix.delete(hash)! + // Delete from data db + ds.data.delete(id)! + return + } + + // Update hash metadata + ds.radix.update(hash, metadata.to_bytes())! 
} diff --git a/lib/data/dedupestor/dedupestor_test.v b/lib/data/dedupestor/dedupestor_test.v index f10c97d0..395f6068 100644 --- a/lib/data/dedupestor/dedupestor_test.v +++ b/lib/data/dedupestor/dedupestor_test.v @@ -8,7 +8,8 @@ fn testsuite_begin() ! { '/tmp/dedupestor_test', '/tmp/dedupestor_test_size', '/tmp/dedupestor_test_exists', - '/tmp/dedupestor_test_multiple' + '/tmp/dedupestor_test_multiple', + '/tmp/dedupestor_test_refs' ] for dir in test_dirs { @@ -27,18 +28,21 @@ fn test_basic_operations() ! { // Test storing and retrieving data value1 := 'test data 1'.bytes() - hash1 := ds.store(value1)! + ref1 := Reference{owner: 1, id: 1} + hash1 := ds.store(value1, ref1)! retrieved1 := ds.get(hash1)! assert retrieved1 == value1 - // Test deduplication - hash2 := ds.store(value1)! + // Test deduplication with different reference + ref2 := Reference{owner: 1, id: 2} + hash2 := ds.store(value1, ref2)! assert hash1 == hash2 // Should return same hash for same data // Test different data gets different hash value2 := 'test data 2'.bytes() - hash3 := ds.store(value2)! + ref3 := Reference{owner: 1, id: 3} + hash3 := ds.store(value2, ref3)! assert hash1 != hash3 // Should be different hash for different data retrieved2 := ds.get(hash3)! @@ -53,13 +57,14 @@ fn test_size_limit() ! { // Test data under size limit (1KB) small_data := []u8{len: 1024, init: u8(index)} - small_hash := ds.store(small_data)! + ref := Reference{owner: 1, id: 1} + small_hash := ds.store(small_data, ref)! retrieved := ds.get(small_hash)! assert retrieved == small_data // Test data over size limit (2MB) large_data := []u8{len: 2 * 1024 * 1024, init: u8(index)} - if _ := ds.store(large_data) { + if _ := ds.store(large_data, ref) { assert false, 'Expected error for data exceeding size limit' } } @@ -71,10 +76,11 @@ fn test_exists() ! { )! value := 'test data'.bytes() - hash := ds.store(value)! + ref := Reference{owner: 1, id: 1} + hash := ds.store(value, ref)! 
- assert ds.exists(hash) == true - assert ds.exists('nonexistent') == false + assert ds.id_exists(hash) == true + assert ds.id_exists(u32(99)) == false } fn test_multiple_operations() ! { @@ -85,24 +91,67 @@ fn test_multiple_operations() ! { // Store multiple values mut values := [][]u8{} - mut hashes := []string{} + mut ids := []u32{} for i in 0..5 { value := 'test data ${i}'.bytes() values << value - hash := ds.store(value)! - hashes << hash + ref := Reference{owner: 1, id: u32(i)} + id := ds.store(value, ref)! + ids << id } // Verify all values can be retrieved - for i, hash in hashes { - retrieved := ds.get(hash)! + for i, id in ids { + retrieved := ds.get(id)! assert retrieved == values[i] } // Test deduplication by storing same values again for i, value in values { - hash := ds.store(value)! - assert hash == hashes[i] // Should get same hash for same data + ref := Reference{owner: 2, id: u32(i)} + id := ds.store(value, ref)! + assert id == ids[i] // Should get same id for same data + } +} + +fn test_references() ! { + mut ds := new( + path: '/tmp/dedupestor_test_refs' + reset: true + )! + + // Store same data with different references + value := 'test data'.bytes() + ref1 := Reference{owner: 1, id: 1} + ref2 := Reference{owner: 1, id: 2} + ref3 := Reference{owner: 2, id: 1} + + // Store with first reference + id := ds.store(value, ref1)! + + // Store same data with second reference + id2 := ds.store(value, ref2)! + assert id == id2 // Same id for same data + + // Store same data with third reference + id3 := ds.store(value, ref3)! + assert id == id3 // Same id for same data + + // Delete first reference - data should still exist + ds.delete(id, ref1)! + assert ds.id_exists(id) == true + + // Delete second reference - data should still exist + ds.delete(id, ref2)! + assert ds.id_exists(id) == true + + // Delete last reference - data should be gone + ds.delete(id, ref3)! 
+ assert ds.id_exists(id) == false + + // Verify data is actually deleted by trying to get it + if _ := ds.get(id) { + assert false, 'Expected error getting deleted data' } } diff --git a/lib/data/dedupestor/metadata.v b/lib/data/dedupestor/metadata.v new file mode 100644 index 00000000..df72075b --- /dev/null +++ b/lib/data/dedupestor/metadata.v @@ -0,0 +1,109 @@ +module dedupestor + +// Metadata represents a stored value with its ID and references +pub struct Metadata { +pub: + id u32 +pub mut: + references []Reference +} + +// Reference represents a reference to stored data +pub struct Reference { +pub: + owner u16 + id u32 +} + +// to_bytes converts Metadata to bytes for storage: 4-byte little-endian id followed by 6 bytes per reference +pub fn (m Metadata) to_bytes() []u8 { + mut bytes := u32_to_bytes(m.id) + for ref in m.references { + bytes << ref.to_bytes() + } + return bytes +} + +// bytes_to_metadata converts bytes back to Metadata; input shorter than 4 bytes yields id 0 with no references, and a trailing partial reference is skipped +pub fn bytes_to_metadata(b []u8) Metadata { + if b.len < 4 { + return Metadata{ + id: 0 + references: []Reference{} + } + } + + id := bytes_to_u32(b[0..4]) + mut refs := []Reference{} + + // Parse references (each reference is 6 bytes) + mut i := 4 + for i < b.len { + if i + 6 <= b.len { + refs << bytes_to_reference(b[i..i+6]) + } + i += 6 + } + + return Metadata{ + id: id + references: refs + } +} + +// add_reference adds a new reference if it doesn't already exist +pub fn (mut m Metadata) add_reference(ref Reference) !Metadata { + // Check if reference already exists + for existing in m.references { + if existing.owner == ref.owner && existing.id == ref.id { + return m + } + } + + m.references << ref + return m +} + +// remove_reference removes a reference if it exists +pub fn (mut m Metadata) remove_reference(ref Reference) !Metadata { + mut new_refs := []Reference{} + for existing in m.references { + if existing.owner != ref.owner || existing.id != ref.id { + new_refs << existing + } + } + m.references = new_refs + return m +} + +// to_bytes converts Reference to 6 bytes: little-endian owner (2 bytes) then little-endian id (4 bytes) +pub fn (r 
Reference) to_bytes() []u8 { + mut bytes := []u8{len: 6} + bytes[0] = u8(r.owner) + bytes[1] = u8(r.owner >> 8) + bytes[2] = u8(r.id) + bytes[3] = u8(r.id >> 8) + bytes[4] = u8(r.id >> 16) + bytes[5] = u8(r.id >> 24) + return bytes +} + +// bytes_to_reference converts bytes to Reference; reads b[0] through b[5], so b must hold at least 6 bytes +pub fn bytes_to_reference(b []u8) Reference { + owner := u16(b[0]) | (u16(b[1]) << 8) + id := u32(b[2]) | (u32(b[3]) << 8) | (u32(b[4]) << 16) | (u32(b[5]) << 24) + return Reference{ + owner: owner + id: id + } +} + +// Helper function to convert u32 to []u8 (4 bytes, little-endian) +fn u32_to_bytes(n u32) []u8 { + return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)] +} + +// Helper function to convert []u8 to u32 (little-endian); reads b[0] through b[3], so b must hold at least 4 bytes +fn bytes_to_u32(b []u8) u32 { + return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24) +} diff --git a/lib/data/dedupestor/metadata_test.v b/lib/data/dedupestor/metadata_test.v new file mode 100644 index 00000000..9ebb3956 --- /dev/null +++ b/lib/data/dedupestor/metadata_test.v @@ -0,0 +1,103 @@ +module dedupestor + +fn test_reference_bytes_conversion() { + ref := Reference{ + owner: 12345 + id: 67890 + } + + bytes := ref.to_bytes() + recovered := bytes_to_reference(bytes) + + assert ref.owner == recovered.owner + assert ref.id == recovered.id +} + +fn test_metadata_bytes_conversion() { + mut metadata := Metadata{ + id: 42 + references: []Reference{} + } + + ref1 := Reference{owner: 1, id: 100} + ref2 := Reference{owner: 2, id: 200} + + metadata = metadata.add_reference(ref1)! + metadata = metadata.add_reference(ref2)! 
+ + bytes := metadata.to_bytes() + recovered := bytes_to_metadata(bytes) + + assert metadata.id == recovered.id + assert metadata.references.len == recovered.references.len + assert metadata.references[0].owner == recovered.references[0].owner + assert metadata.references[0].id == recovered.references[0].id + assert metadata.references[1].owner == recovered.references[1].owner + assert metadata.references[1].id == recovered.references[1].id +} + +fn test_add_reference() { + mut metadata := Metadata{ + id: 1 + references: []Reference{} + } + + ref1 := Reference{owner: 1, id: 100} + ref2 := Reference{owner: 2, id: 200} + + // Add first reference + metadata = metadata.add_reference(ref1)! + assert metadata.references.len == 1 + assert metadata.references[0].owner == ref1.owner + assert metadata.references[0].id == ref1.id + + // Add second reference + metadata = metadata.add_reference(ref2)! + assert metadata.references.len == 2 + assert metadata.references[1].owner == ref2.owner + assert metadata.references[1].id == ref2.id + + // Try adding duplicate reference + metadata = metadata.add_reference(ref1)! + assert metadata.references.len == 2 // Length shouldn't change +} + +fn test_remove_reference() { + mut metadata := Metadata{ + id: 1 + references: []Reference{} + } + + ref1 := Reference{owner: 1, id: 100} + ref2 := Reference{owner: 2, id: 200} + + metadata = metadata.add_reference(ref1)! + metadata = metadata.add_reference(ref2)! + + // Remove first reference + metadata = metadata.remove_reference(ref1)! + assert metadata.references.len == 1 + assert metadata.references[0].owner == ref2.owner + assert metadata.references[0].id == ref2.id + + // Remove non-existent reference + metadata = metadata.remove_reference(Reference{owner: 999, id: 999})! + assert metadata.references.len == 1 // Length shouldn't change + + // Remove last reference + metadata = metadata.remove_reference(ref2)! 
+ assert metadata.references.len == 0 +} + +fn test_empty_metadata_bytes() { + empty := bytes_to_metadata([]u8{}) + assert empty.id == 0 + assert empty.references.len == 0 +} + +fn test_u32_bytes_conversion() { + n := u32(0x12345678) + bytes := u32_to_bytes(n) + recovered := bytes_to_u32(bytes) + assert n == recovered +}