implement better reference tracking for deduped files

This commit is contained in:
timurgordon
2025-02-26 22:24:40 +03:00
parent a798b2347f
commit 972bb9f755
4 changed files with 337 additions and 47 deletions

View File

@@ -42,58 +42,87 @@ pub fn new(args NewArgs) !&DedupeStore {
}
}
// store stores a value and returns its hash
// If the value already exists (same hash), returns the existing hash without storing again
pub fn (mut ds DedupeStore) store(value []u8) !string {
// store stores data with its reference and returns its id
// If the data already exists (same hash), returns the existing id without storing again
// appends reference to the radix tree entry of the hash to track references
pub fn (mut ds DedupeStore) store(data []u8, ref Reference) !u32 {
// Check size limit
if value.len > max_value_size {
if data.len > max_value_size {
return error('value size exceeds maximum allowed size of 1MB')
}
// Calculate blake160 hash of the value
hash := blake2b.sum160(value).hex()
hash := blake2b.sum160(data).hex()
// Check if this hash already exists
if _ := ds.radix.search(hash) {
// Value already exists, return the hash
return hash
if metadata_bytes := ds.radix.search(hash) {
// Value already exists, add new ref & return the id
mut metadata := bytes_to_metadata(metadata_bytes)
metadata = metadata.add_reference(ref)!
ds.radix.update(hash, metadata.to_bytes())!
return metadata.id
}
// Store the actual data in ourdb
id := ds.data.set(data: value)!
// Convert id to bytes for storage in radixtree
id_bytes := u32_to_bytes(id)
id := ds.data.set(data: data)!
metadata := Metadata{
id: id
references: [ref]
}
// Store the mapping of hash -> id in radixtree
ds.radix.insert(hash, id_bytes)!
ds.radix.insert(hash, metadata.to_bytes())!
return hash
return metadata.id
}
// get retrieves a value by its hash
pub fn (mut ds DedupeStore) get(hash string) ![]u8 {
// Get the ID from radixtree
id_bytes := ds.radix.search(hash)!
// Convert bytes back to u32 id
id := bytes_to_u32(id_bytes)
// Get the actual data from ourdb
// get returns the data stored in the data db under the given id
// Any error from the underlying lookup is propagated to the caller
pub fn (mut ds DedupeStore) get(id u32) ![]u8 {
	stored := ds.data.get(id)!
	return stored
}
// get_from_hash retrieves a value by its hash
// The hash is looked up in the radix tree, the stored bytes are decoded into
// Metadata, and the id found there is used to read the data from ourdb
// Fails if the hash is unknown or the data cannot be read
pub fn (mut ds DedupeStore) get_from_hash(hash string) ![]u8 {
	entry := bytes_to_metadata(ds.radix.search(hash)!)
	return ds.data.get(entry.id)!
}
// exists checks if a value with the given hash exists
pub fn (mut ds DedupeStore) exists(hash string) bool {
// id_exists reports whether data can be read from the data db for the given id
// Any failure of the lookup is treated as "does not exist"
pub fn (mut ds DedupeStore) id_exists(id u32) bool {
	ds.data.get(id) or { return false }
	return true
}
// hash_exists checks if an entry for the given hash exists in the radix tree
// Any failure of the lookup is treated as "does not exist"
pub fn (mut ds DedupeStore) hash_exists(hash string) bool {
	ds.radix.search(hash) or { return false }
	return true
}
// Helper function to convert u32 to []u8
fn u32_to_bytes(n u32) []u8 {
return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)]
}
// delete removes a reference from the hash entry
// If it's the last reference, removes the hash entry and its data
pub fn (mut ds DedupeStore) delete(id u32, ref Reference) ! {
// Calculate blake160 hash of the value
data := ds.data.get(id)!
hash := blake2b.sum160(data).hex()
// Helper function to convert []u8 to u32
fn bytes_to_u32(b []u8) u32 {
return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24)
// Get the current entry from radixtree
metadata_bytes := ds.radix.search(hash)!
mut metadata := bytes_to_metadata(metadata_bytes)
metadata = metadata.remove_reference(ref)!
if metadata.references.len == 0 {
// Delete from radixtree
ds.radix.delete(hash)!
// Delete from data db
ds.data.delete(id)!
return
}
// Update hash metadata
ds.radix.update(hash, metadata.to_bytes())!
}

View File

@@ -8,7 +8,8 @@ fn testsuite_begin() ! {
'/tmp/dedupestor_test',
'/tmp/dedupestor_test_size',
'/tmp/dedupestor_test_exists',
'/tmp/dedupestor_test_multiple'
'/tmp/dedupestor_test_multiple',
'/tmp/dedupestor_test_refs'
]
for dir in test_dirs {
@@ -27,18 +28,21 @@ fn test_basic_operations() ! {
// Test storing and retrieving data
value1 := 'test data 1'.bytes()
hash1 := ds.store(value1)!
ref1 := Reference{owner: 1, id: 1}
hash1 := ds.store(value1, ref1)!
retrieved1 := ds.get(hash1)!
assert retrieved1 == value1
// Test deduplication
hash2 := ds.store(value1)!
// Test deduplication with different reference
ref2 := Reference{owner: 1, id: 2}
hash2 := ds.store(value1, ref2)!
assert hash1 == hash2 // Should return same hash for same data
// Test different data gets different hash
value2 := 'test data 2'.bytes()
hash3 := ds.store(value2)!
ref3 := Reference{owner: 1, id: 3}
hash3 := ds.store(value2, ref3)!
assert hash1 != hash3 // Should be different hash for different data
retrieved2 := ds.get(hash3)!
@@ -53,13 +57,14 @@ fn test_size_limit() ! {
// Test data under size limit (1KB)
small_data := []u8{len: 1024, init: u8(index)}
small_hash := ds.store(small_data)!
ref := Reference{owner: 1, id: 1}
small_hash := ds.store(small_data, ref)!
retrieved := ds.get(small_hash)!
assert retrieved == small_data
// Test data over size limit (2MB)
large_data := []u8{len: 2 * 1024 * 1024, init: u8(index)}
if _ := ds.store(large_data) {
if _ := ds.store(large_data, ref) {
assert false, 'Expected error for data exceeding size limit'
}
}
@@ -71,10 +76,11 @@ fn test_exists() ! {
)!
value := 'test data'.bytes()
hash := ds.store(value)!
ref := Reference{owner: 1, id: 1}
hash := ds.store(value, ref)!
assert ds.exists(hash) == true
assert ds.exists('nonexistent') == false
assert ds.id_exists(hash) == true
assert ds.id_exists(u32(99)) == false
}
fn test_multiple_operations() ! {
@@ -85,24 +91,67 @@ fn test_multiple_operations() ! {
// Store multiple values
mut values := [][]u8{}
mut hashes := []string{}
mut ids := []u32{}
for i in 0..5 {
value := 'test data ${i}'.bytes()
values << value
hash := ds.store(value)!
hashes << hash
ref := Reference{owner: 1, id: u32(i)}
id := ds.store(value, ref)!
ids << id
}
// Verify all values can be retrieved
for i, hash in hashes {
retrieved := ds.get(hash)!
for i, id in ids {
retrieved := ds.get(id)!
assert retrieved == values[i]
}
// Test deduplication by storing same values again
for i, value in values {
hash := ds.store(value)!
assert hash == hashes[i] // Should get same hash for same data
ref := Reference{owner: 2, id: u32(i)}
id := ds.store(value, ref)!
assert id == ids[i] // Should get same hash for same data
}
}
// test_references checks that deduplicated data stays available until every
// reference attached to it has been deleted
fn test_references() ! {
	mut ds := new(
		path: '/tmp/dedupestor_test_refs'
		reset: true
	)!

	value := 'test data'.bytes()
	refs := [
		Reference{owner: 1, id: 1},
		Reference{owner: 1, id: 2},
		Reference{owner: 2, id: 1}
	]

	// The first store creates the entry; storing the same data again with a
	// different reference must hand back the same id
	id := ds.store(value, refs[0])!
	for other in refs[1..] {
		assert ds.store(value, other)! == id // Same id for same data
	}

	// Delete the references one by one: the data must still exist while at
	// least one reference is left and must be gone after the last delete
	for n, ref in refs {
		ds.delete(id, ref)!
		assert ds.id_exists(id) == (n < refs.len - 1)
	}

	// Verify data is actually deleted by trying to get it
	if _ := ds.get(id) {
		assert false, 'Expected error getting deleted data'
	}
}

View File

@@ -0,0 +1,109 @@
module dedupestor
// Metadata represents a stored value with its ID and references
// to_bytes serializes it as the 4-byte id followed by one 6-byte record per
// reference; bytes_to_metadata performs the reverse conversion
pub struct Metadata {
pub:
// id of the stored value (read-only after construction)
id u32
pub mut:
// references currently attached to the value; changed through
// add_reference and remove_reference
references []Reference
}
// Reference represents a reference to stored data
// Two references are treated as the same one when both owner and id match
// (this is the comparison add_reference and remove_reference perform)
pub struct Reference {
pub:
// owner part of the reference; serialized as 2 bytes
owner u16
// id part of the reference; serialized as 4 bytes
id u32
}
// to_bytes serializes Metadata for storage
// Layout: the id as 4 bytes, followed by the 6-byte encoding of every
// reference in the order they appear in m.references
pub fn (m Metadata) to_bytes() []u8 {
	mut out := []u8{cap: 4 + 6 * m.references.len}
	out << u32_to_bytes(m.id)
	for r in m.references {
		out << r.to_bytes()
	}
	return out
}
// bytes_to_metadata decodes the output of Metadata.to_bytes
// Input shorter than 4 bytes yields a Metadata with id 0 and no references
// After the 4-byte id, references are read in 6-byte records; trailing bytes
// that do not form a complete record are ignored
pub fn bytes_to_metadata(b []u8) Metadata {
	if b.len < 4 {
		return Metadata{
			id: 0
			references: []Reference{}
		}
	}

	mut parsed := []Reference{}
	// Stop as soon as fewer than 6 bytes remain: no further full record fits
	for offset := 4; offset + 6 <= b.len; offset += 6 {
		parsed << bytes_to_reference(b[offset..offset + 6])
	}

	return Metadata{
		id: bytes_to_u32(b[0..4])
		references: parsed
	}
}
// add_reference attaches ref to the metadata unless an equal reference
// (same owner and same id) is already present
// Returns the resulting Metadata; the receiver itself is updated as well
pub fn (mut m Metadata) add_reference(ref Reference) !Metadata {
	already_tracked := m.references.any(it.owner == ref.owner && it.id == ref.id)
	if !already_tracked {
		m.references << ref
	}
	return m
}
// remove_reference drops every reference equal to ref (same owner and same id)
// Removing a reference that is not present leaves the list unchanged
// Returns the resulting Metadata; the receiver itself is updated as well
pub fn (mut m Metadata) remove_reference(ref Reference) !Metadata {
	m.references = m.references.filter(!(it.owner == ref.owner && it.id == ref.id))
	return m
}
// to_bytes serializes a Reference into 6 bytes:
// owner in bytes 0-1 and id in bytes 2-5, least significant byte first
pub fn (r Reference) to_bytes() []u8 {
	return [u8(r.owner), u8(r.owner >> 8), u8(r.id), u8(r.id >> 8), u8(r.id >> 16),
		u8(r.id >> 24)]
}
// bytes_to_reference decodes the 6-byte encoding produced by Reference.to_bytes
// Bytes 0-1 hold owner and bytes 2-5 hold id, least significant byte first
// b is indexed directly up to b[5], so it must contain at least 6 bytes
pub fn bytes_to_reference(b []u8) Reference {
	return Reference{
		owner: u16(b[0]) | (u16(b[1]) << 8)
		id: u32(b[2]) | (u32(b[3]) << 8) | (u32(b[4]) << 16) | (u32(b[5]) << 24)
	}
}
// u32_to_bytes splits n into 4 bytes, least significant byte first
fn u32_to_bytes(n u32) []u8 {
	b0 := u8(n & 0xff)
	b1 := u8((n >> 8) & 0xff)
	b2 := u8((n >> 16) & 0xff)
	b3 := u8((n >> 24) & 0xff)
	return [b0, b1, b2, b3]
}
// bytes_to_u32 assembles a u32 from the first 4 bytes of b,
// least significant byte first (the inverse of u32_to_bytes)
// b is indexed directly up to b[3], so it must contain at least 4 bytes
fn bytes_to_u32(b []u8) u32 {
	low_half := u32(b[0]) | (u32(b[1]) << 8)
	high_half := u32(b[2]) | (u32(b[3]) << 8)
	return low_half | (high_half << 16)
}

View File

@@ -0,0 +1,103 @@
module dedupestor
// A Reference must survive a to_bytes / bytes_to_reference round trip
fn test_reference_bytes_conversion() {
	original := Reference{
		owner: 12345
		id: 67890
	}
	roundtrip := bytes_to_reference(original.to_bytes())

	assert original.owner == roundtrip.owner
	assert original.id == roundtrip.id
}
// Metadata with two references must survive a to_bytes / bytes_to_metadata
// round trip with id and every reference intact
fn test_metadata_bytes_conversion() {
	mut metadata := Metadata{
		id: 42
		references: []Reference{}
	}
	for r in [Reference{owner: 1, id: 100}, Reference{owner: 2, id: 200}] {
		metadata = metadata.add_reference(r)!
	}

	recovered := bytes_to_metadata(metadata.to_bytes())

	assert metadata.id == recovered.id
	assert metadata.references.len == recovered.references.len
	for i in 0 .. 2 {
		assert metadata.references[i].owner == recovered.references[i].owner
		assert metadata.references[i].id == recovered.references[i].id
	}
}
// add_reference must append new references in order and ignore duplicates
fn test_add_reference() {
	mut metadata := Metadata{
		id: 1
		references: []Reference{}
	}
	first := Reference{owner: 1, id: 100}
	second := Reference{owner: 2, id: 200}

	// The first reference lands at index 0
	metadata = metadata.add_reference(first)!
	assert metadata.references.len == 1
	assert metadata.references[0].owner == first.owner
	assert metadata.references[0].id == first.id

	// The second reference is appended behind it
	metadata = metadata.add_reference(second)!
	assert metadata.references.len == 2
	assert metadata.references[1].owner == second.owner
	assert metadata.references[1].id == second.id

	// Adding an already known reference must not grow the list
	metadata = metadata.add_reference(first)!
	assert metadata.references.len == 2 // Length shouldn't change
}
// remove_reference must drop only the matching reference and tolerate
// references that are not present
fn test_remove_reference() {
	mut metadata := Metadata{
		id: 1
		references: []Reference{}
	}
	first := Reference{owner: 1, id: 100}
	second := Reference{owner: 2, id: 200}
	metadata = metadata.add_reference(first)!
	metadata = metadata.add_reference(second)!

	// Dropping the first reference leaves only the second one
	metadata = metadata.remove_reference(first)!
	assert metadata.references.len == 1
	assert metadata.references[0].owner == second.owner
	assert metadata.references[0].id == second.id

	// An unknown reference is a no-op
	unknown := Reference{owner: 999, id: 999}
	metadata = metadata.remove_reference(unknown)!
	assert metadata.references.len == 1 // Length shouldn't change

	// Dropping the remaining reference empties the list
	metadata = metadata.remove_reference(second)!
	assert metadata.references.len == 0
}
// Decoding an empty byte slice must yield id 0 and no references
fn test_empty_metadata_bytes() {
	no_bytes := []u8{}
	parsed := bytes_to_metadata(no_bytes)
	assert parsed.id == 0
	assert parsed.references.len == 0
}
// A u32 must survive a u32_to_bytes / bytes_to_u32 round trip
fn test_u32_bytes_conversion() {
	value := u32(0x12345678)
	roundtrip := bytes_to_u32(u32_to_bytes(value))
	assert value == roundtrip
}