implement better reference tracking for deduped files
This commit is contained in:
@@ -42,58 +42,87 @@ pub fn new(args NewArgs) !&DedupeStore {
|
||||
}
|
||||
}
|
||||
|
||||
// store stores a value and returns its hash
|
||||
// If the value already exists (same hash), returns the existing hash without storing again
|
||||
pub fn (mut ds DedupeStore) store(value []u8) !string {
|
||||
// store stores data with its reference and returns its id
|
||||
// If the data already exists (same hash), returns the existing id without storing again
|
||||
// appends reference to the radix tree entry of the hash to track references
|
||||
pub fn (mut ds DedupeStore) store(data []u8, ref Reference) !u32 {
|
||||
// Check size limit
|
||||
if value.len > max_value_size {
|
||||
if data.len > max_value_size {
|
||||
return error('value size exceeds maximum allowed size of 1MB')
|
||||
}
|
||||
|
||||
// Calculate blake160 hash of the value
|
||||
hash := blake2b.sum160(value).hex()
|
||||
hash := blake2b.sum160(data).hex()
|
||||
|
||||
// Check if this hash already exists
|
||||
if _ := ds.radix.search(hash) {
|
||||
// Value already exists, return the hash
|
||||
return hash
|
||||
if metadata_bytes := ds.radix.search(hash) {
|
||||
// Value already exists, add new ref & return the id
|
||||
mut metadata := bytes_to_metadata(metadata_bytes)
|
||||
metadata = metadata.add_reference(ref)!
|
||||
ds.radix.update(hash, metadata.to_bytes())!
|
||||
return metadata.id
|
||||
}
|
||||
|
||||
// Store the actual data in ourdb
|
||||
id := ds.data.set(data: value)!
|
||||
|
||||
// Convert id to bytes for storage in radixtree
|
||||
id_bytes := u32_to_bytes(id)
|
||||
id := ds.data.set(data: data)!
|
||||
metadata := Metadata{
|
||||
id: id
|
||||
references: [ref]
|
||||
}
|
||||
|
||||
// Store the mapping of hash -> id in radixtree
|
||||
ds.radix.insert(hash, id_bytes)!
|
||||
ds.radix.insert(hash, metadata.to_bytes())!
|
||||
|
||||
return hash
|
||||
return metadata.id
|
||||
}
|
||||
|
||||
// get retrieves a value by its hash
|
||||
pub fn (mut ds DedupeStore) get(hash string) ![]u8 {
|
||||
// Get the ID from radixtree
|
||||
id_bytes := ds.radix.search(hash)!
|
||||
|
||||
// Convert bytes back to u32 id
|
||||
id := bytes_to_u32(id_bytes)
|
||||
|
||||
// Get the actual data from ourdb
|
||||
// get retrieves the stored data for the given id.
// Any error from the underlying data store lookup is propagated to the caller.
pub fn (mut ds DedupeStore) get(id u32) ![]u8 {
	stored := ds.data.get(id)!
	return stored
}
|
||||
|
||||
// get_from_hash retrieves stored data by its hash.
// The hash is looked up in the radix tree to obtain the metadata entry, and the
// id recorded in that entry is used to fetch the data itself.
// An error from either lookup is propagated to the caller.
pub fn (mut ds DedupeStore) get_from_hash(hash string) ![]u8 {
	// Radix tree entry for this hash, decoded into id + references
	entry := bytes_to_metadata(ds.radix.search(hash)!)

	// Fetch the actual data by the id recorded in the entry
	return ds.data.get(entry.id)!
}
|
||||
|
||||
// exists checks if a value with the given hash exists
|
||||
pub fn (mut ds DedupeStore) exists(hash string) bool {
|
||||
// id_exists reports whether data can be fetched for the given id.
// Any error from the data lookup is treated as "does not exist".
pub fn (mut ds DedupeStore) id_exists(id u32) bool {
	_ := ds.data.get(id) or { return false }
	return true
}
|
||||
|
||||
// hash_exists reports whether the radix tree holds an entry for the given hash.
// Any error from the radix tree search is treated as "does not exist".
pub fn (mut ds DedupeStore) hash_exists(hash string) bool {
	_ := ds.radix.search(hash) or { return false }
	return true
}
|
||||
|
||||
// Helper function to convert u32 to []u8
|
||||
fn u32_to_bytes(n u32) []u8 {
|
||||
return [u8(n), u8(n >> 8), u8(n >> 16), u8(n >> 24)]
|
||||
}
|
||||
// delete removes a reference from the hash entry
|
||||
// If it's the last reference, removes the hash entry and its data
|
||||
pub fn (mut ds DedupeStore) delete(id u32, ref Reference) ! {
|
||||
// Calculate blake160 hash of the value
|
||||
data := ds.data.get(id)!
|
||||
hash := blake2b.sum160(data).hex()
|
||||
|
||||
// Helper function to convert []u8 to u32
|
||||
fn bytes_to_u32(b []u8) u32 {
|
||||
return u32(b[0]) | (u32(b[1]) << 8) | (u32(b[2]) << 16) | (u32(b[3]) << 24)
|
||||
// Get the current entry from radixtree
|
||||
metadata_bytes := ds.radix.search(hash)!
|
||||
mut metadata := bytes_to_metadata(metadata_bytes)
|
||||
metadata = metadata.remove_reference(ref)!
|
||||
|
||||
if metadata.references.len == 0 {
|
||||
// Delete from radixtree
|
||||
ds.radix.delete(hash)!
|
||||
// Delete from data db
|
||||
ds.data.delete(id)!
|
||||
return
|
||||
}
|
||||
|
||||
// Update hash metadata
|
||||
ds.radix.update(hash, metadata.to_bytes())!
|
||||
}
|
||||
|
||||
@@ -8,7 +8,8 @@ fn testsuite_begin() ! {
|
||||
'/tmp/dedupestor_test',
|
||||
'/tmp/dedupestor_test_size',
|
||||
'/tmp/dedupestor_test_exists',
|
||||
'/tmp/dedupestor_test_multiple'
|
||||
'/tmp/dedupestor_test_multiple',
|
||||
'/tmp/dedupestor_test_refs'
|
||||
]
|
||||
|
||||
for dir in test_dirs {
|
||||
@@ -27,18 +28,21 @@ fn test_basic_operations() ! {
|
||||
|
||||
// Test storing and retrieving data
|
||||
value1 := 'test data 1'.bytes()
|
||||
hash1 := ds.store(value1)!
|
||||
ref1 := Reference{owner: 1, id: 1}
|
||||
hash1 := ds.store(value1, ref1)!
|
||||
|
||||
retrieved1 := ds.get(hash1)!
|
||||
assert retrieved1 == value1
|
||||
|
||||
// Test deduplication
|
||||
hash2 := ds.store(value1)!
|
||||
// Test deduplication with different reference
|
||||
ref2 := Reference{owner: 1, id: 2}
|
||||
hash2 := ds.store(value1, ref2)!
|
||||
assert hash1 == hash2 // Should return same hash for same data
|
||||
|
||||
// Test different data gets different hash
|
||||
value2 := 'test data 2'.bytes()
|
||||
hash3 := ds.store(value2)!
|
||||
ref3 := Reference{owner: 1, id: 3}
|
||||
hash3 := ds.store(value2, ref3)!
|
||||
assert hash1 != hash3 // Should be different hash for different data
|
||||
|
||||
retrieved2 := ds.get(hash3)!
|
||||
@@ -53,13 +57,14 @@ fn test_size_limit() ! {
|
||||
|
||||
// Test data under size limit (1KB)
|
||||
small_data := []u8{len: 1024, init: u8(index)}
|
||||
small_hash := ds.store(small_data)!
|
||||
ref := Reference{owner: 1, id: 1}
|
||||
small_hash := ds.store(small_data, ref)!
|
||||
retrieved := ds.get(small_hash)!
|
||||
assert retrieved == small_data
|
||||
|
||||
// Test data over size limit (2MB)
|
||||
large_data := []u8{len: 2 * 1024 * 1024, init: u8(index)}
|
||||
if _ := ds.store(large_data) {
|
||||
if _ := ds.store(large_data, ref) {
|
||||
assert false, 'Expected error for data exceeding size limit'
|
||||
}
|
||||
}
|
||||
@@ -71,10 +76,11 @@ fn test_exists() ! {
|
||||
)!
|
||||
|
||||
value := 'test data'.bytes()
|
||||
hash := ds.store(value)!
|
||||
ref := Reference{owner: 1, id: 1}
|
||||
hash := ds.store(value, ref)!
|
||||
|
||||
assert ds.exists(hash) == true
|
||||
assert ds.exists('nonexistent') == false
|
||||
assert ds.id_exists(hash) == true
|
||||
assert ds.id_exists(u32(99)) == false
|
||||
}
|
||||
|
||||
fn test_multiple_operations() ! {
|
||||
@@ -85,24 +91,67 @@ fn test_multiple_operations() ! {
|
||||
|
||||
// Store multiple values
|
||||
mut values := [][]u8{}
|
||||
mut hashes := []string{}
|
||||
mut ids := []u32{}
|
||||
|
||||
for i in 0..5 {
|
||||
value := 'test data ${i}'.bytes()
|
||||
values << value
|
||||
hash := ds.store(value)!
|
||||
hashes << hash
|
||||
ref := Reference{owner: 1, id: u32(i)}
|
||||
id := ds.store(value, ref)!
|
||||
ids << id
|
||||
}
|
||||
|
||||
// Verify all values can be retrieved
|
||||
for i, hash in hashes {
|
||||
retrieved := ds.get(hash)!
|
||||
for i, id in ids {
|
||||
retrieved := ds.get(id)!
|
||||
assert retrieved == values[i]
|
||||
}
|
||||
|
||||
// Test deduplication by storing same values again
|
||||
for i, value in values {
|
||||
hash := ds.store(value)!
|
||||
assert hash == hashes[i] // Should get same hash for same data
|
||||
ref := Reference{owner: 2, id: u32(i)}
|
||||
id := ds.store(value, ref)!
|
||||
assert id == ids[i] // Should get same hash for same data
|
||||
}
|
||||
}
|
||||
|
||||
// test_references verifies that data stored under several references stays
// available until the last of those references has been deleted.
fn test_references() ! {
	mut ds := new(
		path:  '/tmp/dedupestor_test_refs'
		reset: true
	)!

	value := 'test data'.bytes()
	refs := [
		Reference{owner: 1, id: 1},
		Reference{owner: 1, id: 2},
		Reference{owner: 2, id: 1},
	]

	// The first store creates the entry
	id := ds.store(value, refs[0])!

	// Storing the same data under further references must yield the same id
	for extra in refs[1..] {
		assert ds.store(value, extra)! == id
	}

	// Deleting all but the last reference - data should still exist
	for idx in 0 .. refs.len - 1 {
		ds.delete(id, refs[idx])!
		assert ds.id_exists(id) == true
	}

	// Deleting the last reference - data should be gone
	ds.delete(id, refs[refs.len - 1])!
	assert ds.id_exists(id) == false

	// Verify data is actually deleted by trying to get it
	if _ := ds.get(id) {
		assert false, 'Expected error getting deleted data'
	}
}
|
||||
|
||||
109
lib/data/dedupestor/metadata.v
Normal file
109
lib/data/dedupestor/metadata.v
Normal file
@@ -0,0 +1,109 @@
|
||||
module dedupestor
|
||||
|
||||
// Metadata describes one stored value: the id it was stored under and the
// list of references that currently point at it.
// Serialized with to_bytes as the 4-byte id followed by one 6-byte record per reference.
pub struct Metadata {
pub:
	// id of the stored value
	id u32
pub mut:
	// references attached to the stored value; modified by add_reference / remove_reference
	references []Reference
}
|
||||
|
||||
// Reference identifies one user of stored data by an (owner, id) pair.
// Two references are treated as the same when both fields are equal.
pub struct Reference {
pub:
	// owner part of the reference (serialized as 2 bytes)
	owner u16
	// id part of the reference (serialized as 4 bytes)
	id u32
}
|
||||
|
||||
// to_bytes serializes Metadata for storage.
// Layout: the id as 4 bytes (via u32_to_bytes), followed by each reference's
// 6-byte encoding in list order.
pub fn (m Metadata) to_bytes() []u8 {
	mut encoded := []u8{cap: 4 + 6 * m.references.len}
	encoded << u32_to_bytes(m.id)
	for reference in m.references {
		encoded << reference.to_bytes()
	}
	return encoded
}
|
||||
|
||||
// bytes_to_metadata decodes the byte layout produced by Metadata.to_bytes.
// Input shorter than 4 bytes yields a Metadata with id 0 and no references.
// After the 4-byte id, every complete 6-byte group is decoded as a Reference;
// a trailing group shorter than 6 bytes is ignored.
pub fn bytes_to_metadata(b []u8) Metadata {
	// Not enough bytes to hold the id: return the empty record
	if b.len < 4 {
		return Metadata{
			id:         0
			references: []Reference{}
		}
	}

	mut parsed := []Reference{}

	// Walk the fixed-size 6-byte reference records that follow the id
	for offset := 4; offset + 6 <= b.len; offset += 6 {
		parsed << bytes_to_reference(b[offset..offset + 6])
	}

	return Metadata{
		id:         bytes_to_u32(b[0..4])
		references: parsed
	}
}
|
||||
|
||||
// add_reference appends ref to the reference list unless a reference with the
// same owner and id is already present, then returns the metadata.
// The visible code never returns an error; the result type is part of the signature.
pub fn (mut m Metadata) add_reference(ref Reference) !Metadata {
	already_tracked := m.references.any(it.owner == ref.owner && it.id == ref.id)
	if !already_tracked {
		m.references << ref
	}
	return m
}
|
||||
|
||||
// remove_reference drops every reference whose owner and id both match ref,
// then returns the metadata. A ref that is not present leaves the list unchanged.
// The visible code never returns an error; the result type is part of the signature.
pub fn (mut m Metadata) remove_reference(ref Reference) !Metadata {
	// Keep only the entries that do not match ref on both fields
	m.references = m.references.filter(!(it.owner == ref.owner && it.id == ref.id))
	return m
}
|
||||
|
||||
// to_bytes serializes a Reference into 6 bytes:
// owner as 2 bytes followed by id as 4 bytes, least significant byte first.
pub fn (r Reference) to_bytes() []u8 {
	return [
		u8(r.owner),
		u8(r.owner >> 8),
		u8(r.id),
		u8(r.id >> 8),
		u8(r.id >> 16),
		u8(r.id >> 24),
	]
}
|
||||
|
||||
// bytes_to_reference decodes the 6-byte layout produced by Reference.to_bytes:
// bytes 0-1 hold owner and bytes 2-5 hold id, least significant byte first.
// The slice is indexed directly without a length check, so callers must pass at least 6 bytes.
pub fn bytes_to_reference(b []u8) Reference {
	return Reference{
		owner: u16(b[0]) | (u16(b[1]) << 8)
		id:    u32(b[2]) | (u32(b[3]) << 8) | (u32(b[4]) << 16) | (u32(b[5]) << 24)
	}
}
|
||||
|
||||
// u32_to_bytes encodes n as 4 bytes, least significant byte first.
fn u32_to_bytes(n u32) []u8 {
	mut out := []u8{len: 4}
	out[0] = u8(n)
	out[1] = u8(n >> 8)
	out[2] = u8(n >> 16)
	out[3] = u8(n >> 24)
	return out
}
|
||||
|
||||
// bytes_to_u32 decodes the first 4 bytes of b as a u32, least significant byte first.
// The slice is indexed directly without a length check, so callers must pass at least 4 bytes.
fn bytes_to_u32(b []u8) u32 {
	byte0 := u32(b[0])
	byte1 := u32(b[1]) << 8
	byte2 := u32(b[2]) << 16
	byte3 := u32(b[3]) << 24
	return byte0 | byte1 | byte2 | byte3
}
|
||||
103
lib/data/dedupestor/metadata_test.v
Normal file
103
lib/data/dedupestor/metadata_test.v
Normal file
@@ -0,0 +1,103 @@
|
||||
module dedupestor
|
||||
|
||||
// Round-trips a Reference through to_bytes / bytes_to_reference and checks both fields survive.
fn test_reference_bytes_conversion() {
	original := Reference{
		owner: 12345
		id:    67890
	}

	decoded := bytes_to_reference(original.to_bytes())

	assert original.owner == decoded.owner
	assert original.id == decoded.id
}
|
||||
|
||||
// Round-trips a Metadata holding two references through to_bytes / bytes_to_metadata
// and checks the id, the reference count and each reference's fields.
fn test_metadata_bytes_conversion() {
	mut metadata := Metadata{
		id:         42
		references: []Reference{}
	}

	for ref in [Reference{owner: 1, id: 100}, Reference{owner: 2, id: 200}] {
		metadata = metadata.add_reference(ref)!
	}

	recovered := bytes_to_metadata(metadata.to_bytes())

	assert metadata.id == recovered.id
	assert metadata.references.len == recovered.references.len

	// Compare both references field by field
	for idx in 0 .. 2 {
		assert metadata.references[idx].owner == recovered.references[idx].owner
		assert metadata.references[idx].id == recovered.references[idx].id
	}
}
|
||||
|
||||
// Checks that add_reference appends new references in order and ignores duplicates.
fn test_add_reference() {
	first := Reference{owner: 1, id: 100}
	second := Reference{owner: 2, id: 200}
	mut metadata := Metadata{
		id:         1
		references: []Reference{}
	}

	// The first reference lands at index 0
	metadata = metadata.add_reference(first)!
	assert metadata.references.len == 1
	assert metadata.references[0].owner == first.owner
	assert metadata.references[0].id == first.id

	// The second reference is appended after it
	metadata = metadata.add_reference(second)!
	assert metadata.references.len == 2
	assert metadata.references[1].owner == second.owner
	assert metadata.references[1].id == second.id

	// Re-adding an existing reference must not grow the list
	metadata = metadata.add_reference(first)!
	assert metadata.references.len == 2
}
|
||||
|
||||
// Checks that remove_reference drops matching references and leaves the list
// untouched when asked to remove a reference that is not present.
fn test_remove_reference() {
	first := Reference{owner: 1, id: 100}
	second := Reference{owner: 2, id: 200}
	mut metadata := Metadata{
		id:         1
		references: []Reference{}
	}

	metadata = metadata.add_reference(first)!
	metadata = metadata.add_reference(second)!

	// Removing the first reference leaves only the second
	metadata = metadata.remove_reference(first)!
	assert metadata.references.len == 1
	assert metadata.references[0].owner == second.owner
	assert metadata.references[0].id == second.id

	// Removing an unknown reference changes nothing
	unknown := Reference{owner: 999, id: 999}
	metadata = metadata.remove_reference(unknown)!
	assert metadata.references.len == 1

	// Removing the remaining reference empties the list
	metadata = metadata.remove_reference(second)!
	assert metadata.references.len == 0
}
|
||||
|
||||
// Decoding an empty byte slice must yield id 0 and no references.
fn test_empty_metadata_bytes() {
	no_bytes := []u8{}
	decoded := bytes_to_metadata(no_bytes)

	assert decoded.id == 0
	assert decoded.references.len == 0
}
|
||||
|
||||
// Round-trips a u32 through u32_to_bytes / bytes_to_u32.
fn test_u32_bytes_conversion() {
	original := u32(0x12345678)
	round_tripped := bytes_to_u32(u32_to_bytes(original))

	assert original == round_tripped
}
|
||||
Reference in New Issue
Block a user