Store duplicated data just once

Cyryl Płotnicki 2019-09-23 17:04:23 +01:00
parent c0aa4ed22d
commit 62a0e0981a
3 changed files with 49 additions and 3 deletions
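Summary of the change: backed-up items now live in a dedicated data/ subdirectory of the repository and are named by their content-derived ItemId, so two source files with identical contents resolve to the same destination path and occupy disk space only once. A new Repository::data_weight helper reports the total on-disk size of that directory, which lets the new test assert that backing up a duplicate adds no bytes.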

View file

@@ -1,4 +1,4 @@
-use std::path::Path;
+use std::path::{Path, PathBuf};
 use std::{fmt, fs, io};
 
 use crate::error::BakareError;
@@ -10,6 +10,7 @@ use sha2::Sha512;
 use std::fmt::Formatter;
 use std::fs::File;
 use std::io::BufReader;
+use walkdir::WalkDir;
 
 /// represents a place where backup is stored and can be restored from.
 /// right now only on-disk directory storage is supported
@@ -21,6 +22,8 @@ pub struct Repository<'a> {
     index: Index,
 }
 
+const DATA_DIR_NAME: &str = "data";
+
 #[derive(Clone, Debug, PartialOrd, PartialEq, Ord, Eq, Serialize, Deserialize, Hash)]
 pub struct ItemId(Box<[u8]>);
@@ -95,7 +98,8 @@ impl<'a> Repository<'a> {
             return Err(BakareError::PathToStoreNotAbsolute);
         }
         let id = Repository::calculate_id(source_path)?;
-        let destination_path = self.path.join(id.to_string());
+        let destination_path = self.data_dir();
+        let destination_path = destination_path.join(id.to_string());
         let destination_path = Path::new(&destination_path);
 
         if source_path.is_file() {
@@ -126,6 +130,20 @@ impl<'a> Repository<'a> {
         }
     }
 
+    pub fn data_weight(&self) -> Result<u64, BakareError> {
+        let total_size = WalkDir::new(self.data_dir())
+            .into_iter()
+            .filter_map(|entry| entry.ok())
+            .filter_map(|entry| entry.metadata().ok())
+            .filter(|metadata| metadata.is_file())
+            .fold(0, |acc, m| acc + m.len());
+        Ok(total_size)
+    }
+
+    fn data_dir(&self) -> PathBuf {
+        self.path().join(DATA_DIR_NAME)
+    }
+
     fn calculate_id(source_path: &Path) -> Result<ItemId, BakareError> {
         let source_file = File::open(source_path)?;
         let mut reader = BufReader::new(source_file);
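The dedup hinges on calculate_id: the imports show it hashing the file through a BufReader with Sha512, but its body sits outside this diff. Below is a minimal sketch of the idea, assuming the current sha2 Digest API and a hex-encoded digest used as the file name; the destination_for helper and the encoding are illustrative assumptions, not necessarily bakare's exact code.

    use std::fs::File;
    use std::io::{self, BufReader, Read};
    use std::path::{Path, PathBuf};

    use sha2::{Digest, Sha512};

    /// Derive a storage path for a file from a SHA-512 digest of its
    /// contents (hypothetical helper, not bakare's actual code).
    /// Identical contents yield identical digests, so duplicates map
    /// to the same destination and are stored only once.
    fn destination_for(data_dir: &Path, source_path: &Path) -> io::Result<PathBuf> {
        let mut reader = BufReader::new(File::open(source_path)?);
        let mut hasher = Sha512::new();
        let mut buffer = [0u8; 8192];
        loop {
            let read = reader.read(&mut buffer)?;
            if read == 0 {
                break;
            }
            hasher.update(&buffer[..read]);
        }
        // Hex-encode the digest so it can be used as a plain file name.
        let name: String = hasher.finalize().iter().map(|b| format!("{:02x}", b)).collect();
        Ok(data_dir.join(name))
    }

Because the name depends only on the bytes read, storing a second file with identical contents computes the same path, so the second write lands on the existing copy instead of consuming new space.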

View file

@@ -97,6 +97,13 @@ pub fn backup_file_with_contents(
     }
 }
+
+pub fn data_weight(repository_path: &Path) -> Result<u64, BakareError> {
+    {
+        let repository = Repository::open(repository_path)?;
+        Ok(repository.data_weight()?)
+    }
+}
 
 fn assert_directory_trees_have_same_contents(left: &Path, right: &Path) -> Result<(), BakareError> {
     let left_files = get_sorted_files_recursively(left)?;
     let right_files = get_sorted_files_recursively(right)?;
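One caveat about Repository::data_weight as implemented above: the filter_map(|entry| entry.ok()) steps silently drop entries that fail to be read or stat'ed, so the reported weight is a lower bound when the walk hits I/O errors. That is harmless for tests against a fresh tempdir; a stricter variant (a sketch, not part of this commit; strict_data_weight is a hypothetical name) would propagate the first error instead:

    use std::path::Path;

    use walkdir::WalkDir;

    /// Sum the sizes of all regular files under `dir`, propagating the
    /// first I/O error instead of silently skipping unreadable entries.
    fn strict_data_weight(dir: &Path) -> Result<u64, walkdir::Error> {
        let mut total = 0;
        for entry in WalkDir::new(dir) {
            let entry = entry?;
            let metadata = entry.metadata()?;
            if metadata.is_file() {
                total += metadata.len();
            }
        }
        Ok(total)
    }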

View file

@@ -80,6 +80,28 @@ fn newer_version_should_be_greater_than_earlier_version() -> Result<(), BakareError> {
     Ok(())
 }
 
+#[test]
+fn store_duplicated_files_just_once() -> Result<(), BakareError> {
+    let source = TempSource::new()?;
+    let repository_path = &tempdir()?.into_path();
+    Repository::init(repository_path)?;
+    assert_eq!(data_weight(&repository_path)?, 0);
+
+    let contents = "some contents";
+    backup_file_with_contents(&source, &repository_path, "1", contents)?;
+    let first_weight = data_weight(&repository_path)?;
+    assert!(first_weight > 0);
+
+    backup_file_with_contents(&source, &repository_path, "2", contents)?;
+    let second_weight = data_weight(&repository_path)?;
+    assert_eq!(first_weight, second_weight);
+
+    assert_restored_has_contents(repository_path, &source.file_path("1"), contents)?;
+    assert_restored_has_contents(repository_path, &source.file_path("2"), contents)?;
+
+    Ok(())
+}
+
 #[test]
 fn restore_latest_version_by_default() -> Result<(), BakareError> {
     let source = TempSource::new()?;
@@ -110,6 +132,5 @@ fn forbid_backup_of_paths_within_repository() -> Result<(), BakareError> {
 }
 
 // TODO: test concurrent writes
-// TODO: deduplicate data
 // TODO: test that index is stored separately from data
 // TODO: index corruption