Browse Source

Untracking by contents

main
Unbewohnte 3 years ago
parent
commit
a6b847d37f
  1. 168
      src/broom.cpp
  2. 9
      src/broom.hpp
  3. 65
      src/entry.cpp
  4. 19
      src/entry.hpp

168
src/broom.cpp

@ -32,28 +32,8 @@ Broom::Broom(Options options) {
m_sweeping = options.sweeping; m_sweeping = options.sweeping;
}; };
Broom::~Broom() {}; Broom::~Broom() {
m_tracked_entries.clear();
// Print current statistics
void Broom::print_statistics() {
std::cout
<< "| sweeped " << m_sweeped_files << " files" << std::endl
<< "| with a total size of " << m_sweeped_size << " bytes" << std::endl
<< std::endl;
};
// returns amount of regular files in path, searching recursively.
// Throws an invalid_argument error in case path does not exist
uintmax_t amount_of_files(const std::filesystem::path path) {
if (!std::filesystem::exists(path)) {
throw std::invalid_argument("\"" + path.string() + "\"" + " does not exist !");
};
if (!std::filesystem::is_directory(path)) {
throw std::invalid_argument("\"" + path.string() + "\"" + " is not a directory");
};
return std::distance(std::filesystem::directory_iterator(path), std::filesystem::directory_iterator{});
}; };
// recursively track every file that lies in given path. Throws an invalid_argument // recursively track every file that lies in given path. Throws an invalid_argument
@ -90,7 +70,7 @@ void Broom::track(const std::filesystem::path path) {
auto tracking_time = std::chrono::high_resolution_clock::now(); auto tracking_time = std::chrono::high_resolution_clock::now();
std::cout std::cout
<< "Tracking took " << "[BENCHMARK] Tracking took "
<< std::chrono::duration_cast<std::chrono::milliseconds>(tracking_time - t0).count() << std::chrono::duration_cast<std::chrono::milliseconds>(tracking_time - t0).count()
<< " ms" << std::endl; << " ms" << std::endl;
}; };
@ -102,16 +82,16 @@ uintmax_t Broom::untrack_unique_sizes() {
// key: size, value: amount of occurrences // key: size, value: amount of occurrences
std::map<uintmax_t, uintmax_t> sizes_map; std::map<uintmax_t, uintmax_t> sizes_map;
for (Entry& entry : m_tracked_entries) { for (auto entry_iter = m_tracked_entries.begin(); entry_iter != m_tracked_entries.end(); entry_iter++) {
// check if size of this entry is already in the map // check if size of this entry is already in the map
// if yes --> increment occurrences counter // if yes --> increment occurrences counter
// if not --> add it to the map with a counter of 1 // if not --> add it to the map with a counter of 1
entry.get_size(); entry_iter->get_size();
auto iterator = sizes_map.find(entry.filesize); auto iterator = sizes_map.find(entry_iter->filesize);
if (iterator == sizes_map.end()) { if (iterator == sizes_map.end()) {
// there is no such size // there is no such size
sizes_map.insert({entry.filesize, 1}); sizes_map.insert({entry_iter->filesize, 1});
} else { } else {
// there is such size // there is such size
sizes_map[iterator->first]++; sizes_map[iterator->first]++;
@ -130,85 +110,105 @@ uintmax_t Broom::untrack_unique_sizes() {
// std::cout << "duplicate fsize: " << iter->first << " occurences: " << iter->second << std::endl; // std::cout << "duplicate fsize: " << iter->first << " occurences: " << iter->second << std::endl;
return false; return false;
})); }), m_tracked_entries.end());
// std::cout << "Size after untracking by size: " << m_tracked_entries.size() << std::endl;
return untracked; return untracked;
}; };
// removes entries with unique first and last 20 bytes. Returns amount of // removes entries with unique content-pieces. Returns amount of
// files that are no longer being tracked // files that are no longer being tracked
// uintmax_t Broom::untrack_unique_contents() { uintmax_t Broom::untrack_unique_contents() {
// // contents, occurences // contents, occurences
// std::map<char[CHECKSUM_SIZE], uintmax_t> contents_map; std::map<std::string, uintmax_t> contents_map;
// std::map<char[CHECKSUM_SIZE], uintmax_t>::iterator iterator; std::map<std::string, uintmax_t>::iterator map_iter;
//
// for (Entry& entry : m_tracked_entries) { for (auto entry_iter = m_tracked_entries.begin(); entry_iter != m_tracked_entries.end();) {
// // the same logic: // the same logic:
// // check if contents of this entry is already in the map // check if contents of this entry is already in the map
// // if yes --> increment occurrences counter // if yes --> increment occurrences counter
// // if not --> add it to the map with a counter of 1 // if not --> add it to the map with a counter of 1
//
// iterator = contents_map.find(entry.checksum); try{
// // can get "permission denied" when opening file
// if (iterator == contents_map.end()) { entry_iter->get_pieces();
// // add it to the map } catch(const std::ifstream::failure& e) {
// contents_map.insert(std::pair<char[CHECKSUM_SIZE], uintmax_t>(entry.checksum, 1)); // there is nothing we can do. Untrack this entry
// } else { // std::cerr << e.what();
// // increment occurences counter entry_iter = m_tracked_entries.erase(entry_iter);
// uintmax_t occurences = contents_map[iterator->first]; continue;
// contents_map[iterator->first] = occurences++; }
// };
// }; map_iter = contents_map.find(entry_iter->pieces);
// if (map_iter == contents_map.end()) {
// uintmax_t untracked = 0; // add it to the map
// for (std::pair<const char[CHECKSUM_SIZE], uintmax_t> contents_entry : contents_map) { contents_map.insert({entry_iter->pieces, 1});
// if (contents_entry.second > 1) { // std::cout << "First time seeing this piece: " << entry_iter->pieces << std::endl;
// // not a unique size. Keep such entries } else {
// } else { // increment occurences counter
// // a unique one. Untrack such an entry contents_map[map_iter->first]++;
// std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [contents_entry](Entry e) -> bool { };
// return (e.compare_checksums(contents_entry.first));
// }); entry_iter++;
// untracked++; };
// };
// }; uintmax_t untracked = 0;
// m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](Entry entry) -> bool {
// return untracked; auto iter = contents_map.find(entry.pieces);
// }; if (iter->second == 1) {
// unique
untracked++;
return true;
} else {
return false;
}
}), m_tracked_entries.end());
return untracked;
};
// find all duplicates among tracked entries, stop tracking uniques // find all duplicates among tracked entries, stop tracking uniques
void Broom::find_duplicates() { void Broom::find_duplicates() {
if (m_benchmarking) { auto t0 = std::chrono::high_resolution_clock::now();
auto t0 = std::chrono::high_resolution_clock::now();
// print how many files are being tracked
uintmax_t global_untracked = m_tracked_entries.size();
std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl;
untrack_unique_sizes(); // untrack by size
uintmax_t untracked_by_size = untrack_unique_sizes();
global_untracked += untracked_by_size;
std::cout << "[INFO] Untracked " << untracked_by_size << " unique sizes" << std::endl;
auto sizes_untrack_time = std::chrono::high_resolution_clock::now(); auto sizes_untrack_time = std::chrono::high_resolution_clock::now();
if (m_benchmarking) {
std::cout std::cout
<< "Untracking by size took " << "[BENCHMARK] Untracking by size took "
<< std::chrono::duration_cast<std::chrono::milliseconds>(sizes_untrack_time - t0).count() << std::chrono::duration_cast<std::chrono::milliseconds>(sizes_untrack_time - t0).count()
<< " ms" << std::endl; << " ms" << std::endl;
} else { };
size_t startsize = m_tracked_entries.size();
std::cout << "Tracking " << startsize << std::endl;
uintmax_t global_untracked = 0; // untrack by contents
uintmax_t untracked_by_contents = untrack_unique_contents();
global_untracked += untracked_by_contents;
// uintmax_t untracked_by_contents = untrack_unique_contents(); auto contents_untrack_time = std::chrono::high_resolution_clock::now();
// global_untracked += untracked_by_contents;
// std::cout << "Untracked " << untracked_by_contents << " unique contents" << std::endl;
if (m_benchmarking) {
std::cout
<< "[BENCHMARK] Untracking by contents took "
<< std::chrono::duration_cast<std::chrono::milliseconds>(contents_untrack_time - sizes_untrack_time).count()
<< " ms" << std::endl;
};
uintmax_t untracked_by_size = untrack_unique_sizes(); std::cout << "[INFO] Untracked " << untracked_by_contents << " unique contents" << std::endl;
global_untracked += untracked_by_size;
std::cout << "Untracked " << untracked_by_size << " unique sizes" << std::endl;
std::cout << "[INFO] Duplicates: " << m_tracked_entries.size() << std::endl;
std::cout << "Duplicates: " << startsize - global_untracked << std::endl;
};
}; };
// remove ALL duplicate files // remove ALL duplicate files

9
src/broom.hpp

@ -48,13 +48,6 @@ public:
Broom(Options options); Broom(Options options);
~Broom(); ~Broom();
// Print current statistics
void print_statistics();
// returns amount of regular files in path, searching recursively.
// Throws an invalid_argument error in case path does not exist
uintmax_t amount_of_files(const std::filesystem::path path);
// recursively track every file that lies in given path. Throws an invalid_argument // recursively track every file that lies in given path. Throws an invalid_argument
// error in case path does not exist // error in case path does not exist
void track(const std::filesystem::path path); void track(const std::filesystem::path path);
@ -66,7 +59,7 @@ public:
// that are no longer being tracked // that are no longer being tracked
uintmax_t untrack_unique_sizes(); uintmax_t untrack_unique_sizes();
// removes entries with unique first and last 20 bytes. Returns amount of // removes entries with unique content-pieces. Returns amount of
// files that are no longer being tracked // files that are no longer being tracked
uintmax_t untrack_unique_contents(); uintmax_t untrack_unique_contents();

65
src/entry.cpp

@ -19,6 +19,11 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include "entry.hpp" #include "entry.hpp"
#include <iostream>
#include <cerrno>
#include <cstring>
// A wrapper for every file with all necessary information // A wrapper for every file with all necessary information
Entry::Entry(const std::filesystem::path entry_path) { Entry::Entry(const std::filesystem::path entry_path) {
// path // path
@ -29,54 +34,50 @@ Entry::~Entry() {};
// sets this entry`s filesize // sets this entry`s filesize
void Entry::get_size() { void Entry::get_size() {
// filesize
filesize = std::filesystem::file_size(path); filesize = std::filesystem::file_size(path);
}; };
// calculates and sets this entry`s checksum // reads 2 pieces from the beginning and the end of a file, converts them into
void Entry::get_checksum() { // a convenient hex-encoded string
// checksum void Entry::get_pieces() {
std::fstream entry_file; std::fstream entry_file;
entry_file.open(path); entry_file.open(path);
if (!entry_file.is_open()) { if (!entry_file.is_open()) {
throw std::ifstream::failure("Could not open \"" + path.filename().string() + "\""); throw std::ifstream::failure("Could not open \"" + path.string() + "\"; reason: " + std::string(std::strerror(errno)) + "\n");
} }
// TODO(Properly test it) // TODO(Properly test it)
if (filesize <= CHECKSUM_SIZE) { char pieces_buffer[PIECE_SIZE * 2];
entry_file.read(checksum, CHECKSUM_SIZE); if (filesize <= PIECE_SIZE * 2) {
// can`t take whole 2 pieces !
// read the whole file then
entry_file.read(pieces_buffer, filesize);
} else { } else {
char start_buf[CHUNK_SIZE]; // read PIECE_SIZE bytes from the beginning of the file
entry_file.read(start_buf, CHUNK_SIZE); char start_buf[PIECE_SIZE];
entry_file.read(start_buf, PIECE_SIZE);
entry_file.seekg(CHUNK_SIZE, std::ios::end); for (uint8_t i = 0; i < PIECE_SIZE; i++) {
char end_buf[CHUNK_SIZE]; pieces_buffer[i] = start_buf[i];
entry_file.read(end_buf, CHUNK_SIZE);
for (uint8_t i = 0; i < CHECKSUM_SIZE; i++) {
if (i < CHUNK_SIZE) {
checksum[i] = start_buf[i];
}
else if (i > CHUNK_SIZE) {
checksum[i] = end_buf[i - CHUNK_SIZE];
};
}; };
};
// jump to the last PIECE_SIZE bytes of the file and read them as well
entry_file.seekg(PIECE_SIZE, std::ios::end);
char end_buf[PIECE_SIZE];
entry_file.read(end_buf, PIECE_SIZE);
for (uint8_t i = PIECE_SIZE; i < PIECE_SIZE * 2; i++) {
pieces_buffer[i] = end_buf[i - PIECE_SIZE];
};
};
entry_file.close(); entry_file.close();
};
// Compare this entry`s checksum with the other one. // make a convenient hex string out of pure bytes
// If the checksums are the same -> returns true, else -> false std::stringstream pieces_hex;
bool Entry::compare_checksums(const char other_checksum[CHECKSUM_SIZE]) { for (uint8_t i = 0; i < PIECE_SIZE * 2; i++) {
for (uint8_t i = 0; i < CHECKSUM_SIZE; i++) { pieces_hex << std::hex << static_cast<unsigned>(pieces_buffer[i]);
if (checksum[i] != other_checksum[i]) {
return false;
};
}; };
return true;
pieces = pieces_hex.str();
}; };
// Remove entry from the disk // Remove entry from the disk

19
src/entry.hpp

@ -22,18 +22,18 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <filesystem> #include <filesystem>
#include <fstream> #include <fstream>
#include <sstream>
#include <iomanip>
// 2 chunks (beginning and end of the file) // 2 pieces (beginning and end of the file)
const uint8_t CHUNK_SIZE = 24; const uint8_t PIECE_SIZE = 24;
const uint8_t CHECKSUM_SIZE = CHUNK_SIZE * 2;
// A wrapper for every file with all necessary information // A wrapper for every file with all necessary information
class Entry { class Entry {
public: public:
std::filesystem::path path; std::filesystem::path path;
uintmax_t filesize; uintmax_t filesize;
char checksum[CHECKSUM_SIZE]; std::string pieces; // 2 hex-represented pieces of file (beginning and end)
Entry(const std::filesystem::path entry_path); Entry(const std::filesystem::path entry_path);
~Entry(); ~Entry();
@ -41,12 +41,9 @@ public:
// sets this entry`s filesize // sets this entry`s filesize
void get_size(); void get_size();
// calculates and sets this entry`s checksum // reads 2 pieces from the beginning and the end of a file, converts them into
void get_checksum(); // a convenient hex-encoded string
void get_pieces();
// Compare this entry`s checksum with the other one.
// If the checksums are the same -> returns true, else -> false
bool compare_checksums(const char other_checksum[CHECKSUM_SIZE]);
// REMOVE entry from the disk // REMOVE entry from the disk
void remove(); void remove();

Loading…
Cancel
Save