From 3f8dacd34cca7ec9d201771c20be4670da3e6b1d Mon Sep 17 00:00:00 2001 From: Unbewohnte Date: Mon, 27 Dec 2021 19:31:26 +0300 Subject: [PATCH] Moved Broom's options in the other file; benchmarking flag --- src/broom.cpp | 150 ++++++++++++++++++++++++++++++++++++++------------ src/broom.hpp | 22 ++++++-- src/entry.cpp | 11 +--- src/entry.hpp | 6 +- src/main.cpp | 32 ++++++----- 5 files changed, 153 insertions(+), 68 deletions(-) diff --git a/src/broom.cpp b/src/broom.cpp index 7ca9edb..557200f 100644 --- a/src/broom.cpp +++ b/src/broom.cpp @@ -20,11 +20,16 @@ along with broom. If not, see . #include #include #include +#include #include "entry.hpp" #include "broom.hpp" -Broom::Broom() {}; +Broom::Broom(Options options) { + m_benchmarking = options.benchmarking; + m_sweeping = options.sweeping; +}; + Broom::~Broom() {}; // Print current statistics @@ -35,32 +40,61 @@ void Broom::print_statistics() { << std::endl; }; +// get all entities from path recursively and track them +int Broom::track(std::filesystem::path dir) { + auto t0 = std::chrono::high_resolution_clock::now(); + + std::filesystem::directory_options options = ( + std::filesystem::directory_options::follow_directory_symlink | + std::filesystem::directory_options::skip_permission_denied + ); + + for (std::filesystem::directory_entry dir_entry : std::filesystem::recursive_directory_iterator(dir, options)) { + if (dir_entry.is_directory()) { + continue; + }; + + Entry entry(dir_entry.path()); + m_tracked_entries.push_back(entry); + }; + + if (m_benchmarking) { + auto tracking_time = std::chrono::high_resolution_clock::now(); + + std::cout + << "Tracking took " + << std::chrono::duration_cast(tracking_time - t0).count() + << " ms" << std::endl; + }; + return 0; +}; + // removes entries with unique file sizes. Returns amount of files // that are no longer being tracked uintmax_t Broom::untrack_unique_sizes() { // key: size, value: amount of occurences - std::map sizes; + std::map sizes_map; std::map::iterator iterator; for (Entry& entry : m_tracked_entries) { // check if size of this entry is already in the map // if yes --> increment occurences counter // if not --> add it to the map with a counter of 1 - iterator = sizes.find(entry.filesize); - if (iterator == sizes.end()) { + iterator = sizes_map.find(entry.filesize); + if (iterator == sizes_map.end()) { // there is no such size - sizes.insert(std::pair(entry.filesize, 1)); + sizes_map.insert(std::pair(entry.filesize, 1)); } else { // there is such size - uintmax_t occurences = sizes[iterator->first]; - sizes[iterator->first] = occurences++; + uintmax_t occurences = sizes_map[iterator->first]; + sizes_map[iterator->first] = occurences++; }; }; // go through the map again, look for uniques and remove entries with // such filesizes uintmax_t untracked = 0; - for (std::pair size_entry : sizes) { + for (std::pair size_entry : sizes_map) { if (size_entry.second > 1) { // not a unique size. Keep such entries } else { @@ -76,43 +110,87 @@ uintmax_t Broom::untrack_unique_sizes() { return untracked; }; -// get all entities from path recursively and track them -int Broom::track(std::filesystem::path dir) { - std::filesystem::directory_options options = ( - std::filesystem::directory_options::follow_directory_symlink | - std::filesystem::directory_options::skip_permission_denied - ); +// removes entries with unique first and last 20 bytes. Returns amount of +// files that are no longer being tracked +// uintmax_t Broom::untrack_unique_contents() { +// // contents, occurences +// std::map contents_map; +// std::map::iterator iterator; +// +// for (Entry& entry : m_tracked_entries) { +// // the same logic: +// // check if contents of this entry is already in the map +// // if yes --> increment occurences counter +// // if not --> add it to the map with a counter of 1 +// +// iterator = contents_map.find(entry.checksum); +// +// if (iterator == contents_map.end()) { +// // add it to the map +// contents_map.insert(std::pair(entry.checksum, 1)); +// } else { +// // increment occurences counter +// uintmax_t occurences = contents_map[iterator->first]; +// contents_map[iterator->first] = occurences++; +// }; +// }; +// +// uintmax_t untracked = 0; +// for (std::pair contents_entry : contents_map) { +// if (contents_entry.second > 1) { +// // not a unique size. Keep such entries +// } else { +// // a unique one. Untrack such an entry +// std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [contents_entry](Entry e) -> bool { +// return (e.compare_checksums(contents_entry.first)); +// }); +// untracked++; +// }; +// }; +// +// return untracked; +// }; + + +// find all duplicates among tracked entries, stop tracking uniques +int Broom::find_duplicates() { + if (m_benchmarking) { + auto t0 = std::chrono::high_resolution_clock::now(); - for (std::filesystem::directory_entry dir_entry : std::filesystem::recursive_directory_iterator(dir, options)) { - if (dir_entry.is_directory()) { - continue; - }; + untrack_unique_sizes(); - Entry entry(dir_entry.path()); - m_tracked_entries.push_back(entry); - }; + auto sizes_untrack_time = std::chrono::high_resolution_clock::now(); - return 0; -}; + std::cout + << "Untracking by size took " + << std::chrono::duration_cast(sizes_untrack_time - t0).count() + << " ms" << std::endl -// find all duplicates among tracked entries -int Broom::find_duplicates() { - size_t startsize = m_tracked_entries.size(); - std::cout << "Tracking " << startsize << std::endl; + << std::endl; + } else { + size_t startsize = m_tracked_entries.size(); + std::cout << "Tracking " << startsize << std::endl; - uintmax_t untracked = untrack_unique_sizes(); - std::cout << "Untracked " << untracked << " unique sizes" << std::endl; + uintmax_t global_untracked = 0; + + // uintmax_t untracked_by_contents = untrack_unique_contents(); + // global_untracked += untracked_by_contents; + // std::cout << "Untracked " << untracked_by_contents << " unique contents" << std::endl; + + + uintmax_t untracked_by_size = untrack_unique_sizes(); + global_untracked += untracked_by_size; + std::cout << "Untracked " << untracked_by_size << " unique sizes" << std::endl; + + + std::cout << "Duplicates: " << startsize - global_untracked << std::endl; + }; - std::cout << "Duplicates: " << startsize - untracked << std::endl; - return 0; -}; -// remove ALL duplicate files -int Broom::sweep_all(Entry entries[]) { return 0; }; -// remove ALL duplicates but the one with specified index -int Broom::sweep_all_but(Entry entries[], uint32_t index = 0) { +// remove ALL duplicate files +int Broom::sweep_all() { return 0; }; diff --git a/src/broom.hpp b/src/broom.hpp index a9ae923..851bea0 100644 --- a/src/broom.hpp +++ b/src/broom.hpp @@ -23,9 +23,20 @@ along with broom. If not, see . #include #include +// Broom`s settings +struct Options { + bool sweeping; + bool benchmarking; +}; + + // A class to find and manage duplicate files class Broom { protected: + // enable/disable benchmarking output + bool m_benchmarking; + bool m_sweeping; + // how many files has been (would be ?) "sweeped" uintmax_t m_sweeped_files; // how many bytes was (would be ?) freed @@ -34,7 +45,7 @@ protected: std::vector m_tracked_entries; public: - Broom(); + Broom(Options options); ~Broom(); // Print current statistics @@ -50,11 +61,12 @@ public: // that are no longer being tracked uintmax_t untrack_unique_sizes(); - // remove ALL duplicate files - int sweep_all(Entry entries[]); + // removes entries with unique first and last 20 bytes. Returns amount of + // files that are no longer being tracked + uintmax_t untrack_unique_contents(); - // remove ALL duplicates but the one with specified index - int sweep_all_but(Entry entries[], uint32_t index); + // remove ALL duplicate files + int sweep_all(); }; #endif diff --git a/src/entry.cpp b/src/entry.cpp index 08e1252..c37e286 100644 --- a/src/entry.cpp +++ b/src/entry.cpp @@ -51,20 +51,13 @@ Entry::Entry(std::filesystem::path path) { char end_buf[CHUNK_SIZE]; entry_file.read(end_buf, CHUNK_SIZE); - entry_file.seekg(CHUNK_SIZE, std::ios::beg); - char middle_buf[CHUNK_SIZE]; - entry_file.read(middle_buf, CHUNK_SIZE); - for (uint8_t i = 0; i < CHECKSUM_SIZE; i++) { if (i < CHUNK_SIZE) { checksum[i] = start_buf[i]; } - else if (i > CHUNK_SIZE*2) { - checksum[i] = middle_buf[i-(CHUNK_SIZE*2)]; - } else if (i > CHUNK_SIZE) { checksum[i] = end_buf[i - CHUNK_SIZE]; - } + }; }; }; @@ -75,7 +68,7 @@ Entry::~Entry() {}; // Compare this entry`s checksum with the other one. // If the checksums are the same -> returns true, else -> false -bool Entry::compare_checksums(char other_checksum[CHECKSUM_SIZE]) { +bool Entry::compare_checksums(const char other_checksum[CHECKSUM_SIZE]) { for (uint8_t i = 0; i < CHECKSUM_SIZE; i++) { if (checksum[i] != other_checksum[i]) { return false; diff --git a/src/entry.hpp b/src/entry.hpp index 56faf75..ffaef81 100644 --- a/src/entry.hpp +++ b/src/entry.hpp @@ -23,9 +23,9 @@ along with broom. If not, see . #include #include -// 3 chunks (beginning, end, middle of the file) +// 3 chunks (beginning and end of the file) const uint8_t CHUNK_SIZE = 24; -const uint8_t CHECKSUM_SIZE = CHUNK_SIZE * 3; +const uint8_t CHECKSUM_SIZE = CHUNK_SIZE * 2; // A wrapper for every file with all necessary information class Entry { @@ -41,7 +41,7 @@ public: // Compare this entry`s checksum with the other one. // If the checksums are the same -> returns true, else -> false - bool compare_checksums(char other_checksum[CHECKSUM_SIZE]); + bool compare_checksums(const char other_checksum[CHECKSUM_SIZE]); // Remove entry from the disk void remove(); diff --git a/src/main.cpp b/src/main.cpp index 6e1d8ae..d999e60 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -27,12 +27,6 @@ along with broom. If not, see . // Broom version number #define VERSION "v0.1.0" -// Broom`s settings -struct Options { - bool sweeping; - std::vector paths; -}; - void print_help() { std::cout << "broom [FLAGS..] [COMMAND] [FILES|DIRECTORIES...]" << std::endl << std::endl @@ -58,6 +52,7 @@ void print_version() { int main(int argc, char* argv[]) { Options options; + std::filesystem::path tracked_path; if (argc < 2) { print_help(); @@ -65,7 +60,7 @@ int main(int argc, char* argv[]) { }; // process command line arguments - for (unsigned int i = 0; i < argc; i++) { + for (unsigned int i = 1; i < argc; i++) { // flags -> command -> directories&&files if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { @@ -76,6 +71,9 @@ int main(int argc, char* argv[]) { print_version(); return 0; } + else if (strcmp(argv[i], "-b") == 0 || strcmp(argv[i], "--benchmark") == 0) { + options.benchmarking = true; + } else if (strcmp(argv[i], "sweep") == 0) { options.sweeping = true; } @@ -84,18 +82,22 @@ int main(int argc, char* argv[]) { } else { // add path - if (i == 0) { - continue; - } else { - options.paths.push_back(argv[i]); - } + if (std::filesystem::exists(argv[i])) { + tracked_path = argv[i]; + }; }; }; - Broom broom; + // no path was specified + if (tracked_path.string() == "") { + print_help(); + return 1; + }; + + + Broom broom(options); - std::filesystem::path first_path = options.paths.at(0); - broom.track(first_path); + broom.track(tracked_path); broom.find_duplicates(); return 0;