From ad4b55fdce2358122c4a0bb9c1935f6af1cc39da Mon Sep 17 00:00:00 2001 From: Unbewohnte Date: Mon, 17 Jan 2022 17:58:34 +0300 Subject: [PATCH] Grouping duplicates together; Significantly improved scan results file; --- src/broom.cpp | 49 +++++++++++++++++++++++++++++++++++++++++-------- src/broom.hpp | 11 ++++++++--- src/entry.cpp | 9 +++++++-- src/entry.hpp | 5 +++-- src/main.cpp | 15 +++++++-------- 5 files changed, 66 insertions(+), 23 deletions(-) diff --git a/src/broom.cpp b/src/broom.cpp index 70c94a5..717203e 100755 --- a/src/broom.cpp +++ b/src/broom.cpp @@ -142,8 +142,8 @@ uintmax_t Broom::untrack_unique_contents(std::vector& tracked_entr return untracked; }; -// creates a list of duplicate, empty files into a file -void Broom::create_scan_results_list(const std::vector tracked_entries, const std::filesystem::path dir, const std::string filename) { +// creates a list of duplicate, empty files and puts it into a file +void Broom::create_scan_results_list(const std::map> grouped_duplicates, const std::filesystem::path dir, const std::string filename) { if (!std::filesystem::exists(dir)) { // create it then bool created = std::filesystem::create_directories(dir); @@ -158,13 +158,21 @@ void Broom::create_scan_results_list(const std::vector tracked_ent throw "Could not create a scan results file"; } - for (const entry::Entry entry : tracked_entries) { - // log every entry and its group - if (entry.group == group::EMPTY) { - outfile << "[EMPTY] " << entry.path << std::endl; - } else if (entry.group == group::DUPLICATE) { - outfile << "[DUPLICATE] " << entry.path << std::endl; + auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + outfile << ">> Broom scan results file from " << std::ctime(&now) << std::endl << std::endl << std::endl; + + for (const auto record : grouped_duplicates) { + if (record.first == "") { + outfile << "[EMPTY FILES]" << std::endl; + } else { + outfile << "[DUPLICATE GROUP]" << std::endl; } + + for (const auto duplicate_entry : record.second) { + outfile << duplicate_entry.path << std::endl; + } + + outfile << std::endl << std::endl; } outfile.close(); @@ -218,4 +226,29 @@ void Broom::mark_as_duplicates(std::vector& tracked_entries) { } }; + +// searches for entries with the same pieces in tracked entries and groups them together as a duplicate group, where the key is the +// string of pieces. REMOVES EVERYTHING FROM GIVEN TRACKED ENTRIES +std::map> Broom::group_duplicates(std::vector& tracked_entries) { + std::map> duplicate_groups; + + for (auto iter = tracked_entries.begin(); iter != tracked_entries.end(); iter++) { + auto map_iter = duplicate_groups.find(iter->pieces); + if (map_iter == duplicate_groups.end()) { + // first time seeing these pieces + std::vector occurences; + occurences.push_back(*iter); + duplicate_groups.insert({iter->pieces, occurences}); + } else { + // add to occurrences this entry + duplicate_groups[map_iter->first].push_back(*iter); + } + }; + + // clear the vector + tracked_entries.clear(); + + return duplicate_groups; +}; + } diff --git a/src/broom.hpp b/src/broom.hpp index 890d61d..313ed04 100755 --- a/src/broom.hpp +++ b/src/broom.hpp @@ -22,6 +22,7 @@ along with broom. If not, see . #include #include +#include namespace broom { @@ -50,11 +51,15 @@ public: // REMOVES grouped empty files and untracks them after deletion. Returns the amount of removed empty files uintmax_t remove_empty_files(std::vector& tracked_entries); - // creates a list of duplicate, empty files into a file - void create_scan_results_list(const std::vector tracked_entries, const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt"); - // marks every entry without any group as a duplicate void mark_as_duplicates(std::vector& tracked_entries); + + // searches for entries with the same pieces in tracked entries and groups them together as a duplicate group, where the key is the + // string of pieces. REMOVES EVERYTHING FROM GIVEN TRACKED ENTRIES + std::map> group_duplicates(std::vector& tracked_entries); + + // creates a list of duplicate, empty files and puts it into a file + void create_scan_results_list(const std::map> grouped_duplicates, const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt"); }; } diff --git a/src/entry.cpp b/src/entry.cpp index 2b45c6a..31d176a 100755 --- a/src/entry.cpp +++ b/src/entry.cpp @@ -38,7 +38,8 @@ Entry::Entry(const std::filesystem::path entry_path) { Entry::~Entry() {}; // reads 3 pieces from the beginning, middle and the end of a file, converts them into -// a convenient hex-encoded string +// a convenient hex-encoded string. If a file has a size of less than PIECE_SIZE * PIECES_AMOUNT -> +// constructs pieces from the whole file contents. If a file has no contents at all -> its pieces will be set to "" void Entry::get_pieces() { std::fstream entry_file; entry_file.open(path); @@ -48,7 +49,11 @@ void Entry::get_pieces() { } char pieces_buffer[PIECE_SIZE * PIECES_AMOUNT]; - if (filesize <= PIECE_SIZE * PIECES_AMOUNT) { + if (filesize == 0) { + // EMPTY file ! + pieces = ""; + return; + } else if (filesize <= PIECE_SIZE * PIECES_AMOUNT) { // can`t take whole 3 pieces ! // read the whole file then entry_file.read(pieces_buffer, filesize); diff --git a/src/entry.hpp b/src/entry.hpp index 91bc432..a1ef9db 100755 --- a/src/entry.hpp +++ b/src/entry.hpp @@ -30,7 +30,7 @@ along with broom. If not, see . namespace entry { // 3 pieces (beginning, middle and end of the file) -const uint8_t PIECE_SIZE = 85; +const uint8_t PIECE_SIZE = 75; const uint8_t PIECES_AMOUNT = 3; // A wrapper for every file in filesystem with all necessary information @@ -45,7 +45,8 @@ public: ~Entry(); // reads 3 pieces from the beginning, middle and the end of a file, converts them into - // a convenient hex-encoded string + // a convenient hex-encoded string. If a file has a size of less than PIECE_SIZE * PIECES_AMOUNT -> + // constructs pieces from the whole file contents. If a file has no contents at all -> its pieces will be set to "" void get_pieces(); // REMOVE entry from the disk diff --git a/src/main.cpp b/src/main.cpp index 614163f..d102d19 100755 --- a/src/main.cpp +++ b/src/main.cpp @@ -166,7 +166,7 @@ int main(int argc, char* argv[]) { } }), tracked_entries.end()); - + // untrack unique contents untracked = broom.untrack_unique_contents(tracked_entries); std::cout << "[INFO] Untracked " << untracked << " files with unique contents" << std::endl; @@ -176,14 +176,13 @@ int main(int argc, char* argv[]) { std::cout << "[INFO] " << tracked_entries.size() << " files left being tracked" << std::endl; - if (tracked_entries.size() > 0) { - // now only files with a non-unique size and contents are being tracked - // are they REALLY duplicates ? - // better to leave the REALL cleanup for the user, saving these entries in a file, than doing a blind and possibly destructive purge - broom.create_scan_results_list(tracked_entries); - std::cout << "[INFO] Created scan results file" << std::endl; - } + auto grouped_duplicates = broom.group_duplicates(tracked_entries); + // now only files with a non-unique size and contents are being tracked + // are they REALLY duplicates ? + // better to leave the REALL cleanup for the user, saving these entries in a file, than doing a blind and possibly destructive purge + broom.create_scan_results_list(grouped_duplicates ); + std::cout << "[INFO] Created scan results file" << std::endl; } catch(const std::exception& e) { std::cerr