Browse Source

Grouping duplicates together; Significantly improved scan results file;

main
Unbewohnte 3 years ago
parent
commit
ad4b55fdce
  1. 49
      src/broom.cpp
  2. 11
      src/broom.hpp
  3. 9
      src/entry.cpp
  4. 5
      src/entry.hpp
  5. 13
      src/main.cpp

49
src/broom.cpp

@ -142,8 +142,8 @@ uintmax_t Broom::untrack_unique_contents(std::vector<entry::Entry>& tracked_entr
return untracked;
};
// creates a list of duplicate, empty files into a file
void Broom::create_scan_results_list(const std::vector<entry::Entry> tracked_entries, const std::filesystem::path dir, const std::string filename) {
// creates a list of duplicate, empty files and puts it into a file
void Broom::create_scan_results_list(const std::map<std::string, std::vector<entry::Entry>> grouped_duplicates, const std::filesystem::path dir, const std::string filename) {
if (!std::filesystem::exists(dir)) {
// create it then
bool created = std::filesystem::create_directories(dir);
@ -158,13 +158,21 @@ void Broom::create_scan_results_list(const std::vector<entry::Entry> tracked_ent
throw "Could not create a scan results file";
}
for (const entry::Entry entry : tracked_entries) {
// log every entry and its group
if (entry.group == group::EMPTY) {
outfile << "[EMPTY] " << entry.path << std::endl;
} else if (entry.group == group::DUPLICATE) {
outfile << "[DUPLICATE] " << entry.path << std::endl;
auto now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
outfile << ">> Broom scan results file from " << std::ctime(&now) << std::endl << std::endl << std::endl;
for (const auto record : grouped_duplicates) {
if (record.first == "") {
outfile << "[EMPTY FILES]" << std::endl;
} else {
outfile << "[DUPLICATE GROUP]" << std::endl;
}
for (const auto duplicate_entry : record.second) {
outfile << duplicate_entry.path << std::endl;
}
outfile << std::endl << std::endl;
}
outfile.close();
@ -218,4 +226,29 @@ void Broom::mark_as_duplicates(std::vector<entry::Entry>& tracked_entries) {
}
};
// Groups tracked entries that share identical pieces into duplicate groups.
// The map key is the hex-encoded pieces string of the group; the mapped value
// holds every entry whose contents produced that string (empty files all share
// the "" key). REMOVES EVERYTHING FROM GIVEN TRACKED ENTRIES: tracked_entries
// is cleared before returning, since the entries now live inside the groups.
std::map<std::string, std::vector<entry::Entry>> Broom::group_duplicates(std::vector<entry::Entry>& tracked_entries) {
    std::map<std::string, std::vector<entry::Entry>> duplicate_groups;

    for (const entry::Entry& entry : tracked_entries) {
        // operator[] default-constructs an empty occurrences vector the first
        // time these pieces are seen, then appends; this does a single map
        // lookup per entry instead of the find-then-index double lookup
        duplicate_groups[entry.pieces].push_back(entry);
    }

    // the entries have been copied into their groups; drop the originals
    tracked_entries.clear();

    return duplicate_groups;
};
}

11
src/broom.hpp

@ -22,6 +22,7 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <cstdint>
#include <vector>
#include <map>
namespace broom {
@ -50,11 +51,15 @@ public:
// REMOVES grouped empty files and untracks them after deletion. Returns the amount of removed empty files
uintmax_t remove_empty_files(std::vector<entry::Entry>& tracked_entries);
// creates a list of duplicate, empty files into a file
void create_scan_results_list(const std::vector<entry::Entry> tracked_entries, const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt");
// marks every entry without any group as a duplicate
void mark_as_duplicates(std::vector<entry::Entry>& tracked_entries);
// searches for entries with the same pieces in tracked entries and groups them together as a duplicate group, where the key is the
// string of pieces. REMOVES EVERYTHING FROM GIVEN TRACKED ENTRIES
std::map<std::string, std::vector<entry::Entry>> group_duplicates(std::vector<entry::Entry>& tracked_entries);
// creates a list of duplicate, empty files and puts it into a file
void create_scan_results_list(const std::map<std::string, std::vector<entry::Entry>> grouped_duplicates, const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt");
};
}

9
src/entry.cpp

@ -38,7 +38,8 @@ Entry::Entry(const std::filesystem::path entry_path) {
Entry::~Entry() {};
// reads 3 pieces from the beginning, middle and the end of a file, converts them into
// a convenient hex-encoded string
// a convenient hex-encoded string. If a file has a size of less than PIECE_SIZE * PIECES_AMOUNT ->
// constructs pieces from the whole file contents. If a file has no contents at all -> its pieces will be set to ""
void Entry::get_pieces() {
std::fstream entry_file;
entry_file.open(path);
@ -48,7 +49,11 @@ void Entry::get_pieces() {
}
char pieces_buffer[PIECE_SIZE * PIECES_AMOUNT];
if (filesize <= PIECE_SIZE * PIECES_AMOUNT) {
if (filesize == 0) {
// EMPTY file !
pieces = "";
return;
} else if (filesize <= PIECE_SIZE * PIECES_AMOUNT) {
// can`t take whole 3 pieces !
// read the whole file then
entry_file.read(pieces_buffer, filesize);

5
src/entry.hpp

@ -30,7 +30,7 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
namespace entry {
// 3 pieces (beginning, middle and end of the file)
const uint8_t PIECE_SIZE = 85;
const uint8_t PIECE_SIZE = 75;
const uint8_t PIECES_AMOUNT = 3;
// A wrapper for every file in filesystem with all necessary information
@ -45,7 +45,8 @@ public:
~Entry();
// reads 3 pieces from the beginning, middle and the end of a file, converts them into
// a convenient hex-encoded string
// a convenient hex-encoded string. If a file has a size of less than PIECE_SIZE * PIECES_AMOUNT ->
// constructs pieces from the whole file contents. If a file has no contents at all -> its pieces will be set to ""
void get_pieces();
// REMOVE entry from the disk

13
src/main.cpp

@ -176,14 +176,13 @@ int main(int argc, char* argv[]) {
std::cout << "[INFO] " << tracked_entries.size() << " files left being tracked" << std::endl;
if (tracked_entries.size() > 0) {
// now only files with a non-unique size and contents are being tracked
// are they REALLY duplicates ?
// better to leave the REALL cleanup for the user, saving these entries in a file, than doing a blind and possibly destructive purge
broom.create_scan_results_list(tracked_entries);
std::cout << "[INFO] Created scan results file" << std::endl;
}
auto grouped_duplicates = broom.group_duplicates(tracked_entries);
// now only files with a non-unique size and contents are being tracked
// are they REALLY duplicates ?
// better to leave the REAL cleanup for the user, saving these entries in a file, than doing a blind and possibly destructive purge
broom.create_scan_results_list(grouped_duplicates );
std::cout << "[INFO] Created scan results file" << std::endl;
} catch(const std::exception& e) {
std::cerr

Loading…
Cancel
Save