diff --git a/src/broom.cpp b/src/broom.cpp index c690c93..f9ce01d 100644 --- a/src/broom.cpp +++ b/src/broom.cpp @@ -32,28 +32,8 @@ Broom::Broom(Options options) { m_sweeping = options.sweeping; }; -Broom::~Broom() {}; - -// Print current statistics -void Broom::print_statistics() { - std::cout - << "| sweeped " << m_sweeped_files << " files" << std::endl - << "| with a total size of " << m_sweeped_size << " bytes" << std::endl - << std::endl; -}; - -// returns amount of regular files in path, searching recursively. -// Throws an invalid_argument error in case path does not exist -uintmax_t amount_of_files(const std::filesystem::path path) { - if (!std::filesystem::exists(path)) { - throw std::invalid_argument("\"" + path.string() + "\"" + " does not exist !"); - }; - - if (!std::filesystem::is_directory(path)) { - throw std::invalid_argument("\"" + path.string() + "\"" + " is not a directory"); - }; - - return std::distance(std::filesystem::directory_iterator(path), std::filesystem::directory_iterator{}); +Broom::~Broom() { + m_tracked_entries.clear(); }; // recursively track every file that lies in given path. Throws an invalid_argument @@ -90,7 +70,7 @@ void Broom::track(const std::filesystem::path path) { auto tracking_time = std::chrono::high_resolution_clock::now(); std::cout - << "Tracking took " + << "[BENCHMARK] Tracking took " << std::chrono::duration_cast(tracking_time - t0).count() << " ms" << std::endl; }; @@ -102,16 +82,16 @@ uintmax_t Broom::untrack_unique_sizes() { // key: size, value: amount of occurences std::map sizes_map; - for (Entry& entry : m_tracked_entries) { + for (auto entry_iter = m_tracked_entries.begin(); entry_iter != m_tracked_entries.end(); entry_iter++) { // check if size of this entry is already in the map // if yes --> increment occurences counter // if not --> add it to the map with a counter of 1 - entry.get_size(); + entry_iter->get_size(); - auto iterator = sizes_map.find(entry.filesize); + auto iterator = sizes_map.find(entry_iter->filesize); if (iterator == sizes_map.end()) { // there is no such size - sizes_map.insert({entry.filesize, 1}); + sizes_map.insert({entry_iter->filesize, 1}); } else { // there is such size sizes_map[iterator->first]++; @@ -130,85 +110,105 @@ uintmax_t Broom::untrack_unique_sizes() { // std::cout << "duplicate fsize: " << iter->first << " occurences: " << iter->second << std::endl; return false; - })); + }), m_tracked_entries.end()); + + // std::cout << "Size after untracking by size: " << m_tracked_entries.size() << std::endl; return untracked; }; -// removes entries with unique first and last 20 bytes. Returns amount of +// removes entries with the same content-pieces. Returns amount of // files that are no longer being tracked -// uintmax_t Broom::untrack_unique_contents() { -// // contents, occurences -// std::map contents_map; -// std::map::iterator iterator; -// -// for (Entry& entry : m_tracked_entries) { -// // the same logic: -// // check if contents of this entry is already in the map -// // if yes --> increment occurences counter -// // if not --> add it to the map with a counter of 1 -// -// iterator = contents_map.find(entry.checksum); -// -// if (iterator == contents_map.end()) { -// // add it to the map -// contents_map.insert(std::pair(entry.checksum, 1)); -// } else { -// // increment occurences counter -// uintmax_t occurences = contents_map[iterator->first]; -// contents_map[iterator->first] = occurences++; -// }; -// }; -// -// uintmax_t untracked = 0; -// for (std::pair contents_entry : contents_map) { -// if (contents_entry.second > 1) { -// // not a unique size. Keep such entries -// } else { -// // a unique one. Untrack such an entry -// std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [contents_entry](Entry e) -> bool { -// return (e.compare_checksums(contents_entry.first)); -// }); -// untracked++; -// }; -// }; -// -// return untracked; -// }; +uintmax_t Broom::untrack_unique_contents() { + // contents, occurences + std::map contents_map; + std::map::iterator map_iter; + + for (auto entry_iter = m_tracked_entries.begin(); entry_iter != m_tracked_entries.end();) { + // the same logic: + // check if contents of this entry is already in the map + // if yes --> increment occurences counter + // if not --> add it to the map with a counter of 1 + + try{ + // can get "permission denied" when opening file + entry_iter->get_pieces(); + } catch(const std::ifstream::failure& e) { + // there is nothing we can do. Untrack this entry + // std::cerr << e.what(); + entry_iter = m_tracked_entries.erase(entry_iter); + continue; + } + + map_iter = contents_map.find(entry_iter->pieces); + if (map_iter == contents_map.end()) { + // add it to the map + contents_map.insert({entry_iter->pieces, 1}); + // std::cout << "First time seeing this piece: " << entry_iter->pieces << std::endl; + } else { + // increment occurences counter + contents_map[map_iter->first]++; + }; + + entry_iter++; + }; + + uintmax_t untracked = 0; + m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](Entry entry) -> bool { + auto iter = contents_map.find(entry.pieces); + if (iter->second == 1) { + // unique + untracked++; + return true; + } else { + return false; + } + }), m_tracked_entries.end()); + + return untracked; +}; // find all duplicates among tracked entries, stop tracking uniques void Broom::find_duplicates() { - if (m_benchmarking) { - auto t0 = std::chrono::high_resolution_clock::now(); + auto t0 = std::chrono::high_resolution_clock::now(); + + // print how many files are being tracked + uintmax_t global_untracked = m_tracked_entries.size(); + std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl; - untrack_unique_sizes(); + // untrack by size + uintmax_t untracked_by_size = untrack_unique_sizes(); + global_untracked += untracked_by_size; + std::cout << "[INFO] Untracked " << untracked_by_size << " unique sizes" << std::endl; - auto sizes_untrack_time = std::chrono::high_resolution_clock::now(); + auto sizes_untrack_time = std::chrono::high_resolution_clock::now(); + if (m_benchmarking) { std::cout - << "Untracking by size took " + << "[BENCHMARK] Untracking by size took " << std::chrono::duration_cast(sizes_untrack_time - t0).count() << " ms" << std::endl; - } else { - size_t startsize = m_tracked_entries.size(); - std::cout << "Tracking " << startsize << std::endl; + }; - uintmax_t global_untracked = 0; + // untrack by contents + uintmax_t untracked_by_contents = untrack_unique_contents(); + global_untracked += untracked_by_contents; - // uintmax_t untracked_by_contents = untrack_unique_contents(); - // global_untracked += untracked_by_contents; - // std::cout << "Untracked " << untracked_by_contents << " unique contents" << std::endl; + auto contents_untrack_time = std::chrono::high_resolution_clock::now(); + if (m_benchmarking) { + std::cout + << "[BENCHMARK] Untracking by contents took " + << std::chrono::duration_cast(contents_untrack_time - sizes_untrack_time).count() + << " ms" << std::endl; + }; - uintmax_t untracked_by_size = untrack_unique_sizes(); - global_untracked += untracked_by_size; - std::cout << "Untracked " << untracked_by_size << " unique sizes" << std::endl; + std::cout << "[INFO] Untracked " << untracked_by_contents << " unique contents" << std::endl; + std::cout << "[INFO] Duplicates: " << m_tracked_entries.size() << std::endl; - std::cout << "Duplicates: " << startsize - global_untracked << std::endl; - }; }; // remove ALL duplicate files diff --git a/src/broom.hpp b/src/broom.hpp index 3797096..e315936 100644 --- a/src/broom.hpp +++ b/src/broom.hpp @@ -48,13 +48,6 @@ public: Broom(Options options); ~Broom(); - // Print current statistics - void print_statistics(); - - // returns amount of regular files in path, searching recursively. - // Throws an invalid_argument error in case path does not exist - uintmax_t amount_of_files(const std::filesystem::path path); - // recursively track every file that lies in given path. Throws an invalid_argument // error in case path does not exist void track(const std::filesystem::path path); @@ -66,7 +59,7 @@ public: // that are no longer being tracked uintmax_t untrack_unique_sizes(); - // removes entries with unique first and last 20 bytes. Returns amount of + // removes entries with the same content-pieces. Returns amount of // files that are no longer being tracked uintmax_t untrack_unique_contents(); diff --git a/src/entry.cpp b/src/entry.cpp index 76e55ad..68943fb 100644 --- a/src/entry.cpp +++ b/src/entry.cpp @@ -19,6 +19,11 @@ along with broom. If not, see . #include "entry.hpp" +#include +#include +#include + + // A wrapper for every file with all necessary information Entry::Entry(const std::filesystem::path entry_path) { // path @@ -29,54 +34,50 @@ Entry::~Entry() {}; // sets this entry`s filesize void Entry::get_size() { - // filesize filesize = std::filesystem::file_size(path); }; -// calculates and sets this entry`s checksum -void Entry::get_checksum() { - // checksum +// reads 2 pieces from the beginning and the end of a file, converts them into +// a convenient hex-encoded string +void Entry::get_pieces() { std::fstream entry_file; entry_file.open(path); if (!entry_file.is_open()) { - throw std::ifstream::failure("Could not open \"" + path.filename().string() + "\""); + throw std::ifstream::failure("Could not open \"" + path.string() + "\"; reason: " + std::string(std::strerror(errno)) + "\n"); } // TODO(Properly test it) - if (filesize <= CHECKSUM_SIZE) { - entry_file.read(checksum, CHECKSUM_SIZE); + char pieces_buffer[PIECE_SIZE * 2]; + if (filesize <= PIECE_SIZE * 2) { + // can`t take whole 2 pieces ! + // read the whole file then + entry_file.read(pieces_buffer, filesize); } else { - char start_buf[CHUNK_SIZE]; - entry_file.read(start_buf, CHUNK_SIZE); - - entry_file.seekg(CHUNK_SIZE, std::ios::end); - char end_buf[CHUNK_SIZE]; - entry_file.read(end_buf, CHUNK_SIZE); - - for (uint8_t i = 0; i < CHECKSUM_SIZE; i++) { - if (i < CHUNK_SIZE) { - checksum[i] = start_buf[i]; - } - else if (i > CHUNK_SIZE) { - checksum[i] = end_buf[i - CHUNK_SIZE]; - }; + // read CHUNK_SIZE bytes from the beginning of the file + char start_buf[PIECE_SIZE]; + entry_file.read(start_buf, PIECE_SIZE); + for (uint8_t i = 0; i < PIECE_SIZE; i++) { + pieces_buffer[i] = start_buf[i]; }; - }; + // jump to the last CHUNK_SIZE bytes of the file and read the as well + entry_file.seekg(PIECE_SIZE, std::ios::end); + char end_buf[PIECE_SIZE]; + entry_file.read(end_buf, PIECE_SIZE); + for (uint8_t i = PIECE_SIZE; i < PIECE_SIZE * 2; i++) { + pieces_buffer[i] = end_buf[i - PIECE_SIZE]; + }; + }; entry_file.close(); -}; - -// Compare this entry`s checksum with the other one. -// If the checksums are the same -> returns true, else -> false -bool Entry::compare_checksums(const char other_checksum[CHECKSUM_SIZE]) { - for (uint8_t i = 0; i < CHECKSUM_SIZE; i++) { - if (checksum[i] != other_checksum[i]) { - return false; - }; + // make a convenient hex string out of pure bytes + std::stringstream pieces_hex; + for (uint8_t i = 0; i < PIECE_SIZE * 2; i++) { + pieces_hex << std::hex << static_cast(pieces_buffer[i]); }; - return true; + + pieces = pieces_hex.str(); }; // Remove entry from the disk diff --git a/src/entry.hpp b/src/entry.hpp index 7b517a6..d73ac96 100644 --- a/src/entry.hpp +++ b/src/entry.hpp @@ -22,18 +22,18 @@ along with broom. If not, see . #include #include +#include +#include -// 2 chunks (beginning and end of the file) -const uint8_t CHUNK_SIZE = 24; -const uint8_t CHECKSUM_SIZE = CHUNK_SIZE * 2; +// 2 pieces (beginning and end of the file) +const uint8_t PIECE_SIZE = 24; // A wrapper for every file with all necessary information class Entry { public: std::filesystem::path path; uintmax_t filesize; - char checksum[CHECKSUM_SIZE]; - + std::string pieces; // 2 hex-represented pieces of file (beginning and end) Entry(const std::filesystem::path entry_path); ~Entry(); @@ -41,12 +41,9 @@ public: // sets this entry`s filesize void get_size(); - // calculates and sets this entry`s checksum - void get_checksum(); - - // Compare this entry`s checksum with the other one. - // If the checksums are the same -> returns true, else -> false - bool compare_checksums(const char other_checksum[CHECKSUM_SIZE]); + // reads 2 pieces from the beginning and the end of a file, converts them into + // a convenient hex-encoded string + void get_pieces(); // REMOVE entry from the disk void remove();