diff --git a/build/CMakeLists.txt b/build/CMakeLists.txt index 2e00b15..66ac265 100644 --- a/build/CMakeLists.txt +++ b/build/CMakeLists.txt @@ -13,4 +13,4 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -Werror -O2") set(EXECUTABLE_OUTPUT_PATH ../bin) -add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp) +add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp ../src/group.hpp) diff --git a/src/broom.cpp b/src/broom.cpp index 64933d6..9338002 100644 --- a/src/broom.cpp +++ b/src/broom.cpp @@ -26,6 +26,9 @@ along with broom. If not, see . #include "entry.hpp" #include "broom.hpp" +#include "group.hpp" + +namespace broom { Broom::Broom(Options options) { m_benchmarking = options.benchmarking; @@ -58,12 +61,12 @@ void Broom::track(const std::filesystem::path path) { continue; }; - Entry entry(dir_entry.path()); + entry::Entry entry(dir_entry.path()); m_tracked_entries.push_back(entry); } } else if (std::filesystem::is_regular_file(path)) { // just a file - Entry entry(path); + entry::Entry entry(path); m_tracked_entries.push_back(entry); } @@ -76,11 +79,13 @@ void Broom::track(const std::filesystem::path path) { << std::chrono::duration_cast(tracking_time - t0).count() << " ms" << std::endl; } + + std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl; }; // removes entries with unique file sizes. Returns amount of files // that are no longer being tracked -uintmax_t Broom::untrack_unique_sizes() { +uintmax_t Broom::m_untrack_unique_sizes() { // key: size, value: amount of occurences std::map sizes_map; @@ -88,8 +93,6 @@ uintmax_t Broom::untrack_unique_sizes() { // check if size of this entry is already in the map // if yes --> increment occurences counter // if not --> add it to the map with a counter of 1 - entry_iter->get_size(); - auto iterator = sizes_map.find(entry_iter->filesize); if (iterator == sizes_map.end()) { // there is no such size @@ -101,7 +104,7 @@ uintmax_t Broom::untrack_unique_sizes() { } uintmax_t untracked = 0; - m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, sizes_map](Entry entry) -> bool{ + m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, sizes_map](entry::Entry entry) -> bool{ auto iter = sizes_map.find(entry.filesize); if (iter->second == 1) { // unique @@ -109,20 +112,15 @@ uintmax_t Broom::untrack_unique_sizes() { return true; }; - // std::cout << "duplicate fsize: " << iter->first << " occurences: " << iter->second << std::endl; - return false; }), m_tracked_entries.end()); - // std::cout << "Size after untracking by size: " << m_tracked_entries.size() << std::endl; - - return untracked; }; // removes entries with the same content-pieces. Returns amount of // files that are no longer being tracked -uintmax_t Broom::untrack_unique_contents() { +uintmax_t Broom::m_untrack_unique_contents() { // contents, occurences std::map contents_map; std::map::iterator map_iter; @@ -133,12 +131,17 @@ uintmax_t Broom::untrack_unique_contents() { // if yes --> increment occurences counter // if not --> add it to the map with a counter of 1 + if (entry_iter->filesize == 0) { + // that`s an empty file. Skip it + entry_iter++; + continue; + } + try{ // can get "permission denied" when opening file entry_iter->get_pieces(); } catch(const std::ifstream::failure& e) { // there is nothing we can do. Untrack this entry - // std::cerr << e.what(); entry_iter = m_tracked_entries.erase(entry_iter); continue; } @@ -157,7 +160,7 @@ uintmax_t Broom::untrack_unique_contents() { }; uintmax_t untracked = 0; - m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](Entry entry) -> bool { + m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](entry::Entry entry) -> bool { auto iter = contents_map.find(entry.pieces); if (iter->second == 1) { // unique @@ -171,17 +174,16 @@ uintmax_t Broom::untrack_unique_contents() { return untracked; }; - -// find all duplicates among tracked entries, stop tracking uniques -void Broom::find_duplicates() { +// finds all duplicates among tracked entries and marks them with appropriate group. +// Returns amount of duplicate files +uintmax_t Broom::m_find_duplicates() { auto t0 = std::chrono::high_resolution_clock::now(); // print how many files are being tracked uintmax_t global_untracked = m_tracked_entries.size(); - std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl; // untrack by size - uintmax_t untracked_by_size = untrack_unique_sizes(); + uintmax_t untracked_by_size = m_untrack_unique_sizes(); global_untracked += untracked_by_size; std::cout << "[INFO] Untracked " << untracked_by_size << " unique sizes" << std::endl; @@ -195,7 +197,7 @@ void Broom::find_duplicates() { } // untrack by contents - uintmax_t untracked_by_contents = untrack_unique_contents(); + uintmax_t untracked_by_contents = m_untrack_unique_contents(); global_untracked += untracked_by_contents; auto contents_untrack_time = std::chrono::high_resolution_clock::now(); @@ -209,37 +211,72 @@ void Broom::find_duplicates() { std::cout << "[INFO] Untracked " << untracked_by_contents << " unique contents" << std::endl; - std::cout << "[INFO] Duplicates: " << m_tracked_entries.size() << std::endl; + std::cout << "[INFO] Found " << m_tracked_entries.size() << " possible duplicate files" << std::endl; + + // mark duplicate entries - create_duplicates_list(); + for (entry::Entry& duplicate_entry : m_tracked_entries) { + duplicate_entry.group = group::DUPLICATE; + } - std::cout << "[INFO] Created a duplicates list" << std::endl; + return m_tracked_entries.size(); }; -// saves current list of duplicate file paths into a file in dir -void Broom::create_duplicates_list(const std::filesystem::path dir, const std::string filename) { +// creates a list of duplicate, empty files into a file +void Broom::create_scan_results_list(const std::filesystem::path dir, const std::string filename) { if (!std::filesystem::exists(dir)) { // create it then bool created = std::filesystem::create_directories(dir); if (!created) { - throw "Could not create a directory"; + throw "Could not create a directory to save scan results in"; } } // create output file there std::fstream outfile(dir / filename, std::ios::out); if (!outfile.is_open()) { - throw "Could not create an output file"; + throw "Could not create a scan results file"; } - for (const Entry duplicate_entry : m_tracked_entries) { - // log every duplicate entry - outfile << duplicate_entry.path << std::endl; + for (const entry::Entry entry : m_tracked_entries) { + // log every entry and its group + if (entry.group == group::EMPTY) { + outfile << entry.path << " --- is an empty file" << std::endl; + } else if (entry.group == group::DUPLICATE) { + outfile << entry.path << " --- is a duplicate of another file" << std::endl; + } } outfile.close(); + + std::cout << "[INFO] Created scan results file" << std::endl; +}; + +// finds empty files among tracked entries and gives them appropriate group +// Returns amount of found empty files +uintmax_t Broom::m_find_empty_files() { + uintmax_t found_empty_files = 0; + for (entry::Entry& entry : m_tracked_entries) { + if (entry.filesize == 0) { + entry.group = group::EMPTY; + found_empty_files++; + } + } + + std::cout << "[INFO] Found " << found_empty_files << " empty files" << std::endl; + + return found_empty_files; +}; + +// scans directory for duplicates and empty files +void Broom::scan() { + m_find_empty_files(); + m_find_duplicates(); }; // remove ALL duplicate files -void Broom::sweep_all() { +void Broom::sweep() { }; + + +} diff --git a/src/broom.hpp b/src/broom.hpp index b83cccb..1f80d1f 100644 --- a/src/broom.hpp +++ b/src/broom.hpp @@ -23,6 +23,7 @@ along with broom. If not, see . #include #include +namespace broom { // Broom`s settings struct Options { bool sweeping; @@ -38,12 +39,24 @@ protected: // TODO(think about how sweeping should work) bool m_sweeping; - // how many files has been (would be ?) "sweeped" - uintmax_t m_sweeped_files; - // how many bytes was (would be ?) freed - uintmax_t m_sweeped_size; // paths to tracked files - std::vector m_tracked_entries; + std::vector m_tracked_entries; + + // finds empty files among tracked entries. + // Returns amount of found empty files + uintmax_t m_find_empty_files(); + + // removes entries with unique file sizes. Returns amount of files + // that are no longer being tracked + uintmax_t m_untrack_unique_sizes(); + + // removes entries with the same content-pieces. Returns amount of + // files that are no longer being tracked + uintmax_t m_untrack_unique_contents(); + + // finds all duplicates among tracked entries and marks them with appropriate group + // Returns amount of duplicate files + uintmax_t m_find_duplicates(); public: Broom(Options options); @@ -53,22 +66,17 @@ public: // error in case path does not exist void track(const std::filesystem::path path); - // find all duplicates in the directory - void find_duplicates(); + // creates a list of duplicate, empty files into a file + void create_scan_results_list(const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt"); - // removes entries with unique file sizes. Returns amount of files - // that are no longer being tracked - uintmax_t untrack_unique_sizes(); + // TODO + void sweep(); - // removes entries with the same content-pieces. Returns amount of - // files that are no longer being tracked - uintmax_t untrack_unique_contents(); + // scans tracked entries for duplicates and empty files + void scan(); +}; - // saves current list of duplicate file paths into a file - void create_duplicates_list(const std::filesystem::path dir = ".", const std::string filename = "duplicate_files_list.txt"); +} - // TODO - void sweep_all(); -}; #endif diff --git a/src/entry.cpp b/src/entry.cpp index 034ceb7..a9bd54f 100644 --- a/src/entry.cpp +++ b/src/entry.cpp @@ -24,21 +24,20 @@ along with broom. If not, see . #include -// A wrapper for every file with all necessary information +namespace entry { + +// A wrapper for every file in filesystem with all necessary information Entry::Entry(const std::filesystem::path entry_path) { // path path = entry_path; -}; - -Entry::~Entry() {}; -// sets this entry`s filesize -void Entry::get_size() { + // filesize filesize = std::filesystem::file_size(path); }; +Entry::~Entry() {}; -// reads 2 pieces from the middle and the end of a file, converts them into +// reads 3 pieces from the beginning, middle and the end of a file, converts them into // a convenient hex-encoded string void Entry::get_pieces() { std::fstream entry_file; @@ -48,28 +47,35 @@ void Entry::get_pieces() { throw std::ifstream::failure("Could not open \"" + path.string() + "\"; reason: " + std::string(std::strerror(errno)) + "\n"); } - char pieces_buffer[PIECE_SIZE * 2]; - if (filesize <= PIECE_SIZE * 2) { - // can`t take whole 2 pieces ! + char pieces_buffer[PIECE_SIZE * PIECES_AMOUNT]; + if (filesize <= PIECE_SIZE * PIECES_AMOUNT) { + // can`t take whole 3 pieces ! // read the whole file then entry_file.read(pieces_buffer, filesize); } else { + // read chunk from the beginning + char begin_buf[PIECE_SIZE]; + entry_file.read(begin_buf, PIECE_SIZE); + for (uint8_t i = 0; i < PIECE_SIZE; i++) { + pieces_buffer[i] = begin_buf[i]; + } + uintmax_t middle_of_the_file = (double) filesize / 2.0 - PIECE_SIZE; entry_file.seekg(middle_of_the_file, std::ios::beg); // read CHUNK_SIZE bytes from the middle of the file char middle_buf[PIECE_SIZE]; entry_file.read(middle_buf, PIECE_SIZE); - for (uint8_t i = 0; i < PIECE_SIZE; i++) { - pieces_buffer[i] = middle_buf[i]; + for (uint8_t i = PIECE_SIZE; i < PIECE_SIZE * 2; i++) { + pieces_buffer[i] = middle_buf[i - PIECE_SIZE]; }; // jump to the last CHUNK_SIZE bytes of the file and read the as well entry_file.seekg(PIECE_SIZE, std::ios::end); char end_buf[PIECE_SIZE]; entry_file.read(end_buf, PIECE_SIZE); - for (uint8_t i = PIECE_SIZE; i < PIECE_SIZE * 2; i++) { - pieces_buffer[i] = end_buf[i - PIECE_SIZE]; + for (uint8_t i = PIECE_SIZE * 2; i < PIECE_SIZE * 3; i++) { + pieces_buffer[i] = end_buf[i - PIECE_SIZE * 2]; }; }; entry_file.close(); @@ -81,11 +87,12 @@ void Entry::get_pieces() { }; pieces = pieces_hex.str(); - - std::cout << pieces << std::endl; }; // Remove entry from the disk void Entry::remove() { std::filesystem::remove(path); }; + +} + diff --git a/src/entry.hpp b/src/entry.hpp index 4ee69fe..9722be5 100644 --- a/src/entry.hpp +++ b/src/entry.hpp @@ -25,23 +25,26 @@ along with broom. If not, see . #include #include -// 2 pieces (middle and end of the file) -const uint8_t PIECE_SIZE = 16; +#include "group.hpp" -// A wrapper for every file with all necessary information + +namespace entry { +// 3 pieces (beginning, middle and end of the file) +const uint8_t PIECE_SIZE = 6; +const uint8_t PIECES_AMOUNT = 3; + +// A wrapper for every file in filesystem with all necessary information class Entry { public: - std::filesystem::path path; - uintmax_t filesize; - std::string pieces; // 2 hex-represented pieces of file + std::filesystem::path path; // set via constructor + uintmax_t filesize; // set via constructor + std::string pieces; // 3 hex-represented pieces of file; set only via a method call to not stress the disk + group::Group group; // set externally Entry(const std::filesystem::path entry_path); ~Entry(); - // sets this entry`s filesize - void get_size(); - - // reads 2 pieces from the middle and the end of a file, converts them into + // reads 3 pieces from the beginning, middle and the end of a file, converts them into // a convenient hex-encoded string void get_pieces(); @@ -49,5 +52,8 @@ public: void remove(); }; +} + + #endif diff --git a/src/group.hpp b/src/group.hpp new file mode 100644 index 0000000..f9a12ee --- /dev/null +++ b/src/group.hpp @@ -0,0 +1,13 @@ +#ifndef GROUP_HPP +#define GROUP_HPP + +namespace group { + +enum Group { + DUPLICATE, + EMPTY, +}; + +} + +#endif diff --git a/src/main.cpp b/src/main.cpp index 429b2ec..2c4d5bb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -26,15 +26,16 @@ along with broom. If not, see . #include "broom.hpp" // Broom version number -#define VERSION "v0.1.0" +#define VERSION "v0.1.1" void print_help() { std::cout - << "broom [FLAGS..] [COMMAND] [FILES|DIRECTORIES...]" << std::endl << std::endl - << "FLAGS" << std::endl + << "broom [FLAGS..] [COMMAND] [DIRECTORY]" << std::endl << std::endl + << "[FLAGS]" << std::endl << "-v | --version -> print version information and exit" << std::endl << "-h | --help -> print this message and exit" << std::endl << std::endl - << "COMMANDS" << std::endl + + << "[COMMANDS]" << std::endl << "sweep -> scan for duplicate files and delete (sweep) all of them but the last one" << std::endl << "scan -> scan for duplicate files and output information in a file" << std::endl << std::endl; @@ -43,7 +44,8 @@ void print_help() { void print_version() { std::cout << "broom " << VERSION << std::endl - << "a command line utility to locate and manage duplicate files" << std::endl << std::endl + << "incurable hoarder`s helpful friend" << std::endl << std::endl + << "Copyright (C) 2021 Kasyanov Nikolay Alexeevich (Unbewohnte (me@unbewohnte.xyz))" << std::endl << "This program comes with ABSOLUTELY NO WARRANTY." << std::endl << "This is free software, and you are welcome to redistribute it" << std::endl @@ -52,7 +54,7 @@ void print_version() { }; int main(int argc, char* argv[]) { - Options options; + broom::Options options; std::filesystem::path tracked_path; if (argc < 2) { @@ -62,7 +64,7 @@ int main(int argc, char* argv[]) { // process command line arguments for (unsigned int i = 1; i < argc; i++) { - // flags -> command -> directories&&files + // flags -> command -> directory if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { print_help(); @@ -83,7 +85,7 @@ int main(int argc, char* argv[]) { } else { // add path - tracked_path = argv[i]; + tracked_path = std::filesystem::path(argv[i]); }; }; @@ -94,10 +96,11 @@ int main(int argc, char* argv[]) { }; - Broom broom(options); + broom::Broom broom(options); try { broom.track(tracked_path); - broom.find_duplicates(); + broom.scan(); + broom.create_scan_results_list(); } catch(const std::invalid_argument& e) { std::cerr << "[ERROR] Invalid argument: " << std::endl