diff --git a/README.md b/README.md index 3e4e51d..6bbbec6 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ broom [FLAGS..] [COMMAND] [DIRECTORY] [COMMANDS] -- `sweep` -> scan for duplicate files, save results in a file and REMOVE empty files +- `sweep` -> scan for duplicate files, REMOVE empty files and REPLACE other duplicates with symlinks - `scan` -> scan and save results in a file without removing anything [DEFAULT] @@ -66,9 +66,9 @@ broom [FLAGS..] [COMMAND] [DIRECTORY] ### Examples - `broom scan -od . ~/homework` -- `broom sweep ~/homework/I/have/a/lot/of/empty/files/here/for/some/reason` +- `broom sweep ~/homework` -after the scan the results file will be saved in your current working directory, scan results file contains +after the scan the results file will be saved in your current working directory, unless you specified it to be somewhere else. Scan results file contains a list of duplicate files that are grouped together so you can see EXACTLY WHERE each duplicate is in the filesystem. --- @@ -80,4 +80,5 @@ GPLv3 ## TODO - Make it go `P` A `R` A `L` L `E` L -- Output approximate size that could be freed \ No newline at end of file +- ~~Output approximate size that could be freed~~ +- ~~Remove duplicates and create symlinks~~ \ No newline at end of file diff --git a/build/CMakeLists.txt b/build/CMakeLists.txt index 52d0e11..914bc32 100755 --- a/build/CMakeLists.txt +++ b/build/CMakeLists.txt @@ -22,5 +22,5 @@ endif() set(EXECUTABLE_OUTPUT_PATH ../bin) -add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp ../src/group.hpp) +add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp) target_link_libraries(broom Threads::Threads) diff --git a/src/broom.cpp b/src/broom.cpp index 23a7eb3..dae6f4d 100755 --- a/src/broom.cpp +++ b/src/broom.cpp @@ -28,7 +28,6 @@ along with broom. If not, see . #include "entry.hpp" #include "broom.hpp" -#include "group.hpp" namespace broom { @@ -52,7 +51,7 @@ std::vector Broom::track(const std::filesystem::path path) { ); for (auto dir_entry : std::filesystem::recursive_directory_iterator(path, options)) { - if (!dir_entry.is_regular_file()) { + if (!dir_entry.is_regular_file() || std::filesystem::is_symlink(dir_entry.path())) { // skip everything that we cannot process so easily continue; }; @@ -60,7 +59,7 @@ std::vector Broom::track(const std::filesystem::path path) { entry::Entry entry(dir_entry.path()); tracked_entries.push_back(entry); } - } else if (std::filesystem::is_regular_file(path)) { + } else if (std::filesystem::is_regular_file(path) && !std::filesystem::is_symlink(path)) { // just a file entry::Entry entry(path); tracked_entries.push_back(entry); @@ -186,7 +185,7 @@ uintmax_t Broom::find_empty_files(std::vector& tracked_entries) { for (entry::Entry& entry : tracked_entries) { if (entry.filesize == 0) { // empty files can`t be considered as duplicates. assign a group - entry.group = group::EMPTY; + entry.group = entry::EMPTY; found_empty_files++; } } @@ -199,7 +198,7 @@ uintmax_t Broom::remove_empty_files(std::vector& tracked_entries) uintmax_t removed = 0; tracked_entries.erase(std::remove_if(tracked_entries.begin(), tracked_entries.end(), [&removed](entry::Entry& entry) -> bool { - if (entry.group == group::EMPTY) { + if (entry.group == entry::EMPTY) { try { entry.remove(); removed++; @@ -219,11 +218,11 @@ uintmax_t Broom::remove_empty_files(std::vector& tracked_entries) // marks every entry without any group as a duplicate void Broom::mark_as_duplicates(std::vector& tracked_entries) { for (entry::Entry& entry : tracked_entries) { - if (entry.group == group::EMPTY) { + if (entry.group == entry::EMPTY) { // do not mess up grouping continue; } - entry.group = group::DUPLICATE; + entry.group = entry::DUPLICATE; } }; @@ -252,4 +251,33 @@ std::map> Broom::group_duplicates(std::ve return duplicate_groups; }; +// REMOVES every duplicate file in a group except the first one and creates symlinks pointing to the +// first remaining real file +void Broom::remove_duplicates_make_symlinks(const std::map> grouped_duplicates) { + for (const auto& record : grouped_duplicates) { + unsigned int i = 0; + std::filesystem::path original_file_path; + + for (const auto& duplicate_entry : record.second) { + if (i == 0) { + // the first duplicate in the group. Save it + original_file_path = duplicate_entry.path; + } else { + // not the first entry; REMOVE it and create a symlink, + // pointing to the real file + std::filesystem::path removed_duplicate_path = duplicate_entry.path; + try { + // remove the entry + duplicate_entry.remove(); + // make a symlink + std::filesystem::create_symlink(original_file_path, removed_duplicate_path); + } catch(...) {} + } + + // serves only the first iteration. It doesn`t matter if it is not incremented after that + i++; + } + } +}; + } diff --git a/src/broom.hpp b/src/broom.hpp index 89ef2d2..023ed25 100755 --- a/src/broom.hpp +++ b/src/broom.hpp @@ -61,6 +61,10 @@ public: // string of pieces. REMOVES EVERYTHING FROM GIVEN TRACKED ENTRIES std::map> group_duplicates(std::vector& tracked_entries); + // REMOVES every duplicate file in a group except the first one and creates symlinks pointing to the + // first remaining real file + void remove_duplicates_make_symlinks(const std::map> grouped_duplicates); + // creates a list of duplicate, empty files and puts it into a file void create_scan_results_list(const std::map> grouped_duplicates, const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt"); }; diff --git a/src/entry.cpp b/src/entry.cpp index ad194dd..2e11dfe 100755 --- a/src/entry.cpp +++ b/src/entry.cpp @@ -95,7 +95,7 @@ void Entry::get_pieces() { }; // Remove entry from the disk -void Entry::remove() { +void Entry::remove() const { std::filesystem::remove(path); }; diff --git a/src/entry.hpp b/src/entry.hpp index 211475e..489f144 100755 --- a/src/entry.hpp +++ b/src/entry.hpp @@ -26,10 +26,14 @@ along with broom. If not, see . #include #include -#include "group.hpp" - namespace entry { + +enum Group { + DUPLICATE, + EMPTY, +}; + // 3 pieces (beginning, middle and end of the file) const uint8_t PIECE_SIZE = 75; const uint8_t PIECES_AMOUNT = 3; @@ -40,7 +44,7 @@ public: std::filesystem::path path; // set via constructor uintmax_t filesize; // set via constructor std::string pieces; // 3 hex-represented pieces of file; set only via a method call to not stress the disk - group::Group group; // set externally + Group group; // set externally Entry(const std::filesystem::path entry_path); ~Entry(); @@ -51,7 +55,7 @@ public: void get_pieces(); // REMOVE entry from the disk - void remove(); + void remove() const; }; } diff --git a/src/group.hpp b/src/group.hpp deleted file mode 100755 index 956169d..0000000 --- a/src/group.hpp +++ /dev/null @@ -1,33 +0,0 @@ -/* -Copyright (C) 2021 Kasyanov Nikolay Alexeevich (Unbewohnte (me@unbewohnte.xyz)) - -This file is part of broom. - -broom is free software: you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation, either version 3 of the License, or -(at your option) any later version. - -broom is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with broom. If not, see . -*/ - - -#ifndef GROUP_HPP -#define GROUP_HPP - -namespace group { - -enum Group { - DUPLICATE, - EMPTY, -}; - -} - -#endif diff --git a/src/main.cpp b/src/main.cpp index 466c697..5df9c63 100755 --- a/src/main.cpp +++ b/src/main.cpp @@ -28,7 +28,7 @@ along with broom. If not, see . #include "broom.hpp" // Broom version number -#define VERSION "v0.2.3" +#define VERSION "v0.3.0" void print_help() { std::cout @@ -39,7 +39,7 @@ void print_help() { << "-od | --output-directory -> path to the directory to save results file in" << std::endl << std::endl << "[COMMANDS]" << std::endl - << "sweep -> scan for duplicate files, save results in a file and REMOVE empty files" << std::endl + << "sweep -> scan for duplicate files, REMOVE empty files and REPLACE other duplicates with symlinks" << std::endl << "scan -> scan and save results in a file without removing anything [DEFAULT]" << std::endl << std::endl << "[DIRECTORY]" << std::endl @@ -170,26 +170,35 @@ int main(int argc, char* argv[]) { std::cout << "[INFO] " << tracked_entries.size() << " files left being tracked" << std::endl; - auto grouped_duplicates = broom.group_duplicates(tracked_entries); - - if (grouped_duplicates.size() == 0) { + if (tracked_entries.size() == 0) { + // No duplicates at all ! std::cout << "[INFO] Nothing I can help with ! Congratulations !" << std::endl; return 0; } - // now only files with a non-unique size and contents are being tracked - // are they REALLY duplicates ? - // better to leave the REALL cleanup for the user, saving these entries in a file, than doing a blind and possibly destructive purge - broom.create_scan_results_list(grouped_duplicates, results_file_dir_path); - std::cout << "[INFO] Created scan results file" << std::endl; + // make duplicate groups from all this mess that tracked_entries right now are + auto grouped_duplicates = broom.group_duplicates(tracked_entries); - // output a little information about how much space could be freed if every duplicate - // in the group will be deleted but one double could_be_freed = 0; for (auto& record : grouped_duplicates) { could_be_freed += record.second[0].filesize * (record.second.size() - 1); } - std::cout <<"[INFO] " << could_be_freed / 1024 / 1024 << " MB could be freed" << std::endl; + + if (!sweeping) { + // output a little information about how much space could be freed if every duplicate + // in the group will be deleted but one + std::cout <<"[INFO] " << could_be_freed / 1024 / 1024 << " MB could be freed" << std::endl; + + broom.create_scan_results_list(grouped_duplicates, results_file_dir_path); + std::cout << "[INFO] Created scan results file" << std::endl; + + } else { + // remove duplicates and create symlinks + std::cout << "[INFO] Removing duplicates and creating symlinks..." << std::endl; + broom.remove_duplicates_make_symlinks(grouped_duplicates); + + std::cout <<"[INFO] Freed approximately " << could_be_freed / 1024 / 1024 << " MB (May be incorrect)" << std::endl; + } } catch(const std::exception& e) { std::cerr