Compare commits

...

8 Commits
v0.2.2 ... main

  1. 16
      README.md
  2. 2
      build/CMakeLists.txt
  3. 58
      src/broom.cpp
  4. 7
      src/broom.hpp
  5. 2
      src/entry.cpp
  6. 12
      src/entry.hpp
  7. 33
      src/group.hpp
  8. 145
      src/main.cpp

16
README.md

@ -22,7 +22,7 @@
- clone this repository
`git clone https://github.com/Unbewohnte/broom`
`git clone http://unbewohnte.xyz:3000/Unbewohnte/broom`
- proceed to the directory
@ -40,7 +40,7 @@ if you're on GNU/Linux - you can run `install.sh` for broom to become system-wid
### "The lazy" way
- proceed [to the releases page](https://github.com/Unbewohnte/broom/releases) and get yourself a pre-compiled binary
- proceed [to the releases page](http://unbewohnte.xyz:3000/Unbewohnte/broom/releases) and get yourself a pre-compiled binary
---
@ -53,10 +53,11 @@ broom [FLAGS..] [COMMAND] [DIRECTORY]
- `-v` or `--version` -> print version information and exit
- `-h` or `--help` -> print this message and exit
- `-od` or `--output-directory` -> path to the directory to save results file in
[COMMANDS]
- `sweep` -> scan for duplicate files, save results in a file and REMOVE empty files
- `sweep` -> scan for duplicate files, REMOVE empty files and REPLACE other duplicates with symlinks
- `scan` -> scan and save results in a file without removing anything [DEFAULT]
@ -64,10 +65,10 @@ broom [FLAGS..] [COMMAND] [DIRECTORY]
### Examples
- `broom scan ~/homework`
- `broom sweep ~/homework/I/have/a/lot/of/empty/files/here/for/some/reason`
- `broom scan -od . ~/homework`
- `broom sweep ~/homework`
after the scan the results file will be saved in your current working directory, scan results file contains
After the scan, the results file is saved in your current working directory unless you specified another directory with `-od`. The scan results file contains
a list of duplicate files that are grouped together so you can see EXACTLY WHERE each duplicate is in the filesystem.
---
@ -79,4 +80,5 @@ GPLv3
## TODO
- Make it go `P` A `R` A `L` L `E` L
- Output approximate size that could be freed
- ~~Output approximate size that could be freed~~
- ~~Remove duplicates and create symlinks~~

2
build/CMakeLists.txt

@ -22,5 +22,5 @@ endif()
set(EXECUTABLE_OUTPUT_PATH ../bin)
add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp ../src/group.hpp)
add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp)
target_link_libraries(broom Threads::Threads)

58
src/broom.cpp

@ -28,7 +28,6 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include "entry.hpp"
#include "broom.hpp"
#include "group.hpp"
namespace broom {
@ -52,7 +51,7 @@ std::vector<entry::Entry> Broom::track(const std::filesystem::path path) {
);
for (auto dir_entry : std::filesystem::recursive_directory_iterator(path, options)) {
if (!dir_entry.is_regular_file()) {
if (!dir_entry.is_regular_file() || std::filesystem::is_symlink(dir_entry.path())) {
// skip everything that we cannot process so easily
continue;
};
@ -60,7 +59,7 @@ std::vector<entry::Entry> Broom::track(const std::filesystem::path path) {
entry::Entry entry(dir_entry.path());
tracked_entries.push_back(entry);
}
} else if (std::filesystem::is_regular_file(path)) {
} else if (std::filesystem::is_regular_file(path) && !std::filesystem::is_symlink(path)) {
// just a file
entry::Entry entry(path);
tracked_entries.push_back(entry);
@ -186,7 +185,7 @@ uintmax_t Broom::find_empty_files(std::vector<entry::Entry>& tracked_entries) {
for (entry::Entry& entry : tracked_entries) {
if (entry.filesize == 0) {
// empty files can`t be considered as duplicates. assign a group
entry.group = group::EMPTY;
entry.group = entry::EMPTY;
found_empty_files++;
}
}
@ -199,7 +198,7 @@ uintmax_t Broom::remove_empty_files(std::vector<entry::Entry>& tracked_entries)
uintmax_t removed = 0;
tracked_entries.erase(std::remove_if(tracked_entries.begin(), tracked_entries.end(), [&removed](entry::Entry& entry) -> bool {
if (entry.group == group::EMPTY) {
if (entry.group == entry::EMPTY) {
try {
entry.remove();
removed++;
@ -216,14 +215,30 @@ uintmax_t Broom::remove_empty_files(std::vector<entry::Entry>& tracked_entries)
return removed;
};
// Drops every tracked entry that belongs to the given group from the list.
// Returns the number of entries that were dropped
uintmax_t Broom::untrack_group(std::vector<entry::Entry>& tracked_entries, entry::Group group) {
    const auto size_before = tracked_entries.size();

    // true for entries that must leave the tracked list
    const auto belongs_to_group = [group](const entry::Entry& candidate) -> bool {
        return candidate.group == group;
    };

    tracked_entries.erase(
        std::remove_if(tracked_entries.begin(), tracked_entries.end(), belongs_to_group),
        tracked_entries.end()
    );

    // every erased element matched the predicate exactly once, so the size
    // difference is the amount of untracked entries
    return size_before - tracked_entries.size();
}
// marks every entry without any group as a duplicate
void Broom::mark_as_duplicates(std::vector<entry::Entry>& tracked_entries) {
for (entry::Entry& entry : tracked_entries) {
if (entry.group == group::EMPTY) {
if (entry.group == entry::EMPTY) {
// do not mess up grouping
continue;
}
entry.group = group::DUPLICATE;
entry.group = entry::DUPLICATE;
}
};
@ -252,4 +267,33 @@ std::map<std::string, std::vector<entry::Entry>> Broom::group_duplicates(std::ve
return duplicate_groups;
};
// REMOVES every duplicate file in a group except the first one and creates symlinks pointing to the
// first remaining real file.
//
// grouped_duplicates - groups of entries; the first entry of each group stays on disk untouched,
// every other entry of the group is removed and a symlink is created at its former path.
//
// The work is best-effort: a failure on one duplicate does not stop the processing of the others.
void Broom::remove_duplicates_make_symlinks(const std::map<std::string, std::vector<entry::Entry>> grouped_duplicates) {
    for (const auto& record : grouped_duplicates) {
        const std::vector<entry::Entry>& duplicates = record.second;
        if (duplicates.size() < 2) {
            // nothing to replace in this group
            continue;
        }

        // A symlink stores its target verbatim and a relative target is resolved against
        // the directory of the LINK, not against the current working directory. Make the
        // target absolute BEFORE deleting anything so the created links never dangle.
        std::filesystem::path original_file_path;
        try {
            original_file_path = std::filesystem::absolute(duplicates.front().path);
        } catch(...) {
            // no reliable link target for this group; leave its files as they are
            continue;
        }

        // the first entry is the real file that stays; start from the second one
        for (std::size_t index = 1; index < duplicates.size(); index++) {
            const entry::Entry& duplicate_entry = duplicates[index];
            const std::filesystem::path removed_duplicate_path = duplicate_entry.path;

            try {
                // remove the entry; if this throws, the symlink below is not attempted
                duplicate_entry.remove();

                // make a symlink in place of the removed duplicate
                std::filesystem::create_symlink(original_file_path, removed_duplicate_path);
            } catch(...) {
                // deliberately ignored: keep going with the remaining duplicates.
                // NOTE(review): if the removal succeeded but the symlink creation failed,
                // the duplicate is gone without a link; the original file still exists
            }
        }
    }
};
}

7
src/broom.hpp

@ -46,6 +46,9 @@ public:
// untracks entries with the same content-pieces. Returns amount of
// files that are no longer being tracked.
uintmax_t untrack_unique_contents(std::vector<entry::Entry>& tracked_entries);
// Untracks specified group in tracked entries. Returns an amount of entries untracked
uintmax_t untrack_group(std::vector<entry::Entry>& tracked_entries, entry::Group group);
// finds empty files among tracked entries and marks them with the appropriate group.
// Returns amount of found empty files
@ -61,6 +64,10 @@ public:
// string of pieces. REMOVES EVERYTHING FROM GIVEN TRACKED ENTRIES
std::map<std::string, std::vector<entry::Entry>> group_duplicates(std::vector<entry::Entry>& tracked_entries);
// REMOVES every duplicate file in a group except the first one and creates symlinks pointing to the
// first remaining real file
void remove_duplicates_make_symlinks(const std::map<std::string, std::vector<entry::Entry>> grouped_duplicates);
// creates a list of duplicate, empty files and puts it into a file
void create_scan_results_list(const std::map<std::string, std::vector<entry::Entry>> grouped_duplicates, const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt");
};

2
src/entry.cpp

@ -95,7 +95,7 @@ void Entry::get_pieces() {
};
// Remove entry from the disk
void Entry::remove() {
void Entry::remove() const {
std::filesystem::remove(path);
};

12
src/entry.hpp

@ -26,10 +26,14 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <iomanip>
#include <string>
#include "group.hpp"
namespace entry {
// Group that an entry can be assigned to. The `group` field of an Entry holds one of
// these values and is set externally (not by the Entry itself)
enum Group {
// the entry is treated as a duplicate of some other tracked entry
DUPLICATE,
// the entry is an empty file (its filesize is 0)
EMPTY,
};
// 3 pieces (beginning, middle and end of the file)
const uint8_t PIECE_SIZE = 75;
const uint8_t PIECES_AMOUNT = 3;
@ -40,7 +44,7 @@ public:
std::filesystem::path path; // set via constructor
uintmax_t filesize; // set via constructor
std::string pieces; // 3 hex-represented pieces of file; set only via a method call to not stress the disk
group::Group group; // set externally
Group group; // set externally
Entry(const std::filesystem::path entry_path);
~Entry();
@ -51,7 +55,7 @@ public:
void get_pieces();
// REMOVE entry from the disk
void remove();
void remove() const;
};
}

33
src/group.hpp

@ -1,33 +0,0 @@
/*
Copyright (C) 2021 Kasyanov Nikolay Alexeevich (Unbewohnte (me@unbewohnte.xyz))
This file is part of broom.
broom is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
broom is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with broom. If not, see <https://www.gnu.org/licenses/>.
*/
#ifndef GROUP_HPP
#define GROUP_HPP
namespace group {
enum Group {
DUPLICATE,
EMPTY,
};
}
#endif

145
src/main.cpp

@ -1,5 +1,5 @@
/*
Copyright (C) 2021 Kasyanov Nikolay Alexeevich (Unbewohnte (me@unbewohnte.xyz))
Copyright (C) 2021 Kasyanov Nikolay Alexeyevich (Unbewohnte (me@unbewohnte.xyz))
This file is part of broom.
@ -28,50 +28,51 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include "broom.hpp"
// Broom version number
#define VERSION "v0.2.2"
#define VERSION "v0.3.1"
void print_help() {
std::cout
<< "broom [FLAGS..] [COMMAND] [DIRECTORY]" << std::endl << std::endl
<< "[FLAGS]" << std::endl
<< "-v | --version -> print version information and exit" << std::endl
<< "-h | --help -> print this message and exit" << std::endl << std::endl
<< "[COMMANDS]" << std::endl
<< "sweep -> scan for duplicate files, save results in a file and REMOVE empty files" << std::endl
<< "scan -> scan and save results in a file without removing anything [DEFAULT]" << std::endl << std::endl
<< "[DIRECTORY]" << std::endl
<< "path to the directory to be scanned" << std::endl
<< std::endl;
<< "broom [FLAGS..] [COMMAND] [DIRECTORY]\n\n"
<< "[FLAGS]\n"
<< "-v | --version -> print version information and exit\n"
<< "-h | --help -> print this message and exit\n"
<< "-ie | --ignore-empty -> do not remove empty files when sweeping\n"
<< "-od | --output-directory -> path to the directory to save results file in when scanning\n\n"
<< "[COMMANDS]\n"
<< "sweep -> scan for duplicate files, REMOVE empty files and REPLACE other duplicates with symlinks\n"
<< "scan -> scan and save results in a file without removing anything [DEFAULT]\n\n"
<< "[DIRECTORY]\n"
<< "path to the directory to be scanned\n\n";
};
void print_version() {
std::cout
<< "broom " << VERSION << std::endl
<< "incurable hoarder`s helpful friend" << std::endl << std::endl
<< " _" << std::endl
<< " //" << std::endl
<< " // " << std::endl
<< " // " << std::endl
<< " // " << std::endl
<< " /####/ " << std::endl
<< " ////// " << std::endl
<< " /////// " << std::endl << std::endl
<< "Copyright (C) 2021 Kasyanov Nikolay Alexeevich (Unbewohnte (me@unbewohnte.xyz))" << std::endl
<< "This program comes with ABSOLUTELY NO WARRANTY." << std::endl
<< "This is free software, and you are welcome to redistribute it" << std::endl
<< "under certain conditions" << std::endl
<< std::endl;
<< "broom " << VERSION << "\n"
<< "incurable hoarder`s helpful friend\n\n"
<< " _\n"
<< " //\n"
<< " // \n"
<< " // \n"
<< " // \n"
<< " /####/ \n"
<< " ////// \n"
<< " /////// \n\n"
<< "Copyright (C) 2021 Kasyanov Nikolay Alexeyevich (Unbewohnte (me@unbewohnte.xyz))\n"
<< "This program comes with ABSOLUTELY NO WARRANTY.\n"
<< "This is free software, and you are welcome to redistribute it\n"
<< "under certain conditions\n";
};
int main(int argc, char* argv[]) {
bool sweeping = false;
std::filesystem::path results_file_dir_path = ".";
std::filesystem::path tracked_path;
bool sweeping = false;
bool ignore_empty = false;
if (argc < 2) {
print_help();
@ -90,6 +91,13 @@ int main(int argc, char* argv[]) {
print_version();
return 0;
}
else if (strcmp(argv[i], "-od") == 0 || strcmp(argv[i], "--output-directory") == 0) {
    // this flag takes the NEXT argument as its value. Without the check a trailing
    // "-od" would read argv[argc], which is a null pointer, and build a path from it
    if (i + 1 >= argc) {
        std::cerr << "[ERROR] " << argv[i] << " requires a path to the directory\n";
        return 1;
    }
    i++;
    results_file_dir_path = std::filesystem::path(argv[i]);
}
else if (strcmp(argv[i], "-ie") == 0 || strcmp(argv[i], "--ignore-empty") == 0) {
ignore_empty = true;
}
else if (strcmp(argv[i], "sweep") == 0) {
sweeping = true;
}
@ -112,37 +120,41 @@ int main(int argc, char* argv[]) {
broom::Broom broom;
try {
std::cout
<< " _" << std::endl
<< " //" << std::endl
<< " // " << std::endl
<< " // " << std::endl
<< " // " << std::endl
<< " /####/ " << std::endl
<< " ////// " << std::endl
<< " /////// " << std::endl << std::endl;
<< " _\n"
<< " //\n"
<< " // \n"
<< " // \n"
<< " // \n"
<< " /####/ \n"
<< " ////// \n"
<< " /////// \n\n";
if (sweeping) {
std::cout << "[Sweeping]" << std::endl << std::endl;
std::cout << "[Sweeping]\n\n";
} else {
std::cout << "[Scanning]" << std::endl << std::endl;
std::cout << "[Scanning]\n\n";
}
// track files in a given directory
std::vector<entry::Entry> tracked_entries = broom.track(tracked_path);
std::cout << "[INFO] Tracking " << tracked_entries.size() << " files" << std::endl;
std::cout << "[INFO] Tracking " << tracked_entries.size() << " files\n";
// find empty files
uintmax_t empty_files = broom.find_empty_files(tracked_entries);
std::cout << "[INFO] Found " << empty_files << " empty files" << std::endl;
std::cout << "[INFO] Found " << empty_files << " empty files\n";
// if sweeping - remove empty files right away
if (sweeping) {
if (sweeping && !ignore_empty) {
uintmax_t removed = broom.remove_empty_files(tracked_entries);
std::cout << "[INFO] Removed " << removed << " empty files" << std::endl;
std::cout << "[INFO] Removed " << removed << " empty files\n";
} else {
// just untrack them, do not remove
uintmax_t untracked_empty = broom.untrack_group(tracked_entries, entry::Group::EMPTY);
std::cout << "[INFO] Skipped " << untracked_empty << " empty files\n";
}
// untrack unique sizes
uintmax_t untracked = broom.untrack_unique_sizes(tracked_entries);
std::cout << "[INFO] Untracked " << untracked << " files with a unique size" << std::endl;
std::cout << "[INFO] Untracked " << untracked << " files with a unique size\n";
// get content pieces for each entry
tracked_entries.erase(std::remove_if(tracked_entries.begin(), tracked_entries.end(), [](entry::Entry& entry) -> bool {
@ -158,24 +170,45 @@ int main(int argc, char* argv[]) {
// untrack unique contents
untracked = broom.untrack_unique_contents(tracked_entries);
std::cout << "[INFO] Untracked " << untracked << " files with unique contents" << std::endl;
std::cout << "[INFO] Untracked " << untracked << " files with unique contents\n";
// mark entries as duplicates
broom.mark_as_duplicates(tracked_entries);
std::cout << "[INFO] " << tracked_entries.size() << " files left being tracked" << std::endl;
std::cout << "[INFO] " << tracked_entries.size() << " files left being tracked\n";
if (tracked_entries.size() == 0) {
// No duplicates at all !
std::cout << "[INFO] Nothing I can help with ! Congratulations !\n";
return 0;
}
// make duplicate groups from all this mess that tracked_entries right now are
auto grouped_duplicates = broom.group_duplicates(tracked_entries);
// now only files with a non-unique size and contents are being tracked
// are they REALLY duplicates ?
// better to leave the REALL cleanup for the user, saving these entries in a file, than doing a blind and possibly destructive purge
broom.create_scan_results_list(grouped_duplicates);
std::cout << "[INFO] Created scan results file" << std::endl;
double could_be_freed = 0;
for (auto& record : grouped_duplicates) {
could_be_freed += record.second[0].filesize * (record.second.size() - 1);
}
if (!sweeping) {
// output a little information about how much space could be freed if every duplicate
// in the group will be deleted but one
std::cout <<"[INFO] " << could_be_freed / 1024 / 1024 << " MB could be freed\n";
broom.create_scan_results_list(grouped_duplicates, results_file_dir_path);
std::cout << "[INFO] Created scan results file\n";
} else {
// remove duplicates and create symlinks
std::cout << "[INFO] Removing duplicates and creating symlinks...\n";
broom.remove_duplicates_make_symlinks(grouped_duplicates);
std::cout <<"[INFO] Freed approximately " << could_be_freed / 1024 / 1024 << " MB (May be incorrect)\n";
}
} catch(const std::exception& e) {
std::cerr
<< "[ERROR] " << e.what() << std::endl;
<< "[ERROR] " << e.what() <<"\n";
return 1;
};

Loading…
Cancel
Save