Browse Source

Find empty files; More informative scan result file output; File grouping as EMPTY and DUPLICATE

main
Unbewohnte 3 years ago
parent
commit
b7556902b6
  1. 2
      build/CMakeLists.txt
  2. 99
      src/broom.cpp
  3. 44
      src/broom.hpp
  4. 39
      src/entry.cpp
  5. 26
      src/entry.hpp
  6. 13
      src/group.hpp
  7. 23
      src/main.cpp

2
build/CMakeLists.txt

@ -13,4 +13,4 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -Werror -O2")
set(EXECUTABLE_OUTPUT_PATH ../bin) set(EXECUTABLE_OUTPUT_PATH ../bin)
add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp) add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp ../src/group.hpp)

99
src/broom.cpp

@ -26,6 +26,9 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include "entry.hpp" #include "entry.hpp"
#include "broom.hpp" #include "broom.hpp"
#include "group.hpp"
namespace broom {
Broom::Broom(Options options) { Broom::Broom(Options options) {
m_benchmarking = options.benchmarking; m_benchmarking = options.benchmarking;
@ -58,12 +61,12 @@ void Broom::track(const std::filesystem::path path) {
continue; continue;
}; };
Entry entry(dir_entry.path()); entry::Entry entry(dir_entry.path());
m_tracked_entries.push_back(entry); m_tracked_entries.push_back(entry);
} }
} else if (std::filesystem::is_regular_file(path)) { } else if (std::filesystem::is_regular_file(path)) {
// just a file // just a file
Entry entry(path); entry::Entry entry(path);
m_tracked_entries.push_back(entry); m_tracked_entries.push_back(entry);
} }
@ -76,11 +79,13 @@ void Broom::track(const std::filesystem::path path) {
<< std::chrono::duration_cast<std::chrono::milliseconds>(tracking_time - t0).count() << std::chrono::duration_cast<std::chrono::milliseconds>(tracking_time - t0).count()
<< " ms" << std::endl; << " ms" << std::endl;
} }
std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl;
}; };
// removes entries with unique file sizes. Returns amount of files // removes entries with unique file sizes. Returns amount of files
// that are no longer being tracked // that are no longer being tracked
uintmax_t Broom::untrack_unique_sizes() { uintmax_t Broom::m_untrack_unique_sizes() {
// key: size, value: amount of occurences // key: size, value: amount of occurences
std::map<uintmax_t, uintmax_t> sizes_map; std::map<uintmax_t, uintmax_t> sizes_map;
@ -88,8 +93,6 @@ uintmax_t Broom::untrack_unique_sizes() {
// check if size of this entry is already in the map // check if size of this entry is already in the map
// if yes --> increment occurences counter // if yes --> increment occurences counter
// if not --> add it to the map with a counter of 1 // if not --> add it to the map with a counter of 1
entry_iter->get_size();
auto iterator = sizes_map.find(entry_iter->filesize); auto iterator = sizes_map.find(entry_iter->filesize);
if (iterator == sizes_map.end()) { if (iterator == sizes_map.end()) {
// there is no such size // there is no such size
@ -101,7 +104,7 @@ uintmax_t Broom::untrack_unique_sizes() {
} }
uintmax_t untracked = 0; uintmax_t untracked = 0;
m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, sizes_map](Entry entry) -> bool{ m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, sizes_map](entry::Entry entry) -> bool{
auto iter = sizes_map.find(entry.filesize); auto iter = sizes_map.find(entry.filesize);
if (iter->second == 1) { if (iter->second == 1) {
// unique // unique
@ -109,20 +112,15 @@ uintmax_t Broom::untrack_unique_sizes() {
return true; return true;
}; };
// std::cout << "duplicate fsize: " << iter->first << " occurences: " << iter->second << std::endl;
return false; return false;
}), m_tracked_entries.end()); }), m_tracked_entries.end());
// std::cout << "Size after untracking by size: " << m_tracked_entries.size() << std::endl;
return untracked; return untracked;
}; };
// removes entries with the same content-pieces. Returns amount of // removes entries with the same content-pieces. Returns amount of
// files that are no longer being tracked // files that are no longer being tracked
uintmax_t Broom::untrack_unique_contents() { uintmax_t Broom::m_untrack_unique_contents() {
// contents, occurences // contents, occurences
std::map<std::string, uintmax_t> contents_map; std::map<std::string, uintmax_t> contents_map;
std::map<std::string, uintmax_t>::iterator map_iter; std::map<std::string, uintmax_t>::iterator map_iter;
@ -133,12 +131,17 @@ uintmax_t Broom::untrack_unique_contents() {
// if yes --> increment occurences counter // if yes --> increment occurences counter
// if not --> add it to the map with a counter of 1 // if not --> add it to the map with a counter of 1
if (entry_iter->filesize == 0) {
// that`s an empty file. Skip it
entry_iter++;
continue;
}
try{ try{
// can get "permission denied" when opening file // can get "permission denied" when opening file
entry_iter->get_pieces(); entry_iter->get_pieces();
} catch(const std::ifstream::failure& e) { } catch(const std::ifstream::failure& e) {
// there is nothing we can do. Untrack this entry // there is nothing we can do. Untrack this entry
// std::cerr << e.what();
entry_iter = m_tracked_entries.erase(entry_iter); entry_iter = m_tracked_entries.erase(entry_iter);
continue; continue;
} }
@ -157,7 +160,7 @@ uintmax_t Broom::untrack_unique_contents() {
}; };
uintmax_t untracked = 0; uintmax_t untracked = 0;
m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](Entry entry) -> bool { m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](entry::Entry entry) -> bool {
auto iter = contents_map.find(entry.pieces); auto iter = contents_map.find(entry.pieces);
if (iter->second == 1) { if (iter->second == 1) {
// unique // unique
@ -171,17 +174,16 @@ uintmax_t Broom::untrack_unique_contents() {
return untracked; return untracked;
}; };
// finds all duplicates among tracked entries and marks them with appropriate group.
// find all duplicates among tracked entries, stop tracking uniques // Returns amount of duplicate files
void Broom::find_duplicates() { uintmax_t Broom::m_find_duplicates() {
auto t0 = std::chrono::high_resolution_clock::now(); auto t0 = std::chrono::high_resolution_clock::now();
// print how many files are being tracked // print how many files are being tracked
uintmax_t global_untracked = m_tracked_entries.size(); uintmax_t global_untracked = m_tracked_entries.size();
std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl;
// untrack by size // untrack by size
uintmax_t untracked_by_size = untrack_unique_sizes(); uintmax_t untracked_by_size = m_untrack_unique_sizes();
global_untracked += untracked_by_size; global_untracked += untracked_by_size;
std::cout << "[INFO] Untracked " << untracked_by_size << " unique sizes" << std::endl; std::cout << "[INFO] Untracked " << untracked_by_size << " unique sizes" << std::endl;
@ -195,7 +197,7 @@ void Broom::find_duplicates() {
} }
// untrack by contents // untrack by contents
uintmax_t untracked_by_contents = untrack_unique_contents(); uintmax_t untracked_by_contents = m_untrack_unique_contents();
global_untracked += untracked_by_contents; global_untracked += untracked_by_contents;
auto contents_untrack_time = std::chrono::high_resolution_clock::now(); auto contents_untrack_time = std::chrono::high_resolution_clock::now();
@ -209,37 +211,72 @@ void Broom::find_duplicates() {
std::cout << "[INFO] Untracked " << untracked_by_contents << " unique contents" << std::endl; std::cout << "[INFO] Untracked " << untracked_by_contents << " unique contents" << std::endl;
std::cout << "[INFO] Duplicates: " << m_tracked_entries.size() << std::endl; std::cout << "[INFO] Found " << m_tracked_entries.size() << " possible duplicate files" << std::endl;
create_duplicates_list(); // mark duplicate entries
std::cout << "[INFO] Created a duplicates list" << std::endl; for (entry::Entry& duplicate_entry : m_tracked_entries) {
duplicate_entry.group = group::DUPLICATE;
}
return m_tracked_entries.size();
}; };
// saves current list of duplicate file paths into a file in dir // creates a list of duplicate, empty files into a file
void Broom::create_duplicates_list(const std::filesystem::path dir, const std::string filename) { void Broom::create_scan_results_list(const std::filesystem::path dir, const std::string filename) {
if (!std::filesystem::exists(dir)) { if (!std::filesystem::exists(dir)) {
// create it then // create it then
bool created = std::filesystem::create_directories(dir); bool created = std::filesystem::create_directories(dir);
if (!created) { if (!created) {
throw "Could not create a directory"; throw "Could not create a directory to save scan results in";
} }
} }
// create output file there // create output file there
std::fstream outfile(dir / filename, std::ios::out); std::fstream outfile(dir / filename, std::ios::out);
if (!outfile.is_open()) { if (!outfile.is_open()) {
throw "Could not create an output file"; throw "Could not create a scan results file";
} }
for (const Entry duplicate_entry : m_tracked_entries) { for (const entry::Entry entry : m_tracked_entries) {
// log every duplicate entry // log every entry and its group
outfile << duplicate_entry.path << std::endl; if (entry.group == group::EMPTY) {
outfile << entry.path << " --- is an empty file" << std::endl;
} else if (entry.group == group::DUPLICATE) {
outfile << entry.path << " --- is a duplicate of another file" << std::endl;
}
} }
outfile.close(); outfile.close();
std::cout << "[INFO] Created scan results file" << std::endl;
};
// finds empty files among tracked entries and gives them appropriate group
// Returns amount of found empty files
uintmax_t Broom::m_find_empty_files() {
uintmax_t found_empty_files = 0;
for (entry::Entry& entry : m_tracked_entries) {
if (entry.filesize == 0) {
entry.group = group::EMPTY;
found_empty_files++;
}
}
std::cout << "[INFO] Found " << found_empty_files << " empty files" << std::endl;
return found_empty_files;
};
// Scans the currently tracked entries: first tags empty files,
// then searches for duplicates. Both helpers report their findings on stdout;
// their return values (the respective counts) are discarded here.
// NOTE(review): the duplicate pass appears to assign group::DUPLICATE to every
// entry that is still tracked when it finishes, so an EMPTY tag set by the first
// pass may be overwritten for entries that survive untracking - confirm.
void Broom::scan() {
// pass 1: mark zero-sized entries as group::EMPTY
m_find_empty_files();
// pass 2: untrack unique entries and mark the rest as duplicates
m_find_duplicates();
}; };
// remove ALL duplicate files // remove ALL duplicate files
void Broom::sweep_all() { void Broom::sweep() {
}; };
}

44
src/broom.hpp

@ -23,6 +23,7 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <cstdint> #include <cstdint>
#include <vector> #include <vector>
namespace broom {
// Broom`s settings // Broom`s settings
struct Options { struct Options {
bool sweeping; bool sweeping;
@ -38,12 +39,24 @@ protected:
// TODO(think about how sweeping should work) // TODO(think about how sweeping should work)
bool m_sweeping; bool m_sweeping;
// how many files have been (or would be) swept
uintmax_t m_sweeped_files;
// how many bytes were (or would be) freed
uintmax_t m_sweeped_size;
// paths to tracked files // paths to tracked files
std::vector<Entry> m_tracked_entries; std::vector<entry::Entry> m_tracked_entries;
// finds empty files among tracked entries.
// Returns amount of found empty files
uintmax_t m_find_empty_files();
// removes entries with unique file sizes. Returns amount of files
// that are no longer being tracked
uintmax_t m_untrack_unique_sizes();
// removes entries with the same content-pieces. Returns amount of
// files that are no longer being tracked
uintmax_t m_untrack_unique_contents();
// finds all duplicates among tracked entries and marks them with appropriate group
// Returns amount of duplicate files
uintmax_t m_find_duplicates();
public: public:
Broom(Options options); Broom(Options options);
@ -53,22 +66,17 @@ public:
// error in case path does not exist // error in case path does not exist
void track(const std::filesystem::path path); void track(const std::filesystem::path path);
// find all duplicates in the directory // creates a list of duplicate, empty files into a file
void find_duplicates(); void create_scan_results_list(const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt");
// removes entries with unique file sizes. Returns amount of files // TODO
// that are no longer being tracked void sweep();
uintmax_t untrack_unique_sizes();
// removes entries with the same content-pieces. Returns amount of // scans tracked entries for duplicates and empty files
// files that are no longer being tracked void scan();
uintmax_t untrack_unique_contents(); };
// saves current list of duplicate file paths into a file }
void create_duplicates_list(const std::filesystem::path dir = ".", const std::string filename = "duplicate_files_list.txt");
// TODO
void sweep_all();
};
#endif #endif

39
src/entry.cpp

@ -24,21 +24,20 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <cstring> #include <cstring>
// A wrapper for every file with all necessary information namespace entry {
// A wrapper for every file in filesystem with all necessary information
Entry::Entry(const std::filesystem::path entry_path) { Entry::Entry(const std::filesystem::path entry_path) {
// path // path
path = entry_path; path = entry_path;
};
Entry::~Entry() {};
// sets this entry`s filesize // filesize
void Entry::get_size() {
filesize = std::filesystem::file_size(path); filesize = std::filesystem::file_size(path);
}; };
Entry::~Entry() {};
// reads 2 pieces from the middle and the end of a file, converts them into // reads 3 pieces from the beginning, middle and the end of a file, converts them into
// a convenient hex-encoded string // a convenient hex-encoded string
void Entry::get_pieces() { void Entry::get_pieces() {
std::fstream entry_file; std::fstream entry_file;
@ -48,28 +47,35 @@ void Entry::get_pieces() {
throw std::ifstream::failure("Could not open \"" + path.string() + "\"; reason: " + std::string(std::strerror(errno)) + "\n"); throw std::ifstream::failure("Could not open \"" + path.string() + "\"; reason: " + std::string(std::strerror(errno)) + "\n");
} }
char pieces_buffer[PIECE_SIZE * 2]; char pieces_buffer[PIECE_SIZE * PIECES_AMOUNT];
if (filesize <= PIECE_SIZE * 2) { if (filesize <= PIECE_SIZE * PIECES_AMOUNT) {
// can`t take whole 2 pieces ! // can`t take whole 3 pieces !
// read the whole file then // read the whole file then
entry_file.read(pieces_buffer, filesize); entry_file.read(pieces_buffer, filesize);
} else { } else {
// read chunk from the beginning
char begin_buf[PIECE_SIZE];
entry_file.read(begin_buf, PIECE_SIZE);
for (uint8_t i = 0; i < PIECE_SIZE; i++) {
pieces_buffer[i] = begin_buf[i];
}
uintmax_t middle_of_the_file = (double) filesize / 2.0 - PIECE_SIZE; uintmax_t middle_of_the_file = (double) filesize / 2.0 - PIECE_SIZE;
entry_file.seekg(middle_of_the_file, std::ios::beg); entry_file.seekg(middle_of_the_file, std::ios::beg);
// read CHUNK_SIZE bytes from the middle of the file // read CHUNK_SIZE bytes from the middle of the file
char middle_buf[PIECE_SIZE]; char middle_buf[PIECE_SIZE];
entry_file.read(middle_buf, PIECE_SIZE); entry_file.read(middle_buf, PIECE_SIZE);
for (uint8_t i = 0; i < PIECE_SIZE; i++) { for (uint8_t i = PIECE_SIZE; i < PIECE_SIZE * 2; i++) {
pieces_buffer[i] = middle_buf[i]; pieces_buffer[i] = middle_buf[i - PIECE_SIZE];
}; };
// jump to the last CHUNK_SIZE bytes of the file and read the as well // jump to the last CHUNK_SIZE bytes of the file and read the as well
entry_file.seekg(PIECE_SIZE, std::ios::end); entry_file.seekg(PIECE_SIZE, std::ios::end);
char end_buf[PIECE_SIZE]; char end_buf[PIECE_SIZE];
entry_file.read(end_buf, PIECE_SIZE); entry_file.read(end_buf, PIECE_SIZE);
for (uint8_t i = PIECE_SIZE; i < PIECE_SIZE * 2; i++) { for (uint8_t i = PIECE_SIZE * 2; i < PIECE_SIZE * 3; i++) {
pieces_buffer[i] = end_buf[i - PIECE_SIZE]; pieces_buffer[i] = end_buf[i - PIECE_SIZE * 2];
}; };
}; };
entry_file.close(); entry_file.close();
@ -81,11 +87,12 @@ void Entry::get_pieces() {
}; };
pieces = pieces_hex.str(); pieces = pieces_hex.str();
std::cout << pieces << std::endl;
}; };
// Remove entry from the disk // Remove entry from the disk
void Entry::remove() { void Entry::remove() {
std::filesystem::remove(path); std::filesystem::remove(path);
}; };
}

26
src/entry.hpp

@ -25,23 +25,26 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <sstream> #include <sstream>
#include <iomanip> #include <iomanip>
// 2 pieces (middle and end of the file) #include "group.hpp"
const uint8_t PIECE_SIZE = 16;
// A wrapper for every file with all necessary information
namespace entry {
// 3 pieces (beginning, middle and end of the file)
const uint8_t PIECE_SIZE = 6;
const uint8_t PIECES_AMOUNT = 3;
// A wrapper for every file in filesystem with all necessary information
class Entry { class Entry {
public: public:
std::filesystem::path path; std::filesystem::path path; // set via constructor
uintmax_t filesize; uintmax_t filesize; // set via constructor
std::string pieces; // 2 hex-represented pieces of file std::string pieces; // 3 hex-represented pieces of file; set only via a method call to not stress the disk
group::Group group; // set externally
Entry(const std::filesystem::path entry_path); Entry(const std::filesystem::path entry_path);
~Entry(); ~Entry();
// sets this entry`s filesize // reads 3 pieces from the beginning, middle and the end of a file, converts them into
void get_size();
// reads 2 pieces from the middle and the end of a file, converts them into
// a convenient hex-encoded string // a convenient hex-encoded string
void get_pieces(); void get_pieces();
@ -49,5 +52,8 @@ public:
void remove(); void remove();
}; };
}
#endif #endif

13
src/group.hpp

@ -0,0 +1,13 @@
#ifndef GROUP_HPP
#define GROUP_HPP
// Classification tags that can be attached to a tracked file.
namespace group {

// Unscoped on purpose: enumerators are referred to as group::DUPLICATE / group::EMPTY.
enum Group {
    DUPLICATE = 0, // file reported as a duplicate of another file
    EMPTY = 1,     // file whose size is zero
};

} // namespace group
#endif

23
src/main.cpp

@ -26,15 +26,16 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include "broom.hpp" #include "broom.hpp"
// Broom version number // Broom version number
#define VERSION "v0.1.0" #define VERSION "v0.1.1"
void print_help() { void print_help() {
std::cout std::cout
<< "broom [FLAGS..] [COMMAND] [FILES|DIRECTORIES...]" << std::endl << std::endl << "broom [FLAGS..] [COMMAND] [DIRECTORY]" << std::endl << std::endl
<< "FLAGS" << std::endl << "[FLAGS]" << std::endl
<< "-v | --version -> print version information and exit" << std::endl << "-v | --version -> print version information and exit" << std::endl
<< "-h | --help -> print this message and exit" << std::endl << std::endl << "-h | --help -> print this message and exit" << std::endl << std::endl
<< "COMMANDS" << std::endl
<< "[COMMANDS]" << std::endl
<< "sweep -> scan for duplicate files and delete (sweep) all of them but the last one" << std::endl << "sweep -> scan for duplicate files and delete (sweep) all of them but the last one" << std::endl
<< "scan -> scan for duplicate files and output information in a file" << std::endl << "scan -> scan for duplicate files and output information in a file" << std::endl
<< std::endl; << std::endl;
@ -43,7 +44,8 @@ void print_help() {
void print_version() { void print_version() {
std::cout std::cout
<< "broom " << VERSION << std::endl << "broom " << VERSION << std::endl
<< "a command line utility to locate and manage duplicate files" << std::endl << std::endl << "incurable hoarder`s helpful friend" << std::endl << std::endl
<< "Copyright (C) 2021 Kasyanov Nikolay Alexeevich (Unbewohnte (me@unbewohnte.xyz))" << std::endl << "Copyright (C) 2021 Kasyanov Nikolay Alexeevich (Unbewohnte (me@unbewohnte.xyz))" << std::endl
<< "This program comes with ABSOLUTELY NO WARRANTY." << std::endl << "This program comes with ABSOLUTELY NO WARRANTY." << std::endl
<< "This is free software, and you are welcome to redistribute it" << std::endl << "This is free software, and you are welcome to redistribute it" << std::endl
@ -52,7 +54,7 @@ void print_version() {
}; };
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
Options options; broom::Options options;
std::filesystem::path tracked_path; std::filesystem::path tracked_path;
if (argc < 2) { if (argc < 2) {
@ -62,7 +64,7 @@ int main(int argc, char* argv[]) {
// process command line arguments // process command line arguments
for (unsigned int i = 1; i < argc; i++) { for (unsigned int i = 1; i < argc; i++) {
// flags -> command -> directories&&files // flags -> command -> directory
if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) { if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
print_help(); print_help();
@ -83,7 +85,7 @@ int main(int argc, char* argv[]) {
} }
else { else {
// add path // add path
tracked_path = argv[i]; tracked_path = std::filesystem::path(argv[i]);
}; };
}; };
@ -94,10 +96,11 @@ int main(int argc, char* argv[]) {
}; };
Broom broom(options); broom::Broom broom(options);
try { try {
broom.track(tracked_path); broom.track(tracked_path);
broom.find_duplicates(); broom.scan();
broom.create_scan_results_list();
} catch(const std::invalid_argument& e) { } catch(const std::invalid_argument& e) {
std::cerr std::cerr
<< "[ERROR] Invalid argument: " << std::endl << "[ERROR] Invalid argument: " << std::endl

Loading…
Cancel
Save