Browse Source

Find empty files; More informative scan result file output; File grouping as EMPTY and DUPLICATE

main
Unbewohnte 3 years ago
parent
commit
b7556902b6
  1. 2
      build/CMakeLists.txt
  2. 99
      src/broom.cpp
  3. 44
      src/broom.hpp
  4. 39
      src/entry.cpp
  5. 26
      src/entry.hpp
  6. 13
      src/group.hpp
  7. 23
      src/main.cpp

2
build/CMakeLists.txt

@ -13,4 +13,4 @@ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -Werror -O2")
set(EXECUTABLE_OUTPUT_PATH ../bin)
add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp)
add_executable(broom ../src/main.cpp ../src/entry.cpp ../src/broom.cpp ../src/group.hpp)

99
src/broom.cpp

@ -26,6 +26,9 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include "entry.hpp"
#include "broom.hpp"
#include "group.hpp"
namespace broom {
Broom::Broom(Options options) {
m_benchmarking = options.benchmarking;
@ -58,12 +61,12 @@ void Broom::track(const std::filesystem::path path) {
continue;
};
Entry entry(dir_entry.path());
entry::Entry entry(dir_entry.path());
m_tracked_entries.push_back(entry);
}
} else if (std::filesystem::is_regular_file(path)) {
// just a file
Entry entry(path);
entry::Entry entry(path);
m_tracked_entries.push_back(entry);
}
@ -76,11 +79,13 @@ void Broom::track(const std::filesystem::path path) {
<< std::chrono::duration_cast<std::chrono::milliseconds>(tracking_time - t0).count()
<< " ms" << std::endl;
}
std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl;
};
// removes entries with unique file sizes. Returns amount of files
// that are no longer being tracked
uintmax_t Broom::untrack_unique_sizes() {
uintmax_t Broom::m_untrack_unique_sizes() {
// key: size, value: amount of occurrences
std::map<uintmax_t, uintmax_t> sizes_map;
@ -88,8 +93,6 @@ uintmax_t Broom::untrack_unique_sizes() {
// check if size of this entry is already in the map
// if yes --> increment occurrences counter
// if not --> add it to the map with a counter of 1
entry_iter->get_size();
auto iterator = sizes_map.find(entry_iter->filesize);
if (iterator == sizes_map.end()) {
// there is no such size
@ -101,7 +104,7 @@ uintmax_t Broom::untrack_unique_sizes() {
}
uintmax_t untracked = 0;
m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, sizes_map](Entry entry) -> bool{
m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, sizes_map](entry::Entry entry) -> bool{
auto iter = sizes_map.find(entry.filesize);
if (iter->second == 1) {
// unique
@ -109,20 +112,15 @@ uintmax_t Broom::untrack_unique_sizes() {
return true;
};
// std::cout << "duplicate fsize: " << iter->first << " occurences: " << iter->second << std::endl;
return false;
}), m_tracked_entries.end());
// std::cout << "Size after untracking by size: " << m_tracked_entries.size() << std::endl;
return untracked;
};
// removes entries whose content-pieces are unique (occur only once). Returns amount of
// files that are no longer being tracked
uintmax_t Broom::untrack_unique_contents() {
uintmax_t Broom::m_untrack_unique_contents() {
// contents, occurrences
std::map<std::string, uintmax_t> contents_map;
std::map<std::string, uintmax_t>::iterator map_iter;
@ -133,12 +131,17 @@ uintmax_t Broom::untrack_unique_contents() {
// if yes --> increment occurrences counter
// if not --> add it to the map with a counter of 1
if (entry_iter->filesize == 0) {
// that`s an empty file. Skip it
entry_iter++;
continue;
}
try{
// can get "permission denied" when opening file
entry_iter->get_pieces();
} catch(const std::ifstream::failure& e) {
// there is nothing we can do. Untrack this entry
// std::cerr << e.what();
entry_iter = m_tracked_entries.erase(entry_iter);
continue;
}
@ -157,7 +160,7 @@ uintmax_t Broom::untrack_unique_contents() {
};
uintmax_t untracked = 0;
m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](Entry entry) -> bool {
m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](entry::Entry entry) -> bool {
auto iter = contents_map.find(entry.pieces);
if (iter->second == 1) {
// unique
@ -171,17 +174,16 @@ uintmax_t Broom::untrack_unique_contents() {
return untracked;
};
// find all duplicates among tracked entries, stop tracking uniques
void Broom::find_duplicates() {
// finds all duplicates among tracked entries and marks them with appropriate group.
// Returns amount of duplicate files
uintmax_t Broom::m_find_duplicates() {
auto t0 = std::chrono::high_resolution_clock::now();
// print how many files are being tracked
uintmax_t global_untracked = m_tracked_entries.size();
std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl;
// untrack by size
uintmax_t untracked_by_size = untrack_unique_sizes();
uintmax_t untracked_by_size = m_untrack_unique_sizes();
global_untracked += untracked_by_size;
std::cout << "[INFO] Untracked " << untracked_by_size << " unique sizes" << std::endl;
@ -195,7 +197,7 @@ void Broom::find_duplicates() {
}
// untrack by contents
uintmax_t untracked_by_contents = untrack_unique_contents();
uintmax_t untracked_by_contents = m_untrack_unique_contents();
global_untracked += untracked_by_contents;
auto contents_untrack_time = std::chrono::high_resolution_clock::now();
@ -209,37 +211,72 @@ void Broom::find_duplicates() {
std::cout << "[INFO] Untracked " << untracked_by_contents << " unique contents" << std::endl;
std::cout << "[INFO] Duplicates: " << m_tracked_entries.size() << std::endl;
std::cout << "[INFO] Found " << m_tracked_entries.size() << " possible duplicate files" << std::endl;
create_duplicates_list();
// mark duplicate entries
std::cout << "[INFO] Created a duplicates list" << std::endl;
for (entry::Entry& duplicate_entry : m_tracked_entries) {
duplicate_entry.group = group::DUPLICATE;
}
return m_tracked_entries.size();
};
// saves current list of duplicate file paths into a file in dir
void Broom::create_duplicates_list(const std::filesystem::path dir, const std::string filename) {
// writes a list of duplicate and empty files into a file in dir
void Broom::create_scan_results_list(const std::filesystem::path dir, const std::string filename) {
if (!std::filesystem::exists(dir)) {
// create it then
bool created = std::filesystem::create_directories(dir);
if (!created) {
throw "Could not create a directory";
throw "Could not create a directory to save scan results in";
}
}
// create output file there
std::fstream outfile(dir / filename, std::ios::out);
if (!outfile.is_open()) {
throw "Could not create an output file";
throw "Could not create a scan results file";
}
for (const Entry duplicate_entry : m_tracked_entries) {
// log every duplicate entry
outfile << duplicate_entry.path << std::endl;
for (const entry::Entry entry : m_tracked_entries) {
// log every entry and its group
if (entry.group == group::EMPTY) {
outfile << entry.path << " --- is an empty file" << std::endl;
} else if (entry.group == group::DUPLICATE) {
outfile << entry.path << " --- is a duplicate of another file" << std::endl;
}
}
outfile.close();
std::cout << "[INFO] Created scan results file" << std::endl;
};
// finds empty files among tracked entries and gives them appropriate group
// Returns amount of found empty files
uintmax_t Broom::m_find_empty_files() {
uintmax_t found_empty_files = 0;
for (entry::Entry& entry : m_tracked_entries) {
if (entry.filesize == 0) {
entry.group = group::EMPTY;
found_empty_files++;
}
}
std::cout << "[INFO] Found " << found_empty_files << " empty files" << std::endl;
return found_empty_files;
};
// scans tracked entries for empty files and duplicates.
// Empty files are marked first, then the duplicate search runs; the counts
// returned by both helpers are discarded here.
// NOTE(review): m_find_duplicates() assigns group::DUPLICATE to every entry that is
// still tracked when it finishes, so an EMPTY mark set by m_find_empty_files() may be
// overwritten for empty files that survive untracking -- confirm this is intended.
void Broom::scan() {
m_find_empty_files();
m_find_duplicates();
};
// remove ALL duplicate files
void Broom::sweep_all() {
void Broom::sweep() {
};
}

44
src/broom.hpp

@ -23,6 +23,7 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <cstdint>
#include <vector>
namespace broom {
// Broom`s settings
struct Options {
bool sweeping;
@ -38,12 +39,24 @@ protected:
// TODO(think about how sweeping should work)
bool m_sweeping;
// how many files have been (would be ?) "swept"
uintmax_t m_sweeped_files;
// how many bytes were (would be ?) freed
uintmax_t m_sweeped_size;
// paths to tracked files
std::vector<Entry> m_tracked_entries;
std::vector<entry::Entry> m_tracked_entries;
// finds empty files among tracked entries.
// Returns amount of found empty files
uintmax_t m_find_empty_files();
// removes entries with unique file sizes. Returns amount of files
// that are no longer being tracked
uintmax_t m_untrack_unique_sizes();
// removes entries whose content-pieces are unique (occur only once). Returns amount of
// files that are no longer being tracked
uintmax_t m_untrack_unique_contents();
// finds all duplicates among tracked entries and marks them with appropriate group
// Returns amount of duplicate files
uintmax_t m_find_duplicates();
public:
Broom(Options options);
@ -53,22 +66,17 @@ public:
// error in case path does not exist
void track(const std::filesystem::path path);
// find all duplicates in the directory
void find_duplicates();
// writes a list of duplicate and empty files into a file in dir
void create_scan_results_list(const std::filesystem::path dir = ".", const std::string filename = "scan_results.txt");
// removes entries with unique file sizes. Returns amount of files
// that are no longer being tracked
uintmax_t untrack_unique_sizes();
// TODO
void sweep();
// removes entries with the same content-pieces. Returns amount of
// files that are no longer being tracked
uintmax_t untrack_unique_contents();
// scans tracked entries for duplicates and empty files
void scan();
};
// saves current list of duplicate file paths into a file
void create_duplicates_list(const std::filesystem::path dir = ".", const std::string filename = "duplicate_files_list.txt");
}
// TODO
void sweep_all();
};
#endif

39
src/entry.cpp

@ -24,21 +24,20 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <cstring>
// A wrapper for every file with all necessary information
namespace entry {
// A wrapper for every file in filesystem with all necessary information
Entry::Entry(const std::filesystem::path entry_path) {
// path
path = entry_path;
};
Entry::~Entry() {};
// sets this entry`s filesize
void Entry::get_size() {
// filesize
filesize = std::filesystem::file_size(path);
};
Entry::~Entry() {};
// reads 2 pieces from the middle and the end of a file, converts them into
// reads 3 pieces from the beginning, middle and the end of a file, converts them into
// a convenient hex-encoded string
void Entry::get_pieces() {
std::fstream entry_file;
@ -48,28 +47,35 @@ void Entry::get_pieces() {
throw std::ifstream::failure("Could not open \"" + path.string() + "\"; reason: " + std::string(std::strerror(errno)) + "\n");
}
char pieces_buffer[PIECE_SIZE * 2];
if (filesize <= PIECE_SIZE * 2) {
// can`t take whole 2 pieces !
char pieces_buffer[PIECE_SIZE * PIECES_AMOUNT];
if (filesize <= PIECE_SIZE * PIECES_AMOUNT) {
// can`t take whole 3 pieces !
// read the whole file then
entry_file.read(pieces_buffer, filesize);
} else {
// read chunk from the beginning
char begin_buf[PIECE_SIZE];
entry_file.read(begin_buf, PIECE_SIZE);
for (uint8_t i = 0; i < PIECE_SIZE; i++) {
pieces_buffer[i] = begin_buf[i];
}
uintmax_t middle_of_the_file = (double) filesize / 2.0 - PIECE_SIZE;
entry_file.seekg(middle_of_the_file, std::ios::beg);
// read PIECE_SIZE bytes from the middle of the file
char middle_buf[PIECE_SIZE];
entry_file.read(middle_buf, PIECE_SIZE);
for (uint8_t i = 0; i < PIECE_SIZE; i++) {
pieces_buffer[i] = middle_buf[i];
for (uint8_t i = PIECE_SIZE; i < PIECE_SIZE * 2; i++) {
pieces_buffer[i] = middle_buf[i - PIECE_SIZE];
};
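// jump to the last PIECE_SIZE bytes of the file and read them as well
// NOTE(review): the seek below uses a positive offset from std::ios::end; a negative
// offset looks necessary to land on the last PIECE_SIZE bytes -- confirm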
entry_file.seekg(PIECE_SIZE, std::ios::end);
char end_buf[PIECE_SIZE];
entry_file.read(end_buf, PIECE_SIZE);
for (uint8_t i = PIECE_SIZE; i < PIECE_SIZE * 2; i++) {
pieces_buffer[i] = end_buf[i - PIECE_SIZE];
for (uint8_t i = PIECE_SIZE * 2; i < PIECE_SIZE * 3; i++) {
pieces_buffer[i] = end_buf[i - PIECE_SIZE * 2];
};
};
entry_file.close();
@ -81,11 +87,12 @@ void Entry::get_pieces() {
};
pieces = pieces_hex.str();
std::cout << pieces << std::endl;
};
// Remove entry from the disk.
// Delegates to std::filesystem::remove(path). Its boolean result (whether a
// file was actually deleted) is discarded, and a std::filesystem::filesystem_error
// raised by it is not caught here, so it propagates to the caller.
void Entry::remove() {
std::filesystem::remove(path);
};
}

26
src/entry.hpp

@ -25,23 +25,26 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include <sstream>
#include <iomanip>
// 2 pieces (middle and end of the file)
const uint8_t PIECE_SIZE = 16;
#include "group.hpp"
// A wrapper for every file with all necessary information
namespace entry {
// 3 pieces (beginning, middle and end of the file)
const uint8_t PIECE_SIZE = 6;
const uint8_t PIECES_AMOUNT = 3;
// A wrapper for every file in filesystem with all necessary information
class Entry {
public:
std::filesystem::path path;
uintmax_t filesize;
std::string pieces; // 2 hex-represented pieces of file
std::filesystem::path path; // set via constructor
uintmax_t filesize; // set via constructor
std::string pieces; // 3 hex-represented pieces of file; set only via a method call to not stress the disk
group::Group group; // set externally
Entry(const std::filesystem::path entry_path);
~Entry();
// sets this entry`s filesize
void get_size();
// reads 2 pieces from the middle and the end of a file, converts them into
// reads 3 pieces from the beginning, middle and the end of a file, converts them into
// a convenient hex-encoded string
void get_pieces();
@ -49,5 +52,8 @@ public:
void remove();
};
}
#endif

13
src/group.hpp

@ -0,0 +1,13 @@
#ifndef GROUP_HPP
#define GROUP_HPP
// Categories that a scan can assign to a tracked entry (stored in entry::Entry::group).
namespace group {
// Unscoped enum: enumerators are referenced as group::DUPLICATE / group::EMPTY.
enum Group {
DUPLICATE, // written to the scan results as "is a duplicate of another file"
EMPTY, // file size is 0; written to the scan results as "is an empty file"
};
}
#endif

23
src/main.cpp

@ -26,15 +26,16 @@ along with broom. If not, see <https://www.gnu.org/licenses/>.
#include "broom.hpp"
// Broom version number
#define VERSION "v0.1.0"
#define VERSION "v0.1.1"
void print_help() {
std::cout
<< "broom [FLAGS..] [COMMAND] [FILES|DIRECTORIES...]" << std::endl << std::endl
<< "FLAGS" << std::endl
<< "broom [FLAGS..] [COMMAND] [DIRECTORY]" << std::endl << std::endl
<< "[FLAGS]" << std::endl
<< "-v | --version -> print version information and exit" << std::endl
<< "-h | --help -> print this message and exit" << std::endl << std::endl
<< "COMMANDS" << std::endl
<< "[COMMANDS]" << std::endl
<< "sweep -> scan for duplicate files and delete (sweep) all of them but the last one" << std::endl
<< "scan -> scan for duplicate files and output information in a file" << std::endl
<< std::endl;
@ -43,7 +44,8 @@ void print_help() {
void print_version() {
std::cout
<< "broom " << VERSION << std::endl
<< "a command line utility to locate and manage duplicate files" << std::endl << std::endl
<< "incurable hoarder`s helpful friend" << std::endl << std::endl
<< "Copyright (C) 2021 Kasyanov Nikolay Alexeevich (Unbewohnte (me@unbewohnte.xyz))" << std::endl
<< "This program comes with ABSOLUTELY NO WARRANTY." << std::endl
<< "This is free software, and you are welcome to redistribute it" << std::endl
@ -52,7 +54,7 @@ void print_version() {
};
int main(int argc, char* argv[]) {
Options options;
broom::Options options;
std::filesystem::path tracked_path;
if (argc < 2) {
@ -62,7 +64,7 @@ int main(int argc, char* argv[]) {
// process command line arguments
for (unsigned int i = 1; i < argc; i++) {
// flags -> command -> directories&&files
// flags -> command -> directory
if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0) {
print_help();
@ -83,7 +85,7 @@ int main(int argc, char* argv[]) {
}
else {
// add path
tracked_path = argv[i];
tracked_path = std::filesystem::path(argv[i]);
};
};
@ -94,10 +96,11 @@ int main(int argc, char* argv[]) {
};
Broom broom(options);
broom::Broom broom(options);
try {
broom.track(tracked_path);
broom.find_duplicates();
broom.scan();
broom.create_scan_results_list();
} catch(const std::invalid_argument& e) {
std::cerr
<< "[ERROR] Invalid argument: " << std::endl

Loading…
Cancel
Save