|
|
@ -32,28 +32,8 @@ Broom::Broom(Options options) { |
|
|
|
m_sweeping = options.sweeping; |
|
|
|
m_sweeping = options.sweeping; |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
Broom::~Broom() {}; |
|
|
|
Broom::~Broom() { |
|
|
|
|
|
|
|
m_tracked_entries.clear(); |
|
|
|
// Print current statistics
|
|
|
|
|
|
|
|
void Broom::print_statistics() { |
|
|
|
|
|
|
|
std::cout |
|
|
|
|
|
|
|
<< "| sweeped " << m_sweeped_files << " files" << std::endl |
|
|
|
|
|
|
|
<< "| with a total size of " << m_sweeped_size << " bytes" << std::endl |
|
|
|
|
|
|
|
<< std::endl; |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// returns amount of regular files in path, searching recursively.
|
|
|
|
|
|
|
|
// Throws an invalid_argument error in case path does not exist
|
|
|
|
|
|
|
|
uintmax_t amount_of_files(const std::filesystem::path path) { |
|
|
|
|
|
|
|
if (!std::filesystem::exists(path)) { |
|
|
|
|
|
|
|
throw std::invalid_argument("\"" + path.string() + "\"" + " does not exist !"); |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!std::filesystem::is_directory(path)) { |
|
|
|
|
|
|
|
throw std::invalid_argument("\"" + path.string() + "\"" + " is not a directory"); |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return std::distance(std::filesystem::directory_iterator(path), std::filesystem::directory_iterator{}); |
|
|
|
|
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
// recursively track every file that lies in given path. Throws an invalid_argument
|
|
|
|
// recursively track every file that lies in given path. Throws an invalid_argument
|
|
|
@ -90,7 +70,7 @@ void Broom::track(const std::filesystem::path path) { |
|
|
|
auto tracking_time = std::chrono::high_resolution_clock::now(); |
|
|
|
auto tracking_time = std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
|
|
|
|
std::cout |
|
|
|
std::cout |
|
|
|
<< "Tracking took " |
|
|
|
<< "[BENCHMARK] Tracking took " |
|
|
|
<< std::chrono::duration_cast<std::chrono::milliseconds>(tracking_time - t0).count() |
|
|
|
<< std::chrono::duration_cast<std::chrono::milliseconds>(tracking_time - t0).count() |
|
|
|
<< " ms" << std::endl; |
|
|
|
<< " ms" << std::endl; |
|
|
|
}; |
|
|
|
}; |
|
|
@ -102,16 +82,16 @@ uintmax_t Broom::untrack_unique_sizes() { |
|
|
|
// key: size, value: amount of occurences
|
|
|
|
// key: size, value: amount of occurences
|
|
|
|
std::map<uintmax_t, uintmax_t> sizes_map; |
|
|
|
std::map<uintmax_t, uintmax_t> sizes_map; |
|
|
|
|
|
|
|
|
|
|
|
for (Entry& entry : m_tracked_entries) { |
|
|
|
for (auto entry_iter = m_tracked_entries.begin(); entry_iter != m_tracked_entries.end(); entry_iter++) { |
|
|
|
// check if size of this entry is already in the map
|
|
|
|
// check if size of this entry is already in the map
|
|
|
|
// if yes --> increment occurences counter
|
|
|
|
// if yes --> increment occurences counter
|
|
|
|
// if not --> add it to the map with a counter of 1
|
|
|
|
// if not --> add it to the map with a counter of 1
|
|
|
|
entry.get_size(); |
|
|
|
entry_iter->get_size(); |
|
|
|
|
|
|
|
|
|
|
|
auto iterator = sizes_map.find(entry.filesize); |
|
|
|
auto iterator = sizes_map.find(entry_iter->filesize); |
|
|
|
if (iterator == sizes_map.end()) { |
|
|
|
if (iterator == sizes_map.end()) { |
|
|
|
// there is no such size
|
|
|
|
// there is no such size
|
|
|
|
sizes_map.insert({entry.filesize, 1}); |
|
|
|
sizes_map.insert({entry_iter->filesize, 1}); |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
// there is such size
|
|
|
|
// there is such size
|
|
|
|
sizes_map[iterator->first]++; |
|
|
|
sizes_map[iterator->first]++; |
|
|
@ -130,85 +110,105 @@ uintmax_t Broom::untrack_unique_sizes() { |
|
|
|
// std::cout << "duplicate fsize: " << iter->first << " occurences: " << iter->second << std::endl;
|
|
|
|
// std::cout << "duplicate fsize: " << iter->first << " occurences: " << iter->second << std::endl;
|
|
|
|
|
|
|
|
|
|
|
|
return false; |
|
|
|
return false; |
|
|
|
})); |
|
|
|
}), m_tracked_entries.end()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// std::cout << "Size after untracking by size: " << m_tracked_entries.size() << std::endl;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return untracked; |
|
|
|
return untracked; |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
// removes entries with unique first and last 20 bytes. Returns amount of
|
|
|
|
// removes entries with the same content-pieces. Returns amount of
|
|
|
|
// files that are no longer being tracked
|
|
|
|
// files that are no longer being tracked
|
|
|
|
// uintmax_t Broom::untrack_unique_contents() {
|
|
|
|
uintmax_t Broom::untrack_unique_contents() { |
|
|
|
// // contents, occurences
|
|
|
|
// contents, occurences
|
|
|
|
// std::map<char[CHECKSUM_SIZE], uintmax_t> contents_map;
|
|
|
|
std::map<std::string, uintmax_t> contents_map; |
|
|
|
// std::map<char[CHECKSUM_SIZE], uintmax_t>::iterator iterator;
|
|
|
|
std::map<std::string, uintmax_t>::iterator map_iter; |
|
|
|
//
|
|
|
|
|
|
|
|
// for (Entry& entry : m_tracked_entries) {
|
|
|
|
for (auto entry_iter = m_tracked_entries.begin(); entry_iter != m_tracked_entries.end();) { |
|
|
|
// // the same logic:
|
|
|
|
// the same logic:
|
|
|
|
// // check if contents of this entry is already in the map
|
|
|
|
// check if contents of this entry is already in the map
|
|
|
|
// // if yes --> increment occurences counter
|
|
|
|
// if yes --> increment occurences counter
|
|
|
|
// // if not --> add it to the map with a counter of 1
|
|
|
|
// if not --> add it to the map with a counter of 1
|
|
|
|
//
|
|
|
|
|
|
|
|
// iterator = contents_map.find(entry.checksum);
|
|
|
|
try{ |
|
|
|
//
|
|
|
|
// can get "permission denied" when opening file
|
|
|
|
// if (iterator == contents_map.end()) {
|
|
|
|
entry_iter->get_pieces(); |
|
|
|
// // add it to the map
|
|
|
|
} catch(const std::ifstream::failure& e) { |
|
|
|
// contents_map.insert(std::pair<char[CHECKSUM_SIZE], uintmax_t>(entry.checksum, 1));
|
|
|
|
// there is nothing we can do. Untrack this entry
|
|
|
|
// } else {
|
|
|
|
// std::cerr << e.what();
|
|
|
|
// // increment occurences counter
|
|
|
|
entry_iter = m_tracked_entries.erase(entry_iter); |
|
|
|
// uintmax_t occurences = contents_map[iterator->first];
|
|
|
|
continue; |
|
|
|
// contents_map[iterator->first] = occurences++;
|
|
|
|
} |
|
|
|
// };
|
|
|
|
|
|
|
|
// };
|
|
|
|
map_iter = contents_map.find(entry_iter->pieces); |
|
|
|
//
|
|
|
|
if (map_iter == contents_map.end()) { |
|
|
|
// uintmax_t untracked = 0;
|
|
|
|
// add it to the map
|
|
|
|
// for (std::pair<const char[CHECKSUM_SIZE], uintmax_t> contents_entry : contents_map) {
|
|
|
|
contents_map.insert({entry_iter->pieces, 1}); |
|
|
|
// if (contents_entry.second > 1) {
|
|
|
|
// std::cout << "First time seeing this piece: " << entry_iter->pieces << std::endl;
|
|
|
|
// // not a unique size. Keep such entries
|
|
|
|
} else { |
|
|
|
// } else {
|
|
|
|
// increment occurences counter
|
|
|
|
// // a unique one. Untrack such an entry
|
|
|
|
contents_map[map_iter->first]++; |
|
|
|
// std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [contents_entry](Entry e) -> bool {
|
|
|
|
}; |
|
|
|
// return (e.compare_checksums(contents_entry.first));
|
|
|
|
|
|
|
|
// });
|
|
|
|
entry_iter++; |
|
|
|
// untracked++;
|
|
|
|
}; |
|
|
|
// };
|
|
|
|
|
|
|
|
// };
|
|
|
|
uintmax_t untracked = 0; |
|
|
|
//
|
|
|
|
m_tracked_entries.erase(std::remove_if(m_tracked_entries.begin(), m_tracked_entries.end(), [&untracked, contents_map](Entry entry) -> bool { |
|
|
|
// return untracked;
|
|
|
|
auto iter = contents_map.find(entry.pieces); |
|
|
|
// };
|
|
|
|
if (iter->second == 1) { |
|
|
|
|
|
|
|
// unique
|
|
|
|
|
|
|
|
untracked++; |
|
|
|
|
|
|
|
return true; |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
return false; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}), m_tracked_entries.end()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return untracked; |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// find all duplicates among tracked entries, stop tracking uniques
|
|
|
|
// find all duplicates among tracked entries, stop tracking uniques
|
|
|
|
void Broom::find_duplicates() { |
|
|
|
void Broom::find_duplicates() { |
|
|
|
if (m_benchmarking) { |
|
|
|
auto t0 = std::chrono::high_resolution_clock::now(); |
|
|
|
auto t0 = std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
|
|
|
|
// print how many files are being tracked
|
|
|
|
|
|
|
|
uintmax_t global_untracked = m_tracked_entries.size(); |
|
|
|
|
|
|
|
std::cout << "[INFO] Tracking " << m_tracked_entries.size() << " files" << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
untrack_unique_sizes(); |
|
|
|
// untrack by size
|
|
|
|
|
|
|
|
uintmax_t untracked_by_size = untrack_unique_sizes(); |
|
|
|
|
|
|
|
global_untracked += untracked_by_size; |
|
|
|
|
|
|
|
std::cout << "[INFO] Untracked " << untracked_by_size << " unique sizes" << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
auto sizes_untrack_time = std::chrono::high_resolution_clock::now(); |
|
|
|
auto sizes_untrack_time = std::chrono::high_resolution_clock::now(); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (m_benchmarking) { |
|
|
|
std::cout |
|
|
|
std::cout |
|
|
|
<< "Untracking by size took " |
|
|
|
<< "[BENCHMARK] Untracking by size took " |
|
|
|
<< std::chrono::duration_cast<std::chrono::milliseconds>(sizes_untrack_time - t0).count() |
|
|
|
<< std::chrono::duration_cast<std::chrono::milliseconds>(sizes_untrack_time - t0).count() |
|
|
|
<< " ms" << std::endl; |
|
|
|
<< " ms" << std::endl; |
|
|
|
} else { |
|
|
|
}; |
|
|
|
size_t startsize = m_tracked_entries.size(); |
|
|
|
|
|
|
|
std::cout << "Tracking " << startsize << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uintmax_t global_untracked = 0; |
|
|
|
// untrack by contents
|
|
|
|
|
|
|
|
uintmax_t untracked_by_contents = untrack_unique_contents(); |
|
|
|
|
|
|
|
global_untracked += untracked_by_contents; |
|
|
|
|
|
|
|
|
|
|
|
// uintmax_t untracked_by_contents = untrack_unique_contents();
|
|
|
|
auto contents_untrack_time = std::chrono::high_resolution_clock::now(); |
|
|
|
// global_untracked += untracked_by_contents;
|
|
|
|
|
|
|
|
// std::cout << "Untracked " << untracked_by_contents << " unique contents" << std::endl;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (m_benchmarking) { |
|
|
|
|
|
|
|
std::cout |
|
|
|
|
|
|
|
<< "[BENCHMARK] Untracking by contents took " |
|
|
|
|
|
|
|
<< std::chrono::duration_cast<std::chrono::milliseconds>(contents_untrack_time - sizes_untrack_time).count() |
|
|
|
|
|
|
|
<< " ms" << std::endl; |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
uintmax_t untracked_by_size = untrack_unique_sizes(); |
|
|
|
std::cout << "[INFO] Untracked " << untracked_by_contents << " unique contents" << std::endl; |
|
|
|
global_untracked += untracked_by_size; |
|
|
|
|
|
|
|
std::cout << "Untracked " << untracked_by_size << " unique sizes" << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
std::cout << "[INFO] Duplicates: " << m_tracked_entries.size() << std::endl; |
|
|
|
|
|
|
|
|
|
|
|
std::cout << "Duplicates: " << startsize - global_untracked << std::endl; |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
// remove ALL duplicate files
|
|
|
|
// remove ALL duplicate files
|
|
|
|