#define _XOPEN_SOURCE 500
#define _GNU_SOURCE
#define _FILE_OFFSET_BITS 64
#define _LARGEFILE_SOURCE
#define _LARGEFILE64_SOURCE
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <ftw.h>
#include "store.h"
/*
 * Hash-bucket entry used to avoid visiting the same file (hard link)
 * twice: files sharing a (dev, ino) pair are registered once, and later
 * links reuse the first file's hashes (see dev_ino_seen()).
 */
struct dev_ino
{
	struct file_node *obj; /* points to first file found for this (dev, ino) */
	struct dev_ino *next;  /* next entry in the same hash bucket */
};
#define DEV_INO_HASH_SIZE (16 * 4096)
#define HASH_SIZE 16384
#define MAX_INPUT_DIRS 32
#define MAX_DEPTH 1000
#define COMPUTE_FIRST 0
#define COMPUTE_FULL 1
/* Exported variables */
unsigned long long no_of_files, no_of_dirs;
unsigned long long sha1_first_computed;
unsigned long long sha1_full_computed;
unsigned long long sha1_bytes;
unsigned long long dup_no_of_files, dup_no_of_dirs;
unsigned long long can_save;
unsigned long long no_of_same_inode_file, no_of_ino_cache_uses;
unsigned long long mem_allocated, mem_calls;
/* Internal variables */
static struct file_node *file_info[HASH_SIZE];
static struct dir_node *dir_info[MAX_INPUT_DIRS];
static unsigned int dir_info_count;
static struct dir_node *dir_current[MAX_DEPTH];
static struct dir_node **dir_chain;
static unsigned long long dir_chain_len;
static unsigned char sha1_zero[SHA_DIGEST_LENGTH];
static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE];
static unsigned int debug = 0;
static FILE *out;
/* ############### Misc functions ############### */
/*
 * Sets the global debug verbosity level (0 = quiet).
 */
void set_debug(const unsigned int level)
{
	debug = level;
}
/*
 * Sets the stream that dump_duplicates() writes results to.
 */
void set_out(FILE *f)
{
	out = f;
}
/* ############### Memory functions ############### */
/*
 * malloc() wrapper used for all internal allocations.
 * Aborts on failure (and on zero-size requests) instead of returning
 * NULL, and updates the totals reported by dump_stats().
 */
static void *xmalloc(size_t size)
{
	void *p;

	if (size == 0)
		abort();

	p = malloc(size);
	if (!p) {
		/* fixed: error message was missing its trailing newline */
		fprintf(stderr, "Cannot alloc %zu bytes!\n", size);
		abort();
	}

	mem_allocated += size;
	mem_calls++;
	return p;
}
/* ############### dev_ino helpers ############### */
/*
 * Looks up @a's (dev, ino) pair in the hard-link cache.
 * Returns the first file found with the same dev/ino (an earlier hard
 * link) or NULL; in the NULL case @a is added to the cache as the first
 * link for that inode.
 */
static struct file_node *dev_ino_seen(struct file_node *a)
{
	struct dev_ino *q;
	unsigned int hash;

	hash = (a->dev ^ a->ino) % DEV_INO_HASH_SIZE;
	q = dev_ino_hash[hash];
	while (q) {
		if ((q->obj->dev == a->dev) && (q->obj->ino == a->ino)) {
			/* NOTE(review): q->obj was already dereferenced in the
			 * condition above, so the q->obj test here is always
			 * true — it only guards the debug print. */
			if (q->obj && debug) {
				fprintf(stderr, "DEBUG: INO HIT for [%s] [%s]\n",
					a->name, q->obj->name);
			}
			no_of_same_inode_file++;
			return q->obj;
		}
		q = q->next;
	}

	if (debug)
		fprintf(stderr, "DEBUG: add file [%s] to INO HIT dev=%lu ino=%lu\n",
			a->name, a->dev, a->ino);

	/* Not seen before: insert at the head of the bucket. */
	// TODO: sort to speed up the walk
	q = (struct dev_ino *) xmalloc(sizeof(struct dev_ino));
	q->obj = a;
	q->next = dev_ino_hash[hash];
	dev_ino_hash[hash] = q;
	return NULL;
}
/*
 * Frees the whole hard-link cache built by dev_ino_seen().
 * The cached file_node objects themselves are not owned here and are
 * left untouched.
 */
void dev_ino_seen_clean(void)
{
	unsigned int i;
	struct dev_ino *q, *next;

	for (i = 0; i < DEV_INO_HASH_SIZE; i++) {
		q = dev_ino_hash[i];
		while (q) {
			next = q->next;
			free(q);
			q = next;
		}
		/* fixed: reset the bucket so the cache can be safely reused
		 * (previously the heads were left dangling after free) */
		dev_ino_hash[i] = NULL;
	}
}
/* ############### SHA-1 functions ############### */
/*
 * Renders the first @max0 bytes of digest @b as lowercase hex into @out.
 * @max0 == 0 means "dump the whole digest" (SHA_DIGEST_LENGTH bytes).
 * @out must have room for 2 * max + 1 chars (NUL terminator included).
 */
static void sha1_dump(char *out, const unsigned char *b, const unsigned int max0)
{
	const unsigned int max = (max0 == 0) ? SHA_DIGEST_LENGTH : max0;
	unsigned int i;

	for (i = 0; i < max; i++)
		snprintf(&out[2 * i], 3, "%02x", b[i]);
}
/*
 * Computes the SHA-1 of the first @len bytes of @file into @out
 * (SHA_DIGEST_LENGTH bytes). A file shorter than @len is hashed whole.
 * Returns 0 on success, -1 on open/read error (@out is then invalid).
 */
static int sha1(const char *file, const size_t len, unsigned char *out)
{
	int fd;
	unsigned char buf[65536];
	SHA_CTX c;
	size_t bytes;

	fd = open(file, O_RDONLY);
	if (fd == -1)
		return -1;

	SHA1_Init(&c);
	bytes = 0;
	while (bytes < len) {
		ssize_t n;

		/* NOTE(review): the read is not clamped to len - bytes, so if
		 * the file is longer than @len and @len is not a multiple of
		 * the buffer size, a few extra bytes may be hashed — confirm
		 * callers only pass 65536 or the exact file size. */
		n = read(fd, buf, sizeof(buf));
		if (n == -1) {
			close(fd);
			return -1;
		}
		if (n == 0)
			break; /* EOF before @len bytes: hash what we got */
		SHA1_Update(&c, buf, (size_t) n);
		bytes += (size_t) n;
		sha1_bytes += (size_t) n;
	}
	close(fd);
	SHA1_Final(out, &c);

	/* Stats: a <=64K request counts as a "first block" hash */
	if (len <= 65536)
		sha1_first_computed++;
	else
		sha1_full_computed++;

	return 0;
}
/*
 * Ensures a->sha1[what] is filled in, computing it at most once.
 * @what - COMPUTE_FIRST (first 64K) or COMPUTE_FULL (whole file)
 * An all-zero digest marks "not computed yet". If @a is a hard link to
 * an already-seen inode, the hash is computed on (or copied from) the
 * first file found for that inode.
 * Returns 0 on success, -1 on error.
 */
__hot static int sha1_compute(struct file_node *a,
	const unsigned char what)
{
	struct file_node *z;
	unsigned long long size;

	/* Already computed? */
	if (memcmp(a->sha1[what], sha1_zero, SHA_DIGEST_LENGTH) != 0)
		return 0;

	if (what == COMPUTE_FIRST)
		size = 65536;
	else
		size = a->size;

	/* Hard link to a file we already saw? Share its hash. */
	z = a->ino_first;
	if (z) {
		if (memcmp(z->sha1[what], sha1_zero, SHA_DIGEST_LENGTH) == 0) {
			/* fixed: the return value was ignored here; a failed
			 * hash left an all-zero digest that would compare
			 * equal to other files' zero digests */
			if (sha1(z->name, size, z->sha1[what]) != 0)
				return -1;
		} else {
			no_of_ino_cache_uses++;
		}
		memcpy(a->sha1[what], z->sha1[what], SHA_DIGEST_LENGTH);
		return 0;
	}

	return sha1(a->name, size, a->sha1[what]);
}
/* ############### Main functions ############### */
/*
 * Prints all run statistics (counters updated throughout the scan) to
 * stderr.
 */
__cold void dump_stats(void)
{
	fprintf(stderr, "Number of visited file(s): %llu.\n", no_of_files);
	fprintf(stderr, "Number of visited dir(s): %llu.\n", no_of_dirs);
	fprintf(stderr, "Number of bytes processed by SHA-1: %llu.\n", sha1_bytes);
	fprintf(stderr, "Number of [<]64K SHA-1 computed: %llu.\n", sha1_first_computed);
	fprintf(stderr, "Number of full SHA-1 computed: %llu.\n", sha1_full_computed);
	fprintf(stderr, "Bytes that could be saved: %llu.\n", can_save);
	fprintf(stderr, "Number of duplicated files: %llu.\n", dup_no_of_files);
	fprintf(stderr, "Number of duplicated dirs: %llu.\n", dup_no_of_dirs);
	fprintf(stderr, "Number of same dev/inode (file): %llu.\n", no_of_same_inode_file);
	fprintf(stderr, "Number of uses of inode cache: %llu.\n", no_of_ino_cache_uses);
	fprintf(stderr, "Memory allocated: %llu bytes in %llu call(s).\n", mem_allocated, mem_calls);
}
/*
 * Allocates a zeroed file_node.
 * Never returns NULL in practice: xmalloc() aborts on failure. The
 * NULL-returning signature is kept for the callers' error paths.
 */
static struct file_node *alloc_file_node(void)
{
	struct file_node *q;

	/* xmalloc() aborts on failure, so the old NULL check was dead code */
	q = xmalloc(sizeof(struct file_node));
	memset(q, 0, sizeof(struct file_node));
	return q;
}
/*
 * Registers a regular file in the in-memory structures (called during
 * the tree walk). @level is the depth below the walk root; the parent
 * dir must already have been added (dir_current[level - 1]).
 * The node is linked into its parent's file list and inserted, ordered,
 * into the size-keyed file_info hash chain used by file_find_dups().
 * Returns 0 on success, -1 on error.
 */
int file_add(const char *file, const struct stat *s, const int level)
{
	struct dir_node *parent;
	struct file_node *q, *p, *prev;
	unsigned int hash;
	unsigned long long size;

	size = (unsigned long long) s->st_size;

	q = alloc_file_node();
	if (q == NULL)
		return -1;

	q->size = size;
	q->name = strdup(file); /* NOTE(review): strdup result not checked */
	/* All-zero digests mean "not computed yet" (see sha1_compute()) */
	memset(&q->sha1[0], 0, SHA_DIGEST_LENGTH);
	memset(&q->sha1[1], 0, SHA_DIGEST_LENGTH);
	q->dev = s->st_dev;
	q->ino = s->st_ino;
	q->level = level;
	q->mtime = s->st_mtime;

	/* Remember the first hard link for this inode (or NULL if new) */
	// Test if is in cache
	q->ino_first = dev_ino_seen(q);

	/* link with dir */
	parent = dir_current[level - 1];
	q->next = parent->files;
	parent->files = q;
	q->parent = parent;
	parent->no_of_files++;
	no_of_files++;

	/* Insert into the per-size hash chain, keeping it sorted so that
	 * equal-size files end up contiguous */
	hash = size % HASH_SIZE;
	if (file_info[hash] == NULL) {
		file_info[hash] = q;
	} else {
		/* We order by size, level, mtime, name */
		/* Better to use qsort. TODO */
		p = file_info[hash];
		prev = NULL;
		while (p) {
			if (q->size < p->size)
				break;
			if (q->size == p->size) {
				if (q->level < p->level)
					break;
				if (q->mtime < p->mtime)
					break;
				if (strcmp(q->name, p->name) < 0)
					break;
			}
			prev = p;
			p = p->hash_next;
		}
		/* We found an element bigger than 'size' */
		if (prev == NULL) {
			/* we must insert as the first */
			q->hash_next = file_info[hash];
			file_info[hash] = q;
		} else {
			q->hash_next = prev->hash_next;
			prev->hash_next = q;
		}
	}

	return 0;
}
/*
 * Allocates a zeroed dir_node.
 * Never returns NULL in practice: xmalloc() aborts on failure. The
 * NULL-returning signature is kept for the callers' error paths.
 */
static struct dir_node *alloc_dir_node(void)
{
	struct dir_node *q;

	/* xmalloc() aborts on failure, so the old NULL check was dead code */
	q = xmalloc(sizeof(struct dir_node));
	memset(q, 0, sizeof(struct dir_node));
	return q;
}
/*
 * Add a dir to the structure (called during the tree walk).
 * @level 0 dirs are the command-line roots (stored in dir_info[]);
 * deeper dirs are linked under dir_current[level - 1].
 * Also records this dir as the current one for @level, so that files
 * and subdirs seen next can find their parent.
 * Returns 0 on success, -1 on error.
 */
int dir_add(const char *dir, const struct stat *s,
	const int level)
{
	struct dir_node *q, *parent;

	if (level >= MAX_DEPTH) {
		fprintf(stderr, "Directory structure is too deep (>=%d)!\n",
			MAX_DEPTH);
		return -1;
	}

	q = alloc_dir_node();
	if (q == NULL)
		return -1;

	q->name = strdup(dir);
	q->dev = s->st_dev;
	q->ino = s->st_ino;
	q->level = level;
	no_of_dirs++;

	if (level == 0) {
		if (dir_info_count == MAX_INPUT_DIRS) {
			fprintf(stderr, "Too many dirs as input (>=%u)!\n",
				MAX_INPUT_DIRS);
			/* TODO: here is too late to check! */
			/* fixed: the strdup()ed name leaked on this path */
			free(q->name);
			free(q);
			return -1;
		}
		/* We add a root dir */
		dir_info[dir_info_count] = q;
		dir_info_count++;
	} else {
		parent = dir_current[level - 1];
		q->parent = parent;
		q->next_sibling = parent->subdirs;
		parent->subdirs = q;
	}

	/* Set current folder per level. */
	/* Because we are sorting, the last added dir is hard to find. */
	dir_current[level] = q;

	return 0;
}
/*
 * Debug-dumps one file node to stderr, indented by 2 * @level spaces.
 */
__cold static void file_dump_node(const struct file_node *q,
	const unsigned int level)
{
	char sha1[2][SHA_DIGEST_LENGTH * 2 + 1];
	char prefix[128];
	unsigned int indent;

	/* fixed: clamp the indent — @level can approach MAX_DEPTH (1000),
	 * which would overflow the 128-byte prefix buffer */
	indent = level * 2;
	if (indent > sizeof(prefix) - 1)
		indent = sizeof(prefix) - 1;
	if (indent)
		memset(prefix, ' ', indent);
	prefix[indent] = '\0';

	sha1_dump(sha1[0], q->sha1[0], 8);
	sha1_dump(sha1[1], q->sha1[1], 8);
	fprintf(stderr, "%sF '%s' node=%p parent=%p next=%p hash_next=%p size=%llu"
		" dev=%lu inode=%llu no_dup_possible=%u do_not_dump=%u"
		" duplicates=%p left=%hhu sha1=%s/%s\n",
		prefix, q->name, q, q->parent, q->next, q->hash_next, q->size,
		(unsigned long) q->dev, (unsigned long long) q->ino,
		q->no_dup_possible, q->do_not_dump,
		q->duplicates, q->left, sha1[0], sha1[1]);
}
__cold void dump_files(void)
{
struct file_node *q;
unsigned int hash;
fprintf(stderr, "Dumping internal data - START...\n");
for (hash = 0; hash < HASH_SIZE; hash++) {
if (file_info[hash] == NULL)
continue;
fprintf(stderr, "info[%05u]:\n", hash);
q = file_info[hash];
while (q) {
file_dump_node(q, 0);
q = q->hash_next;
}
}
fprintf(stderr, "Dumping internal data - STOP...\n");
}
/*
 * Debug-dumps one dir node (then its subdirs and files, recursively)
 * to stderr, indented by 2 * (@level + 1) spaces.
 */
__cold static void dir_dump_node(const struct dir_node *d,
	const unsigned int level)
{
	char prefix[128];
	struct dir_node *subdir;
	struct file_node *file;
	char dump[SHA_DIGEST_LENGTH * 2 + 1];
	char fnh[SHA_DIGEST_LENGTH * 2 + 1];
	unsigned int indent;

	/* fixed: clamp the indent — the recursion below grows @level by 2
	 * per depth, so deep trees overflowed the 128-byte prefix buffer */
	indent = (level + 1) * 2;
	if (indent > sizeof(prefix) - 1)
		indent = sizeof(prefix) - 1;
	memset(prefix, ' ', indent);
	prefix[indent] = '\0';

	sha1_dump(dump, d->sha1, 8);
	sha1_dump(fnh, d->file_names_sha1, 8);
	fprintf(stderr, "%sD '%s' d=%p subdirs=%p next_sibling=%p"
		" files=%p parent=%p no_dup_possible=%u do_not_dump=%u"
		" level=%d hash_next=%p left=%hhu sha1=%s file_names_sha1=%s\n",
		prefix, d->name, d, d->subdirs, d->next_sibling,
		d->files, d->parent, d->no_dup_possible, d->do_not_dump,
		d->level, d->hash_next, d->left, dump, fnh);

	subdir = d->subdirs;
	while (subdir) {
		dir_dump_node(subdir, level + 1 + 1);
		subdir = subdir->next_sibling;
	}

	file = d->files;
	while (file) {
		file_dump_node(file, level + 1 + 1);
		file = file->next;
	}
}
/*
 * Debug-dumps every root dir (and everything below it) to stderr.
 */
__cold void dump_dirs(void)
{
	unsigned int idx = 0;

	while (idx < dir_info_count) {
		fprintf(stderr, "dump_dirs[%u]...\n", idx);
		dir_dump_node(dir_info[idx], 0);
		idx++;
	}
}
/*
 * Compares two file structures, generating hashes as needed.
 * First the 64K-prefix hashes are compared; only on a prefix match are
 * the full-file hashes computed and compared.
 * Returns -1 on error, 0=no match, 1 on match
 */
static int compare_files(struct file_node *a, struct file_node *b)
{
	int ra, rb;

	/* Stage 1: hashes of the first 64K */
	ra = sha1_compute(a, COMPUTE_FIRST);
	rb = sha1_compute(b, COMPUTE_FIRST);
	if (ra != 0 || rb != 0) {
		fprintf(stderr, "Cannot compute sha1_first!\n");
		return -1;
	}
	if (memcmp(a->sha1[0], b->sha1[0], SHA_DIGEST_LENGTH) != 0)
		return 0;

	/* Stage 2: prefixes match, compare full-file hashes */
	ra = sha1_compute(a, COMPUTE_FULL);
	rb = sha1_compute(b, COMPUTE_FULL);
	if (ra != 0 || rb != 0) {
		fprintf(stderr, "Cannot compute sha1_full!\n");
		return -1;
	}

	return memcmp(a->sha1[1], b->sha1[1], SHA_DIGEST_LENGTH) == 0 ? 1 : 0;
}
/*
 * Marks a dir (and all its ancestors) as impossible to have duplicates.
 * Stops at the first dir already marked, since its ancestors must have
 * been marked at the same time.
 */
static void dir_mark_up_no_dup_possible(struct dir_node *d)
{
	while (d != NULL && d->no_dup_possible != 1) {
		if (debug)
			fprintf(stderr, "DEBUG: recursively up do dir_mark_up_no_dup_possible(%s)\n", d->name);
		d->no_dup_possible = 1;
		d = d->parent;
	}
}
/*
 * When we list a folder on the left side, we must mark the whole
 * hierarchy under it as 'do_not_dump'. Else, we would dump its files
 * again and we do not want that.
 */
static void dir_mark_down_do_not_dump(struct dir_node *d)
{
	struct dir_node *sub;
	struct file_node *f;

	if (d == NULL || d->do_not_dump == 1)
		return;

	if (debug)
		fprintf(stderr, "DEBUG: recursively dir_mark_do_not_dump(%s)\n", d->name);

	d->do_not_dump = 1;
	for (sub = d->subdirs; sub != NULL; sub = sub->next_sibling)
		dir_mark_down_do_not_dump(sub);
	for (f = d->files; f != NULL; f = f->next)
		f->do_not_dump = 1;
}
/*
 * If we dump a dir on the left side, all dup files below it must also
 * be considered "left side": marks the whole hierarchy recursively.
 */
static void dir_mark_left(struct dir_node *d)
{
	struct dir_node *sub;
	struct file_node *f;

	if (d == NULL || d->left == 1)
		return;

	if (debug)
		fprintf(stderr, "DEBUG: recursively dir_mark_left(%s)\n", d->name);

	d->left = 1;
	for (sub = d->subdirs; sub != NULL; sub = sub->next_sibling)
		dir_mark_left(sub);
	for (f = d->files; f != NULL; f = f->next)
		f->left = 1;
}
/*
 * Marks a file as unique (no duplicate possible) and propagates the
 * flag up through its parent dirs.
 */
static void file_mark_up_no_dup_possible(struct file_node *f)
{
	if (f->no_dup_possible != 1) {
		f->no_dup_possible = 1;
		dir_mark_up_no_dup_possible(f->parent);
	}
}
/*
 * Compare the same size files using hashes
 * @a - start of the chain of files with same size
 * @b - end (last node) of the chain of files with same size
 * Builds ->duplicates chains for matching files, marks files that stay
 * unique (and their ancestor dirs) as no_dup_possible, and updates the
 * dup_no_of_files / can_save statistics.
 * Returns 0 on success, -1 on hashing error.
 * TODO: Use a better check algo: compute first 64k SHA-1s, sort, if equal,
 * compute full SHA-1s, sort?
 */
static int compare_file_range(struct file_node *a, struct file_node *b)
{
	struct file_node *q, *p, *dups;

	if (debug) {
		fprintf(stderr, "compare_file_range:\n");
		q = a;
		while (q != b->hash_next) {
			fprintf(stderr, "\t%s\n", q->name);
			q = q->hash_next;
		}
	}

	/* Mark all as unique */
	q = a;
	while (q != b->hash_next) {
		q->unique = 1;
		q = q->hash_next;
	}

	/* O(n^2) pairwise compare; hashes are computed lazily on demand */
	p = a;
	while (p != b->hash_next) {
		struct file_node *p_last;

		q = p->hash_next;
		if (q == NULL)
			break;

		p_last = p->duplicates;
		while (q != b->hash_next) {
			int err;

			/* We do not want to break ->duplicates */
			/* (files already chained as someone's duplicate are
			 * skipped, so each file appears in only one chain) */
			if (q->skip_compare == 1) {
				q = q->hash_next;
				continue;
			}

			err = compare_files(p, q);
			if (err == -1)
				return -1;
			if (err != 1) {
				q = q->hash_next;
				continue;
			}

			/* Match: append q to the tail of p's ->duplicates */
			if (p_last == NULL)
				p->duplicates = q;
			else
				p_last->duplicates = q;
			p_last = q;

			if (debug) {
				fprintf(stderr, "\tp[%s]->duplicates: ", p->name);
				dups = p->duplicates;
				while (dups) {
					fprintf(stderr, " %s", dups->name);
					dups = dups->duplicates;
				}
				fprintf(stderr, "\n");
			}

			p->unique = 0;
			q->unique = 0;
			q->skip_compare = 1;
			/* TODO: these has to be moved. Why? */
			dup_no_of_files++;
			can_save += q->size;

			q = q->hash_next;
		}
		p = p->hash_next;
	}

	/* All entries that are unique, will propagate to parents */
	q = a;
	while (q != b->hash_next) {
		if (q->unique == 1)
			file_mark_up_no_dup_possible(q);
		q = q->hash_next;
	}

	return 0;
}
/*
 * Finds duplicated files. Walks every size-keyed hash chain; inside a
 * chain files are sorted by size (see file_add()), so equal-size files
 * form contiguous ranges that compare_file_range() can process.
 * Returns 0 on success, -1 on error.
 */
__hot int file_find_dups(void)
{
	int err;
	struct file_node *dups, *q, *first, *last;
	unsigned int hash;
	unsigned long long size;

	for (hash = 0; hash < HASH_SIZE; hash++) {
		if (file_info[hash] == NULL)
			continue;

		if (debug)
			fprintf(stderr, "file_find_dups[%u]...\n", hash);

		/* We need at least 2 nodes to be able to have dups */
		if (file_info[hash]->hash_next == NULL) {
			file_mark_up_no_dup_possible(file_info[hash]);
			continue;
		}

		/* Now, we find groups of same size files */
		first = file_info[hash];
		while (1) {
			/* Extend [first..last] while the size stays the same */
			last = first;
			size = first->size;
			q = first->hash_next;
			while (q && (q->size == size)) {
				last = q;
				q = q->hash_next;
			}
			err = compare_file_range(first, last);
			if (err == -1)
				return -1;
			if (last->hash_next == NULL)
				break;
			first = last->hash_next;
		}

		if (debug > 2) {
			/* NOTE(review): the inner `if (debug)` checks are
			 * redundant — debug > 2 already implies debug != 0 */
			if (debug)
				fprintf(stderr, "[*] Dump chain %u start:\n", hash);
			q = file_info[hash];
			while (q) {
				fprintf(stderr, "%s:\n", q->name);
				dups = q->duplicates;
				while(dups) {
					if (debug)
						fprintf(stderr, "\t%s\n", dups->name);
					dups = dups->duplicates;
				}
				q = q->hash_next;
			}
			if (debug)
				fprintf(stderr, "[*] Dump chain %u stop\n", hash);
		}
	}

	return 0;
}
/*
 * qsort() helper ordering files by full SHA-1, then by deeper parent
 * level first, then by name.
 * a0 and b0 are pointers to struct file_node pointers!
 */
__hot static int file_compare_hashes(const void *a0, const void *b0)
{
	const struct file_node *a = *(const struct file_node **) a0;
	const struct file_node *b = *(const struct file_node **) b0;
	const int cmp = memcmp(a->sha1[1], b->sha1[1], SHA_DIGEST_LENGTH);

	if (cmp != 0)
		return cmp;
	if (a->parent->level != b->parent->level)
		return (a->parent->level > b->parent->level) ? -1 : 1;
	return strcmp(a->name, b->name);
}
/*
 * Sorts, by sha1_full the list of files and returns SHA-1 of files list.
 * We need to sort because the order of files in dirs may differ because
 * the names may be different but the content the same.
 * Also accumulates the files' sizes into d->total_size.
 * TODO: Shouldn't we test if a file is unique=1 and skip the checksum of dir???
 * TODO: And propage this flag 'up'?
 * We return the file names hash in @fn.
 * Returns 0 on success (-1 path is unreachable today: xmalloc aborts).
 */
static int dir_files_hash(unsigned char *hash, unsigned char *fn,
	struct dir_node *d)
{
	struct file_node *p;
	struct file_node **u;
	unsigned long i, mem;
	SHA_CTX c, fnh;

	/* No files: both digests are all-zero by convention */
	if (d->files == NULL) {
		memset(hash, 0, SHA_DIGEST_LENGTH);
		memset(fn, 0, SHA_DIGEST_LENGTH);
		return 0;
	}

	if (d->no_of_files == 0) {
		fprintf(stderr, "Dir %s has a valid d->files=%p pointer, but has 0 number of files!\n",
			d->name, d->files);
		abort();
	}

	/* Copy the linked list into an array so we can qsort it */
	mem = d->no_of_files * sizeof(struct file_node *);
	u = (struct file_node **) xmalloc(mem);
	if (u == NULL)
		return -1;

	p = d->files;
	i = 0;
	while (p) {
		u[i] = p;
		p = p->next;
		i++;
	}

	qsort(u, d->no_of_files, sizeof(struct file_node *), file_compare_hashes);

	SHA1_Init(&c);
	SHA1_Init(&fnh);
	i = 0;
	while (i < d->no_of_files) {
		char *base_name;

		/* NOTE(review): u[i]->sha1 decays to &u[i]->sha1[0], so only
		 * the first-64K digest is folded into the dir hash, while the
		 * sort key above is sha1[1] (the full digest) — confirm this
		 * is intended. */
		SHA1_Update(&c, u[i]->sha1, SHA_DIGEST_LENGTH);
		/* GNU basename() (string.h + _GNU_SOURCE): does not modify
		 * its argument — presumably why libgen.h is not used */
		base_name = basename(u[i]->name);
		SHA1_Update(&fnh, base_name, strlen(base_name));
		d->total_size += u[i]->size;
		i++;
	}
	SHA1_Final(hash, &c);
	SHA1_Final(fn, &fnh);
	free(u);

	if (debug > 2) {
		char shash[33], sfn[33];

		sha1_dump(shash, hash, 6);
		sha1_dump(sfn, fn, 6);
		fprintf(stderr, " DIR HASH[%s]=%s/%s\n",
			d->name, shash, sfn);
	}

	return 0;
}
/*
 * qsort() helper ordering dirs by SHA-1, then by name.
 * a0 and b0 are pointers to struct dir_node pointers!
 */
static int dir_compare_hashes(const void *a0, const void *b0)
{
	const struct dir_node *a = *(const struct dir_node **) a0;
	const struct dir_node *b = *(const struct dir_node **) b0;
	const int cmp = memcmp(a->sha1, b->sha1, SHA_DIGEST_LENGTH);

	return (cmp != 0) ? cmp : strcmp(a->name, b->name);
}
/*
 * qsort() helper ordering dirs by level (shallower first), then name.
 * a0 and b0 are pointers to struct dir_node pointers!
 */
static int dir_compare_levels(const void *a0, const void *b0)
{
	const struct dir_node *a = *(const struct dir_node **) a0;
	const struct dir_node *b = *(const struct dir_node **) b0;

	if (a->level != b->level)
		return (a->level > b->level) ? 1 : -1;
	return strcmp(a->name, b->name);
}
/*
 * Sorts a sibling list of subdirs by SHA-1 (then name), relinking the
 * ->next_sibling chain in the new order.
 * @d points to the head pointer of the list and is updated in place.
 * Returns 0 (xmalloc() aborts on allocation failure).
 * TODO: transform this into a generic function
 * TODO: in 'dir_add' function, store the number of subdirs to be easier to alloc memory
 * TODO: pass the id of the sorting function as parameter.
 */
static int dir_sort_subdirs_by_hash(struct dir_node **d)
{
	struct dir_node *q, **p;
	unsigned int nr, i;

	if (!*d)
		return 0;

	/* Compute the number of subdirs */
	nr = 0;
	for (q = *d; q; q = q->next_sibling)
		nr++;

	/* fixed: removed the dead error branch — xmalloc() aborts on
	 * failure (the branch also used %lu for a size_t expression) */
	p = xmalloc(nr * sizeof(struct dir_node *));

	i = 0;
	for (q = *d; q; q = q->next_sibling)
		p[i++] = q;

	qsort(p, nr, sizeof(struct dir_node *), dir_compare_hashes);

	/* Relink the sibling chain in sorted order */
	*d = p[0];
	for (i = 1; i < nr; i++)
		p[i - 1]->next_sibling = p[i];
	p[nr - 1]->next_sibling = NULL;

	free(p);
	return 0;
}
/*
 * Builds hash of a directory (and below), recursively.
 * Fills d->sha1 (content hash: file digests + subdir digests) and
 * d->file_names_sha1 (structure hash: base names), accumulates
 * d->total_size, and sorts subdirs by hash so that dirs with the same
 * content produce the same digest regardless of creation order.
 * Returns the number of dirs (this one included) that may still have
 * duplicates, or -1 on error.
 */
static long long dir_build_hash(struct dir_node *d)
{
	struct dir_node *subdir;
	SHA_CTX c, fnh;
	unsigned char files_hash[SHA_DIGEST_LENGTH];
	unsigned char file_names_sha1[SHA_DIGEST_LENGTH];
	int err;
	long long no_of_possible_dirs = 0;

	if (debug)
		fprintf(stderr, "DEBUG: %s [%s] no_dup_possible=%u\n",
			__FUNCTION__, d->name, d->no_dup_possible);

	/* empty dir? */
	if ((d->files == NULL) && (d->subdirs == NULL)) {
		d->no_dup_possible = 1;
		return 0;
	}

	/* We check current dir first. */
	if (d->no_dup_possible == 0)
		no_of_possible_dirs++;

	/* Order files by hash to compute correct hashes */
	err = dir_files_hash(files_hash, file_names_sha1, d);
	if (err != 0)
		return -1;

	/* Compute hashes */
	subdir = d->subdirs;
	while (subdir) {
		long long ret;

		ret = dir_build_hash(subdir);
		if (ret == -1)
			return -1;
		no_of_possible_dirs += ret;
		d->total_size += subdir->total_size;
		subdir = subdir->next_sibling;
	}

	/* Sorting subdirs by sha1 */
	/* NOTE(review): return value ignored; harmless today because
	 * xmalloc() aborts on OOM, so the call cannot fail */
	dir_sort_subdirs_by_hash(&d->subdirs);

	SHA1_Init(&c);
	SHA1_Update(&c, files_hash, SHA_DIGEST_LENGTH);
	SHA1_Init(&fnh);
	SHA1_Update(&fnh, file_names_sha1, SHA_DIGEST_LENGTH);
	/* At the same time, we build hash of file names */
	subdir = d->subdirs;
	while (subdir) {
		char *base_name;

		SHA1_Update(&c, subdir->sha1, SHA_DIGEST_LENGTH);
		base_name = basename(subdir->name);
		SHA1_Update(&fnh, base_name, strlen(base_name));
		SHA1_Update(&fnh, subdir->file_names_sha1, SHA_DIGEST_LENGTH);
		subdir = subdir->next_sibling;
	}
	SHA1_Final(d->sha1, &c);
	SHA1_Final(d->file_names_sha1, &fnh);

	return no_of_possible_dirs;
}
/*
 * Links a run of same-hash dirs u[first..last] into a ->hash_next
 * chain; the last node's hash_next stays NULL (nodes are allocated
 * zeroed).
 */
static void dir_process_range(struct dir_node **u, const unsigned long long first,
	const unsigned long long last)
{
	unsigned long long j;

	for (j = first; j < last; j++)
		u[j]->hash_next = u[j + 1];
}
/*
* It helps to populate 'u' with dirs, recursive.
* Returns the new pos.
*/
static unsigned long long dir_find_dups_populate_list(struct dir_node **u,
const unsigned long long pos, struct dir_node *d)
{
struct dir_node *subdir;
unsigned long long new_pos;
new_pos = pos;
/* We check current dir first. */
if (d->no_dup_possible == 0) {
u[new_pos] = d;
new_pos++;
}
subdir = d->subdirs;
while (subdir) {
new_pos = dir_find_dups_populate_list(u, new_pos, subdir);
subdir = subdir->next_sibling;
}
return new_pos;
}
/*
 * Finds dir duplicates (we are only marking here)
 * We have to sort files based on hash, to match.
 * We ignore 000 hashes (dirs), because they contain files that are single.
 * Builds the global dir_chain array of candidate dirs, links same-hash
 * dirs via ->hash_next, then sorts the array by level so shallower dirs
 * get dumped first.
 * Returns 0 on success, -1 on error.
 * TODO: the name does not reflect what the function does.
 */
int dir_find_dups(void)
{
	unsigned long long mem, i, j, first, last;
	char dump[SHA_DIGEST_LENGTH * 2 + 1];

	dir_chain_len = 0;

	if (debug)
		fprintf(stderr, "[*] dir_find_dups...\n");

	/* Hash every tree; each call returns the number of dirs that can
	 * still have duplicates, which sizes dir_chain below */
	for (i = 0; i < dir_info_count; i++) {
		long long err;

		err = dir_build_hash(dir_info[i]);
		if (err == -1)
			return -1;
		dir_chain_len += (unsigned long long) err;
	}

	if (dir_chain_len == 0)
		return 0;

	/* Allocate an array that we will pass to qsort */
	mem = dir_chain_len * sizeof(struct dir_node *);
	dir_chain = (struct dir_node **) xmalloc(mem);
	if (dir_chain == NULL) {
		fprintf(stderr, "Cannot alloc mem for dir list (%llu bytes)!\n",
			mem);
		return -1;
	}

	j = 0;
	for (i = 0; i < dir_info_count; i++) {
		struct dir_node *d;

		d = dir_info[i];
		j = dir_find_dups_populate_list(dir_chain, j, d);
		/* stop searching if we found all possible dirs */
		if (j == dir_chain_len)
			break;
	}

	if (debug > 2) {
		fprintf(stderr, "DEBUG: dump before dir qsort:\n");
		for (i = 0; i < dir_chain_len; i++)
			fprintf(stderr, "\t%s\n",
				dir_chain[i]->name);
		fprintf(stderr, "\n");
	}

	/* Sort by hash and level */
	qsort(dir_chain, dir_chain_len, sizeof(struct dir_node *),
		dir_compare_hashes);

	if (debug > 2) {
		fprintf(stderr, "DEBUG: dump after dir qsort:\n");
		for (i = 0; i < dir_chain_len; i++) {
			sha1_dump(dump, dir_chain[i]->sha1, 0);
			fprintf(stderr, "\t%s\tlevel %d\t%s\n",
				dump, dir_chain[i]->level, dir_chain[i]->name);
		}
	}

	/* Walk the sorted array and chain each run of equal hashes */
	first = 0;
	last = 0;
	for (i = 1; i < dir_chain_len; i++) {
		if (memcmp(dir_chain[first]->sha1, dir_chain[i]->sha1, SHA_DIGEST_LENGTH) == 0) {
			/* We have the same hash */
			dup_no_of_dirs++;
			last = i;
			continue;
		}
		/* We have same hash in first..last */
		dir_process_range(dir_chain, first, last);
		/* Switch to next range */
		first = i;
		last = i;
	}
	/* Do not forget the final range */
	dir_process_range(dir_chain, first, last);

	/* Sort by level */
	qsort(dir_chain, dir_chain_len, sizeof(struct dir_node *),
		dir_compare_levels);

	if (debug) {
		fprintf(stderr, "DEBUG: dump after dir qsort by level:\n");
		for (i = 0; i < dir_chain_len; i++) {
			sha1_dump(dump, dir_chain[i]->sha1, 0);
			fprintf(stderr, "\t%s\tlevel %d\t%s\n",
				dump, dir_chain[i]->level, dir_chain[i]->name);
		}
	}

	return 0;
}
/*
 * Nice dumps the duplicated dirs: for dir @d, every same-hash dir in
 * its ->hash_next chain is printed as a (left, right) pair on @out.
 * @min_size - skip dirs whose total size is below this
 * @zero     - use NUL separators instead of tab/newline
 */
__cold static void dir_dump_duplicates(struct dir_node *d,
	const unsigned long long min_size, const unsigned int zero)
{
	struct dir_node *p;
	char sep, final;
	char flags[9];

	if (debug)
		fprintf(stderr, "[*] %s(%s)\n", __func__, d->name);

	if (d->no_dup_possible == 1) {
		if (debug)
			fprintf(stderr, "\tignore dir because no_dup_possible is set\n");
		return;
	}

	if (d->do_not_dump == 1) {
		if (debug)
			fprintf(stderr, "\tignore dir because no_dup_dump is set\n");
		return;
	}

	if (d->total_size < min_size) {
		if (debug)
			fprintf(stderr, "\tignore duplicate dir because size < min\n");
		return;
	}

	if (zero) {
		sep = '\0';
		final = '\0';
	} else {
		sep = '\t';
		final = '\n';
	}

	/* Walk the same-hash chain built by dir_find_dups() */
	p = d->hash_next;
	while (p) {
		if (p->do_not_dump == 1) {
			/* We already dumped that dir. */
			if (debug)
				fprintf(stderr, "DEBUG: ignore dir [%s] because"
					" do_not_dump=%hhu or left=%hhu\n",
					p->name, p->do_not_dump, p->left);
			p = p->hash_next;
			continue;
		}

		if (debug)
			fprintf(stderr, "DEBUG: %s: Found a right dir for [%s]: %s\n",
				__func__, d->name, p->name);

		if (debug)
			fprintf(stderr, "DEBUG: %s: set 'left' on [%s]\n",
				__func__, d->name);
		dir_mark_left(d);

		if (debug)
			fprintf(stderr, "DEBUG: %s:"
				" set do_not_dump on right [%s]\n",
				__func__, p->name);
		dir_mark_down_do_not_dump(p);

		/* Flags column: 'M' means same content but different file
		 * name structure */
		memset(flags, '-', sizeof(flags) - 1);
		flags[sizeof(flags) - 1] = '\0';
		if (memcmp(d->file_names_sha1, p->file_names_sha1, SHA_DIGEST_LENGTH) != 0)
			flags[0] = 'M';

		/* NOTE(review): this debug line has no trailing newline */
		if (debug)
			fprintf(stderr, "DEBUG: DIR %llu %s %s %s",
				d->total_size, flags, d->name, p->name);
		fprintf(out, "DIR%c%llu%c%s%c%s%c%s%c",
			sep, d->total_size, sep, flags, sep, d->name,
			sep, p->name, final);

		p = p->hash_next;
	}
}
/*
 * Nice dumps of the duplicated files: for file @f, every file in its
 * ->duplicates chain is printed as a (left, right) pair on @out.
 * The left side is the first file in the chain already marked 'left'
 * (i.e. inside a dir dumped as a left side), or @f itself.
 * @min_size - skip files smaller than this
 * @zero     - use NUL separators instead of tab/newline
 */
__cold static void file_dump_duplicates(struct file_node *f,
	const unsigned long long min_size, const unsigned int zero)
{
	struct file_node *p, *first_left;
	char sep, final;

	if (debug)
		fprintf(stderr, "[*] file_dump_duplicates(%s)\n", f->name);

	if (f->duplicates == NULL) {
		if (debug)
			fprintf(stderr, "\tignore duplicate file because ->duplicates is NULL\n");
		return;
	}

	if (f->no_dup_possible == 1) {
		if (debug)
			fprintf(stderr, "\tignore duplicate file because no_dup_possible=1\n");
		return;
	}

	if (f->do_not_dump == 1) {
		if (debug)
			fprintf(stderr, "\tignore duplicate file because do_not_dump=1\n");
		return;
	}

	if (f->size < min_size) {
		if (debug)
			fprintf(stderr, "\tignore duplicate file because size < min\n");
		return;
	}

	/* first, search for the first left one */
	first_left = f;
	if (first_left->left == 0) {
		p = f->duplicates;
		while (p) {
			if (p->left == 1) {
				first_left = p;
				break;
			}
			p = p->duplicates;
		}
	}

	if (debug)
		fprintf(stderr, "first_left = [%s]\n", first_left->name);

	if (zero) {
		sep = '\0';
		final = '\0';
	} else {
		sep = '\t';
		final = '\n';
	}

	/* now, dump */
	p = f;
	while (p) {
		/* We do not want to dump files already dumped when we did
		 * it for dirs. */
		if (p->do_not_dump == 1) {
			if (debug)
				fprintf(stderr, "\t\tignore duplicate file in"
					" chain because do_not_dump=1 [%s]\n",
					p->name);
			p = p->duplicates;
			continue;
		}

		/* The chosen left side is never printed as a right side */
		if (p == first_left) {
			p = p->duplicates;
			continue;
		}

		if (debug)
			fprintf(stderr, "Because we will dump [%s] as a left"
				", set left/do_not_dump=1\n",
				first_left->name);
		first_left->left = 1;
		first_left->do_not_dump = 1;
		/* I am not sure the above is correct! It is correct,
		 * but I think is not needed; we will not have another chance
		 * to visit first_left. ALL "same hash files" are already here.
		 * Maybe something related to dirs? */

		/* Prevent this file to appear again in the dump */
		/* TODO: I do not think it can happen */
		if (debug)
			fprintf(stderr, "Because [%s] is a right"
				", set do_not_dump=1\n", p->name);
		p->do_not_dump = 1;

		/* NOTE(review): this debug line has no trailing newline */
		if (debug)
			fprintf(stderr, "DEBUG: FILE %llu %s %s",
				first_left->size, first_left->name, p->name);
		fprintf(out, "FILE%c%llu%c%s%c%s%c",
			sep, first_left->size, sep, first_left->name,
			sep, p->name, final);

		p = p->duplicates;
	}
}
/*
 * Dumps all duplicated dirs (from dir_chain, built by dir_find_dups())
 * and then all remaining duplicated files to @out.
 * @min_size - do not dump files/dirs shorter than min_size
 * @zero     - use NUL separators instead of tab/newline
 * Frees dir_chain when done.
 */
__cold void dump_duplicates(const unsigned long long min_size, const unsigned int zero)
{
	unsigned long long i; /* fixed: was unsigned int vs 64-bit length */
	struct file_node *f;
	unsigned int hash;

	if (debug)
		fprintf(stderr, "[*] Dump duplicated dirs...\n");
	for (i = 0; i < dir_chain_len; i++) {
		struct dir_node *d;

		if (debug)
			fprintf(stderr, "\tdump_duplicates[%llu]...\n", i);
		d = dir_chain[i];
		dir_dump_duplicates(d, min_size, zero);
	}
	free(dir_chain);
	/* fixed: reset the globals so the freed array cannot be reused */
	dir_chain = NULL;
	dir_chain_len = 0;

	/* Now, we dump remaining files */
	if (debug)
		fprintf(stderr, "[*] Dump duplicated files...\n");
	for (hash = 0; hash < HASH_SIZE; hash++) {
		if (file_info[hash] == NULL)
			continue;
		if (debug)
			fprintf(stderr, "[*] Dump duplicates in hash %u\n", hash);
		f = file_info[hash];
		while (f) {
			file_dump_duplicates(f, min_size, zero);
			f = f->hash_next;
		}
	}
}