
#define _XOPEN_SOURCE 500
#define _GNU_SOURCE
#define _FILE_OFFSET_BITS 64
#define _LARGEFILE_SOURCE
#define _LARGEFILE64_SOURCE

#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <ftw.h>

#include "store.h"


/*
 * Used to avoid visiting the same file/dir twice (e.g. via hard links)
 */
struct dev_ino
{
	struct file_node	*obj;	/* points to first file found */
	struct dev_ino		*next;
};

#define DEV_INO_HASH_SIZE	(16 * 4096)

#define HASH_SIZE		16384
#define MAX_INPUT_DIRS		32
#define MAX_DEPTH		1000

#define COMPUTE_FIRST		0
#define COMPUTE_FULL		1
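/* These index file_node->sha1[]: [COMPUTE_FIRST] is the SHA-1 of the first
 * 64 KiB of the file only, [COMPUTE_FULL] the SHA-1 of the whole file. */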

/* Exported variables */
unsigned long long	no_of_files, no_of_dirs;
unsigned long long	sha1_first_computed;
unsigned long long	sha1_full_computed;
unsigned long long	sha1_bytes;
unsigned long long	dup_no_of_files, dup_no_of_dirs;
unsigned long long	can_save;
unsigned long long	no_of_same_inode_file, no_of_ino_cache_uses;
unsigned long long	mem_allocated, mem_calls;

/* Internal variables */
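/*
 * file_info:    files hashed by size (size % HASH_SIZE); each chain is kept
 *               ordered by size, level, mtime and name
 * dir_info:     the level-0 (input) dirs
 * dir_current:  last dir added at each depth, used to link children to their
 *               parent while the tree is walked
 * dir_chain:    flat array of candidate duplicate dirs, built by dir_find_dups
 * sha1_zero:    all-zero digest used as a "not yet computed" marker
 * dev_ino_hash: dev/inode cache used to detect hard links
 */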
static struct file_node		*file_info[HASH_SIZE];
static struct dir_node		*dir_info[MAX_INPUT_DIRS];
static unsigned int		dir_info_count;
static struct dir_node		*dir_current[MAX_DEPTH];
static struct dir_node		**dir_chain;
static unsigned long long	dir_chain_len;
static unsigned char		sha1_zero[SHA_DIGEST_LENGTH];
static struct dev_ino		*dev_ino_hash[DEV_INO_HASH_SIZE];
static unsigned int		debug = 0;
static FILE			*out;


/* ############### Misc functions ############### */
void set_debug(const unsigned int level)
{
	debug = level;
}

void set_out(FILE *f)
{
	out = f;
}

/* ############### Memory functions ############### */
static void *xmalloc(size_t size)
{
	void *p;

	if (size == 0)
		abort();

	p = malloc(size);
	if (!p) {
		fprintf(stderr, "Cannot alloc %zu bytes!", size);
		abort();
	}

	mem_allocated += size;
	mem_calls++;

	return p;
}
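
/*
 * Note: xmalloc() never returns NULL (it aborts on failure), so the NULL
 * checks after it in the callers below are purely defensive.
 */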


/* ############### dev_ino helpers ############### */

/*
 * Returns the first hard link found, or NULL (and adds @a to the cache)
 */
static struct file_node *dev_ino_seen(struct file_node *a)
{
	struct dev_ino *q;
	unsigned int hash;

	hash = (a->dev ^ a->ino) % DEV_INO_HASH_SIZE;

	q = dev_ino_hash[hash];
	while (q) {
		if ((q->obj->dev == a->dev) && (q->obj->ino == a->ino)) {
			if (q->obj && debug) {
				fprintf(stderr, "DEBUG: INO HIT for [%s] [%s]\n",
					a->name, q->obj->name);
			}
			no_of_same_inode_file++;
			return q->obj;
		}

		q = q->next;
	}

	if (debug)
		fprintf(stderr, "DEBUG: add file [%s] to INO HIT dev=%lu ino=%lu\n",
			a->name, a->dev, a->ino);

	// TODO: sort to speed up the walk
	q = (struct dev_ino *) xmalloc(sizeof(struct dev_ino));
	q->obj = a;
	q->next = dev_ino_hash[hash];
	dev_ino_hash[hash] = q;

	return NULL;
}

/*
 * Frees the dev/inode cache built by dev_ino_seen()
 */
void dev_ino_seen_clean(void)
{
	unsigned int i;
	struct dev_ino *q, *next;

	for (i = 0; i < DEV_INO_HASH_SIZE; i++) {
		q = dev_ino_hash[i];
		while (q) {
			next = q->next;
			free(q);
			q = next;
		}
		dev_ino_hash[i] = NULL;
	}
}


/* ############### SHA-1 functions ############### */

static void sha1_dump(char *out, const unsigned char *b, const unsigned int max0)
{
	unsigned int i, max;

	if (max0 == 0)
		max = SHA_DIGEST_LENGTH;
	else
		max = max0;

	for (i = 0; i < max; i++)
		snprintf(out + i * 2, 3, "%02x", b[i]);
}
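
/*
 * Example: sha1_dump(buf, digest, 4) writes 8 lowercase hex characters plus a
 * terminating NUL, so @out must hold at least max * 2 + 1 bytes
 * (2 * SHA_DIGEST_LENGTH + 1 when @max0 is 0).
 */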

/*
 * Computes the SHA-1 of the first @len bytes of @file
 */
static int sha1(const char *file, const size_t len, unsigned char *out)
{
	int fd;
	unsigned char buf[65536];
	SHA_CTX c;
	size_t bytes;

	fd = open(file, O_RDONLY);
	if (fd == -1)
		return -1;

	SHA1_Init(&c);
	bytes = 0;
	while (bytes < len) {
		ssize_t n;

		n = read(fd, buf, sizeof(buf));
		if (n == -1) {
			close(fd);
			return -1;
		}
		if (n == 0)
			break;

		SHA1_Update(&c, buf, (size_t) n);

		bytes += (size_t) n;
		sha1_bytes += (size_t) n;
	}

	close(fd);

	SHA1_Final(out, &c);

	if (len <= 65536)
		sha1_first_computed++;
	else
		sha1_full_computed++;

	return 0;
}

/*
 * Computes (or reuses) the digest selected by @what (COMPUTE_FIRST or
 * COMPUTE_FULL) for @a. If @a shares dev/inode with an earlier file
 * (ino_first), the digest is computed once and copied from that file.
 * Returns 0 on success, -1 on error.
 */
__hot static int sha1_compute(struct file_node *a,
	const unsigned char what)
{
	struct file_node *z;
	unsigned long long size;

	if (memcmp(a->sha1[what], sha1_zero, SHA_DIGEST_LENGTH) != 0)
		return 0;

	if (what == COMPUTE_FIRST)
		size = 65536;
	else
		size = a->size;

	// Test if it is available from the inode cache
	z = a->ino_first;
	if (z) {
		if (memcmp(z->sha1[what], sha1_zero, SHA_DIGEST_LENGTH) == 0)
			sha1(z->name, size, z->sha1[what]);
		else
			no_of_ino_cache_uses++;
		memcpy(a->sha1[what], z->sha1[what], SHA_DIGEST_LENGTH);
		return 0;
	}

	return sha1(a->name, size, a->sha1[what]);
}


/* ############### Main functions ############### */
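
/*
 * Typical call order, as suggested by the exported API (the driver itself is
 * not in this file): dir_add()/file_add() while walking the input trees, then
 * file_find_dups(), then dir_find_dups(), and finally dump_duplicates();
 * dump_stats() prints the counters collected along the way.
 */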

__cold void dump_stats(void)
{
	fprintf(stderr, "Number of visited file(s): %llu.\n", no_of_files);
	fprintf(stderr, "Number of visited dir(s): %llu.\n", no_of_dirs);
	fprintf(stderr, "Number of bytes processed by SHA-1: %llu.\n",
		sha1_bytes);
	fprintf(stderr, "Number of [<]64K SHA-1 computed: %llu.\n",
		sha1_first_computed);
	fprintf(stderr, "Number of full SHA-1 computed: %llu.\n",
		sha1_full_computed);
	fprintf(stderr, "Bytes that could be saved: %llu.\n", can_save);
	fprintf(stderr, "Number of duplicated files: %llu.\n", dup_no_of_files);
	fprintf(stderr, "Number of duplicated dirs: %llu.\n", dup_no_of_dirs);
	fprintf(stderr, "Number of same dev/inode (file): %llu.\n",
		no_of_same_inode_file);
	fprintf(stderr, "Number of uses of inode cache: %llu.\n",
		no_of_ino_cache_uses);
	fprintf(stderr, "Memory allocated: %llu bytes in %llu call(s).\n",
		mem_allocated, mem_calls);
}

static struct file_node *alloc_file_node(void)
{
	struct file_node *q;
	unsigned int mem;

	mem = sizeof(struct file_node);
	q = (struct file_node *) xmalloc(mem);
	if (q == NULL) {
		fprintf(stderr, "ERROR: Cannot alloc memory for a file node!\n");
		return NULL;
	}
	memset(q, 0, sizeof(struct file_node));

	return q;
}

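/*
 * Adds a file to the store: records size/dev/inode/mtime, consults the
 * dev/inode cache for hard links, links the node to its parent dir
 * (dir_current[level - 1]) and inserts it into the size-keyed hash chain,
 * which is kept ordered by size, level, mtime and name.
 */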
int file_add(const char *file, const struct stat *s, const int level)
{
	struct dir_node *parent;
	struct file_node *q, *p, *prev;
	unsigned int hash;
	unsigned long long size;

	size = (unsigned long long) s->st_size;

	q = alloc_file_node();
	if (q == NULL)
		return -1;

	q->size = size;
	q->name = strdup(file);
	memset(&q->sha1[0], 0, SHA_DIGEST_LENGTH);
	memset(&q->sha1[1], 0, SHA_DIGEST_LENGTH);
	q->dev = s->st_dev;
	q->ino = s->st_ino;
	q->level = level;
	q->mtime = s->st_mtime;

	// Test if it is in the dev/inode cache (hard-link detection)
	q->ino_first = dev_ino_seen(q);

	/* link with dir */
	parent = dir_current[level - 1];
	q->next = parent->files;
	parent->files = q;
	q->parent = parent;

	parent->no_of_files++;
	no_of_files++;

	hash = size % HASH_SIZE;
	if (file_info[hash] == NULL) {
		file_info[hash] = q;
	} else {
		/* We order by size, level, mtime, name */
		/* Better to use qsort. TODO */
		p = file_info[hash];
		prev = NULL;
		while (p) {
			if (q->size < p->size)
				break;

			if (q->size == p->size) {
				if (q->level < p->level)
					break;

				if (q->mtime < p->mtime)
					break;

				if (strcmp(q->name, p->name) < 0)
					break;
			}

			prev = p;
			p = p->hash_next;
		}

		/* Insert before 'p' (the first bigger element, or NULL at the end) */
		if (prev == NULL) {
			/* we must insert as the first */
			q->hash_next = file_info[hash];
			file_info[hash] = q;
		} else {
			q->hash_next = prev->hash_next;
			prev->hash_next = q;
		}
	}

	return 0;
}

static struct dir_node *alloc_dir_node(void)
{
	struct dir_node *q;
	unsigned int mem;

	mem = sizeof(struct dir_node);
	q = (struct dir_node *) xmalloc(mem);
	if (q == NULL) {
		fprintf(stderr, "ERROR: Cannot alloc a dir node!\n");
		return NULL;
	}
	memset(q, 0, mem);

	return q;
}

/*
 * Add a dir to the structure
 */
int dir_add(const char *dir, const struct stat *s,
	const int level)
{
	struct dir_node *q, *parent;

	/*fprintf(stderr, "ADDING DIR [%s], level %d\n", dir, level);*/
	if (level >= MAX_DEPTH) {
		fprintf(stderr, "Directory structure is too deep (>=%d)!\n",
			MAX_DEPTH);
		return -1;
	}

	q = alloc_dir_node();
	if (q == NULL)
		return -1;

	q->name = strdup(dir);
	q->dev = s->st_dev;
	q->ino = s->st_ino;
	q->level = level;

	no_of_dirs++;

	if (level == 0) {
		if (dir_info_count == MAX_INPUT_DIRS) {
			fprintf(stderr, "Too many dirs as input (>=%u)!\n",
				MAX_INPUT_DIRS);
			/* TODO: this check comes too late! */
			free(q->name);
			free(q);
			return -1;
		}

		/* We add a root dir */
		dir_info[dir_info_count] = q;
		dir_info_count++;
		/*fprintf(stderr, "ADDED AS ROOT ON POSITION %u\n", dir_info_count - 1);*/
	} else {
		parent = dir_current[level - 1];
		/*
		fprintf(stderr, "PARENT is %p, subdirs is %p, q=%p set parent->subdirs to q\n",
			parent, parent->subdirs, q);
		*/
		q->parent = parent;
		q->next_sibling = parent->subdirs;
		parent->subdirs = q;
	}

	/* Set current folder per level. */
	/* Because we are sorting, the last added dir is hard to find. */
	dir_current[level] = q;

	return 0;
}
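
/*
 * Note: both dir_add() and file_add() read dir_current[level - 1], so the
 * walk is assumed to visit a directory before anything inside it (a pre-order
 * traversal); dir_add() records itself in dir_current[level] for exactly that
 * reason.
 */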

__cold static void file_dump_node(const struct file_node *q,
	const unsigned int level)
{
	char sha1[2][SHA_DIGEST_LENGTH * 2 + 1];
	char prefix[128];

	if (level)
		memset(prefix, ' ', level * 2);
	prefix[level * 2] = '\0';

	sha1_dump(sha1[0], q->sha1[0], 8);
	sha1_dump(sha1[1], q->sha1[1], 8);
	fprintf(stderr, "%sF '%s' node=%p parent=%p next=%p hash_next=%p size=%llu"
		" dev=%lu inode=%llu no_dup_possible=%u do_not_dump=%u"
		" duplicates=%p left=%hhu sha1=%s/%s\n",
		prefix, q->name, q, q->parent, q->next, q->hash_next, q->size,
		(unsigned long) q->dev, (unsigned long long) q->ino,
		q->no_dup_possible, q->do_not_dump,
		q->duplicates, q->left, sha1[0], sha1[1]);
}

__cold void dump_files(void)
{
	struct file_node *q;
	unsigned int hash;

	fprintf(stderr, "Dumping internal data - START...\n");
	for (hash = 0; hash < HASH_SIZE; hash++) {
		if (file_info[hash] == NULL)
			continue;

		fprintf(stderr, "info[%05u]:\n", hash);
		q = file_info[hash];
		while (q) {
			file_dump_node(q, 0);
			q = q->hash_next;
		}
	}
	fprintf(stderr, "Dumping internal data - STOP...\n");
}

__cold static void dir_dump_node(const struct dir_node *d,
	const unsigned int level)
{
	char prefix[128];
	struct dir_node *subdir;
	struct file_node *file;
	char dump[SHA_DIGEST_LENGTH * 2 + 1];
	char fnh[SHA_DIGEST_LENGTH * 2 + 1];

	memset(prefix, ' ', (level + 1) * 2);
	prefix[(level + 1) * 2] = '\0';

	sha1_dump(dump, d->sha1, 8);
	sha1_dump(fnh, d->file_names_sha1, 8);
	fprintf(stderr, "%sD '%s' d=%p subdirs=%p next_sibling=%p"
		" files=%p parent=%p no_dup_possible=%u do_not_dump=%u"
		" level=%d hash_next=%p left=%hhu sha1=%s file_names_sha1=%s\n",
		prefix, d->name, d, d->subdirs, d->next_sibling,
		d->files, d->parent, d->no_dup_possible, d->do_not_dump,
		d->level, d->hash_next, d->left, dump, fnh);

	subdir = d->subdirs;
	while (subdir) {
		dir_dump_node(subdir, level + 1 + 1);
		subdir = subdir->next_sibling;
	}

	file = d->files;
	while (file) {
		file_dump_node(file, level + 1 + 1);
		file = file->next;
	}
}

/*
 * Dumps all input dir trees (debug helper).
 */
__cold void dump_dirs(void)
{
	unsigned int i;

	for (i = 0; i < dir_info_count; i++) {
		fprintf(stderr, "dump_dirs[%u]...\n", i);
		dir_dump_node(dir_info[i], 0);
	}
}

/*
 * Compares two file structures, generating hashes as needed.
 * Returns -1 on error, 0=no match, 1 on match
 */
static int compare_files(struct file_node *a, struct file_node *b)
{
	int err = 0;

	/* Test if first bytes are the same */
	err += sha1_compute(a, COMPUTE_FIRST);
	err += sha1_compute(b, COMPUTE_FIRST);
	if (err != 0) {
		fprintf(stderr, "Cannot compute sha1_first!\n");
		return -1;
	}
	if (memcmp(a->sha1[0], b->sha1[0], SHA_DIGEST_LENGTH) != 0)
		return 0;

	/* First bytes are the same, compute full SHA1 */
	err += sha1_compute(a, COMPUTE_FULL);
	err += sha1_compute(b, COMPUTE_FULL);
	if (err != 0) {
		fprintf(stderr, "Cannot compute sha1_full!\n");
		return -1;
	}
	if (memcmp(a->sha1[1], b->sha1[1], SHA_DIGEST_LENGTH) != 0)
		return 0;

	return 1;
}

/*
 * Marks a dir, and all its ancestors, as unable to have duplicates.
 */
static void dir_mark_up_no_dup_possible(struct dir_node *d)
{
	if ((d == NULL) || (d->no_dup_possible == 1))
		return;

	if (debug)
		fprintf(stderr, "DEBUG: recursively up do dir_mark_up_no_dup_possible(%s)\n", d->name);

	d->no_dup_possible = 1;
	dir_mark_up_no_dup_possible(d->parent);
}

/*
 * When a dir is reported as a duplicate (the right side of a DIR pair), the
 * whole hierarchy under it must be marked 'do_not_dump'; otherwise its files
 * would be dumped again later.
 */
static void dir_mark_down_do_not_dump(struct dir_node *d)
{
	struct file_node *file;
	struct dir_node *subdir;

	if ((d == NULL) || (d->do_not_dump == 1))
		return;

	if (debug)
		fprintf(stderr, "DEBUG: recursively dir_mark_do_not_dump(%s)\n", d->name);

	d->do_not_dump = 1;

	subdir = d->subdirs;
	while (subdir) {
		dir_mark_down_do_not_dump(subdir);
		subdir = subdir->next_sibling;
	}

	file = d->files;
	while (file) {
		file->do_not_dump = 1;
		file = file->next;
	}
}

/*
 * If we dump a dir on the left side, its duplicate files must also stay on the left side.
 */
static void dir_mark_left(struct dir_node *d)
{
	struct file_node *file;
	struct dir_node *subdir;

	if ((d == NULL) || (d->left == 1))
		return;

	if (debug)
		fprintf(stderr, "DEBUG: recursively dir_mark_left(%s)\n", d->name);

	d->left = 1;

	subdir = d->subdirs;
	while (subdir) {
		dir_mark_left(subdir);
		subdir = subdir->next_sibling;
	}

	file = d->files;
	while (file) {
		file->left = 1;
		file = file->next;
	}
}

static void file_mark_up_no_dup_possible(struct file_node *f)
{
	if (f->no_dup_possible == 1)
		return;

	f->no_dup_possible = 1;
	dir_mark_up_no_dup_possible(f->parent);
}

/*
 * Compares files of the same size using hashes
 * @a - start of the chain of files with same size
 * @b - end of the chain of files with same size
 * TODO: Use a better check algo: compute first 64k SHA-1s, sort, if equal,
 * compute full SHA-1s, sort?
 */
static int compare_file_range(struct file_node *a, struct file_node *b)
{
	struct file_node *q, *p, *dups;

	if (debug) {
		fprintf(stderr, "compare_file_range:\n");
		q = a;
		while (q != b->hash_next) {
			fprintf(stderr, "\t%s\n", q->name);
			q = q->hash_next;
		}
	}

	/* Mark all as unique */
	q = a;
	while (q != b->hash_next) {
		q->unique = 1;
		q = q->hash_next;
	}

	p = a;
	while (p != b->hash_next) {
		struct file_node *p_last;

		q = p->hash_next;
		if (q == NULL)
			break;

		p_last = p->duplicates;
		while (q != b->hash_next) {
			int err;

			/* We do not want to break ->duplicates */
			if (q->skip_compare == 1) {
				q = q->hash_next;
				continue;
			}

			err = compare_files(p, q);
			if (err == -1)
				return -1;

			if (err != 1) {
				q = q->hash_next;
				continue;
			}

			if (p_last == NULL)
				p->duplicates = q;
			else
				p_last->duplicates = q;
			p_last = q;

			if (debug) {
				fprintf(stderr, "\tp[%s]->duplicates: ", p->name);
				dups = p->duplicates;
				while (dups) {
					fprintf(stderr, " %s", dups->name);
					dups = dups->duplicates;
				}
				fprintf(stderr, "\n");
			}

			p->unique = 0;
			q->unique = 0;

			q->skip_compare = 1;

			/* TODO: these have to be moved. Why? */
			dup_no_of_files++;
			can_save += q->size;

			q = q->hash_next;
		}

		p = p->hash_next;
	}

	/* Entries that stayed unique propagate 'no dup possible' up to their parents */
	q = a;
	while (q != b->hash_next) {
		if (q->unique == 1)
			file_mark_up_no_dup_possible(q);
		q = q->hash_next;
	}

	return 0;
}
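
/*
 * Example: when compare_file_range() gets same-size files f1, f2 and f3 with
 * identical content, it builds the chain f1->duplicates = f2,
 * f2->duplicates = f3; f2 and f3 get skip_compare = 1 so they are never used
 * as the head of another comparison, and all three end up with unique = 0.
 */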

__hot int file_find_dups(void)
{
	int err;
	struct file_node *dups, *q, *first, *last;
	unsigned int hash;
	unsigned long long size;

	for (hash = 0; hash < HASH_SIZE; hash++) {
		if (file_info[hash] == NULL)
			continue;

		if (debug)
			fprintf(stderr, "file_find_dups[%u]...\n", hash);

		/* We need at least 2 nodes to be able to have dups */
		if (file_info[hash]->hash_next == NULL) {
			file_mark_up_no_dup_possible(file_info[hash]);
			continue;
		}

		/* Now, we find groups of same size files */
		first = file_info[hash];
		while (1) {
			/*fprintf(stderr, "FIRST [%s]\n", first->name);*/
			last = first;
			size = first->size;

			q = first->hash_next;
			while (q && (q->size == size)) {
				last = q;
				q = q->hash_next;
				/*fprintf(stderr, "\tLAST [%s]\n", last->name);*/
			}
			/*fprintf(stderr, "first=[%s] last=[%s]\n", first->name, last->name);*/

			err = compare_file_range(first, last);
			if (err == -1)
				return -1;

			if (last->hash_next == NULL)
				break;

			first = last->hash_next;
		}

		if (debug > 2) {
			if (debug)
				fprintf(stderr, "[*] Dump chain %u start:\n", hash);
			q = file_info[hash];
			while (q) {
				fprintf(stderr, "%s:\n", q->name);
				dups = q->duplicates;
				while (dups) {
					if (debug)
						fprintf(stderr, "\t%s\n", dups->name);
					dups = dups->duplicates;
				}
				q = q->hash_next;
			}
			if (debug)
				fprintf(stderr, "[*] Dump chain %u stop\n", hash);
		}
	}

	return 0;
}

/*
 * Sorting helper
 * a0 and b0 are pointers!
 */
__hot static int file_compare_hashes(const void *a0, const void *b0)
{
	int ret;
	const struct file_node *a = * (const struct file_node **) a0;
	const struct file_node *b = * (const struct file_node **) b0;

	ret = memcmp(a->sha1[1], b->sha1[1], SHA_DIGEST_LENGTH);
	if (ret != 0)
		return ret;

	if (a->parent->level > b->parent->level)
		return -1;
	if (a->parent->level < b->parent->level)
		return 1;

	return strcmp(a->name, b->name);
}

/*
 * Sorts the dir's files by sha1_full and returns the SHA-1 of the file list
 * in @hash. We need to sort because two dirs with the same content may list
 * their files in a different order (the names may differ while the content
 * stays the same).
 * TODO: Shouldn't we test if a file is unique=1 and skip the checksum of the dir?
 * TODO: And propagate this flag 'up'?
 * We return the file names hash in @fn.
 */
static int dir_files_hash(unsigned char *hash, unsigned char *fn,
	struct dir_node *d)
{
	struct file_node *p;
	struct file_node **u;
	unsigned long i, mem;
	SHA_CTX c, fnh;

	if (d->files == NULL) {
		memset(hash, 0, SHA_DIGEST_LENGTH);
		memset(fn, 0, SHA_DIGEST_LENGTH);
		return 0;
	}

	if (d->no_of_files == 0) {
		fprintf(stderr, "Dir %s has a valid d->files=%p pointer, but has 0 number of files!\n",
			d->name, d->files);
		abort();
	}

	mem = d->no_of_files * sizeof(struct file_node *);
	u = (struct file_node **) xmalloc(mem);
	if (u == NULL)
		return -1;

	p = d->files;
	i = 0;
	while (p) {
		u[i] = p;
		p = p->next;
		i++;
	}

	qsort(u, d->no_of_files, sizeof(struct file_node *), file_compare_hashes);

	SHA1_Init(&c);
	SHA1_Init(&fnh);

	i = 0;
	while (i < d->no_of_files) {
		char *base_name;

		/* hash the full-file digest, the same key the sort above used */
		SHA1_Update(&c, u[i]->sha1[COMPUTE_FULL], SHA_DIGEST_LENGTH);

		base_name = basename(u[i]->name);
		SHA1_Update(&fnh, base_name, strlen(base_name));

		d->total_size += u[i]->size;
		i++;
	}

	SHA1_Final(hash, &c);
	SHA1_Final(fn, &fnh);

	free(u);

	if (debug > 2) {
		char shash[33], sfn[33];
		sha1_dump(shash, hash, 6);
		sha1_dump(sfn, fn, 6);
		fprintf(stderr, "	DIR HASH[%s]=%s/%s\n",
			d->name, shash, sfn);
	}

	return 0;
}

/*
 * Sorting helper for dirs
 * a0 and b0 are pointers!
 */
static int dir_compare_hashes(const void *a0, const void *b0)
{
	int ret;
	const struct dir_node *a = * (const struct dir_node **) a0;
	const struct dir_node *b = * (const struct dir_node **) b0;

	ret = memcmp(a->sha1, b->sha1, SHA_DIGEST_LENGTH);
	if (ret != 0)
		return ret;

	return strcmp(a->name, b->name);
}

/*
 * Sorting helper for dirs
 * a0 and b0 are pointers!
 */
static int dir_compare_levels(const void *a0, const void *b0)
{
	const struct dir_node *a = * (const struct dir_node **) a0;
	const struct dir_node *b = * (const struct dir_node **) b0;

	if (a->level > b->level)
		return 1;
	if (a->level < b->level)
		return -1;

	return strcmp(a->name, b->name);
}

/*
 * Sorts the list of subdirs by hash
 * TODO: transform this into a generic function
 * TODO: in 'dir_add' function, store the number of subdirs to be easier to alloc memory
 * Pay attention! @d is a pointer to the head of the sibling list, and it may be updated.
 * TODO: pass the id of the sorting function as parameter.
 */
static int dir_sort_subdirs_by_hash(struct dir_node **d)
{
	struct dir_node *q, **p, *prev;
	unsigned int nr, i;

	if (!*d)
		return 0;

	/* Compute the number of subdirs */
	nr = 0;
	q = *d;
	while (q) {
		nr++;
		q = q->next_sibling;
	}

	p = xmalloc(nr * sizeof(struct dir_node *));
	if (!p) {
		fprintf(stderr, "Cannot alloc mem for dir list (%lu bytes)!\n",
			nr * sizeof(struct dir_node *));
		return -1;
	}

	i = 0;
	q = *d;
	while (q) {
		p[i++] = q;
		q = q->next_sibling;
	}

	qsort(p, nr, sizeof(struct dir_node *), dir_compare_hashes);

	prev = NULL;
	for (i = 0; i < nr; i++) {
		if (prev == NULL)
			*d = p[i];
		else
			prev->next_sibling = p[i];
		prev = p[i];
	}
	prev->next_sibling = NULL;

	free(p);

	return 0;
}

/*
 * Builds the hash of a directory (and everything below it).
 * Returns the number of dirs that may still have duplicates, or -1 on error.
 */
static long long dir_build_hash(struct dir_node *d)
{
	struct dir_node *subdir;
	SHA_CTX c, fnh;
	unsigned char files_hash[SHA_DIGEST_LENGTH];
	unsigned char file_names_sha1[SHA_DIGEST_LENGTH];
	int err;
	long long no_of_possible_dirs = 0;

	if (debug)
		fprintf(stderr, "DEBUG: %s [%s] no_dup_possible=%u\n",
			__func__, d->name, d->no_dup_possible);

	/* empty dir? */
	if ((d->files == NULL) && (d->subdirs == NULL)) {
		d->no_dup_possible = 1;
		return 0;
	}

	/* We check current dir first. */
	if (d->no_dup_possible == 0)
		no_of_possible_dirs++;

	/* Order files by hash to compute correct hashes */
	err = dir_files_hash(files_hash, file_names_sha1, d);
	if (err != 0)
		return -1;

	/* Compute hashes */
	subdir = d->subdirs;
	while (subdir) {
		long long ret;

		ret = dir_build_hash(subdir);
		if (ret == -1)
			return -1;

		no_of_possible_dirs += ret;

		d->total_size += subdir->total_size;

		subdir = subdir->next_sibling;
	}

	/* Sorting subdirs by sha1 */
	dir_sort_subdirs_by_hash(&d->subdirs);

	SHA1_Init(&c);
	SHA1_Update(&c, files_hash, SHA_DIGEST_LENGTH);
	SHA1_Init(&fnh);
	SHA1_Update(&fnh, file_names_sha1, SHA_DIGEST_LENGTH);

	/* At the same time, we build hash of file names */
	subdir = d->subdirs;
	while (subdir) {
		char *base_name;

		SHA1_Update(&c, subdir->sha1, SHA_DIGEST_LENGTH);

		base_name = basename(subdir->name);
		SHA1_Update(&fnh, base_name, strlen(base_name));
		SHA1_Update(&fnh, subdir->file_names_sha1, SHA_DIGEST_LENGTH);

		subdir = subdir->next_sibling;
	}

	SHA1_Final(d->sha1, &c);
	SHA1_Final(d->file_names_sha1, &fnh);

	return no_of_possible_dirs;
}
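
/*
 * In short: d->sha1 = SHA1(files_hash || sha1 of each subdir), with the
 * subdirs first sorted by their own hash, so two dirs with identical content
 * hash the same no matter what order the tree walk produced them in.
 * d->file_names_sha1 chains the basenames the same way; dir_dump_duplicates()
 * uses it to flag pairs whose contents match but whose names do not ('M').
 */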

/*
 * Processes a range of dirs with the same hash: links u[first..last] into a
 * ->hash_next chain headed at u[first]
 */
static void dir_process_range(struct dir_node **u, const unsigned long long first,
	const unsigned long long last)
{
	unsigned long long j;
	struct dir_node *tail;

	tail = u[first];
	for (j = first + 1; j <= last; j++) {
		tail->hash_next = u[j];
		tail = u[j];
	}
}

/*
 * Recursively populates 'u' with candidate dirs (those with no_dup_possible == 0).
 * Returns the new position.
 */
static unsigned long long dir_find_dups_populate_list(struct dir_node **u,
	const unsigned long long pos, struct dir_node *d)
{
	struct dir_node *subdir;
	unsigned long long new_pos;

	new_pos = pos;

	/* We check current dir first. */
	if (d->no_dup_possible == 0) {
		u[new_pos] = d;
		new_pos++;
	}

	subdir = d->subdirs;
	while (subdir) {
		new_pos = dir_find_dups_populate_list(u, new_pos, subdir);
		subdir = subdir->next_sibling;
	}

	return new_pos;
}

/*
 * Finds dir duplicates (we only mark them here).
 * We have to sort the dirs by hash so that equal ones become adjacent.
 * We ignore all-zero-hash dirs, because they contain files that are unique.
 * TODO: the name does not reflect what the function does.
 */
int dir_find_dups(void)
{
	unsigned long long mem, i, j, first, last;
	char dump[SHA_DIGEST_LENGTH * 2 + 1];

	dir_chain_len = 0;

	if (debug)
		fprintf(stderr, "[*] dir_find_dups...\n");

	for (i = 0; i < dir_info_count; i++) {
		long long err;

		err = dir_build_hash(dir_info[i]);
		if (err == -1)
			return -1;

		dir_chain_len += (unsigned long long) err;
	}
	if (dir_chain_len == 0)
		return 0;

	/* Allocate an array that we will pass to qsort */
	mem = dir_chain_len * sizeof(struct dir_node *);
	dir_chain = (struct dir_node **) xmalloc(mem);
	if (dir_chain == NULL) {
		fprintf(stderr, "Cannot alloc mem for dir list (%llu bytes)!\n",
			mem);
		return -1;
	}

	j = 0;
	for (i = 0; i < dir_info_count; i++) {
		struct dir_node *d;

		d = dir_info[i];
		j = dir_find_dups_populate_list(dir_chain, j, d);

		/* stop searching if we found all possible dirs */
		if (j == dir_chain_len)
			break;
	}

	if (debug > 2) {
		fprintf(stderr, "DEBUG: dump before dir qsort:\n");
		for (i = 0; i < dir_chain_len; i++)
			fprintf(stderr, "\t%s\n",
				dir_chain[i]->name);
		fprintf(stderr, "\n");
	}

	/* Sort by hash (ties broken by name) */
	qsort(dir_chain, dir_chain_len, sizeof(struct dir_node *),
		dir_compare_hashes);

	if (debug > 2) {
		fprintf(stderr, "DEBUG: dump after dir qsort:\n");
		for (i = 0; i < dir_chain_len; i++) {
			sha1_dump(dump, dir_chain[i]->sha1, 0);
			fprintf(stderr, "\t%s\tlevel %d\t%s\n",
				dump, dir_chain[i]->level, dir_chain[i]->name);
		}
	}

	first = 0;
	last = 0;
	for (i = 1; i < dir_chain_len; i++) {
		if (memcmp(dir_chain[first]->sha1, dir_chain[i]->sha1, SHA_DIGEST_LENGTH) == 0) {
			/* We have the same hash */
			dup_no_of_dirs++;
			last = i;
			continue;
		}

		/* We have same hash in first..last */
		dir_process_range(dir_chain, first, last);

		/* Switch to next range */
		first = i;
		last = i;
	}
	dir_process_range(dir_chain, first, last);

	/* Sort by level */
	qsort(dir_chain, dir_chain_len, sizeof(struct dir_node *),
		dir_compare_levels);

	if (debug) {
		fprintf(stderr, "DEBUG: dump after dir qsort by level:\n");
		for (i = 0; i < dir_chain_len; i++) {
			sha1_dump(dump, dir_chain[i]->sha1, 0);
			fprintf(stderr, "\t%s\tlevel %d\t%s\n",
				dump, dir_chain[i]->level, dir_chain[i]->name);
		}
	}
	return 0;
}

/*
 * Nicely dumps the duplicated dirs.
 * Output format: DIR <total_size> <flags> <left dir> <right dir>, fields
 * separated by tabs (or NULs when @zero is set); flags[0] is 'M' when the
 * two dirs' file-name hashes differ.
 */
__cold static void dir_dump_duplicates(struct dir_node *d,
	const unsigned long long min_size, const unsigned int zero)
{
	struct dir_node *p;
	char sep, final;
	char flags[9];

	if (debug)
		fprintf(stderr, "[*] %s(%s)\n", __func__, d->name);

	if (d->no_dup_possible == 1) {
		if (debug)
			fprintf(stderr, "\tignore dir because no_dup_possible is set\n");
		return;
	}

	if (d->do_not_dump == 1) {
		if (debug)
			fprintf(stderr, "\tignore dir because no_dup_dump is set\n");
		return;
	}

	if (d->total_size < min_size) {
		if (debug)
			fprintf(stderr, "\tignore duplicate dir because size < min\n");
		return;
	}

	if (zero) {
		sep = '\0';
		final = '\0';
	} else {
		sep = '\t';
		final = '\n';
	}

	p = d->hash_next;
	while (p) {
		if (p->do_not_dump == 1) {
			/* We already dumped that dir. */
			if (debug)
				fprintf(stderr, "DEBUG: ignore dir [%s] because"
					" do_not_dump=%hhu or left=%hhu\n",
					p->name, p->do_not_dump, p->left);
			p = p->hash_next;
			continue;
		}

		if (debug)
			fprintf(stderr, "DEBUG: %s: Found a right dir for [%s]: %s\n",
				__func__, d->name, p->name);

		if (debug)
			fprintf(stderr, "DEBUG: %s: set 'left' on [%s]\n",
				__func__, d->name);
		dir_mark_left(d);

		if (debug)
			fprintf(stderr, "DEBUG: %s:"
				" set do_not_dump on right [%s]\n",
				__func__, p->name);
		dir_mark_down_do_not_dump(p);

		memset(flags, '-', sizeof(flags) - 1);
		flags[sizeof(flags) - 1] = '\0';

		if (memcmp(d->file_names_sha1, p->file_names_sha1, SHA_DIGEST_LENGTH) != 0)
			flags[0] = 'M';

		if (debug)
			fprintf(stderr, "DEBUG: DIR %llu %s %s %s",
				d->total_size, flags, d->name, p->name);
		fprintf(out, "DIR%c%llu%c%s%c%s%c%s%c",
			sep, d->total_size, sep, flags, sep, d->name,
			sep, p->name, final);
		p = p->hash_next;
	}
}

/*
 * Nicely dumps the duplicated files.
 * Output format: FILE <size> <left file> <right file>, fields separated by
 * tabs (or NULs when @zero is set).
 */
__cold static void file_dump_duplicates(struct file_node *f,
	const unsigned long long min_size, const unsigned int zero)
{
	struct file_node *p, *first_left;
	char sep, final;

	if (debug)
		fprintf(stderr, "[*] file_dump_duplicates(%s)\n", f->name);

	if (f->duplicates == NULL) {
		if (debug)
			fprintf(stderr, "\tignore duplicate file because ->duplicates is NULL\n");
		return;
	}

	if (f->no_dup_possible == 1) {
		if (debug)
			fprintf(stderr, "\tignore duplicate file because no_dup_possible=1\n");
		return;
	}

	if (f->do_not_dump == 1) {
		if (debug)
			fprintf(stderr, "\tignore duplicate file because do_not_dump=1\n");
		return;
	}

	if (f->size < min_size) {
		if (debug)
			fprintf(stderr, "\tignore duplicate file because size < min\n");
		return;
	}

	/* first, search for the first left one */
	first_left = f;
	if (first_left->left == 0) {
		p = f->duplicates;
		while (p) {
			if (p->left == 1) {
				first_left = p;
				break;
			}
			p = p->duplicates;
		}
	}
	if (debug)
		fprintf(stderr, "first_left = [%s]\n", first_left->name);

	if (zero) {
		sep = '\0';
		final = '\0';
	} else {
		sep = '\t';
		final = '\n';
	}

	/* now, dump */
	p = f;
	while (p) {
		/* We do not want to dump files already dumped when we did
		 * it for dirs. */
		if (p->do_not_dump == 1) {
			if (debug)
				fprintf(stderr, "\t\tignore duplicate file in"
					" chain because do_not_dump=1 [%s]\n",
					p->name);
			p = p->duplicates;
			continue;
		}

		if (p == first_left) {
			p = p->duplicates;
			continue;
		}

		if (debug)
			fprintf(stderr, "Because we will dump [%s] as a left"
				", set left/do_not_dump=1\n",
				first_left->name);
		first_left->left = 1;
		first_left->do_not_dump = 1;
		/* I am not sure the above is correct! It is correct,
		 * but I think it is not needed; we will not have another chance
		 * to visit first_left. ALL "same hash files" are already here.
		 * Maybe something related to dirs? */

		/* Prevent this file to appear again in the dump */
		/* TODO: I do not think it can happen */
		if (debug)
			fprintf(stderr, "Because [%s] is a right"
				", set do_not_dump=1\n", p->name);
		p->do_not_dump = 1;

		if (debug)
			fprintf(stderr, "DEBUG: FILE %llu %s %s",
				first_left->size, first_left->name, p->name);
		fprintf(out, "FILE%c%llu%c%s%c%s%c",
			sep, first_left->size, sep, first_left->name,
			sep, p->name, final);
		p = p->duplicates;
	}
}

/*
 * Walks everything collected so far and dumps the duplicates (dirs first,
 * then the remaining files).
 * @min_size - do not dump files/dirs smaller than min_size
 * @zero - use NUL instead of tab/newline as the field/record separator
 */
__cold void dump_duplicates(const unsigned long long min_size, const unsigned int zero)
{
	unsigned int i;
	struct file_node *f;
	unsigned int hash;

	if (debug)
		fprintf(stderr, "[*] Dump duplicated dirs...\n");
	for (i = 0; i < dir_chain_len; i++) {
		struct dir_node *d;

		if (debug)
			fprintf(stderr, "\tdump_duplicates[%u]...\n", i);
		d = dir_chain[i];
		dir_dump_duplicates(d, min_size, zero);
	}
	free(dir_chain);

	/* Now, we dump remaining files */
	if (debug)
		fprintf(stderr, "[*] Dump duplicated files...\n");
	for (hash = 0; hash < HASH_SIZE; hash++) {
		if (file_info[hash] == NULL)
			continue;

		if (debug)
			fprintf(stderr, "[*] Dump duplicates in hash %u\n", hash);

		f = file_info[hash];
		while (f) {
			file_dump_duplicates(f, min_size, zero);
			f = f->hash_next;
		}
	}
}

