List of commits:
Subject Hash Author Date (UTC)
Several fixes 2909b1ba2e99929e775ddfea5f4894c50694a638 Catalin(ux) M. BOIE 2012-06-19 13:08:50
First version 3d7935d9b8a91694fe8213998ce4d3910348d6ef Catalin(ux) M. BOIE 2012-05-06 19:40:40
Commit 2909b1ba2e99929e775ddfea5f4894c50694a638 - Several fixes
Author: Catalin(ux) M. BOIE
Author date (UTC): 2012-06-19 13:08
Committer name: Catalin(ux) M. BOIE
Committer date (UTC): 2012-06-19 13:08
Parent(s): 3d7935d9b8a91694fe8213998ce4d3910348d6ef
Signing key:
Tree: b6ca189f2b650d1068d4e94d296559b91b7b5923
File Lines added Lines deleted
.gitignore 2 0
TODO 25 1
dupdump.c 14 17
store.c 531 78
store.h 19 9
File .gitignore changed (mode: 100644) (index 66c01b4..7b8473d)
1 1 *.o *.o
2 2 dupdump dupdump
3 3 vgcore* vgcore*
4 Makefile
5 *.spec
File TODO changed (mode: 100644) (index c53b196..8a84376)
1 [ ] --min-size parameter
1 [X] Mark files as NOT_POSSIBLE_DUPLICATES
2 [X] Do the compare of size and hashes
3 [X] If match, mark as POSSIBLE_DUPLICATES
4 [X] For the rest, propagate the flag to the parent dirs.
5
6 [ ] compute dir hashes by sorting the hashes of dirs and files; maybe compute files and dirs separately
7 [ ] Sort files and subdirs by hash - do a function
8
9 [ ] --min-size parameter and --max-size
2 10 [ ] Use more threads [ ] Use more threads
3 11 [ ] Mark the directories that could be identical, then scan only those. [ ] Mark the directories that could be identical, then scan only those.
4 12 [ ] First, we build the directory tree, then, we compute sha1 where needed [ ] First, we build the directory tree, then, we compute sha1 where needed
 
8 16 [ ] Order input directories by len to avoid building a strange tree. Hm. [ ] Order input directories by len to avoid building a strange tree. Hm.
9 17 Probably does not work. Probably does not work.
10 18 [ ] We should order by mtime, older one being the first shown. [ ] We should order by mtime, older one being the first shown.
19 [ ]
20
21
22 After file_find_dups, unique dirs are marked as such.
23 The problems now are:
24 - how we detect equal dirs
25 - what about the case when we run "dupdump ./2/3 /1/2": we should somehow
26 sort the paths so that /1/2 comes first, because it starts deeper.
27
28 [ ] Strange case:
29 dir1
30 dir2
31 dir3=dir4
32 dir4=dir3
33
34 [ ] We could throw away unique files.
11 35 [ ] [ ]
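The checked items above describe the core marking pass: a file known to be unique propagates that fact upward, since a dir containing a unique file cannot equal any other dir. A minimal standalone sketch of that propagation (simplified, hypothetical types; the commit's real helper is dir_mark_no_dup_possible in store.c below):

    #include <stddef.h>

    struct dir {
        struct dir *parent;
        unsigned int no_dup_possible:1;
    };

    /* Walk up the tree; stop once an ancestor is already marked, because
     * its own ancestors were marked on an earlier walk. */
    static void mark_no_dup(struct dir *d)
    {
        while (d != NULL && !d->no_dup_possible) {
            d->no_dup_possible = 1;
            d = d->parent;
        }
    }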
File dupdump.c changed (mode: 100644) (index 026822d..f10b7b7)
... ... static int callback(const char *fpath, const struct stat *s, int tflag,
42 42
43 43 /* Add dir */ /* Add dir */
44 44 if (tflag == FTW_D) { if (tflag == FTW_D) {
45 err = add_dir(fpath, s, ftwbuf->level);
45 err = dir_add(fpath, s, ftwbuf->level);
46 46 if (err != 0) { if (err != 0) {
47 47 fprintf(stderr, "ERROR: Probably out of memory!\n"); fprintf(stderr, "ERROR: Probably out of memory!\n");
48 48 return FTW_STOP; return FTW_STOP;
 
... ... static int callback(const char *fpath, const struct stat *s, int tflag,
51 51 return FTW_CONTINUE; return FTW_CONTINUE;
52 52 } }
53 53
54 /* Ignore too small size files */
55 if (s->st_size < min_size) {
56 if (verbose >= 1)
57 fprintf(stderr, "Ignore file smaller than %llu bytes.\n",
58 min_size);
59 /* Because we can have 2 dirs with 2 big files that are the same,
60 * but the rest of the dir is not the same, we do not allow
61 * 'dir' to be considered in a dir-dir duplicate scenario.
62 */
63 mark_dir_as_incomplete(ftwbuf->level);
64 return FTW_CONTINUE;
65 }
66
67 err = add_file(fpath, s, ftwbuf->level);
54 err = file_add(fpath, s, ftwbuf->level);
68 55 if (err != 0) { if (err != 0) {
69 56 fprintf(stderr, "ERROR: Cannot add file!\n"); fprintf(stderr, "ERROR: Cannot add file!\n");
70 57 return FTW_STOP; return FTW_STOP;
 
... ... int main(int argc, char *argv[])
102 89 if (verbose >= 2) if (verbose >= 2)
103 90 dump_files(); dump_files();
104 91
105 /* Now, check for duplicates */
106 err = find_file_dups();
92 /* Check for file duplicates */
93 err = file_find_dups();
107 94 if (err != 0) { if (err != 0) {
108 95 fprintf(stderr, "Error comparing files!\n"); fprintf(stderr, "Error comparing files!\n");
109 96 return 1; return 1;
110 97 } }
111 98
99 /* Check for dir duplicates */
100 err = dir_find_dups();
101 if (err != 0) {
102 fprintf(stderr, "Error comparing dirs!\n");
103 return 1;
104 }
105
112 106 dump_dirs(); dump_dirs();
113 107
108 fprintf(stderr, "\nDUMP DUPLICATES...\n\n");
109 dump_duplicates(min_size);
110
114 111 dump_stats(); dump_stats();
115 112
116 113 return 0; return 0;
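With this commit, main() no longer prints duplicates as matches are found; matching and reporting become separate phases. A paraphrased sketch of the resulting flow (error handling elided; not verbatim from dupdump.c):

    file_find_dups();           /* link same-content files via ->duplicates    */
    dir_find_dups();            /* hash dirs, chain equal ones via ->hash_next */
    dump_dirs();                /* debug dump of the whole tree                */
    dump_duplicates(min_size);  /* report DIR and FILE pairs in one pass       */
    dump_stats();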
File store.c changed (mode: 100644) (index d2c9a18..c4de925)
14 14 #include "store.h" #include "store.h"
15 15
16 16
17 #define FLAGS_DUMPED (1 << 0)
18
19 #define FLAGS_DIR_NOT_FOR_DUP (1 << 0)
20
21
22 17 #define DEV_INO_HASH_SIZE 4096 #define DEV_INO_HASH_SIZE 4096
23 18 #define HASH_SIZE 512 #define HASH_SIZE 512
24 19 #define MAX_INPUT_DIRS 32 #define MAX_INPUT_DIRS 32
 
... ... static struct dir_node *dir_current[MAX_DEPTH];
43 38 static unsigned char sha1_zero[SHA_DIGEST_LENGTH]; static unsigned char sha1_zero[SHA_DIGEST_LENGTH];
44 39 static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE]; static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE];
45 40
41 /* ############### Memory functions ############### */
42 static void *xmalloc(size_t size)
43 {
44 void *p;
45
46 p = malloc(size);
47 if (p) {
48 mem_allocated += size;
49 mem_calls++;
50 }
51
52 return p;
53 }
54
46 55 /* ############### SHA-1 functions ############### */ /* ############### SHA-1 functions ############### */
47 56
48 57 void sha1_dump(char *out, const unsigned char *b, const unsigned int max0) void sha1_dump(char *out, const unsigned char *b, const unsigned int max0)
 
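xmalloc centralizes the allocation accounting that alloc_file_node and alloc_dir_node previously did by hand. The commit adds no matching free-side accounting; if one were wanted, a sketch might look like this (hypothetical xfree, not in the source; callers would have to pass the size back, since malloc block sizes are not portably queryable):

    static void xfree(void *p, const size_t size)
    {
        if (p == NULL)
            return;

        free(p);
        mem_allocated -= size;  /* mirror xmalloc's accounting */
    }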
... ... int dev_ino_seen(const unsigned int type, const dev_t dev, const ino_t ino)
159 168 q = q->next; q = q->next;
160 169 } }
161 170
162 q = (struct dev_ino *) malloc(sizeof(struct dev_ino));
171 q = (struct dev_ino *) xmalloc(sizeof(struct dev_ino));
163 172 if (q == NULL) if (q == NULL)
164 173 return -1; return -1;
165 174
 
... ... void dump_stats(void)
182 191 fprintf(stderr, "Number of 64K SHA-1 computed: %llu.\n", sha1_first_computed); fprintf(stderr, "Number of 64K SHA-1 computed: %llu.\n", sha1_first_computed);
183 192 fprintf(stderr, "Number of full SHA-1 computed: %llu.\n", sha1_full_computed); fprintf(stderr, "Number of full SHA-1 computed: %llu.\n", sha1_full_computed);
184 193 fprintf(stderr, "Bytes that could be saved: %llu.\n", can_save); fprintf(stderr, "Bytes that could be saved: %llu.\n", can_save);
185 fprintf(stderr, "Number of duplicates: %llu.\n", dup_no_of_files);
186 fprintf(stderr, "Number of duplicates: %llu.\n", dup_no_of_dirs);
194 fprintf(stderr, "Number of duplicated files: %llu.\n", dup_no_of_files);
195 fprintf(stderr, "Number of duplicated dirs: %llu.\n", dup_no_of_dirs);
187 196 fprintf(stderr, "Number of same dev/inode (file): %llu.\n", no_of_same_inode_file); fprintf(stderr, "Number of same dev/inode (file): %llu.\n", no_of_same_inode_file);
188 197 fprintf(stderr, "Number of same dev/inode (dir): %llu.\n", no_of_same_inode_dir); fprintf(stderr, "Number of same dev/inode (dir): %llu.\n", no_of_same_inode_dir);
189 fprintf(stderr, "Memory allocated: %llu in %llu calls.\n",
198 fprintf(stderr, "Memory allocated: %llu bytes in %llu call(s).\n",
190 199 mem_allocated, mem_calls); mem_allocated, mem_calls);
191 200 } }
192 201
 
... ... static struct file_node *alloc_file_node(void)
196 205 unsigned int mem; unsigned int mem;
197 206
198 207 mem = sizeof(struct file_node); mem = sizeof(struct file_node);
199 q = (struct file_node *) malloc(mem);
208 q = (struct file_node *) xmalloc(mem);
200 209 if (q == NULL) { if (q == NULL) {
201 210 fprintf(stderr, "ERROR: Cannot alloc memory for a file node!\n"); fprintf(stderr, "ERROR: Cannot alloc memory for a file node!\n");
202 211 return NULL; return NULL;
203 212 } }
204 213 memset(q, 0, sizeof(struct file_node)); memset(q, 0, sizeof(struct file_node));
205 214
206 mem_allocated += mem;
207 mem_calls++;
208
209 215 return q; return q;
210 216 } }
211 217
212 int add_file(const char *file, const struct stat *s,
218 int file_add(const char *file, const struct stat *s,
213 219 const unsigned int level) const unsigned int level)
214 220 { {
215 221 struct dir_node *parent; struct dir_node *parent;
 
... ... int add_file(const char *file, const struct stat *s,
253 259 q->name = strdup(file); q->name = strdup(file);
254 260 memset(&q->sha1_first, 0, SHA_DIGEST_LENGTH); memset(&q->sha1_first, 0, SHA_DIGEST_LENGTH);
255 261 memset(&q->sha1_full, 0, SHA_DIGEST_LENGTH); memset(&q->sha1_full, 0, SHA_DIGEST_LENGTH);
256 q->flags = 0;
257 262 q->dev = s->st_dev; q->dev = s->st_dev;
258 263 q->ino = s->st_ino; q->ino = s->st_ino;
259 264
 
... ... int add_file(const char *file, const struct stat *s,
263 268 parent->files = q; parent->files = q;
264 269 q->parent = parent; q->parent = parent;
265 270
271 parent->no_of_files++;
266 272 no_of_files++; no_of_files++;
267 273
268 274 return 0; return 0;
 
... ... static struct dir_node *alloc_dir_node(void)
274 280 unsigned int mem; unsigned int mem;
275 281
276 282 mem = sizeof(struct dir_node); mem = sizeof(struct dir_node);
277 q = (struct dir_node *) malloc(mem);
283 q = (struct dir_node *) xmalloc(mem);
278 284 if (q == NULL) { if (q == NULL) {
279 285 fprintf(stderr, "ERROR: Cannot alloc a dir node!\n"); fprintf(stderr, "ERROR: Cannot alloc a dir node!\n");
280 286 return NULL; return NULL;
281 287 } }
282 288 memset(q, 0, mem); memset(q, 0, mem);
283 289
284 mem_allocated += mem;
285 mem_calls++;
286
287 290 return q; return q;
288 291 } }
289 292
290 293 /* /*
291 294 * Add a dir to the structure * Add a dir to the structure
292 295 */ */
293 int add_dir(const char *dir, const struct stat *s, const unsigned int level)
296 int dir_add(const char *dir, const struct stat *s, const unsigned int level)
294 297 { {
295 298 struct dir_node *q, *parent; struct dir_node *q, *parent;
296 299
 
... ... int add_dir(const char *dir, const struct stat *s, const unsigned int level)
308 311 q->name = strdup(dir); q->name = strdup(dir);
309 312 q->dev = s->st_dev; q->dev = s->st_dev;
310 313 q->ino = s->st_ino; q->ino = s->st_ino;
314 q->level = level;
311 315
312 316 no_of_dirs++; no_of_dirs++;
313 317
 
... ... int add_dir(const char *dir, const struct stat *s, const unsigned int level)
329 333 fprintf(stderr, "PARENT is %p, subdirs is %p, q=%p set parent->subdirs to q\n", fprintf(stderr, "PARENT is %p, subdirs is %p, q=%p set parent->subdirs to q\n",
330 334 parent, parent->subdirs, q); parent, parent->subdirs, q);
331 335 */ */
336 q->parent = parent;
332 337 q->next_sibling = parent->subdirs; q->next_sibling = parent->subdirs;
333 338 parent->subdirs = q; parent->subdirs = q;
334 339 } }
 
... ... int add_dir(const char *dir, const struct stat *s, const unsigned int level)
340 345 return 0; return 0;
341 346 } }
342 347
343 static void dump_file_node(const struct file_node *q, const unsigned int level)
348 static void file_dump_node(const struct file_node *q, const unsigned int level)
344 349 { {
345 350 char sha1_first[SHA_DIGEST_LENGTH * 2 + 1]; char sha1_first[SHA_DIGEST_LENGTH * 2 + 1];
346 351 char sha1_full[SHA_DIGEST_LENGTH * 2 + 1]; char sha1_full[SHA_DIGEST_LENGTH * 2 + 1];
347 352 char prefix[128]; char prefix[128];
348 353
349 memset(prefix, '\t', level);
350 prefix[level] = '\0';
354 memset(prefix, ' ', level * 2);
355 prefix[level * 2] = '\0';
351 356
352 357 sha1_dump(sha1_first, q->sha1_first, 8); sha1_dump(sha1_first, q->sha1_first, 8);
353 358 sha1_dump(sha1_full, q->sha1_full, 8); sha1_dump(sha1_full, q->sha1_full, 8);
354 359 fprintf(stderr, "%sF '%s' node=%p parent=%p next=%p hash_next=%p size=%llu" fprintf(stderr, "%sF '%s' node=%p parent=%p next=%p hash_next=%p size=%llu"
355 " dev=%lu inode=%llu sha1=%s/%s\n",
360 " dev=%lu inode=%llu no_dup_possible=%u do_not_dump=%u"
361 " duplicates=%p left=%u sha1=%s/%s\n",
356 362 prefix, q->name, q, q->parent, q->next, q->hash_next, q->size, prefix, q->name, q, q->parent, q->next, q->hash_next, q->size,
357 363 (unsigned long) q->dev, (unsigned long long) q->ino, (unsigned long) q->dev, (unsigned long long) q->ino,
358 sha1_first, sha1_full);
364 q->no_dup_possible, q->do_not_dump,
365 q->duplicates, q->left, sha1_first, sha1_full);
359 366 } }
360 367
361 368 void dump_files(void) void dump_files(void)
 
... ... void dump_files(void)
363 370 struct file_node *q; struct file_node *q;
364 371 unsigned int hash; unsigned int hash;
365 372
366 fprintf(stderr, "Dumping internal data...\n");
373 fprintf(stderr, "Dumping internal data - START...\n");
367 374 for (hash = 0; hash < HASH_SIZE; hash++) { for (hash = 0; hash < HASH_SIZE; hash++) {
368 375 if (file_info[hash] == NULL) if (file_info[hash] == NULL)
369 376 continue; continue;
 
... ... void dump_files(void)
371 378 fprintf(stderr, "info[%05d]:\n", hash); fprintf(stderr, "info[%05d]:\n", hash);
372 379 q = file_info[hash]; q = file_info[hash];
373 380 while (q) { while (q) {
374 dump_file_node(q, 0);
375 q = q->next;
381 file_dump_node(q, 0);
382 q = q->hash_next;
376 383 } }
377 384 } }
385 fprintf(stderr, "Dumping internal data - STOP...\n");
378 386 } }
379 387
380 void dump_dir_node(const struct dir_node *d, const unsigned int level)
388 void dir_dump_node(const struct dir_node *d, const unsigned int level)
381 389 { {
382 390 char prefix[128]; char prefix[128];
383 391 struct dir_node *subdir; struct dir_node *subdir;
384 392 struct file_node *file; struct file_node *file;
393 char dump[SHA_DIGEST_LENGTH * 2 + 1];
385 394
386 /*fprintf(stderr, "dump_dir_node d=%p level=%u\n", d, level);*/
387 memset(prefix, '\t', level);
388 prefix[level] = '\0';
395 memset(prefix, ' ', (level + 1) * 2);
396 prefix[(level + 1) * 2] = '\0';
389 397
390 fprintf(stderr, "%sD '%s' L=%u d=%p subdirs=%p next_sibling=%p"
391 " files=%p:\n",
392 prefix, d->name, level, d, d->subdirs, d->next_sibling,
393 d->files);
398 sha1_dump(dump, d->sha1, 8);
399 fprintf(stderr, "%sD '%s' d=%p subdirs=%p next_sibling=%p"
400 " files=%p parent=%p no_dup_possible=%u do_not_dump=%u"
401 " level=%hu hash_next=%p left=%u sha1=%s\n",
402 prefix, d->name, d, d->subdirs, d->next_sibling,
403 d->files, d->parent, d->no_dup_possible, d->do_not_dump,
404 d->level, d->hash_next, d->left, dump);
394 405
395 406 subdir = d->subdirs; subdir = d->subdirs;
396 407 while (subdir) { while (subdir) {
397 dump_dir_node(subdir, level + 1);
408 dir_dump_node(subdir, level + 1 + 1);
398 409 subdir = subdir->next_sibling; subdir = subdir->next_sibling;
399 410 } }
400 411
401 412 file = d->files; file = d->files;
402 413 while (file) { while (file) {
403 dump_file_node(file, level + 1);
414 file_dump_node(file, level + 1 + 1);
404 415 file = file->next; file = file->next;
405 416 } }
406 417 } }
 
... ... void dump_dirs(void)
411 422
412 423 for (i = 0; i < dir_info_count; i++) { for (i = 0; i < dir_info_count; i++) {
413 424 fprintf(stderr, "dump_dirs[%u]...\n", i); fprintf(stderr, "dump_dirs[%u]...\n", i);
414 dump_dir_node(dir_info[i], 0);
425 dir_dump_node(dir_info[i], 0);
415 426 } }
416 427 } }
417 428
 
... ... static int compare_files(struct file_node *a, struct file_node *b)
445 456 return 1; return 1;
446 457 } }
447 458
459 static void dir_mark_no_dup_possible(struct dir_node *d)
460 {
461 if ((d == NULL) || (d->no_dup_possible == 1))
462 return;
463
464 d->no_dup_possible = 1;
465 dir_mark_no_dup_possible(d->parent);
466 }
467
448 468 /* /*
449 * Compare the same size files
469 * When we list a folder on the left side, we must mark the whole hierarchy
470 * under it as 'do_not_dump'. Otherwise we would dump its files, and we do not want that.
450 471 */ */
451 static int compare_file_range(struct file_node *a, struct file_node *b)
472 static void dir_mark_do_not_dump(struct dir_node *d)
452 473 { {
453 int err, q1_dumped;
454 struct file_node *q1, *q2;
474 struct file_node *file;
475 struct dir_node *subdir;
455 476
456 /*fprintf(stderr, "compare_range: %p -> %p\n", a, b);*/
477 if ((d == NULL) || (d->do_not_dump == 1))
478 return;
457 479
458 /* Single file of X size */
459 if (a->hash_next == NULL)
460 return 0;
480 d->do_not_dump = 1;
461 481
462 q1 = a;
463 while (q1 != b->hash_next) {
464 /* We avoid already matched files */
465 if (q1->flags & FLAGS_DUMPED) {
466 q1 = q1->hash_next;
467 continue;
468 }
482 subdir = d->subdirs;
483 while (subdir) {
484 dir_mark_do_not_dump(subdir);
485 subdir = subdir->next_sibling;
486 }
487
488 file = d->files;
489 while (file) {
490 file->do_not_dump = 1;
491 file = file->next;
492 }
493 }
494
495 /*
496 * If we dump a dir on the left side, the dup files must be also on the left side.
497 */
498 static void dir_mark_left(struct dir_node *d)
499 {
500 struct file_node *file;
501 struct dir_node *subdir;
502
503 if ((d == NULL) || (d->left == 1))
504 return;
505
506 d->left = 1;
507
508 subdir = d->subdirs;
509 while (subdir) {
510 dir_mark_left(subdir);
511 subdir = subdir->next_sibling;
512 }
513
514 file = d->files;
515 while (file) {
516 file->left = 1;
517 file = file->next;
518 }
519 }
520
521 static void file_mark_no_dup_possible(struct file_node *f)
522 {
523 if (f->no_dup_possible == 1)
524 return;
469 525
470 q2 = q1->hash_next;
471 if (q2 == NULL)
526 f->no_dup_possible = 1;
527 dir_mark_no_dup_possible(f->parent);
528 }
529
530 /*
531 * Mark a file to not be dumped
532 */
533 static void file_mark_do_not_dump(struct file_node *f)
534 {
535 if ((f == NULL) || (f->do_not_dump == 1))
536 return;
537
538 f->do_not_dump = 1;
539 }
540
541 /*
542 * Compare the same size files using hashes
543 * TODO: Use a better check algo!
544 */
545 static int compare_file_range(struct file_node *a, struct file_node *b)
546 {
547 int err;
548 struct file_node *q, *p;
549
550 /* Mark all as unique */
551 q = a;
552 while (q != b->hash_next) {
553 q->unique = 1;
554 q = q->hash_next;
555 }
556
557 p = a;
558 while (p != b->hash_next) {
559 q = p->hash_next;
560 if (q == NULL)
472 561 break; break;
473 562
474 q1_dumped = 0;
475 while (q2 != b->hash_next) {
476 err = compare_files(q1, q2);
563 while (q != b->hash_next) {
564 err = compare_files(p, q);
477 565 if (err == -1) if (err == -1)
478 566 return -1; return -1;
479 567
480 568 if (err != 1) { if (err != 1) {
481 q2 = q2->hash_next;
569 q = q->hash_next;
482 570 continue; continue;
483 571 } }
484 572
485 if (q1_dumped == 0) {
486 printf("%s\n", q1->name);
487 q1_dumped = 1;
488 }
573 q->duplicates = p->duplicates;
574 p->duplicates = q;
575 file_mark_do_not_dump(q);
489 576
490 /* show dup file */
491 printf("\t%s\n", q2->name);
492 q2->flags |= FLAGS_DUMPED;
577 p->unique = 0;
578 q->unique = 0;
579
580 /* TODO: these have to be moved */
493 581 dup_no_of_files++; dup_no_of_files++;
494 can_save += q2->size;
582 can_save += q->size;
495 583
496 q2 = q2->hash_next;
584 q = q->hash_next;
497 585 } }
498 586
499 q1 = q1->hash_next;
587 p = p->hash_next;
588 }
589
590 /* Entries that remained unique propagate the flag to their parents */
591 q = a;
592 while (q != b->hash_next) {
593 if (q->unique == 1)
594 file_mark_no_dup_possible(q);
595 q = q->hash_next;
500 596 } }
501 597
502 598 return 0; return 0;
503 599 } }
504 600
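Instead of printing matches on the spot (the old q1_dumped/FLAGS_DUMPED logic), compare_file_range now threads every match onto the survivor's ->duplicates list and defers output to dump_duplicates. A small sketch of walking such a chain (hypothetical helper, assuming struct file_node from store.h):

    static void print_dup_chain(const struct file_node *f)
    {
        const struct file_node *p;

        /* f is the list head; each match hangs off ->duplicates */
        for (p = f->duplicates; p != NULL; p = p->duplicates)
            printf("%s duplicates %s\n", p->name, f->name);
    }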
505 int find_file_dups(void)
601 int file_find_dups(void)
506 602 { {
507 603 int err; int err;
508 604 struct file_node *q, *first, *last; struct file_node *q, *first, *last;
 
... ... int find_file_dups(void)
513 609 if (file_info[hash] == NULL) if (file_info[hash] == NULL)
514 610 continue; continue;
515 611
516 fprintf(stderr, "find_file_dups[%u]...\n", hash);
612 fprintf(stderr, "file_find_dups[%u]...\n", hash);
517 613
518 614 /* We need at least 2 nodes */ /* We need at least 2 nodes */
519 if (file_info[hash]->hash_next == NULL)
615 if (file_info[hash]->hash_next == NULL) {
616 file_mark_no_dup_possible(file_info[hash]);
520 617 continue; continue;
618 }
521 619
522 620 first = file_info[hash]; first = file_info[hash];
523 621 while (1) { while (1) {
 
... ... int find_file_dups(void)
530 628 q = q->hash_next; q = q->hash_next;
531 629 } }
532 630
631 fprintf(stderr, "\tfirst=%p last=%p\n", first, last);
632
533 633 err = compare_file_range(first, last); err = compare_file_range(first, last);
534 634 if (err == -1) if (err == -1)
535 635 return -1; return -1;
 
... ... int find_file_dups(void)
545 645 } }
546 646
547 647 /* /*
548 * Will mark a dir as incomplete because we ignore too small files
648 * Sorting helper
649 * a0 and b0 point to the array elements, which are themselves pointers!
650 */
651 static int file_compare_hashes(const void *a0, const void *b0)
652 {
653 const unsigned char *a = * (const unsigned char **) a0;
654 const unsigned char *b = * (const unsigned char **) b0;
655
656 return memcmp(a, b, SHA_DIGEST_LENGTH);
657 }
658
659 /*
660 * Sorts the list of files by sha1_full and returns the SHA-1 of the file list.
661 * We need to sort because the order of files may differ between dirs:
662 * the names may be different even though the content is the same.
663 */
664 static int dir_files_hash(unsigned char *hash, struct dir_node *d)
665 {
666 struct file_node *p;
667 unsigned char **u;
668 unsigned int i, mem;
669 SHA_CTX c;
670
671 if (d->files == NULL) {
672 memset(hash, 0, SHA_DIGEST_LENGTH);
673 return 0;
674 }
675
676 mem = d->no_of_files * sizeof(unsigned char *);
677 u = (unsigned char **) xmalloc(mem);
678 if (u == NULL)
679 return -1;
680
681 p = d->files;
682 i = 0;
683 while (p) {
684 u[i] = p->sha1_full;
685 p = p->next;
686 i++;
687 }
688
689 qsort(u, d->no_of_files, sizeof(unsigned char *), file_compare_hashes);
690
691 SHA1_Init(&c);
692
693 i = 0;
694 while (i < d->no_of_files) {
695 SHA1_Update(&c, u[i], SHA_DIGEST_LENGTH);
696 i++;
697 }
698
699 SHA1_Final(hash, &c);
700
701 free(u);
702
703 return 0;
704 }
705
706 /*
707 * Builds hash of a directory
708 */
709 static long long dir_build_hash(struct dir_node *d)
710 {
711 struct dir_node *subdir;
712 SHA_CTX c;
713 unsigned char files_hash[SHA_DIGEST_LENGTH];
714 int err;
715 long long no_of_possible_dirs = 0;
716 long long ret;
717
718 fprintf(stderr, "DEBUG: %s [%s] no_dup_possible=%u\n",
719 __FUNCTION__, d->name, d->no_dup_possible);
720
721 /* We check current dir first. */
722 if (d->no_dup_possible == 0)
723 no_of_possible_dirs++;
724
725 /* Order files by hash to compute correct hashes */
726 err = dir_files_hash(files_hash, d);
727 if (err != 0)
728 return -1;
729
730 SHA1_Init(&c);
731 SHA1_Update(&c, files_hash, SHA_DIGEST_LENGTH);
732
733 subdir = d->subdirs;
734 while (subdir) {
735 ret = dir_build_hash(subdir);
736 if (ret == -1)
737 return -1;
738
739 SHA1_Update(&c, subdir->sha1, SHA_DIGEST_LENGTH);
740 no_of_possible_dirs += ret;
741 subdir = subdir->next_sibling;
742 }
743
744 SHA1_Final(d->sha1, &c);
745
746 return no_of_possible_dirs;
747 }
748
749 /*
750 * Sorting helper for dirs
751 * a0 and b0 point to the array elements, which are themselves pointers!
752 */
753 static int dir_compare_hashes(const void *a0, const void *b0)
754 {
755 const struct dir_node *a = * (const struct dir_node **) a0;
756 const struct dir_node *b = * (const struct dir_node **) b0;
757
758 return memcmp(a->sha1, b->sha1, SHA_DIGEST_LENGTH);
759 }
760
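The "a0 and b0 point to the array elements" notes guard against a classic qsort pitfall: the comparator receives pointers to the elements, and here the elements are themselves pointers, so one extra dereference is required. A minimal standalone demonstration with strings (illustrative data):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int cmp_strs(const void *a0, const void *b0)
    {
        /* Each element is a char *, so a0/b0 are char ** in disguise */
        const char *a = *(const char *const *) a0;
        const char *b = *(const char *const *) b0;

        return strcmp(a, b);
    }

    int main(void)
    {
        char *v[] = { "dir3", "dir1", "dir2" };
        unsigned int i;

        qsort(v, 3, sizeof(char *), cmp_strs);

        for (i = 0; i < 3; i++)
            printf("%s\n", v[i]);   /* dir1, dir2, dir3 */

        return 0;
    }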
761 /*
762 * Process a range of dirs with the same hash
763 * @where is the lowest-level dir, under which everything should be linked.
764 */
765 static void dir_process_range(struct dir_node **u, const long long first,
766 const long long last, const long long where)
767 {
768 long long j;
769
770 for (j = first; j <= last; j++) {
771 if (j == where)
772 continue;
773 dir_mark_do_not_dump(u[j]);
774 u[j]->hash_next = u[where]->hash_next;
775 u[where]->hash_next = u[j];
776 }
777
778 }
779 /*
780 * Finds dir duplicates (we are only marking here)
781 * We have to sort, based on hash, so equal entries end up adjacent.
782 * We ignore all-zero hashes (dirs), because those files are unique.
783 * TODO: the name does not reflect what the function does.
784 */
785 int dir_find_dups(void)
786 {
787 long long i, j, first, last, where;
788 int final_step = 1;
789 struct dir_node *d;
790 unsigned long long mem;
791 long long err, no_of_possible_dirs = 0;
792 struct dir_node *subdir, **u;
793
794 fprintf(stderr, "DEBUG: %s...\n", __FUNCTION__);
795 for (i = 0; i < dir_info_count; i++) {
796 fprintf(stderr, "\tDEBUG: [%llu]...\n", i);
797 d = dir_info[i];
798 err = dir_build_hash(d);
799 if (err == -1)
800 return -1;
801
802 no_of_possible_dirs += err;
803 }
804 fprintf(stderr, "\tDEBUG: no_of_possible_dirs = %lld\n", no_of_possible_dirs);
805
806 /* Allocate an array that we will pass to qsort */
807 mem = no_of_possible_dirs * sizeof(struct dir_node *);
808 u = (struct dir_node **) xmalloc(mem);
809 if (u == NULL) {
810 fprintf(stderr, "Cannot alloc mem for dir list (%llu bytes)!\n",
811 mem);
812 return -1;
813 }
814
815 /* TODO: we should break once j reaches no_of_possible_dirs */
816 j = 0;
817 for (i = 0; i < dir_info_count; i++) {
818 fprintf(stderr, "dir_find_dups[%llu]...\n", i);
819 d = dir_info[i];
820
821 /* we first add the current dir */
822 if (d->no_dup_possible == 0)
823 u[j++] = d;
824
825 subdir = d->subdirs;
826 while (subdir) {
827 if (subdir->no_dup_possible == 0)
828 u[j++] = subdir;
829 subdir = subdir->next_sibling;
830 }
831 }
832
833 /* TODO: pass the whole structure for files too */
834 qsort(u, no_of_possible_dirs, sizeof(struct dir_node *), dir_compare_hashes);
835
836 first = 0;
837 last = 0;
838 where = 0;
839 for (i = 1; i < no_of_possible_dirs; i++) {
840 if (memcmp(u[first]->sha1, u[i]->sha1, SHA_DIGEST_LENGTH) == 0) {
841 /* We have the same hash */
842 dup_no_of_dirs++;
843 last = i;
844 if (u[last]->level < u[where]->level)
845 where = last;
846 continue;
847 }
848
849 /* We have the same hash in first..last */
850 final_step = 0;
851 dir_process_range(u, first, last, where);
852
853 /* Switch to next range */
854 first = i;
855 last = i;
856 where = i;
857 final_step = 1;
858 }
859
860
861 /* TODO: shouldn't we do the same for files? */
862 if (final_step == 1)
863 dir_process_range(u, first, last, where);
864
865 free(u);
866
867 return 0;
868 }
869
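dir_find_dups above uses the classic sort-then-sweep pattern: sort by key, then walk once, growing a run [first..last] while keys repeat and flushing it when they change; the run still open when the loop ends must be flushed once more (the job of final_step). A generic sketch of the pattern over sorted ints (illustrative; the real code groups dir_node pointers by SHA-1):

    #include <stdio.h>

    static void sweep(const int *v, const long n)
    {
        long i, first = 0;

        for (i = 1; i <= n; i++) {
            /* Still inside the current run of equal keys? */
            if (i < n && v[i] == v[first])
                continue;

            if (i - first > 1)
                printf("run of %ld equal keys at %ld\n", i - first, first);

            first = i;  /* open the next run */
        }
    }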
870 /*
871 * Nicely dumps the duplicated dirs
549 872 */ */
550 void mark_dir_as_incomplete(const unsigned int level)
873 void dir_dump_duplicates(struct dir_node *d)
551 874 { {
552 struct dir_node *dir;
875 struct dir_node *p;
553 876
554 dir = dir_current[level - 1];
555 dir->flags |= FLAGS_DIR_NOT_FOR_DUP;
877 if (d->no_dup_possible == 1)
878 return;
879
880 if (d->do_not_dump == 1)
881 return;
882
883 if (d->hash_next == NULL)
884 return;
885
886 dir_mark_left(d);
887
888 p = d->hash_next;
889 while (p) {
890 if (p->left == 1) {
891 /*
892 * We already dumped that dir on the left side.
893 * It makes no sense to dump it again on the right side!
894 */
895 p = p->hash_next;
896 continue;
897 }
898
899 fprintf(stderr, "\t\t\t%s = %s\n",
900 d->name, p->name);
901 printf("DIR\t%s\t%s\n",
902 d->name, p->name);
903 p = p->hash_next;
904 }
905 }
906
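This is also what resolves the TODO's "strange case" (dir3=dir4, dir4=dir3): dir_process_range chains all equal dirs under the shallowest one and marks the others do_not_dump, so each group is reported exactly once. A simplified, self-contained model of that behavior (hypothetical types):

    #include <stdio.h>

    struct node {
        const char *name;
        int do_not_dump;
        struct node *hash_next;
    };

    static void dump_dups(const struct node *d)
    {
        const struct node *p;

        if (d->do_not_dump)  /* right-side copy: already reported */
            return;

        for (p = d->hash_next; p != NULL; p = p->hash_next)
            printf("DIR\t%s\t%s\n", d->name, p->name);
    }

    int main(void)
    {
        /* dir3 and dir4 hashed equal; range processing chained dir4
         * under dir3 and marked it do_not_dump */
        struct node d4 = { "dir4", 1, NULL };
        struct node d3 = { "dir3", 0, &d4 };

        dump_dups(&d3);  /* DIR  dir3  dir4 */
        dump_dups(&d4);  /* suppressed: dir4 was the right side */

        return 0;
    }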
907 /*
908 * Nicely dumps the duplicated files
909 */
910 void file_dump_duplicates(const struct file_node *f,
911 const unsigned long long min_size)
912 {
913 const struct file_node *p, *left;
914
915 file_dump_node(f, 1);
916
917 if (f->duplicates == NULL)
918 return;
919
920 if (f->no_dup_possible == 1)
921 return;
922
923 if (f->do_not_dump == 1)
924 return;
925
926 if (f->duplicates == NULL)
927 return;
928
929 if (f->size < min_size)
930 return;
931
932 /* first, search for the first left one */
933 left = f;
934 if (left->left == 0) {
935 p = f->duplicates;
936 while (p) {
937 if (p->left == 1) {
938 left = p;
939 break;
940 }
941 p = p->duplicates;
942 }
943 }
944
945 /* now, dump */
946 p = f;
947 while (p) {
948 /*
949 * We do not want to dump files already dumped when we did
950 * it for dirs.
951 */
952 if (p->do_not_dump == 1) {
953 p = p->duplicates;
954 continue;
955 }
956
957 if (p == left) {
958 p = p->duplicates;
959 continue;
960 }
961
962 fprintf(stderr, "\t\t\t%s = %s\n",
963 left->name, p->name);
964 printf("FILE\t%s\t%s\n",
965 left->name, p->name);
966 p = p->duplicates;
967 }
968 }
969
970 /*
971 * Searches the whole tree for duplicates
972 * @min_size - do not dump files shorter than min_size
973 */
974 void dump_duplicates(const unsigned long long min_size)
975 {
976 unsigned int i;
977 struct dir_node *d, *subdir;
978 struct file_node *f;
979 unsigned int hash;
980
981 fprintf(stderr, "Dump duplicates (bigger than %llu)...\n", min_size);
982
983 for (i = 0; i < dir_info_count; i++) {
984 fprintf(stderr, "\tdump_duplicates[%u]...\n", i);
985 d = dir_info[i];
986 dir_dump_duplicates(d);
987
988 subdir = d->subdirs;
989 while (subdir) {
990 dir_dump_duplicates(subdir);
991 subdir = subdir->next_sibling;
992 }
993 }
994
995 /* Now, we dump remaining files */
996 fprintf(stderr, "DEBUG: Dump duplicated files...\n");
997 for (hash = 0; hash < HASH_SIZE; hash++) {
998 if (file_info[hash] == NULL)
999 continue;
1000
1001 fprintf(stderr, "\thash %u\n", hash);
1002
1003 f = file_info[hash];
1004 while (f) {
1005 file_dump_duplicates(f, min_size);
1006 f = f->hash_next;
1007 }
1008 }
556 1009 } }
File store.h changed (mode: 100644) (index b5b3633..eb059da)
... ... struct file_node
19 19 unsigned long long size; unsigned long long size;
20 20 unsigned char sha1_first[SHA_DIGEST_LENGTH]; unsigned char sha1_first[SHA_DIGEST_LENGTH];
21 21 unsigned char sha1_full[SHA_DIGEST_LENGTH]; unsigned char sha1_full[SHA_DIGEST_LENGTH];
22 unsigned int flags;
22 unsigned int no_dup_possible:1;
23 unsigned int do_not_dump:1;
24 unsigned int unique:1;
25 unsigned int left:1;
23 26 dev_t dev; dev_t dev;
24 27 ino_t ino; ino_t ino;
25 28 struct file_node *next; struct file_node *next;
26 29 struct file_node *hash_next; struct file_node *hash_next;
27 30 struct dir_node *parent; struct dir_node *parent;
31 struct file_node *duplicates;
28 32 }; };
29 33
30 34 struct dir_node struct dir_node
31 35 { {
32 36 char *name; char *name;
33 37 unsigned char sha1[SHA_DIGEST_LENGTH]; unsigned char sha1[SHA_DIGEST_LENGTH];
34 unsigned int flags;
38 unsigned int no_dup_possible:1;
39 unsigned int do_not_dump:1;
40 unsigned int left:1;
41 unsigned int level:16;
35 42 dev_t dev; dev_t dev;
36 43 ino_t ino; ino_t ino;
37 44 struct dir_node *subdirs; struct dir_node *subdirs;
38 45 struct dir_node *next_sibling; /* Link subdirs on the same level */ struct dir_node *next_sibling; /* Link subdirs on the same level */
39 46 struct file_node *files; struct file_node *files;
47 unsigned int no_of_files;
48 struct dir_node *parent;
49 struct dir_node *hash_next; /* in the last phase, here we store duplicates */
40 50 }; };
41 51
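The old opaque flags word becomes named one-bit fields; together with level:16 they still pack into a single unsigned int, so node size does not grow. A quick standalone check (illustrative):

    #include <stdio.h>

    struct packed_flags {
        unsigned int no_dup_possible:1;
        unsigned int do_not_dump:1;
        unsigned int left:1;
        unsigned int level:16;
    };

    int main(void)
    {
        /* 19 bits fit in one unsigned int on common ABIs */
        printf("%zu\n", sizeof(struct packed_flags));  /* typically 4 */
        return 0;
    }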
42 52
43 53 extern void dump_stats(void); extern void dump_stats(void);
44 extern int add_file(const char *file, const struct stat *s,
54 extern int file_add(const char *file, const struct stat *s,
45 55 const unsigned int level); const unsigned int level);
46 extern int add_dir(const char *dir, const struct stat *s,
56 extern int dir_add(const char *dir, const struct stat *s,
47 57 const unsigned int level); const unsigned int level);
48 58 extern void dump_files(void); extern void dump_files(void);
49 extern int find_file_dups(void);
50 extern int find_dir_dups(void);
59 extern int file_find_dups(void);
60 extern int dir_find_dups(void);
51 61 extern struct dir_node *dir_get_current(const unsigned int level); extern struct dir_node *dir_get_current(const unsigned int level);
52 62
53 63 extern int dev_ino_seen(const unsigned int type, extern int dev_ino_seen(const unsigned int type,
54 64 const dev_t dev, const ino_t ino); const dev_t dev, const ino_t ino);
55 65
56 extern void mark_dir_as_incomplete(const unsigned int level);
57
58 66 extern void dump_dirs(void); extern void dump_dirs(void);
59 67
60 #endif
68 extern void dump_duplicates(const unsigned long long min_size);
69
70 #endif