List of commits:
Subject Hash Author Date (UTC)
Several fixes 2909b1ba2e99929e775ddfea5f4894c50694a638 Catalin(ux) M. BOIE 2012-06-19 13:08:50
First version 3d7935d9b8a91694fe8213998ce4d3910348d6ef Catalin(ux) M. BOIE 2012-05-06 19:40:40
Commit 2909b1ba2e99929e775ddfea5f4894c50694a638 - Several fixes
Author: Catalin(ux) M. BOIE
Author date (UTC): 2012-06-19 13:08
Committer name: Catalin(ux) M. BOIE
Committer date (UTC): 2012-06-19 13:08
Parent(s): 3d7935d9b8a91694fe8213998ce4d3910348d6ef
Signing key:
Tree: b6ca189f2b650d1068d4e94d296559b91b7b5923
File Lines added Lines deleted
.gitignore 2 0
TODO 25 1
dupdump.c 14 17
store.c 531 78
store.h 19 9
File .gitignore changed (mode: 100644) (index 66c01b4..7b8473d)
1 1 *.o *.o
2 2 dupdump dupdump
3 3 vgcore* vgcore*
4 Makefile
5 *.spec
File TODO changed (mode: 100644) (index c53b196..8a84376)
1 [ ] --min-size parameter
1 [X] Mark files as NOT_POSSIBLE_DUPLICATES
2 [X] Do the compare of size and hashes
3 [X] If match, mark as POSSIBLE_DUPLICATES
4 [X] For the rest, propagate the flag to the parent dirs.
5
6 [ ] compute dir hashes by sorting the hashes of dirs and files; maybe compute files and dirs separately
7 [ ] Sort files and subdirs by hash - do a function
8
9 [ ] --min-size parameter and --max-size
2 10 [ ] Use more threads [ ] Use more threads
3 11 [ ] Mark the directories that could be identical, then scan only those. [ ] Mark the directories that could be identical, then scan only those.
4 12 [ ] First, we build the directory tree, then, we compute sha1 where needed [ ] First, we build the directory tree, then, we compute sha1 where needed
 
8 16 [ ] Order input directories by len to avoid building a strange tree. Hm. [ ] Order input directories by len to avoid building a strange tree. Hm.
9 17 Probably does not work. Probably does not work.
10 18 [ ] We should order by mtime, older one being the first shown. [ ] We should order by mtime, older one being the first shown.
19 [ ]
20
21
22 After file_find_dups, unique dirs are marked as such.
23 The problems now are:
24 - how we detect equal dirs
25 - what about the case when we run "dupdump ./2/3 /1/2": we should somehow
26 sort the paths so that /1/2 comes first, because it starts deeper.
27
28 [ ] Strange case:
29 dir1
30 dir2
31 dir3=dir4
32 dir4=dir3
33
34 [ ] We could throw away unique files.
11 35 [ ] [ ]
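The checked items above describe the core marking pass: a file known to be unique propagates that fact upward, since a dir containing a unique file cannot equal any other dir. A minimal standalone sketch of that propagation (simplified, hypothetical types; the commit's real helper is dir_mark_no_dup_possible in store.c below):

    #include <stddef.h>

    struct dir {
        struct dir *parent;
        unsigned int no_dup_possible:1;
    };

    /* Walk up the tree; stop once an ancestor is already marked, because
     * its own ancestors were marked on an earlier walk. */
    static void mark_no_dup(struct dir *d)
    {
        while (d != NULL && !d->no_dup_possible) {
            d->no_dup_possible = 1;
            d = d->parent;
        }
    }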
File dupdump.c changed (mode: 100644) (index 026822d..f10b7b7)
... ... static int callback(const char *fpath, const struct stat *s, int tflag,
42 42
43 43 /* Add dir */ /* Add dir */
44 44 if (tflag == FTW_D) { if (tflag == FTW_D) {
45 err = add_dir(fpath, s, ftwbuf->level);
45 err = dir_add(fpath, s, ftwbuf->level);
46 46 if (err != 0) { if (err != 0) {
47 47 fprintf(stderr, "ERROR: Probably out of memory!\n"); fprintf(stderr, "ERROR: Probably out of memory!\n");
48 48 return FTW_STOP; return FTW_STOP;
 
... ... static int callback(const char *fpath, const struct stat *s, int tflag,
51 51 return FTW_CONTINUE; return FTW_CONTINUE;
52 52 } }
53 53
54 /* Ignore too small size files */
55 if (s->st_size < min_size) {
56 if (verbose >= 1)
57 fprintf(stderr, "Ignore file smaller than %llu bytes.\n",
58 min_size);
59 /* Because we can have 2 dirs with 2 big files that are the same,
60 * but the rest of the dir is not the same, we do not allow
61 * 'dir' to be considered in a dir-dir duplicate scenario.
62 */
63 mark_dir_as_incomplete(ftwbuf->level);
64 return FTW_CONTINUE;
65 }
66
67 err = add_file(fpath, s, ftwbuf->level);
54 err = file_add(fpath, s, ftwbuf->level);
68 55 if (err != 0) { if (err != 0) {
69 56 fprintf(stderr, "ERROR: Cannot add file!\n"); fprintf(stderr, "ERROR: Cannot add file!\n");
70 57 return FTW_STOP; return FTW_STOP;
 
... ... int main(int argc, char *argv[])
102 89 if (verbose >= 2) if (verbose >= 2)
103 90 dump_files(); dump_files();
104 91
105 /* Now, check for duplicates */
106 err = find_file_dups();
92 /* Check for file duplicates */
93 err = file_find_dups();
107 94 if (err != 0) { if (err != 0) {
108 95 fprintf(stderr, "Error comparing files!\n"); fprintf(stderr, "Error comparing files!\n");
109 96 return 1; return 1;
110 97 } }
111 98
99 /* Check for dir duplicates */
100 err = dir_find_dups();
101 if (err != 0) {
102 fprintf(stderr, "Error comparing dirs!\n");
103 return 1;
104 }
105
112 106 dump_dirs(); dump_dirs();
113 107
108 fprintf(stderr, "\nDUMP DUPLICATES...\n\n");
109 dump_duplicates(min_size);
110
114 111 dump_stats(); dump_stats();
115 112
116 113 return 0; return 0;
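With this commit, main() no longer prints duplicates as matches are found; matching and reporting become separate phases. A paraphrased sketch of the resulting flow (error handling elided; not verbatim from dupdump.c):

    file_find_dups();           /* link same-content files via ->duplicates    */
    dir_find_dups();            /* hash dirs, chain equal ones via ->hash_next */
    dump_dirs();                /* debug dump of the whole tree                */
    dump_duplicates(min_size);  /* report DIR and FILE pairs in one pass       */
    dump_stats();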
File store.c changed (mode: 100644) (index d2c9a18..c4de925)
14 14 #include "store.h" #include "store.h"
15 15
16 16
17 #define FLAGS_DUMPED (1 << 0)
18
19 #define FLAGS_DIR_NOT_FOR_DUP (1 << 0)
20
21
22 17 #define DEV_INO_HASH_SIZE 4096 #define DEV_INO_HASH_SIZE 4096
23 18 #define HASH_SIZE 512 #define HASH_SIZE 512
24 19 #define MAX_INPUT_DIRS 32 #define MAX_INPUT_DIRS 32
 
... ... static struct dir_node *dir_current[MAX_DEPTH];
43 38 static unsigned char sha1_zero[SHA_DIGEST_LENGTH]; static unsigned char sha1_zero[SHA_DIGEST_LENGTH];
44 39 static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE]; static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE];
45 40
41 /* ############### Memory functions ############### */
42 static void *xmalloc(size_t size)
43 {
44 void *p;
45
46 p = malloc(size);
47 if (p) {
48 mem_allocated += size;
49 mem_calls++;
50 }
51
52 return p;
53 }
54
46 55 /* ############### SHA-1 functions ############### */ /* ############### SHA-1 functions ############### */
47 56
48 57 void sha1_dump(char *out, const unsigned char *b, const unsigned int max0) void sha1_dump(char *out, const unsigned char *b, const unsigned int max0)
 
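xmalloc centralizes the allocation accounting that alloc_file_node and alloc_dir_node previously did by hand. The commit adds no matching free-side accounting; if one were wanted, a sketch might look like this (hypothetical xfree, not in the source; callers would have to pass the size back, since malloc block sizes are not portably queryable):

    static void xfree(void *p, const size_t size)
    {
        if (p == NULL)
            return;

        free(p);
        mem_allocated -= size;  /* mirror xmalloc's accounting */
    }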
... ... int dev_ino_seen(const unsigned int type, const dev_t dev, const ino_t ino)
159 168 q = q->next; q = q->next;
160 169 } }
161 170
162 q = (struct dev_ino *) malloc(sizeof(struct dev_ino));
171 q = (struct dev_ino *) xmalloc(sizeof(struct dev_ino));
163 172 if (q == NULL) if (q == NULL)
164 173 return -1; return -1;
165 174
 
... ... void dump_stats(void)
182 191 fprintf(stderr, "Number of 64K SHA-1 computed: %llu.\n", sha1_first_computed); fprintf(stderr, "Number of 64K SHA-1 computed: %llu.\n", sha1_first_computed);
183 192 fprintf(stderr, "Number of full SHA-1 computed: %llu.\n", sha1_full_computed); fprintf(stderr, "Number of full SHA-1 computed: %llu.\n", sha1_full_computed);
184 193 fprintf(stderr, "Bytes that could be saved: %llu.\n", can_save); fprintf(stderr, "Bytes that could be saved: %llu.\n", can_save);
185 fprintf(stderr, "Number of duplicates: %llu.\n", dup_no_of_files);
186 fprintf(stderr, "Number of duplicates: %llu.\n", dup_no_of_dirs);
194 fprintf(stderr, "Number of duplicated files: %llu.\n", dup_no_of_files);
195 fprintf(stderr, "Number of duplicated dirs: %llu.\n", dup_no_of_dirs);
187 196 fprintf(stderr, "Number of same dev/inode (file): %llu.\n", no_of_same_inode_file); fprintf(stderr, "Number of same dev/inode (file): %llu.\n", no_of_same_inode_file);
188 197 fprintf(stderr, "Number of same dev/inode (dir): %llu.\n", no_of_same_inode_dir); fprintf(stderr, "Number of same dev/inode (dir): %llu.\n", no_of_same_inode_dir);
189 fprintf(stderr, "Memory allocated: %llu in %llu calls.\n",
198 fprintf(stderr, "Memory allocated: %llu bytes in %llu call(s).\n",
190 199 mem_allocated, mem_calls); mem_allocated, mem_calls);
191 200 } }
192 201
 
... ... static struct file_node *alloc_file_node(void)
196 205 unsigned int mem; unsigned int mem;
197 206
198 207 mem = sizeof(struct file_node); mem = sizeof(struct file_node);
199 q = (struct file_node *) malloc(mem);
208 q = (struct file_node *) xmalloc(mem);
200 209 if (q == NULL) { if (q == NULL) {
201 210 fprintf(stderr, "ERROR: Cannot alloc memory for a file node!\n"); fprintf(stderr, "ERROR: Cannot alloc memory for a file node!\n");
202 211 return NULL; return NULL;
203 212 } }
204 213 memset(q, 0, sizeof(struct file_node)); memset(q, 0, sizeof(struct file_node));
205 214
206 mem_allocated += mem;
207 mem_calls++;
208
209 215 return q; return q;
210 216 } }
211 217
212 int add_file(const char *file, const struct stat *s,
218 int file_add(const char *file, const struct stat *s,
213 219 const unsigned int level) const unsigned int level)
214 220 { {
215 221 struct dir_node *parent; struct dir_node *parent;
 
... ... int add_file(const char *file, const struct stat *s,
253 259 q->name = strdup(file); q->name = strdup(file);
254 260 memset(&q->sha1_first, 0, SHA_DIGEST_LENGTH); memset(&q->sha1_first, 0, SHA_DIGEST_LENGTH);
255 261 memset(&q->sha1_full, 0, SHA_DIGEST_LENGTH); memset(&q->sha1_full, 0, SHA_DIGEST_LENGTH);
256 q->flags = 0;
257 262 q->dev = s->st_dev; q->dev = s->st_dev;
258 263 q->ino = s->st_ino; q->ino = s->st_ino;
259 264
 
... ... int add_file(const char *file, const struct stat *s,
263 268 parent->files = q; parent->files = q;
264 269 q->parent = parent; q->parent = parent;
265 270
271 parent->no_of_files++;
266 272 no_of_files++; no_of_files++;
267 273
268 274 return 0; return 0;
 
... ... static struct dir_node *alloc_dir_node(void)
274 280 unsigned int mem; unsigned int mem;
275 281
276 282 mem = sizeof(struct dir_node); mem = sizeof(struct dir_node);
277 q = (struct dir_node *) malloc(mem);
283 q = (struct dir_node *) xmalloc(mem);
278 284 if (q == NULL) { if (q == NULL) {
279 285 fprintf(stderr, "ERROR: Cannot alloc a dir node!\n"); fprintf(stderr, "ERROR: Cannot alloc a dir node!\n");
280 286 return NULL; return NULL;
281 287 } }
282 288 memset(q, 0, mem); memset(q, 0, mem);
283 289
284 mem_allocated += mem;
285 mem_calls++;
286
287 290 return q; return q;
288 291 } }
289 292
290 293 /* /*
291 294 * Add a dir to the structure * Add a dir to the structure
292 295 */ */
293 int add_dir(const char *dir, const struct stat *s, const unsigned int level)
296 int dir_add(const char *dir, const struct stat *s, const unsigned int level)
294 297 { {
295 298 struct dir_node *q, *parent; struct dir_node *q, *parent;
296 299
 
... ... int add_dir(const char *dir, const struct stat *s, const unsigned int level)
308 311 q->name = strdup(dir); q->name = strdup(dir);
309 312 q->dev = s->st_dev; q->dev = s->st_dev;
310 313 q->ino = s->st_ino; q->ino = s->st_ino;
314 q->level = level;
311 315
312 316 no_of_dirs++; no_of_dirs++;
313 317
 
... ... int add_dir(const char *dir, const struct stat *s, const unsigned int level)
329 333 fprintf(stderr, "PARENT is %p, subdirs is %p, q=%p set parent->subdirs to q\n", fprintf(stderr, "PARENT is %p, subdirs is %p, q=%p set parent->subdirs to q\n",
330 334 parent, parent->subdirs, q); parent, parent->subdirs, q);
331 335 */ */
336 q->parent = parent;
332 337 q->next_sibling = parent->subdirs; q->next_sibling = parent->subdirs;
333 338 parent->subdirs = q; parent->subdirs = q;
334 339 } }
 
... ... int add_dir(const char *dir, const struct stat *s, const unsigned int level)
340 345 return 0; return 0;
341 346 } }
342 347
343 static void dump_file_node(const struct file_node *q, const unsigned int level)
348 static void file_dump_node(const struct file_node *q, const unsigned int level)
344 349 { {
345 350 char sha1_first[SHA_DIGEST_LENGTH * 2 + 1]; char sha1_first[SHA_DIGEST_LENGTH * 2 + 1];
346 351 char sha1_full[SHA_DIGEST_LENGTH * 2 + 1]; char sha1_full[SHA_DIGEST_LENGTH * 2 + 1];
347 352 char prefix[128]; char prefix[128];
348 353
349 memset(prefix, '\t', level);
350 prefix[level] = '\0';
354 memset(prefix, ' ', level * 2);
355 prefix[level * 2] = '\0';
351 356
352 357 sha1_dump(sha1_first, q->sha1_first, 8); sha1_dump(sha1_first, q->sha1_first, 8);
353 358 sha1_dump(sha1_full, q->sha1_full, 8); sha1_dump(sha1_full, q->sha1_full, 8);
354 359 fprintf(stderr, "%sF '%s' node=%p parent=%p next=%p hash_next=%p size=%llu" fprintf(stderr, "%sF '%s' node=%p parent=%p next=%p hash_next=%p size=%llu"
355 " dev=%lu inode=%llu sha1=%s/%s\n",
360 " dev=%lu inode=%llu no_dup_possible=%u do_not_dump=%u"
361 " duplicates=%p left=%u sha1=%s/%s\n",
356 362 prefix, q->name, q, q->parent, q->next, q->hash_next, q->size, prefix, q->name, q, q->parent, q->next, q->hash_next, q->size,
357 363 (unsigned long) q->dev, (unsigned long long) q->ino, (unsigned long) q->dev, (unsigned long long) q->ino,
358 sha1_first, sha1_full);
364 q->no_dup_possible, q->do_not_dump,
365 q->duplicates, q->left, sha1_first, sha1_full);
359 366 } }
360 367
361 368 void dump_files(void) void dump_files(void)
 
... ... void dump_files(void)
363 370 struct file_node *q; struct file_node *q;
364 371 unsigned int hash; unsigned int hash;
365 372
366 fprintf(stderr, "Dumping internal data...\n");
373 fprintf(stderr, "Dumping internal data - START...\n");
367 374 for (hash = 0; hash < HASH_SIZE; hash++) { for (hash = 0; hash < HASH_SIZE; hash++) {
368 375 if (file_info[hash] == NULL) if (file_info[hash] == NULL)
369 376 continue; continue;
 
... ... void dump_files(void)
371 378 fprintf(stderr, "info[%05d]:\n", hash); fprintf(stderr, "info[%05d]:\n", hash);
372 379 q = file_info[hash]; q = file_info[hash];
373 380 while (q) { while (q) {
374 dump_file_node(q, 0);
375 q = q->next;
381 file_dump_node(q, 0);
382 q = q->hash_next;
376 383 } }
377 384 } }
385 fprintf(stderr, "Dumping internal data - STOP...\n");
378 386 } }
379 387
380 void dump_dir_node(const struct dir_node *d, const unsigned int level)
388 void dir_dump_node(const struct dir_node *d, const unsigned int level)
381 389 { {
382 390 char prefix[128]; char prefix[128];
383 391 struct dir_node *subdir; struct dir_node *subdir;
384 392 struct file_node *file; struct file_node *file;
393 char dump[SHA_DIGEST_LENGTH * 2 + 1];
385 394
386 /*fprintf(stderr, "dump_dir_node d=%p level=%u\n", d, level);*/
387 memset(prefix, '\t', level);
388 prefix[level] = '\0';
395 memset(prefix, ' ', (level + 1) * 2);
396 prefix[(level + 1) * 2] = '\0';
389 397
390 fprintf(stderr, "%sD '%s' L=%u d=%p subdirs=%p next_sibling=%p"
391 " files=%p:\n",
392 prefix, d->name, level, d, d->subdirs, d->next_sibling,
393 d->files);
398 sha1_dump(dump, d->sha1, 8);
399 fprintf(stderr, "%sD '%s' d=%p subdirs=%p next_sibling=%p"
400 " files=%p parent=%p no_dup_possible=%u do_not_dump=%u"
401 " level=%hu hash_next=%p left=%u sha1=%s\n",
402 prefix, d->name, d, d->subdirs, d->next_sibling,
403 d->files, d->parent, d->no_dup_possible, d->do_not_dump,
404 d->level, d->hash_next, d->left, dump);
394 405
395 406 subdir = d->subdirs; subdir = d->subdirs;
396 407 while (subdir) { while (subdir) {
397 dump_dir_node(subdir, level + 1);
408 dir_dump_node(subdir, level + 1 + 1);
398 409 subdir = subdir->next_sibling; subdir = subdir->next_sibling;
399 410 } }
400 411
401 412 file = d->files; file = d->files;
402 413 while (file) { while (file) {
403 dump_file_node(file, level + 1);
414 file_dump_node(file, level + 1 + 1);
404 415 file = file->next; file = file->next;
405 416 } }
406 417 } }
 
... ... void dump_dirs(void)
411 422
412 423 for (i = 0; i < dir_info_count; i++) { for (i = 0; i < dir_info_count; i++) {
413 424 fprintf(stderr, "dump_dirs[%u]...\n", i); fprintf(stderr, "dump_dirs[%u]...\n", i);
414 dump_dir_node(dir_info[i], 0);
425 dir_dump_node(dir_info[i], 0);
415 426 } }
416 427 } }
417 428
 
... ... static int compare_files(struct file_node *a, struct file_node *b)
445 456 return 1; return 1;
446 457 } }
447 458
459 static void dir_mark_no_dup_possible(struct dir_node *d)
460 {
461 if ((d == NULL) || (d->no_dup_possible == 1))
462 return;
463
464 d->no_dup_possible = 1;
465 dir_mark_no_dup_possible(d->parent);
466 }
467
448 468 /* /*
449 * Compare the same size files
469 * When we list a folder on the left side, we must mark the whole hierarchy
470 * under it as 'do_not_dump'. Otherwise we would dump its files, and we do not want that.
450 471 */ */
451 static int compare_file_range(struct file_node *a, struct file_node *b)
472 static void dir_mark_do_not_dump(struct dir_node *d)
452 473 { {
453 int err, q1_dumped;
454 struct file_node *q1, *q2;
474 struct file_node *file;
475 struct dir_node *subdir;
455 476
456 /*fprintf(stderr, "compare_range: %p -> %p\n", a, b);*/
477 if ((d == NULL) || (d->do_not_dump == 1))
478 return;
457 479
458 /* Single file of X size */
459 if (a->hash_next == NULL)
460 return 0;
480 d->do_not_dump = 1;
461 481
462 q1 = a;
463 while (q1 != b->hash_next) {
464 /* We avoid already matched files */
465 if (q1->flags & FLAGS_DUMPED) {
466 q1 = q1->hash_next;
467 continue;
468 }
482 subdir = d->subdirs;
483 while (subdir) {
484 dir_mark_do_not_dump(subdir);
485 subdir = subdir->next_sibling;
486 }
487
488 file = d->files;
489 while (file) {
490 file->do_not_dump = 1;
491 file = file->next;
492 }
493 }
494
495 /*
496 * If we dump a dir on the left side, the dup files must be also on the left side.
497 */
498 static void dir_mark_left(struct dir_node *d)
499 {
500 struct file_node *file;
501 struct dir_node *subdir;
502
503 if ((d == NULL) || (d->left == 1))
504 return;
505
506 d->left = 1;
507
508 subdir = d->subdirs;
509 while (subdir) {
510 dir_mark_left(subdir);
511 subdir = subdir->next_sibling;
512 }
513
514 file = d->files;
515 while (file) {
516 file->left = 1;
517 file = file->next;
518 }
519 }
520
521 static void file_mark_no_dup_possible(struct file_node *f)
522 {
523 if (f->no_dup_possible == 1)
524 return;
469 525
470 q2 = q1->hash_next;
471 if (q2 == NULL)
526 f->no_dup_possible = 1;
527 dir_mark_no_dup_possible(f->parent);
528 }
529
530 /*
531 * Mark a file to not be dumped
532 */
533 static void file_mark_do_not_dump(struct file_node *f)
534 {
535 if ((f == NULL) || (f->do_not_dump == 1))
536 return;
537
538 f->do_not_dump = 1;
539 }
540
541 /*
542 * Compare the same size files using hashes
543 * TODO: Use a better check algo!
544 */
545 static int compare_file_range(struct file_node *a, struct file_node *b)
546 {
547 int err;
548 struct file_node *q, *p;
549
550 /* Mark all as unique */
551 q = a;
552 while (q != b->hash_next) {
553 q->unique = 1;
554 q = q->hash_next;
555 }
556
557 p = a;
558 while (p != b->hash_next) {
559 q = p->hash_next;
560 if (q == NULL)
472 561 break; break;
473 562
474 q1_dumped = 0;
475 while (q2 != b->hash_next) {
476 err = compare_files(q1, q2);
563 while (q != b->hash_next) {
564 err = compare_files(p, q);
477 565 if (err == -1) if (err == -1)
478 566 return -1; return -1;
479 567
480 568 if (err != 1) { if (err != 1) {
481 q2 = q2->hash_next;
569 q = q->hash_next;
482 570 continue; continue;
483 571 } }
484 572
485 if (q1_dumped == 0) {
486 printf("%s\n", q1->name);
487 q1_dumped = 1;
488 }
573 q->duplicates = p->duplicates;
574 p->duplicates = q;
575 file_mark_do_not_dump(q);
489 576
490 /* show dup file */
491 printf("\t%s\n", q2->name);
492 q2->flags |= FLAGS_DUMPED;
577 p->unique = 0;
578 q->unique = 0;
579
580 /* TODO: these have to be moved */
493 581 dup_no_of_files++; dup_no_of_files++;
494 can_save += q2->size;
582 can_save += q->size;
495 583
496 q2 = q2->hash_next;
584 q = q->hash_next;
497 585 } }
498 586
499 q1 = q1->hash_next;
587 p = p->hash_next;
588 }
589
590 /* Entries that remained unique propagate the flag to their parents */
591 q = a;
592 while (q != b->hash_next) {
593 if (q->unique == 1)
594 file_mark_no_dup_possible(q);
595 q = q->hash_next;
500 596 } }
501 597
502 598 return 0; return 0;
503 599 } }
504 600
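Instead of printing matches on the spot (the old q1_dumped/FLAGS_DUMPED logic), compare_file_range now threads every match onto the survivor's ->duplicates list and defers output to dump_duplicates. A small sketch of walking such a chain (hypothetical helper, assuming struct file_node from store.h):

    static void print_dup_chain(const struct file_node *f)
    {
        const struct file_node *p;

        /* f is the list head; each match hangs off ->duplicates */
        for (p = f->duplicates; p != NULL; p = p->duplicates)
            printf("%s duplicates %s\n", p->name, f->name);
    }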
505 int find_file_dups(void)
601 int file_find_dups(void)
506 602 { {
507 603 int err; int err;
508 604 struct file_node *q, *first, *last; struct file_node *q, *first, *last;
 
... ... int find_file_dups(void)
513 609 if (file_info[hash] == NULL) if (file_info[hash] == NULL)
514 610 continue; continue;
515 611
516 fprintf(stderr, "find_file_dups[%u]...\n", hash);
612 fprintf(stderr, "file_find_dups[%u]...\n", hash);
517 613
518 614 /* We need at least 2 nodes */ /* We need at least 2 nodes */
519 if (file_info[hash]->hash_next == NULL)
615 if (file_info[hash]->hash_next == NULL) {
616 file_mark_no_dup_possible(file_info[hash]);
520 617 continue; continue;
618 }
521 619
522 620 first = file_info[hash]; first = file_info[hash];
523 621 while (1) { while (1) {
 
... ... int find_file_dups(void)
530 628 q = q->hash_next; q = q->hash_next;
531 629 } }
532 630
631 fprintf(stderr, "\tfirst=%p last=%p\n", first, last);
632
533 633 err = compare_file_range(first, last); err = compare_file_range(first, last);
534 634 if (err == -1) if (err == -1)
535 635 return -1; return -1;
 
... ... int find_file_dups(void)
545 645 } }
546 646
547 647 /* /*
548 * Will mark a dir as incomplete because we ignore too small files
648 * Sorting helper
649 * a0 and b0 point to the array elements, which are themselves pointers!
650 */
651 static int file_compare_hashes(const void *a0, const void *b0)
652 {
653 const unsigned char *a = * (const unsigned char **) a0;
654 const unsigned char *b = * (const unsigned char **) b0;
655
656 return memcmp(a, b, SHA_DIGEST_LENGTH);
657 }
658
659 /*
660 * Sorts the list of files by sha1_full and returns the SHA-1 of the file list.
661 * We need to sort because the order of files may differ between dirs:
662 * the names may be different even though the content is the same.
663 */
664 static int dir_files_hash(unsigned char *hash, struct dir_node *d)
665 {
666 struct file_node *p;
667 unsigned char **u;
668 unsigned int i, mem;
669 SHA_CTX c;
670
671 if (d->files == NULL) {
672 memset(hash, 0, SHA_DIGEST_LENGTH);
673 return 0;
674 }
675
676 mem = d->no_of_files * sizeof(unsigned char *);
677 u = (unsigned char **) xmalloc(mem);
678 if (u == NULL)
679 return -1;
680
681 p = d->files;
682 i = 0;
683 while (p) {
684 u[i] = p->sha1_full;
685 p = p->next;
686 i++;
687 }
688
689 qsort(u, d->no_of_files, sizeof(unsigned char *), file_compare_hashes);
690
691 SHA1_Init(&c);
692
693 i = 0;
694 while (i < d->no_of_files) {
695 SHA1_Update(&c, u[i], SHA_DIGEST_LENGTH);
696 i++;
697 }
698
699 SHA1_Final(hash, &c);
700
701 free(u);
702
703 return 0;
704 }
705
706 /*
707 * Builds hash of a directory
708 */
709 static long long dir_build_hash(struct dir_node *d)
710 {
711 struct dir_node *subdir;
712 SHA_CTX c;
713 unsigned char files_hash[SHA_DIGEST_LENGTH];
714 int err;
715 long long no_of_possible_dirs = 0;
716 long long ret;
717
718 fprintf(stderr, "DEBUG: %s [%s] no_dup_possible=%u\n",
719 __FUNCTION__, d->name, d->no_dup_possible);
720
721 /* We check current dir first. */
722 if (d->no_dup_possible == 0)
723 no_of_possible_dirs++;
724
725 /* Order files by hash to compute correct hashes */
726 err = dir_files_hash(files_hash, d);
727 if (err != 0)
728 return -1;
729
730 SHA1_Init(&c);
731 SHA1_Update(&c, files_hash, SHA_DIGEST_LENGTH);
732
733 subdir = d->subdirs;
734 while (subdir) {
735 ret = dir_build_hash(subdir);
736 if (ret == -1)
737 return -1;
738
739 SHA1_Update(&c, subdir->sha1, SHA_DIGEST_LENGTH);
740 no_of_possible_dirs += ret;
741 subdir = subdir->next_sibling;
742 }
743
744 SHA1_Final(d->sha1, &c);
745
746 return no_of_possible_dirs;
747 }
748
749 /*
750 * Sorting helper for dirs
751 * a0 and b0 point to the array elements, which are themselves pointers!
752 */
753 static int dir_compare_hashes(const void *a0, const void *b0)
754 {
755 const struct dir_node *a = * (const struct dir_node **) a0;
756 const struct dir_node *b = * (const struct dir_node **) b0;
757
758 return memcmp(a->sha1, b->sha1, SHA_DIGEST_LENGTH);
759 }
760
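The "a0 and b0 point to the array elements" notes guard against a classic qsort pitfall: the comparator receives pointers to the elements, and here the elements are themselves pointers, so one extra dereference is required. A minimal standalone demonstration with strings (illustrative data):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int cmp_strs(const void *a0, const void *b0)
    {
        /* Each element is a char *, so a0/b0 are char ** in disguise */
        const char *a = *(const char *const *) a0;
        const char *b = *(const char *const *) b0;

        return strcmp(a, b);
    }

    int main(void)
    {
        char *v[] = { "dir3", "dir1", "dir2" };
        unsigned int i;

        qsort(v, 3, sizeof(char *), cmp_strs);

        for (i = 0; i < 3; i++)
            printf("%s\n", v[i]);   /* dir1, dir2, dir3 */

        return 0;
    }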
761 /*
762 * Process a range of dirs with the same hash
763 * @where is the lowest-level dir, under which everything should be linked.
764 */
765 static void dir_process_range(struct dir_node **u, const long long first,
766 const long long last, const long long where)
767 {
768 long long j;
769
770 for (j = first; j <= last; j++) {
771 if (j == where)
772 continue;
773 dir_mark_do_not_dump(u[j]);
774 u[j]->hash_next = u[where]->hash_next;
775 u[where]->hash_next = u[j];
776 }
777
778 }
779 /*
780 * Finds dir duplicates (we are only marking here)
781 * We have to sort, based on hash, so equal entries end up adjacent.
782 * We ignore all-zero hashes (dirs), because those files are unique.
783 * TODO: the name does not reflect what the function does.
784 */
785 int dir_find_dups(void)
786 {
787 long long i, j, first, last, where;
788 int final_step = 1;
789 struct dir_node *d;
790 unsigned long long mem;
791 long long err, no_of_possible_dirs = 0;
792 struct dir_node *subdir, **u;
793
794 fprintf(stderr, "DEBUG: %s...\n", __FUNCTION__);
795 for (i = 0; i < dir_info_count; i++) {
796 fprintf(stderr, "\tDEBUG: [%llu]...\n", i);
797 d = dir_info[i];
798 err = dir_build_hash(d);
799 if (err == -1)
800 return -1;
801
802 no_of_possible_dirs += err;
803 }
804 fprintf(stderr, "\tDEBUG: no_of_possible_dirs = %lld\n", no_of_possible_dirs);
805
806 /* Allocate an array that we will pass to qsort */
807 mem = no_of_possible_dirs * sizeof(struct dir_node *);
808 u = (struct dir_node **) xmalloc(mem);
809 if (u == NULL) {
810 fprintf(stderr, "Cannot alloc mem for dir list (%llu bytes)!\n",
811 mem);
812 return -1;
813 }
814
815 /* TODO: we should break once j reaches no_of_possible_dirs */
816 j = 0;
817 for (i = 0; i < dir_info_count; i++) {
818 fprintf(stderr, "dir_find_dups[%llu]...\n", i);
819 d = dir_info[i];
820
821 /* we first add the current dir */
822 if (d->no_dup_possible == 0)
823 u[j++] = d;
824
825 subdir = d->subdirs;
826 while (subdir) {
827 if (subdir->no_dup_possible == 0)
828 u[j++] = subdir;
829 subdir = subdir->next_sibling;
830 }
831 }
832
833 /* TODO: pass the whole structure for files too */
834 qsort(u, no_of_possible_dirs, sizeof(struct dir_node *), dir_compare_hashes);
835
836 first = 0;
837 last = 0;
838 where = 0;
839 for (i = 1; i < no_of_possible_dirs; i++) {
840 if (memcmp(u[first]->sha1, u[i]->sha1, SHA_DIGEST_LENGTH) == 0) {
841 /* We have the same hash */
842 dup_no_of_dirs++;
843 last = i;
844 if (u[last]->level < u[where]->level)
845 where = last;
846 continue;
847 }
848
849 /* We have the same hash in first..last */
850 final_step = 0;
851 dir_process_range(u, first, last, where);
852
853 /* Switch to next range */
854 first = i;
855 last = i;
856 where = i;
857 final_step = 1;
858 }
859
860
861 /* TODO: shouldn't we do the same for files? */
862 if (final_step == 1)
863 dir_process_range(u, first, last, where);
864
865 free(u);
866
867 return 0;
868 }
869
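dir_find_dups above uses the classic sort-then-sweep pattern: sort by key, then walk once, growing a run [first..last] while keys repeat and flushing it when they change; the run still open when the loop ends must be flushed once more (the job of final_step). A generic sketch of the pattern over sorted ints (illustrative; the real code groups dir_node pointers by SHA-1):

    #include <stdio.h>

    static void sweep(const int *v, const long n)
    {
        long i, first = 0;

        for (i = 1; i <= n; i++) {
            /* Still inside the current run of equal keys? */
            if (i < n && v[i] == v[first])
                continue;

            if (i - first > 1)
                printf("run of %ld equal keys at %ld\n", i - first, first);

            first = i;  /* open the next run */
        }
    }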
870 /*
871 * Nicely dumps the duplicated dirs
549 872 */ */
550 void mark_dir_as_incomplete(const unsigned int level)
873 void dir_dump_duplicates(struct dir_node *d)
551 874 { {
552 struct dir_node *dir;
875 struct dir_node *p;
553 876
554 dir = dir_current[level - 1];
555 dir->flags |= FLAGS_DIR_NOT_FOR_DUP;
877 if (d->no_dup_possible == 1)
878 return;
879
880 if (d->do_not_dump == 1)
881 return;
882
883 if (d->hash_next == NULL)
884 return;
885
886 dir_mark_left(d);
887
888 p = d->hash_next;
889 while (p) {
890 if (p->left == 1) {
891 /*
892 * We already dumped that dir on the left side.
893 * It makes no sense to dump it again on the right side!
894 */
895 p = p->hash_next;
896 continue;
897 }
898
899 fprintf(stderr, "\t\t\t%s = %s\n",
900 d->name, p->name);
901 printf("DIR\t%s\t%s\n",
902 d->name, p->name);
903 p = p->hash_next;
904 }
905 }
906
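This is also what resolves the TODO's "strange case" (dir3=dir4, dir4=dir3): dir_process_range chains all equal dirs under the shallowest one and marks the others do_not_dump, so each group is reported exactly once. A simplified, self-contained model of that behavior (hypothetical types):

    #include <stdio.h>

    struct node {
        const char *name;
        int do_not_dump;
        struct node *hash_next;
    };

    static void dump_dups(const struct node *d)
    {
        const struct node *p;

        if (d->do_not_dump)  /* right-side copy: already reported */
            return;

        for (p = d->hash_next; p != NULL; p = p->hash_next)
            printf("DIR\t%s\t%s\n", d->name, p->name);
    }

    int main(void)
    {
        /* dir3 and dir4 hashed equal; range processing chained dir4
         * under dir3 and marked it do_not_dump */
        struct node d4 = { "dir4", 1, NULL };
        struct node d3 = { "dir3", 0, &d4 };

        dump_dups(&d3);  /* DIR  dir3  dir4 */
        dump_dups(&d4);  /* suppressed: dir4 was the right side */

        return 0;
    }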
907 /*
908 * Nicely dumps the duplicated files
909 */
910 void file_dump_duplicates(const struct file_node *f,
911 const unsigned long long min_size)
912 {
913 const struct file_node *p, *left;
914
915 file_dump_node(f, 1);
916
917 if (f->duplicates == NULL)
918 return;
919
920 if (f->no_dup_possible == 1)
921 return;
922
923 if (f->do_not_dump == 1)
924 return;
925
926 if (f->duplicates == NULL)
927 return;
928
929 if (f->size < min_size)
930 return;
931
932 /* first, search for the first left one */
933 left = f;
934 if (left->left == 0) {
935 p = f->duplicates;
936 while (p) {
937 if (p->left == 1) {
938 left = p;
939 break;
940 }
941 p = p->duplicates;
942 }
943 }
944
945 /* now, dump */
946 p = f;
947 while (p) {
948 /*
949 * We do not want to dump files already dumped when we did
950 * it for dirs.
951 */
952 if (p->do_not_dump == 1) {
953 p = p->duplicates;
954 continue;
955 }
956
957 if (p == left) {
958 p = p->duplicates;
959 continue;
960 }
961
962 fprintf(stderr, "\t\t\t%s = %s\n",
963 left->name, p->name);
964 printf("FILE\t%s\t%s\n",
965 left->name, p->name);
966 p = p->duplicates;
967 }
968 }
969
970 /*
971 * Searches the whole tree for duplicates
972 * @min_size - do not dump files shorter than min_size
973 */
974 void dump_duplicates(const unsigned long long min_size)
975 {
976 unsigned int i;
977 struct dir_node *d, *subdir;
978 struct file_node *f;
979 unsigned int hash;
980
981 fprintf(stderr, "Dump duplicates (bigger than %llu)...\n", min_size);
982
983 for (i = 0; i < dir_info_count; i++) {
984 fprintf(stderr, "\tdump_duplicates[%u]...\n", i);
985 d = dir_info[i];
986 dir_dump_duplicates(d);
987
988 subdir = d->subdirs;
989 while (subdir) {
990 dir_dump_duplicates(subdir);
991 subdir = subdir->next_sibling;
992 }
993 }
994
995 /* Now, we dump remaining files */
996 fprintf(stderr, "DEBUG: Dump duplicated files...\n");
997 for (hash = 0; hash < HASH_SIZE; hash++) {
998 if (file_info[hash] == NULL)
999 continue;
1000
1001 fprintf(stderr, "\thash %u\n", hash);
1002
1003 f = file_info[hash];
1004 while (f) {
1005 file_dump_duplicates(f, min_size);
1006 f = f->hash_next;
1007 }
1008 }
556 1009 } }
File store.h changed (mode: 100644) (index b5b3633..eb059da)
... ... struct file_node
19 19 unsigned long long size; unsigned long long size;
20 20 unsigned char sha1_first[SHA_DIGEST_LENGTH]; unsigned char sha1_first[SHA_DIGEST_LENGTH];
21 21 unsigned char sha1_full[SHA_DIGEST_LENGTH]; unsigned char sha1_full[SHA_DIGEST_LENGTH];
22 unsigned int flags;
22 unsigned int no_dup_possible:1;
23 unsigned int do_not_dump:1;
24 unsigned int unique:1;
25 unsigned int left:1;
23 26 dev_t dev; dev_t dev;
24 27 ino_t ino; ino_t ino;
25 28 struct file_node *next; struct file_node *next;
26 29 struct file_node *hash_next; struct file_node *hash_next;
27 30 struct dir_node *parent; struct dir_node *parent;
31 struct file_node *duplicates;
28 32 }; };
29 33
30 34 struct dir_node struct dir_node
31 35 { {
32 36 char *name; char *name;
33 37 unsigned char sha1[SHA_DIGEST_LENGTH]; unsigned char sha1[SHA_DIGEST_LENGTH];
34 unsigned int flags;
38 unsigned int no_dup_possible:1;
39 unsigned int do_not_dump:1;
40 unsigned int left:1;
41 unsigned int level:16;
35 42 dev_t dev; dev_t dev;
36 43 ino_t ino; ino_t ino;
37 44 struct dir_node *subdirs; struct dir_node *subdirs;
38 45 struct dir_node *next_sibling; /* Link subdirs on the same level */ struct dir_node *next_sibling; /* Link subdirs on the same level */
39 46 struct file_node *files; struct file_node *files;
47 unsigned int no_of_files;
48 struct dir_node *parent;
49 struct dir_node *hash_next; /* in the last phase, here we store duplicates */
40 50 }; };
41 51
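The old opaque flags word becomes named one-bit fields; together with level:16 they still pack into a single unsigned int, so node size does not grow. A quick standalone check (illustrative):

    #include <stdio.h>

    struct packed_flags {
        unsigned int no_dup_possible:1;
        unsigned int do_not_dump:1;
        unsigned int left:1;
        unsigned int level:16;
    };

    int main(void)
    {
        /* 19 bits fit in one unsigned int on common ABIs */
        printf("%zu\n", sizeof(struct packed_flags));  /* typically 4 */
        return 0;
    }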
42 52
43 53 extern void dump_stats(void); extern void dump_stats(void);
44 extern int add_file(const char *file, const struct stat *s,
54 extern int file_add(const char *file, const struct stat *s,
45 55 const unsigned int level); const unsigned int level);
46 extern int add_dir(const char *dir, const struct stat *s,
56 extern int dir_add(const char *dir, const struct stat *s,
47 57 const unsigned int level); const unsigned int level);
48 58 extern void dump_files(void); extern void dump_files(void);
49 extern int find_file_dups(void);
50 extern int find_dir_dups(void);
59 extern int file_find_dups(void);
60 extern int dir_find_dups(void);
51 61 extern struct dir_node *dir_get_current(const unsigned int level); extern struct dir_node *dir_get_current(const unsigned int level);
52 62
53 63 extern int dev_ino_seen(const unsigned int type, extern int dev_ino_seen(const unsigned int type,
54 64 const dev_t dev, const ino_t ino); const dev_t dev, const ino_t ino);
55 65
56 extern void mark_dir_as_incomplete(const unsigned int level);
57
58 66 extern void dump_dirs(void); extern void dump_dirs(void);
59 67
60 #endif
68 extern void dump_duplicates(const unsigned long long min_size);
69
70 #endif