File dupdump.c changed (mode: 100644) (index b293669..db5d9f6) |
10 |
10 |
#include <string.h> |
#include <string.h> |
11 |
11 |
#include <errno.h> |
#include <errno.h> |
12 |
12 |
#include <getopt.h> |
#include <getopt.h> |
|
13 |
|
#include <malloc.h> |
13 |
14 |
|
|
14 |
15 |
#include "store.h" |
#include "store.h" |
15 |
16 |
|
|
16 |
17 |
static off_t min_size = 0; |
static off_t min_size = 0; |
17 |
18 |
static int verbose = 0; |
static int verbose = 0; |
18 |
19 |
static int debug = 0; |
static int debug = 0; |
|
20 |
|
static int zero = 0; |
|
21 |
|
static char *out_file; |
19 |
22 |
|
|
20 |
23 |
static struct option options[] = |
static struct option options[] = |
21 |
24 |
{ |
{ |
|
25 |
|
{"zero", no_argument, NULL, 'z'}, |
22 |
26 |
{"min-size", required_argument, NULL, 'i'}, |
{"min-size", required_argument, NULL, 'i'}, |
|
27 |
|
{"out", required_argument, NULL, 'o'}, |
23 |
28 |
{"verbose", no_argument, NULL, 'v'}, |
{"verbose", no_argument, NULL, 'v'}, |
24 |
29 |
{"debug", no_argument, NULL, 'd'}, |
{"debug", no_argument, NULL, 'd'}, |
25 |
30 |
{NULL, 0, NULL, 0} |
{NULL, 0, NULL, 0} |
|
... |
... |
static struct option options[] = |
28 |
33 |
static void usage(void) |
static void usage(void) |
29 |
34 |
{ |
{ |
30 |
35 |
fprintf(stderr, "Usage [options] <dir1> [<dir2>] ...\n" |
fprintf(stderr, "Usage [options] <dir1> [<dir2>] ...\n" |
|
36 |
|
" --zero -z Use \\0 to separate columns\n" |
31 |
37 |
" --min-size -i Ignore files under this size (default 1)\n" |
" --min-size -i Ignore files under this size (default 1)\n" |
|
38 |
|
" --out -i Where to store results (default stdout)\n" |
32 |
39 |
" --verbose -v Be more verbose\n" |
" --verbose -v Be more verbose\n" |
33 |
40 |
" --debug -d Print debug information\n" |
" --debug -d Print debug information\n" |
34 |
41 |
); |
); |
|
... |
... |
int main(int argc, char *argv[]) |
85 |
92 |
int err; |
int err; |
86 |
93 |
int options_index = 0; |
int options_index = 0; |
87 |
94 |
char c; |
char c; |
|
95 |
|
FILE *out; |
88 |
96 |
|
|
89 |
|
while ((c = getopt_long(argc, argv, "i:vdh", options, &options_index)) != -1) { |
|
|
97 |
|
while ((c = getopt_long(argc, argv, "zi:o:vdh", options, &options_index)) != -1) { |
90 |
98 |
switch (c) { |
switch (c) { |
|
99 |
|
case 'z': zero = 1; break; |
91 |
100 |
case 'i': min_size = strtoul(optarg, NULL, 10); break; |
case 'i': min_size = strtoul(optarg, NULL, 10); break; |
|
101 |
|
case 'o': out_file = optarg; break; |
92 |
102 |
case 'v': verbose = 1; break; |
case 'v': verbose = 1; break; |
93 |
103 |
case 'd': debug = 1; break; |
case 'd': debug = 1; break; |
94 |
104 |
default: |
default: |
|
... |
... |
int main(int argc, char *argv[]) |
97 |
107 |
} |
} |
98 |
108 |
} |
} |
99 |
109 |
|
|
|
110 |
|
if (out_file == NULL) { |
|
111 |
|
out = stdout; |
|
112 |
|
} else { |
|
113 |
|
out = fopen(out_file, "w"); |
|
114 |
|
if (out == NULL) { |
|
115 |
|
fprintf(stderr, "Cannot open results file (%s)!\n", |
|
116 |
|
strerror(errno)); |
|
117 |
|
return 1; |
|
118 |
|
} |
|
119 |
|
} |
|
120 |
|
|
100 |
121 |
flags |= FTW_PHYS; /* Do not follow symlinks */ |
flags |= FTW_PHYS; /* Do not follow symlinks */ |
101 |
122 |
flags |= FTW_ACTIONRETVAL; /* To skip hierarchies */ |
flags |= FTW_ACTIONRETVAL; /* To skip hierarchies */ |
102 |
123 |
|
|
|
... |
... |
int main(int argc, char *argv[]) |
107 |
128 |
} |
} |
108 |
129 |
|
|
109 |
130 |
set_debug(debug); |
set_debug(debug); |
|
131 |
|
set_out(out); |
110 |
132 |
|
|
111 |
133 |
if (verbose) |
if (verbose) |
112 |
134 |
fprintf(stderr, "Scanning for duplicates, min-size %lld\n", |
fprintf(stderr, "Scanning for duplicates, min-size %lld\n", |
|
... |
... |
int main(int argc, char *argv[]) |
130 |
152 |
dump_files(); |
dump_files(); |
131 |
153 |
|
|
132 |
154 |
if (verbose) |
if (verbose) |
133 |
|
fprintf(stderr, "Find duplicate files...\n"); |
|
|
155 |
|
fprintf(stderr, "[*] Find duplicate files...\n"); |
134 |
156 |
err = file_find_dups(); |
err = file_find_dups(); |
135 |
157 |
if (err != 0) { |
if (err != 0) { |
136 |
158 |
fprintf(stderr, "Error comparing files!\n"); |
fprintf(stderr, "Error comparing files!\n"); |
|
... |
... |
int main(int argc, char *argv[]) |
138 |
160 |
} |
} |
139 |
161 |
|
|
140 |
162 |
if (verbose) |
if (verbose) |
141 |
|
fprintf(stderr, "Find duplicate dirs...\n"); |
|
|
163 |
|
fprintf(stderr, "[*] Find duplicate dirs...\n"); |
142 |
164 |
err = dir_find_dups(); |
err = dir_find_dups(); |
143 |
165 |
if (err != 0) { |
if (err != 0) { |
144 |
166 |
fprintf(stderr, "Error comparing dirs!\n"); |
fprintf(stderr, "Error comparing dirs!\n"); |
|
... |
... |
int main(int argc, char *argv[]) |
148 |
170 |
if (debug) |
if (debug) |
149 |
171 |
dump_dirs(); |
dump_dirs(); |
150 |
172 |
|
|
151 |
|
dump_duplicates(min_size); |
|
|
173 |
|
dump_duplicates(min_size, zero); |
152 |
174 |
|
|
153 |
175 |
if (verbose) |
if (verbose) |
154 |
176 |
dump_stats(); |
dump_stats(); |
155 |
177 |
|
|
|
178 |
|
dev_ino_seen_clean(); |
|
179 |
|
|
156 |
180 |
return 0; |
return 0; |
157 |
181 |
} |
} |
File store.c changed (mode: 100644) (index 5d5b290..8cd312c) |
15 |
15 |
|
|
16 |
16 |
|
|
17 |
17 |
#define DEV_INO_HASH_SIZE 4096 |
#define DEV_INO_HASH_SIZE 4096 |
18 |
|
#define HASH_SIZE 512 |
|
|
18 |
|
#define HASH_SIZE 16384 |
19 |
19 |
#define MAX_INPUT_DIRS 32 |
#define MAX_INPUT_DIRS 32 |
20 |
20 |
#define MAX_DEPTH 1000 |
#define MAX_DEPTH 1000 |
21 |
21 |
|
|
|
... |
... |
static struct file_node *file_info[HASH_SIZE]; |
35 |
35 |
static struct dir_node *dir_info[MAX_INPUT_DIRS]; |
static struct dir_node *dir_info[MAX_INPUT_DIRS]; |
36 |
36 |
static unsigned int dir_info_count; |
static unsigned int dir_info_count; |
37 |
37 |
static struct dir_node *dir_current[MAX_DEPTH]; |
static struct dir_node *dir_current[MAX_DEPTH]; |
|
38 |
|
static struct dir_node **dir_chain; |
|
39 |
|
static unsigned long long dir_chain_len; |
38 |
40 |
static unsigned char sha1_zero[SHA_DIGEST_LENGTH]; |
static unsigned char sha1_zero[SHA_DIGEST_LENGTH]; |
39 |
41 |
static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE]; |
static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE]; |
40 |
42 |
static int debug = 0; |
static int debug = 0; |
|
43 |
|
static FILE *out; |
|
44 |
|
|
41 |
45 |
|
|
42 |
46 |
/* ############### Misc functions ############### */ |
/* ############### Misc functions ############### */ |
43 |
47 |
void set_debug(const unsigned int level) |
void set_debug(const unsigned int level) |
|
... |
... |
void set_debug(const unsigned int level) |
45 |
49 |
debug = level; |
debug = level; |
46 |
50 |
} |
} |
47 |
51 |
|
|
|
52 |
|
void set_out(FILE *f) |
|
53 |
|
{ |
|
54 |
|
out = f; |
|
55 |
|
} |
|
56 |
|
|
48 |
57 |
/* ############### Memory functions ############### */ |
/* ############### Memory functions ############### */ |
49 |
58 |
static void *xmalloc(size_t size) |
static void *xmalloc(size_t size) |
50 |
59 |
{ |
{ |
|
... |
... |
int dev_ino_seen(const unsigned int type, const dev_t dev, const ino_t ino) |
186 |
195 |
return 0; |
return 0; |
187 |
196 |
} |
} |
188 |
197 |
|
|
|
198 |
|
/* |
|
199 |
|
* Clean dev_ino_seen stuff |
|
200 |
|
*/ |
|
201 |
|
void dev_ino_seen_clean(void) |
|
202 |
|
{ |
|
203 |
|
unsigned int i; |
|
204 |
|
struct dev_ino *q, *next; |
|
205 |
|
|
|
206 |
|
for (i = 0; i < DEV_INO_HASH_SIZE; i++) { |
|
207 |
|
q = dev_ino_hash[i]; |
|
208 |
|
while (q) { |
|
209 |
|
next = q->next; |
|
210 |
|
free(q); |
|
211 |
|
q = next; |
|
212 |
|
} |
|
213 |
|
} |
|
214 |
|
} |
|
215 |
|
|
189 |
216 |
|
|
190 |
217 |
/* ############### Main functions ############### */ |
/* ############### Main functions ############### */ |
191 |
218 |
|
|
|
... |
... |
static void dir_mark_no_dup_possible(struct dir_node *d) |
477 |
504 |
if ((d == NULL) || (d->no_dup_possible == 1)) |
if ((d == NULL) || (d->no_dup_possible == 1)) |
478 |
505 |
return; |
return; |
479 |
506 |
|
|
|
507 |
|
if (debug) |
|
508 |
|
fprintf(stderr, "DEBUG: recursively up do dir_mark_no_dup_possible(%s)\n", d->name); |
|
509 |
|
|
480 |
510 |
d->no_dup_possible = 1; |
d->no_dup_possible = 1; |
481 |
511 |
dir_mark_no_dup_possible(d->parent); |
dir_mark_no_dup_possible(d->parent); |
482 |
512 |
} |
} |
|
... |
... |
static void dir_mark_do_not_dump(struct dir_node *d) |
490 |
520 |
struct file_node *file; |
struct file_node *file; |
491 |
521 |
struct dir_node *subdir; |
struct dir_node *subdir; |
492 |
522 |
|
|
493 |
|
if (debug) |
|
494 |
|
fprintf(stderr, "DEBUG: dir_mark_do_not_dump(%s)\n", d->name); |
|
495 |
523 |
if ((d == NULL) || (d->do_not_dump == 1)) |
if ((d == NULL) || (d->do_not_dump == 1)) |
496 |
524 |
return; |
return; |
497 |
525 |
|
|
|
526 |
|
if (debug) |
|
527 |
|
fprintf(stderr, "DEBUG: recursively dir_mark_do_not_dump(%s)\n", d->name); |
|
528 |
|
|
498 |
529 |
d->do_not_dump = 1; |
d->do_not_dump = 1; |
499 |
530 |
|
|
500 |
531 |
subdir = d->subdirs; |
subdir = d->subdirs; |
|
... |
... |
static void dir_mark_do_not_dump(struct dir_node *d) |
505 |
536 |
|
|
506 |
537 |
file = d->files; |
file = d->files; |
507 |
538 |
while (file) { |
while (file) { |
508 |
|
if (debug) |
|
509 |
|
fprintf(stderr, "\tSet do_not_dump=1 on [%s]\n", file->name); |
|
510 |
539 |
file->do_not_dump = 1; |
file->do_not_dump = 1; |
511 |
540 |
file = file->next; |
file = file->next; |
512 |
541 |
} |
} |
|
... |
... |
static void dir_mark_left(struct dir_node *d) |
523 |
552 |
if ((d == NULL) || (d->left == 1)) |
if ((d == NULL) || (d->left == 1)) |
524 |
553 |
return; |
return; |
525 |
554 |
|
|
|
555 |
|
if (debug) |
|
556 |
|
fprintf(stderr, "DEBUG: recursively dir_mark_left(%s)\n", d->name); |
|
557 |
|
|
526 |
558 |
d->left = 1; |
d->left = 1; |
527 |
559 |
|
|
528 |
560 |
subdir = d->subdirs; |
subdir = d->subdirs; |
|
... |
... |
static int compare_file_range(struct file_node *a, struct file_node *b) |
591 |
623 |
} |
} |
592 |
624 |
|
|
593 |
625 |
err = compare_files(p, q); |
err = compare_files(p, q); |
594 |
|
if (debug) |
|
595 |
|
fprintf(stderr, "COMPARING [%s] with [%s] = %d\n", p->name, q->name, err); |
|
596 |
626 |
if (err == -1) |
if (err == -1) |
597 |
627 |
return -1; |
return -1; |
598 |
628 |
|
|
|
... |
... |
int file_find_dups(void) |
688 |
718 |
} |
} |
689 |
719 |
|
|
690 |
720 |
if (debug) { |
if (debug) { |
691 |
|
fprintf(stderr, "Dump chain %u: ", hash); |
|
|
721 |
|
fprintf(stderr, "[*] Dump chain %u start:\n", hash); |
692 |
722 |
q = file_info[hash]; |
q = file_info[hash]; |
693 |
723 |
while (q) { |
while (q) { |
694 |
|
fprintf(stderr, "%s(", q->name); |
|
|
724 |
|
fprintf(stderr, "%s:\n", q->name); |
695 |
725 |
dups = q->duplicates; |
dups = q->duplicates; |
696 |
726 |
while(dups) { |
while(dups) { |
697 |
|
fprintf(stderr, " %s", dups->name); |
|
|
727 |
|
fprintf(stderr, "\t%s\n", dups->name); |
698 |
728 |
dups = dups->duplicates; |
dups = dups->duplicates; |
699 |
729 |
} |
} |
700 |
|
fprintf(stderr, ") -> "); |
|
701 |
730 |
q = q->hash_next; |
q = q->hash_next; |
702 |
731 |
} |
} |
703 |
|
fprintf(stderr, "\n"); |
|
|
732 |
|
fprintf(stderr, "[*] Dump chain %u stop\n", hash); |
704 |
733 |
} |
} |
705 |
734 |
} |
} |
706 |
735 |
|
|
|
... |
... |
static long long dir_build_hash(struct dir_node *d) |
793 |
822 |
fprintf(stderr, "DEBUG: %s [%s] no_dup_possible=%u\n", |
fprintf(stderr, "DEBUG: %s [%s] no_dup_possible=%u\n", |
794 |
823 |
__FUNCTION__, d->name, d->no_dup_possible); |
__FUNCTION__, d->name, d->no_dup_possible); |
795 |
824 |
|
|
|
825 |
|
/* empty dir? */ |
|
826 |
|
if ((d->files == NULL) && (d->subdirs == NULL)) { |
|
827 |
|
d->no_dup_possible = 1; |
|
828 |
|
return 0; |
|
829 |
|
} |
|
830 |
|
|
796 |
831 |
/* We check current dir first. */ |
/* We check current dir first. */ |
797 |
832 |
if (d->no_dup_possible == 0) |
if (d->no_dup_possible == 0) |
798 |
833 |
no_of_possible_dirs++; |
no_of_possible_dirs++; |
|
... |
... |
static long long dir_find_dups_populate_list(struct dir_node **u, |
892 |
927 |
/* |
/* |
893 |
928 |
* Finds dir duplicates (we are only marking here) |
* Finds dir duplicates (we are only marking here) |
894 |
929 |
* We have to sort files based on hash, to match |
* We have to sort files based on hash, to match |
895 |
|
* We ignore 000 hashes (dirs), because that files are single. |
|
|
930 |
|
* We ignore 000 hashes (dirs), because they contain files that are single. |
896 |
931 |
* TODO: the name does not reflect what the function does. |
* TODO: the name does not reflect what the function does. |
897 |
932 |
*/ |
*/ |
898 |
933 |
int dir_find_dups(void) |
int dir_find_dups(void) |
899 |
934 |
{ |
{ |
900 |
|
long long i, j, first, last, where; |
|
|
935 |
|
unsigned long long i, j, first, last, where; |
901 |
936 |
int final_step; |
int final_step; |
902 |
937 |
struct dir_node *d; |
struct dir_node *d; |
903 |
938 |
unsigned long long mem; |
unsigned long long mem; |
904 |
|
long long err, no_of_possible_dirs = 0; |
|
905 |
|
struct dir_node **u; |
|
|
939 |
|
long long err; |
906 |
940 |
char dump[SHA_DIGEST_LENGTH * 2 + 1]; |
char dump[SHA_DIGEST_LENGTH * 2 + 1]; |
907 |
941 |
|
|
|
942 |
|
dir_chain_len = 0; |
|
943 |
|
|
|
944 |
|
if (debug) |
|
945 |
|
fprintf(stderr, "[*] dir_find_dups...\n"); |
|
946 |
|
|
908 |
947 |
for (i = 0; i < dir_info_count; i++) { |
for (i = 0; i < dir_info_count; i++) { |
909 |
948 |
err = dir_build_hash(dir_info[i]); |
err = dir_build_hash(dir_info[i]); |
910 |
949 |
if (err == -1) |
if (err == -1) |
911 |
950 |
return -1; |
return -1; |
912 |
951 |
|
|
913 |
|
no_of_possible_dirs += err; |
|
|
952 |
|
dir_chain_len += err; |
914 |
953 |
} |
} |
915 |
954 |
|
|
916 |
955 |
/* Allocate an array that we will pass to qsort */ |
/* Allocate an array that we will pass to qsort */ |
917 |
|
mem = no_of_possible_dirs * sizeof(struct dir_node *); |
|
918 |
|
u = (struct dir_node **) xmalloc(mem); |
|
919 |
|
if (u == NULL) { |
|
|
956 |
|
mem = dir_chain_len * sizeof(struct dir_node *); |
|
957 |
|
dir_chain = (struct dir_node **) xmalloc(mem); |
|
958 |
|
if (dir_chain == NULL) { |
920 |
959 |
fprintf(stderr, "Cannot alloc mem for dir list (%llu bytes)!\n", |
fprintf(stderr, "Cannot alloc mem for dir list (%llu bytes)!\n", |
921 |
960 |
mem); |
mem); |
922 |
961 |
return -1; |
return -1; |
|
... |
... |
int dir_find_dups(void) |
925 |
964 |
j = 0; |
j = 0; |
926 |
965 |
for (i = 0; i < dir_info_count; i++) { |
for (i = 0; i < dir_info_count; i++) { |
927 |
966 |
d = dir_info[i]; |
d = dir_info[i]; |
928 |
|
if (debug) |
|
929 |
|
fprintf(stderr, "dir_find_dups[i=%llu, j=%lld] [%s]\n", i, j, d->name); |
|
930 |
967 |
|
|
931 |
|
j += dir_find_dups_populate_list(u, j, d); |
|
|
968 |
|
j += dir_find_dups_populate_list(dir_chain, j, d); |
932 |
969 |
|
|
933 |
970 |
/* stop searching if we found all possible dirs */ |
/* stop searching if we found all possible dirs */ |
934 |
|
if (j == no_of_possible_dirs) |
|
|
971 |
|
if (j == dir_chain_len) |
935 |
972 |
break; |
break; |
936 |
973 |
} |
} |
937 |
974 |
|
|
938 |
975 |
if (debug) { |
if (debug) { |
939 |
976 |
fprintf(stderr, "dir u (j=%lld): ", j); |
fprintf(stderr, "dir u (j=%lld): ", j); |
940 |
|
for (i = 0; i < no_of_possible_dirs; i++) |
|
941 |
|
fprintf(stderr, "[%lld]=%s ", i, u[i]->name); |
|
|
977 |
|
for (i = 0; i < dir_chain_len; i++) |
|
978 |
|
fprintf(stderr, "[%lld]=%s ", i, dir_chain[i]->name); |
942 |
979 |
fprintf(stderr, "\n"); |
fprintf(stderr, "\n"); |
943 |
980 |
} |
} |
944 |
981 |
|
|
945 |
|
/* Order by hash */ |
|
946 |
|
qsort(u, no_of_possible_dirs, sizeof(struct dir_node *), dir_compare_hashes); |
|
|
982 |
|
/* Sort by hash */ |
|
983 |
|
qsort(dir_chain, dir_chain_len, sizeof(struct dir_node *), dir_compare_hashes); |
947 |
984 |
|
|
948 |
985 |
if (debug) { |
if (debug) { |
949 |
|
fprintf(stderr, "DEBUG: dump after dir qsort [%s]\n", d->name); |
|
950 |
|
for (i = 0; i < no_of_possible_dirs; i++) { |
|
951 |
|
sha1_dump(dump, u[i]->sha1, 0); |
|
952 |
|
fprintf(stderr, "DEBUG: %s\t%u\t%s\n", dump, u[i]->level, u[i]->name); |
|
|
986 |
|
fprintf(stderr, "DEBUG: dump after dir qsort:\n"); |
|
987 |
|
for (i = 0; i < dir_chain_len; i++) { |
|
988 |
|
sha1_dump(dump, dir_chain[i]->sha1, 0); |
|
989 |
|
fprintf(stderr, "DEBUG: %s\tlevel %u\t%s\n", |
|
990 |
|
dump, dir_chain[i]->level, dir_chain[i]->name); |
953 |
991 |
} |
} |
954 |
992 |
} |
} |
955 |
993 |
|
|
956 |
994 |
first = 0; |
first = 0; |
957 |
995 |
last = 0; |
last = 0; |
958 |
996 |
where = 0; |
where = 0; |
959 |
|
for (i = 1; i < no_of_possible_dirs; i++) { |
|
960 |
|
if (memcmp(u[first]->sha1, u[i]->sha1, SHA_DIGEST_LENGTH) == 0) { |
|
|
997 |
|
final_step = 1; |
|
998 |
|
for (i = 1; i < dir_chain_len; i++) { |
|
999 |
|
if (memcmp(dir_chain[first]->sha1, dir_chain[i]->sha1, SHA_DIGEST_LENGTH) == 0) { |
961 |
1000 |
/* We have the same hash */ |
/* We have the same hash */ |
962 |
1001 |
dup_no_of_dirs++; |
dup_no_of_dirs++; |
963 |
1002 |
last = i; |
last = i; |
964 |
|
if (u[last]->level < u[where]->level) |
|
|
1003 |
|
if (dir_chain[last]->level < dir_chain[where]->level) |
965 |
1004 |
where = last; |
where = last; |
966 |
1005 |
continue; |
continue; |
967 |
1006 |
} |
} |
968 |
1007 |
|
|
969 |
1008 |
/* We have same hash in first..last */ |
/* We have same hash in first..last */ |
970 |
1009 |
final_step = 0; |
final_step = 0; |
971 |
|
dir_process_range(u, first, last, where); |
|
|
1010 |
|
dir_process_range(dir_chain, first, last, where); |
972 |
1011 |
|
|
973 |
1012 |
/* Switch to next range */ |
/* Switch to next range */ |
974 |
1013 |
first = i; |
first = i; |
|
... |
... |
int dir_find_dups(void) |
980 |
1019 |
|
|
981 |
1020 |
/* TODO: shoudn't we do the same for files? */ |
/* TODO: shoudn't we do the same for files? */ |
982 |
1021 |
if (final_step == 1) |
if (final_step == 1) |
983 |
|
dir_process_range(u, first, last, where); |
|
984 |
|
|
|
985 |
|
free(u); |
|
|
1022 |
|
dir_process_range(dir_chain, first, last, where); |
986 |
1023 |
|
|
987 |
1024 |
return 0; |
return 0; |
988 |
1025 |
} |
} |
|
... |
... |
int dir_find_dups(void) |
990 |
1027 |
/* |
/* |
991 |
1028 |
* Nice dumps the duplicated dirs |
* Nice dumps the duplicated dirs |
992 |
1029 |
*/ |
*/ |
993 |
|
void dir_dump_duplicates(struct dir_node *d) |
|
|
1030 |
|
void dir_dump_duplicates(struct dir_node *d, const unsigned int zero) |
994 |
1031 |
{ |
{ |
995 |
1032 |
struct dir_node *p; |
struct dir_node *p; |
|
1033 |
|
char sep, final; |
996 |
1034 |
|
|
997 |
|
if (d->no_dup_possible == 1) |
|
|
1035 |
|
if (debug) |
|
1036 |
|
fprintf(stderr, "[*] dir_dump_duplicates(%s)\n", d->name); |
|
1037 |
|
|
|
1038 |
|
if (d->no_dup_possible == 1) { |
|
1039 |
|
if (debug) |
|
1040 |
|
fprintf(stderr, "\tignore duplicate dir because no_dup_possible\n "); |
998 |
1041 |
return; |
return; |
|
1042 |
|
} |
999 |
1043 |
|
|
1000 |
|
if (d->do_not_dump == 1) |
|
|
1044 |
|
if (d->do_not_dump == 1) { |
|
1045 |
|
if (debug) |
|
1046 |
|
fprintf(stderr, "\tignore duplicate dir because no_dup_dump\n "); |
1001 |
1047 |
return; |
return; |
|
1048 |
|
} |
1002 |
1049 |
|
|
1003 |
|
if (d->hash_next == NULL) |
|
|
1050 |
|
if (d->hash_next == NULL) { |
|
1051 |
|
if (debug) |
|
1052 |
|
fprintf(stderr, "\tignore duplicate dir because hash_next\n "); |
1004 |
1053 |
return; |
return; |
|
1054 |
|
} |
|
1055 |
|
|
|
1056 |
|
if (zero) { |
|
1057 |
|
sep = '\0'; |
|
1058 |
|
final = '\0'; |
|
1059 |
|
} else { |
|
1060 |
|
sep = '\t'; |
|
1061 |
|
final = '\n'; |
|
1062 |
|
} |
1005 |
1063 |
|
|
1006 |
1064 |
p = d->hash_next; |
p = d->hash_next; |
1007 |
1065 |
while (p) { |
while (p) { |
|
... |
... |
void dir_dump_duplicates(struct dir_node *d) |
1010 |
1068 |
* We already dumped that dir on the left/right side. |
* We already dumped that dir on the left/right side. |
1011 |
1069 |
* makes no sense to dump it again on the right side! |
* makes no sense to dump it again on the right side! |
1012 |
1070 |
*/ |
*/ |
|
1071 |
|
if (debug) |
|
1072 |
|
fprintf(stderr, "DEBUG: ignore dir [%s] because" |
|
1073 |
|
"left=%u and/or do_not_dump=%u\n", |
|
1074 |
|
p->name, p->left, p->do_not_dump); |
1013 |
1075 |
p = p->hash_next; |
p = p->hash_next; |
1014 |
1076 |
continue; |
continue; |
1015 |
1077 |
} |
} |
1016 |
1078 |
|
|
|
1079 |
|
if (debug) |
|
1080 |
|
fprintf(stderr, "DEBUG: Found a right dir for [%s]: %s\n", d->name, p->name); |
|
1081 |
|
|
1017 |
1082 |
dir_mark_left(d); |
dir_mark_left(d); |
1018 |
1083 |
if (debug) |
if (debug) |
1019 |
1084 |
fprintf(stderr, "dir_dump_duplicates: set do_not_dump on left [%s]\n", d->name); |
fprintf(stderr, "dir_dump_duplicates: set do_not_dump on left [%s]\n", d->name); |
|
... |
... |
void dir_dump_duplicates(struct dir_node *d) |
1023 |
1088 |
fprintf(stderr, "dir_dump_duplicates: set do_not_dump on right [%s]\n", p->name); |
fprintf(stderr, "dir_dump_duplicates: set do_not_dump on right [%s]\n", p->name); |
1024 |
1089 |
dir_mark_do_not_dump(p); |
dir_mark_do_not_dump(p); |
1025 |
1090 |
|
|
1026 |
|
printf("DIR\t%s\t%s\n", |
|
1027 |
|
d->name, p->name); |
|
|
1091 |
|
if (debug) |
|
1092 |
|
fprintf(stderr, "DIR%c%s%c%s%c", |
|
1093 |
|
sep, d->name, sep, p->name, final); |
|
1094 |
|
fprintf(out, "DIR%c%s%c%s%c", |
|
1095 |
|
sep, d->name, sep, p->name, final); |
1028 |
1096 |
p = p->hash_next; |
p = p->hash_next; |
1029 |
1097 |
} |
} |
1030 |
1098 |
} |
} |
|
... |
... |
void dir_dump_duplicates(struct dir_node *d) |
1033 |
1101 |
* Nice dumps the duplicated files |
* Nice dumps the duplicated files |
1034 |
1102 |
*/ |
*/ |
1035 |
1103 |
void file_dump_duplicates(struct file_node *f, |
void file_dump_duplicates(struct file_node *f, |
1036 |
|
const unsigned long long min_size) |
|
|
1104 |
|
const unsigned long long min_size, const unsigned int zero) |
1037 |
1105 |
{ |
{ |
1038 |
1106 |
struct file_node *p, *first_left; |
struct file_node *p, *first_left; |
|
1107 |
|
char sep, final; |
1039 |
1108 |
|
|
1040 |
1109 |
if (debug) |
if (debug) |
1041 |
|
file_dump_node(f, 1); |
|
|
1110 |
|
fprintf(stderr, "[*] file_dump_duplicates(%s)\n", f->name); |
1042 |
1111 |
|
|
1043 |
1112 |
if (f->duplicates == NULL) { |
if (f->duplicates == NULL) { |
1044 |
1113 |
if (debug) |
if (debug) |
|
... |
... |
void file_dump_duplicates(struct file_node *f, |
1079 |
1148 |
if (debug) |
if (debug) |
1080 |
1149 |
fprintf(stderr, "first_left = [%s]\n", first_left->name); |
fprintf(stderr, "first_left = [%s]\n", first_left->name); |
1081 |
1150 |
|
|
|
1151 |
|
if (zero) { |
|
1152 |
|
sep = '\0'; |
|
1153 |
|
final = '\0'; |
|
1154 |
|
} else { |
|
1155 |
|
sep = '\t'; |
|
1156 |
|
final = '\n'; |
|
1157 |
|
} |
|
1158 |
|
|
1082 |
1159 |
/* now, dump */ |
/* now, dump */ |
1083 |
1160 |
p = f; |
p = f; |
1084 |
1161 |
while (p) { |
while (p) { |
|
... |
... |
void file_dump_duplicates(struct file_node *f, |
1108 |
1185 |
fprintf(stderr, "Because [%s] is a right, set do_not_dump=1\n", p->name); |
fprintf(stderr, "Because [%s] is a right, set do_not_dump=1\n", p->name); |
1109 |
1186 |
p->do_not_dump = 1; |
p->do_not_dump = 1; |
1110 |
1187 |
|
|
1111 |
|
printf("FILE\t%s\t%s\n", |
|
1112 |
|
first_left->name, p->name); |
|
|
1188 |
|
if (debug) |
|
1189 |
|
fprintf(stderr, "FILE%c%s%c%s%c", |
|
1190 |
|
sep, first_left->name, sep, p->name, final); |
|
1191 |
|
fprintf(out, "FILE%c%s%c%s%c", |
|
1192 |
|
sep, first_left->name, sep, p->name, final); |
1113 |
1193 |
p = p->duplicates; |
p = p->duplicates; |
1114 |
1194 |
} |
} |
1115 |
1195 |
} |
} |
|
... |
... |
void file_dump_duplicates(struct file_node *f, |
1118 |
1198 |
* Searches all tree for duplicates |
* Searches all tree for duplicates |
1119 |
1199 |
* @min_size - do not dump files shorter than min_size |
* @min_size - do not dump files shorter than min_size |
1120 |
1200 |
*/ |
*/ |
1121 |
|
void dump_duplicates(const unsigned long long min_size) |
|
|
1201 |
|
void dump_duplicates(const unsigned long long min_size, const unsigned int zero) |
1122 |
1202 |
{ |
{ |
1123 |
1203 |
unsigned int i; |
unsigned int i; |
1124 |
|
struct dir_node *d, *subdir; |
|
|
1204 |
|
struct dir_node *d; |
1125 |
1205 |
struct file_node *f; |
struct file_node *f; |
1126 |
1206 |
unsigned int hash; |
unsigned int hash; |
1127 |
1207 |
|
|
1128 |
|
for (i = 0; i < dir_info_count; i++) { |
|
|
1208 |
|
if (debug) |
|
1209 |
|
fprintf(stderr, "[*] Dump duplicated dirs...\n"); |
|
1210 |
|
for (i = 0; i < dir_chain_len; i++) { |
1129 |
1211 |
if (debug) |
if (debug) |
1130 |
1212 |
fprintf(stderr, "\tdump_duplicates[%u]...\n", i); |
fprintf(stderr, "\tdump_duplicates[%u]...\n", i); |
1131 |
|
d = dir_info[i]; |
|
1132 |
|
dir_dump_duplicates(d); |
|
1133 |
|
|
|
1134 |
|
subdir = d->subdirs; |
|
1135 |
|
while (subdir) { |
|
1136 |
|
dir_dump_duplicates(subdir); |
|
1137 |
|
subdir = subdir->next_sibling; |
|
1138 |
|
} |
|
|
1213 |
|
d = dir_chain[i]; |
|
1214 |
|
dir_dump_duplicates(d, zero); |
1139 |
1215 |
} |
} |
|
1216 |
|
free(dir_chain); |
1140 |
1217 |
|
|
1141 |
1218 |
/* Now, we dump remaining files */ |
/* Now, we dump remaining files */ |
1142 |
1219 |
if (debug) |
if (debug) |
1143 |
|
fprintf(stderr, "DEBUG: Dump duplicated files...\n"); |
|
|
1220 |
|
fprintf(stderr, "[*] Dump duplicated files...\n"); |
1144 |
1221 |
for (hash = 0; hash < HASH_SIZE; hash++) { |
for (hash = 0; hash < HASH_SIZE; hash++) { |
1145 |
1222 |
if (file_info[hash] == NULL) |
if (file_info[hash] == NULL) |
1146 |
1223 |
continue; |
continue; |
1147 |
1224 |
|
|
1148 |
1225 |
if (debug) |
if (debug) |
1149 |
|
fprintf(stderr, "Dump duplicates in hash %u\n", hash); |
|
|
1226 |
|
fprintf(stderr, "[*] Dump duplicates in hash %u\n", hash); |
1150 |
1227 |
|
|
1151 |
1228 |
f = file_info[hash]; |
f = file_info[hash]; |
1152 |
1229 |
while (f) { |
while (f) { |
1153 |
|
file_dump_duplicates(f, min_size); |
|
|
1230 |
|
file_dump_duplicates(f, min_size, zero); |
1154 |
1231 |
f = f->hash_next; |
f = f->hash_next; |
1155 |
1232 |
} |
} |
1156 |
1233 |
} |
} |