List of commits:
Subject Hash Author Date (UTC)
Lots of stuff 9c43842ac36feff6b29cb20a95ad4510a23bf472 Catalin(ux) M. BOIE 2012-07-20 15:43:51
Cosmetic + man a53df11bfc30c152c9f61fdb1bcc69dc6ec20765 Catalin(ux) M. BOIE 2012-06-24 12:11:17
First working version\! 27bd1bf47c9fb707760d84ea3cf4241083fa283d Catalin(ux) M. BOIE 2012-06-22 21:10:30
Several fixes 2909b1ba2e99929e775ddfea5f4894c50694a638 Catalin(ux) M. BOIE 2012-06-19 13:08:50
First version 3d7935d9b8a91694fe8213998ce4d3910348d6ef Catalin(ux) M. BOIE 2012-05-06 19:40:40
Commit 9c43842ac36feff6b29cb20a95ad4510a23bf472 - Lots of stuff
Author: Catalin(ux) M. BOIE
Author date (UTC): 2012-07-20 15:43
Committer name: Catalin(ux) M. BOIE
Committer date (UTC): 2012-07-20 15:43
Parent(s): a53df11bfc30c152c9f61fdb1bcc69dc6ec20765
Signing key:
Tree: 8340045c5e913a9b00cea166da9cba8bdc546a83
File Lines added Lines deleted
.exclude 3 0
.gitignore 5 1
Makefile.in 2 0
TODO 7 0
duilder 7 0
dupdump.1 7 1
dupdump.c 28 4
dupdump.spec.in 2 2
store.c 140 63
store.h 4 1
tests/1/expected 6 0
tests/1/in/a1 1 0
tests/1/in/a2 1 0
tests/1/in/a3 1 0
tests/1/in/b1 1 0
tests/1/in/b2 1 0
tests/1/in/c1 1 0
tests/1/in/dir_a1/a4 1 0
tests/1/in/dir_a1/a5 1 0
tests/1/in/dir_b1/b3 1 0
tests/1/in/x/dir_a2/a6 1 0
tests/1/in/x/dir_a2/a7 1 0
tests/2/expected 1 0
tests/2/in/d1/a 1 0
tests/2/in/d1/b 1 0
tests/2/in/d2/c 1 0
tests/2/in/d2/d 1 0
tests/3/expected 1 0
tests/3/in/dir_a1/a1 1 0
tests/3/in/dir_a1/b1 1 0
tests/3/in/dir_a2/a1x 1 0
tests/3/in/dir_a2/b1x 1 0
File .exclude added (mode: 100644) (index 0000000..835ac7f)
1 my1.sh
2 *.gz
3 *.out
File .gitignore changed (mode: 100644) (index 7b8473d..fe7c3de)
2 2 dupdump dupdump
3 3 vgcore* vgcore*
4 4 Makefile Makefile
5 *.spec
5 *.spec
6 Changelog
7 *.gz
8 my1.sh
9 *.out
File Makefile.in changed (mode: 100644) (index 18914b6..f61f7f7)
... ... clean:
21 21 install: all install: all
22 22 @mkdir -p $(I_USR_BIN) @mkdir -p $(I_USR_BIN)
23 23 cp -vd --no-clobber dupdump $(I_USR_BIN)/ cp -vd --no-clobber dupdump $(I_USR_BIN)/
24 @mkdir -p $(I_MAN)/man1
25 cp -vd --no-clobber dupdump.1 $(I_MAN)/man1/
File TODO changed (mode: 100644) (index 6cdf3ea..fee0bc8)
12 12 [ ] We could throw away unique files. [ ] We could throw away unique files.
13 13 [ ] Comparing in O(N*N) sucks! [ ] Comparing in O(N*N) sucks!
14 14 [ ] Install man. [ ] Install man.
15 [ ] Dump in stats also the max memory used.
16 [ ] Dump two types of dirs: DIR AND DIRFNC (File Names Changed).
17 Maybe also for files
18
19
20 [ ] Use a cache, specified by command line.
21 [ ]
File duilder changed (mode: 100755) (index a6e2825..dfa8a6a)
... ... function duilder_tar()
218 218 if [ ! -z "${EXCLUDE}" ]; then if [ ! -z "${EXCLUDE}" ]; then
219 219 ADD_EXCLUDE="--exclude-from ${P}/${EXCLUDE}" ADD_EXCLUDE="--exclude-from ${P}/${EXCLUDE}"
220 220 fi fi
221 echo "ADD_EXCLUDE=${ADD_EXCLUDE}"
221 222
222 223 (cd .. \ (cd .. \
223 224 && rm -rf "${P}" \ && rm -rf "${P}" \
 
... ... USR_SHARE_DOC="/usr/share/doc/${PRJ}-${VER}"
326 327 SBIN="/usr/sbin" SBIN="/usr/sbin"
327 328 VAR="/var" VAR="/var"
328 329 VAR_LIB="/var/lib" VAR_LIB="/var/lib"
330 MAN="/usr/share/man"
329 331
330 332 while [ "${1}" != "" ]; do while [ "${1}" != "" ]; do
331 333 VAR="`echo ${1} | cut -d'=' -f1`" VAR="`echo ${1} | cut -d'=' -f1`"
 
... ... while [ "${1}" != "" ]; do
355 357 --datadir) --datadir)
356 358 USR_SHARE="${VAL}" USR_SHARE="${VAL}"
357 359 ;; ;;
360 --mandir)
361 MAN="${VAL}"
362 ;;
358 363 esac esac
359 364 shift shift
360 365 done done
 
... ... echo "s#@USR_INC@#${USR_INCLUDE}#g" >> tmp.sed
471 476 echo "s#@USR_LIB@#${USR_LIB}#g" >> tmp.sed echo "s#@USR_LIB@#${USR_LIB}#g" >> tmp.sed
472 477 echo "s#@USR_SHARE@#${USR_SHARE}#g" >> tmp.sed echo "s#@USR_SHARE@#${USR_SHARE}#g" >> tmp.sed
473 478 echo "s#@USR_SHARE_DOC@#${USR_SHARE_DOC}#g" >> tmp.sed echo "s#@USR_SHARE_DOC@#${USR_SHARE_DOC}#g" >> tmp.sed
479 echo "s#@MAN#${MAN}#g" >> tmp.sed
474 480 # Export stuff # Export stuff
475 481 echo "s#@EXPORT_PATH@#${EXPORT_PATH}#g" >> tmp.sed echo "s#@EXPORT_PATH@#${EXPORT_PATH}#g" >> tmp.sed
476 482
 
... ... if [ -r Makefile.in ]; then
500 506 echo "export I_VAR_LIB := \$(DESTDIR)${VAR_LIB}" >> Makefile echo "export I_VAR_LIB := \$(DESTDIR)${VAR_LIB}" >> Makefile
501 507 echo "export I_VAR_LOG := \$(DESTDIR)${VAR_LOG}" >> Makefile echo "export I_VAR_LOG := \$(DESTDIR)${VAR_LOG}" >> Makefile
502 508 echo "export I_VAR_RUN := \$(DESTDIR)${VAR_RUN}" >> Makefile echo "export I_VAR_RUN := \$(DESTDIR)${VAR_RUN}" >> Makefile
509 echo "export I_MAN := \$(DESTDIR)${MAN}" >> Makefile
503 510 echo >> Makefile echo >> Makefile
504 511 echo "# DB stuff" >> Makefile echo "# DB stuff" >> Makefile
505 512 echo "export DB_SUPPORT := ${DB_SUPPORT}" >> Makefile echo "export DB_SUPPORT := ${DB_SUPPORT}" >> Makefile
File dupdump.1 changed (mode: 100644) (index 4054a0a..e7aed35)
... ... matches.
19 19
20 20 .SH OPTIONS .SH OPTIONS
21 21 .TP .TP
22 .B -z --zero
23 use \0 as fields and records separator instead of \\t and \\n
24 .TP
22 25 .B -i --min-size .B -i --min-size
23 Do not dump files under specified size. Still, they are taken into account
26 do not dump files under specified size. Still, they are taken into account
24 27 when a dir match is possible when a dir match is possible
25 28 .TP .TP
29 .B -o --out
30 specify where to store the list of duplicates (default stdout)
31 .TP
26 32 .B -v --verbose: .B -v --verbose:
27 33 be more verbose be more verbose
28 34 .TP .TP
File dupdump.c changed (mode: 100644) (index b293669..db5d9f6)
10 10 #include <string.h> #include <string.h>
11 11 #include <errno.h> #include <errno.h>
12 12 #include <getopt.h> #include <getopt.h>
13 #include <malloc.h>
13 14
14 15 #include "store.h" #include "store.h"
15 16
16 17 static off_t min_size = 0; static off_t min_size = 0;
17 18 static int verbose = 0; static int verbose = 0;
18 19 static int debug = 0; static int debug = 0;
20 static int zero = 0;
21 static char *out_file;
19 22
20 23 static struct option options[] = static struct option options[] =
21 24 { {
25 {"zero", no_argument, NULL, 'z'},
22 26 {"min-size", required_argument, NULL, 'i'}, {"min-size", required_argument, NULL, 'i'},
27 {"out", required_argument, NULL, 'o'},
23 28 {"verbose", no_argument, NULL, 'v'}, {"verbose", no_argument, NULL, 'v'},
24 29 {"debug", no_argument, NULL, 'd'}, {"debug", no_argument, NULL, 'd'},
25 30 {NULL, 0, NULL, 0} {NULL, 0, NULL, 0}
 
... ... static struct option options[] =
28 33 static void usage(void) static void usage(void)
29 34 { {
30 35 fprintf(stderr, "Usage [options] <dir1> [<dir2>] ...\n" fprintf(stderr, "Usage [options] <dir1> [<dir2>] ...\n"
36 " --zero -z Use \\0 to separate columns\n"
31 37 " --min-size -i Ignore files under this size (default 1)\n" " --min-size -i Ignore files under this size (default 1)\n"
38 " --out -i Where to store results (default stdout)\n"
32 39 " --verbose -v Be more verbose\n" " --verbose -v Be more verbose\n"
33 40 " --debug -d Print debug information\n" " --debug -d Print debug information\n"
34 41 ); );
 
... ... int main(int argc, char *argv[])
85 92 int err; int err;
86 93 int options_index = 0; int options_index = 0;
87 94 char c; char c;
95 FILE *out;
88 96
89 while ((c = getopt_long(argc, argv, "i:vdh", options, &options_index)) != -1) {
97 while ((c = getopt_long(argc, argv, "zi:o:vdh", options, &options_index)) != -1) {
90 98 switch (c) { switch (c) {
99 case 'z': zero = 1; break;
91 100 case 'i': min_size = strtoul(optarg, NULL, 10); break; case 'i': min_size = strtoul(optarg, NULL, 10); break;
101 case 'o': out_file = optarg; break;
92 102 case 'v': verbose = 1; break; case 'v': verbose = 1; break;
93 103 case 'd': debug = 1; break; case 'd': debug = 1; break;
94 104 default: default:
 
... ... int main(int argc, char *argv[])
97 107 } }
98 108 } }
99 109
110 if (out_file == NULL) {
111 out = stdout;
112 } else {
113 out = fopen(out_file, "w");
114 if (out == NULL) {
115 fprintf(stderr, "Cannot open results file (%s)!\n",
116 strerror(errno));
117 return 1;
118 }
119 }
120
100 121 flags |= FTW_PHYS; /* Do not follow symlinks */ flags |= FTW_PHYS; /* Do not follow symlinks */
101 122 flags |= FTW_ACTIONRETVAL; /* To skip hierarchies */ flags |= FTW_ACTIONRETVAL; /* To skip hierarchies */
102 123
 
... ... int main(int argc, char *argv[])
107 128 } }
108 129
109 130 set_debug(debug); set_debug(debug);
131 set_out(out);
110 132
111 133 if (verbose) if (verbose)
112 134 fprintf(stderr, "Scanning for duplicates, min-size %lld\n", fprintf(stderr, "Scanning for duplicates, min-size %lld\n",
 
... ... int main(int argc, char *argv[])
130 152 dump_files(); dump_files();
131 153
132 154 if (verbose) if (verbose)
133 fprintf(stderr, "Find duplicate files...\n");
155 fprintf(stderr, "[*] Find duplicate files...\n");
134 156 err = file_find_dups(); err = file_find_dups();
135 157 if (err != 0) { if (err != 0) {
136 158 fprintf(stderr, "Error comparing files!\n"); fprintf(stderr, "Error comparing files!\n");
 
... ... int main(int argc, char *argv[])
138 160 } }
139 161
140 162 if (verbose) if (verbose)
141 fprintf(stderr, "Find duplicate dirs...\n");
163 fprintf(stderr, "[*] Find duplicate dirs...\n");
142 164 err = dir_find_dups(); err = dir_find_dups();
143 165 if (err != 0) { if (err != 0) {
144 166 fprintf(stderr, "Error comparing dirs!\n"); fprintf(stderr, "Error comparing dirs!\n");
 
... ... int main(int argc, char *argv[])
148 170 if (debug) if (debug)
149 171 dump_dirs(); dump_dirs();
150 172
151 dump_duplicates(min_size);
173 dump_duplicates(min_size, zero);
152 174
153 175 if (verbose) if (verbose)
154 176 dump_stats(); dump_stats();
155 177
178 dev_ino_seen_clean();
179
156 180 return 0; return 0;
157 181 } }
File dupdump.spec.in changed (mode: 100644) (index bfc547e..bb53a32)
... ... Group: Development/Tools
7 7 Source: http://kernel.embedromix.ro/us/%{name}/%{name}-%{version}.tar.gz Source: http://kernel.embedromix.ro/us/%{name}/%{name}-%{version}.tar.gz
8 8 URL: http://kernel.embedromix.ro/us/ URL: http://kernel.embedromix.ro/us/
9 9 BuildRoot: %{_tmppath}/%{name}-%{version}-buildroot BuildRoot: %{_tmppath}/%{name}-%{version}-buildroot
10 BuildArch: noarch
11 Biuld-require: openssl-devel, gcc
10 BuildRequires: openssl-devel, gcc
12 11 Requires: openssl Requires: openssl
13 12
14 13
 
... ... rm -rf ${RPM_BUILD_ROOT}
40 39 %attr (-,root,root) %attr (-,root,root)
41 40 %doc README LICENSE Changelog TODO %doc README LICENSE Changelog TODO
42 41 /usr/bin/dupdump /usr/bin/dupdump
42 %{_mandir}/man1/%{name}.1.gz
43 43
44 44 %changelog %changelog
45 45 * Sun May 6 2012 Catalin(ux) M. BOIE <catab at embedromix dot ro> 0.1 * Sun May 6 2012 Catalin(ux) M. BOIE <catab at embedromix dot ro> 0.1
File store.c changed (mode: 100644) (index 5d5b290..8cd312c)
15 15
16 16
17 17 #define DEV_INO_HASH_SIZE 4096 #define DEV_INO_HASH_SIZE 4096
18 #define HASH_SIZE 512
18 #define HASH_SIZE 16384
19 19 #define MAX_INPUT_DIRS 32 #define MAX_INPUT_DIRS 32
20 20 #define MAX_DEPTH 1000 #define MAX_DEPTH 1000
21 21
 
... ... static struct file_node *file_info[HASH_SIZE];
35 35 static struct dir_node *dir_info[MAX_INPUT_DIRS]; static struct dir_node *dir_info[MAX_INPUT_DIRS];
36 36 static unsigned int dir_info_count; static unsigned int dir_info_count;
37 37 static struct dir_node *dir_current[MAX_DEPTH]; static struct dir_node *dir_current[MAX_DEPTH];
38 static struct dir_node **dir_chain;
39 static unsigned long long dir_chain_len;
38 40 static unsigned char sha1_zero[SHA_DIGEST_LENGTH]; static unsigned char sha1_zero[SHA_DIGEST_LENGTH];
39 41 static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE]; static struct dev_ino *dev_ino_hash[DEV_INO_HASH_SIZE];
40 42 static int debug = 0; static int debug = 0;
43 static FILE *out;
44
41 45
42 46 /* ############### Misc functions ############### */ /* ############### Misc functions ############### */
43 47 void set_debug(const unsigned int level) void set_debug(const unsigned int level)
 
... ... void set_debug(const unsigned int level)
45 49 debug = level; debug = level;
46 50 } }
47 51
52 void set_out(FILE *f)
53 {
54 out = f;
55 }
56
48 57 /* ############### Memory functions ############### */ /* ############### Memory functions ############### */
49 58 static void *xmalloc(size_t size) static void *xmalloc(size_t size)
50 59 { {
 
... ... int dev_ino_seen(const unsigned int type, const dev_t dev, const ino_t ino)
186 195 return 0; return 0;
187 196 } }
188 197
198 /*
199 * Clean dev_ino_seen stuff
200 */
201 void dev_ino_seen_clean(void)
202 {
203 unsigned int i;
204 struct dev_ino *q, *next;
205
206 for (i = 0; i < DEV_INO_HASH_SIZE; i++) {
207 q = dev_ino_hash[i];
208 while (q) {
209 next = q->next;
210 free(q);
211 q = next;
212 }
213 }
214 }
215
189 216
190 217 /* ############### Main functions ############### */ /* ############### Main functions ############### */
191 218
 
... ... static void dir_mark_no_dup_possible(struct dir_node *d)
477 504 if ((d == NULL) || (d->no_dup_possible == 1)) if ((d == NULL) || (d->no_dup_possible == 1))
478 505 return; return;
479 506
507 if (debug)
508 fprintf(stderr, "DEBUG: recursively up do dir_mark_no_dup_possible(%s)\n", d->name);
509
480 510 d->no_dup_possible = 1; d->no_dup_possible = 1;
481 511 dir_mark_no_dup_possible(d->parent); dir_mark_no_dup_possible(d->parent);
482 512 } }
 
... ... static void dir_mark_do_not_dump(struct dir_node *d)
490 520 struct file_node *file; struct file_node *file;
491 521 struct dir_node *subdir; struct dir_node *subdir;
492 522
493 if (debug)
494 fprintf(stderr, "DEBUG: dir_mark_do_not_dump(%s)\n", d->name);
495 523 if ((d == NULL) || (d->do_not_dump == 1)) if ((d == NULL) || (d->do_not_dump == 1))
496 524 return; return;
497 525
526 if (debug)
527 fprintf(stderr, "DEBUG: recursively dir_mark_do_not_dump(%s)\n", d->name);
528
498 529 d->do_not_dump = 1; d->do_not_dump = 1;
499 530
500 531 subdir = d->subdirs; subdir = d->subdirs;
 
... ... static void dir_mark_do_not_dump(struct dir_node *d)
505 536
506 537 file = d->files; file = d->files;
507 538 while (file) { while (file) {
508 if (debug)
509 fprintf(stderr, "\tSet do_not_dump=1 on [%s]\n", file->name);
510 539 file->do_not_dump = 1; file->do_not_dump = 1;
511 540 file = file->next; file = file->next;
512 541 } }
 
... ... static void dir_mark_left(struct dir_node *d)
523 552 if ((d == NULL) || (d->left == 1)) if ((d == NULL) || (d->left == 1))
524 553 return; return;
525 554
555 if (debug)
556 fprintf(stderr, "DEBUG: recursively dir_mark_left(%s)\n", d->name);
557
526 558 d->left = 1; d->left = 1;
527 559
528 560 subdir = d->subdirs; subdir = d->subdirs;
 
... ... static int compare_file_range(struct file_node *a, struct file_node *b)
591 623 } }
592 624
593 625 err = compare_files(p, q); err = compare_files(p, q);
594 if (debug)
595 fprintf(stderr, "COMPARING [%s] with [%s] = %d\n", p->name, q->name, err);
596 626 if (err == -1) if (err == -1)
597 627 return -1; return -1;
598 628
 
... ... int file_find_dups(void)
688 718 } }
689 719
690 720 if (debug) { if (debug) {
691 fprintf(stderr, "Dump chain %u: ", hash);
721 fprintf(stderr, "[*] Dump chain %u start:\n", hash);
692 722 q = file_info[hash]; q = file_info[hash];
693 723 while (q) { while (q) {
694 fprintf(stderr, "%s(", q->name);
724 fprintf(stderr, "%s:\n", q->name);
695 725 dups = q->duplicates; dups = q->duplicates;
696 726 while(dups) { while(dups) {
697 fprintf(stderr, " %s", dups->name);
727 fprintf(stderr, "\t%s\n", dups->name);
698 728 dups = dups->duplicates; dups = dups->duplicates;
699 729 } }
700 fprintf(stderr, ") -> ");
701 730 q = q->hash_next; q = q->hash_next;
702 731 } }
703 fprintf(stderr, "\n");
732 fprintf(stderr, "[*] Dump chain %u stop\n", hash);
704 733 } }
705 734 } }
706 735
 
... ... static long long dir_build_hash(struct dir_node *d)
793 822 fprintf(stderr, "DEBUG: %s [%s] no_dup_possible=%u\n", fprintf(stderr, "DEBUG: %s [%s] no_dup_possible=%u\n",
794 823 __FUNCTION__, d->name, d->no_dup_possible); __FUNCTION__, d->name, d->no_dup_possible);
795 824
825 /* empty dir? */
826 if ((d->files == NULL) && (d->subdirs == NULL)) {
827 d->no_dup_possible = 1;
828 return 0;
829 }
830
796 831 /* We check current dir first. */ /* We check current dir first. */
797 832 if (d->no_dup_possible == 0) if (d->no_dup_possible == 0)
798 833 no_of_possible_dirs++; no_of_possible_dirs++;
 
... ... static long long dir_find_dups_populate_list(struct dir_node **u,
892 927 /* /*
893 928 * Finds dir duplicates (we are only marking here) * Finds dir duplicates (we are only marking here)
894 929 * We have to sort files based on hash, to match * We have to sort files based on hash, to match
895 * We ignore 000 hashes (dirs), because that files are single.
930 * We ignore 000 hashes (dirs), because they contain files that are single.
896 931 * TODO: the name does not reflect what the function does. * TODO: the name does not reflect what the function does.
897 932 */ */
898 933 int dir_find_dups(void) int dir_find_dups(void)
899 934 { {
900 long long i, j, first, last, where;
935 unsigned long long i, j, first, last, where;
901 936 int final_step; int final_step;
902 937 struct dir_node *d; struct dir_node *d;
903 938 unsigned long long mem; unsigned long long mem;
904 long long err, no_of_possible_dirs = 0;
905 struct dir_node **u;
939 long long err;
906 940 char dump[SHA_DIGEST_LENGTH * 2 + 1]; char dump[SHA_DIGEST_LENGTH * 2 + 1];
907 941
942 dir_chain_len = 0;
943
944 if (debug)
945 fprintf(stderr, "[*] dir_find_dups...\n");
946
908 947 for (i = 0; i < dir_info_count; i++) { for (i = 0; i < dir_info_count; i++) {
909 948 err = dir_build_hash(dir_info[i]); err = dir_build_hash(dir_info[i]);
910 949 if (err == -1) if (err == -1)
911 950 return -1; return -1;
912 951
913 no_of_possible_dirs += err;
952 dir_chain_len += err;
914 953 } }
915 954
916 955 /* Allocate an array that we will pass to qsort */ /* Allocate an array that we will pass to qsort */
917 mem = no_of_possible_dirs * sizeof(struct dir_node *);
918 u = (struct dir_node **) xmalloc(mem);
919 if (u == NULL) {
956 mem = dir_chain_len * sizeof(struct dir_node *);
957 dir_chain = (struct dir_node **) xmalloc(mem);
958 if (dir_chain == NULL) {
920 959 fprintf(stderr, "Cannot alloc mem for dir list (%llu bytes)!\n", fprintf(stderr, "Cannot alloc mem for dir list (%llu bytes)!\n",
921 960 mem); mem);
922 961 return -1; return -1;
 
... ... int dir_find_dups(void)
925 964 j = 0; j = 0;
926 965 for (i = 0; i < dir_info_count; i++) { for (i = 0; i < dir_info_count; i++) {
927 966 d = dir_info[i]; d = dir_info[i];
928 if (debug)
929 fprintf(stderr, "dir_find_dups[i=%llu, j=%lld] [%s]\n", i, j, d->name);
930 967
931 j += dir_find_dups_populate_list(u, j, d);
968 j += dir_find_dups_populate_list(dir_chain, j, d);
932 969
933 970 /* stop searching if we found all possible dirs */ /* stop searching if we found all possible dirs */
934 if (j == no_of_possible_dirs)
971 if (j == dir_chain_len)
935 972 break; break;
936 973 } }
937 974
938 975 if (debug) { if (debug) {
939 976 fprintf(stderr, "dir u (j=%lld): ", j); fprintf(stderr, "dir u (j=%lld): ", j);
940 for (i = 0; i < no_of_possible_dirs; i++)
941 fprintf(stderr, "[%lld]=%s ", i, u[i]->name);
977 for (i = 0; i < dir_chain_len; i++)
978 fprintf(stderr, "[%lld]=%s ", i, dir_chain[i]->name);
942 979 fprintf(stderr, "\n"); fprintf(stderr, "\n");
943 980 } }
944 981
945 /* Order by hash */
946 qsort(u, no_of_possible_dirs, sizeof(struct dir_node *), dir_compare_hashes);
982 /* Sort by hash */
983 qsort(dir_chain, dir_chain_len, sizeof(struct dir_node *), dir_compare_hashes);
947 984
948 985 if (debug) { if (debug) {
949 fprintf(stderr, "DEBUG: dump after dir qsort [%s]\n", d->name);
950 for (i = 0; i < no_of_possible_dirs; i++) {
951 sha1_dump(dump, u[i]->sha1, 0);
952 fprintf(stderr, "DEBUG: %s\t%u\t%s\n", dump, u[i]->level, u[i]->name);
986 fprintf(stderr, "DEBUG: dump after dir qsort:\n");
987 for (i = 0; i < dir_chain_len; i++) {
988 sha1_dump(dump, dir_chain[i]->sha1, 0);
989 fprintf(stderr, "DEBUG: %s\tlevel %u\t%s\n",
990 dump, dir_chain[i]->level, dir_chain[i]->name);
953 991 } }
954 992 } }
955 993
956 994 first = 0; first = 0;
957 995 last = 0; last = 0;
958 996 where = 0; where = 0;
959 for (i = 1; i < no_of_possible_dirs; i++) {
960 if (memcmp(u[first]->sha1, u[i]->sha1, SHA_DIGEST_LENGTH) == 0) {
997 final_step = 1;
998 for (i = 1; i < dir_chain_len; i++) {
999 if (memcmp(dir_chain[first]->sha1, dir_chain[i]->sha1, SHA_DIGEST_LENGTH) == 0) {
961 1000 /* We have the same hash */ /* We have the same hash */
962 1001 dup_no_of_dirs++; dup_no_of_dirs++;
963 1002 last = i; last = i;
964 if (u[last]->level < u[where]->level)
1003 if (dir_chain[last]->level < dir_chain[where]->level)
965 1004 where = last; where = last;
966 1005 continue; continue;
967 1006 } }
968 1007
969 1008 /* We have same hash in first..last */ /* We have same hash in first..last */
970 1009 final_step = 0; final_step = 0;
971 dir_process_range(u, first, last, where);
1010 dir_process_range(dir_chain, first, last, where);
972 1011
973 1012 /* Switch to next range */ /* Switch to next range */
974 1013 first = i; first = i;
 
... ... int dir_find_dups(void)
980 1019
981 1020 /* TODO: shoudn't we do the same for files? */ /* TODO: shoudn't we do the same for files? */
982 1021 if (final_step == 1) if (final_step == 1)
983 dir_process_range(u, first, last, where);
984
985 free(u);
1022 dir_process_range(dir_chain, first, last, where);
986 1023
987 1024 return 0; return 0;
988 1025 } }
 
... ... int dir_find_dups(void)
990 1027 /* /*
991 1028 * Nice dumps the duplicated dirs * Nice dumps the duplicated dirs
992 1029 */ */
993 void dir_dump_duplicates(struct dir_node *d)
1030 void dir_dump_duplicates(struct dir_node *d, const unsigned int zero)
994 1031 { {
995 1032 struct dir_node *p; struct dir_node *p;
1033 char sep, final;
996 1034
997 if (d->no_dup_possible == 1)
1035 if (debug)
1036 fprintf(stderr, "[*] dir_dump_duplicates(%s)\n", d->name);
1037
1038 if (d->no_dup_possible == 1) {
1039 if (debug)
1040 fprintf(stderr, "\tignore duplicate dir because no_dup_possible\n ");
998 1041 return; return;
1042 }
999 1043
1000 if (d->do_not_dump == 1)
1044 if (d->do_not_dump == 1) {
1045 if (debug)
1046 fprintf(stderr, "\tignore duplicate dir because no_dup_dump\n ");
1001 1047 return; return;
1048 }
1002 1049
1003 if (d->hash_next == NULL)
1050 if (d->hash_next == NULL) {
1051 if (debug)
1052 fprintf(stderr, "\tignore duplicate dir because hash_next\n ");
1004 1053 return; return;
1054 }
1055
1056 if (zero) {
1057 sep = '\0';
1058 final = '\0';
1059 } else {
1060 sep = '\t';
1061 final = '\n';
1062 }
1005 1063
1006 1064 p = d->hash_next; p = d->hash_next;
1007 1065 while (p) { while (p) {
 
... ... void dir_dump_duplicates(struct dir_node *d)
1010 1068 * We already dumped that dir on the left/right side. * We already dumped that dir on the left/right side.
1011 1069 * makes no sense to dump it again on the right side! * makes no sense to dump it again on the right side!
1012 1070 */ */
1071 if (debug)
1072 fprintf(stderr, "DEBUG: ignore dir [%s] because"
1073 "left=%u and/or do_not_dump=%u\n",
1074 p->name, p->left, p->do_not_dump);
1013 1075 p = p->hash_next; p = p->hash_next;
1014 1076 continue; continue;
1015 1077 } }
1016 1078
1079 if (debug)
1080 fprintf(stderr, "DEBUG: Found a right dir for [%s]: %s\n", d->name, p->name);
1081
1017 1082 dir_mark_left(d); dir_mark_left(d);
1018 1083 if (debug) if (debug)
1019 1084 fprintf(stderr, "dir_dump_duplicates: set do_not_dump on left [%s]\n", d->name); fprintf(stderr, "dir_dump_duplicates: set do_not_dump on left [%s]\n", d->name);
 
... ... void dir_dump_duplicates(struct dir_node *d)
1023 1088 fprintf(stderr, "dir_dump_duplicates: set do_not_dump on right [%s]\n", p->name); fprintf(stderr, "dir_dump_duplicates: set do_not_dump on right [%s]\n", p->name);
1024 1089 dir_mark_do_not_dump(p); dir_mark_do_not_dump(p);
1025 1090
1026 printf("DIR\t%s\t%s\n",
1027 d->name, p->name);
1091 if (debug)
1092 fprintf(stderr, "DIR%c%s%c%s%c",
1093 sep, d->name, sep, p->name, final);
1094 fprintf(out, "DIR%c%s%c%s%c",
1095 sep, d->name, sep, p->name, final);
1028 1096 p = p->hash_next; p = p->hash_next;
1029 1097 } }
1030 1098 } }
 
... ... void dir_dump_duplicates(struct dir_node *d)
1033 1101 * Nice dumps the duplicated files * Nice dumps the duplicated files
1034 1102 */ */
1035 1103 void file_dump_duplicates(struct file_node *f, void file_dump_duplicates(struct file_node *f,
1036 const unsigned long long min_size)
1104 const unsigned long long min_size, const unsigned int zero)
1037 1105 { {
1038 1106 struct file_node *p, *first_left; struct file_node *p, *first_left;
1107 char sep, final;
1039 1108
1040 1109 if (debug) if (debug)
1041 file_dump_node(f, 1);
1110 fprintf(stderr, "[*] file_dump_duplicates(%s)\n", f->name);
1042 1111
1043 1112 if (f->duplicates == NULL) { if (f->duplicates == NULL) {
1044 1113 if (debug) if (debug)
 
... ... void file_dump_duplicates(struct file_node *f,
1079 1148 if (debug) if (debug)
1080 1149 fprintf(stderr, "first_left = [%s]\n", first_left->name); fprintf(stderr, "first_left = [%s]\n", first_left->name);
1081 1150
1151 if (zero) {
1152 sep = '\0';
1153 final = '\0';
1154 } else {
1155 sep = '\t';
1156 final = '\n';
1157 }
1158
1082 1159 /* now, dump */ /* now, dump */
1083 1160 p = f; p = f;
1084 1161 while (p) { while (p) {
 
... ... void file_dump_duplicates(struct file_node *f,
1108 1185 fprintf(stderr, "Because [%s] is a right, set do_not_dump=1\n", p->name); fprintf(stderr, "Because [%s] is a right, set do_not_dump=1\n", p->name);
1109 1186 p->do_not_dump = 1; p->do_not_dump = 1;
1110 1187
1111 printf("FILE\t%s\t%s\n",
1112 first_left->name, p->name);
1188 if (debug)
1189 fprintf(stderr, "FILE%c%s%c%s%c",
1190 sep, first_left->name, sep, p->name, final);
1191 fprintf(out, "FILE%c%s%c%s%c",
1192 sep, first_left->name, sep, p->name, final);
1113 1193 p = p->duplicates; p = p->duplicates;
1114 1194 } }
1115 1195 } }
 
... ... void file_dump_duplicates(struct file_node *f,
1118 1198 * Searches all tree for duplicates * Searches all tree for duplicates
1119 1199 * @min_size - do not dump files shorter than min_size * @min_size - do not dump files shorter than min_size
1120 1200 */ */
1121 void dump_duplicates(const unsigned long long min_size)
1201 void dump_duplicates(const unsigned long long min_size, const unsigned int zero)
1122 1202 { {
1123 1203 unsigned int i; unsigned int i;
1124 struct dir_node *d, *subdir;
1204 struct dir_node *d;
1125 1205 struct file_node *f; struct file_node *f;
1126 1206 unsigned int hash; unsigned int hash;
1127 1207
1128 for (i = 0; i < dir_info_count; i++) {
1208 if (debug)
1209 fprintf(stderr, "[*] Dump duplicated dirs...\n");
1210 for (i = 0; i < dir_chain_len; i++) {
1129 1211 if (debug) if (debug)
1130 1212 fprintf(stderr, "\tdump_duplicates[%u]...\n", i); fprintf(stderr, "\tdump_duplicates[%u]...\n", i);
1131 d = dir_info[i];
1132 dir_dump_duplicates(d);
1133
1134 subdir = d->subdirs;
1135 while (subdir) {
1136 dir_dump_duplicates(subdir);
1137 subdir = subdir->next_sibling;
1138 }
1213 d = dir_chain[i];
1214 dir_dump_duplicates(d, zero);
1139 1215 } }
1216 free(dir_chain);
1140 1217
1141 1218 /* Now, we dump remaining files */ /* Now, we dump remaining files */
1142 1219 if (debug) if (debug)
1143 fprintf(stderr, "DEBUG: Dump duplicated files...\n");
1220 fprintf(stderr, "[*] Dump duplicated files...\n");
1144 1221 for (hash = 0; hash < HASH_SIZE; hash++) { for (hash = 0; hash < HASH_SIZE; hash++) {
1145 1222 if (file_info[hash] == NULL) if (file_info[hash] == NULL)
1146 1223 continue; continue;
1147 1224
1148 1225 if (debug) if (debug)
1149 fprintf(stderr, "Dump duplicates in hash %u\n", hash);
1226 fprintf(stderr, "[*] Dump duplicates in hash %u\n", hash);
1150 1227
1151 1228 f = file_info[hash]; f = file_info[hash];
1152 1229 while (f) { while (f) {
1153 file_dump_duplicates(f, min_size);
1230 file_dump_duplicates(f, min_size, zero);
1154 1231 f = f->hash_next; f = f->hash_next;
1155 1232 } }
1156 1233 } }
File store.h changed (mode: 100644) (index 27e1a3d..ffcc85a)
... ... struct dir_node
53 53
54 54
55 55 extern void set_debug(const unsigned int level); extern void set_debug(const unsigned int level);
56 extern void set_out(FILE *out);
56 57 extern void dump_stats(void); extern void dump_stats(void);
57 58 extern int file_add(const char *file, const struct stat *s, extern int file_add(const char *file, const struct stat *s,
58 59 const unsigned int level); const unsigned int level);
 
... ... extern struct dir_node *dir_get_current(const unsigned int level);
65 66
66 67 extern int dev_ino_seen(const unsigned int type, extern int dev_ino_seen(const unsigned int type,
67 68 const dev_t dev, const ino_t ino); const dev_t dev, const ino_t ino);
69 extern void dev_ino_seen_clean(void);
68 70
69 71 extern void dump_dirs(void); extern void dump_dirs(void);
70 72
71 extern void dump_duplicates(const unsigned long long min_size);
73 extern void dump_duplicates(const unsigned long long min_size,
74 const unsigned int zero);
72 75
73 76 #endif #endif
File tests/1/expected added (mode: 100644) (index 0000000..496eede)
1 DIR in/dir_a1 in/x/dir_a2
2 FILE in/dir_a1/a4 in/a1
3 FILE in/dir_a1/a4 in/a2
4 FILE in/dir_a1/a4 in/a3
5 FILE in/b1 in/b2
6 FILE in/b1 in/dir_b1/b3
File tests/1/in/a1 added (mode: 100644) (index 0000000..7284ab4)
1 aaaa
File tests/1/in/a2 added (mode: 100644) (index 0000000..7284ab4)
1 aaaa
File tests/1/in/a3 added (mode: 100644) (index 0000000..7284ab4)
1 aaaa
File tests/1/in/b1 added (mode: 100644) (index 0000000..6484fb6)
1 bbbb
File tests/1/in/b2 added (mode: 100644) (index 0000000..6484fb6)
1 bbbb
File tests/1/in/c1 added (mode: 100644) (index 0000000..baebf33)
1 cccc
File tests/1/in/dir_a1/a4 added (mode: 100644) (index 0000000..7284ab4)
1 aaaa
File tests/1/in/dir_a1/a5 added (mode: 100644) (index 0000000..7284ab4)
1 aaaa
File tests/1/in/dir_b1/b3 added (mode: 100644) (index 0000000..6484fb6)
1 bbbb
File tests/1/in/x/dir_a2/a6 added (mode: 100644) (index 0000000..7284ab4)
1 aaaa
File tests/1/in/x/dir_a2/a7 added (mode: 100644) (index 0000000..7284ab4)
1 aaaa
File tests/2/expected added (mode: 100644) (index 0000000..4538851)
1 DIR in/d2 in/d1
File tests/2/in/d1/a added (mode: 100644) (index 0000000..5ee608e)
1 xxxx
File tests/2/in/d1/b added (mode: 100644) (index 0000000..97aee46)
1 yyyy
File tests/2/in/d2/c added (mode: 100644) (index 0000000..97aee46)
1 yyyy
File tests/2/in/d2/d added (mode: 100644) (index 0000000..5ee608e)
1 xxxx
File tests/3/expected added (mode: 100644) (index 0000000..0061f50)
1 DIR in/dir_a1 in/dir_a2
File tests/3/in/dir_a1/a1 added (mode: 100644) (index 0000000..5d308e1)
1 aaaa
File tests/3/in/dir_a1/b1 added (mode: 100644) (index 0000000..b433656)
1 bbbb
File tests/3/in/dir_a2/a1x added (mode: 100644) (index 0000000..b433656)
1 bbbb
File tests/3/in/dir_a2/b1x added (mode: 100644) (index 0000000..5d308e1)
1 aaaa
Hints:
Before your first commit, do not forget to set up your git environment:
git config --global user.name "your_name_here"
git config --global user.email "your@email_here"

Clone this repository using HTTP(S):
git clone https://rocketgit.com/user/catalinux/dupdump

Clone this repository using ssh (do not forget to upload a key first):
git clone ssh://rocketgit@ssh.rocketgit.com/user/catalinux/dupdump

Clone this repository using git:
git clone git://git.rocketgit.com/user/catalinux/dupdump

You are allowed to anonymously push to this repository.
This means that your pushed commits will automatically be transformed into a merge request:
... clone the repository ...
... make some changes and some commits ...
git push origin main