-
Notifications
You must be signed in to change notification settings - Fork 0
/
dedup.sh
executable file
·413 lines (352 loc) · 12.3 KB
/
dedup.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#!/bin/bash
####################################################################################
#
# FILE: dedup.sh
#
# USAGE: dedup.sh [OPTION]... [DIRECTORY]...
#
# DESCRIPTION: Deduplicates files by replacing duplicates with hard links.
# The default directory is the current directory.
# Does not work across filesystems.
#
# OPTIONS: See function ’usage’ below.
#
# REQUIREMENTS: bc
# cmp
# date
# md5sum
#
# NOTES: See README.md
#
# AUTHOR: Andreas Klamke
#
# VERSION: 2.0.2
#
# CREATED: 12.12.2015
#
# UPDATED: 18.01.2016
#
####################################################################################
# Global variables with their default values.
# script_name    - file name of this script, used in usage and error messages
# backup..verbose - option flags (0 = off, 1 = on), set by the argument parser
# checksumarray  - associative array mapping a quoted file path to its md5 sum
# "$0" is quoted so a script path containing whitespace does not break basename.
script_name=$(basename -- "$0")
backup=0
dry_run=0
interactive=0
recursive=0
verbose=0
declare -A checksumarray
# functions
function usage {
    # Write the command line help to stdout and terminate the script with
    # exit status 0. Reads the global script_name for the usage line.
    local help_text="
\rUsage: $script_name [OPTION]... [DIRECTORY]...
\rDeduplicate files and replace duplicates with hardlinks.
\rMandatory arguments to long options are mandatory for short options too.
\r -b, --backup hard linked files will be backuped like file.~1~
\r -d, --dry-run runs in dry-run mode
\r -i, --interactive prompt whether to remove duplicate files before
\r hard links will be created
\r -r, --recursive recursive through subdirectories
\r -h, --help display this help and exit
\r -v, --verbose more details in output
\rIf DIRECTORY is missing, current directory is used.
\rExit status is 0 if no error occures, otherwise exit status is 1.
\rReport bugs on: https://github.com/FlatKey/dedup
"
    # %b expands the \r escapes exactly like 'echo -e' would
    printf '%b\n' "$help_text"
    exit 0
}
function check_requirements {
    # Verify that every external program listed as a requirement is available.
    # On the first missing program an error is written to stderr (like the
    # other error messages of this script) and the script exits with status 1.
    local tool
    for tool in bc cmp date md5sum
    do
        if ! command -v "$tool" &>/dev/null
        then
            echo -e "\nERROR - The required program $tool does not exist!\n" 1>&2
            exit 1
        fi
    done
}
function validate_directory {
    # Check each argument in turn; the first one that is not an existing
    # directory is reported on stderr and ends the script with exit status 1.
    for directory
    do
        [[ -d "$directory" ]] && continue
        echo -e "\nERROR - $directory is not a existing directory!\n" >&2
        exit 1
    done
}
function show_script_header {
    # Print the two-line banner of the script to stdout.
    # printf re-applies the '%b\n' format to each argument, so every banner
    # line gets its escapes expanded and a trailing newline.
    printf '%b\n' "\n| dedup.sh |" "\r| @2015 by Andreas Klamke |"
}
function show_script_options {
    # List the active option flags on stdout. Nothing is printed unless the
    # global verbose flag is 1; the flags are reported in the fixed order
    # backup, dry_run, interactive, recursive, verbose.
    [[ $verbose -eq 1 ]] || return 0
    echo -e "\nOptions:\n========\n"
    local flag
    for flag in backup dry_run interactive recursive
    do
        # ${!flag} reads the global variable whose name is stored in flag
        if [[ ${!flag} -eq 1 ]]
        then
            echo -e "- $flag on"
        fi
    done
    echo -e "- verbose on"
}
function build_file_checksum_array {
    # Index every non-empty regular file found in the given directories by its
    # md5 checksum.
    #
    # Arguments:       $@ - directories to scan
    # Globals read:    recursive, verbose
    # Globals written: checksumarray, time_checksum_build_started
    #
    # Each key of checksumarray is the file path as printed by 'printf %q',
    # wrapped in single quotes; the value is the md5 checksum. Files whose
    # checksum cannot be read are skipped. The script exits with status 1 when
    # fewer than two files were indexed.
    local file filestring checksum
    local -a findarguments

    # take time for process duration measurement
    time_checksum_build_started=$(date --date "now" "+%s")

    # print title in verbose mode only
    if [[ $verbose -eq 1 ]]
    then
        echo -e "\nRetrieve file checksums:\n========================\n"
    else
        echo -e "\nPreparing deduplication process...\n"
    fi

    # The find invocation is assembled as an array and executed directly, so
    # directory names are never re-parsed by a shell (quotes, blanks and other
    # special characters in names are safe).
    findarguments=("$@")
    if [[ $recursive -ne 1 ]]
    then
        findarguments+=(-maxdepth 1)
    fi
    findarguments+=(-type f -size +0c -print0)

    while IFS= read -r -d '' file
    do
        # check if checksum can be retrieved, otherwise ignore file
        if checksum=$(md5sum -- "$file" 2>/dev/null)
        then
            # keep only the hash; md5sum prefixes the line with a backslash
            # when it had to escape characters in the file name
            checksum=${checksum%% *}
            checksum=${checksum#\\}
            printf -v filestring '%q' "$file"
            checksumarray["'${filestring}'"]=$checksum
            # print md5sum - file associations in verbose mode only
            if [[ $verbose -eq 1 ]]
            then
                printf '%s - %s\n' "$checksum" "'$filestring'"
            fi
        fi
    # errors of find (e.g. unreadable subdirectories) are intentionally hidden
    done < <(find "${findarguments[@]}" 2>/dev/null)

    # abort script if less than 2 files were found
    if [[ ${#checksumarray[@]} -lt 2 ]]
    then
        echo -e "ERROR - found less than 2 files!\n" 1>&2
        exit 1
    fi

    # print indexed summary in verbose mode only
    if [[ $verbose -eq 1 ]]; then echo -e "\n${#checksumarray[@]} files indexed."; fi
}
function process_deduplication {
    # Replace duplicate files with hard links.
    #
    # Globals read:    checksumarray, backup, dry_run, interactive, verbose
    # Globals written: time_deduplication_started, probablyhardlinkablecount,
    #                  hardlinkablecount, hardlinkcount, freeablebytes,
    #                  freedbytes
    #
    # The keys of checksumarray are expected to be file paths as printed by
    # 'printf %q', wrapped in single quotes. The keys are ordered by checksum,
    # so that every file only has to be compared with its neighbour. Files
    # with equal checksums are compared byte-by-byte with cmp before the
    # second file is replaced by a hard link to the first one.
    local -a keyarray=() linkcommand=()
    local key filecount
    local actualkey comparekey actualfile comparefile actualpath comparepath
    local actualchecksum comparechecksum cmpmessage linkerror filesize

    # take time for process duration measurement
    time_deduplication_started=$(date --date "now" "+%s")

    echo -e "\nDeduplicate files:\n==================\n"

    # Sort the keys by checksum. The checksum is written first because it can
    # never contain the ':' separator, whereas a file name can; the prefix up
    # to the first ':' is removed again afterwards.
    mapfile -t keyarray < <(
        for key in "${!checksumarray[@]}"
        do
            printf '%s:%s\n' "${checksumarray[$key]}" "$key"
        done | sort
    )
    keyarray=("${keyarray[@]#*:}")

    filecount=${#keyarray[@]}
    probablyhardlinkablecount=0
    hardlinkablecount=0
    hardlinkcount=0
    freeablebytes=0
    freedbytes=0

    while [[ $filecount -gt 1 ]]
    do
        # compare the actual file only with its predecessor in sort order
        actualkey=${keyarray[filecount-1]}
        comparekey=${keyarray[filecount-2]}
        actualchecksum=${checksumarray[$actualkey]}
        comparechecksum=${checksumarray[$comparekey]}

        if [[ "$actualchecksum" == "$comparechecksum" ]]
        then
            # strip the surrounding single quotes from the keys
            actualfile=${actualkey#?}
            actualfile=${actualfile%?}
            comparefile=${comparekey#?}
            comparefile=${comparefile%?}
            # NOTE(security): eval turns the 'printf %q' form back into the
            # real path. This is only safe as long as the keys were created
            # with 'printf %q'; never feed other data into checksumarray.
            eval "actualpath=$actualfile"
            eval "comparepath=$comparefile"

            # device and inode together identify a file; the inode number
            # alone is only unique within one filesystem
            if [[ "$(stat -c %d:%i -- "$actualpath")" == "$(stat -c %d:%i -- "$comparepath")" ]]
            then
                if [[ $verbose -eq 1 ]]
                then
                    printf '%s & %s -> already hardlinked.\n' "$actualfile" "$comparefile"
                fi
            elif [[ "$(stat -c %m -- "$actualpath")" != "$(stat -c %m -- "$comparepath")" ]]
            then
                if [[ $verbose -eq 1 ]]
                then
                    printf '%s & %s -> equal md5 checksum, but not located on the same filesystem.\n' "$actualfile" "$comparefile"
                fi
                probablyhardlinkablecount=$(( probablyhardlinkablecount + 1 ))
            else
                printf '%s & %s -> equal md5 checksum, they will be compared byte-by-byte:\n' "$actualfile" "$comparefile"
                if cmpmessage=$(cmp -- "$actualpath" "$comparepath" 2>&1)
                then
                    echo -n "Files match, they will be hard linked... "
                    linkcommand=(ln)
                    if [[ $backup -eq 1 ]]
                    then
                        linkcommand+=(--backup=numbered)
                    fi
                    if [[ $interactive -eq 1 ]]
                    then
                        linkcommand+=(-i)
                    else
                        linkcommand+=(-f)
                    fi
                    # in dry-run mode nothing is linked and the step is
                    # reported as if it had succeeded
                    linkerror=0
                    if [[ $dry_run -eq 0 ]]
                    then
                        "${linkcommand[@]}" -- "$actualpath" "$comparepath"
                        linkerror=$?
                    fi
                    filesize=$(stat -c %s -- "$comparepath")
                    filesize=${filesize:-0}
                    if [[ $linkerror -eq 0 ]]
                    then
                        freedbytes=$(( freedbytes + filesize ))
                        hardlinkcount=$(( hardlinkcount + 1 ))
                        echo -e "done\n"
                    else
                        echo -e "failed\n"
                    fi
                    freeablebytes=$(( freeablebytes + filesize ))
                    hardlinkablecount=$(( hardlinkablecount + 1 ))
                else
                    printf '%s\n' "$cmpmessage"
                    echo -e "Files not equal, nothing to do."
                fi
            fi
        fi

        filecount=$(( filecount - 1 ))
        echo -ne "$(( ${#checksumarray[@]} - filecount + 1 )) files checked.\r"
    done
    echo -e "\n"

    # print deduplicate summary in verbose mode only
    if [[ $verbose -eq 1 ]]; then echo -e "$hardlinkcount linkable duplicates found."; fi
}
function format_summary_size {
    # Print the byte count $1 scaled to the largest fitting binary unit
    # (GiB, MiB, KiB) with two decimals, or unscaled as "<n> bytes" below
    # 1024. The decimals are truncated, not rounded, which matches what a
    # bc division with scale=2 yields.
    local bytes=${1:-0}
    local divisor unit
    if [[ $bytes -ge 1073741824 ]]
    then
        divisor=1073741824
        unit="GiB"
    elif [[ $bytes -ge 1048576 ]]
    then
        divisor=1048576
        unit="MiB"
    elif [[ $bytes -ge 1024 ]]
    then
        divisor=1024
        unit="KiB"
    else
        printf '%s bytes\n' "$bytes"
        return 0
    fi
    # the remainder is scaled separately so that large counts cannot overflow
    printf '%d.%02d %s\n' "$(( bytes / divisor ))" "$(( bytes % divisor * 100 / divisor ))" "$unit"
}
function show_summary {
    # Print the final report of the script run to stdout.
    #
    # Globals read: time_checksum_build_started, time_deduplication_started,
    #               checksumarray, probablyhardlinkablecount,
    #               hardlinkablecount, hardlinkcount, freeablebytes,
    #               freedbytes, interactive
    # The byte counters are only read; the scaled values live in locals.
    local time_process_finished time_checksum_build time_deduplication
    local freeablesize freedsize

    # take time for process duration measurement
    time_process_finished=$(date --date "now" "+%s")

    # calculate needed time for processing
    time_checksum_build=$(( time_deduplication_started - time_checksum_build_started ))
    time_deduplication=$(( time_process_finished - time_deduplication_started ))

    # scale freeable and freed disk space to a readable unit
    freeablesize=$(format_summary_size "$freeablebytes")
    freedsize=$(format_summary_size "$freedbytes")

    # print summary of script activities
    echo -e "\nSummary:\n========\n"
    echo -e "$time_checksum_build seconds needed to retrieve all file checksums."
    echo -e "$time_deduplication seconds needed for the deduplication process."
    echo -e "${#checksumarray[@]} files found and checked."
    echo -e "$probablyhardlinkablecount probably duplicates found on different filesystems."
    echo -e "$hardlinkablecount duplicates found."
    echo -en "$hardlinkcount files deduplicated."
    if [[ $interactive -eq 1 ]]
    then
        echo -e " (if all interactive decisions were 'yes'."
    else
        echo -e ""
    fi
    echo -e "$freeablesize of disk space probably freeable."
    echo -en "$freedsize of disk space freed."
    if [[ $interactive -eq 1 ]]
    then
        echo -e " (if all interactive decisions were 'yes'.\n"
    else
        echo -e "\n"
    fi
}
# main
# Parses the command line, then runs the deduplication once for all given
# directories. Options and directories may appear in any order; option names
# are matched case-insensitively. When only options are given, the current
# directory is used. Calling the script without any argument is an error.
if [[ $# == 0 ]]
then
    echo -e "$script_name: missing operand after '$script_name'" 1>&2
    echo -e "$script_name: Try '$script_name --help' for more information." 1>&2
    exit 1
fi

# Directories are collected in an array so that each one stays a separate
# word, even if its name contains blanks.
directories=()
for argument in "$@"
do
    case "${argument,,}" in
        -h|--help)
            usage
            exit 0
            ;;
        -b|--backup)
            backup=1
            ;;
        -d|--dry-run)
            dry_run=1
            ;;
        -i|--interactive)
            interactive=1
            ;;
        -r|--recursive)
            recursive=1
            ;;
        -v|--verbose)
            verbose=1
            ;;
        -*)
            echo -e "\nERROR - $argument option does not exist!\n" 1>&2
            exit 1
            ;;
        *)
            directories+=("$argument")
            ;;
    esac
done

# set current directory if no directory argument exist
if [[ ${#directories[@]} -eq 0 ]]
then
    directories=("$(pwd)")
fi

check_requirements
validate_directory "${directories[@]}"
show_script_header
show_script_options
build_file_checksum_array "${directories[@]}"
process_deduplication
show_summary
exit 0