forked from chengyin30069/nh-project
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnh2.sh
executable file
·351 lines (318 loc) · 11 KB
/
nh2.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
#!/bin/bash
# constants
# Version string of this script; it is what -v/--version prints.
VERSION='2024-10-03'
# On Ctrl + C (SIGINT) or SIGTERM, signal the whole process group whose id is
# this script's PID, so background download jobs do not outlive the script.
# The SIGTERM trap is removed first so the handler is not re-entered by the
# signal that it sends itself.
# ref: https://stackoverflow.com/a/2173421
trap 'trap - SIGTERM && kill -- -$$' INT TERM
# utilities
# Print all arguments, joined by single spaces, as one line on stderr.
# printf is used instead of echo so that a message which looks like an echo
# option (e.g. "-n" or "-e") is printed literally instead of being swallowed.
errcho() {
  local IFS=' ' # make "$*" join with a space regardless of the caller's IFS
  printf '%s\n' "$*" 1>&2
}
# Print the usage text to stdout, one argument of printf per output line.
# The documented defaults mirror the values the script initializes
# (max retry 5, servers "3 7 5", folder ~/nh, 20 parallel jobs).
print_help() {
  # The '${NUM}' in the hint line is meant literally, hence the single quotes.
  # shellcheck disable=SC2016
  printf '%s\n' \
    "This is a script used to download an nhentai as pictures" \
    "" \
    "Usage: (./)nh2.sh [OPTIONS...] [NUMBERS...|help]" \
    "" \
    "NUMBERS: The magic numbers of the galleries (i.e., book IDs)." \
    "" \
    "OPTIONS:" \
    " -r, --max-retry=NUMBER Times to retry downloading after failure." \
    " (default: 5)" \
    ' -m, --media-server-list="SERVER1 SERVER2 ..."' \
    " List of alternative servers, space separated." \
    " Used sequentially when retrying." \
    ' (default: "3 7 5")' \
    ' (Hint: "dig i${NUM}.nhentai.net" to test)' \
    ' -f, --folder-path=PATH Specify a directory for image storage.' \
    ' (default: ~/nh)' \
    ' -p, --parallel=MAX_JOBS Max number of download jobs in parallel.' \
    ' (default: 20)' \
    ' -h, --help Show this message. (You may as well use the ' \
    ' keyword help to print out this help message)' \
    ' -v, --version Show the version.'
}
# Report an error and terminate the script.
# Arguments: every argument is written to stderr as a line of its own
#            (through errcho), in the order given.
# Ends by calling `finish 1`.
throw() {
  local line
  for line in "$@"; do
    errcho "$line"
  done
  finish 1
}
# Like throw, but append an empty line, a dashed separator and the whole
# help text (output of print_help, passed as one argument) after the
# caller's messages.
throwhelp() {
  local usage_text
  usage_text="$(print_help)"
  local -a report=("$@")
  report+=(
    ""
    "--------------------------------------------------------------------------------"
    "$usage_text"
  )
  throw "${report[@]}"
}
# a job management system to ensure the script only exits
# once all the background jobs are done
# PIDs of the background jobs that have been started and not yet seen finished.
declare JOBS=()

# Remove the PIDs of processes that no longer exist from JOBS.
# Globals: JOBS (read, then rewritten with the surviving PIDs)
# Liveness is probed with the `kill -0` builtin (sends no signal, only checks
# that the process can be signalled), which avoids forking one `ps` per
# tracked job. The loop variable is local so no global is clobbered.
update_jobs() {
  local pid
  local -a alive=()
  for pid in "${JOBS[@]}"; do
    if kill -0 "$pid" 2> /dev/null; then
      alive+=("$pid")
    fi
  done
  JOBS=("${alive[@]}")
}
# Block until every job recorded in JOBS has finished.
# Globals: JOBS (emptied as a result, through update_jobs)
# Each PID is first handed to the `wait` builtin, which sleeps until that
# child exits instead of spinning in a tight loop. `wait` only works for
# children of the current shell, so its errors are ignored and any PID that
# is still alive afterwards is polled once per second as a fallback.
wait_for_jobs() {
  local pid
  for pid in "${JOBS[@]}"; do
    wait "$pid" 2> /dev/null
  done
  update_jobs
  while [ -n "${JOBS[*]}" ]; do
    # echo waiting for "${JOBS[@]}"
    sleep 1
    update_jobs
  done
}
# Exit the script, but only after the background jobs are done.
# Arguments: $1 - exit code (optional, defaults to 0)
# Globals:   JOBS (refreshed through update_jobs, drained by wait_for_jobs)
# When there were jobs to wait for and the exit code is non-zero, a reminder
# is printed, because earlier error output may have scrolled away meanwhile.
finish() {
  local code="${1:-0}"
  update_jobs
  # nothing is running any more: leave right away
  if [ -z "${JOBS[*]}" ]; then
    exit "$code"
  fi
  echo "Waiting for jobs to finish..."
  wait_for_jobs
  if [ "$code" -ne "0" ]; then
    echo "Something went wrong with the script (exit $code). Check the error above."
  fi
  exit "$code"
}
# a tool for parsing the arguments
#######################################
# Generic command-line parser that reports every parsed item to a callback.
# Arguments:
#   $1 - regex matching options that require a value
#   $2 - regex matching options that take no value
#   $3 - regex matching options whose value is optional (only via '=')
#   $4 - callback command
#   $5... - the arguments to parse (may be empty)
# The callback is invoked as: CALLBACK STATUS OPTION VALUE FULL_ARGUMENT
#   NON_OPT        ""     ARG            argument that is not an option
#   NO_VAL         FLAG                  option given without a value
#   WITH_VAL       FLAG   VALUE          option together with its value
#   UNEXPT_VAL     FLAG   VALUE   ARG    value given to a no-value option
#   UNEXPT_NO_VAL  FLAG   ""      ARG    value missing for a value option
#   UNKNOWN_OPT    FLAG   ""      ARG    option matched by none of the regexes
#   OPT_VAL_NO_EQ  FLAG   REST    ARG    optional value given without '='
# Accepted spellings: long options ('--name', '--name=value') and short
# options introduced by '-' or '+', which may be clustered ('-vh') and may
# carry a value as '-r 5', '-r5' or '-r=5'.
# Fixes over the previous revision:
#   - the text attached to a value option ('-r5') is consumed as its value
#     and no longer parsed a second time as flags;
#   - clustered flags in the last argument are all processed;
#   - '=' is stripped exactly once for short options, as for long ones
#     (so '-r=' passes an empty value and '-h=' is reported as UNEXPT_VAL);
#   - an empty argument list is accepted and simply yields no callbacks.
#######################################
parse_args() {
  if [ "$#" -lt 4 ]; then
    printf '%s\n' \
      "Illegal use of parse_args. You should give at least four arguments: " \
      " - A regex matches options with values (e.g. '^(--(output|etc)|-[oe])\$')" \
      " - A regex matches options w/o values (e.g. '^(--(help|version)|-[hv])\$')" \
      " - A regex matches options with optional value (e.g. '^(--(retry|parallel)|-[rp])\$')" \
      " - note the '^' and '\$' at the two ends of the regex" \
      " - A callback command, which takes 4 parameter: " \
      " - status (NO_VAL, WITH_VAL, UNEXPT_VAL, UNEXPT_NO_VAL, UNKNOWN_OPT, NON_OPT, OPT_VAL_NO_EQ)" \
      ' - option name: (empty for status NON_OPT)' \
      ' - option value: (if given; the raw argument for status NON_OPT)' \
      ' - full error argument' \
      ' - Then all the arguments to parse, usually "$@" (may be empty).' \
      >&2
    finish 1
  fi
  declare OPTS_WITH_VALUE="$1"
  declare OPTS_WO_VALUE="$2"
  declare OPTS_WITH_OPT_VALUE="$3"
  declare CALLBACK="$4"
  shift 4

  # The patterns live in variables so [[ ... =~ ]] takes them as regexes as-is.
  declare SHORT_ARG_RE='^([-+])([a-zA-Z0-9]+)(=.*)?$'
  declare LONG_ARG_RE='^(--[-a-zA-Z0-9]+)(=.*)?$'

  declare SIGN=""      # the symbol of a flag, such as `+` in `+f` and `-` in `-f`
  declare FLAGS=""     # the leftover (not yet parsed) part of a short argument
  declare ARG=""       # the full argument
  declare FLAG=""      # the option currently being handled, e.g. `-f` or `--foo`
  declare EQENDING=""  # the `=value` tail of the current option, '=' included
  declare FLAG_REST="" # what follows the first character of FLAGS

  # keep going while there are arguments left OR clustered flags left over
  while [ "$#" -gt 0 ] || [ -n "$FLAGS" ]; do
    # if the last argument is fully parsed, parse another
    if [ -z "$FLAGS" ]; then
      ARG="$1"
      shift 1
      if [[ "$ARG" =~ $SHORT_ARG_RE ]]; then
        SIGN="${BASH_REMATCH[1]}"
        FLAGS="${BASH_REMATCH[2]}${BASH_REMATCH[3]}"
      elif [[ "$ARG" =~ $LONG_ARG_RE ]]; then
        FLAG="${BASH_REMATCH[1]}"
        EQENDING="${BASH_REMATCH[2]}"
      else
        "$CALLBACK" NON_OPT "" "$ARG"
        continue
      fi
    fi

    # take the next single-letter flag off a short argument
    if [ -n "$FLAGS" ]; then
      FLAG="$SIGN${FLAGS:0:1}"
      FLAG_REST="${FLAGS:1}"
      if [[ "$FLAG_REST" == =* ]]; then
        # keep the '=' so the value is handled exactly like a long option's
        EQENDING="$FLAG_REST"
        FLAGS=""
      else
        FLAGS="$FLAG_REST"
      fi
    fi

    if [[ "$FLAG" =~ $OPTS_WITH_OPT_VALUE ]]; then
      # for an option with optional value
      if [ -z "$EQENDING" ]; then # the next symbol is not '='
        if [[ -n "$FLAGS" && ! "$FLAGS" =~ ^[a-zA-Z0-9] ]]; then
          # if the next symbol is not a valid flag, treat it as invalid value
          "$CALLBACK" OPT_VAL_NO_EQ "$FLAG" "$FLAGS" "$ARG"
        else
          "$CALLBACK" NO_VAL "$FLAG"
        fi
      else # pass the value if there is a value
        "$CALLBACK" WITH_VAL "$FLAG" "${EQENDING#=}"
      fi
    elif [[ "$FLAG" =~ $OPTS_WITH_VALUE ]]; then
      # for options with value
      if [ -n "$FLAGS" ]; then # treat text after it as value
        "$CALLBACK" WITH_VAL "$FLAG" "$FLAGS"
        FLAGS="" # the text has been consumed; it must not be parsed as flags
      elif [ -n "$EQENDING" ]; then # remove the '=', and treat it as value
        "$CALLBACK" WITH_VAL "$FLAG" "${EQENDING#=}"
      elif [ "$#" -gt 0 ]; then # treat the next argument as value
        "$CALLBACK" WITH_VAL "$FLAG" "$1"
        shift 1
      else # send the exception of no value
        "$CALLBACK" UNEXPT_NO_VAL "$FLAG" "" "$ARG"
      fi
    elif [[ "$FLAG" =~ $OPTS_WO_VALUE ]]; then
      if [ -z "$FLAGS" ] && [ -n "$EQENDING" ]; then
        # an equal sign on option without value is wrong
        "$CALLBACK" UNEXPT_VAL "$FLAG" "${EQENDING#=}" "$ARG"
      elif [[ -n "$FLAGS" && ! "$FLAGS" =~ ^[a-zA-Z0-9] ]]; then
        # if the next symbol is not a valid flag, treat it as invalid value
        "$CALLBACK" UNEXPT_VAL "$FLAG" "$FLAGS" "$ARG"
      else # call back with no value
        "$CALLBACK" NO_VAL "$FLAG"
      fi
    else
      "$CALLBACK" UNKNOWN_OPT "$FLAG" "" "$ARG"
    fi
    FLAG=
    EQENDING=
  done
}
# == START OF parse the arguments ==
# Default settings; argument_callback overwrites them from the command line.
declare MAX_JOB_COUNT='20'                 # -p / --parallel
declare MAX_RETRY='5'                      # -r / --max-retry
declare -a MEDIA_SERVER_LIST=('3' '7' '5') # -m / --media-server-list
declare -a ID_LIST=()                      # book ids given as plain arguments
declare FOLDER_PATH="${HOME}/nh"           # -f / --folder-path
#######################################
# Callback handed to parse_args: turns one parsed item into a setting.
# Arguments:
#   $1 - status reported by parse_args
#   $2 - option name (empty for NON_OPT)
#   $3 - option value, or the raw argument for NON_OPT
#   $4 - the full argument, used in error messages
# Globals written: ID_LIST, MAX_RETRY, MEDIA_SERVER_LIST, FOLDER_PATH,
#                  MAX_JOB_COUNT
# Error statuses end the script through throwhelp; help/version print and
# end it through finish.
# Values are validated here: a book id must consist of digits only (the
# regex is anchored, so 'abc1' or '../1' are rejected), --max-retry must be
# a non-negative integer, --parallel a positive integer and the media
# server list must not be empty.
#######################################
argument_callback() {
  declare STATUS="$1"
  declare FLAG="$2"
  declare VALUE="$3"
  declare ERR_ARG="$4"

  case "$STATUS" in
    UNEXPT_VAL)
      throwhelp "Option '$FLAG' doesn't require a value '$VALUE'. Error at '$ERR_ARG'."
      ;;
    UNEXPT_NO_VAL)
      throwhelp "Option '$FLAG' requires a value. Error at '$ERR_ARG'."
      ;;
    UNKNOWN_OPT)
      throwhelp "Option '$FLAG' is unknown. Error at '$ERR_ARG'." \
        "Hint: if this is a optional value of the previous option (such as '-p')," \
        " you may need to specify it with equal sign '='."
      ;;
    OPT_VAL_NO_EQ)
      throwhelp "The optional value of option '$FLAG' must be explicitly specified with equal sign '='." \
        "Error at '$ERR_ARG'" \
        "Hint: your value '$VALUE' may be a wrongly-typed flag."
      ;;
    NON_OPT)
      if [ "$VALUE" = "help" ]; then
        print_help
        finish
      elif [[ "$VALUE" =~ ^[0-9]+$ ]]; then
        ID_LIST+=("$VALUE")
        return
      else
        throwhelp "'$VALUE' is neither a book id, an option, or a keyword like 'help'."
      fi
      ;;
  esac

  case "$FLAG" in
    --help | -h)
      print_help
      finish
      ;;
    --version | -v)
      echo "$VERSION"
      finish
      ;;
    --max-retry | -r)
      if [[ ! "$VALUE" =~ ^[0-9]+$ ]]; then
        throwhelp "Option '$FLAG' requires a non-negative integer, but got '$VALUE'."
      fi
      MAX_RETRY="$VALUE"
      ;;
    --media-server-list | -m)
      IFS=" " read -r -a MEDIA_SERVER_LIST <<< "$VALUE"
      # an empty list would later be used as a modulus when picking a server
      if [ "${#MEDIA_SERVER_LIST[@]}" -eq 0 ]; then
        throwhelp "Option '$FLAG' requires at least one server, but got '$VALUE'."
      fi
      ;;
    --folder-path | -f)
      # expand '~' to "$HOME" correctly
      # ref: https://stackoverflow.com/a/27485157
      FOLDER_PATH="${VALUE/#\~/$HOME}"
      ;;
    --parallel | -p)
      # zero would make the "too many jobs" wait loop spin forever
      if [[ ! "$VALUE" =~ ^[1-9][0-9]*$ ]]; then
        throwhelp "Option '$FLAG' requires a positive integer, but got '$VALUE'."
      fi
      MAX_JOB_COUNT="$VALUE"
      ;;
    *)
      throw "Unexpected things happened. This line shouldn't be executed." \
        "Option '$FLAG', value '$VALUE', ERR_ARG '$ERR_ARG'"
      ;;
  esac
}
# Hand the command line to the parser.
# The alternation inside each regex is wrapped in a group so that '^' and '$'
# anchor both the long and the short spellings; without the group,
# '^--(help|version)|-[hv]$' also accepts e.g. '--helpme' or '--x-h'.
# parse_args is skipped when there is no argument at all, so that case falls
# through to the "book id" error below.
if [ "$#" -gt 0 ]; then
  parse_args \
    '^(--(max-retry|media-server-list|folder-path|parallel)|-[rmfp])$' \
    '^(--(help|version)|-[hv])$' \
    '^$' \
    argument_callback "$@"
fi
# if no book id was given, show an error
if [ "${#ID_LIST[@]}" -eq 0 ]; then
  throwhelp "At least one book id should be given. "
fi
# == END OF parse the arguments ==
# a command to download with auto-retrying
# we'll use it in main()
#######################################
# Download one file with wget, retrying on alternative media servers.
# Arguments:
#   $1 - path of the file to write
#   $2 - URL to download
# Globals read: MAX_RETRY, MEDIA_SERVER_LIST
# Outputs: progress messages on stdout
# Returns: 0 when the file was downloaded, 1 when every attempt failed
# On each retry the number after the leading 'i'/'t' of the host name is
# replaced by the next entry of MEDIA_SERVER_LIST, cycling through the list.
# When every attempt failed, the empty or partial file left by `wget -O` is
# removed, so that a later run does not take it for a finished download.
# An empty MEDIA_SERVER_LIST no longer divides by zero; the original URL is
# retried instead.
#######################################
download_with_auto_retry() {
  local FILENAME="$1"
  local URL="$2"
  local server_count="${#MEDIA_SERVER_LIST[@]}"
  local attempt=1
  local wget_status server alt_url

  wget -q -O "$FILENAME" "$URL"
  wget_status=$?

  # retry only while wget keeps failing and attempts are left
  while [ "$wget_status" -ne 0 ] && [ "$attempt" -le "$MAX_RETRY" ]; do
    if [ "$server_count" -gt 0 ]; then
      server="${MEDIA_SERVER_LIST[(attempt - 1) % server_count]}"
      alt_url="$(sed -E "s/\/\/(i|t)[0-9]*\./\/\/\1${server}./" <<< "$URL")"
      echo "$FILENAME error. Retrying with media_server=$server ($attempt/$MAX_RETRY)..."
    else
      alt_url="$URL"
      echo "$FILENAME error. Retrying with the original URL ($attempt/$MAX_RETRY)..."
    fi
    wget -q -O "$FILENAME" "$alt_url"
    wget_status=$?
    attempt=$((attempt + 1))
  done

  # tell the user what happened to the file
  if [ "$wget_status" -eq 0 ]; then
    echo "$FILENAME downloaded"
    return 0
  fi
  rm -f -- "$FILENAME"
  echo "$FILENAME failed to download"
  return 1
}
# == START OF main ==
# switch to the directory for storage
# Main flow: for every requested book, fetch its cover page, derive the
# image URLs from the thumbnail URLs found in it, and download the images
# in parallel (at most MAX_JOB_COUNT background jobs at a time).

# switch to the directory for storage
if [ ! -d "$FOLDER_PATH" ]; then
  mkdir -p "$FOLDER_PATH" || throw "Failed to create the directory for storage, '$FOLDER_PATH'"
fi
cd "$FOLDER_PATH" || throw "Failed to switch to the directory for storage, '$FOLDER_PATH'"

for ID in "${ID_LIST[@]}"; do
  # -p: an already existing directory is fine, so that an interrupted run can
  # be resumed (files that are already there are skipped further down)
  mkdir -p "$ID" || throw "Failed to create the directory for book#$ID"

  # fetch the cover page and save it
  echo "Parsing book#$ID..."
  declare COVER_HTML
  if ! COVER_HTML="$(wget -q -O - "https://nhentai.net/g/$ID/")"; then
    # report the failure and go on with the remaining books
    errcho "Failed to fetch the cover page of book#$ID. Skipping it."
    continue
  fi
  printf '%s\n' "$COVER_HTML" > "$ID/cover_page.html"

  # extract a list of images that we need to download
  #   grep -o prints every thumbnail url (".../<number>t.<ext>") on its own line
  #   -> convert thumbnail filenames to normal files (drop the trailing 't')
  #   -> convert the thumbnail host "t<N>." to the image host "i<N>."
  #   -> uniquify the links with awk, keeping the first occurrence
  declare IMAGE_URLS
  IMAGE_URLS="$(
    printf '%s\n' "$COVER_HTML" \
      | grep -oE -e 'https://t[0-9]+\.nhentai\.net/galleries/[0-9]+/[0-9]+t\.[a-zA-Z]+' \
      | sed -E -e 's/t(\.[a-zA-Z]{1,10})$/\1/' -e 's/\/\/t([0-9]+)\./\/\/i\1./' \
      | awk '!a[$0]++'
  )"
  if [ -z "$IMAGE_URLS" ]; then
    errcho "No image was found on the cover page of book#$ID."
    continue
  fi

  while IFS= read -r URL; do
    [ -n "$URL" ] || continue
    # extract filename (the last component of the url)
    declare FILENAME
    FILENAME="$ID/${URL##*/}"
    # skip files that are already there; an empty file is a leftover of a
    # failed download and is fetched again
    if [ -s "$FILENAME" ]; then
      continue
    fi
    # wait while there are too many downloading in parallel
    while [ "${#JOBS[@]}" -ge "$MAX_JOB_COUNT" ]; do
      update_jobs
      # sleep only if no slot has been freed by the update
      if [ "${#JOBS[@]}" -ge "$MAX_JOB_COUNT" ]; then
        sleep 1
      fi
    done
    # download the file with auto-retrying; stdin is detached so the job can
    # never consume the url list that this loop reads from
    download_with_auto_retry "$FILENAME" "$URL" < /dev/null &
    JOBS+=("$!")
  done <<< "$IMAGE_URLS"
done
wait_for_jobs
# == END OF main ==