archive-home-dirs.py
#!/usr/bin/env python3
"""
Archive home directories onto object storage (like S3 or GCS).
Designed to be run manually, and takes care to not delete anything without a lot of
confirmation.
Original script by Yuvi Panda (@yuvipanda), 2i2c.
https://github.com/2i2c-org/features/issues/32#issue-2221427520
"""
import hashlib
import string
import sys
import shutil
import os
import argparse
import boto3
from botocore.exceptions import ClientError
from escapism import escape
from pathlib import Path
from contextlib import contextmanager
import tempfile
import time
import subprocess
from functools import cache


@cache
def get_tar_command() -> str:
    """
    Return the tar command to use.

    We use `gnu` tar for compressing files, and Mac OS ships with bsd tar by
    default. We detect this, and tell users to get gnu tar if needed for local
    testing. Should not be an issue when running on containers.
    """
    out = subprocess.check_output(["tar", "--version"]).decode()
    if out.startswith("tar (GNU tar)"):
        return "tar"
    else:
        # We may be on Mac OS, and GNU Tar is not installed by default.
        # It can be installed from homebrew with `brew install gnu-tar`,
        # which provides `gtar`.
        if shutil.which("gtar"):
            return "gtar"
        else:
            print("Could not find GNU Tar on the system", file=sys.stderr)
            print(
                "If on Mac OS, please install gnu-tar with the following command (if using homebrew) and try again",
                file=sys.stderr,
            )
            print("brew install gnu-tar", file=sys.stderr)
            sys.exit(1)


def validate_homes_exist(basedir: Path, usernames: list[str], ignore_missing: bool):
    """
    Validate that the home directories of all given users exist.
    """
    errors = []
    for username in usernames:
        escaped_username = escape(
            username, safe=set(string.ascii_lowercase + string.digits), escape_char="-"
        ).lower()
        # We should still protect against directory traversal attacks
        user_home = (basedir / escaped_username).absolute()
        if basedir not in user_home.parents:
            errors.append(
                f"{user_home} refers to a directory outside of {basedir}, can not be archived"
            )
        if not user_home.exists() and not ignore_missing:
            errors.append(
                f"{username}'s home directory does not exist inside {basedir}, {user_home} not found"
            )
    if errors:
        print(
            "The following errors were found when trying to validate that all user home directories exist",
            file=sys.stderr,
        )
        print("\n".join(errors), file=sys.stderr)
        sys.exit(1)


@contextmanager
def archive_dir(dir_path: Path, archive_name: str, temp_path: str):
    """
    Reproducibly archive the given directory into a temporary .tar.gz file and
    yield the path to that file.
    """
    start_time = time.perf_counter()
    with tempfile.TemporaryDirectory(dir=temp_path) as d:
        target_file = Path(d) / (archive_name + ".tar.gz")
        cmd = [
            get_tar_command(),
            f"--directory={dir_path}",
            "--sort=name",
            "--numeric-owner",
            "--create",
            "--use-compress-program=pigz",
            f"--file={target_file}",
            ".",
        ]
        env = os.environ.copy()
        # Set GZip / pigz option to not write timestamp so we get consistent hashes
        env["GZIP"] = "-n"
        try:
            # Capture output and fail explicitly on non-0 error code
            # Primarily to get rid of tar: Removing leading `/' from member names
            subprocess.check_output(cmd, stderr=subprocess.STDOUT, env=env)
        except subprocess.CalledProcessError as e:
            print(f"Executing {e.cmd} failed with code {e.returncode}", file=sys.stderr)
            print(f"stdout: {e.stdout}", file=sys.stderr)
            print(f"stderr: {e.stderr}", file=sys.stderr)
            sys.exit(1)
        duration = time.perf_counter() - start_time
        file_size_gb = target_file.stat().st_size / 1024 / 1024 / 1024
        print(
            f"Tarballing {dir_path.name} to {archive_name}.tar.gz ({file_size_gb:0.3f} GB) took {duration:0.2f}s"
        )
        yield target_file


def sha256_file(filepath: Path) -> str:
    """
    Return the hex sha256 digest of the given file, read in 128 KiB chunks.
    """
    h = hashlib.sha256()
    b = bytearray(128 * 1024)
    mv = memoryview(b)
    with open(filepath, "rb", buffering=0) as f:
        for n in iter(lambda: f.readinto(mv), 0):
            h.update(mv[:n])
    return h.hexdigest()


def archive_user(
    s3_client,
    basedir: Path,
    username: str,
    archive_name: str,
    bucket_name: str,
    prefix: str,
    ignore_missing: bool,
    delete: bool,
    temp_path: str,
):
    escaped_username = escape(
        username, safe=set(string.ascii_lowercase + string.digits), escape_char="-"
    ).lower()
    homedir = basedir / username
    escaped_homedir = basedir / escaped_username
    if ignore_missing and not escaped_homedir.exists():
        print(f"User {username} does not exist, skipping archival")
        return

    print(f"Archiving {username}")
    with archive_dir(homedir, f"{escaped_username}-{archive_name}", temp_path) as archived_file:
        # Make sure the object key has the same extension as the compressed file we have
        object_name = os.path.join(prefix, username, archive_name) + "".join(
            archived_file.suffixes
        )
        sha256sum = sha256_file(archived_file)
        try:
            head_response = s3_client.head_object(Bucket=bucket_name, Key=object_name)
            # If we are here, it means that the file *does* exist
            if head_response["Metadata"].get("sha256sum") == sha256sum:
                # We have already uploaded this, and the hashes match!
                needs_upload = False
            else:
                # This file exists, *but hashes do not match!*
                # This is an error condition, and we abort so we don't overwrite user files
                print(head_response, file=sys.stderr)
                print(
                    f"{object_name} already exists in {bucket_name} with a different sha256sum, aborting",
                    file=sys.stderr,
                )
                sys.exit(1)
        except ClientError as e:
            if e.response.get("Error", {}).get("Code") == "404":
                # Does not exist, needs to be uploaded
                needs_upload = True
            else:
                # Some other issue, let's just fail
                raise

        if needs_upload:
            start_time = time.perf_counter()
            print(f"Uploading {username}...")
            s3_client.upload_file(
                # Pass a plain string path, which all boto3 versions accept
                str(archived_file),
                bucket_name,
                object_name,
                ExtraArgs={"Metadata": {"sha256sum": sha256sum}},
            )
            duration = time.perf_counter() - start_time
            print(f"Upload for {username} complete in {duration:0.2f}s")
        else:
            if delete:
                start_time = time.perf_counter()
                print(f"Already uploaded, going to delete {username}")
                shutil.rmtree(escaped_homedir)
                duration = time.perf_counter() - start_time
                print(f"Already uploaded, deleted {username} in {duration:0.2f}s")
            else:
                print(f"{username} already uploaded, skipping.")


def main():
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "--archive-name",
        help="Name to use for the archive of each user's home directory",
        required=True,
    )
    argparser.add_argument(
        "--basedir",
        help="Base directory containing user home directories",
        required=True,
    )
    argparser.add_argument(
        "--object-store",
        choices=("s3",),
        default="s3",
        help="Type of object store to upload files to",
    )
    argparser.add_argument(
        "--bucket-name",
        help="Name of object storage bucket to upload archived files to",
        required=True,
    )
    argparser.add_argument(
        "--object-prefix",
        help="Prefix to use before username when uploading archives",
        default="a/",
    )
    argparser.add_argument(
        "--usernames-file",
        help="File with list of usernames to archive, one per line",
        required=True,
    )
    argparser.add_argument(
        "--ignore-missing",
        help="Ignore missing user home directories",
        action="store_true",
    )
    argparser.add_argument(
        "--delete",
        help="Delete home directories after uploading",
        action="store_true",
    )
    argparser.add_argument(
        "--temp-path",
        help="Location to write archives to before uploading; defaults to the system temporary directory",
        default=None,
    )
    args = argparser.parse_args()

    basedir = Path(args.basedir).absolute()

    usernames = []
    with open(args.usernames_file) as f:
        for line in f:
            line = line.strip()
            # Skip blank lines and comments
            if not line or line.startswith("#"):
                continue
            usernames.append(line)

    validate_homes_exist(basedir, usernames, args.ignore_missing)

    s3_client = boto3.client("s3")

    for username in usernames:
        archive_user(
            s3_client,
            basedir,
            username,
            args.archive_name,
            args.bucket_name,
            args.object_prefix,
            args.ignore_missing,
            args.delete,
            args.temp_path,
        )


if __name__ == "__main__":
    main()