From 98e03380b34cf8a7ee2e9100f08552dcf6c1d3dd Mon Sep 17 00:00:00 2001 From: Alex Sokol Date: Tue, 12 Apr 2022 10:36:16 +0300 Subject: [PATCH 1/4] Cover case of a failed checkout --- download_data.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/download_data.py b/download_data.py index 32ea09400..d4039fa04 100644 --- a/download_data.py +++ b/download_data.py @@ -82,11 +82,24 @@ def download(temp_dir): checkout_command = f"cd {temp_dir}/{ownername}/{reponame} && git checkout -f {commit_sha}" subprocess.call(download_command, shell=True) - subprocess.call(checkout_command, shell=True) + try: + subprocess.check_call(checkout_command, shell=True) + except subprocess.CalledProcessError: + print("Couldn't checkout repo. Skip") + # Remove repo + if not is_empty(f"{temp_dir}/{ownername}/{reponame}"): + shutil.rmtree(f"{temp_dir}/{ownername}/{reponame}") print(f"Downloaded: {i + 1}/{len(snapshot_data)}") +def is_empty(directory): + exists = os.path.exists(directory) + if exists: + return len(os.listdir(directory)) == 0 + return True + + def move_files(temp_dir, dataset_dir): """Select files with credential candidates. Files without candidates is omited""" snapshot_file = "snapshot.yaml" @@ -111,6 +124,13 @@ def move_files(temp_dir, dataset_dir): repo_url = repo_data["url"] ownername, reponame = repo_url.split("/")[-2:] + if is_empty(f"{temp_dir}/{ownername}/{reponame}"): + print(f"Couldn't find data in {new_repo_id} repo. " + f"Removing {meta_file_path}, so missing files would not count in the dataset statistics") + print(f"You can use git to restore {meta_file_path} file back") + os.remove(meta_file_path) + continue + # Select all files in the repo # pathlib.Path.glob used instead of glob.glob, as glob.glob could not search for a hidden files repo_files = pathlib.Path(f"{temp_dir}/{ownername}/{reponame}").glob("**/*") From f513a03194b73a2d5db6bd5b6169db130c21c12b Mon Sep 17 00:00:00 2001 From: Alex Sokol Date: Tue, 12 Apr 2022 16:08:15 +0300 Subject: [PATCH 2/4] Move is_empty check up to fix re-runs --- download_data.py | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/download_data.py b/download_data.py index d4039fa04..82147b0bb 100644 --- a/download_data.py +++ b/download_data.py @@ -111,26 +111,25 @@ def move_files(temp_dir, dataset_dir): for i, repo_data in enumerate(snapshot_data): new_repo_id = hashlib.sha256(repo_data["id"].encode()).hexdigest()[:8] - - meta_file_path = f"meta/{new_repo_id}.csv" - - # Select file names from meta that we will use in dataset - interesting_files = set() - with open(meta_file_path) as csvfile: - meta_reader = csv.DictReader(csvfile) - for row in meta_reader: - interesting_files.add(row["FileID"]) - repo_url = repo_data["url"] ownername, reponame = repo_url.split("/")[-2:] + meta_file_path = f"meta/{new_repo_id}.csv" if is_empty(f"{temp_dir}/{ownername}/{reponame}"): print(f"Couldn't find data in {new_repo_id} repo. " f"Removing {meta_file_path}, so missing files would not count in the dataset statistics") print(f"You can use git to restore {meta_file_path} file back") - os.remove(meta_file_path) + if os.path.exists(meta_file_path): + os.remove(meta_file_path) continue + # Select file names from meta that we will use in dataset + interesting_files = set() + with open(meta_file_path) as csvfile: + meta_reader = csv.DictReader(csvfile) + for row in meta_reader: + interesting_files.add(row["FileID"]) + # Select all files in the repo # pathlib.Path.glob used instead of glob.glob, as glob.glob could not search for a hidden files repo_files = pathlib.Path(f"{temp_dir}/{ownername}/{reponame}").glob("**/*") From abb49c68daa024854d5b1539a448a4ec6ce52c00 Mon Sep 17 00:00:00 2001 From: Alex Sokol Date: Tue, 12 Apr 2022 16:25:21 +0300 Subject: [PATCH 3/4] Change assert with error to just a message --- download_data.py | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/download_data.py b/download_data.py index 82147b0bb..8657b1067 100644 --- a/download_data.py +++ b/download_data.py @@ -108,6 +108,7 @@ def move_files(temp_dir, dataset_dir): os.makedirs(temp_dir, exist_ok=True) os.makedirs(dataset_dir, exist_ok=True) + missing_repos = [] for i, repo_data in enumerate(snapshot_data): new_repo_id = hashlib.sha256(repo_data["id"].encode()).hexdigest()[:8] @@ -115,12 +116,11 @@ def move_files(temp_dir, dataset_dir): ownername, reponame = repo_url.split("/")[-2:] meta_file_path = f"meta/{new_repo_id}.csv" - if is_empty(f"{temp_dir}/{ownername}/{reponame}"): - print(f"Couldn't find data in {new_repo_id} repo. " + if not os.path.exists(meta_file_path): + print(f"Couldn't find all files mentioned in metadata. in {new_repo_id} repo. " f"Removing {meta_file_path}, so missing files would not count in the dataset statistics") print(f"You can use git to restore {meta_file_path} file back") - if os.path.exists(meta_file_path): - os.remove(meta_file_path) + missing_repos.append(meta_file_path) continue # Select file names from meta that we will use in dataset @@ -146,8 +146,13 @@ def move_files(temp_dir, dataset_dir): ids_found.add(file_id) # Check if there are files that present in meta but we could not find, or we somehow found files not from meta - assert len(ids_found.symmetric_difference(interesting_files)) == 0, \ - "Could not find all files mentioned in metadata. Try to remove `tmp` directory and run again." + if len(ids_found.symmetric_difference(interesting_files)) != 0: + print(f"Couldn't find all files mentioned in metadata. in {new_repo_id} repo. " + f"Removing {meta_file_path}, so missing files would not count in the dataset statistics") + print(f"You can use git to restore {meta_file_path} file back") + missing_repos.append(meta_file_path) + if os.path.exists(meta_file_path): + os.remove(meta_file_path) # Copy files to new dataset location for j, full_path in enumerate(sorted(list(files_found))): @@ -171,6 +176,8 @@ def move_files(temp_dir, dataset_dir): print(f"Processed: {i + 1}/{len(snapshot_data)}") + return missing_repos + def get_obfuscated_value(value, predefined_pattern): obfuscated_value = "" @@ -462,8 +469,15 @@ def obfuscate_creds(dataset_dir): print("Start download") download(temp_directory) print("Download finished. Now processing the files...") - move_files(temp_directory, args.data_dir) + removed_meta = move_files(temp_directory, args.data_dir) print("Finalizing dataset. Please wait a moment...") obfuscate_creds(args.data_dir) print("Done!") print(f"All files saved to {args.data_dir}") + + if len(removed_meta) > 0: + print("Some repos had a problem with download.") + print("Removing meta so missing files would not count in the dataset statistics:") + for missing in removed_meta: + print(missing) + print(f"You can use git to restore mentioned meta files back") From 6dc40ec97ab1b14032f1d08522c4d287548c2012 Mon Sep 17 00:00:00 2001 From: Alex Sokol Date: Tue, 12 Apr 2022 16:30:53 +0300 Subject: [PATCH 4/4] Fix typo in download_data.py log message --- download_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/download_data.py b/download_data.py index 8657b1067..ced2ecde3 100644 --- a/download_data.py +++ b/download_data.py @@ -117,7 +117,7 @@ def move_files(temp_dir, dataset_dir): meta_file_path = f"meta/{new_repo_id}.csv" if not os.path.exists(meta_file_path): - print(f"Couldn't find all files mentioned in metadata. in {new_repo_id} repo. " + print(f"Couldn't find all files mentioned in metadata for {new_repo_id} repo. " f"Removing {meta_file_path}, so missing files would not count in the dataset statistics") print(f"You can use git to restore {meta_file_path} file back") missing_repos.append(meta_file_path) @@ -147,7 +147,7 @@ def move_files(temp_dir, dataset_dir): # Check if there are files that present in meta but we could not find, or we somehow found files not from meta if len(ids_found.symmetric_difference(interesting_files)) != 0: - print(f"Couldn't find all files mentioned in metadata. in {new_repo_id} repo. " + print(f"Couldn't find all files mentioned in metadata for {new_repo_id} repo. " f"Removing {meta_file_path}, so missing files would not count in the dataset statistics") print(f"You can use git to restore {meta_file_path} file back") missing_repos.append(meta_file_path)