From 98e03380b34cf8a7ee2e9100f08552dcf6c1d3dd Mon Sep 17 00:00:00 2001
From: Alex Sokol <meanrin@outlook.com>
Date: Tue, 12 Apr 2022 10:36:16 +0300
Subject: [PATCH 1/4] Cover case of a failed checkout

---
 download_data.py | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/download_data.py b/download_data.py
index 32ea09400..d4039fa04 100644
--- a/download_data.py
+++ b/download_data.py
@@ -82,11 +82,24 @@ def download(temp_dir):
         checkout_command = f"cd {temp_dir}/{ownername}/{reponame} && git checkout -f {commit_sha}"
 
         subprocess.call(download_command, shell=True)
-        subprocess.call(checkout_command, shell=True)
+        try:
+            subprocess.check_call(checkout_command, shell=True)
+        except subprocess.CalledProcessError:
+            print("Couldn't checkout repo. Skip")
+            # Remove repo
+            if not is_empty(f"{temp_dir}/{ownername}/{reponame}"):
+                shutil.rmtree(f"{temp_dir}/{ownername}/{reponame}")
 
         print(f"Downloaded: {i + 1}/{len(snapshot_data)}")
 
 
+def is_empty(directory):
+    exists = os.path.exists(directory)
+    if exists:
+        return len(os.listdir(directory)) == 0
+    return True
+
+
 def move_files(temp_dir, dataset_dir):
     """Select files with credential candidates. Files without candidates is omited"""
     snapshot_file = "snapshot.yaml"
@@ -111,6 +124,13 @@ def move_files(temp_dir, dataset_dir):
         repo_url = repo_data["url"]
         ownername, reponame = repo_url.split("/")[-2:]
 
+        if is_empty(f"{temp_dir}/{ownername}/{reponame}"):
+            print(f"Couldn't find data in {new_repo_id} repo. "
+                  f"Removing {meta_file_path}, so missing files would not count in the dataset statistics")
+            print(f"You can use git to restore {meta_file_path} file back")
+            os.remove(meta_file_path)
+            continue
+
         # Select all files in the repo
         # pathlib.Path.glob used instead of glob.glob, as glob.glob could not search for a hidden files
         repo_files = pathlib.Path(f"{temp_dir}/{ownername}/{reponame}").glob("**/*")

From f513a03194b73a2d5db6bd5b6169db130c21c12b Mon Sep 17 00:00:00 2001
From: Alex Sokol <meanrin@outlook.com>
Date: Tue, 12 Apr 2022 16:08:15 +0300
Subject: [PATCH 2/4] Move is_empty check up to fix re-runs

---
 download_data.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/download_data.py b/download_data.py
index d4039fa04..82147b0bb 100644
--- a/download_data.py
+++ b/download_data.py
@@ -111,26 +111,25 @@ def move_files(temp_dir, dataset_dir):
 
     for i, repo_data in enumerate(snapshot_data):
         new_repo_id = hashlib.sha256(repo_data["id"].encode()).hexdigest()[:8]
-
-        meta_file_path = f"meta/{new_repo_id}.csv"
-
-        # Select file names from meta that we will use in dataset
-        interesting_files = set()
-        with open(meta_file_path) as csvfile:
-            meta_reader = csv.DictReader(csvfile)
-            for row in meta_reader:
-                interesting_files.add(row["FileID"])
-
         repo_url = repo_data["url"]
         ownername, reponame = repo_url.split("/")[-2:]
+        meta_file_path = f"meta/{new_repo_id}.csv"
 
         if is_empty(f"{temp_dir}/{ownername}/{reponame}"):
             print(f"Couldn't find data in {new_repo_id} repo. "
                   f"Removing {meta_file_path}, so missing files would not count in the dataset statistics")
             print(f"You can use git to restore {meta_file_path} file back")
-            os.remove(meta_file_path)
+            if os.path.exists(meta_file_path):
+                os.remove(meta_file_path)
             continue
 
+        # Select file names from meta that we will use in dataset
+        interesting_files = set()
+        with open(meta_file_path) as csvfile:
+            meta_reader = csv.DictReader(csvfile)
+            for row in meta_reader:
+                interesting_files.add(row["FileID"])
+
         # Select all files in the repo
         # pathlib.Path.glob used instead of glob.glob, as glob.glob could not search for a hidden files
         repo_files = pathlib.Path(f"{temp_dir}/{ownername}/{reponame}").glob("**/*")

From abb49c68daa024854d5b1539a448a4ec6ce52c00 Mon Sep 17 00:00:00 2001
From: Alex Sokol <meanrin@outlook.com>
Date: Tue, 12 Apr 2022 16:25:21 +0300
Subject: [PATCH 3/4] Replace failing assert with a warning message

---
 download_data.py | 28 +++++++++++++++++++++-------
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/download_data.py b/download_data.py
index 82147b0bb..8657b1067 100644
--- a/download_data.py
+++ b/download_data.py
@@ -108,6 +108,7 @@ def move_files(temp_dir, dataset_dir):
     os.makedirs(temp_dir, exist_ok=True)
 
     os.makedirs(dataset_dir, exist_ok=True)
+    missing_repos = []
 
     for i, repo_data in enumerate(snapshot_data):
         new_repo_id = hashlib.sha256(repo_data["id"].encode()).hexdigest()[:8]
@@ -115,12 +116,11 @@ def move_files(temp_dir, dataset_dir):
         ownername, reponame = repo_url.split("/")[-2:]
         meta_file_path = f"meta/{new_repo_id}.csv"
 
-        if is_empty(f"{temp_dir}/{ownername}/{reponame}"):
-            print(f"Couldn't find data in {new_repo_id} repo. "
+        if not os.path.exists(meta_file_path):
+            print(f"Couldn't find all files mentioned in metadata. in {new_repo_id} repo. "
                   f"Removing {meta_file_path}, so missing files would not count in the dataset statistics")
             print(f"You can use git to restore {meta_file_path} file back")
-            if os.path.exists(meta_file_path):
-                os.remove(meta_file_path)
+            missing_repos.append(meta_file_path)
             continue
 
         # Select file names from meta that we will use in dataset
@@ -146,8 +146,13 @@ def move_files(temp_dir, dataset_dir):
                 ids_found.add(file_id)
 
         # Check if there are files that present in meta but we could not find, or we somehow found files not from meta
-        assert len(ids_found.symmetric_difference(interesting_files)) == 0, \
-            "Could not find all files mentioned in metadata. Try to remove `tmp` directory and run again."
+        if len(ids_found.symmetric_difference(interesting_files)) != 0:
+            print(f"Couldn't find all files mentioned in metadata. in {new_repo_id} repo. "
+                  f"Removing {meta_file_path}, so missing files would not count in the dataset statistics")
+            print(f"You can use git to restore {meta_file_path} file back")
+            missing_repos.append(meta_file_path)
+            if os.path.exists(meta_file_path):
+                os.remove(meta_file_path)
 
         # Copy files to new dataset location
         for j, full_path in enumerate(sorted(list(files_found))):
@@ -171,6 +176,8 @@ def move_files(temp_dir, dataset_dir):
 
         print(f"Processed: {i + 1}/{len(snapshot_data)}")
 
+    return missing_repos
+
 
 def get_obfuscated_value(value, predefined_pattern):
     obfuscated_value = ""
@@ -462,8 +469,15 @@ def obfuscate_creds(dataset_dir):
     print("Start download")
     download(temp_directory)
     print("Download finished. Now processing the files...")
-    move_files(temp_directory, args.data_dir)
+    removed_meta = move_files(temp_directory, args.data_dir)
     print("Finalizing dataset. Please wait a moment...")
     obfuscate_creds(args.data_dir)
     print("Done!")
     print(f"All files saved to {args.data_dir}")
+
+    if len(removed_meta) > 0:
+        print("Some repos had a problem with download.")
+        print("Removing meta so missing files would not count in the dataset statistics:")
+        for missing in removed_meta:
+            print(missing)
+        print(f"You can use git to restore mentioned meta files back")

From 6dc40ec97ab1b14032f1d08522c4d287548c2012 Mon Sep 17 00:00:00 2001
From: Alex Sokol <meanrin@outlook.com>
Date: Tue, 12 Apr 2022 16:30:53 +0300
Subject: [PATCH 4/4] Fix typo in download_data.py log message

---
 download_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/download_data.py b/download_data.py
index 8657b1067..ced2ecde3 100644
--- a/download_data.py
+++ b/download_data.py
@@ -117,7 +117,7 @@ def move_files(temp_dir, dataset_dir):
         meta_file_path = f"meta/{new_repo_id}.csv"
 
         if not os.path.exists(meta_file_path):
-            print(f"Couldn't find all files mentioned in metadata. in {new_repo_id} repo. "
+            print(f"Couldn't find all files mentioned in metadata for {new_repo_id} repo. "
                   f"Removing {meta_file_path}, so missing files would not count in the dataset statistics")
             print(f"You can use git to restore {meta_file_path} file back")
             missing_repos.append(meta_file_path)
@@ -147,7 +147,7 @@ def move_files(temp_dir, dataset_dir):
 
         # Check if there are files that present in meta but we could not find, or we somehow found files not from meta
         if len(ids_found.symmetric_difference(interesting_files)) != 0:
-            print(f"Couldn't find all files mentioned in metadata. in {new_repo_id} repo. "
+            print(f"Couldn't find all files mentioned in metadata for {new_repo_id} repo. "
                   f"Removing {meta_file_path}, so missing files would not count in the dataset statistics")
             print(f"You can use git to restore {meta_file_path} file back")
             missing_repos.append(meta_file_path)