Cover case of a failed checkout #1

Open
wants to merge 4 commits into main
49 changes: 41 additions & 8 deletions download_data.py
@@ -82,11 +82,24 @@ def download(temp_dir):
checkout_command = f"cd {temp_dir}/{ownername}/{reponame} && git checkout -f {commit_sha}"

subprocess.call(download_command, shell=True)
subprocess.call(checkout_command, shell=True)
try:
subprocess.check_call(checkout_command, shell=True)
except subprocess.CalledProcessError:
print("Couldn't checkout repo. Skip")
# Remove repo
if not is_empty(f"{temp_dir}/{ownername}/{reponame}"):
shutil.rmtree(f"{temp_dir}/{ownername}/{reponame}")

print(f"Downloaded: {i + 1}/{len(snapshot_data)}")


def is_empty(directory):
exists = os.path.exists(directory)
if exists:
return len(os.listdir(directory)) == 0
return True


def move_files(temp_dir, dataset_dir):
"""Select files with credential candidates. Files without candidates is omited"""
snapshot_file = "snapshot.yaml"
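For reviewers, a minimal standalone sketch of the skip-on-failure pattern introduced above; the helper name, repo path and cleanup logic are illustrative, not taken from this PR:

import os
import shutil
import subprocess


def checkout_or_cleanup(repo_dir, commit_sha):
    """Return True if the checkout succeeded; otherwise remove the partial clone."""
    try:
        # Unlike subprocess.call, check_call raises CalledProcessError on a non-zero exit code.
        subprocess.check_call(f"cd {repo_dir} && git checkout -f {commit_sha}", shell=True)
        return True
    except subprocess.CalledProcessError:
        print("Couldn't checkout repo. Skipping")
        if os.path.exists(repo_dir) and os.listdir(repo_dir):
            shutil.rmtree(repo_dir)
        return False
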
@@ -95,22 +108,28 @@ def move_files(temp_dir, dataset_dir):
os.makedirs(temp_dir, exist_ok=True)

os.makedirs(dataset_dir, exist_ok=True)
missing_repos = []

for i, repo_data in enumerate(snapshot_data):
new_repo_id = hashlib.sha256(repo_data["id"].encode()).hexdigest()[:8]

repo_url = repo_data["url"]
ownername, reponame = repo_url.split("/")[-2:]
meta_file_path = f"meta/{new_repo_id}.csv"

if not os.path.exists(meta_file_path):
print(f"Couldn't find all files mentioned in metadata for {new_repo_id} repo. "
f"Removing {meta_file_path}, so missing files would not count in the dataset statistics")
print(f"You can use git to restore {meta_file_path} file back")
missing_repos.append(meta_file_path)
continue

# Select file names from meta that we will use in dataset
interesting_files = set()
with open(meta_file_path) as csvfile:
meta_reader = csv.DictReader(csvfile)
for row in meta_reader:
interesting_files.add(row["FileID"])

repo_url = repo_data["url"]
ownername, reponame = repo_url.split("/")[-2:]

# Select all files in the repo
# pathlib.Path.glob used instead of glob.glob, as glob.glob could not search for a hidden files
repo_files = pathlib.Path(f"{temp_dir}/{ownername}/{reponame}").glob("**/*")
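A tiny illustration of the pathlib-vs-glob comment above, with an assumed directory layout (not part of the PR): glob.glob skips dot-files by default, while pathlib.Path.glob returns them.

import glob
import pathlib

repo_dir = "tmp/owner/repo"  # hypothetical clone containing e.g. .github/workflows/ci.yml
with_hidden = list(pathlib.Path(repo_dir).glob("**/*"))         # dot-files are included
without_hidden = glob.glob(f"{repo_dir}/**/*", recursive=True)  # dot-files are skipped by default
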
@@ -127,8 +146,13 @@
ids_found.add(file_id)

# Check whether any files present in meta could not be found, or whether we found files that are not in meta
assert len(ids_found.symmetric_difference(interesting_files)) == 0, \
"Could not find all files mentioned in metadata. Try to remove `tmp` directory and run again."
if len(ids_found.symmetric_difference(interesting_files)) != 0:
print(f"Couldn't find all files mentioned in metadata for {new_repo_id} repo. "
f"Removing {meta_file_path}, so missing files would not count in the dataset statistics")
print(f"You can use git to restore {meta_file_path} file back")
missing_repos.append(meta_file_path)
if os.path.exists(meta_file_path):
os.remove(meta_file_path)

# Copy files to new dataset location
for j, full_path in enumerate(sorted(list(files_found))):
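As a side note on the assert-to-warning change above, a small standalone example (with made-up IDs) of what symmetric_difference catches:

# Made-up FileIDs, only to show what symmetric_difference reports.
interesting_files = {"a1b2", "c3d4", "e5f6"}  # FileIDs listed in the meta .csv
ids_found = {"a1b2", "c3d4", "zzzz"}          # FileIDs actually found in the repo

missing_or_unexpected = ids_found.symmetric_difference(interesting_files)
print(missing_or_unexpected)  # contains 'e5f6' (in meta, not on disk) and 'zzzz' (on disk, not in meta)
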
@@ -152,6 +176,8 @@

print(f"Processed: {i + 1}/{len(snapshot_data)}")

return missing_repos


def get_obfuscated_value(value, predefined_pattern):
obfuscated_value = ""
@@ -443,8 +469,15 @@ def obfuscate_creds(dataset_dir):
print("Start download")
download(temp_directory)
print("Download finished. Now processing the files...")
move_files(temp_directory, args.data_dir)
removed_meta = move_files(temp_directory, args.data_dir)
print("Finalizing dataset. Please wait a moment...")
obfuscate_creds(args.data_dir)
print("Done!")
print(f"All files saved to {args.data_dir}")

if len(removed_meta) > 0:
print("Some repos had a problem with download.")
print("Removing meta so missing files would not count in the dataset statistics:")
for missing in removed_meta:
print(missing)
print(f"You can use git to restore mentioned meta files back")