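"""Scrape GitHub's repository search API for Python repositories and print their URLs."""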
import argparse
import os
from typing import List, Optional

import requests

# Read a GitHub API token from the GIT_TOKEN environment variable.
token = os.environ.get("GIT_TOKEN")

URL = "https://api.github.com/search/repositories"


def print_page(data, counter: int, contains: List[str]) -> int:
    """Print repository URLs from one page of search results.

    Returns how many repositories are still left to print; the caller stops
    paginating once this reaches zero. `contains` is only used by the disabled
    content filter below.
    """
    for repo in data["items"]:
        # Skip repositories whose full name contains "ailearning".
        if "ailearning" in repo["full_name"]:
            continue
        print(repo["html_url"])
        counter -= 1
        if counter == 0:
            break
        # Disabled content filter: only count repositories whose root directory
        # contains one of the file names listed in `contains`.
        # repo_contents_url = f"https://api.github.com/repos/{repo['full_name']}/contents"
        # repo_contents_response = requests.get(repo_contents_url)
        # repo_contents_response.raise_for_status()
        # if repo_contents_response.status_code == 200:
        #     repo_contents_data = repo_contents_response.json()
        #     requirements_files = [
        #         content["name"]
        #         for content in repo_contents_data
        #         if content["type"] == "file"
        #         and (len(contains) == 0 or content["name"] in contains)
        #     ]
        #     if requirements_files:
        #         print(repo["html_url"])
        #         counter -= 1
        #         if counter == 0:
        #             break
    return counter
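
# For reference, `data` is the decoded search response, shaped roughly like
# {"total_count": ..., "items": [{"full_name": "owner/repo", "html_url": ...}, ...]}.
# A minimal sketch of calling print_page on a single page fetched by hand
# (assumes GIT_TOKEN is set and mirrors the request made in scrape_repos below):
#
#   resp = requests.get(
#       URL,
#       headers={"Authorization": f"token {token}"},
#       params={"q": "language:python stars:>=100", "per_page": 100},
#   )
#   remaining = print_page(resp.json(), counter=10, contains=[])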


def scrape_repos(
    max: int = 20,
    contains: List[str] = ["requirements.txt"],
    max_stars: int = 0,
    min_stars: int = 0,
    sort_by: str = "stars",
):
    """Search GitHub for Python repositories and print up to `max` of their URLs."""
    if isinstance(contains, str):
        contains = [contains]

    # Build the search query using GitHub's search qualifiers.
    q = "language:python"
    if max_stars != 0:
        q += f" stars:{min_stars}..{max_stars}"
    else:
        q += f" stars:>={min_stars}"

    params = {
        "q": q,
        "sort": sort_by,
        "order": "desc",
        "per_page": 100,  # Maximum results per page allowed by the API
    }

    # Authenticate with the token read from GIT_TOKEN.
    headers = {"Authorization": f"token {token}"}

    # Fetch and process the first page of results.
    response = requests.get(URL, headers=headers, params=params)
    response.raise_for_status()  # Raise an exception for 4xx or 5xx status codes
    data = response.json()
    max = print_page(data, max, contains=contains)

    # Follow pagination links until nothing is left to print or there are no
    # more pages; `requests` parses the Link response header into response.links.
    while max != 0 and "next" in response.links:
        next_url = response.links["next"]["url"]
        response = requests.get(next_url, headers=headers)
        response.raise_for_status()  # Raise an exception for 4xx or 5xx status codes
        data = response.json()
        max = print_page(data, max, contains=contains)
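
# A minimal sketch of calling scrape_repos directly, e.g. from a REPL (assumes
# GIT_TOKEN is set; prints the 20 most-starred Python repositories with at
# least 500 stars):
#
#   scrape_repos(max=20, contains=[], min_stars=500)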


def get_repository_language(git_url: str) -> Optional[str]:
    """Return the most-used language of a repository, or None if the lookup fails."""
    # Extract the owner and repository name from the Git URL.
    owner, repo = git_url.replace(".git", "").split("/")[-2:]

    # Fetch the repository's language breakdown (bytes of code per language).
    languages_url = f"https://api.github.com/repos/{owner}/{repo}/languages"
    languages_response = requests.get(languages_url)
    if languages_response.status_code == 200:
        languages_data = languages_response.json()
        # The most used language is the one with the largest byte count.
        most_used_language = max(languages_data, key=languages_data.get)
        return most_used_language
    else:
        print(f"Failed to retrieve languages for repository {owner}/{repo}")
        return None
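
# A minimal usage sketch (the repository URL is only an example; the request is
# unauthenticated, so it counts against the unauthenticated rate limit):
#
#   lang = get_repository_language("https://github.com/psf/requests.git")
#   print(lang)  # e.g. "Python"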


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Scrape GitHub for Python repositories.")
    parser.add_argument(
        "--max_stars",
        type=int,
        default=0,
        help="maximum number of stars a repository may have (0 means no upper bound)",
    )
    parser.add_argument(
        "--min_stars",
        type=int,
        default=0,
        help="minimum number of stars a repository must have",
    )
    parser.add_argument(
        "--sort_by",
        default="stars",
        help="field to sort search results by (e.g. stars, forks, updated)",
    )
    args = parser.parse_args()

    scrape_repos(
        max=100,
        contains=[],
        max_stars=args.max_stars,
        min_stars=args.min_stars,
        sort_by=args.sort_by,
    )
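
# Example invocation (assumes a GitHub personal access token is exported as GIT_TOKEN):
#
#   GIT_TOKEN=<your token> python git_scraping.py --min_stars 100 --max_stars 1000 --sort_by stars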