Merge pull request #133 from Kaleido66/feat/add-repo-data
feat/add repository dashboard
Showing 5 changed files with 362 additions and 1 deletion.
@@ -0,0 +1,37 @@
name: Monthly Python Script Execution

on:
  schedule:
    # Run the scripts at 00:00 UTC on the 2nd of every month
    - cron: '0 0 2 * *'
  workflow_dispatch: # also allow the workflow to be triggered manually

jobs:
  run_python_script:
    runs-on: ubuntu-latest # use the Ubuntu runner provided by GitHub

    steps:
      - name: Checkout code
        uses: actions/checkout@v3 # check out the repository code

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.10' # set the Python version

      - name: Install Python dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt # install the Python dependencies
      - name: Run the Python script
        run: |
          python dashboard/company/scripts/workflow.py # run the workflow.py script
          python dashboard/repository/scripts/repo_name.py
          python dashboard/repository/scripts/repo_data.py
        env:
          xlabDB_HOST: ${{ secrets.DB_HOST }}
          xlabDB_USER: ${{ secrets.DB_USER }}
          xlabDB_PASSWORD: ${{ secrets.DB_PASSWORD }}
          dashboardDB_HOST: ${{ secrets.DASHBOARD_DB_HOST }}
          dashboardDB_USER: ${{ secrets.DASHBOARD_DB_USER }}
          dashboardDB_PASSWORD: ${{ secrets.DASHBOARD_DB_PASSWORD }}
dashboard/repository/scripts/repo_data.py
@@ -0,0 +1,209 @@
import os
import pandas as pd
import requests
import re
from datetime import datetime, timedelta
from clickhouse_driver import Client


# Download and aggregate the data
def down_data_and_aggregate(url, org_repo_platform_df, metrics_list):
    """Download the data and aggregate it into a single DataFrame."""
    all_data = pd.DataFrame()  # initialize an empty DataFrame

    for index, row in org_repo_platform_df.iterrows():
        org_repo = row['repo_name']
        platform = row['platform']

        # Iterate over each metric
        for metric in metrics_list:
            cur_url = f"{url}{platform}/{org_repo}/{metric}"
            response = requests.get(cur_url, timeout=30)

            if response.status_code == 200:
                data = response.json()
                filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}-\d{2}$", k)}

                metric_name = metric.split('.')[0]  # metric name without the file extension
                df = pd.DataFrame(list(filtered_data.items()), columns=['t_month', metric_name])
                df['org_repo'] = org_repo
                all_data = pd.concat([all_data, df], axis=0)  # aggregate the data
            else:
                print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

    # Aggregate the data, pivoting on "org_repo" and "t_month"
    all_data = all_data.pivot_table(index=['org_repo', 't_month'],
                                    values=[metric.split('.')[0] for metric in metrics_list],
                                    aggfunc='first').reset_index()

    return all_data
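
# Note: each metric file fetched by down_data_and_aggregate is a JSON object
# keyed by time period. Alongside the monthly "YYYY-MM" keys, OpenDigger files
# typically also carry yearly ("YYYY") and quarterly ("YYYYQn") keys, which is
# why the regex above keeps only the monthly entries. Illustrative (made-up)
# payload:
#   {"2023": 120, "2023Q1": 30, "2023-01": 12, "2023-02": 9, ...}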

# Get the latest month of data for each project
def get_latest_data(all_data):
    """Return the most recent month of data for each project."""
    all_data['t_month'] = pd.to_datetime(all_data['t_month'], format='%Y-%m')
    latest_data = all_data.sort_values('t_month').groupby('org_repo').tail(1)
    latest_data['t_month'] = latest_data['t_month'].dt.strftime('%Y-%m')
    return latest_data

# Download the last six months of data and aggregate it into one DataFrame
def down_data_and_aggregate_contributor(url, org_repo_platform_df, metric):
    """Download the last six months of data and aggregate it into one DataFrame."""
    all_data = pd.DataFrame()  # initialize an empty DataFrame to collect all the data

    # Work out the last six months
    current_date = datetime.now()
    recent_six_months = [(current_date - timedelta(days=30 * i)).strftime("%Y-%m") for i in range(6)]

    # Iterate over each repository and platform
    for index, row in org_repo_platform_df.iterrows():
        org_repo = row['repo_name']
        platform = row['platform']

        # Build the URL
        cur_url = f"{url}{platform}/{org_repo}/{metric}"
        response = requests.get(cur_url, timeout=30)

        if response.status_code == 200:
            data = response.json()

            # Keep only "yyyy-mm" keys that fall within the last six months
            filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}-\d{2}$", k) and k in recent_six_months}

            # Build a DataFrame and merge the data
            rows = []
            for month, contributors in filtered_data.items():
                for contributor, value in contributors:
                    rows.append([org_repo, month, contributor, value])

            df = pd.DataFrame(rows, columns=['org_repo', 't_month', 'contributor', 'value'])
            all_data = pd.concat([all_data, df], axis=0)  # aggregate the data
        else:
            print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

    return all_data
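
# Note: activity_details.json is assumed to map each month to a list of
# [contributor, activity_value] pairs, which is what the tuple unpacking
# above relies on. Illustrative (made-up) payload:
#   {"2024-01": [["alice", 12.3], ["bob", 4.5]], "2024-02": [["alice", 8.0]]}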

# Yearly data download and aggregation
def down_data_and_aggregate_yearly(url, org_repo_platform_df, metrics_dict):
    """Download the data, sum it per project, and aggregate it into one DataFrame."""
    all_data = pd.DataFrame()  # initialize an empty DataFrame to collect all the data

    # Iterate over each repository, platform, and requested metric
    for index, row in org_repo_platform_df.iterrows():
        org_repo = row['repo_name']
        platform = row['platform']

        repo_data = {'org_repo': org_repo, 'stars': 0, 'forks': 0, 'participants': 0, 'merged_PRs': 0}

        for metric, metric_name in metrics_dict.items():
            cur_url = f"{url}{platform}/{org_repo}/{metric}"
            response = requests.get(cur_url, timeout=30)

            if response.status_code == 200:
                data = response.json()
                # Keep only keys in "yyyy" (year) format
                filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}$", k)}

                # Summing the yearly values gives an all-time total for this
                # metric; store it in the repo_data dict
                repo_data[metric_name] = sum(filtered_data.values())
            else:
                print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

        # Append this project's data to the overall DataFrame
        all_data = pd.concat([all_data, pd.DataFrame([repo_data])], ignore_index=True)

    return all_data

# Save the data to ClickHouse
def save_to_clickhouse(client, table_name, df, columns):
    """Save a DataFrame to ClickHouse."""
    # Make sure the date field is a string
    if 't_month' in df.columns:
        df['t_month'] = df['t_month'].astype(str)

    # Empty the target table
    client.execute(f"TRUNCATE TABLE {table_name}")

    # Explicitly convert NaN to None (NULL)
    df = df.applymap(lambda x: None if pd.isna(x) else x)

    # Convert to a list of record dicts and insert
    records = df.to_dict('records')
    client.execute(f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES", records)
    print(f"Data saved successfully to table {table_name}")
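
# The target tables are assumed to already exist. A hypothetical schema that
# would fit the inserts above (an illustration, not taken from this repo):
#   CREATE TABLE REPO_aggregated_data (
#       org_repo  String,
#       t_month   String,
#       activity  Nullable(Float64),
#       openrank  Nullable(Float64),
#       attention Nullable(Float64)
#   ) ENGINE = MergeTree ORDER BY (org_repo, t_month)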

# Query data from ClickHouse
def query_clickhouse(client, table_name):
    """Query data from ClickHouse."""
    query = f"SELECT * FROM {table_name}"
    result = client.execute(query)
    return pd.DataFrame(result, columns=['org_repo', 't_month', 'contributor', 'value'])


# Load the repo_name and platform columns from ClickHouse
def load_org_repo_platform_from_clickhouse(client, table_name):
    """Query the repo_name and platform columns from ClickHouse."""
    query = f"SELECT repo_name, platform FROM {table_name}"
    result = client.execute(query)
    return pd.DataFrame(result, columns=['repo_name', 'platform'])


if __name__ == '__main__':
    url = "https://oss.x-lab.info/open_digger/"  # base API URL
    platform_project_mapping_table = 'platform_project_mapping'  # table holding the repo_name and platform info

    # Target ClickHouse credentials, taken from the dashboardDB_* environment
    # variables set in the workflow above
    dashboard_host = os.environ['dashboardDB_HOST']
    dashboard_user = os.environ['dashboardDB_USER']
    dashboard_password = os.environ['dashboardDB_PASSWORD']

    # Connect to ClickHouse
    target_client = Client(
        host=dashboard_host,  # target server address
        port=9000,  # ClickHouse default port
        user=dashboard_user,  # target server user
        password=dashboard_password,  # target server password
        database='opensource',  # target database name
        send_receive_timeout=600
    )

    # Load the repo_name and platform columns from the ClickHouse table
    org_repo_platform_df = load_org_repo_platform_from_clickhouse(target_client, platform_project_mapping_table)

    # -------------------- code_change_data Metric Set -------------------- (code change data)
    metrics_list_1 = ["code_change_lines_add.json", "code_change_lines_remove.json", "code_change_lines_sum.json"]
    all_data_1 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_1)
    aggregated_table_1 = "REPO_code_change_data"
    save_to_clickhouse(target_client, aggregated_table_1, all_data_1, ['org_repo', 't_month', 'code_change_lines_add', 'code_change_lines_remove', 'code_change_lines_sum'])

    # -------------------- issues_data Metric Set -------------------- (issue data)
    metrics_list_2 = ["issues_new.json", "issues_closed.json", "issue_comments.json"]
    all_data_2 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_2)
    aggregated_table_2 = "REPO_issues_data"
    save_to_clickhouse(target_client, aggregated_table_2, all_data_2, ['org_repo', 't_month', 'issues_new', 'issues_closed', 'issue_comments'])

    # -------------------- pr_data Metric Set -------------------- (pull request data)
    metrics_list_3 = ["change_requests.json", "change_requests_accepted.json"]
    all_data_3 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_3)
    aggregated_table_3 = "REPO_pr_data"
    save_to_clickhouse(target_client, aggregated_table_3, all_data_3, ['org_repo', 't_month', 'change_requests', 'change_requests_accepted'])

    # -------------------- aggregated_data Metric Set -------------------- (activity & attention)
    metrics_list_4 = ["activity.json", "openrank.json", "attention.json"]
    all_data_4 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_4)
    aggregated_table_4 = "REPO_aggregated_data"
    save_to_clickhouse(target_client, aggregated_table_4, all_data_4, ['org_repo', 't_month', 'activity', 'openrank', 'attention'])

    # -------------------- activity_details Metric Set -------------------- (monthly activity details)
    metric_monthly = "activity_details.json"
    all_data_monthly = down_data_and_aggregate_contributor(url, org_repo_platform_df, metric_monthly)
    aggregated_table_monthly = "REPO_activity_details"
    save_to_clickhouse(target_client, aggregated_table_monthly, all_data_monthly, ['org_repo', 't_month', 'contributor', 'value'])

    # -------------------- starfork_participant_data Metric Set -------------------- (yearly star/fork/participant data)
    metrics_dict_yearly = {
        "stars.json": "stars",
        "technical_fork.json": "forks",
        "participants.json": "participants",
        "change_requests_accepted.json": "merged_PRs"
    }
    all_data_yearly = down_data_and_aggregate_yearly(url, org_repo_platform_df, metrics_dict_yearly)
    aggregated_table_yearly = "REPO_starfork_participant_data"
    save_to_clickhouse(target_client, aggregated_table_yearly, all_data_yearly, ['org_repo', 'stars', 'forks', 'participants', 'merged_PRs'])

    # -------------------- latest_month_data Metric Set -------------------- (latest month data)
    latest_data_4 = get_latest_data(all_data_4)
    latest_table_4 = "REPO_latest_month_data"
    save_to_clickhouse(target_client, latest_table_4, latest_data_4, ['org_repo', 't_month', 'activity', 'openrank', 'attention'])
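
# Tables written by this script: REPO_code_change_data, REPO_issues_data,
# REPO_pr_data, REPO_aggregated_data, REPO_activity_details,
# REPO_starfork_participant_data, and REPO_latest_month_data.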
dashboard/repository/scripts/repo_name.py
@@ -0,0 +1,113 @@
import os
import requests
import pandas as pd
from datetime import datetime
from clickhouse_driver import Client


# Get the current year and month; if the latest month has no data yet,
# the main loop below falls back to the previous month
def get_latest_month():
    today = datetime.today()
    year = today.year
    month = today.month
    return year, month


# Fetch the data
def fetch_data(year, month, index, region):
    url = f'https://xlab-open-source.oss-cn-beijing.aliyuncs.com/open_leaderboard/{index}/repo/{region}/{year}{month}.json'
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        return data['data']
    except requests.exceptions.RequestException as e:
        print(f"Error while fetching data: {e}")
        return []
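
# Note: each leaderboard file is assumed to look roughly like the following
# (illustrative; fields other than "data" and "item"/"name" are guesses):
#   {"data": [{"rank": 1, "item": {"name": "org/repo"}, ...}, ...]}
# which is why the main block below reads item['item']['name'].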

# Query data from ClickHouse
def query_clickhouse(client, project_names, batch_size=100):
    all_results = []

    for i in range(0, len(project_names), batch_size):
        batch_names = project_names[i:i + batch_size]
        query = """
            SELECT lower(CAST(platform AS String)), repo_name
            FROM export_repo
            WHERE repo_name IN %(batch_names)s
        """
        try:
            batch_result = client.execute(query, {'batch_names': batch_names})
            all_results.extend(batch_result)
        except Exception as e:
            print(f"Query failed: {e}")
            continue

    df_result = pd.DataFrame(all_results, columns=['platform', 'repo_name'])
    # Deduplicate repos that exist on more than one platform; sorting the
    # platform column in descending order puts 'github' before 'gitee', so
    # the GitHub row is the one kept
    df_result = (
        df_result.sort_values(by='platform', ascending=False)
        .drop_duplicates(subset=['repo_name'], keep='first')
    )
    return df_result

# Save to ClickHouse
def save_to_clickhouse(client, table_name, df):
    client.execute(f"TRUNCATE TABLE {table_name}")
    client.execute(
        f"INSERT INTO {table_name} (platform, repo_name) VALUES",
        df.to_dict('records')
    )
    print(f"Data saved successfully to table {table_name}")

# Main program
if __name__ == "__main__":
    # Step 1: fetch the data
    year, month = get_latest_month()
    data = None
    while data is None or len(data) == 0:
        month_str = str(month).zfill(2)
        indices = ['activity', 'open_rank']
        regions = ['chinese', 'global']
        all_data = []

        for index in indices:
            for region in regions:
                temp_data = fetch_data(year, month_str, index, region)
                if temp_data:
                    all_data.extend(temp_data[:300])

        if all_data:
            data = all_data
        else:
            # No data published for this month yet; fall back one month
            if month == 1:
                month = 12
                year -= 1
            else:
                month -= 1

    # Step 2: extract the project names and deduplicate them
    project_names = list({item['item']['name'] for item in data})

    # Step 3: connect to the ClickHouse databases
    # ClickHouse instance the data is read from; the password is taken from
    # the xlabDB_PASSWORD environment variable set in the workflow above
    source_client = Client(
        host='cc-2ze7189376o5m9759.public.clickhouse.ads.aliyuncs.com',
        port=9000,
        user='xlab',
        password=os.environ['xlabDB_PASSWORD'],
        database='opensource',
        send_receive_timeout=600
    )

    # ClickHouse instance the data is saved to; credentials are taken from
    # the dashboardDB_* environment variables set in the workflow above
    target_client = Client(
        host='47.116.118.218',
        port=9000,
        user=os.environ['dashboardDB_USER'],
        password=os.environ['dashboardDB_PASSWORD'],
        database='opensource',
        send_receive_timeout=600
    )

    # Step 4: query and process the ClickHouse data
    df_result = query_clickhouse(source_client, project_names)

    # Step 5: save the result to ClickHouse
    table_name = "platform_project_mapping"
    save_to_clickhouse(target_client, table_name, df_result)
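
# This script populates platform_project_mapping, which repo_data.py then
# reads; the workflow therefore runs repo_name.py before repo_data.py.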
requirements.txt
@@ -1,3 +1,4 @@
requests
pandas
clickhouse-driver
datetime
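# Note: datetime is part of the Python standard library; pip resolves this
# line to the third-party "DateTime" package on PyPI, which none of the
# scripts above appear to use, so the entry is likely unnecessary.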