Merge pull request #133 from Kaleido66/feat/add-repo-data
feat/add repository dashboard
bifenglin authored Dec 2, 2024
2 parents ce6f565 + 0e87d63 commit 4ce71c8
Showing 5 changed files with 362 additions and 1 deletion.
37 changes: 37 additions & 0 deletions .github/workflows/updateDashboardRepositoryData.yml
@@ -0,0 +1,37 @@
name: Monthly Python Script Execution

on:
schedule:
    # Run the scripts at 00:00 UTC on the 2nd of every month
    - cron: '0 0 2 * *'
  workflow_dispatch: # Also allow manual triggering of the workflow

jobs:
run_python_script:
    runs-on: ubuntu-latest # Use the GitHub-hosted Ubuntu runner

steps:
- name: Checkout code
        uses: actions/checkout@v3 # Check out the repository code

- name: Set up Python
uses: actions/setup-python@v4
with:
          python-version: '3.10' # Set the Python version

- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
          pip install -r requirements.txt # Install Python dependencies
- name: Run the Python script
run: |
          python dashboard/company/scripts/workflow.py # Run the workflow.py script
python dashboard/repository/scripts/repo_name.py
python dashboard/repository/scripts/repo_data.py
env:
xlabDB_HOST: ${{ secrets.DB_HOST }}
xlabDB_USER: ${{ secrets.DB_USER }}
xlabDB_PASSWORD: ${{ secrets.DB_PASSWORD }}
dashboardDB_HOST: ${{ secrets.DASHBOARD_DB_HOST }}
dashboardDB_USER: ${{ secrets.DASHBOARD_DB_USER }}
dashboardDB_PASSWORD: ${{ secrets.DASHBOARD_DB_PASSWORD }}
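
The six env entries are how the database credentials reach the Python scripts: the workflow exports them from repository secrets, and the scripts read them back from the process environment. A minimal sketch of that hand-off (variable names taken from the env block above; nothing else is implied):

import os

# Credentials exported by the workflow's env block
xlab_host = os.environ['xlabDB_HOST']
xlab_user = os.environ['xlabDB_USER']
xlab_password = os.environ['xlabDB_PASSWORD']
dashboard_host = os.environ['dashboardDB_HOST']
dashboard_user = os.environ['dashboardDB_USER']
dashboard_password = os.environ['dashboardDB_PASSWORD']
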
209 changes: 209 additions & 0 deletions dashboard/repository/scripts/repo_data.py
@@ -0,0 +1,209 @@
import os
import re
import pandas as pd
import requests
from datetime import datetime, timedelta
from clickhouse_driver import Client

# Download and aggregate monthly metric data
def down_data_and_aggregate(url, org_repo_platform_df, metrics_list):
    """Download each metric and aggregate everything into a single DataFrame."""
    all_data = pd.DataFrame()  # Start with an empty DataFrame

for index, row in org_repo_platform_df.iterrows():
org_repo = row['repo_name']
platform = row['platform']

        # Iterate over each metric
for metric in metrics_list:
cur_url = f"{url}{platform}/{org_repo}/{metric}"
response = requests.get(cur_url, timeout=30)

if response.status_code == 200:
data = response.json()
filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}-\d{2}$", k)}

                metric_name = metric.split('.')[0]  # Metric name without the file extension
df = pd.DataFrame(list(filtered_data.items()), columns=['t_month', metric_name])
df['org_repo'] = org_repo
                all_data = pd.concat([all_data, df], axis=0)  # Accumulate the data
else:
print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

    # Pivot by "org_repo" and "t_month" so each metric becomes its own column
all_data = all_data.pivot_table(index=['org_repo', 't_month'],
values=[metric.split('.')[0] for metric in metrics_list],
aggfunc='first').reset_index()

return all_data
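
Each OpenDigger metric file is a JSON object keyed by time labels, and the regex above keeps only the monthly "yyyy-mm" keys. A small sketch of that filtering step, with invented sample values:

import re

data = {"2024": 480, "2024-10": 35, "2024-11": 52}  # invented sample payload
filtered = {k: v for k, v in data.items() if re.match(r"^\d{4}-\d{2}$", k)}
print(filtered)  # {'2024-10': 35, '2024-11': 52}
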

# Get the most recent month of data for each project
def get_latest_data(all_data):
    """Return the most recent month of data for each project."""
all_data['t_month'] = pd.to_datetime(all_data['t_month'], format='%Y-%m')
latest_data = all_data.sort_values('t_month').groupby('org_repo').tail(1)
latest_data['t_month'] = latest_data['t_month'].dt.strftime('%Y-%m')
return latest_data
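
sort_values followed by groupby(...).tail(1) keeps exactly one row per project, the chronologically last one. A tiny worked example with invented values:

import pandas as pd

df = pd.DataFrame({
    'org_repo': ['a/x', 'a/x', 'b/y'],
    't_month':  ['2024-10', '2024-11', '2024-09'],
    'activity': [1.0, 2.0, 3.0],
})
df['t_month'] = pd.to_datetime(df['t_month'], format='%Y-%m')
latest = df.sort_values('t_month').groupby('org_repo').tail(1)
# -> one row per org_repo: ('a/x', 2024-11) and ('b/y', 2024-09)
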

# Download the last six months of data and aggregate them into one DataFrame
def down_data_and_aggregate_contributor(url, org_repo_platform_df, metric):
    """Download the last six months of contributor data and aggregate it into a single DataFrame."""
    all_data = pd.DataFrame()  # Start with an empty DataFrame to collect everything

    # Label the last six calendar months as "yyyy-mm"; month-period arithmetic
    # avoids the drift that a 30-days-per-month approximation can introduce
    current_period = pd.Timestamp.now().to_period('M')
    recent_six_months = [str(current_period - i) for i in range(6)]

    # Iterate over each repository/platform pair
    for index, row in org_repo_platform_df.iterrows():
        org_repo = row['repo_name']
        platform = row['platform']

        # Build the request URL
        cur_url = f"{url}{platform}/{org_repo}/{metric}"
        response = requests.get(cur_url, timeout=30)

if response.status_code == 200:
data = response.json()

            # Keep only "yyyy-mm" keys that fall within the last six months
filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}-\d{2}$", k) and k in recent_six_months}

            # Build a DataFrame from the per-contributor rows
rows = []
for month, contributors in filtered_data.items():
for contributor, value in contributors:
rows.append([org_repo, month, contributor, value])

df = pd.DataFrame(rows, columns=['org_repo', 't_month', 'contributor', 'value'])
            all_data = pd.concat([all_data, df], axis=0)  # Accumulate the data
else:
print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

return all_data
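
The inner unpacking loop relies on activity_details mapping each month to a list of [contributor, value] pairs. A hedged sketch of the flattening, with invented data (the repo name and values are illustrative only):

import pandas as pd

filtered_data = {  # invented sample in the shape the loop expects
    "2024-11": [["alice", 12.3], ["bob", 4.56]],
    "2024-10": [["alice", 9.87]],
}
rows = [["some-org/some-repo", month, c, v]
        for month, pairs in filtered_data.items() for c, v in pairs]
df = pd.DataFrame(rows, columns=['org_repo', 't_month', 'contributor', 'value'])
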

# Download yearly data and aggregate per project
def down_data_and_aggregate_yearly(url, org_repo_platform_df, metrics_dict):
    """Download yearly data, sum it per project, and aggregate into a single DataFrame."""
    all_data = pd.DataFrame()  # Start with an empty DataFrame to collect everything

    # Iterate over each repository, platform, and requested metric
for index, row in org_repo_platform_df.iterrows():
org_repo = row['repo_name']
platform = row['platform']

repo_data = {'org_repo': org_repo, 'stars': 0, 'forks': 0, 'participants': 0, 'merged_PRs': 0}

for metric, metric_name in metrics_dict.items():
cur_url = f"{url}{platform}/{org_repo}/{metric}"
            response = requests.get(cur_url, timeout=30)

if response.status_code == 200:
data = response.json()
                # Keep only entries whose key is a "yyyy" year
filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}$", k)}

                # Sum the current metric and store it in the repo_data dict
repo_data[metric_name] = sum(filtered_data.values())
else:
print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

        # Append this project's totals to the overall data
all_data = pd.concat([all_data, pd.DataFrame([repo_data])], ignore_index=True)

return all_data

# Save data to ClickHouse
def save_to_clickhouse(client, table_name, df, columns):
    """Save a DataFrame to ClickHouse."""
    # Make sure the date field is a string
    if 't_month' in df.columns:
        df['t_month'] = df['t_month'].astype(str)

    # Empty the target table first
    client.execute(f"TRUNCATE TABLE {table_name}")

    # Explicitly convert NaN to None (NULL)
    df = df.applymap(lambda x: None if pd.isna(x) else x)

    # Convert to a list of record dicts and insert
    records = df.to_dict('records')
    client.execute(f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES", records)
    print(f"Data saved to table {table_name}")

# Query data from ClickHouse
def query_clickhouse(client, table_name):
    """Query all rows from a ClickHouse table."""
    query = f"SELECT * FROM {table_name}"
    result = client.execute(query)
    return pd.DataFrame(result, columns=['org_repo', 't_month', 'contributor', 'value'])

# Load the repo_name and platform columns from ClickHouse
def load_org_repo_platform_from_clickhouse(client, table_name):
    """Query the repo_name and platform columns from a ClickHouse table."""
    query = f"SELECT repo_name, platform FROM {table_name}"
    result = client.execute(query)
    return pd.DataFrame(result, columns=['repo_name', 'platform'])


if __name__ == '__main__':
    url = "https://oss.x-lab.info/open_digger/"  # Base API URL
    platform_project_mapping_table = 'platform_project_mapping'  # Table that stores the repo_name and platform info

    # Read the target-server credentials from the environment
    # (exported by the GitHub Actions workflow)
    dashboard_host = os.environ['dashboardDB_HOST']
    dashboard_user = os.environ['dashboardDB_USER']
    dashboard_password = os.environ['dashboardDB_PASSWORD']

    # Connect to ClickHouse
    target_client = Client(
        host=dashboard_host,  # Target server address
        port=9000,  # ClickHouse default port
        user=dashboard_user,  # Target server user name
        password=dashboard_password,  # Target server password
        database='opensource',  # Target database name
        send_receive_timeout=600
    )

    # Load the repo_name and platform columns from the ClickHouse table
    org_repo_platform_df = load_org_repo_platform_from_clickhouse(target_client, platform_project_mapping_table)

# -------------------- code_change_data Metric Set -------------------- (code change data)
metrics_list_1 = ["code_change_lines_add.json", "code_change_lines_remove.json", "code_change_lines_sum.json"]
all_data_1 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_1)
aggregated_table_1 = "REPO_code_change_data"
save_to_clickhouse(target_client, aggregated_table_1, all_data_1, ['org_repo', 't_month', 'code_change_lines_add', 'code_change_lines_remove', 'code_change_lines_sum'])

# -------------------- issues_data Metric Set -------------------- (issue data)
metrics_list_2 = ["issues_new.json", "issues_closed.json", "issue_comments.json"]
all_data_2 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_2)
aggregated_table_2 = "REPO_issues_data"
save_to_clickhouse(target_client, aggregated_table_2, all_data_2, ['org_repo', 't_month', 'issues_new', 'issues_closed', 'issue_comments'])

# -------------------- pr_data Metric Set -------------------- (pull request data)
metrics_list_3 = ["change_requests.json", "change_requests_accepted.json"]
all_data_3 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_3)
aggregated_table_3 = "REPO_pr_data"
save_to_clickhouse(target_client, aggregated_table_3, all_data_3, ['org_repo', 't_month', 'change_requests', 'change_requests_accepted'])

# -------------------- aggregated_data Metric Set -------------------- (activity & attention)
metrics_list_4 = ["activity.json", "openrank.json", "attention.json"]
all_data_4 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_4)
aggregated_table_4 = "REPO_aggregated_data"
save_to_clickhouse(target_client, aggregated_table_4, all_data_4, ['org_repo', 't_month', 'activity', 'openrank', 'attention'])

# -------------------- activity_details Metric Set -------------------- (monthly activity details)
metric_monthly = "activity_details.json"
all_data_monthly = down_data_and_aggregate_contributor(url, org_repo_platform_df, metric_monthly)
aggregated_table_monthly = "REPO_activity_details"
save_to_clickhouse(target_client, aggregated_table_monthly, all_data_monthly, ['org_repo', 't_month', 'contributor', 'value'])

# -------------------- starfork_participant_data Metric Set -------------------- (yearly star/fork/participant data)
metrics_dict_yearly = {
"stars.json": "stars",
"technical_fork.json": "forks",
"participants.json": "participants",
"change_requests_accepted.json": "merged_PRs"
}
all_data_yearly = down_data_and_aggregate_yearly(url, org_repo_platform_df, metrics_dict_yearly)
aggregated_table_yearly = "REPO_starfork_participant_data"
save_to_clickhouse(target_client, aggregated_table_yearly, all_data_yearly, ['org_repo', 'stars', 'forks', 'participants', 'merged_PRs'])

# -------------------- latest_month_data Metric Set -------------------- (latest month data)
latest_data_4 = get_latest_data(all_data_4)
latest_table_4 = "REPO_latest_month_data"
save_to_clickhouse(target_client, latest_table_4, latest_data_4, ['org_repo', 't_month', 'activity', 'openrank', 'attention'])
113 changes: 113 additions & 0 deletions dashboard/repository/scripts/repo_name.py
@@ -0,0 +1,113 @@
import requests
import pandas as pd
from datetime import datetime
from clickhouse_driver import Client

# Get the current year and month; the main program below falls back to
# earlier months when the latest month has no data yet
def get_latest_month():
    today = datetime.today()
    year = today.year
    month = today.month
    return year, month

# Fetch leaderboard data for a given month, index, and region
def fetch_data(year, month, index, region):
    url = f'https://xlab-open-source.oss-cn-beijing.aliyuncs.com/open_leaderboard/{index}/repo/{region}/{year}{month}.json'
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        return data['data']
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return []
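
fetch_data returns payload['data'], and the main program later reads each entry's repository name from item['item']['name']. The snippet below shows that assumed payload shape with invented entries (any other fields in the real leaderboard JSON are omitted):

payload = {
    "data": [
        {"item": {"name": "owner/repo-a"}},
        {"item": {"name": "owner/repo-b"}},
    ]
}
names = {entry['item']['name'] for entry in payload['data']}  # {'owner/repo-a', 'owner/repo-b'}
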

# Query ClickHouse for platform/repo_name pairs in batches
def query_clickhouse(client, project_names, batch_size=100):
all_results = []

for i in range(0, len(project_names), batch_size):
batch_names = project_names[i:i + batch_size]
query = """
SELECT lower(CAST(platform AS String)), repo_name
FROM export_repo
WHERE repo_name IN %(batch_names)s
"""
try:
batch_result = client.execute(query, {'batch_names': batch_names})
all_results.extend(batch_result)
except Exception as e:
print(f"查询失败: {e}")
continue

df_result = pd.DataFrame(all_results, columns=['platform', 'repo_name'])
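    # Sorting by platform in descending order before drop_duplicates means that,
    # when a repo_name exists on both platforms, the lexicographically later
    # platform string (e.g. 'github' over 'gitee') is the row that survives.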
df_result = (
df_result.sort_values(by='platform', ascending=False)
.drop_duplicates(subset=['repo_name'], keep='first')
)
return df_result

# Save results to ClickHouse
def save_to_clickhouse(client, table_name, df):
    client.execute(f"TRUNCATE TABLE {table_name}")
    client.execute(
        f"INSERT INTO {table_name} (platform, repo_name) VALUES",
        df.to_dict('records')
    )
    print(f"Data saved to table {table_name}")

# Main program
if __name__ == "__main__":
    # Step 1: Fetch the data, stepping back one month at a time until data is found
year, month = get_latest_month()
data = None
while data is None or len(data) == 0:
month_str = str(month).zfill(2)
indices = ['activity', 'open_rank']
regions = ['chinese', 'global']
all_data = []

for index in indices:
for region in regions:
temp_data = fetch_data(year, month_str, index, region)
if temp_data:
all_data.extend(temp_data[:300])

if all_data:
data = all_data
else:
if month == 1:
month = 12
year -= 1
else:
month -= 1

    # Step 2: Extract the project names and de-duplicate them
project_names = list({item['item']['name'] for item in data})

    # Step 3: Connect to the ClickHouse databases
    # ClickHouse instance the data is read from
source_client = Client(
host='cc-2ze7189376o5m9759.public.clickhouse.ads.aliyuncs.com',
port=9000,
user='xlab',
password='PASSWORD',
database='opensource',
send_receive_timeout=600
)

    # ClickHouse instance the results are written to
target_client = Client(
host='47.116.118.218',
port=9000,
user='USERS',
database='opensource',
send_receive_timeout=600
)

    # Step 4: Query and process the ClickHouse data
df_result = query_clickhouse(source_client, project_names)

    # Step 5: Save the results to ClickHouse
table_name = "platform_project_mapping"
save_to_clickhouse(target_client, table_name, df_result)
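
For reference, a hypothetical definition of the mapping table the script truncates and refills, sketched for illustration (the real schema may differ):

target_client.execute("""
    CREATE TABLE IF NOT EXISTS platform_project_mapping (
        platform  String,
        repo_name String
    ) ENGINE = MergeTree
    ORDER BY repo_name
""")
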
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
requests
pandas
clickhouse-driver
datetime
3 changes: 2 additions & 1 deletion src/components/table.js
@@ -298,11 +298,12 @@ function dashboard(text, index, t_month, object) {
       repo_name,
       t_month_copy,
       t_month,
+      org_repo: `${org_name}/${repo_name}`,
     };
     return (
       <a
         href={
-          'https://dataease.x-lab.info/link/dqMbZrBk?attachParams=' +
+          'https://dataease.x-lab.info/link/v5wLKVcF?attachParams=' +
           btoa(JSON.stringify(params))
         }
         target="_blank"
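
The dashboard link passes its parameters as base64-encoded JSON (btoa(JSON.stringify(params))), and the new org_repo field travels inside that payload. The equivalent round-trip sketched in Python, with an illustrative subset of the parameters:

import base64
import json

params = {"repo_name": "repo", "t_month": "2024-11", "org_repo": "org/repo"}  # illustrative subset
attach = base64.b64encode(json.dumps(params).encode()).decode()
url = "https://dataease.x-lab.info/link/v5wLKVcF?attachParams=" + attach
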
