Merge pull request #133 from Kaleido66/feat/add-repo-data
feat/add repository dashboard
bifenglin authored Dec 2, 2024
2 parents ce6f565 + 0e87d63 commit 4ce71c8
Showing 5 changed files with 362 additions and 1 deletion.
37 changes: 37 additions & 0 deletions .github/workflows/updateDashboardRepositoryData.yml
@@ -0,0 +1,37 @@
name: Monthly Python Script Execution

on:
schedule:
    # Run the scripts at 00:00 UTC on the 2nd of every month
    - cron: '0 0 2 * *'
  workflow_dispatch: # Also allow manual triggering of the workflow

jobs:
run_python_script:
    runs-on: ubuntu-latest # Use the GitHub-hosted Ubuntu runner

steps:
- name: Checkout code
        uses: actions/checkout@v3 # Check out the repository code

- name: Set up Python
uses: actions/setup-python@v4
with:
          python-version: '3.10' # Set the Python version

- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
          pip install -r requirements.txt # Install Python dependencies
- name: Run the Python script
run: |
          python dashboard/company/scripts/workflow.py # Run the workflow.py script
python dashboard/repository/scripts/repo_name.py
python dashboard/repository/scripts/repo_data.py
env:
xlabDB_HOST: ${{ secrets.DB_HOST }}
xlabDB_USER: ${{ secrets.DB_USER }}
xlabDB_PASSWORD: ${{ secrets.DB_PASSWORD }}
dashboardDB_HOST: ${{ secrets.DASHBOARD_DB_HOST }}
dashboardDB_USER: ${{ secrets.DASHBOARD_DB_USER }}
dashboardDB_PASSWORD: ${{ secrets.DASHBOARD_DB_PASSWORD }}
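
The six env entries are how the database credentials reach the Python scripts: the workflow exports them from repository secrets, and the scripts read them back from the process environment. A minimal sketch of that hand-off (variable names taken from the env block above; nothing else is implied):

import os

# Credentials exported by the workflow's env block
xlab_host = os.environ['xlabDB_HOST']
xlab_user = os.environ['xlabDB_USER']
xlab_password = os.environ['xlabDB_PASSWORD']
dashboard_host = os.environ['dashboardDB_HOST']
dashboard_user = os.environ['dashboardDB_USER']
dashboard_password = os.environ['dashboardDB_PASSWORD']
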
209 changes: 209 additions & 0 deletions dashboard/repository/scripts/repo_data.py
@@ -0,0 +1,209 @@
import os
import re
import pandas as pd
import requests
from datetime import datetime, timedelta
from clickhouse_driver import Client

# Download and aggregate monthly metric data
def down_data_and_aggregate(url, org_repo_platform_df, metrics_list):
    """Download each metric and aggregate everything into a single DataFrame."""
    all_data = pd.DataFrame()  # Start with an empty DataFrame

for index, row in org_repo_platform_df.iterrows():
org_repo = row['repo_name']
platform = row['platform']

        # Iterate over each metric
for metric in metrics_list:
cur_url = f"{url}{platform}/{org_repo}/{metric}"
response = requests.get(cur_url, timeout=30)

if response.status_code == 200:
data = response.json()
filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}-\d{2}$", k)}

                metric_name = metric.split('.')[0]  # Metric name without the file extension
df = pd.DataFrame(list(filtered_data.items()), columns=['t_month', metric_name])
df['org_repo'] = org_repo
                all_data = pd.concat([all_data, df], axis=0)  # Accumulate the data
else:
print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

    # Pivot by "org_repo" and "t_month" so each metric becomes its own column
all_data = all_data.pivot_table(index=['org_repo', 't_month'],
values=[metric.split('.')[0] for metric in metrics_list],
aggfunc='first').reset_index()

return all_data
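
Each OpenDigger metric file is a JSON object keyed by time labels, and the regex above keeps only the monthly "yyyy-mm" keys. A small sketch of that filtering step, with invented sample values:

import re

data = {"2024": 480, "2024-10": 35, "2024-11": 52}  # invented sample payload
filtered = {k: v for k, v in data.items() if re.match(r"^\d{4}-\d{2}$", k)}
print(filtered)  # {'2024-10': 35, '2024-11': 52}
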

# Get the most recent month of data for each project
def get_latest_data(all_data):
    """Return the most recent month of data for each project."""
all_data['t_month'] = pd.to_datetime(all_data['t_month'], format='%Y-%m')
latest_data = all_data.sort_values('t_month').groupby('org_repo').tail(1)
latest_data['t_month'] = latest_data['t_month'].dt.strftime('%Y-%m')
return latest_data
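
sort_values followed by groupby(...).tail(1) keeps exactly one row per project, the chronologically last one. A tiny worked example with invented values:

import pandas as pd

df = pd.DataFrame({
    'org_repo': ['a/x', 'a/x', 'b/y'],
    't_month':  ['2024-10', '2024-11', '2024-09'],
    'activity': [1.0, 2.0, 3.0],
})
df['t_month'] = pd.to_datetime(df['t_month'], format='%Y-%m')
latest = df.sort_values('t_month').groupby('org_repo').tail(1)
# -> one row per org_repo: ('a/x', 2024-11) and ('b/y', 2024-09)
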

# Download the last six months of data and aggregate them into one DataFrame
def down_data_and_aggregate_contributor(url, org_repo_platform_df, metric):
    """Download the last six months of contributor data and aggregate it into a single DataFrame."""
    all_data = pd.DataFrame()  # Start with an empty DataFrame to collect everything

    # Label the last six calendar months as "yyyy-mm"; month-period arithmetic
    # avoids the drift that a 30-days-per-month approximation can introduce
    current_period = pd.Timestamp.now().to_period('M')
    recent_six_months = [str(current_period - i) for i in range(6)]

    # Iterate over each repository/platform pair
    for index, row in org_repo_platform_df.iterrows():
        org_repo = row['repo_name']
        platform = row['platform']

        # Build the request URL
        cur_url = f"{url}{platform}/{org_repo}/{metric}"
        response = requests.get(cur_url, timeout=30)

if response.status_code == 200:
data = response.json()

            # Keep only "yyyy-mm" keys that fall within the last six months
filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}-\d{2}$", k) and k in recent_six_months}

            # Build a DataFrame from the per-contributor rows
rows = []
for month, contributors in filtered_data.items():
for contributor, value in contributors:
rows.append([org_repo, month, contributor, value])

df = pd.DataFrame(rows, columns=['org_repo', 't_month', 'contributor', 'value'])
            all_data = pd.concat([all_data, df], axis=0)  # Accumulate the data
else:
print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

return all_data
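
The inner unpacking loop relies on activity_details mapping each month to a list of [contributor, value] pairs. A hedged sketch of the flattening, with invented data (the repo name and values are illustrative only):

import pandas as pd

filtered_data = {  # invented sample in the shape the loop expects
    "2024-11": [["alice", 12.3], ["bob", 4.56]],
    "2024-10": [["alice", 9.87]],
}
rows = [["some-org/some-repo", month, c, v]
        for month, pairs in filtered_data.items() for c, v in pairs]
df = pd.DataFrame(rows, columns=['org_repo', 't_month', 'contributor', 'value'])
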

# Download yearly data and aggregate per project
def down_data_and_aggregate_yearly(url, org_repo_platform_df, metrics_dict):
    """Download yearly data, sum it per project, and aggregate into a single DataFrame."""
    all_data = pd.DataFrame()  # Start with an empty DataFrame to collect everything

    # Iterate over each repository, platform, and requested metric
for index, row in org_repo_platform_df.iterrows():
org_repo = row['repo_name']
platform = row['platform']

repo_data = {'org_repo': org_repo, 'stars': 0, 'forks': 0, 'participants': 0, 'merged_PRs': 0}

for metric, metric_name in metrics_dict.items():
cur_url = f"{url}{platform}/{org_repo}/{metric}"
            response = requests.get(cur_url, timeout=30)

if response.status_code == 200:
data = response.json()
                # Keep only entries whose key is a "yyyy" year
filtered_data = {k: v for k, v in data.items() if re.match(r"^\d{4}$", k)}

                # Sum the current metric and store it in the repo_data dict
repo_data[metric_name] = sum(filtered_data.values())
else:
print(f"Error: {cur_url} - HTTP Status Code: {response.status_code}")

        # Append this project's totals to the overall data
all_data = pd.concat([all_data, pd.DataFrame([repo_data])], ignore_index=True)

return all_data

# Save data to ClickHouse
def save_to_clickhouse(client, table_name, df, columns):
    """Save a DataFrame to ClickHouse."""
    # Make sure the date field is a string
    if 't_month' in df.columns:
        df['t_month'] = df['t_month'].astype(str)

    # Empty the target table first
    client.execute(f"TRUNCATE TABLE {table_name}")

    # Explicitly convert NaN to None (NULL)
    df = df.applymap(lambda x: None if pd.isna(x) else x)

    # Convert to a list of record dicts and insert
    records = df.to_dict('records')
    client.execute(f"INSERT INTO {table_name} ({', '.join(columns)}) VALUES", records)
    print(f"Data saved to table {table_name}")

# Query data from ClickHouse
def query_clickhouse(client, table_name):
    """Query all rows from a ClickHouse table."""
    query = f"SELECT * FROM {table_name}"
    result = client.execute(query)
    return pd.DataFrame(result, columns=['org_repo', 't_month', 'contributor', 'value'])

# Load the repo_name and platform columns from ClickHouse
def load_org_repo_platform_from_clickhouse(client, table_name):
    """Query the repo_name and platform columns from a ClickHouse table."""
    query = f"SELECT repo_name, platform FROM {table_name}"
    result = client.execute(query)
    return pd.DataFrame(result, columns=['repo_name', 'platform'])


if __name__ == '__main__':
    url = "https://oss.x-lab.info/open_digger/"  # Base API URL
    platform_project_mapping_table = 'platform_project_mapping'  # Table that stores the repo_name and platform info

    # Read the target-server credentials from the environment
    # (exported by the GitHub Actions workflow)
    dashboard_host = os.environ['dashboardDB_HOST']
    dashboard_user = os.environ['dashboardDB_USER']
    dashboard_password = os.environ['dashboardDB_PASSWORD']

    # Connect to ClickHouse
    target_client = Client(
        host=dashboard_host,  # Target server address
        port=9000,  # ClickHouse default port
        user=dashboard_user,  # Target server user name
        password=dashboard_password,  # Target server password
        database='opensource',  # Target database name
        send_receive_timeout=600
    )

    # Load the repo_name and platform columns from the ClickHouse table
    org_repo_platform_df = load_org_repo_platform_from_clickhouse(target_client, platform_project_mapping_table)

# -------------------- code_change_data Metric Set -------------------- (code change data)
metrics_list_1 = ["code_change_lines_add.json", "code_change_lines_remove.json", "code_change_lines_sum.json"]
all_data_1 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_1)
aggregated_table_1 = "REPO_code_change_data"
save_to_clickhouse(target_client, aggregated_table_1, all_data_1, ['org_repo', 't_month', 'code_change_lines_add', 'code_change_lines_remove', 'code_change_lines_sum'])

# -------------------- issues_data Metric Set -------------------- (issue data)
metrics_list_2 = ["issues_new.json", "issues_closed.json", "issue_comments.json"]
all_data_2 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_2)
aggregated_table_2 = "REPO_issues_data"
save_to_clickhouse(target_client, aggregated_table_2, all_data_2, ['org_repo', 't_month', 'issues_new', 'issues_closed', 'issue_comments'])

# -------------------- pr_data Metric Set -------------------- (pull request data)
metrics_list_3 = ["change_requests.json", "change_requests_accepted.json"]
all_data_3 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_3)
aggregated_table_3 = "REPO_pr_data"
save_to_clickhouse(target_client, aggregated_table_3, all_data_3, ['org_repo', 't_month', 'change_requests', 'change_requests_accepted'])

# -------------------- aggregated_data Metric Set -------------------- (activity & attention)
metrics_list_4 = ["activity.json", "openrank.json", "attention.json"]
all_data_4 = down_data_and_aggregate(url, org_repo_platform_df, metrics_list_4)
aggregated_table_4 = "REPO_aggregated_data"
save_to_clickhouse(target_client, aggregated_table_4, all_data_4, ['org_repo', 't_month', 'activity', 'openrank', 'attention'])

# -------------------- activity_details Metric Set -------------------- (monthly activity details)
metric_monthly = "activity_details.json"
all_data_monthly = down_data_and_aggregate_contributor(url, org_repo_platform_df, metric_monthly)
aggregated_table_monthly = "REPO_activity_details"
save_to_clickhouse(target_client, aggregated_table_monthly, all_data_monthly, ['org_repo', 't_month', 'contributor', 'value'])

# -------------------- starfork_participant_data Metric Set -------------------- (yearly star/fork/participant data)
metrics_dict_yearly = {
"stars.json": "stars",
"technical_fork.json": "forks",
"participants.json": "participants",
"change_requests_accepted.json": "merged_PRs"
}
all_data_yearly = down_data_and_aggregate_yearly(url, org_repo_platform_df, metrics_dict_yearly)
aggregated_table_yearly = "REPO_starfork_participant_data"
save_to_clickhouse(target_client, aggregated_table_yearly, all_data_yearly, ['org_repo', 'stars', 'forks', 'participants', 'merged_PRs'])

# -------------------- latest_month_data Metric Set -------------------- (latest month data)
latest_data_4 = get_latest_data(all_data_4)
latest_table_4 = "REPO_latest_month_data"
save_to_clickhouse(target_client, latest_table_4, latest_data_4, ['org_repo', 't_month', 'activity', 'openrank', 'attention'])
113 changes: 113 additions & 0 deletions dashboard/repository/scripts/repo_name.py
@@ -0,0 +1,113 @@
import requests
import pandas as pd
from datetime import datetime
from clickhouse_driver import Client

# Get the current year and month; the main program below falls back to
# earlier months when the latest month has no data yet
def get_latest_month():
    today = datetime.today()
    year = today.year
    month = today.month
    return year, month

# Fetch leaderboard data for a given month, index, and region
def fetch_data(year, month, index, region):
    url = f'https://xlab-open-source.oss-cn-beijing.aliyuncs.com/open_leaderboard/{index}/repo/{region}/{year}{month}.json'
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        return data['data']
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return []
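
fetch_data returns payload['data'], and the main program later reads each entry's repository name from item['item']['name']. The snippet below shows that assumed payload shape with invented entries (any other fields in the real leaderboard JSON are omitted):

payload = {
    "data": [
        {"item": {"name": "owner/repo-a"}},
        {"item": {"name": "owner/repo-b"}},
    ]
}
names = {entry['item']['name'] for entry in payload['data']}  # {'owner/repo-a', 'owner/repo-b'}
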

# Query ClickHouse for platform/repo_name pairs in batches
def query_clickhouse(client, project_names, batch_size=100):
all_results = []

for i in range(0, len(project_names), batch_size):
batch_names = project_names[i:i + batch_size]
query = """
SELECT lower(CAST(platform AS String)), repo_name
FROM export_repo
WHERE repo_name IN %(batch_names)s
"""
try:
batch_result = client.execute(query, {'batch_names': batch_names})
all_results.extend(batch_result)
except Exception as e:
print(f"查询失败: {e}")
continue

df_result = pd.DataFrame(all_results, columns=['platform', 'repo_name'])
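    # Sorting by platform in descending order before drop_duplicates means that,
    # when a repo_name exists on both platforms, the lexicographically later
    # platform string (e.g. 'github' over 'gitee') is the row that survives.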
df_result = (
df_result.sort_values(by='platform', ascending=False)
.drop_duplicates(subset=['repo_name'], keep='first')
)
return df_result

# Save results to ClickHouse
def save_to_clickhouse(client, table_name, df):
    client.execute(f"TRUNCATE TABLE {table_name}")
    client.execute(
        f"INSERT INTO {table_name} (platform, repo_name) VALUES",
        df.to_dict('records')
    )
    print(f"Data saved to table {table_name}")

# Main program
if __name__ == "__main__":
    # Step 1: Fetch the data, stepping back one month at a time until data is found
year, month = get_latest_month()
data = None
while data is None or len(data) == 0:
month_str = str(month).zfill(2)
indices = ['activity', 'open_rank']
regions = ['chinese', 'global']
all_data = []

for index in indices:
for region in regions:
temp_data = fetch_data(year, month_str, index, region)
if temp_data:
all_data.extend(temp_data[:300])

if all_data:
data = all_data
else:
if month == 1:
month = 12
year -= 1
else:
month -= 1

    # Step 2: Extract the project names and de-duplicate them
project_names = list({item['item']['name'] for item in data})

    # Step 3: Connect to the ClickHouse databases
    # ClickHouse instance the data is read from
source_client = Client(
host='cc-2ze7189376o5m9759.public.clickhouse.ads.aliyuncs.com',
port=9000,
user='xlab',
password='PASSWORD',
database='opensource',
send_receive_timeout=600
)

    # ClickHouse instance the results are written to
target_client = Client(
host='47.116.118.218',
port=9000,
user='USERS',
database='opensource',
send_receive_timeout=600
)

    # Step 4: Query and process the ClickHouse data
df_result = query_clickhouse(source_client, project_names)

    # Step 5: Save the results to ClickHouse
table_name = "platform_project_mapping"
save_to_clickhouse(target_client, table_name, df_result)
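
For reference, a hypothetical definition of the mapping table the script truncates and refills, sketched for illustration (the real schema may differ):

target_client.execute("""
    CREATE TABLE IF NOT EXISTS platform_project_mapping (
        platform  String,
        repo_name String
    ) ENGINE = MergeTree
    ORDER BY repo_name
""")
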
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,3 +1,4 @@
requests
pandas
clickhouse-driver
datetime
3 changes: 2 additions & 1 deletion src/components/table.js
@@ -298,11 +298,12 @@ function dashboard(text, index, t_month, object) {
       repo_name,
       t_month_copy,
       t_month,
+      org_repo: `${org_name}/${repo_name}`,
     };
     return (
       <a
         href={
-          'https://dataease.x-lab.info/link/dqMbZrBk?attachParams=' +
+          'https://dataease.x-lab.info/link/v5wLKVcF?attachParams=' +
           btoa(JSON.stringify(params))
         }
         target="_blank"
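
The dashboard link passes its parameters as base64-encoded JSON (btoa(JSON.stringify(params))), and the new org_repo field travels inside that payload. The equivalent round-trip sketched in Python, with an illustrative subset of the parameters:

import base64
import json

params = {"repo_name": "repo", "t_month": "2024-11", "org_repo": "org/repo"}  # illustrative subset
attach = base64.b64encode(json.dumps(params).encode()).decode()
url = "https://dataease.x-lab.info/link/v5wLKVcF?attachParams=" + attach
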
