From 73c326fe1af30876888270b9a958f82847c03b1a Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 10 Sep 2024 13:15:14 -0400 Subject: [PATCH 01/11] stash archiving scripts and github GHA --- .github/workflows/save_github_metrics.yml | 78 ++++++++++ pyproject.toml | 1 + src/usage_metrics/scripts/__init__.py | 3 + .../scripts/save_github_metrics.py | 135 ++++++++++++++++++ .../scripts/save_kaggle_metrics.py | 49 +++++++ 5 files changed, 266 insertions(+) create mode 100644 .github/workflows/save_github_metrics.yml create mode 100644 src/usage_metrics/scripts/__init__.py create mode 100644 src/usage_metrics/scripts/save_github_metrics.py create mode 100644 src/usage_metrics/scripts/save_kaggle_metrics.py diff --git a/.github/workflows/save_github_metrics.yml b/.github/workflows/save_github_metrics.yml new file mode 100644 index 0000000..eb0469f --- /dev/null +++ b/.github/workflows/save_github_metrics.yml @@ -0,0 +1,78 @@ +name: save-github-metrics +on: + workflow_dispatch: + schedule: + # Run every Tuesday at 8:00 PM UTC (Tuesday 12:00 PM AK) + # https://crontab.guru/#0_20_*_*_2 + - cron: "0 20 * * 2" + +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Set up conda environment for testing + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: environment.yml + cache-environment: true + cache-environment-key: environment-${{ hashFiles('pyproject.toml') }} + condarc: | + channels: + - conda-forge + - defaults + channel_priority: strict + + - name: Log conda environnment information + run: | + conda info + conda list + conda config --show-sources + conda config --show + printenv | sort + + - name: Authenticate gcloud + id: gcloud-auth + continue-on-error: true + uses: "google-github-actions/auth@v2" + with: + workload_identity_provider: "projects/345950277072/locations/global/workloadIdentityPools/gh-actions-pool/providers/gh-actions-provider" + service_account: "pudl-usage-metrics-etl@catalyst-cooperative-pudl.iam.gserviceaccount.com" + create_credentials_file: true + + - name: "Set up Cloud SDK" + uses: "google-github-actions/setup-gcloud@v2" + with: + version: ">= 363.0.0" + + - name: Save Github Metrics + env: + API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} + run: | + mamba run -n pudl-usage-metrics python run_data_update.py + + - name: Inform the Codemonkeys + uses: 8398a7/action-slack@v3 + with: + status: custom + fields: workflow,job,commit,repo,ref,author,took + custom_payload: | + { + username: 'action-slack', + icon_emoji: ':octocat:', + attachments: [{ + color: '${{ job.status }}' === 'success' ? 'good' : '${{ job.status }}' === 'failure' ? 'danger' : 'warning', + text: `${process.env.AS_WORKFLOW}\n${process.env.AS_JOB} (${process.env.AS_COMMIT}) of ${process.env.AS_REPO}@${process.env.AS_REF} by ${process.env.AS_AUTHOR} ${{ job.status }} in ${process.env.AS_TOOK}`, + }] + } + env: + GITHUB_TOKEN: ${{ github.token }} # required + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} # required + MATRIX_CONTEXT: ${{ toJson(matrix) }} # required + if: always() # Pick up events even if the job fails or is canceled. 
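The workflow above authenticates to Google Cloud via Workload Identity Federation and then runs the archiving script, which writes dated JSON blobs into a GCS bucket. As a rough local spot check (not part of this patch; it assumes gcloud application-default credentials with read access to the bucket the script targets at this point in the series), something like the following could confirm that a scheduled run actually landed objects:

```
from datetime import date

from google.cloud import storage

# Bucket name used by the archiving script as introduced in this patch.
BUCKET_NAME = "github-metrics"

client = storage.Client()
today = date.today().strftime("%Y-%m-%d")
# List every blob whose name includes today's date stamp.
written_today = [b.name for b in client.list_blobs(BUCKET_NAME) if today in b.name]
print(written_today or "no blobs written today")
```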
diff --git a/pyproject.toml b/pyproject.toml index 10c1d3f..f2499e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dependencies = [ "pg8000>=1.31.1", "cloud-sql-python-connector[pg8000]>=1.11.0", "google-cloud-storage>=2.17", + "kaggle>=1.6.3" ] classifiers = [ diff --git a/src/usage_metrics/scripts/__init__.py b/src/usage_metrics/scripts/__init__.py new file mode 100644 index 0000000..da1b7bb --- /dev/null +++ b/src/usage_metrics/scripts/__init__.py @@ -0,0 +1,3 @@ +"""Module contains assets that extract raw data.""" + +from . import save_github_metrics, save_kaggle_metrics diff --git a/src/usage_metrics/scripts/save_github_metrics.py b/src/usage_metrics/scripts/save_github_metrics.py new file mode 100644 index 0000000..06d1af0 --- /dev/null +++ b/src/usage_metrics/scripts/save_github_metrics.py @@ -0,0 +1,135 @@ +"""This script pull github traffic metrics and saves them to a GC Bucket.""" + +import json +import logging +import os +import sys +from dataclasses import dataclass +from datetime import date + +import requests +from google.cloud import storage +from requests.exceptions import HTTPError + +logger = logging.getLogger() +logger.basicConfig(level="INFO") + + +@dataclass +class Metric: + """Format metrics into folder names.""" + + name: str + folder: str + + +TOKEN = os.getenv("API_TOKEN_GITHUB", "...") +OWNER = "catalyst-cooperative" +REPO = "pudl" +BUCKET_NAME = "github-metrics" + +BIWEEKLY_METRICS = [ + Metric("clones", "clones"), + Metric("popular/paths", "popular_paths"), + Metric("popular/referrers", "popular_referrers"), + Metric("views", "views"), +] +PERSISTENT_METRICS = [Metric("stargazers", "stargazers"), Metric("forks", "forks")] + + +def get_biweekly_metrics(metric: str) -> str: + """Get json data for a biweekly github metric. + + Args: + metric (str): The github metric name. + + Returns: + json (str): The metric data as json text. + """ + query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/traffic/{metric}" + headers = { + "Authorization": f"token {TOKEN}", + "Accept": "application/vnd.github.v3+json", + } + + response = make_github_request(query_url, headers) + return json.dumps(response.json()) + + +def get_persistent_metrics(metric) -> str: + """Get githubs persistent metrics: forks and stargazers. + + Args: + metrics (str): the metric to retrieve (forks | stargazers) + + Returns: + json (str): A json string of metrics. + """ + query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/{metric}" + headers = { + "Authorization": f"token {TOKEN}", + "Accept": "application/vnd.github.v3.star+json", + } + + metrics = [] + page = 1 + while True: + params = {"page": page} + metrics_json = make_github_request(query_url, headers, params).json() + + if len(metrics_json) <= 0: + break + metrics += metrics_json + page += 1 + return json.dumps(metrics) + + +def make_github_request(query: str, headers: str, params: str = None): + """Makes a request to the github api. + + Args: + query (str): A github api request url. + headers (str): Header to include in the request. + params (str): Params of request. + + Returns: + response (requests.models.Response): the request response. 
+ """ + try: + response = requests.get(query, headers=headers, params=params, timeout=100) + + response.raise_for_status() + except HTTPError as http_err: + raise HTTPError( + f"HTTP error occurred: {http_err}\n\tResponse test: {response.text}" + ) + except Exception as err: + raise Exception(f"Other error occurred: {err}") + return response + + +def upload_to_bucket(data, metric): + """Upload a gcp object.""" + storage_client = storage.Client() + bucket = storage_client.bucket(BUCKET_NAME) + blob_name = f"{metric.folder}/{date.today().strftime('%Y-%m-%d')}.json" + + blob = bucket.blob(blob_name) + blob.upload_from_string(data) + + logger.info(f"Uploaded {metric.name} data to {blob_name}.") + + +def save_metrics(): + """Save github traffic metrics to google cloud bucket.""" + for metric in BIWEEKLY_METRICS: + metric_data = get_biweekly_metrics(metric.name) + upload_to_bucket(metric_data, metric) + + for metric in PERSISTENT_METRICS: + metric_data = get_persistent_metrics(metric.name) + upload_to_bucket(metric_data, metric) + + +if __name__ == "__main__": + sys.exit(save_metrics()) diff --git a/src/usage_metrics/scripts/save_kaggle_metrics.py b/src/usage_metrics/scripts/save_kaggle_metrics.py new file mode 100644 index 0000000..e5ba87b --- /dev/null +++ b/src/usage_metrics/scripts/save_kaggle_metrics.py @@ -0,0 +1,49 @@ +"""This script pull Kaggle traffic metrics and saves them to a GC Bucket.""" + +import json +import logging +import sys +from datetime import date + +from google.cloud import storage +from kaggle.api.kaggle_api_extended import KaggleApi + +KAGGLE_OWNER = "catalystcooperative" +KAGGLE_DATASET = "pudl-project" +OWNER = "catalyst-cooperative" +REPO = "pudl" +BUCKET_NAME = "kaggle-metrics" + +logger = logging.getLogger() +logger.basicConfig(level="INFO") + + +def get_kaggle_logs() -> str: + """Get PUDL project usage metadata from Kaggle site.""" + api = KaggleApi() + + metadata = api.metadata_get(KAGGLE_OWNER, KAGGLE_DATASET) + metadata.update({"metrics_date": date.today().strftime("%Y-%m-%d")}) + return json.dumps(metadata) + + +def upload_to_bucket(data): + """Upload a gcp object.""" + storage_client = storage.Client() + bucket = storage_client.bucket(BUCKET_NAME) + blob_name = f"kaggle-metrics-{date.today().strftime('%Y-%m-%d')}.json" + + blob = bucket.blob(blob_name) + blob.upload_from_string(data) + + logger.info(f"Uploaded today's data to {blob_name}.") + + +def save_metrics(): + """Save github traffic metrics to google cloud bucket.""" + kaggle_metrics = get_kaggle_logs() + upload_to_bucket(kaggle_metrics) + + +if __name__ == "__main__": + sys.exit(save_metrics()) From d521c074351a62f4b43cf16d204571457fe447a5 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Tue, 10 Sep 2024 13:23:48 -0400 Subject: [PATCH 02/11] create save daily metrics gha --- ...{save_github_metrics.yml => save_daily_metrics.yml} | 10 ++++++++-- src/usage_metrics/scripts/save_github_metrics.py | 2 +- src/usage_metrics/scripts/save_kaggle_metrics.py | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) rename .github/workflows/{save_github_metrics.yml => save_daily_metrics.yml} (88%) diff --git a/.github/workflows/save_github_metrics.yml b/.github/workflows/save_daily_metrics.yml similarity index 88% rename from .github/workflows/save_github_metrics.yml rename to .github/workflows/save_daily_metrics.yml index eb0469f..99d7b17 100644 --- a/.github/workflows/save_github_metrics.yml +++ b/.github/workflows/save_daily_metrics.yml @@ -1,4 +1,4 @@ -name: save-github-metrics +name: save-daily-metrics on: 
workflow_dispatch: schedule: @@ -55,7 +55,13 @@ jobs: env: API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} run: | - mamba run -n pudl-usage-metrics python run_data_update.py + mamba run -n pudl-usage-metrics python src/usage_metrics/scripts/save_github_metrics.py + + - name: Save Kaggle Metrics + env: + API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} + run: | + mamba run -n pudl-usage-metrics python src/usage_metrics/scripts/save_kaggle_metrics.py - name: Inform the Codemonkeys uses: 8398a7/action-slack@v3 diff --git a/src/usage_metrics/scripts/save_github_metrics.py b/src/usage_metrics/scripts/save_github_metrics.py index 06d1af0..5d40f31 100644 --- a/src/usage_metrics/scripts/save_github_metrics.py +++ b/src/usage_metrics/scripts/save_github_metrics.py @@ -12,7 +12,7 @@ from requests.exceptions import HTTPError logger = logging.getLogger() -logger.basicConfig(level="INFO") +logging.basicConfig(level="INFO") @dataclass diff --git a/src/usage_metrics/scripts/save_kaggle_metrics.py b/src/usage_metrics/scripts/save_kaggle_metrics.py index e5ba87b..cbe439f 100644 --- a/src/usage_metrics/scripts/save_kaggle_metrics.py +++ b/src/usage_metrics/scripts/save_kaggle_metrics.py @@ -15,7 +15,7 @@ BUCKET_NAME = "kaggle-metrics" logger = logging.getLogger() -logger.basicConfig(level="INFO") +logging.basicConfig(level="INFO") def get_kaggle_logs() -> str: From c0063b76481ad4b09963938d75c533b83875f864 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Wed, 11 Sep 2024 15:52:55 -0400 Subject: [PATCH 03/11] Update kaggle metric run --- src/usage_metrics/scripts/save_github_metrics.py | 4 ++-- src/usage_metrics/scripts/save_kaggle_metrics.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/usage_metrics/scripts/save_github_metrics.py b/src/usage_metrics/scripts/save_github_metrics.py index 5d40f31..679d41c 100644 --- a/src/usage_metrics/scripts/save_github_metrics.py +++ b/src/usage_metrics/scripts/save_github_metrics.py @@ -26,7 +26,7 @@ class Metric: TOKEN = os.getenv("API_TOKEN_GITHUB", "...") OWNER = "catalyst-cooperative" REPO = "pudl" -BUCKET_NAME = "github-metrics" +BUCKET_NAME = "pudl-usage-metrics-archives.catalyst.coop" BIWEEKLY_METRICS = [ Metric("clones", "clones"), @@ -112,7 +112,7 @@ def upload_to_bucket(data, metric): """Upload a gcp object.""" storage_client = storage.Client() bucket = storage_client.bucket(BUCKET_NAME) - blob_name = f"{metric.folder}/{date.today().strftime('%Y-%m-%d')}.json" + blob_name = f"github/{metric.folder}/{date.today().strftime('%Y-%m-%d')}.json" blob = bucket.blob(blob_name) blob.upload_from_string(data) diff --git a/src/usage_metrics/scripts/save_kaggle_metrics.py b/src/usage_metrics/scripts/save_kaggle_metrics.py index cbe439f..c130737 100644 --- a/src/usage_metrics/scripts/save_kaggle_metrics.py +++ b/src/usage_metrics/scripts/save_kaggle_metrics.py @@ -12,7 +12,7 @@ KAGGLE_DATASET = "pudl-project" OWNER = "catalyst-cooperative" REPO = "pudl" -BUCKET_NAME = "kaggle-metrics" +BUCKET_NAME = "pudl-usage-metrics-archives.catalyst.coop" logger = logging.getLogger() logging.basicConfig(level="INFO") @@ -31,7 +31,7 @@ def upload_to_bucket(data): """Upload a gcp object.""" storage_client = storage.Client() bucket = storage_client.bucket(BUCKET_NAME) - blob_name = f"kaggle-metrics-{date.today().strftime('%Y-%m-%d')}.json" + blob_name = f"kaggle/{date.today().strftime('%Y-%m-%d')}.json" blob = bucket.blob(blob_name) blob.upload_from_string(data) From 3240c99b2f2f7455893d1c82bf37cdeed95a498b Mon Sep 17 00:00:00 2001 From: e-belfer Date: Wed, 11 Sep 
2024 16:04:18 -0400 Subject: [PATCH 04/11] Make cron job daily --- .github/workflows/save_daily_metrics.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/save_daily_metrics.yml b/.github/workflows/save_daily_metrics.yml index 99d7b17..c554fd7 100644 --- a/.github/workflows/save_daily_metrics.yml +++ b/.github/workflows/save_daily_metrics.yml @@ -2,9 +2,9 @@ name: save-daily-metrics on: workflow_dispatch: schedule: - # Run every Tuesday at 8:00 PM UTC (Tuesday 12:00 PM AK) + # Run every day at 8:00 PM UTC # https://crontab.guru/#0_20_*_*_2 - - cron: "0 20 * * 2" + - cron: "0 20 * * *" jobs: build: From 7b7b1b81ad0861050a41fb70f33728f98cecef74 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Wed, 11 Sep 2024 16:05:02 -0400 Subject: [PATCH 05/11] Update cron link --- .github/workflows/save_daily_metrics.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/save_daily_metrics.yml b/.github/workflows/save_daily_metrics.yml index c554fd7..62ba981 100644 --- a/.github/workflows/save_daily_metrics.yml +++ b/.github/workflows/save_daily_metrics.yml @@ -3,7 +3,7 @@ on: workflow_dispatch: schedule: # Run every day at 8:00 PM UTC - # https://crontab.guru/#0_20_*_*_2 + # https://crontab.guru/#0_20_*_*_* - cron: "0 20 * * *" jobs: From 6da16ca0d0c93d93d74531aaf40c6e3fc11c2c0f Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 13 Sep 2024 11:22:19 -0400 Subject: [PATCH 06/11] Update credentials --- .github/workflows/save_daily_metrics.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/save_daily_metrics.yml b/.github/workflows/save_daily_metrics.yml index 62ba981..a43c00c 100644 --- a/.github/workflows/save_daily_metrics.yml +++ b/.github/workflows/save_daily_metrics.yml @@ -59,7 +59,8 @@ jobs: - name: Save Kaggle Metrics env: - API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} + KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} run: | mamba run -n pudl-usage-metrics python src/usage_metrics/scripts/save_kaggle_metrics.py From 0824f187668a060910fd1e762a06ec9c206a36d8 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 13 Sep 2024 11:27:44 -0400 Subject: [PATCH 07/11] Fix env calls in run steps --- .github/workflows/save_daily_metrics.yml | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/workflows/save_daily_metrics.yml b/.github/workflows/save_daily_metrics.yml index a43c00c..1b3f044 100644 --- a/.github/workflows/save_daily_metrics.yml +++ b/.github/workflows/save_daily_metrics.yml @@ -22,8 +22,7 @@ jobs: with: environment-file: environment.yml cache-environment: true - cache-environment-key: environment-${{ hashFiles('pyproject.toml') }} - condarc: | + ondarc: | channels: - conda-forge - defaults @@ -51,18 +50,20 @@ jobs: with: version: ">= 363.0.0" - - name: Save Github Metrics + - shell: bash -l {0} + name: Save Github Metrics env: API_TOKEN_GITHUB: ${{ secrets.API_TOKEN_GITHUB }} run: | - mamba run -n pudl-usage-metrics python src/usage_metrics/scripts/save_github_metrics.py + python src/usage_metrics/scripts/save_github_metrics.py - - name: Save Kaggle Metrics + - shell: bash -l {0} + name: Save Kaggle Metrics env: KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} run: | - mamba run -n pudl-usage-metrics python src/usage_metrics/scripts/save_kaggle_metrics.py + python src/usage_metrics/scripts/save_kaggle_metrics.py - name: Inform the Codemonkeys uses: 8398a7/action-slack@v3 From 
8caf839252fa9631dbacaccc5dd97e94513e119c Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 13 Sep 2024 11:28:31 -0400 Subject: [PATCH 08/11] Add kaggle secrets to tox --- .github/workflows/tox-pytest.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml index 539c7ed..d6bcf79 100644 --- a/.github/workflows/tox-pytest.yml +++ b/.github/workflows/tox-pytest.yml @@ -61,6 +61,8 @@ jobs: POSTGRES_PASSWORD: ${{ secrets.POSTGRES_PASSWORD }} POSTGRES_DB: ${{ secrets.POSTGRES_DB }} POSTGRES_PORT: ${{ secrets.POSTGRES_PORT }} + KAGGLE_KEY: ${{ secrets.KAGGLE_KEY }} + KAGGLE_USERNAME: ${{ secrets.KAGGLE_USERNAME }} run: | tox From d07a406ab93e7b2b80e2b0a261bbcae15c3d0685 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 13 Sep 2024 12:25:29 -0400 Subject: [PATCH 09/11] Add kaggle creds to tox.ini --- tox.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/tox.ini b/tox.ini index 5f027de..f2276e0 100644 --- a/tox.ini +++ b/tox.ini @@ -19,6 +19,7 @@ passenv = GOOGLE_* GCLOUD_* GCP_* + KAGGLE_* HOME SQLALCHEMY_WARN_20 IPINFO_TOKEN From 4171766e63b9a470154b1fba4827cf757483ddb4 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Fri, 13 Sep 2024 12:39:02 -0400 Subject: [PATCH 10/11] Update readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 13058a2..004a99d 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,8 @@ If you want to take advantage of caching raw logs, rather than redownloading the Dagster stores run logs and caches in a directory stored in the `DAGSTER_HOME` environment variable. The `usage_metrics/dagster_home/dagster.yaml` file contains configuration for the dagster instance. **Note:** The `usage_metrics/dagster_home/storage` directory could grow to become a couple GBs because all op outputs for every run are stored there. You can read more about the dagster_home directory in the [dagster docs](https://docs.dagster.io/deployment/dagster-instance#default-local-behavior). +To use the Kaggle API, [sign up for a Kaggle account](https://www.kaggle.com). Then go to the ['Account' tab]((https://www.kaggle.com//account)) of your user profile and select 'Create API Token'. This will trigger the download of `kaggle.json`, a file containing your API credentials. Use this file to automatically set your Kaggle API credentials locally, or manually set `KAGGLE_USER` and `KAGGLE_KEY` environment variables. 
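+
+For example, once credentials are in place (either via `kaggle.json` or the environment variables), this quick check should succeed without prompting for input:
+
+```
+from kaggle.api.kaggle_api_extended import KaggleApi
+
+api = KaggleApi()
+api.authenticate()  # raises an error if no Kaggle credentials are found
+```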
+ To set these environment variables, run these commands: ``` @@ -41,6 +43,8 @@ conda activate pudl-usage-metrics conda env config vars set IPINFO_TOKEN="{your_api_key_here}" conda env config vars set DAGSTER_HOME="$(pwd)/dagster_home/" conda env config vars set DATA_DIR="$(pwd)/data/" +conda env config vars set KAGGLE_USER="{your_kaggle_username_here}" # If setting manually +conda env config vars set KAGGLE_KEY="{your_kaggle_api_key_here}" # If setting manually conda activate pudl-usage-metrics ``` From 125c047a2bc03b7829b380470debc31e7e86d4f5 Mon Sep 17 00:00:00 2001 From: e-belfer Date: Mon, 16 Sep 2024 10:23:14 -0400 Subject: [PATCH 11/11] Reorganize and prune organizational level variables --- .../scripts/save_github_metrics.py | 58 ++++++++++--------- .../scripts/save_kaggle_metrics.py | 14 ++--- 2 files changed, 36 insertions(+), 36 deletions(-) diff --git a/src/usage_metrics/scripts/save_github_metrics.py b/src/usage_metrics/scripts/save_github_metrics.py index 679d41c..2902641 100644 --- a/src/usage_metrics/scripts/save_github_metrics.py +++ b/src/usage_metrics/scripts/save_github_metrics.py @@ -4,6 +4,7 @@ import logging import os import sys +import time from dataclasses import dataclass from datetime import date @@ -23,21 +24,7 @@ class Metric: folder: str -TOKEN = os.getenv("API_TOKEN_GITHUB", "...") -OWNER = "catalyst-cooperative" -REPO = "pudl" -BUCKET_NAME = "pudl-usage-metrics-archives.catalyst.coop" - -BIWEEKLY_METRICS = [ - Metric("clones", "clones"), - Metric("popular/paths", "popular_paths"), - Metric("popular/referrers", "popular_referrers"), - Metric("views", "views"), -] -PERSISTENT_METRICS = [Metric("stargazers", "stargazers"), Metric("forks", "forks")] - - -def get_biweekly_metrics(metric: str) -> str: +def get_biweekly_metrics(owner: str, repo: str, token: str, metric: str) -> str: """Get json data for a biweekly github metric. Args: @@ -46,9 +33,9 @@ def get_biweekly_metrics(metric: str) -> str: Returns: json (str): The metric data as json text. """ - query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/traffic/{metric}" + query_url = f"https://api.github.com/repos/{owner}/{repo}/traffic/{metric}" headers = { - "Authorization": f"token {TOKEN}", + "Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json", } @@ -56,7 +43,7 @@ def get_biweekly_metrics(metric: str) -> str: return json.dumps(response.json()) -def get_persistent_metrics(metric) -> str: +def get_persistent_metrics(owner: str, repo: str, token: str, metric: str) -> str: """Get githubs persistent metrics: forks and stargazers. Args: @@ -65,15 +52,19 @@ def get_persistent_metrics(metric) -> str: Returns: json (str): A json string of metrics. 
""" - query_url = f"https://api.github.com/repos/{OWNER}/{REPO}/{metric}" + query_url = f"https://api.github.com/repos/{owner}/{repo}/{metric}" headers = { - "Authorization": f"token {TOKEN}", + "Authorization": f"token {token}", "Accept": "application/vnd.github.v3.star+json", } metrics = [] page = 1 - while True: + + timeout = 600 # Set 10 minute timeout + timeout_start = time.time() + + while time.time() < timeout_start + timeout: params = {"page": page} metrics_json = make_github_request(query_url, headers, params).json() @@ -103,15 +94,14 @@ def make_github_request(query: str, headers: str, params: str = None): raise HTTPError( f"HTTP error occurred: {http_err}\n\tResponse test: {response.text}" ) - except Exception as err: - raise Exception(f"Other error occurred: {err}") return response def upload_to_bucket(data, metric): """Upload a gcp object.""" + bucket_name = "pudl-usage-metrics-archives.catalyst.coop" storage_client = storage.Client() - bucket = storage_client.bucket(BUCKET_NAME) + bucket = storage_client.bucket(bucket_name) blob_name = f"github/{metric.folder}/{date.today().strftime('%Y-%m-%d')}.json" blob = bucket.blob(blob_name) @@ -122,12 +112,24 @@ def upload_to_bucket(data, metric): def save_metrics(): """Save github traffic metrics to google cloud bucket.""" - for metric in BIWEEKLY_METRICS: - metric_data = get_biweekly_metrics(metric.name) + token = os.getenv("API_TOKEN_GITHUB", "...") + owner = "catalyst-cooperative" + repo = "pudl" + + biweekly_metrics = [ + Metric("clones", "clones"), + Metric("popular/paths", "popular_paths"), + Metric("popular/referrers", "popular_referrers"), + Metric("views", "views"), + ] + persistent_metrics = [Metric("stargazers", "stargazers"), Metric("forks", "forks")] + + for metric in biweekly_metrics: + metric_data = get_biweekly_metrics(owner, repo, token, metric.name) upload_to_bucket(metric_data, metric) - for metric in PERSISTENT_METRICS: - metric_data = get_persistent_metrics(metric.name) + for metric in persistent_metrics: + metric_data = get_persistent_metrics(owner, repo, token, metric.name) upload_to_bucket(metric_data, metric) diff --git a/src/usage_metrics/scripts/save_kaggle_metrics.py b/src/usage_metrics/scripts/save_kaggle_metrics.py index c130737..04bfc44 100644 --- a/src/usage_metrics/scripts/save_kaggle_metrics.py +++ b/src/usage_metrics/scripts/save_kaggle_metrics.py @@ -8,12 +8,6 @@ from google.cloud import storage from kaggle.api.kaggle_api_extended import KaggleApi -KAGGLE_OWNER = "catalystcooperative" -KAGGLE_DATASET = "pudl-project" -OWNER = "catalyst-cooperative" -REPO = "pudl" -BUCKET_NAME = "pudl-usage-metrics-archives.catalyst.coop" - logger = logging.getLogger() logging.basicConfig(level="INFO") @@ -21,16 +15,20 @@ def get_kaggle_logs() -> str: """Get PUDL project usage metadata from Kaggle site.""" api = KaggleApi() + kaggle_owner = "catalystcooperative" + kaggle_dataset = "pudl-project" - metadata = api.metadata_get(KAGGLE_OWNER, KAGGLE_DATASET) + metadata = api.metadata_get(kaggle_owner, kaggle_dataset) metadata.update({"metrics_date": date.today().strftime("%Y-%m-%d")}) return json.dumps(metadata) def upload_to_bucket(data): """Upload a gcp object.""" + bucket_name = "pudl-usage-metrics-archives.catalyst.coop" + storage_client = storage.Client() - bucket = storage_client.bucket(BUCKET_NAME) + bucket = storage_client.bucket(bucket_name) blob_name = f"kaggle/{date.today().strftime('%Y-%m-%d')}.json" blob = bucket.blob(blob_name)