Skip to content

Commit

Permalink
Merge pull request #67 from allenai/henryh/favyen/forest-loss-20240917
Browse files Browse the repository at this point in the history
Henryh/favyen/forest loss 20240917
  • Loading branch information
Hgherzog authored Jan 8, 2025
2 parents 5071092 + af43d44 commit f35765d
Show file tree
Hide file tree
Showing 55 changed files with 4,533 additions and 780 deletions.
1 change: 1 addition & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
.env
lightning_logs
wandb
**/test_data/**/**/*.tif
19 changes: 9 additions & 10 deletions .github/workflows/build_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,19 +75,12 @@ jobs:
echo "ghcr.io Docker image name is ${GHCR_IMAGE}"
echo "ghcr_image_name=\"${GHCR_IMAGE}\"" >> $GITHUB_OUTPUT
# TODO: Make sure skylight can grab the image tag and deploy
test:
runs-on: ubuntu-latest
runs-on: ubuntu-latest-m
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Log in to the Container registry
uses: docker/login-action@v3
with:
registry: ${{ env.REGISTRY }}
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Log in to the Container registry
uses: docker/login-action@v3
Expand All @@ -113,6 +106,11 @@ jobs:
with:
credentials_json: ${{ secrets.GOOGLE_CREDENTIALS }}

- name: Run unit tests with Docker Compose
run: |
docker compose -f docker-compose.yaml run \
test pytest tests/unit/
- name: Run tests with Docker Compose
run: |
docker compose -f docker-compose.yaml run \
Expand All @@ -122,7 +120,8 @@ jobs:
-e GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp-credentials.json \
-e RSLP_BUCKET=rslearn-eai \
-e RSLP_PREFIX=gs://rslearn-eai \
test pytest tests/ --ignore tests/integration_slow/
test pytest tests/integration/ --ignore tests/integration_slow/ -vv
- name: Clean up
if: always()
Expand Down Expand Up @@ -176,7 +175,7 @@ jobs:
-e GOOGLE_APPLICATION_CREDENTIALS=/tmp/gcp-credentials.json \
-e RSLP_BUCKET=rslearn-eai \
-e RSLP_PREFIX=gs://rslearn-eai \
rslearn_projects-test pytest tests/integration_slow/
rslearn_projects-test pytest tests/integration_slow/ -vv
- name: Clean up
if: always()
Expand Down
328 changes: 328 additions & 0 deletions .github/workflows/deploy_image_on_vm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
#!/bin/bash

# Best-effort teardown: delete the GCE VM (if one was created) and exit
# non-zero. Installed below as the SIGINT handler so CTRL+C does not leak VMs.
cleanup() {
  if [[ -n "$VM_NAME" ]]; then
    printf '\nCleaning up VM %s...\n' "$VM_NAME"
    # Deletion is best-effort: never let a failed delete mask the interrupt.
    gcloud compute instances delete "$VM_NAME" --zone="$ZONE" --quiet || true
  fi
  exit 1
}

# Run cleanup when the user interrupts the script (CTRL+C).
trap cleanup SIGINT
# Print this script's help text and exit with a usage error.
# Invoked for -h/--help, for unknown flags, and when a required flag is
# missing.
usage() {
  cat <<EOF
Usage: $0 [options]
Options:
 --project-id GCP project ID (default: skylight-proto-1)
 --zone GCP zone (default: us-west1-b)
 --machine-type VM machine type (default: e2-micro)
 --docker-image Docker image to run
 --command Command to run in container on the vm
 --user User (default: henryh)
 --ghcr-user GitHub Container Registry user (default: allenai)
 --delete Delete VM after completion (yes/no)
 --beaker-token Beaker token
 --beaker-addr Beaker address
 --beaker-username Beaker username associated with the token
 --rslp-project rslp project name (e.g forest_loss_driver)
 --rslp-prefix rslp prefix
 --workflow workflow name (e.g predict_pipeline) to run on beaker
 --gpu-count Number of GPUs to use
 --shared-memory Amount of shared memory
 --cluster Cluster to use
 --priority Priority level
 --task-name Name of the task
 --budget Budget to use
 --workspace Workspace name
EOF
  exit 1
}

# Default values
# Every value here can be overridden by the matching command-line flag
# parsed below; see usage() for descriptions.
PROJECT_ID="skylight-proto-1"
ZONE="us-west1-b"
MACHINE_TYPE="e2-micro"
# Base OS image for the ephemeral VM.
IMAGE_FAMILY="debian-11"
IMAGE_PROJECT="debian-cloud"
USER="henryh"
GHCR_USER="allenai"
DELETE_VM="no"
# Defaults for the downstream Beaker inference job.
GPU_COUNT="1"
SHARED_MEMORY="64Gib"
CLUSTER="ai2/jupiter-cirrascale-2"
PRIORITY="normal"
# Random 8-char suffix keeps task names unique across runs.
TASK_NAME="forest_loss_driver_inference_$(uuidgen | cut -c1-8)"
BUDGET="ai2/d5"
WORKSPACE="ai2/earth-systems"

# Parse arguments
# Each flag consumes its value with `shift` inside its case arm; the trailing
# `shift` after `esac` consumes the flag name itself.
# NOTE(review): if the last flag is given without a value, its variable is
# silently set to the empty string — only --docker-image and --command are
# validated below.
# NOTE(review): --service-account is parsed here but not listed in usage();
# usage() lists --workflow, which is NOT parsed here — confirm intent.
while [ $# -gt 0 ]; do
case "$1" in
--project-id)
shift
PROJECT_ID="$1"
;;
--zone)
shift
ZONE="$1"
;;
--machine-type)
shift
MACHINE_TYPE="$1"
;;
--docker-image)
shift
DOCKER_IMAGE="$1"
;;
--command)
shift
COMMAND="$1"
;;
--user)
shift
USER="$1"
;;
--ghcr-user)
shift
GHCR_USER="$1"
;;
--delete)
shift
DELETE_VM="$1"
;;
--beaker-token)
shift
BEAKER_TOKEN="$1"
;;
--beaker-addr)
shift
BEAKER_ADDR="$1"
;;
--beaker-username)
shift
BEAKER_USERNAME="$1"
;;
--service-account)
shift
SERVICE_ACCOUNT="$1"
;;
--rslp-project)
shift
RSLP_PROJECT="$1"
;;
--rslp-prefix)
shift
RSLP_PREFIX="$1"
;;
--gpu-count)
shift
GPU_COUNT="$1"
;;
--shared-memory)
shift
SHARED_MEMORY="$1"
;;
--cluster)
shift
CLUSTER="$1"
;;
--priority)
shift
PRIORITY="$1"
;;
--task-name)
shift
TASK_NAME="$1"
;;
--budget)
shift
BUDGET="$1"
;;
--workspace)
shift
WORKSPACE="$1"
;;
--extra_args_model_predict)
shift
EXTRA_ARGS_MODEL_PREDICT="$1"
;;
-h|--help)
usage
;;
*)
echo "Unknown parameter: $1"
usage
;;
esac
shift
done

# Validate required arguments. Diagnostics go to stderr so they are never
# mixed into captured stdout.
if [ -z "$DOCKER_IMAGE" ]; then
  echo "Error: --docker-image is required" >&2
  usage
fi

if [ -z "$COMMAND" ]; then
  echo "Error: --command is required" >&2
  usage
fi

# Generate VM name: fixed prefix plus an 8-char random suffix from uuidgen,
# so concurrent runs never collide on instance names.
job_name="forest-loss-driver-inference-$(uuidgen | cut -c1-8)"
VM_NAME="rslp-$job_name"

# TODO: add back instance termination action and max run duration
#######################################
# Create an ephemeral GCE VM whose startup script: pulls the given Docker
# image from GHCR, runs the extraction command inside it, uploads the image
# to Beaker, then launches the model-inference job on Beaker.
# Arguments: 24 positionals, unpacked in order into the locals below.
# Outputs:   progress messages to stdout.
# Returns:   exits 1 if the `gcloud compute instances create` call fails.
#######################################
create_vm() {
# Unpack positional arguments into named locals (order matters).
local vm_name="$1"
local project_id="$2"
local zone="$3"
local machine_type="$4"
local image_family="$5"
local image_project="$6"
local ghcr_user="$7"
local user="$8"
local docker_image="${9}"
local command="${10}"
local beaker_token="${11}"
local beaker_addr="${12}"
local beaker_username="${13}"
local service_account="${14}"
local rslp_project="${15}"
local gpu_count="${16}"
local shared_memory="${17}"
local cluster="${18}"
local priority="${19}"
local task_name="${20}"
local budget="${21}"
local workspace="${22}"
local rslp_prefix="${23}"
local extra_args_model_predict="${24}"
# Log GCP auth context, then create the VM. All config is handed to the VM
# via instance metadata; the startup script (single-quoted literal below)
# reads it back with curl against the metadata server, so $-expansions in
# it happen on the VM at boot, not here.
# NOTE(review): index-cache-dir and tile-store-root-dir read the caller's
# environment variables $INDEX_CACHE_DIR / $TILE_STORE_ROOT_DIR — they are
# never set in this script, so they must come from the environment; confirm.
# NOTE(review): the metadata key is `beaker_username` (underscore) while the
# sibling keys use dashes — the startup script reads the underscore form, so
# they match, but this is easy to break.
echo "Creating VM $vm_name in project $project_id..." && \
echo "Logged into GCP as $(gcloud config get-value account)" && \
echo "$(gcloud config list)" && \
if ! gcloud compute instances create "$vm_name" \
--project="$project_id" \
--zone="$zone" \
--machine-type="$machine_type" \
--service-account="$service_account" \
--scopes=cloud-platform \
--metadata=\
ops-agents-install='{"name": "ops-agent"}',\
google-logging-enable=TRUE,\
google-monitoring-enable=TRUE,\
enable-osconfig=TRUE,\
ghcr-user="$ghcr_user",\
user="$user",\
docker-image="$docker_image",\
command="$command",\
beaker-token="$beaker_token",\
beaker-addr="$beaker_addr",\
beaker_username="$beaker_username",\
rslp-project="$rslp_project",\
gpu-count="$gpu_count",\
shared-memory="$shared_memory",\
cluster="$cluster",\
priority="$priority",\
task-name="$task_name",\
budget="$budget",\
workspace="$workspace",\
rslp-prefix="$rslp_prefix",\
index-cache-dir="$INDEX_CACHE_DIR",\
tile-store-root-dir="$TILE_STORE_ROOT_DIR",\
extra_args_model_predict="$extra_args_model_predict" \
--metadata-from-file=startup-script=<(echo '#!/bin/bash
# Create a log dir
sudo mkdir -p /var/log/startup-script
# Redirect all output only to the log file to avoid buffer.Scanner token too long errors
exec 1> "/var/log/startup-script/startup.log" 2>&1
echo "Starting startup script at $(date)"
sudo apt-get update && \
sudo apt-get install -y docker.io && \
sudo systemctl start docker && \
export USER=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/user) && \
sudo usermod -aG docker $USER && \
export GHCR_TOKEN=$(gcloud secrets versions access latest --secret="ghcr_pat_forest_loss") && \
export GHCR_USER=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/ghcr-user) && \
export DOCKER_IMAGE=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/docker-image) && \
echo "Logging into GHCR" && \
echo "GHCR_TOKEN: $GHCR_TOKEN" && \
echo "GHCR_USER: $GHCR_USER" && \
echo $GHCR_TOKEN | sudo docker login ghcr.io -u $GHCR_USER --password-stdin && \
echo "Pulling Docker image" && \
sudo docker pull $DOCKER_IMAGE && \
echo "Docker image pulled" && \
export PL_API_KEY=$(gcloud secrets versions access latest --secret="planet_api_key_forest_loss") && \
export INDEX_CACHE_DIR=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/index-cache-dir) && \
export TILE_STORE_ROOT_DIR=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/tile-store-root-dir) && \
export LOCAL_INDEX_CACHE_DIR="/tmp/index_cache" && \
mkdir -p $LOCAL_INDEX_CACHE_DIR && \
gsutil -m cp -r $INDEX_CACHE_DIR/* $LOCAL_INDEX_CACHE_DIR/ && \
export COMMAND=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/command) && \
sudo docker run \
-e CLOUDSDK_AUTH_ACCESS_TOKEN=$(gcloud auth application-default print-access-token) \
-e PL_API_KEY=$PL_API_KEY \
-e TILE_STORE_ROOT_DIR=$TILE_STORE_ROOT_DIR \
-e INDEX_CACHE_DIR=file:///index_cache \
-v $LOCAL_INDEX_CACHE_DIR:/index_cache \
$DOCKER_IMAGE /bin/bash -c "$COMMAND" && \
echo "Data Extraction Complete" && \
if ! gsutil -m cp -r $LOCAL_INDEX_CACHE_DIR/* $INDEX_CACHE_DIR/; then
echo "WARNING: Failed to copy index cache back to $INDEX_CACHE_DIR" >&2
else
echo "Successfully copied index cache back to $INDEX_CACHE_DIR"
fi && \
export BEAKER_TOKEN=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/beaker-token) && \
export BEAKER_ADDR=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/beaker-addr) && \
curl -s '\''https://beaker.org/api/v3/release/cli?os=linux&arch=amd64'\'' | sudo tar -zxv -C /usr/local/bin ./beaker && \
export IMAGE_ID=$(docker images --format "{{.ID}}" $DOCKER_IMAGE | head -n 1) && \
export BEAKER_IMAGE_NAME=$(date +%Y%m%d_%H%M%S)_$(echo $DOCKER_IMAGE | tr '/' '_' | tr ':' '_' | tr -cd '[:alnum:]-') && \
export WORKSPACE=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/workspace) && \
beaker config set default_workspace $WORKSPACE && \
echo "Creating Beaker image" && \
beaker image create $IMAGE_ID --name $BEAKER_IMAGE_NAME && \
echo "Image uploaded to Beaker" && \
export BEAKER_USERNAME=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/beaker_username) && \
export GPU_COUNT=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/gpu-count) && \
export SHARED_MEMORY=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/shared-memory) && \
export CLUSTER=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/cluster) && \
export PRIORITY=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/priority) && \
export TASK_NAME=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/task-name) && \
export BUDGET=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/budget) && \
export RSLP_PREFIX=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/rslp-prefix) && \
export RSLP_PROJECT=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/rslp-project) && \
export EXTRA_ARGS=$(curl -H "Metadata-Flavor: Google" http://metadata.google.internal/computeMetadata/v1/instance/attributes/extra_args_model_predict) && \
export INFERENCE_JOB_LAUNCH_COMMAND="python rslp/$RSLP_PROJECT/job_launcher.py \
--project $RSLP_PROJECT \
--workflow predict \
--image $BEAKER_USERNAME/$BEAKER_IMAGE_NAME \
--gpu_count $GPU_COUNT \
--shared_memory $SHARED_MEMORY \
--cluster $CLUSTER \
--priority $PRIORITY \
--task_name $TASK_NAME \
--budget $BUDGET \
--workspace $WORKSPACE \
--extra_args $EXTRA_ARGS" && \
echo "INFERENCE_JOB_LAUNCH_COMMAND: $INFERENCE_JOB_LAUNCH_COMMAND" && \
echo "Launching inference job on Beaker" && \
docker run -e BEAKER_TOKEN=$BEAKER_TOKEN \
-e BEAKER_ADDR=$BEAKER_ADDR \
-e RSLP_PREFIX=$RSLP_PREFIX \
$DOCKER_IMAGE /bin/bash -c "$INFERENCE_JOB_LAUNCH_COMMAND" && \
echo "Model inference launched!"
') \
--image-family="$image_family" \
--image-project="$image_project" \
--boot-disk-size=200GB; then
echo "Failed to create VM instance"
exit 1
fi
# Reached only when instance creation succeeded.
echo "Done!"
}

# Create the VM, forwarding every configured value as a positional argument
# (create_vm unpacks them strictly by position — keep this order in sync).
vm_args=(
  "$VM_NAME" "$PROJECT_ID" "$ZONE" "$MACHINE_TYPE" "$IMAGE_FAMILY"
  "$IMAGE_PROJECT" "$GHCR_USER" "$USER" "$DOCKER_IMAGE" "$COMMAND"
  "$BEAKER_TOKEN" "$BEAKER_ADDR" "$BEAKER_USERNAME" "$SERVICE_ACCOUNT"
  "$RSLP_PROJECT" "$GPU_COUNT" "$SHARED_MEMORY" "$CLUSTER" "$PRIORITY"
  "$TASK_NAME" "$BUDGET" "$WORKSPACE" "$RSLP_PREFIX" "$EXTRA_ARGS_MODEL_PREDICT"
)
create_vm "${vm_args[@]}"

echo "Done!"
Loading

0 comments on commit f35765d

Please sign in to comment.