From a8e37cb9218cdd3c20eedd91e4e46021f1f3a81c Mon Sep 17 00:00:00 2001
From: DelinQu
Date: Thu, 4 Jul 2024 21:18:28 +0800
Subject: [PATCH 01/13] openvla integration

---
 scripts/openvla_bridge.sh                       |  50 +++++
 scripts/openvla_drawer_variant_agg.sh           |  82 +++++++
 scripts/openvla_drawer_visual_matching.sh       | 132 ++++++++++++
 scripts/openvla_move_near_variant_agg.sh        | 138 ++++++++++++
 scripts/openvla_move_near_visual_matching.sh    |  33 +++
 scripts/openvla_pick_coke_can_variant_agg.sh    | 182 ++++++++++++++++
 .../openvla_pick_coke_can_visual_matching.sh    |  40 ++++
 scripts/openvla_put_in_drawer_variant_agg.sh    |  78 +++++++
 .../openvla_put_in_drawer_visual_matching.sh    |  65 ++++++
 simpler_env/main_inference.py                   |  10 +-
 simpler_env/policies/openvla/openvla_model.py   | 203 ++++++++++++++++++
 11 files changed, 1012 insertions(+), 1 deletion(-)
 create mode 100644 scripts/openvla_bridge.sh
 create mode 100644 scripts/openvla_drawer_variant_agg.sh
 create mode 100644 scripts/openvla_drawer_visual_matching.sh
 create mode 100644 scripts/openvla_move_near_variant_agg.sh
 create mode 100644 scripts/openvla_move_near_visual_matching.sh
 create mode 100644 scripts/openvla_pick_coke_can_variant_agg.sh
 create mode 100644 scripts/openvla_pick_coke_can_visual_matching.sh
 create mode 100644 scripts/openvla_put_in_drawer_variant_agg.sh
 create mode 100644 scripts/openvla_put_in_drawer_visual_matching.sh
 create mode 100644 simpler_env/policies/openvla/openvla_model.py

diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh
new file mode 100644
index 00000000..e8c8289e
--- /dev/null
+++ b/scripts/openvla_bridge.sh
@@ -0,0 +1,50 @@
+gpu_id=0
+policy_model=openvla
+ckpt_path="openvla/openvla-7b"
+
+scene_name=bridge_table_1_v1
+robot=widowx
+rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png
+robot_init_x=0.147
+robot_init_y=0.028
+export DISPLAY=:1.0
+# VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
+# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
+#   --robot ${robot} --policy-setup widowx_bridge \
+#   --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
+#   --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \
+#   --rgb-overlay-path ${rgb_overlay_path} \
+#   --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
+#   --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
+#
+# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
+#   --robot ${robot} --policy-setup widowx_bridge \
+#   --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
+#   --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \
+#   --rgb-overlay-path ${rgb_overlay_path} \
+#   --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
+#   --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
+#
+# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
+#   --robot ${robot} --policy-setup widowx_bridge \
+#   --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
+#   --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \
+#   --rgb-overlay-path ${rgb_overlay_path} \
+#   --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
+#   --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
+
+
+scene_name=bridge_table_1_v2
+robot=widowx_sink_camera_setup
+rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png
+robot_init_x=0.127
+robot_init_y=0.06
+
+python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
+  --robot ${robot} --policy-setup widowx_bridge \
+  --control-freq 5 --sim-freq 500 --max-episode-steps 120 \
+  --env-name PutEggplantInBasketScene-v0 --scene-name ${scene_name} \
+  --rgb-overlay-path ${rgb_overlay_path} \
+  --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
diff --git a/scripts/openvla_drawer_variant_agg.sh b/scripts/openvla_drawer_variant_agg.sh
new file mode 100644
index 00000000..07f64600
--- /dev/null
+++ b/scripts/openvla_drawer_variant_agg.sh
@@ -0,0 +1,82 @@
+# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth
+declare -a ckpt_paths=(
+"openvla/openvla-7b"
+)
+
+declare -a env_names=(
+OpenTopDrawerCustomInScene-v0
+OpenMiddleDrawerCustomInScene-v0
+OpenBottomDrawerCustomInScene-v0
+CloseTopDrawerCustomInScene-v0
+CloseMiddleDrawerCustomInScene-v0
+CloseBottomDrawerCustomInScene-v0
+)
+
+EXTRA_ARGS="--enable-raytracing"
+
+
+# base setup
+scene_name=frl_apartment_stage_simple
+
+EvalSim() {
+  echo ${ckpt_path} ${env_name}
+
+  python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+    --robot google_robot_static \
+    --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+    --env-name ${env_name} --scene-name ${scene_name} \
+    --robot-init-x 0.65 0.85 3 --robot-init-y -0.2 0.2 3 \
+    --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \
+    --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+    ${EXTRA_ARGS}
+}
+
+
+for ckpt_path in "${ckpt_paths[@]}"; do
+  for env_name in "${env_names[@]}"; do
+    EvalSim
+  done
+done
+
+
+# backgrounds
+
+declare -a scene_names=(
+"modern_bedroom_no_roof"
+"modern_office_no_roof"
+)
+
+for scene_name in "${scene_names[@]}"; do
+  for ckpt_path in "${ckpt_paths[@]}"; do
+    for env_name in "${env_names[@]}"; do
+      EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt"
+      EvalSim
+    done
+  done
+done
+
+
+# lightings
+scene_name=frl_apartment_stage_simple
+
+for ckpt_path in "${ckpt_paths[@]}"; do
+  for env_name in "${env_names[@]}"; do
+    EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter"
+    EvalSim
+    EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=darker"
+    EvalSim
+  done
+done
+
+
+# new cabinets
+scene_name=frl_apartment_stage_simple
+
+for ckpt_path in "${ckpt_paths[@]}"; do
+  for env_name in "${env_names[@]}"; do
+    EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2"
+    EvalSim
+    EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station3"
+    EvalSim
+  done
+done
diff --git a/scripts/openvla_drawer_visual_matching.sh b/scripts/openvla_drawer_visual_matching.sh
new file mode 100644
index 00000000..a021660e
--- /dev/null
+++ b/scripts/openvla_drawer_visual_matching.sh
@@ -0,0 +1,132 @@
+# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth
+declare -a ckpt_paths=(
+"openvla/openvla-7b"
+)
+
+declare -a env_names=(
+OpenTopDrawerCustomInScene-v0
+OpenMiddleDrawerCustomInScene-v0
+OpenBottomDrawerCustomInScene-v0
+CloseTopDrawerCustomInScene-v0
+CloseMiddleDrawerCustomInScene-v0
+CloseBottomDrawerCustomInScene-v0
+)
+
+# URDF variations
+declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None)
+
+for urdf_version in "${urdf_version_arr[@]}"; do
+
+EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version}"
+
+EvalOverlay() {
+# A0
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \
+  ${EXTRA_ARGS}
+
+# A1
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.765 0.765 1 --robot-init-y -0.182 -0.182 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.02 -0.02 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png \
+  ${EXTRA_ARGS}
+
+# A2
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.889 0.889 1 --robot-init-y -0.203 -0.203 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.06 -0.06 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png \
+  ${EXTRA_ARGS}
+
+# B0
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \
+  ${EXTRA_ARGS}
+
+# B1
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.752 0.752 1 --robot-init-y 0.009 0.009 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png \
+  ${EXTRA_ARGS}
+
+# B2
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.851 0.851 1 --robot-init-y 0.035 0.035 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png \
+  ${EXTRA_ARGS}
+
+# C0
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \
+  ${EXTRA_ARGS}
+
+# C1
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.765 0.765 1 --robot-init-y 0.222 0.222 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png \
+  ${EXTRA_ARGS}
+
+# C2
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 113 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.865 0.865 1 --robot-init-y 0.222 0.222 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \
+  --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png \
+  ${EXTRA_ARGS}
+}
+
+
+for ckpt_path in "${ckpt_paths[@]}"; do
+  for env_name in "${env_names[@]}"; do
+    EvalOverlay
+  done
+done
+
+
+
+done
diff --git a/scripts/openvla_move_near_variant_agg.sh b/scripts/openvla_move_near_variant_agg.sh
new file mode 100644
index 00000000..e5ad488e
--- /dev/null
+++ b/scripts/openvla_move_near_variant_agg.sh
@@ -0,0 +1,138 @@
+
+gpu_id=0
+
+declare -a arr=("openvla/openvla-7b")
+for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done
+
+
+# base setup
+
+env_name=MoveNearGoogleInScene-v0
+scene_name=google_pick_coke_can_1_v4
+
+for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1;
+
+done
+
+
+
+# distractor
+
+for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \
+  --additional-env-build-kwargs no_distractor=True;
+
+done
+
+
+# backgrounds
+
+env_name=MoveNearGoogleInScene-v0
+declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \
+                      "google_pick_coke_can_1_v4_alt_background_2")
+
+for scene_name in "${scene_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1;
+
+done
+
+done
+
+
+
+
+
+# lighting
+
+env_name=MoveNearGoogleInScene-v0
+scene_name=google_pick_coke_can_1_v4
+
+for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \
+  --additional-env-build-kwargs slightly_darker_lighting=True;
+
+CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \
+  --additional-env-build-kwargs slightly_brighter_lighting=True;
+
+done
+
+
+
+
+
+# table textures
+
+env_name=MoveNearGoogleInScene-v0
+declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \
+                      "Baked_sc1_staging_objaverse_cabinet2_h870")
+
+for scene_name in "${scene_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1;
+
+done
+
+done
+
+
+
+
+# camera orientations
+
+declare -a env_arr=("MoveNearAltGoogleCameraInScene-v0" \
+                    "MoveNearAltGoogleCamera2InScene-v0")
+scene_name=google_pick_coke_can_1_v4
+
+for env_name in "${env_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1;
+
+done
+
+done
diff --git a/scripts/openvla_move_near_visual_matching.sh b/scripts/openvla_move_near_visual_matching.sh
new file mode 100644
index 00000000..7e334d8c
--- /dev/null
+++ b/scripts/openvla_move_near_visual_matching.sh
@@ -0,0 +1,33 @@
+
+gpu_id=0
+
+declare -a arr=("openvla/openvla-7b")
+
+env_name=MoveNearGoogleBakedTexInScene-v0
+# env_name=MoveNearGoogleBakedTexInScene-v1
+scene_name=google_pick_coke_can_1_v4
+rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_1.png
+
+# URDF variations
+declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1")
+
+for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done
+
+
+for urdf_version in "${urdf_version_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --rgb-overlay-path ${rgb_overlay_path} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \
+  --additional-env-build-kwargs urdf_version=${urdf_version} \
+  --additional-env-save-tags baked_except_bpb_orange;
+
+done
+
+done
diff --git a/scripts/openvla_pick_coke_can_variant_agg.sh b/scripts/openvla_pick_coke_can_variant_agg.sh
new file mode 100644
index 00000000..72e91605
--- /dev/null
+++ b/scripts/openvla_pick_coke_can_variant_agg.sh
@@ -0,0 +1,182 @@
+
+gpu_id=0
+
+declare -a arr=("openvla/openvla-7b")
+
+# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically
+declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True")
+
+for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done
+
+
+# base setup
+
+env_name=GraspSingleOpenedCokeCanInScene-v0
+scene_name=google_pick_coke_can_1_v4
+
+for coke_can_option in "${coke_can_options_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option};
+
+done
+
+done
+
+
+
+# table textures
+
+env_name=GraspSingleOpenedCokeCanInScene-v0
+
+declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \
+                      "Baked_sc1_staging_objaverse_cabinet2_h870")
+
+
+for coke_can_option in "${coke_can_options_arr[@]}";
+
+do for scene_name in "${scene_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option};
+
+done
+
+done
+
+done
+
+
+
+
+# distractors
+
+env_name=GraspSingleOpenedCokeCanDistractorInScene-v0
+scene_name=google_pick_coke_can_1_v4
+
+for coke_can_option in "${coke_can_options_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option};
+
+CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option} distractor_config=more;
+
+done
+
+done
+
+
+
+
+# backgrounds
+
+env_name=GraspSingleOpenedCokeCanInScene-v0
+declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \
+                      "google_pick_coke_can_1_v4_alt_background_2")
+
+for coke_can_option in "${coke_can_options_arr[@]}";
+
+do for scene_name in "${scene_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option};
+
+done
+
+done
+
+done
+
+
+
+# lightings
+
+env_name=GraspSingleOpenedCokeCanInScene-v0
+scene_name=google_pick_coke_can_1_v4
+
+for coke_can_option in "${coke_can_options_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option} slightly_darker_lighting=True;
+
+CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option} slightly_brighter_lighting=True;
+
+done
+
+done
+
+
+
+
+# camera orientations
+
+declare -a env_arr=("GraspSingleOpenedCokeCanAltGoogleCameraInScene-v0" \
+                    "GraspSingleOpenedCokeCanAltGoogleCamera2InScene-v0")
+scene_name=google_pick_coke_can_1_v4
+
+for coke_can_option in "${coke_can_options_arr[@]}";
+
+do for env_name in "${env_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option};
+
+done
+
+done
+
+done
diff --git a/scripts/openvla_pick_coke_can_visual_matching.sh b/scripts/openvla_pick_coke_can_visual_matching.sh
new file mode 100644
index 00000000..083a71ee
--- /dev/null
+++ b/scripts/openvla_pick_coke_can_visual_matching.sh
@@ -0,0 +1,40 @@
+
+
+gpu_id=0
+
+declare -a arr=("openvla/openvla-7b")
+
+# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically
+declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True")
+
+# URDF variations
+declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1")
+
+env_name=GraspSingleOpenedCokeCanInScene-v0
+scene_name=google_pick_coke_can_1_v4
+rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png
+
+for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done
+
+
+
+for urdf_version in "${urdf_version_arr[@]}";
+
+do for coke_can_option in "${coke_can_options_arr[@]}";
+
+do for ckpt_path in "${arr[@]}";
+
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
+  --env-name ${env_name} --scene-name ${scene_name} \
+  --rgb-overlay-path ${rgb_overlay_path} \
+  --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version};
+
+done
+
+done
+
+done
diff --git a/scripts/openvla_put_in_drawer_variant_agg.sh b/scripts/openvla_put_in_drawer_variant_agg.sh
new file mode 100644
index 00000000..a3c63a12
--- /dev/null
+++ b/scripts/openvla_put_in_drawer_variant_agg.sh
@@ -0,0 +1,78 @@
+# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth
+
+
+
+declare -a arr=("openvla/openvla-7b")
+
+declare -a env_names=(
+PlaceIntoClosedTopDrawerCustomInScene-v0
+)
+
+EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs model_ids=apple"
+
+
+# base setup
+scene_name=frl_apartment_stage_simple
+
+EvalSim() {
+  echo ${ckpt_path} ${env_name}
+
+  python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+    --robot google_robot_static \
+    --control-freq 3 --sim-freq 513 --max-episode-steps 200 \
+    --env-name ${env_name} --scene-name ${scene_name} \
+    --robot-init-x 0.65 0.65 1 --robot-init-y -0.2 0.2 3 \
+    --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \
+    --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \
+    ${EXTRA_ARGS}
+}
+
+
+for ckpt_path in "${ckpt_paths[@]}"; do
+  for env_name in "${env_names[@]}"; do
+    EvalSim
+  done
+done
+
+
+# backgrounds
+
+declare -a scene_names=(
+"modern_bedroom_no_roof"
+"modern_office_no_roof"
+)
+
+for scene_name in "${scene_names[@]}"; do
+  for ckpt_path in "${ckpt_paths[@]}"; do
+    for env_name in "${env_names[@]}"; do
+      EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt model_ids=apple"
+      EvalSim
+    done
+  done
+done
+
+
+# lightings
+scene_name=frl_apartment_stage_simple
+
+for ckpt_path in "${ckpt_paths[@]}"; do
+  for env_name in "${env_names[@]}"; do
+    EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter model_ids=apple"
+    EvalSim
+    EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=darker model_ids=apple"
+    EvalSim
+  done
+done
+
+
+# new cabinets
+scene_name=frl_apartment_stage_simple
+
+for ckpt_path in "${ckpt_paths[@]}"; do
+  for env_name in "${env_names[@]}"; do
+    EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2 model_ids=apple"
+    EvalSim
+    EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station3 model_ids=apple"
+    EvalSim
+  done
+done
diff --git a/scripts/openvla_put_in_drawer_visual_matching.sh b/scripts/openvla_put_in_drawer_visual_matching.sh
new file mode 100644
index 00000000..98539bf3
--- /dev/null
+++ b/scripts/openvla_put_in_drawer_visual_matching.sh
@@ -0,0 +1,65 @@
+# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth
+
+declare -a arr=("openvla/openvla-7b")
+
+
+declare -a env_names=(
+PlaceIntoClosedTopDrawerCustomInScene-v0
+# PlaceIntoClosedMiddleDrawerCustomInScene-v0
+# PlaceIntoClosedBottomDrawerCustomInScene-v0
+)
+
+
+# URDF variations
+declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None)
+
+for urdf_version in "${urdf_version_arr[@]}"; do
+
+EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version} model_ids=baked_apple_v2"
+
+
+EvalOverlay() {
+# A0
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 200 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \
+  --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \
+  ${EXTRA_ARGS}
+
+# B0
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 200 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \
+  ${EXTRA_ARGS}
+
+# C0
+python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \
+  --robot google_robot_static \
+  --control-freq 3 --sim-freq 513 --max-episode-steps 200 \
+  --env-name ${env_name} --scene-name dummy_drawer \
+  --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \
+  --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \
+  --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \
+  ${EXTRA_ARGS}
+}
+
+
+for ckpt_path in "${ckpt_paths[@]}"; do
+  for env_name in "${env_names[@]}"; do
+    EvalOverlay
+  done
+done
+
+
+
+done
diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py
index 320f2685..7ebe20c2 100644
--- a/simpler_env/main_inference.py
+++ b/simpler_env/main_inference.py
@@ -7,6 +7,7 @@
 from simpler_env.evaluation.maniskill2_evaluator import maniskill2_evaluator
 from simpler_env.policies.octo.octo_server_model import OctoServerInference
 from simpler_env.policies.rt1.rt1_model import RT1Inference
+from simpler_env.policies.openvla.openvla_model import OpenVALInference
 
 try:
     from simpler_env.policies.octo.octo_model import OctoInference
@@ -28,7 +29,7 @@
             gpus[0],
             [tf.config.LogicalDeviceConfiguration(memory_limit=args.tf_memory_limit)],
         )
-
+    print(f"**** {args.policy_model} ****")
     # policy model creation; update this if you are using a new policy model
     if args.policy_model == "rt1":
         assert args.ckpt_path is not None
@@ -53,6 +54,13 @@
             init_rng=args.octo_init_rng,
             action_scale=args.action_scale,
         )
+    elif args.policy_model == "openvla":
+        assert args.ckpt_path is not None
+        model = OpenVALInference(
+            saved_model_path=args.ckpt_path,
+            policy_setup=args.policy_setup,
+            action_scale=args.action_scale,
+        )
     else:
         raise NotImplementedError()
 
diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py
new file mode 100644
index 00000000..66bdf0d4
--- /dev/null
+++ b/simpler_env/policies/openvla/openvla_model.py
@@ -0,0 +1,203 @@
+from collections import deque
+from typing import Optional, Sequence
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+from transforms3d.euler import euler2axangle
+from simpler_env.utils.action.action_ensemble import ActionEnsembler
+from transformers import AutoModelForVision2Seq, AutoProcessor
+from PIL import Image
+import torch
+import cv2 as cv
+
+
+class OpenVALInference:
+    def __init__(
+        self,
+        saved_model_path: str = "openvla/openvla-7b",
+        unnorm_key: Optional[str] = None,
+        policy_setup: str = "widowx_bridge",
+        horizon: int = 2,
+        pred_action_horizon: int = 1,
+        exec_horizon: int = 1,
+        image_size: list[int] = [224, 224],
+        action_scale: float = 1.0,
+    ) -> None:
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        if policy_setup == "widowx_bridge":
+            unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key
+            action_ensemble = True
+            action_ensemble_temp = 0.0
+            self.sticky_gripper_num_repeat = 1
+        elif policy_setup == "google_robot":
+            unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key
+            action_ensemble = True
+            action_ensemble_temp = 0.0
+            self.sticky_gripper_num_repeat = 15
+        else:
+            raise NotImplementedError(
+                f"Policy setup {policy_setup} not supported for openvla models. The other datasets can be found in the huggingface config.json file."
+            )
+        self.policy_setup = policy_setup
+        self.unnorm_key = unnorm_key
+
+        print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***")
+        self.processor = AutoProcessor.from_pretrained(saved_model_path, trust_remote_code=True)
+        self.vla = AutoModelForVision2Seq.from_pretrained(
+            "openvla/openvla-7b",
+            attn_implementation="flash_attention_2",  # [Optional] Requires `flash_attn`
+            torch_dtype=torch.bfloat16,
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+        ).cuda()
+
+        self.image_size = image_size
+        self.action_scale = action_scale
+        self.horizon = horizon
+        self.pred_action_horizon = pred_action_horizon
+        self.exec_horizon = exec_horizon
+        self.action_ensemble = action_ensemble
+        self.action_ensemble_temp = action_ensemble_temp
+
+        self.sticky_action_is_on = False
+        self.gripper_action_repeat = 0
+        self.sticky_gripper_action = 0.0
+        self.previous_gripper_action = None
+
+        self.task = None
+        self.task_description = None
+        self.image_history = deque(maxlen=self.horizon)
+        if self.action_ensemble:
+            self.action_ensembler = ActionEnsembler(self.pred_action_horizon, self.action_ensemble_temp)
+        else:
+            self.action_ensembler = None
+        self.num_image_history = 0
+
+    def _add_image_to_history(self, image: np.ndarray) -> None:
+        self.image_history.append(image)
+        self.num_image_history = min(self.num_image_history + 1, self.horizon)
+
+    def reset(self, task_description: str) -> None:
+        self.task_description = task_description
+        self.image_history.clear()
+        if self.action_ensemble:
+            self.action_ensembler.reset()
+        self.num_image_history = 0
+
+        self.sticky_action_is_on = False
+        self.gripper_action_repeat = 0
+        self.sticky_gripper_action = 0.0
+        self.previous_gripper_action = None
+
+    def step(
+        self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs
+    ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
+        """
+        Input:
+            image: np.ndarray of shape (H, W, 3), uint8
+            task_description: Optional[str], task description; if different from previous task description, policy state is reset
+        Output:
+            raw_action: dict; raw policy action output
+            action: dict; processed action to be sent to the maniskill2 environment, with the following keys:
+                - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector
+                - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation
+                - 'gripper': np.ndarray of shape (1,), gripper action
+                - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise
+        """
+        if task_description is not None:
+            if task_description != self.task_description:
+                self.reset(task_description)
+
+        assert image.dtype == np.uint8
+        self._add_image_to_history(self._resize_image(image))
+
+        image: Image.Image = Image.fromarray(image)
+        prompt = task_description
+
+        # predict action (7-dof; un-normalize for bridgev2)
+        inputs = self.processor(prompt, image).to("cuda:0", dtype=torch.bfloat16)
+        raw_actions = self.vla.predict_action(**inputs, unnorm_key=self.unnorm_key, do_sample=False)[None]
+        # print(f"*** raw actions {raw_actions} ***")
+
+        if self.action_ensemble:
+            raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None]
+        raw_action = {
+            "world_vector": np.array(raw_actions[0, :3]),
+            "rotation_delta": np.array(raw_actions[0, 3:6]),
+            "open_gripper": np.array(raw_actions[0, 6:7]),  # range [0, 1]; 1 = open; 0 = close
+        }
+
+        # process raw_action to obtain the action to be sent to the maniskill2 environment
+        action = {}
+        action["world_vector"] = raw_action["world_vector"] * self.action_scale
+        action_rotation_delta = np.asarray(raw_action["rotation_delta"], dtype=np.float64)
+        roll, pitch, yaw = action_rotation_delta
+        action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw)
+        action_rotation_axangle = action_rotation_ax * action_rotation_angle
+        action["rot_axangle"] = action_rotation_axangle * self.action_scale
+
+        if self.policy_setup == "google_robot":
+            current_gripper_action = raw_action["open_gripper"]
+            if self.previous_gripper_action is None:
+                relative_gripper_action = np.array([0])
+            else:
+                relative_gripper_action = self.previous_gripper_action - current_gripper_action
+            self.previous_gripper_action = current_gripper_action
+
+            if np.abs(relative_gripper_action) > 0.5 and self.sticky_action_is_on is False:
+                self.sticky_action_is_on = True
+                self.sticky_gripper_action = relative_gripper_action
+
+            if self.sticky_action_is_on:
+                self.gripper_action_repeat += 1
+                relative_gripper_action = self.sticky_gripper_action
+
+            if self.gripper_action_repeat == self.sticky_gripper_num_repeat:
+                self.sticky_action_is_on = False
+                self.gripper_action_repeat = 0
+                self.sticky_gripper_action = 0.0
+
+            action["gripper"] = relative_gripper_action
+
+        elif self.policy_setup == "widowx_bridge":
+            action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0
+
+        action["terminate_episode"] = np.array([0.0])
+
+        return raw_action, action
+
+    def _resize_image(self, image: np.ndarray) -> np.ndarray:
+        image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA)
+        return image
+
+    def visualize_epoch(
+        self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str
+    ) -> None:
+        images = [self._resize_image(image) for image in images]
+        ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"]
+
+        img_strip = np.concatenate(np.array(images[::3]), axis=1)
+
+        # set up plt figure
+        figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS]
+        plt.rcParams.update({"font.size": 12})
+        fig, axs = plt.subplot_mosaic(figure_layout)
+        fig.set_size_inches([45, 10])
+
+        # plot actions
+        pred_actions = np.array(
+            [
+                np.concatenate([a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1)
+                for a in predicted_raw_actions
+            ]
+        )
+        for action_dim, action_label in enumerate(ACTION_DIM_LABELS):
+            # actions have batch, horizon, dim, in this example we just take the first action for simplicity
+            axs[action_label].plot(pred_actions[:, action_dim], label="predicted action")
+            axs[action_label].set_title(action_label)
+            axs[action_label].set_xlabel("Time in one episode")
+
+        axs["image"].imshow(img_strip)
+        axs["image"].set_xlabel("Time in one episode (subsampled)")
+        plt.legend()
+        plt.savefig(save_path)
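For orientation between patches: the class added above is driven through `reset()` and `step()`. Below is a minimal usage sketch, assuming a CUDA GPU and access to the `openvla/openvla-7b` checkpoint; the dummy image and instruction are illustrative stand-ins, and `OpenVALInference` is the spelling this patch introduces (PATCH 03 later renames it to `OpenVLAInference`).

```python
# Minimal driver sketch for the policy class added in PATCH 01.
# Assumptions: CUDA device available, HF checkpoint downloadable; inputs are dummies.
import numpy as np

from simpler_env.policies.openvla.openvla_model import OpenVALInference

policy = OpenVALInference(saved_model_path="openvla/openvla-7b", policy_setup="widowx_bridge")
policy.reset("put the spoon on the towel")

dummy_image = np.zeros((480, 640, 3), dtype=np.uint8)  # stand-in for an env observation
raw_action, action = policy.step(dummy_image, "put the spoon on the towel")
print(action["world_vector"], action["rot_axangle"], action["gripper"], action["terminate_episode"])
```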
From 7bed3d4bce61ef2bf58ccbaf8f81ccff306883b9 Mon Sep 17 00:00:00 2001
From: DelinQu
Date: Fri, 5 Jul 2024 18:23:37 +0800
Subject: [PATCH 02/13] integrate openvla policy and PR

---
 scripts/openvla_bridge.sh | 49 +++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh
index e8c8289e..80e733ef 100644
--- a/scripts/openvla_bridge.sh
+++ b/scripts/openvla_bridge.sh
@@ -7,31 +7,30 @@ robot=widowx
 rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png
 robot_init_x=0.147
 robot_init_y=0.028
-export DISPLAY=:1.0
-# VK_ICD_FILENAMES=/usr/share/vulkan/icd.d/nvidia_icd.json
-# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
-#   --robot ${robot} --policy-setup widowx_bridge \
-#   --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
-#   --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \
-#   --rgb-overlay-path ${rgb_overlay_path} \
-#   --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
-#   --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
-#
-# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
-#   --robot ${robot} --policy-setup widowx_bridge \
-#   --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
-#   --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \
-#   --rgb-overlay-path ${rgb_overlay_path} \
-#   --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
-#   --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
-#
-# python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
-#   --robot ${robot} --policy-setup widowx_bridge \
-#   --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
-#   --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \
-#   --rgb-overlay-path ${rgb_overlay_path} \
-#   --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
-#   --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
+
+python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
+  --robot ${robot} --policy-setup widowx_bridge \
+  --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
+  --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \
+  --rgb-overlay-path ${rgb_overlay_path} \
+  --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
+
+python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
+  --robot ${robot} --policy-setup widowx_bridge \
+  --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
+  --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \
+  --rgb-overlay-path ${rgb_overlay_path} \
+  --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
+
+python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \
+  --robot ${robot} --policy-setup widowx_bridge \
+  --control-freq 5 --sim-freq 500 --max-episode-steps 60 \
+  --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \
+  --rgb-overlay-path ${rgb_overlay_path} \
+  --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \
+  --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1;
 
 
 scene_name=bridge_table_1_v2
From f0674e7701750e6262879b26a5a0a0954726479a Mon Sep 17 00:00:00 2001
From: DelinQu
Date: Sat, 6 Jul 2024 21:50:17 +0800
Subject: [PATCH 03/13] openvla policy integration pull request

---
 simpler_env/main_inference.py                 | 4 ++--
 simpler_env/policies/openvla/openvla_model.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py
index 7ebe20c2..699c00af 100644
--- a/simpler_env/main_inference.py
+++ b/simpler_env/main_inference.py
@@ -7,7 +7,7 @@
 from simpler_env.evaluation.maniskill2_evaluator import maniskill2_evaluator
 from simpler_env.policies.octo.octo_server_model import OctoServerInference
 from simpler_env.policies.rt1.rt1_model import RT1Inference
-from simpler_env.policies.openvla.openvla_model import OpenVALInference
+from simpler_env.policies.openvla.openvla_model import OpenVLAInference
 
 try:
     from simpler_env.policies.octo.octo_model import OctoInference
@@ -56,7 +56,7 @@
         )
     elif args.policy_model == "openvla":
         assert args.ckpt_path is not None
-        model = OpenVALInference(
+        model = OpenVLAInference(
             saved_model_path=args.ckpt_path,
             policy_setup=args.policy_setup,
             action_scale=args.action_scale,
diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py
index 66bdf0d4..b21f1378 100644
--- a/simpler_env/policies/openvla/openvla_model.py
+++ b/simpler_env/policies/openvla/openvla_model.py
@@ -11,7 +11,7 @@
 import cv2 as cv
 
 
-class OpenVALInference:
+class OpenVLAInference:
     def __init__(
         self,
         saved_model_path: str = "openvla/openvla-7b",
@@ -144,7 +144,7 @@ def step(
                 relative_gripper_action = self.previous_gripper_action - current_gripper_action
             self.previous_gripper_action = current_gripper_action
 
-            if np.abs(relative_gripper_action) > 0.5 and self.sticky_action_is_on is False:
+            if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on):
                 self.sticky_action_is_on = True
                 self.sticky_gripper_action = relative_gripper_action
 
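The condition PATCH 03 rewrites lives inside the sticky-gripper logic of `step()`: one large jump in the predicted gripper openness is latched and then repeated for `sticky_gripper_num_repeat` control steps. To make that concrete, here is a self-contained toy trace of the same control flow, re-implemented with plain floats purely for illustration; the 0.5 threshold and the repeat count of 15 mirror the `google_robot` values from PATCH 01:

```python
# Toy re-implementation of the sticky-gripper mechanism from step();
# illustrative only, not repository code.
def run_sticky_gripper(openness_sequence, num_repeat=15):
    sticky_on, repeat, sticky_act, prev = False, 0, 0.0, None
    outputs = []
    for cur in openness_sequence:
        rel = 0.0 if prev is None else prev - cur  # relative gripper action
        prev = cur
        if abs(rel) > 0.5 and not sticky_on:  # the condition PATCH 03 rewrites
            sticky_on, sticky_act = True, rel
        if sticky_on:
            repeat += 1
            rel = sticky_act  # keep emitting the latched action
        if repeat == num_repeat:
            sticky_on, repeat, sticky_act = False, 0, 0.0
        outputs.append(rel)
    return outputs

# A single open-to-close flip gets latched and repeated for 15 steps:
print(run_sticky_gripper([1.0] * 3 + [0.0] * 20))
```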
From 29053792b9e3103c30e27b69a0268b4d3da8a710 Mon Sep 17 00:00:00 2001
From: xuanlinli17
Date: Sun, 28 Jul 2024 09:50:03 -0700
Subject: [PATCH 04/13] update openvla inference scripts

---
 scripts/octo_bridge.sh                           |  0
 scripts/octo_drawer_variant_agg.sh               |  0
 scripts/octo_drawer_visual_matching.sh           |  0
 scripts/octo_move_near_variant_agg.sh            |  0
 scripts/octo_move_near_visual_matching.sh        |  0
 scripts/octo_pick_coke_can_variant_agg.sh        |  0
 scripts/octo_pick_coke_can_visual_matching.sh    |  0
 scripts/octo_put_in_drawer_variant_agg.sh        |  0
 scripts/octo_put_in_drawer_visual_matching.sh    |  0
 scripts/openvla_bridge.sh                        |  0
 scripts/openvla_drawer_variant_agg.sh            |  0
 scripts/openvla_drawer_visual_matching.sh        |  0
 scripts/openvla_move_near_variant_agg.sh         |  0
 scripts/openvla_move_near_visual_matching.sh     |  0
 scripts/openvla_pick_coke_can_variant_agg.sh     |  0
 .../openvla_pick_coke_can_visual_matching.sh     |  0
 scripts/openvla_put_in_drawer_variant_agg.sh     |  2 +-
 .../openvla_put_in_drawer_visual_matching.sh     |  2 +-
 scripts/rt1_drawer_variant_agg.sh                |  0
 scripts/rt1_drawer_visual_matching.sh            |  0
 scripts/rt1_move_near_variant_agg.sh             |  0
 scripts/rt1_move_near_visual_matching.sh         |  0
 scripts/rt1_pick_coke_can_variant_agg.sh         |  0
 scripts/rt1_pick_coke_can_visual_matching.sh     |  0
 scripts/rt1_put_in_drawer_variant_agg.sh         |  0
 scripts/rt1_put_in_drawer_visual_matching.sh     |  0
 scripts/rt1x_bridge.sh                           |  0
 simpler_env/policies/openvla/openvla_model.py    | 26 ++-----------------
 28 files changed, 4 insertions(+), 26 deletions(-)
 mode change 100644 => 100755 scripts/octo_bridge.sh
 mode change 100644 => 100755 scripts/octo_drawer_variant_agg.sh
 mode change 100644 => 100755 scripts/octo_drawer_visual_matching.sh
 mode change 100644 => 100755 scripts/octo_move_near_variant_agg.sh
 mode change 100644 => 100755 scripts/octo_move_near_visual_matching.sh
 mode change 100644 => 100755 scripts/octo_pick_coke_can_variant_agg.sh
 mode change 100644 => 100755 scripts/octo_pick_coke_can_visual_matching.sh
 mode change 100644 => 100755 scripts/octo_put_in_drawer_variant_agg.sh
 mode change 100644 => 100755 scripts/octo_put_in_drawer_visual_matching.sh
 mode change 100644 => 100755 scripts/openvla_bridge.sh
 mode change 100644 => 100755 scripts/openvla_drawer_variant_agg.sh
 mode change 100644 => 100755 scripts/openvla_drawer_visual_matching.sh
 mode change 100644 => 100755 scripts/openvla_move_near_variant_agg.sh
 mode change 100644 => 100755 scripts/openvla_move_near_visual_matching.sh
 mode change 100644 => 100755 scripts/openvla_pick_coke_can_variant_agg.sh
 mode change 100644 => 100755 scripts/openvla_pick_coke_can_visual_matching.sh
 mode change 100644 => 100755 scripts/openvla_put_in_drawer_variant_agg.sh
 mode change 100644 => 100755 scripts/openvla_put_in_drawer_visual_matching.sh
 mode change 100644 => 100755 scripts/rt1_drawer_variant_agg.sh
 mode change 100644 => 100755 scripts/rt1_drawer_visual_matching.sh
 mode change 100644 => 100755 scripts/rt1_move_near_variant_agg.sh
 mode change 100644 => 100755 scripts/rt1_move_near_visual_matching.sh
 mode change 100644 => 100755 scripts/rt1_pick_coke_can_variant_agg.sh
 mode change 100644 => 100755 scripts/rt1_pick_coke_can_visual_matching.sh
 mode change 100644 => 100755 scripts/rt1_put_in_drawer_variant_agg.sh
 mode change 100644 => 100755 scripts/rt1_put_in_drawer_visual_matching.sh
 mode change 100644 => 100755 scripts/rt1x_bridge.sh

diff --git a/scripts/octo_bridge.sh b/scripts/octo_bridge.sh
old mode 100644
new mode 100755
diff --git a/scripts/octo_drawer_variant_agg.sh b/scripts/octo_drawer_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/octo_drawer_visual_matching.sh b/scripts/octo_drawer_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/octo_move_near_variant_agg.sh b/scripts/octo_move_near_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/octo_move_near_visual_matching.sh b/scripts/octo_move_near_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/octo_pick_coke_can_variant_agg.sh b/scripts/octo_pick_coke_can_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/octo_pick_coke_can_visual_matching.sh b/scripts/octo_pick_coke_can_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/octo_put_in_drawer_variant_agg.sh b/scripts/octo_put_in_drawer_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/octo_put_in_drawer_visual_matching.sh b/scripts/octo_put_in_drawer_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/openvla_bridge.sh b/scripts/openvla_bridge.sh
old mode 100644
new mode 100755
diff --git a/scripts/openvla_drawer_variant_agg.sh b/scripts/openvla_drawer_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/openvla_drawer_visual_matching.sh b/scripts/openvla_drawer_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/openvla_move_near_variant_agg.sh b/scripts/openvla_move_near_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/openvla_move_near_visual_matching.sh b/scripts/openvla_move_near_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/openvla_pick_coke_can_variant_agg.sh b/scripts/openvla_pick_coke_can_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/openvla_pick_coke_can_visual_matching.sh b/scripts/openvla_pick_coke_can_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/openvla_put_in_drawer_variant_agg.sh b/scripts/openvla_put_in_drawer_variant_agg.sh
old mode 100644
new mode 100755
index a3c63a12..fb8ce514
--- a/scripts/openvla_put_in_drawer_variant_agg.sh
+++ b/scripts/openvla_put_in_drawer_variant_agg.sh
@@ -2,7 +2,7 @@
 
 
 
-declare -a arr=("openvla/openvla-7b")
+declare -a ckpt_paths=("openvla/openvla-7b")
 
 declare -a env_names=(
 PlaceIntoClosedTopDrawerCustomInScene-v0
diff --git a/scripts/openvla_put_in_drawer_visual_matching.sh b/scripts/openvla_put_in_drawer_visual_matching.sh
old mode 100644
new mode 100755
index 98539bf3..14dd46a4
--- a/scripts/openvla_put_in_drawer_visual_matching.sh
+++ b/scripts/openvla_put_in_drawer_visual_matching.sh
@@ -1,6 +1,6 @@
 # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth
 
-declare -a arr=("openvla/openvla-7b")
+declare -a ckpt_paths=("openvla/openvla-7b")
 
 
 declare -a env_names=(
diff --git a/scripts/rt1_drawer_variant_agg.sh b/scripts/rt1_drawer_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/rt1_drawer_visual_matching.sh b/scripts/rt1_drawer_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/rt1_move_near_variant_agg.sh b/scripts/rt1_move_near_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/rt1_move_near_visual_matching.sh b/scripts/rt1_move_near_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/rt1_pick_coke_can_variant_agg.sh b/scripts/rt1_pick_coke_can_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/rt1_pick_coke_can_visual_matching.sh b/scripts/rt1_pick_coke_can_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/rt1_put_in_drawer_variant_agg.sh b/scripts/rt1_put_in_drawer_variant_agg.sh
old mode 100644
new mode 100755
diff --git a/scripts/rt1_put_in_drawer_visual_matching.sh b/scripts/rt1_put_in_drawer_visual_matching.sh
old mode 100644
new mode 100755
diff --git a/scripts/rt1x_bridge.sh b/scripts/rt1x_bridge.sh
old mode 100644
new mode 100755
diff --git a/simpler_env/policies/openvla/openvla_model.py b/simpler_env/policies/openvla/openvla_model.py
index b21f1378..ae43ef0b 100644
--- a/simpler_env/policies/openvla/openvla_model.py
+++ b/simpler_env/policies/openvla/openvla_model.py
@@ -1,10 +1,8 @@
-from collections import deque
 from typing import Optional, Sequence
 import os
 import matplotlib.pyplot as plt
 import numpy as np
 from transforms3d.euler import euler2axangle
-from simpler_env.utils.action.action_ensemble import ActionEnsembler
 from transformers import AutoModelForVision2Seq, AutoProcessor
 from PIL import Image
 import torch
@@ -17,7 +15,7 @@ def __init__(
         saved_model_path: str = "openvla/openvla-7b",
         unnorm_key: Optional[str] = None,
         policy_setup: str = "widowx_bridge",
-        horizon: int = 2,
+        horizon: int = 1,
         pred_action_horizon: int = 1,
         exec_horizon: int = 1,
         image_size: list[int] = [224, 224],
@@ -26,13 +24,9 @@ def __init__(
         os.environ["TOKENIZERS_PARALLELISM"] = "false"
         if policy_setup == "widowx_bridge":
             unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key
-            action_ensemble = True
-            action_ensemble_temp = 0.0
             self.sticky_gripper_num_repeat = 1
         elif policy_setup == "google_robot":
             unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key
-            action_ensemble = True
-            action_ensemble_temp = 0.0
             self.sticky_gripper_num_repeat = 15
         else:
             raise NotImplementedError(
@@ -56,8 +50,6 @@ def __init__(
         self.horizon = horizon
         self.pred_action_horizon = pred_action_horizon
         self.exec_horizon = exec_horizon
-        self.action_ensemble = action_ensemble
-        self.action_ensemble_temp = action_ensemble_temp
 
         self.sticky_action_is_on = False
         self.gripper_action_repeat = 0
@@ -66,22 +58,10 @@ def __init__(
 
         self.task = None
         self.task_description = None
-        self.image_history = deque(maxlen=self.horizon)
-        if self.action_ensemble:
-            self.action_ensembler = ActionEnsembler(self.pred_action_horizon, self.action_ensemble_temp)
-        else:
-            self.action_ensembler = None
         self.num_image_history = 0
 
-    def _add_image_to_history(self, image: np.ndarray) -> None:
-        self.image_history.append(image)
-        self.num_image_history = min(self.num_image_history + 1, self.horizon)
-
     def reset(self, task_description: str) -> None:
         self.task_description = task_description
-        self.image_history.clear()
-        if self.action_ensemble:
-            self.action_ensembler.reset()
         self.num_image_history = 0
 
         self.sticky_action_is_on = False
@@ -109,7 +89,7 @@ def step(
                 self.reset(task_description)
 
         assert image.dtype == np.uint8
-        self._add_image_to_history(self._resize_image(image))
+        image = self._resize_image(image)
 
         image: Image.Image = Image.fromarray(image)
         prompt = task_description
@@ -119,8 +99,6 @@ def step(
         raw_actions = self.vla.predict_action(**inputs, unnorm_key=self.unnorm_key, do_sample=False)[None]
         # print(f"*** raw actions {raw_actions} ***")
 
-        if self.action_ensemble:
-            raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None]
         raw_action = {
             "world_vector": np.array(raw_actions[0, :3]),
             "rotation_delta": np.array(raw_actions[0, 3:6]),
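PATCH 04 removes the image history and the `ActionEnsembler` from the OpenVLA policy. The repository's `ActionEnsembler` implementation does not appear anywhere in this series, so purely for context, a common exponentially-weighted ensembling scheme of this shape looks roughly like the sketch below; this is an assumption about the general technique, not the repository's actual code. Note that with `pred_action_horizon=1`, as the policy is configured here, any such ensemble degenerates to returning the latest prediction, which is consistent with simply deleting it:

```python
# Generic exponentially-weighted action ensembling, shown only to motivate
# what PATCH 04 removes; NOT the repository's ActionEnsembler implementation.
from collections import deque

import numpy as np


class NaiveActionEnsembler:
    def __init__(self, pred_action_horizon: int, temp: float = 0.0):
        self.history = deque(maxlen=pred_action_horizon)
        self.temp = temp

    def reset(self):
        self.history.clear()

    def ensemble_action(self, action_chunk: np.ndarray) -> np.ndarray:
        self.history.append(np.asarray(action_chunk))
        ages = np.arange(len(self.history))[::-1]      # 0 = most recent prediction
        weights = np.exp(-self.temp * ages)            # temp=0 reduces to a plain mean
        stacked = np.stack([h[0] for h in self.history])  # each chunk's action for "now"
        return (weights[:, None] * stacked).sum(0) / weights.sum()


ens = NaiveActionEnsembler(pred_action_horizon=1, temp=0.0)
out = ens.ensemble_action(np.ones((1, 7)))  # identity when the horizon is 1
```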
= 0.0
             self.sticky_gripper_num_repeat = 15
         else:
             raise NotImplementedError(
@@ -56,8 +50,6 @@ def __init__(
         self.horizon = horizon
         self.pred_action_horizon = pred_action_horizon
         self.exec_horizon = exec_horizon
-        self.action_ensemble = action_ensemble
-        self.action_ensemble_temp = action_ensemble_temp
 
         self.sticky_action_is_on = False
         self.gripper_action_repeat = 0
@@ -66,22 +58,10 @@ def __init__(
 
         self.task = None
         self.task_description = None
-        self.image_history = deque(maxlen=self.horizon)
-        if self.action_ensemble:
-            self.action_ensembler = ActionEnsembler(self.pred_action_horizon, self.action_ensemble_temp)
-        else:
-            self.action_ensembler = None
         self.num_image_history = 0
 
-    def _add_image_to_history(self, image: np.ndarray) -> None:
-        self.image_history.append(image)
-        self.num_image_history = min(self.num_image_history + 1, self.horizon)
-
     def reset(self, task_description: str) -> None:
         self.task_description = task_description
-        self.image_history.clear()
-        if self.action_ensemble:
-            self.action_ensembler.reset()
         self.num_image_history = 0
 
         self.sticky_action_is_on = False
@@ -109,7 +89,7 @@ def step(
                 self.reset(task_description)
 
         assert image.dtype == np.uint8
-        self._add_image_to_history(self._resize_image(image))
+        image = self._resize_image(image)
 
         image: Image.Image = Image.fromarray(image)
         prompt = task_description
@@ -119,8 +99,6 @@ def step(
         raw_actions = self.vla.predict_action(**inputs, unnorm_key=self.unnorm_key, do_sample=False)[None]
         # print(f"*** raw actions {raw_actions} ***")
 
-        if self.action_ensemble:
-            raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None]
         raw_action = {
             "world_vector": np.array(raw_actions[0, :3]),
             "rotation_delta": np.array(raw_actions[0, 3:6]),

From 52727f9f29ab4306e946fa595c9d0392b95f5b62 Mon Sep 17 00:00:00 2001
From: xuanlinli17
Date: Sun, 28 Jul 2024 09:52:19 -0700
Subject: [PATCH 05/13] update readme

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index f232c0d4..ee813be5 100644
--- a/README.md
+++ b/README.md
@@ -289,6 +289,13 @@ If you are using CUDA 12, then to use GPU for Octo inference, you need CUDA version >= 12.2 to satisfy the requirement of Jax; in this case, you can perform a runfile install of the corresponding CUDA (e.g., version 12.3), then set the environment variables whenever you run Octo inference scripts:
 
 `PATH=/usr/local/cuda-12.3/bin:$PATH LD_LIBRARY_PATH=/usr/local/cuda-12.3/lib64:$LD_LIBRARY_PATH bash scripts/octo_xxx_script.sh`
 
+### OpenVLA Inference Setup
+
+```
+pip install torch==2.3.1 torchvision==0.18.1 timm==0.9.10 tokenizers==0.15.2 accelerate==0.32.1
+pip install flash-attn==2.6.1 --no-build-isolation
+```
+
 ## Troubleshooting
 
 1. If you encounter issues such as
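The README section added by PATCH 05 pins the OpenVLA inference stack. A quick way to confirm that the pinned versions can actually load the checkpoint is a snippet along these lines; it is an illustrative check, not part of the PR, and it mirrors the loading call that PATCH 01's policy class uses:

```python
# Sanity check for the OpenVLA dependency pins added to the README above.
# Assumptions: a CUDA GPU with bf16 support and the pinned flash-attn build installed.
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor

assert torch.cuda.is_available(), "OpenVLA inference in this repo assumes a CUDA device"

processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b",
    attn_implementation="flash_attention_2",  # requires the pinned flash-attn wheel
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).cuda()
print("loaded:", type(vla).__name__)
```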
If you encounter issues such as From 37d8fa8de5cd0f7b2dda472fa8ebd2b48c6994f0 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sat, 17 Aug 2024 23:40:26 -0700 Subject: [PATCH 06/13] Add OpenVLA metrics --- simpler_env/utils/metrics.py | 82 +++++++++++++++++++++++++ tools/calc_metrics_evaluation_videos.py | 15 +++++ 2 files changed, 97 insertions(+) diff --git a/simpler_env/utils/metrics.py b/simpler_env/utils/metrics.py index 39a251c9..019a5a2f 100644 --- a/simpler_env/utils/metrics.py +++ b/simpler_env/utils/metrics.py @@ -5,6 +5,33 @@ import numpy as np REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy] + "google_robot_pick_coke_can_horizontal": { + "rt-2-x": 0.920, + "rt-1-converged": 0.960, + "rt-1-15pct": 1.000, + "rt-1-x": 0.880, + "rt-1-begin": 0.200, + "octo-base": 0.440, + "openvla-7b": 0.640, + }, + "google_robot_pick_coke_can_vertical": { + "rt-2-x": 0.800, + "rt-1-converged": 0.880, + "rt-1-15pct": 0.960, + "rt-1-x": 0.560, + "rt-1-begin": 0.000, + "octo-base": 0.200, + "openvla-7b": 0.280, + }, + "google_robot_pick_coke_can_standing": { + "rt-2-x": 1.000, + "rt-1-converged": 0.720, + "rt-1-15pct": 0.800, + "rt-1-x": 0.840, + "rt-1-begin": 0.200, + "octo-base": 0.240, + "openvla-7b": 0.360, + }, "google_robot_pick_coke_can": { "rt-2-x": 0.907, "rt-1-converged": 0.853, @@ -12,6 +39,7 @@ "rt-1-x": 0.760, "rt-1-begin": 0.133, "octo-base": 0.293, + "openvla-7b": 0.427, }, "google_robot_move_near": { "rt-2-x": 0.733, @@ -20,6 +48,7 @@ "rt-1-x": 0.450, "rt-1-begin": 0.017, "octo-base": 0.350, + "openvla-7b": 0.667, }, "google_robot_open_drawer": { "rt-2-x": 0.333, @@ -28,6 +57,7 @@ "rt-1-x": 0.519, "rt-1-begin": 0.000, "octo-base": 0.148, + "openvla-7b": 0.111, }, "google_robot_close_drawer": { "rt-2-x": 0.630, @@ -36,6 +66,16 @@ "rt-1-x": 0.741, "rt-1-begin": 0.000, "octo-base": 0.519, + "openvla-7b": 0.148, + }, + "google_robot_drawer": { + "rt-2-x": 0.481, + "rt-1-converged": 0.870, + "rt-1-15pct": 0.796, + "rt-1-x": 0.630, + "rt-1-begin": 0.000, + "octo-base": 0.333, + "openvla-7b": 0.130, }, "google_robot_place_apple_in_closed_top_drawer": { "rt-2-x": 0.074, @@ -44,6 +84,7 @@ "rt-1-x": 0.407, "rt-1-begin": 0.000, "octo-base": 0.000, + "openvla-7b": 0.000, }, "widowx_spoon_on_towel": { "rt-1-x": 0.000, @@ -69,6 +110,33 @@ SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy] + "google_robot_pick_coke_can_horizontal": { + "rt-2-x": 0.740, + "rt-1-converged": 0.960, + "rt-1-15pct": 0.860, + "rt-1-x": 0.820, + "rt-1-begin": 0.050, + "octo-base": 0.210, + "openvla-7b": 0.310, + }, + "google_robot_pick_coke_can_vertical": { + "rt-2-x": 0.740, + "rt-1-converged": 0.900, + "rt-1-15pct": 0.790, + "rt-1-x": 0.330, + "rt-1-begin": 0.000, + "octo-base": 0.210, + "openvla-7b": 0.030, + }, + "google_robot_pick_coke_can_standing": { + "rt-2-x": 0.880, + "rt-1-converged": 0.710, + "rt-1-15pct": 0.480, + "rt-1-x": 0.550, + "rt-1-begin": 0.030, + "octo-base": 0.090, + "openvla-7b": 0.190, + }, "google_robot_pick_coke_can": { "rt-2-x": 0.787, "rt-1-converged": 0.857, @@ -76,6 +144,7 @@ "rt-1-x": 0.567, "rt-1-begin": 0.027, "octo-base": 0.170, + "openvla-7b": 0.177, }, "google_robot_move_near": { "rt-2-x": 0.779, @@ -84,6 +153,7 @@ "rt-1-x": 0.317, "rt-1-begin": 0.050, "octo-base": 0.042, + "openvla-7b": 0.492, }, "google_robot_open_drawer": { "rt-2-x": 0.157, @@ -92,6 +162,7 @@ "rt-1-x": 0.296, "rt-1-begin": 0.000, "octo-base": 0.009, + "openvla-7b": 0.250, }, "google_robot_close_drawer": { "rt-2-x": 0.343, @@ -100,6 
+171,16 @@ "rt-1-x": 0.891, "rt-1-begin": 0.278, "octo-base": 0.444, + "openvla-7b": 0.574, + }, + "google_robot_drawer": { + "rt-2-x": 0.250, + "rt-1-converged": 0.730, + "rt-1-15pct": 0.565, + "rt-1-x": 0.597, + "rt-1-begin": 0.139, + "octo-base": 0.227, + "openvla-7b": 0.412, }, "google_robot_place_apple_in_closed_top_drawer": { "rt-2-x": 0.037, @@ -108,6 +189,7 @@ "rt-1-x": 0.213, "rt-1-begin": 0.000, "octo-base": 0.000, + "openvla-7b": 0.000, }, "widowx_spoon_on_towel": { "rt-1-x": 0.000, diff --git a/tools/calc_metrics_evaluation_videos.py b/tools/calc_metrics_evaluation_videos.py index 1981ad25..3ea1eff5 100644 --- a/tools/calc_metrics_evaluation_videos.py +++ b/tools/calc_metrics_evaluation_videos.py @@ -27,6 +27,7 @@ def calc_pick_coke_can_stats(root_result_dir): "rt-1-x": 0.88, "rt-1-begin": 0.20, "octo-base": 0.44, + "openvla-7b": 0.64, }, "vertical": { "rt-2-x": 0.80, @@ -35,6 +36,7 @@ def calc_pick_coke_can_stats(root_result_dir): "rt-1-x": 0.56, "rt-1-begin": 0.00, "octo-base": 0.20, + "openvla-7b": 0.28, }, "standing": { "rt-2-x": 1.00, @@ -43,6 +45,7 @@ def calc_pick_coke_can_stats(root_result_dir): "rt-1-x": 0.84, "rt-1-begin": 0.20, "octo-base": 0.24, + "openvla-7b": 0.36, }, } @@ -282,6 +285,7 @@ def calc_move_near_stats(root_result_dir): "rt-1-x": 0.45, "rt-1-begin": 0.017, "octo-base": 0.35, + "openvla-7b": 0.667, } ckpt_alias_keys = list(move_near_real_success.keys()) @@ -413,6 +417,7 @@ def calc_drawer_stats(root_result_dir): "rt-1-x": 0.519, "rt-1-begin": 0.000, "octo-base": 0.148, + "openvla-7b": 0.111, }, "close": { "rt-2-x": 0.630, @@ -421,6 +426,7 @@ def calc_drawer_stats(root_result_dir): "rt-1-x": 0.741, "rt-1-begin": 0.000, "octo-base": 0.519, + "openvla-7b": 0.148, }, } @@ -642,6 +648,7 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): "rt-1-x": 0.407, "rt-1-begin": 0.000, "octo-base": 0.000, + "openvla-7b": 0.000, }, } @@ -855,21 +862,25 @@ def calc_bridge_put_on_env_stats(root_result_dir): "rt-1-x": 0.042, "octo-base": 0.500, "octo-small": 0.542, + "openvla-7b": 0.10, }, "put_carrot_on_plate": { "rt-1-x": 0.167, "octo-base": 0.500, "octo-small": 0.208, + "openvla-7b": 0.10, }, "stack_green_block_on_yellow_block": { "rt-1-x": 0.000, "octo-base": 0.292, "octo-small": 0.583, + "openvla-7b": 0.10, }, "put_eggplant_in_basket": { "rt-1-x": 0.000, "octo-base": 0.400, "octo-small": 0.600, + "openvla-7b": 0.10, }, } real_success_dict = { @@ -877,17 +888,20 @@ def calc_bridge_put_on_env_stats(root_result_dir): "rt-1-x": 0.000, "octo-base": 0.333, "octo-small": 0.417, + "openvla-7b": 0.10, }, "put_carrot_on_plate": {"rt-1-x": 0.00, "octo-base": 0.25, "octo-small": 0.083}, "stack_green_block_on_yellow_block": { "rt-1-x": 0.000, "octo-base": 0.000, "octo-small": 0.125, + "openvla-7b": 0.10, }, "put_eggplant_in_basket": { "rt-1-x": 0.000, "octo-base": 0.250, "octo-small": 0.400, + "openvla-7b": 0.10, }, } @@ -1023,6 +1037,7 @@ def calc_bridge_put_on_env_stats(root_result_dir): "octo-base": "octo-base", "octo-small": "octo-small", "octo-server": "octo-server", + "openvla-7b": "openvla-7b", } parser = argparse.ArgumentParser() From 3667e65213d9f0d826081b1f9d720297b4e70fae Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sat, 17 Aug 2024 23:55:38 -0700 Subject: [PATCH 07/13] update readme --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ee813be5..c998a097 100644 --- a/README.md +++ b/README.md @@ -23,9 +23,10 @@ We hope that our work guides and inspires future real-to-sim evaluation efforts. 
- [Code Structure](#code-structure) - [Adding New Policies](#adding-new-policies) - [Adding New Real-to-Sim Evaluation Environments and Robots](#adding-new-real-to-sim-evaluation-environments-and-robots) - - [Full Installation (RT-1 and Octo Inference, Env Building)](#full-installation-rt-1-and-octo-inference-env-building) + - [Full Installation (RT-1, Octo, OpenVLA Inference, Env Building)](#full-installation-rt-1-octo-openvla-inference-env-building) - [RT-1 Inference Setup](#rt-1-inference-setup) - [Octo Inference Setup](#octo-inference-setup) + - [OpenVLA Inference Setup](#openvla-inference-setup) - [Troubleshooting](#troubleshooting) - [Citation](#citation) @@ -97,7 +98,7 @@ cd {this_repo} pip install -e . ``` -**If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo), or add new robots and environments, please additionally follow the full installation instructions [here](#full-installation-rt-1-and-octo-inference-env-building).** +**If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo, OpenVLA), or add new robots and environments, please additionally follow the full installation instructions [here](#full-installation-rt-1-octo-openvla-inference-env-building).** ## Examples @@ -105,7 +106,7 @@ pip install -e . - Simple RT-1 and Octo evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py). - Colab notebook for RT-1 and Octo inference: see [this link](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb). - Environment interactive visualization and manual control: see [`ManiSkill2_real2sim/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py`](https://github.com/simpler-env/ManiSkill2_real2sim/blob/main/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py) -- Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, and Octo policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts). +- Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, Octo, and OpenVLA policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts). - Real-to-sim evaluation videos from running `scripts/*.sh`: see [this link](https://huggingface.co/datasets/xuanlinli17/simpler-env-eval-example-videos/tree/main). ## Current Environments @@ -219,7 +220,7 @@ If you want to use existing environments for evaluating new policies, you can ke We provide a step-by-step guide to add new real-to-sim evaluation environments and robots in [this README](ADDING_NEW_ENVS_ROBOTS.md) -## Full Installation (RT-1 and Octo Inference, Env Building) +## Full Installation (RT-1, Octo, OpenVLA Inference, Env Building) If you'd like to perform evaluations on our provided agents (e.g., RT-1, Octo), or add new robots and environments, please follow the full installation instructions below. 
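For reference, the OpenVLA inference path integrated above reduces to one `predict_action` call per control step. Below is a minimal standalone sketch, assuming the `openvla/openvla-7b` Hugging Face checkpoint and the `bridge_orig` unnorm key that `simpler_env/policies/openvla/openvla_model.py` defaults to; it mirrors that file but is illustrative only and not part of the patch series.

```python
# Minimal OpenVLA inference sketch. Checkpoint name, unnorm key, and the
# predict_action call mirror openvla_model.py above; the rest is assumed.
import numpy as np
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor

processor = AutoProcessor.from_pretrained("openvla/openvla-7b", trust_remote_code=True)
vla = AutoModelForVision2Seq.from_pretrained(
    "openvla/openvla-7b",
    attn_implementation="flash_attention_2",  # pairs with the flash-attn install above
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).cuda()

image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # stand-in for a sim frame
inputs = processor("put carrot on plate", image).to("cuda:0", dtype=torch.bfloat16)
# Returns a 7-DoF action: world_vector (3), rotation_delta (3), gripper (1),
# which openvla_model.py splits into its raw_action dict.
raw_action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)
print(raw_action.shape)  # (7,)
```

With `do_sample=False`, decoding is greedy and deterministic per frame, in line with the horizon-1, no-ensemble configuration the earlier hunks adopt.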
From 2b0ecc8ec6793ec2268e630b1710939feb998407 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sun, 18 Aug 2024 09:27:27 -0700 Subject: [PATCH 08/13] add openvla simple inference --- README.md | 2 +- ...imple_inference_visual_matching_prepackaged_envs.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c998a097..83239580 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ pip install -e . ## Examples -- Simple RT-1 and Octo evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py). +- Simple RT-1, Octo, and OpenVLA evaluation script on prepackaged environments with visual matching evaluation setup: see [`simpler_env/simple_inference_visual_matching_prepackaged_envs.py`](https://github.com/simpler-env/SimplerEnv/blob/main/simpler_env/simple_inference_visual_matching_prepackaged_envs.py). - Colab notebook for RT-1 and Octo inference: see [this link](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb). - Environment interactive visualization and manual control: see [`ManiSkill2_real2sim/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py`](https://github.com/simpler-env/ManiSkill2_real2sim/blob/main/mani_skill2_real2sim/examples/demo_manual_control_custom_envs.py) - Policy inference scripts to reproduce our Google Robot and WidowX real-to-sim evaluation results with sweeps over object / robot poses and advanced loggings. These contain both visual matching and variant aggregation evaluation setups along with RT-1, RT-1-X, Octo, and OpenVLA policies. See [`scripts/`](https://github.com/simpler-env/SimplerEnv/tree/main/scripts). 
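End to end, the hunk below exercises OpenVLA through the same episode loop as the existing RT-1/Octo prepackaged-env examples. A short sketch of that loop follows, assuming the `simpler_env.make`, `get_language_instruction`, and `get_image_from_maniskill2_obs_dict` helpers from the SimplerEnv API and the `google_robot_move_near_v1` task named in the docstring; the loop itself is not part of this patch.

```python
# Sketch: OpenVLA on a prepackaged visual-matching env. Helper names and the
# gymnasium-style step signature are assumptions based on the SimplerEnv API.
import numpy as np
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
from simpler_env.policies.openvla.openvla_model import OpenVLAInference

env = simpler_env.make("google_robot_move_near_v1")
model = OpenVLAInference(saved_model_path="openvla/openvla-7b", policy_setup="google_robot")

obs, reset_info = env.reset()
instruction = env.get_language_instruction()
model.reset(instruction)

done, truncated = False, False
while not (done or truncated):
    image = get_image_from_maniskill2_obs_dict(env, obs)  # RGB frame for the policy
    raw_action, action = model.step(image, instruction)   # action: world_vector / rot_axangle / gripper
    obs, reward, done, truncated, info = env.step(
        np.concatenate([action["world_vector"], action["rot_axangle"], action["gripper"]])
    )
print("episode stats:", info.get("episode_stats", info))  # assumes the usual info dict layout
```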
diff --git a/simpler_env/simple_inference_visual_matching_prepackaged_envs.py b/simpler_env/simple_inference_visual_matching_prepackaged_envs.py index 8bf23b87..2e09cd22 100644 --- a/simpler_env/simple_inference_visual_matching_prepackaged_envs.py +++ b/simpler_env/simple_inference_visual_matching_prepackaged_envs.py @@ -6,6 +6,8 @@ --ckpt-path ./checkpoints/rt_1_tf_trained_for_000400120 --task google_robot_pick_coke_can --logging-root ./results_simple_eval/ --n-trajs 10 python simpler_env/simple_inference_visual_matching_prepackaged_envs.py --policy octo-small \ --ckpt-path None --task widowx_spoon_on_towel --logging-root ./results_simple_eval/ --n-trajs 10 + python simpler_env/simple_inference_visual_matching_prepackaged_envs.py --policy openvla/openvla-7b \ + --ckpt-path None --task google_robot_move_near_v1 --logging-root ./results_simple_eval/ --n-trajs 10 """ import argparse @@ -21,7 +23,7 @@ parser = argparse.ArgumentParser() -parser.add_argument("--policy", default="rt1", choices=["rt1", "octo-base", "octo-small"]) +parser.add_argument("--policy", default="rt1", choices=["rt1", "octo-base", "octo-small", "openvla/openvla-7b"]) parser.add_argument( "--ckpt-path", type=str, @@ -37,7 +39,7 @@ parser.add_argument("--n-trajs", type=int, default=10) args = parser.parse_args() -if args.policy in ["octo-base", "octo-small"]: +if args.policy in ["octo-base", "octo-small", "openvla/openvla-7b"]: if args.ckpt_path in [None, "None"] or "rt_1_x" in args.ckpt_path: args.ckpt_path = args.policy if args.ckpt_path[-1] == "/": @@ -75,6 +77,10 @@ from simpler_env.policies.octo.octo_model import OctoInference model = OctoInference(model_type=args.ckpt_path, policy_setup=policy_setup, init_rng=0) +elif "openvla" in args.policy: + from simpler_env.policies.openvla.openvla_model import OpenVLAInference + + model = OpenVLAInference(saved_model_path=args.ckpt_path, policy_setup=policy_setup) else: raise NotImplementedError() From 2cd7aaea91a7ed19d81c7652b4c9f01e157e6444 Mon Sep 17 00:00:00 2001 From: xuanlinli17 Date: Sun, 18 Aug 2024 09:44:30 -0700 Subject: [PATCH 09/13] minor readme modification --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 83239580..c46c40d8 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,7 @@ simpler_env/ policies/: policy implementations rt1/: RT-1 policy implementation octo/: Octo policy implementation + openvla/: OpenVLA policy implementation utils/: env/: environment building and observation utilities debug/: debugging tools for policies and robots @@ -206,7 +207,7 @@ scripts/: example bash scripts for policy inference under our variant aggregatio If you want to use existing environments for evaluating new policies, you can keep `./ManiSkill2_real2sim` as is. -1. Implement new policy inference scripts in `simpler_env/policies/{your_new_policy}`, following the examples for RT-1 (`simpler_env/policies/rt1`) and Octo (`simpler_env/policies/octo`) policies. +1. Implement new policy inference scripts in `simpler_env/policies/{your_new_policy}`, following the examples for RT-1 (`simpler_env/policies/rt1`), Octo (`simpler_env/policies/octo`), and OpenVLA (`simpler_env/policies/openvla`) policies. 2. You can now use `simpler_env/simple_inference_visual_matching_prepackaged_envs.py` to perform policy evaluations in simulation. 
- If the policy behaviors deviate a lot from those in the real-world, you can write similar scripts as in `simpler_env/utils/debug/{policy_name}_inference_real_video.py` to debug the policy behaviors. The debugging script performs policy inference by feeding real eval video frames into the policy. If the policy behavior still deviates significantly from real, this may suggest that policy actions are processed incorrectly into the simulation environments. Please double check action orderings and action spaces. 3. If you'd like to perform customized evaluations, From 8a0802956b51bffc8d06eb74a0e5d6c6848f81a1 Mon Sep 17 00:00:00 2001 From: DelinQu Date: Sun, 29 Dec 2024 18:12:01 +0800 Subject: [PATCH 10/13] fix drawer bugs and auto params --- scripts/{openvla_bridge.sh => bridge.sh} | 17 +- ...r_variant_agg.sh => drawer_variant_agg.sh} | 16 +- scripts/drawer_visual_matching.sh | 135 ++++ .../misc/octo_drawer_variant_agg_alt_urdf.sh | 87 --- .../octo_move_near_variant_agg_alt_urdf.sh | 144 ---- ...octo_pick_coke_can_variant_agg_alt_urdf.sh | 183 ----- ...ariant_agg.sh => move_near_variant_agg.sh} | 26 +- ...tching.sh => move_near_visual_matching.sh} | 14 +- scripts/octo_drawer_variant_agg.sh | 85 --- scripts/octo_drawer_visual_matching.sh | 135 ---- scripts/octo_move_near_variant_agg.sh | 139 ---- scripts/octo_move_near_visual_matching.sh | 48 -- scripts/octo_pick_coke_can_variant_agg.sh | 182 ----- scripts/octo_pick_coke_can_visual_matching.sh | 43 -- scripts/octo_put_in_drawer_variant_agg.sh | 80 --- scripts/octo_put_in_drawer_visual_matching.sh | 66 -- scripts/openvla_drawer_visual_matching.sh | 132 ---- ...nt_agg.sh => pick_coke_can_variant_agg.sh} | 28 +- ...ng.sh => pick_coke_can_visual_matching.sh} | 14 +- ...nt_agg.sh => put_in_drawer_variant_agg.sh} | 16 +- ...ng.sh => put_in_drawer_visual_matching.sh} | 24 +- scripts/run_cogact.sh | 39 ++ scripts/run_octo.sh | 39 ++ scripts/run_openvla.sh | 39 ++ scripts/run_rt1.sh | 39 ++ simpler_env/evaluation/argparse.py | 3 +- .../evaluation/maniskill2_evaluator.py | 39 +- simpler_env/main_inference.py | 16 +- simpler_env/policies/octo/octo_model.py | 12 +- simpler_env/policies/rt1/rt1_model.py | 14 +- simpler_env/policies/sim_cogact/__init__.py | 2 + .../policies/sim_cogact/adaptive_ensemble.py | 44 ++ .../policies/sim_cogact/cogact_policy.py | 232 +++++++ .../sim_cogact/scripts/cogact_bridge.sh | 20 +- .../scripts/cogact_drawer_variant_agg.sh | 8 +- .../scripts/cogact_drawer_visual_matching.sh | 26 +- .../scripts/cogact_move_near_variant_agg.sh | 18 +- .../cogact_move_near_visual_matching.sh | 4 +- .../cogact_pick_coke_can_variant_agg.sh | 19 +- .../cogact_pick_coke_can_visual_matching.sh | 6 +- .../cogact_put_in_drawer_variant_agg.sh | 8 +- .../cogact_put_in_drawer_visual_matching.sh | 12 +- .../sim_cogact/scripts}/octo_bridge.sh | 12 +- ...erence_visual_matching_prepackaged_envs.py | 10 +- simpler_env/utils/action/action_ensemble.py | 2 +- simpler_env/utils/metrics.py | 270 +++++--- simpler_env/utils/visualization.py | 14 + tools/calc_metrics_evaluation_videos.py | 642 +++++++++++++----- 48 files changed, 1429 insertions(+), 1774 deletions(-) rename scripts/{openvla_bridge.sh => bridge.sh} (71%) mode change 100755 => 100644 rename scripts/{rt1_drawer_variant_agg.sh => drawer_variant_agg.sh} (86%) mode change 100755 => 100644 create mode 100644 scripts/drawer_visual_matching.sh delete mode 100644 scripts/misc/octo_drawer_variant_agg_alt_urdf.sh delete mode 100644 scripts/misc/octo_move_near_variant_agg_alt_urdf.sh delete mode 100644 
scripts/misc/octo_pick_coke_can_variant_agg_alt_urdf.sh rename scripts/{rt1_move_near_variant_agg.sh => move_near_variant_agg.sh} (79%) mode change 100755 => 100644 rename scripts/{rt1_move_near_visual_matching.sh => move_near_visual_matching.sh} (77%) mode change 100755 => 100644 delete mode 100755 scripts/octo_drawer_variant_agg.sh delete mode 100755 scripts/octo_drawer_visual_matching.sh delete mode 100755 scripts/octo_move_near_variant_agg.sh delete mode 100755 scripts/octo_move_near_visual_matching.sh delete mode 100755 scripts/octo_pick_coke_can_variant_agg.sh delete mode 100755 scripts/octo_pick_coke_can_visual_matching.sh delete mode 100755 scripts/octo_put_in_drawer_variant_agg.sh delete mode 100755 scripts/octo_put_in_drawer_visual_matching.sh delete mode 100755 scripts/openvla_drawer_visual_matching.sh rename scripts/{rt1_pick_coke_can_variant_agg.sh => pick_coke_can_variant_agg.sh} (82%) mode change 100755 => 100644 rename scripts/{rt1_pick_coke_can_visual_matching.sh => pick_coke_can_visual_matching.sh} (79%) mode change 100755 => 100644 rename scripts/{rt1_put_in_drawer_variant_agg.sh => put_in_drawer_variant_agg.sh} (86%) mode change 100755 => 100644 rename scripts/{rt1_put_in_drawer_visual_matching.sh => put_in_drawer_visual_matching.sh} (75%) mode change 100755 => 100644 create mode 100644 scripts/run_cogact.sh create mode 100644 scripts/run_octo.sh create mode 100644 scripts/run_openvla.sh create mode 100644 scripts/run_rt1.sh create mode 100644 simpler_env/policies/sim_cogact/__init__.py create mode 100644 simpler_env/policies/sim_cogact/adaptive_ensemble.py create mode 100644 simpler_env/policies/sim_cogact/cogact_policy.py rename scripts/rt1x_bridge.sh => simpler_env/policies/sim_cogact/scripts/cogact_bridge.sh (89%) mode change 100755 => 100644 rename scripts/openvla_drawer_variant_agg.sh => simpler_env/policies/sim_cogact/scripts/cogact_drawer_variant_agg.sh (91%) mode change 100755 => 100644 rename scripts/rt1_drawer_visual_matching.sh => simpler_env/policies/sim_cogact/scripts/cogact_drawer_visual_matching.sh (82%) mode change 100755 => 100644 rename scripts/openvla_move_near_variant_agg.sh => simpler_env/policies/sim_cogact/scripts/cogact_move_near_variant_agg.sh (90%) mode change 100755 => 100644 rename scripts/openvla_move_near_visual_matching.sh => simpler_env/policies/sim_cogact/scripts/cogact_move_near_visual_matching.sh (92%) mode change 100755 => 100644 rename scripts/openvla_pick_coke_can_variant_agg.sh => simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_variant_agg.sh (92%) mode change 100755 => 100644 rename scripts/openvla_pick_coke_can_visual_matching.sh => simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_visual_matching.sh (93%) mode change 100755 => 100644 rename scripts/openvla_put_in_drawer_variant_agg.sh => simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_variant_agg.sh (91%) mode change 100755 => 100644 rename scripts/openvla_put_in_drawer_visual_matching.sh => simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_visual_matching.sh (84%) mode change 100755 => 100644 rename {scripts => simpler_env/policies/sim_cogact/scripts}/octo_bridge.sh (92%) mode change 100755 => 100644 diff --git a/scripts/openvla_bridge.sh b/scripts/bridge.sh old mode 100755 new mode 100644 similarity index 71% rename from scripts/openvla_bridge.sh rename to scripts/bridge.sh index 80e733ef..e346f543 --- a/scripts/openvla_bridge.sh +++ b/scripts/bridge.sh @@ -1,6 +1,9 @@ -gpu_id=0 -policy_model=openvla 
-ckpt_path="openvla/openvla-7b" +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 + scene_name=bridge_table_1_v1 robot=widowx @@ -8,7 +11,7 @@ rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png robot_init_x=0.147 robot_init_y=0.028 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot ${robot} --policy-setup widowx_bridge \ --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ @@ -16,7 +19,7 @@ python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot ${robot} --policy-setup widowx_bridge \ --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ @@ -24,7 +27,7 @@ python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot ${robot} --policy-setup widowx_bridge \ --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ @@ -39,7 +42,7 @@ rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png robot_init_x=0.127 robot_init_y=0.06 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot ${robot} --policy-setup widowx_bridge \ --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name PutEggplantInBasketScene-v0 --scene-name ${scene_name} \ diff --git a/scripts/rt1_drawer_variant_agg.sh b/scripts/drawer_variant_agg.sh old mode 100755 new mode 100644 similarity index 86% rename from scripts/rt1_drawer_variant_agg.sh rename to scripts/drawer_variant_agg.sh index 983229b6..17113f21 --- a/scripts/rt1_drawer_variant_agg.sh +++ b/scripts/drawer_variant_agg.sh @@ -1,13 +1,11 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +ckpt_path=$1 +policy_model=$2 
+action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 - - -declare -a ckpt_paths=( -"./checkpoints/rt_1_tf_trained_for_000400120/" -"./checkpoints/rt_1_tf_trained_for_000058240/" -"./checkpoints/rt_1_x_tf_trained_for_002272480_step/" -"./checkpoints/rt_1_tf_trained_for_000001120/" -) +declare -a ckpt_paths=(${ckpt_path}) declare -a env_names=( OpenTopDrawerCustomInScene-v0 @@ -27,7 +25,7 @@ scene_name=frl_apartment_stage_simple EvalSim() { echo ${ckpt_path} ${env_name} - python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/drawer_visual_matching.sh b/scripts/drawer_visual_matching.sh new file mode 100644 index 00000000..93869004 --- /dev/null +++ b/scripts/drawer_visual_matching.sh @@ -0,0 +1,135 @@ +# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 + +declare -a ckpt_paths=($ckpt_path) + +declare -a env_names=( + OpenTopDrawerCustomInScene-v0 + OpenMiddleDrawerCustomInScene-v0 + OpenBottomDrawerCustomInScene-v0 + CloseTopDrawerCustomInScene-v0 + CloseMiddleDrawerCustomInScene-v0 + CloseBottomDrawerCustomInScene-v0 +) + +# URDF variations +declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) + +for urdf_version in "${urdf_version_arr[@]}"; do + + EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version}" + + EvalOverlay() { + # A0 + echo ${ckpt_path} ${env_name} ${urdf_version} + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ + ${EXTRA_ARGS} + + # A1 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.765 0.765 1 --robot-init-y -0.182 -0.182 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.02 -0.02 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png \ + ${EXTRA_ARGS} + + # A2 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path 
${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.889 0.889 1 --robot-init-y -0.203 -0.203 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.06 -0.06 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png \ + ${EXTRA_ARGS} + + # B0 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ + ${EXTRA_ARGS} + + # B1 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.752 0.752 1 --robot-init-y 0.009 0.009 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png \ + ${EXTRA_ARGS} + + # B2 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.851 0.851 1 --robot-init-y 0.035 0.035 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png \ + ${EXTRA_ARGS} + + # C0 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ + ${EXTRA_ARGS} + + # C1 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + 
--robot-init-x 0.765 0.765 1 --robot-init-y 0.222 0.222 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png \ + ${EXTRA_ARGS} + + # C2 + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ + --robot google_robot_static \ + --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ + --env-name ${env_name} --scene-name dummy_drawer \ + --robot-init-x 0.865 0.865 1 --robot-init-y 0.222 0.222 1 \ + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ + --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ + --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png \ + ${EXTRA_ARGS} + } + + + for ckpt_path in "${ckpt_paths[@]}"; do + for env_name in "${env_names[@]}"; do + EvalOverlay + done + done + +done diff --git a/scripts/misc/octo_drawer_variant_agg_alt_urdf.sh b/scripts/misc/octo_drawer_variant_agg_alt_urdf.sh deleted file mode 100644 index 9440be5d..00000000 --- a/scripts/misc/octo_drawer_variant_agg_alt_urdf.sh +++ /dev/null @@ -1,87 +0,0 @@ -# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - - -declare -a policy_models=( -"octo-base" -) - -declare -a env_names=( -OpenTopDrawerCustomInScene-v0 -OpenMiddleDrawerCustomInScene-v0 -OpenBottomDrawerCustomInScene-v0 -CloseTopDrawerCustomInScene-v0 -CloseMiddleDrawerCustomInScene-v0 -CloseBottomDrawerCustomInScene-v0 -) - -urdf_version=recolor_sim - -EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs urdf_version=${urdf_version}" - - -# base setup -scene_name=frl_apartment_stage_simple - -EvalSim() { - echo ${policy_model} ${env_name} - - python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.65 0.85 3 --robot-init-y -0.2 0.2 3 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - ${EXTRA_ARGS} -} - - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EvalSim - done -done - - -# backgrounds - -declare -a scene_names=( -"modern_bedroom_no_roof" -"modern_office_no_roof" -) - -for scene_name in "${scene_names[@]}"; do - for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version}" - EvalSim - done - done -done - - -# lightings -scene_name=frl_apartment_stage_simple - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version} light_mode=brighter" - EvalSim - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version} light_mode=darker" - EvalSim - done -done - - -# new cabinets -scene_name=frl_apartment_stage_simple - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt 
urdf_version=${urdf_version} station_name=mk_station2" - EvalSim - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt urdf_version=${urdf_version} station_name=mk_station3" - EvalSim - done -done diff --git a/scripts/misc/octo_move_near_variant_agg_alt_urdf.sh b/scripts/misc/octo_move_near_variant_agg_alt_urdf.sh deleted file mode 100644 index 11c1aa09..00000000 --- a/scripts/misc/octo_move_near_variant_agg_alt_urdf.sh +++ /dev/null @@ -1,144 +0,0 @@ - -gpu_id=0 - -declare -a arr=("octo-base") -urdf_version=recolor_sim - -for policy_model in "${arr[@]}"; do echo "$policy_model"; done - - -# base setup - -env_name=MoveNearGoogleInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs urdf_version=${urdf_version}; - -done - - - -# distractor - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs no_distractor=True urdf_version=${urdf_version}; - -done - - -# backgrounds - -env_name=MoveNearGoogleInScene-v0 -declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ - "google_pick_coke_can_1_v4_alt_background_2") - -for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs urdf_version=${urdf_version}; - -done - -done - - - - - -# lighting - -env_name=MoveNearGoogleInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_darker_lighting=True urdf_version=${urdf_version}; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - 
--control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_brighter_lighting=True urdf_version=${urdf_version}; - -done - - - - - -# table textures - -env_name=MoveNearGoogleInScene-v0 -declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ - "Baked_sc1_staging_objaverse_cabinet2_h870") - -for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_brighter_lighting=True urdf_version=${urdf_version}; - -done - -done - - - - -# camera orientations - -declare -a env_arr=("MoveNearAltGoogleCameraInScene-v0" \ - "MoveNearAltGoogleCamera2InScene-v0") -scene_name=google_pick_coke_can_1_v4 - -for env_name in "${env_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_brighter_lighting=True urdf_version=${urdf_version}; - -done - -done diff --git a/scripts/misc/octo_pick_coke_can_variant_agg_alt_urdf.sh b/scripts/misc/octo_pick_coke_can_variant_agg_alt_urdf.sh deleted file mode 100644 index 5d7b3dc5..00000000 --- a/scripts/misc/octo_pick_coke_can_variant_agg_alt_urdf.sh +++ /dev/null @@ -1,183 +0,0 @@ - -gpu_id=0 - -declare -a arr=("octo-base") - -# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically -declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") -urdf_version=recolor_sim -# recolor_sim - -for policy_model in "${arr[@]}"; do echo "$policy_model"; done - - -# base setup - -env_name=GraspSingleOpenedCokeCanInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - - - -# table textures - 
-env_name=GraspSingleOpenedCokeCanInScene-v0 - -declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ - "Baked_sc1_staging_objaverse_cabinet2_h870") - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - -done - - - - -# distractors - -env_name=GraspSingleOpenedCokeCanDistractorInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version} distractor_config=more; - -done - -done - - - - -# backgrounds - -env_name=GraspSingleOpenedCokeCanInScene-v0 -declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ - "google_pick_coke_can_1_v4_alt_background_2") - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - -done - - - -# lightings - -env_name=GraspSingleOpenedCokeCanInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} 
--scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version} slightly_darker_lighting=True; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version} slightly_brighter_lighting=True; - -done - -done - - - - -# camera orientations - -declare -a env_arr=("GraspSingleOpenedCokeCanAltGoogleCameraInScene-v0" \ - "GraspSingleOpenedCokeCanAltGoogleCamera2InScene-v0") -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for env_name in "${env_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - -done diff --git a/scripts/rt1_move_near_variant_agg.sh b/scripts/move_near_variant_agg.sh old mode 100755 new mode 100644 similarity index 79% rename from scripts/rt1_move_near_variant_agg.sh rename to scripts/move_near_variant_agg.sh index 7c95760c..d4430537 --- a/scripts/rt1_move_near_variant_agg.sh +++ b/scripts/move_near_variant_agg.sh @@ -1,10 +1,10 @@ +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 -gpu_id=0 - -declare -a arr=("./checkpoints/rt_1_x_tf_trained_for_002272480_step/" \ - "./checkpoints/rt_1_tf_trained_for_000400120/" \ - "./checkpoints/rt_1_tf_trained_for_000058240/" \ - "./checkpoints/rt_1_tf_trained_for_000001120/") +declare -a arr=("${ckpt_path}") for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done @@ -16,7 +16,7 @@ scene_name=google_pick_coke_can_1_v4 for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -31,7 +31,7 @@ done for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static 
\ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -52,7 +52,7 @@ for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -74,7 +74,7 @@ scene_name=google_pick_coke_can_1_v4 for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -82,7 +82,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ --additional-env-build-kwargs slightly_darker_lighting=True; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -106,7 +106,7 @@ for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -130,7 +130,7 @@ for env_name in "${env_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_move_near_visual_matching.sh b/scripts/move_near_visual_matching.sh old mode 100755 new mode 100644 similarity index 77% rename from scripts/rt1_move_near_visual_matching.sh rename to scripts/move_near_visual_matching.sh index 1949333d..d14a2e0e --- a/scripts/rt1_move_near_visual_matching.sh +++ b/scripts/move_near_visual_matching.sh @@ -1,10 +1,10 @@ +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 -gpu_id=0 - -declare -a 
arr=("./checkpoints/rt_1_tf_trained_for_000400120/" \ - "./checkpoints/rt_1_x_tf_trained_for_002272480_step/" \ - "./checkpoints/rt_1_tf_trained_for_000058240/" \ - "./checkpoints/rt_1_tf_trained_for_000001120/") +declare -a arr=($ckpt_path) env_name=MoveNearGoogleBakedTexInScene-v0 # env_name=MoveNearGoogleBakedTexInScene-v1 @@ -21,7 +21,7 @@ for urdf_version in "${urdf_version_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/octo_drawer_variant_agg.sh b/scripts/octo_drawer_variant_agg.sh deleted file mode 100755 index a566b369..00000000 --- a/scripts/octo_drawer_variant_agg.sh +++ /dev/null @@ -1,85 +0,0 @@ -# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - - -declare -a policy_models=( -"octo-base" -) - -declare -a env_names=( -OpenTopDrawerCustomInScene-v0 -OpenMiddleDrawerCustomInScene-v0 -OpenBottomDrawerCustomInScene-v0 -CloseTopDrawerCustomInScene-v0 -CloseMiddleDrawerCustomInScene-v0 -CloseBottomDrawerCustomInScene-v0 -) - -EXTRA_ARGS="--enable-raytracing" - - -# base setup -scene_name=frl_apartment_stage_simple - -EvalSim() { - echo ${policy_model} ${env_name} - - python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.65 0.85 3 --robot-init-y -0.2 0.2 3 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - ${EXTRA_ARGS} -} - - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EvalSim - done -done - - -# backgrounds - -declare -a scene_names=( -"modern_bedroom_no_roof" -"modern_office_no_roof" -) - -for scene_name in "${scene_names[@]}"; do - for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt" - EvalSim - done - done -done - - -# lightings -scene_name=frl_apartment_stage_simple - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter" - EvalSim - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=darker" - EvalSim - done -done - - -# new cabinets -scene_name=frl_apartment_stage_simple - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2" - EvalSim - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station3" - EvalSim - done -done diff --git a/scripts/octo_drawer_visual_matching.sh b/scripts/octo_drawer_visual_matching.sh deleted file mode 100755 index 41e8c893..00000000 --- a/scripts/octo_drawer_visual_matching.sh +++ /dev/null @@ -1,135 +0,0 @@ -# shader_dir=rt means that we turn on ray-tracing rendering; this is 
quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - - -declare -a policy_models=( -"octo-base" -# "octo-server" -) - -declare -a env_names=( -OpenTopDrawerCustomInScene-v0 -OpenMiddleDrawerCustomInScene-v0 -OpenBottomDrawerCustomInScene-v0 -CloseTopDrawerCustomInScene-v0 -CloseMiddleDrawerCustomInScene-v0 -CloseBottomDrawerCustomInScene-v0 -) - -# URDF variations -declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) - -for urdf_version in "${urdf_version_arr[@]}"; do - -EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version}" - -EvalOverlay() { -# A0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ - ${EXTRA_ARGS} - -# A1 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.765 0.765 1 --robot-init-y -0.182 -0.182 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.02 -0.02 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png \ - ${EXTRA_ARGS} - -# A2 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.889 0.889 1 --robot-init-y -0.203 -0.203 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.06 -0.06 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png \ - ${EXTRA_ARGS} - -# B0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ - ${EXTRA_ARGS} - -# B1 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.752 0.752 1 --robot-init-y 0.009 0.009 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png \ - ${EXTRA_ARGS} - -# 
B2 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.851 0.851 1 --robot-init-y 0.035 0.035 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png \ - ${EXTRA_ARGS} - -# C0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ - ${EXTRA_ARGS} - -# C1 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.765 0.765 1 --robot-init-y 0.222 0.222 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png \ - ${EXTRA_ARGS} - -# C2 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.865 0.865 1 --robot-init-y 0.222 0.222 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png \ - ${EXTRA_ARGS} -} - - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EvalOverlay - done -done - - -done diff --git a/scripts/octo_move_near_variant_agg.sh b/scripts/octo_move_near_variant_agg.sh deleted file mode 100755 index 43733968..00000000 --- a/scripts/octo_move_near_variant_agg.sh +++ /dev/null @@ -1,139 +0,0 @@ - -gpu_id=0 - -declare -a arr=("octo-base") - -for policy_model in "${arr[@]}"; do echo "$policy_model"; done - - -# base setup - -env_name=MoveNearGoogleInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; - -done - - - -# distractor - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name 
${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs no_distractor=True; - -done - - -# backgrounds - -env_name=MoveNearGoogleInScene-v0 -declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ - "google_pick_coke_can_1_v4_alt_background_2") - -for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; - -done - -done - - - - - -# lighting - -env_name=MoveNearGoogleInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_darker_lighting=True; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs slightly_brighter_lighting=True; - -done - - - - - -# table textures - -env_name=MoveNearGoogleInScene-v0 -declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ - "Baked_sc1_staging_objaverse_cabinet2_h870") - -for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; - -done - -done - - - - -# camera orientations - -declare -a env_arr=("MoveNearAltGoogleCameraInScene-v0" \ - "MoveNearAltGoogleCamera2InScene-v0") -scene_name=google_pick_coke_can_1_v4 - -for env_name in "${env_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 
1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1; - -done - -done diff --git a/scripts/octo_move_near_visual_matching.sh b/scripts/octo_move_near_visual_matching.sh deleted file mode 100755 index dd89caee..00000000 --- a/scripts/octo_move_near_visual_matching.sh +++ /dev/null @@ -1,48 +0,0 @@ - - -gpu_id=0 - -declare -a policy_models=( - "octo-base" - # "octo-server" -) - -env_name=MoveNearGoogleBakedTexInScene-v0 -# env_name=MoveNearGoogleBakedTexInScene-v1 -scene_name=google_pick_coke_can_1_v4 -rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_1.png -# rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_move_near_real_eval_2.png - -# URDF variations -declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1") - -for policy_model in "${policy_models[@]}"; do echo "$policy_model"; done - - -for urdf_version in "${urdf_version_arr[@]}"; - -do for policy_model in "${policy_models[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --rgb-overlay-path ${rgb_overlay_path} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.21 0.21 1 --obj-variation-mode episode --obj-episode-range 0 60 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ - --additional-env-build-kwargs urdf_version=${urdf_version} \ - --additional-env-save-tags baked_except_bpb_orange; # google_move_near_real_eval_1.png - -# do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ -# --robot google_robot_static \ -# --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ -# --env-name ${env_name} --scene-name ${scene_name} \ -# --rgb-overlay-path ${rgb_overlay_path} \ -# --robot-init-x 0.36 0.36 1 --robot-init-y 0.22 0.22 1 --obj-variation-mode episode --obj-episode-range 0 60 \ -# --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.028 -0.028 1 \ -# --additional-env-build-kwargs urdf_version=${urdf_version} \ -# --additional-env-save-tags baked_except_bpb_orange; # google_move_near_real_eval_2.png - -done - -done diff --git a/scripts/octo_pick_coke_can_variant_agg.sh b/scripts/octo_pick_coke_can_variant_agg.sh deleted file mode 100755 index c446b5d1..00000000 --- a/scripts/octo_pick_coke_can_variant_agg.sh +++ /dev/null @@ -1,182 +0,0 @@ - -gpu_id=0 - -declare -a arr=("octo-base") - -# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically -declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") - -for policy_model in "${arr[@]}"; do echo "$policy_model"; done - - -# base setup - -env_name=GraspSingleOpenedCokeCanInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} 
\ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option}; - -done - -done - - - -# table textures - -env_name=GraspSingleOpenedCokeCanInScene-v0 - -declare -a scene_arr=("Baked_sc1_staging_objaverse_cabinet1_h870" \ - "Baked_sc1_staging_objaverse_cabinet2_h870") - - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option}; - -done - -done - -done - - - - -# distractors - -env_name=GraspSingleOpenedCokeCanDistractorInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option}; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} distractor_config=more; - -done - -done - - - - -# backgrounds - -env_name=GraspSingleOpenedCokeCanInScene-v0 -declare -a scene_arr=("google_pick_coke_can_1_v4_alt_background" \ - "google_pick_coke_can_1_v4_alt_background_2") - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for scene_name in "${scene_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option}; - -done - -done - -done - - - -# lightings - -env_name=GraspSingleOpenedCokeCanInScene-v0 -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python 
simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} slightly_darker_lighting=True; - -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} slightly_brighter_lighting=True; - -done - -done - - - - -# camera orientations - -declare -a env_arr=("GraspSingleOpenedCokeCanAltGoogleCameraInScene-v0" \ - "GraspSingleOpenedCokeCanAltGoogleCamera2InScene-v0") -scene_name=google_pick_coke_can_1_v4 - -for coke_can_option in "${coke_can_options_arr[@]}"; - -do for env_name in "${env_arr[@]}"; - -do for policy_model in "${arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option}; - -done - -done - -done diff --git a/scripts/octo_pick_coke_can_visual_matching.sh b/scripts/octo_pick_coke_can_visual_matching.sh deleted file mode 100755 index c633a339..00000000 --- a/scripts/octo_pick_coke_can_visual_matching.sh +++ /dev/null @@ -1,43 +0,0 @@ - - -gpu_id=0 - -declare -a policy_models=( - "octo-base" - # "octo-small" - # "octo-server" -) - -# lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically -declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") - -# URDF variations -declare -a urdf_version_arr=(None "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" "recolor_cabinet_visual_matching_1") - -env_name=GraspSingleOpenedCokeCanInScene-v0 -scene_name=google_pick_coke_can_1_v4 -rgb_overlay_path=./ManiSkill2_real2sim/data/real_inpainting/google_coke_can_real_eval_1.png - -for policy_model in "${policy_models[@]}"; do echo "$policy_model"; done - - -for urdf_version in "${urdf_version_arr[@]}"; - -do for policy_model in "${policy_models[@]}"; - -do for coke_can_option in "${coke_can_options_arr[@]}"; - -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static --policy-setup google_robot \ - --control-freq 3 --sim-freq 501 --max-episode-steps 80 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --rgb-overlay-path ${rgb_overlay_path} \ - --robot-init-x 0.35 0.35 1 --robot-init-y 0.20 0.20 1 --obj-init-x -0.35 -0.12 5 --obj-init-y -0.02 0.42 5 \ - 
--robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --additional-env-build-kwargs ${coke_can_option} urdf_version=${urdf_version}; - -done - -done - -done diff --git a/scripts/octo_put_in_drawer_variant_agg.sh b/scripts/octo_put_in_drawer_variant_agg.sh deleted file mode 100755 index b833a896..00000000 --- a/scripts/octo_put_in_drawer_variant_agg.sh +++ /dev/null @@ -1,80 +0,0 @@ -# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - - -declare -a policy_models=( -"octo-base" -) - -declare -a env_names=( -PlaceIntoClosedTopDrawerCustomInScene-v0 -) - -EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs model_ids=apple" - - -# base setup -scene_name=frl_apartment_stage_simple - -EvalSim() { - echo ${policy_model} ${env_name} - - python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ - --env-name ${env_name} --scene-name ${scene_name} \ - --robot-init-x 0.65 0.65 1 --robot-init-y -0.2 0.2 3 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0.0 0.0 1 \ - --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ - ${EXTRA_ARGS} -} - - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EvalSim - done -done - - -# backgrounds - -declare -a scene_names=( -"modern_bedroom_no_roof" -"modern_office_no_roof" -) - -for scene_name in "${scene_names[@]}"; do - for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt model_ids=apple" - EvalSim - done - done -done - - -# lightings -scene_name=frl_apartment_stage_simple - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=brighter model_ids=apple" - EvalSim - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt light_mode=darker model_ids=apple" - EvalSim - done -done - - -# new cabinets -scene_name=frl_apartment_stage_simple - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station2 model_ids=apple" - EvalSim - EXTRA_ARGS="--additional-env-build-kwargs shader_dir=rt station_name=mk_station3 model_ids=apple" - EvalSim - done -done diff --git a/scripts/octo_put_in_drawer_visual_matching.sh b/scripts/octo_put_in_drawer_visual_matching.sh deleted file mode 100755 index f945815c..00000000 --- a/scripts/octo_put_in_drawer_visual_matching.sh +++ /dev/null @@ -1,66 +0,0 @@ -# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - - -declare -a policy_models=( -"octo-base" -# "octo-server" -) - -declare -a env_names=( -PlaceIntoClosedTopDrawerCustomInScene-v0 -# PlaceIntoClosedMiddleDrawerCustomInScene-v0 -# PlaceIntoClosedBottomDrawerCustomInScene-v0 -) - -# URDF variations -declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) - -for urdf_version in "${urdf_version_arr[@]}"; do - -EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple 
disable_bad_material=True urdf_version=${urdf_version} model_ids=baked_apple_v2" - -EvalOverlay() { -# A0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ - --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ - ${EXTRA_ARGS} - -# B0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ - ${EXTRA_ARGS} - -# C0 -python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path None \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ - ${EXTRA_ARGS} -} - - -for policy_model in "${policy_models[@]}"; do - for env_name in "${env_names[@]}"; do - EvalOverlay - done -done - - -done diff --git a/scripts/openvla_drawer_visual_matching.sh b/scripts/openvla_drawer_visual_matching.sh deleted file mode 100755 index a021660e..00000000 --- a/scripts/openvla_drawer_visual_matching.sh +++ /dev/null @@ -1,132 +0,0 @@ -# shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth -declare -a ckpt_paths=( -"openvla/openvla-7b" -) - -declare -a env_names=( -OpenTopDrawerCustomInScene-v0 -OpenMiddleDrawerCustomInScene-v0 -OpenBottomDrawerCustomInScene-v0 -CloseTopDrawerCustomInScene-v0 -CloseMiddleDrawerCustomInScene-v0 -CloseBottomDrawerCustomInScene-v0 -) - -# URDF variations -declare -a urdf_version_arr=("recolor_cabinet_visual_matching_1" "recolor_tabletop_visual_matching_1" "recolor_tabletop_visual_matching_2" None) - -for urdf_version in "${urdf_version_arr[@]}"; do - -EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_station_recolor light_mode=simple disable_bad_material=True urdf_version=${urdf_version}" - -EvalOverlay() { -# A0 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.644 0.644 1 --robot-init-y -0.179 -0.179 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.03 -0.03 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \ - ${EXTRA_ARGS} - 
-# A1 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.765 0.765 1 --robot-init-y -0.182 -0.182 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.02 -0.02 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a1.png \ - ${EXTRA_ARGS} - -# A2 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.889 0.889 1 --robot-init-y -0.203 -0.203 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.06 -0.06 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a2.png \ - ${EXTRA_ARGS} - -# B0 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.652 0.652 1 --robot-init-y 0.009 0.009 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \ - ${EXTRA_ARGS} - -# B1 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.752 0.752 1 --robot-init-y 0.009 0.009 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b1.png \ - ${EXTRA_ARGS} - -# B2 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.851 0.851 1 --robot-init-y 0.035 0.035 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b2.png \ - ${EXTRA_ARGS} - -# C0 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.665 0.665 1 --robot-init-y 0.224 0.224 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \ - ${EXTRA_ARGS} - -# C1 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.765 0.765 1 
--robot-init-y 0.222 0.222 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c1.png \ - ${EXTRA_ARGS} - -# C2 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ - --robot google_robot_static \ - --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ - --env-name ${env_name} --scene-name dummy_drawer \ - --robot-init-x 0.865 0.865 1 --robot-init-y 0.222 0.222 1 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.025 -0.025 1 \ - --obj-init-x-range 0 0 1 --obj-init-y-range 0 0 1 \ - --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c2.png \ - ${EXTRA_ARGS} -} - - -for ckpt_path in "${ckpt_paths[@]}"; do - for env_name in "${env_names[@]}"; do - EvalOverlay - done -done - - - -done diff --git a/scripts/rt1_pick_coke_can_variant_agg.sh b/scripts/pick_coke_can_variant_agg.sh old mode 100755 new mode 100644 similarity index 82% rename from scripts/rt1_pick_coke_can_variant_agg.sh rename to scripts/pick_coke_can_variant_agg.sh index 63b36e1e..f5e1427a --- a/scripts/rt1_pick_coke_can_variant_agg.sh +++ b/scripts/pick_coke_can_variant_agg.sh @@ -1,10 +1,10 @@ +ckpt_path=$1 +policy_model=$2 +action_ensemble_temp=$3 +logging_dir=$4 +gpu_id=$5 -gpu_id=0 - -declare -a arr=("./checkpoints/rt_1_x_tf_trained_for_002272480_step/" \ - "./checkpoints/rt_1_tf_trained_for_000400120/" \ - "./checkpoints/rt_1_tf_trained_for_000058240/" \ - "./checkpoints/rt_1_tf_trained_for_000001120/") +declare -a arr=($ckpt_path) # lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") @@ -21,7 +21,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -49,7 +49,7 @@ do for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -75,7 +75,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name 
${env_name} --scene-name ${scene_name} \ @@ -83,7 +83,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ --additional-env-build-kwargs ${coke_can_option}; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -110,7 +110,7 @@ do for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -135,7 +135,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -143,7 +143,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ --additional-env-build-kwargs ${coke_can_option} slightly_darker_lighting=True; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -170,7 +170,7 @@ do for env_name in "${env_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_pick_coke_can_visual_matching.sh b/scripts/pick_coke_can_visual_matching.sh old mode 100755 new mode 100644 similarity index 79% rename from scripts/rt1_pick_coke_can_visual_matching.sh rename to scripts/pick_coke_can_visual_matching.sh index b0086684..5fa0451a --- a/scripts/rt1_pick_coke_can_visual_matching.sh +++ b/scripts/pick_coke_can_visual_matching.sh @@ -1,11 +1,11 @@ +ckpt_path=$1 
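+# positional args supplied by the scripts/run_*.sh drivers (see `bash scripts/$task ...` there):
+#   $1 checkpoint path, $2 policy model, $3 action ensemble temperature, $4 logging dir, $5 GPU id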
+policy_model=$2
+action_ensemble_temp=$3
+logging_dir=$4
+gpu_id=$5
+declare -a arr=($ckpt_path)
-gpu_id=0
-
-declare -a arr=("./checkpoints/rt_1_tf_trained_for_000400120/" \
-                "./checkpoints/rt_1_tf_trained_for_000058240/" \
-                "./checkpoints/rt_1_x_tf_trained_for_002272480_step/" \
-                "./checkpoints/rt_1_tf_trained_for_000001120/" )
 
 # lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically
 declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True")
@@ -27,7 +27,7 @@ do for coke_can_option in "${coke_can_options_arr[@]}";
 
 do for ckpt_path in "${arr[@]}";
 
-do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \
+do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \
   --robot google_robot_static \
   --control-freq 3 --sim-freq 513 --max-episode-steps 80 \
   --env-name ${env_name} --scene-name ${scene_name} \
diff --git a/scripts/rt1_put_in_drawer_variant_agg.sh b/scripts/put_in_drawer_variant_agg.sh
old mode 100755
new mode 100644
similarity index 86%
rename from scripts/rt1_put_in_drawer_variant_agg.sh
rename to scripts/put_in_drawer_variant_agg.sh
index 8aacf728..e92d5454
--- a/scripts/rt1_put_in_drawer_variant_agg.sh
+++ b/scripts/put_in_drawer_variant_agg.sh
@@ -1,13 +1,11 @@
 # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth
+ckpt_paths=($1)
+policy_model=$2
+action_ensemble_temp=$3
+logging_dir=$4
+gpu_id=$5
-
-
-declare -a ckpt_paths=(
-"./checkpoints/rt_1_tf_trained_for_000400120/"
-"./checkpoints/rt_1_tf_trained_for_000058240/"
-"./checkpoints/rt_1_x_tf_trained_for_002272480_step/"
-"./checkpoints/rt_1_tf_trained_for_000001120/"
-)
+declare -a arr=("${ckpt_paths[@]}")
 
 declare -a env_names=(
 PlaceIntoClosedTopDrawerCustomInScene-v0
@@ -22,7 +20,7 @@ scene_name=frl_apartment_stage_simple
 EvalSim() {
   echo ${ckpt_path} ${env_name}
 
-  python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \
+  CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \
     --robot google_robot_static \
     --control-freq 3 --sim-freq 513 --max-episode-steps 200 \
     --env-name ${env_name} --scene-name ${scene_name} \
diff --git a/scripts/rt1_put_in_drawer_visual_matching.sh b/scripts/put_in_drawer_visual_matching.sh
old mode 100755
new mode 100644
similarity index 75%
rename from scripts/rt1_put_in_drawer_visual_matching.sh
rename to scripts/put_in_drawer_visual_matching.sh
index 3d250f6d..a42dd4bd
--- a/scripts/rt1_put_in_drawer_visual_matching.sh
+++ b/scripts/put_in_drawer_visual_matching.sh
@@ -1,12 +1,11 @@
 # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth
+ckpt_paths=($1)
+policy_model=$2
+action_ensemble_temp=$3
+logging_dir=$4
+gpu_id=$5
-declare -a ckpt_paths=(
-"./checkpoints/rt_1_tf_trained_for_000400120/"
-"./checkpoints/rt_1_tf_trained_for_000058240/"
-"./checkpoints/rt_1_x_tf_trained_for_002272480_step/"
-"./checkpoints/rt_1_tf_trained_for_000001120/"
-)
-
+declare -a arr=("${ckpt_paths[@]}")
 declare -a env_names=(
 PlaceIntoClosedTopDrawerCustomInScene-v0
@@ -25,7 +24,7 @@ EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_st
 
 EvalOverlay() {
 # A0
-python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \
+CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \
   --robot google_robot_static \
   --control-freq 3 --sim-freq 513 --max-episode-steps 200 \
   --env-name ${env_name} --scene-name dummy_drawer \
@@ -34,9 +33,9 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path}
   --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \
   --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_a0.png \
   ${EXTRA_ARGS}
-
+echo "Done A0"
 # B0
-python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \
+CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \
   --robot google_robot_static \
   --control-freq 3 --sim-freq 513 --max-episode-steps 200 \
   --env-name ${env_name} --scene-name dummy_drawer \
@@ -45,9 +44,9 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path}
   --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \
   --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_b0.png \
   ${EXTRA_ARGS}
-
+echo "Done B0"
 # C0
-python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \
+CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} --action-ensemble-temp ${action_ensemble_temp} --logging-dir ${logging_dir} \
   --robot google_robot_static \
   --control-freq 3 --sim-freq 513 --max-episode-steps 200 \
   --env-name ${env_name} --scene-name dummy_drawer \
@@ -56,6 +55,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path}
   --obj-init-x-range -0.08 -0.02 3 --obj-init-y-range -0.02 0.08 3 \
   --rgb-overlay-path ./ManiSkill2_real2sim/data/real_inpainting/open_drawer_c0.png \
   ${EXTRA_ARGS}
+echo "Done C0"
 }
diff --git a/scripts/run_cogact.sh b/scripts/run_cogact.sh
new file mode 100644
index 00000000..be4df642
--- /dev/null
+++ b/scripts/run_cogact.sh
@@ -0,0 +1,39 @@
+model_name=cogact
+tasks=(
+    bridge.sh
+    drawer_variant_agg.sh
+    drawer_visual_matching.sh
+    pick_coke_can_variant_agg.sh
+    pick_coke_can_visual_matching.sh
+    move_near_variant_agg.sh
+    move_near_visual_matching.sh
+    put_in_drawer_variant_agg.sh
+    put_in_drawer_visual_matching.sh
+)
+
+ckpts=(
+    CogACT/CogACT-Large
+)
+
+action_ensemble_temp=-0.8
+for ckpt_path in ${ckpts[@]}; do
+    base_dir=$(dirname $ckpt_path)
+
+    # evaluation in simulator
+    # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp}
+    logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp}
+    mkdir -p $logging_dir
+    for i in ${!tasks[@]}; do
+        task=${tasks[$i]}
+        echo "🚀 running $task ..."
+        device=0
+        session_name=CUDA${device}-$(basename $logging_dir)-${task}
+        bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device
+    done
+
+    # statistics of evaluation results
+    echo "🚀 all tasks DONE! Calculating metrics..."
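+    # calc_metrics_evaluation_videos.py aggregates the per-task results under --log-dir-root;
+    # its stdout is appended to $logging_dir/total.metrics by the redirect below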
+    python tools/calc_metrics_evaluation_videos.py \
+        --log-dir-root $logging_dir \
+        >>$logging_dir/total.metrics
+done
diff --git a/scripts/run_octo.sh b/scripts/run_octo.sh
new file mode 100644
index 00000000..bb6116d3
--- /dev/null
+++ b/scripts/run_octo.sh
@@ -0,0 +1,39 @@
+model_name=octo-base
+tasks=(
+    bridge.sh
+    drawer_variant_agg.sh
+    drawer_visual_matching.sh
+    move_near_variant_agg.sh
+    move_near_visual_matching.sh
+    pick_coke_can_variant_agg.sh
+    pick_coke_can_visual_matching.sh
+    put_in_drawer_variant_agg.sh
+    put_in_drawer_visual_matching.sh
+)
+
+ckpts=(
+    None
+)
+
+action_ensemble_temp=-0.8
+for ckpt_path in ${ckpts[@]}; do
+    base_dir=$(dirname $ckpt_path)
+
+    # evaluation in simulator
+    # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp}
+    logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp}
+    mkdir -p $logging_dir
+    for i in ${!tasks[@]}; do
+        task=${tasks[$i]}
+        echo "🚀 running $task ..."
+        device=0
+        session_name=CUDA${device}-$(basename $logging_dir)-${task}
+        bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device
+    done
+
+    # statistics of evaluation results
+    echo "🚀 all tasks DONE! Calculating metrics..."
+    python tools/calc_metrics_evaluation_videos.py \
+        --log-dir-root $logging_dir \
+        >>$logging_dir/total.metrics
+done
diff --git a/scripts/run_openvla.sh b/scripts/run_openvla.sh
new file mode 100644
index 00000000..fb5f3c43
--- /dev/null
+++ b/scripts/run_openvla.sh
@@ -0,0 +1,39 @@
+model_name=openvla
+tasks=(
+    bridge.sh
+    drawer_variant_agg.sh
+    drawer_visual_matching.sh
+    move_near_variant_agg.sh
+    move_near_visual_matching.sh
+    pick_coke_can_variant_agg.sh
+    pick_coke_can_visual_matching.sh
+    put_in_drawer_variant_agg.sh
+    put_in_drawer_visual_matching.sh
+)
+
+ckpts=(
+    ~/projects/vla_ptm/pretrained/openvla-7b
+)
+
+action_ensemble_temp=-0.8
+for ckpt_path in ${ckpts[@]}; do
+    base_dir=$(dirname $ckpt_path)
+
+    # evaluation in simulator
+    # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp}
+    logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp}
+    mkdir -p $logging_dir
+    for i in ${!tasks[@]}; do
+        task=${tasks[$i]}
+        echo "🚀 running $task ..."
+        device=0
+        session_name=CUDA${device}-$(basename $logging_dir)-${task}
+        bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device
+    done
+
+    # statistics of evaluation results
+    echo "🚀 all tasks DONE! Calculating metrics..."
+    python tools/calc_metrics_evaluation_videos.py \
+        --log-dir-root $logging_dir \
+        >>$logging_dir/total.metrics
+done
diff --git a/scripts/run_rt1.sh b/scripts/run_rt1.sh
new file mode 100644
index 00000000..eb803c34
--- /dev/null
+++ b/scripts/run_rt1.sh
@@ -0,0 +1,39 @@
+model_name=rt1
+tasks=(
+    bridge.sh
+    drawer_variant_agg.sh
+    drawer_visual_matching.sh
+    move_near_variant_agg.sh
+    move_near_visual_matching.sh
+    pick_coke_can_variant_agg.sh
+    pick_coke_can_visual_matching.sh
+    put_in_drawer_variant_agg.sh
+    put_in_drawer_visual_matching.sh
+)
+
+ckpts=(
+    ../pretrained/rt_1_tf_trained_for_000400120
+)
+
+action_ensemble_temp=-0.8
+for ckpt_path in ${ckpts[@]}; do
+    base_dir=$(dirname $ckpt_path)
+
+    # evaluation in simulator
+    # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp}
+    logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp}
+    mkdir -p $logging_dir
+    for i in ${!tasks[@]}; do
+        task=${tasks[$i]}
+        echo "🚀 running $task ..."
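+        # each task script receives: checkpoint, model name, ensemble temperature, logging dir, GPU id
+        # (session_name below is only an informational label, presumably for a tmux/screen session;
+        # it is not passed to the task scripts)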
+        device=0
+        session_name=CUDA${device}-$(basename $logging_dir)-${task}
+        bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device
+    done
+
+    # statistics of evaluation results
+    echo "🚀 all tasks DONE! Calculating metrics..."
+    python tools/calc_metrics_evaluation_videos.py \
+        --log-dir-root $logging_dir \
+        >>$logging_dir/total.metrics
+done
diff --git a/simpler_env/evaluation/argparse.py b/simpler_env/evaluation/argparse.py
index d17e4d88..9da3e992 100644
--- a/simpler_env/evaluation/argparse.py
+++ b/simpler_env/evaluation/argparse.py
@@ -44,7 +44,7 @@ def get_args():
         help="Obtain image observation from this camera for policy input. None = default",
     )
     parser.add_argument("--action-scale", type=float, default=1.0)
-
+    parser.add_argument("--action-ensemble-temp", type=float, default=-0.8)
     parser.add_argument("--control-freq", type=int, default=3)
     parser.add_argument("--sim-freq", type=int, default=513)
     parser.add_argument("--max-episode-steps", type=int, default=80)
@@ -99,7 +99,6 @@ def get_args():
         default=[-0.02, 0.42, 5],
         help="[ymin, ymax, num]",
     )
-
     parser.add_argument(
         "--additional-env-build-kwargs",
         nargs="+",
diff --git a/simpler_env/evaluation/maniskill2_evaluator.py b/simpler_env/evaluation/maniskill2_evaluator.py
index 99a788e0..a9ea438d 100644
--- a/simpler_env/evaluation/maniskill2_evaluator.py
+++ b/simpler_env/evaluation/maniskill2_evaluator.py
@@ -7,9 +7,12 @@
 import numpy as np
 from transforms3d.euler import quat2euler
 
-from simpler_env.utils.env.env_builder import build_maniskill2_env, get_robot_control_mode
+from simpler_env.utils.env.env_builder import (
+    build_maniskill2_env,
+    get_robot_control_mode,
+)
 from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
-from simpler_env.utils.visualization import write_video
+from simpler_env.utils.visualization import write_interval_video, write_video
 
 
 def run_maniskill2_eval_single_episode(
@@ -36,7 +39,6 @@
     additional_env_save_tags=None,
     logging_dir="./results",
 ):
-
     if additional_env_build_kwargs is None:
         additional_env_build_kwargs = {}
 
@@ -62,7 +64,7 @@
         **additional_env_build_kwargs,
         **kwargs,
     )
-
+    # __import__('ipdb').set_trace()
     # initialize environment
     env_reset_options = {
         "robot_init_options": {
@@ -84,7 +86,7 @@
     }
     obs, _ = env.reset(options=env_reset_options)
     # for long-horizon environments, we check if the current subtask is the final subtask
-    is_final_subtask = env.is_final_subtask() 
+    is_final_subtask = env.is_final_subtask()
 
     # Obtain language instruction
     if instruction is not None:
@@ -105,8 +107,10 @@
     timestep = 0
     success = "failure"
+    # action_ensemble = model.action_ensemble_temp if hasattr(model, "action_ensemble") else "none"
 
     # Step the environment
+    task_descriptions = []
     while not (predicted_terminated or truncated):
         # step the model; "raw_action" is raw model action output; "action" is the processed action to be sent into maniskill env
         raw_action, action = model.step(image, task_description)
@@ -120,9 +124,11 @@
         # step the environment
         obs, reward, done, truncated, info = env.step(
-            np.concatenate([action["world_vector"], action["rot_axangle"], action["gripper"]]),
+            np.concatenate(
+                [action["world_vector"], action["rot_axangle"], action["gripper"]]
+            ),
         )
-
+
         success = "success" if done else "failure"
         new_task_description = env.get_language_instruction()
         if new_task_description != task_description:
@@ -132,14 +138,18 @@
 
         print(timestep, info)
 
-        image = get_image_from_maniskill2_obs_dict(env, obs, camera_name=obs_camera_name)
+        image = get_image_from_maniskill2_obs_dict(
+            env, obs, camera_name=obs_camera_name
+        )
         images.append(image)
+        task_descriptions.append(task_description)
         timestep += 1
 
     episode_stats = info.get("episode_stats", {})
 
     # save video
     env_save_name = env_name
+
     for k, v in additional_env_build_kwargs.items():
         env_save_name = env_save_name + f"_{k}_{v}"
     if additional_env_save_tags is not None:
@@ -158,7 +168,7 @@
     else:
         rgb_overlay_path_str = "None"
     r, p, y = quat2euler(robot_init_quat)
-    video_path = f"{ckpt_path_basename}/{scene_name}/{control_mode}/{env_save_name}/rob_{robot_init_x}_{robot_init_y}_rot_{r:.3f}_{p:.3f}_{y:.3f}_rgb_overlay_{rgb_overlay_path_str}/{video_name}"
+    video_path = f"{scene_name}/{control_mode}/{env_save_name}/rob_{robot_init_x}_{robot_init_y}_rot_{r:.3f}_{p:.3f}_{y:.3f}_rgb_overlay_{rgb_overlay_path_str}/{video_name}"
     video_path = os.path.join(logging_dir, video_path)
     write_video(video_path, images, fps=5)
 
@@ -168,7 +178,6 @@
         os.makedirs(action_root, exist_ok=True)
         action_path = action_root + os.path.basename(action_path)
         model.visualize_epoch(predicted_actions, images, save_path=action_path)
-
     return success == "success"
 
 
@@ -211,8 +220,14 @@
                     )
                 )
         elif args.obj_variation_mode == "episode":
-            for obj_episode_id in range(args.obj_episode_range[0], args.obj_episode_range[1]):
-                success_arr.append(run_maniskill2_eval_single_episode(obj_episode_id=obj_episode_id, **kwargs))
+            for obj_episode_id in range(
+                args.obj_episode_range[0], args.obj_episode_range[1]
+            ):
+                success_arr.append(
+                    run_maniskill2_eval_single_episode(
+                        obj_episode_id=obj_episode_id, **kwargs
+                    )
+                )
         else:
             raise NotImplementedError()
 
diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py
index 699c00af..4ac6eb36 100644
--- a/simpler_env/main_inference.py
+++ b/simpler_env/main_inference.py
@@ -5,9 +5,6 @@
 
 from simpler_env.evaluation.argparse import get_args
 from simpler_env.evaluation.maniskill2_evaluator import maniskill2_evaluator
-from simpler_env.policies.octo.octo_server_model import OctoServerInference
-from simpler_env.policies.rt1.rt1_model import RT1Inference
-from simpler_env.policies.openvla.openvla_model import OpenVLAInference
 
 try:
     from simpler_env.policies.octo.octo_model import OctoInference
@@ -32,6 +29,7 @@
     print(f"**** {args.policy_model} ****")
     # policy model creation; update this if you are using a new policy model
     if args.policy_model == "rt1":
+        from simpler_env.policies.rt1.rt1_model import RT1Inference
         assert args.ckpt_path is not None
         model = RT1Inference(
             saved_model_path=args.ckpt_path,
@@ -39,6 +37,7 @@
             action_scale=args.action_scale,
         )
     elif "octo" in args.policy_model:
+        from simpler_env.policies.octo.octo_server_model import OctoServerInference
         if args.ckpt_path is None or args.ckpt_path == "None":
             args.ckpt_path = args.policy_model
         if "server" in args.policy_model:
@@ -56,11 +55,22 @@
         )
     elif args.policy_model == "openvla":
         assert args.ckpt_path is not None
+        from simpler_env.policies.openvla.openvla_model import OpenVLAInference
         model = OpenVLAInference(
             saved_model_path=args.ckpt_path,
             policy_setup=args.policy_setup,
             action_scale=args.action_scale,
         )
+    elif args.policy_model == "cogact":
+        from simpler_env.policies.sim_cogact import CogACTInference
+        assert args.ckpt_path is not None
+        model = CogACTInference(
+            saved_model_path=args.ckpt_path,  # e.g., CogACT/CogACT-Base
+            policy_setup=args.policy_setup,
+            action_scale=args.action_scale,
+            action_model_type='DiT-L',
+            cfg_scale=1.5  # CFG scales from 1.5 to 7 also perform well
+        )
     else:
         raise NotImplementedError()
 
diff --git a/simpler_env/policies/octo/octo_model.py b/simpler_env/policies/octo/octo_model.py
index 92110985..e05a51e0 100644
--- a/simpler_env/policies/octo/octo_model.py
+++ b/simpler_env/policies/octo/octo_model.py
@@ -57,7 +57,9 @@ def __init__(
             self.action_std = self.model.dataset_statistics[dataset_id]["action"]["std"]
         else:
             raise NotImplementedError()
-
+        print(
+            f"*** policy setup: {policy_setup}, dataset_id: {dataset_id}, model_type: {model_type}, mean {self.action_mean}, std {self.action_std} ***"
+        )
         self.image_size = image_size
         self.action_scale = action_scale
         self.horizon = horizon
@@ -127,7 +129,9 @@ def reset(self, task_description: str) -> None:
         # self.gripper_is_closed = False
         self.previous_gripper_action = None
 
-    def step(self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
+    def step(
+        self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs
+    ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
         """
         Input:
             image: np.ndarray of shape (H, W, 3), uint8
@@ -241,7 +245,9 @@ def step(self, image: np.ndarray, task_description: Optional[str] = None, *args,
 
         return raw_action, action
 
-    def visualize_epoch(self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str) -> None:
+    def visualize_epoch(
+        self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str
+    ) -> None:
         images = [self._resize_image(image) for image in images]
         ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"]
 
diff --git a/simpler_env/policies/rt1/rt1_model.py b/simpler_env/policies/rt1/rt1_model.py
index c838217b..cf52726d 100644
--- a/simpler_env/policies/rt1/rt1_model.py
+++ b/simpler_env/policies/rt1/rt1_model.py
@@ -116,7 +116,9 @@ def reset(self, task_description: str) -> None:
         self._initialize_task_description(task_description)
 
     @staticmethod
-    def _small_action_filter_google_robot(raw_action: dict[str, np.ndarray | tf.Tensor], arm_movement: bool = False, gripper: bool = True) -> dict[str, np.ndarray | tf.Tensor]:
+    def _small_action_filter_google_robot(
+        raw_action: dict[str, np.ndarray | tf.Tensor], arm_movement: bool = False, gripper: bool = True
+    ) -> dict[str, np.ndarray | tf.Tensor]:
         # small action filtering for google robot
         if arm_movement:
             raw_action["world_vector"] = tf.where(
@@ -147,7 +149,9 @@ def _small_action_filter_google_robot(raw_action: dict[str, np.ndarray | tf.Tens
         )
         return raw_action
 
-    def step(self, image: np.ndarray, task_description: Optional[str] = None) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
+    def step(
+        self, image: np.ndarray, task_description: Optional[str] = None
+    ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]:
         """
         Input:
             image: np.ndarray of shape (H, W, 3), uint8
@@ -165,7 +169,7 @@ def step(self, image: np.ndarray, task_description: Optional[str] = None) -> tup
                 # task description has changed; update language embedding
                 # self._initialize_task_description(task_description)
                 self.reset(task_description)
-
+
         assert image.dtype == np.uint8
         image = self._resize_image(image)
self.observation["image"] = image @@ -229,7 +233,9 @@ def step(self, image: np.ndarray, task_description: Optional[str] = None) -> tup return raw_action, action - def visualize_epoch(self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str) -> None: + def visualize_epoch( + self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str + ) -> None: images = [self._resize_image(image) for image in images] predicted_action_name_to_values_over_time = defaultdict(list) figure_layout = [ diff --git a/simpler_env/policies/sim_cogact/__init__.py b/simpler_env/policies/sim_cogact/__init__.py new file mode 100644 index 00000000..43a37cfc --- /dev/null +++ b/simpler_env/policies/sim_cogact/__init__.py @@ -0,0 +1,2 @@ +from .cogact_policy import CogACTInference +from .adaptive_ensemble import AdaptiveEnsembler \ No newline at end of file diff --git a/simpler_env/policies/sim_cogact/adaptive_ensemble.py b/simpler_env/policies/sim_cogact/adaptive_ensemble.py new file mode 100644 index 00000000..520f85cc --- /dev/null +++ b/simpler_env/policies/sim_cogact/adaptive_ensemble.py @@ -0,0 +1,44 @@ +""" +adaptive_ensemble.py + +""" + +from collections import deque +import numpy as np + + +class AdaptiveEnsembler: + def __init__(self, pred_action_horizon, adaptive_ensemble_alpha=0.0): + self.pred_action_horizon = pred_action_horizon + self.action_history = deque(maxlen=self.pred_action_horizon) + self.adaptive_ensemble_alpha = adaptive_ensemble_alpha + + def reset(self): + self.action_history.clear() + + def ensemble_action(self, cur_action): + self.action_history.append(cur_action) + num_actions = len(self.action_history) + if cur_action.ndim == 1: + curr_act_preds = np.stack(self.action_history) + else: + curr_act_preds = np.stack( + [pred_actions[i] for (i, pred_actions) in zip(range(num_actions - 1, -1, -1), self.action_history)] + ) + + # calculate cosine similarity between the current prediction and all previous predictions + ref = curr_act_preds[num_actions-1, :] + previous_pred = curr_act_preds + dot_product = np.sum(previous_pred * ref, axis=1) + norm_previous_pred = np.linalg.norm(previous_pred, axis=1) + norm_ref = np.linalg.norm(ref) + cos_similarity = dot_product / (norm_previous_pred * norm_ref + 1e-7) + + # compute the weights for each prediction + weights = np.exp(self.adaptive_ensemble_alpha * cos_similarity) + weights = weights / weights.sum() + + # compute the weighted average across all predictions for this timestep + cur_action = np.sum(weights[:, None] * curr_act_preds, axis=0) + + return cur_action \ No newline at end of file diff --git a/simpler_env/policies/sim_cogact/cogact_policy.py b/simpler_env/policies/sim_cogact/cogact_policy.py new file mode 100644 index 00000000..f3609018 --- /dev/null +++ b/simpler_env/policies/sim_cogact/cogact_policy.py @@ -0,0 +1,232 @@ +""" +cogact_policy.py + +""" +from collections import deque +from typing import Optional, Sequence +import os +from PIL import Image +import torch +import cv2 as cv +import matplotlib.pyplot as plt +import numpy as np + +from transforms3d.euler import euler2axangle +from transformers import AutoModelForVision2Seq, AutoProcessor + + +from vla import load_vla +from sim_cogact.adaptive_ensemble import AdaptiveEnsembler + +class CogACTInference: + def __init__( + self, + saved_model_path: str = 'CogACT/CogACT-Base', + unnorm_key: Optional[str] = None, + policy_setup: str = "widowx_bridge", + horizon: int = 0, + action_ensemble_horizon: Optional[int] = None, 
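+        # ensembling defaults: when `action_ensemble_horizon` is None it is set per robot
+        # below (7 for widowx_bridge, 2 for google_robot), bounding how many overlapping
+        # chunk predictions the AdaptiveEnsembler fuses at each control step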
+ image_size: list[int] = [224, 224], + future_action_window_size: int = 15, + action_dim: int = 7, + action_model_type: str = "DiT-L", + action_scale: float = 1.0, + cfg_scale: float = 1.5, + use_ddim: bool = True, + num_ddim_steps: int = 10, + use_bf16: bool = True, + action_ensemble = True, + adaptive_ensemble_alpha = 0.1, + ) -> None: + os.environ["TOKENIZERS_PARALLELISM"] = "false" + if policy_setup == "widowx_bridge": + unnorm_key = "bridge_orig" if unnorm_key is None else unnorm_key + action_ensemble = action_ensemble + adaptive_ensemble_alpha = adaptive_ensemble_alpha + if action_ensemble_horizon is None: + # Set 7 for widowx_bridge to fix the window size of motion scale between each frame. see appendix in our paper for details + action_ensemble_horizon = 7 + self.sticky_gripper_num_repeat = 1 + elif policy_setup == "google_robot": + unnorm_key = "fractal20220817_data" if unnorm_key is None else unnorm_key + action_ensemble = action_ensemble + adaptive_ensemble_alpha = adaptive_ensemble_alpha + if action_ensemble_horizon is None: + # Set 2 for google_robot to fix the window size of motion scale between each frame. see appendix in our paper for details + action_ensemble_horizon = 2 + self.sticky_gripper_num_repeat = 10 + else: + raise NotImplementedError( + f"Policy setup {policy_setup} not supported for octo models. The other datasets can be found in the huggingface config.json file." + ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + self.use_ddim = use_ddim + self.num_ddim_steps = num_ddim_steps + self.vla = load_vla( + saved_model_path, # choose from ['CogACT/CogACT-Small', 'CogACT/CogACT-Base', 'CogACT/CogACT-Large'] or the local path + load_for_training=False, + action_model_type=action_model_type, # choose from ['DiT-Small', 'DiT-Base', 'DiT-Large'] to match the model weight + future_action_window_size=future_action_window_size, + action_dim=action_dim, + ) + + if use_bf16: + self.vla.vlm = self.vla.vlm.to(torch.bfloat16) + self.vla = self.vla.to("cuda").eval() + self.cfg_scale = cfg_scale + + self.image_size = image_size + self.action_scale = action_scale + self.horizon = horizon + self.action_ensemble = action_ensemble + self.adaptive_ensemble_alpha = adaptive_ensemble_alpha + self.action_ensemble_horizon = action_ensemble_horizon + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.task_description = None + self.image_history = deque(maxlen=self.horizon) + if self.action_ensemble: + self.action_ensembler = AdaptiveEnsembler(self.action_ensemble_horizon, self.adaptive_ensemble_alpha) + else: + self.action_ensembler = None + self.num_image_history = 0 + + def _add_image_to_history(self, image: np.ndarray) -> None: + self.image_history.append(image) + self.num_image_history = min(self.num_image_history + 1, self.horizon) + + def reset(self, task_description: str) -> None: + self.task_description = task_description + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.num_image_history = 0 + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + 
task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + self._add_image_to_history(self._resize_image(image)) + image: Image.Image = Image.fromarray(image) + raw_actions, normalized_actions = self.vla.predict_action(image=image, + instruction=self.task_description, + unnorm_key=self.unnorm_key, + do_sample=False, + cfg_scale=self.cfg_scale, + use_ddim=self.use_ddim, + num_ddim_steps=self.num_ddim_steps, + ) + + if self.action_ensemble: + raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + raw_action = { + "world_vector": np.array(raw_actions[0, :3]), + "rotation_delta": np.array(raw_actions[0, 3:6]), + "open_gripper": np.array(raw_actions[0, 6:7]), # range [0, 1]; 1 = open; 0 = close + } + + # process raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray(raw_action["rotation_delta"], dtype=np.float64) + + roll, pitch, yaw = action_rotation_delta + axes, angles = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = axes * angles + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + action["gripper"] = 0 + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + self.previous_gripper_action = current_gripper_action + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + # fix a bug in the SIMPLER code here + # self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + self.previous_gripper_action = current_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def visualize_epoch( + self, predicted_raw_actions: Sequence[np.ndarray], images: Sequence[np.ndarray], save_path: str + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = 
np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate([a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot(pred_actions[:, action_dim], label="predicted action") + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) diff --git a/scripts/rt1x_bridge.sh b/simpler_env/policies/sim_cogact/scripts/cogact_bridge.sh old mode 100755 new mode 100644 similarity index 89% rename from scripts/rt1x_bridge.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_bridge.sh index c4608ce0..2a3f8827 --- a/scripts/rt1x_bridge.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_bridge.sh @@ -1,8 +1,6 @@ - - gpu_id=0 -policy_model=rt1 -ckpt_path=checkpoints/rt_1_x_tf_trained_for_002272480_step/ +policy_model=cogact +ckpt_path=CogACT/CogACT-Base # CogACT/CogACT-Large CogACT/CogACT-Small scene_name=bridge_table_1_v1 robot=widowx @@ -13,23 +11,23 @@ robot_init_y=0.028 CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ - --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ + --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ - --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ + --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode 
--obj-episode-range 0 24 \ @@ -48,6 +46,4 @@ CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-mod --env-name PutEggplantInBasketScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ - --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; - - + --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1; \ No newline at end of file diff --git a/scripts/openvla_drawer_variant_agg.sh b/simpler_env/policies/sim_cogact/scripts/cogact_drawer_variant_agg.sh old mode 100755 new mode 100644 similarity index 91% rename from scripts/openvla_drawer_variant_agg.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_drawer_variant_agg.sh index 07f64600..0bac2a90 --- a/scripts/openvla_drawer_variant_agg.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_drawer_variant_agg.sh @@ -1,8 +1,10 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +gpu_id=0 + declare -a ckpt_paths=( -"openvla/openvla-7b" +"CogACT/CogACT-Base" ) - +# CogACT/CogACT-Large CogACT/CogACT-Small declare -a env_names=( OpenTopDrawerCustomInScene-v0 OpenMiddleDrawerCustomInScene-v0 @@ -21,7 +23,7 @@ scene_name=frl_apartment_stage_simple EvalSim() { echo ${ckpt_path} ${env_name} - python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path None \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/rt1_drawer_visual_matching.sh b/simpler_env/policies/sim_cogact/scripts/cogact_drawer_visual_matching.sh old mode 100755 new mode 100644 similarity index 82% rename from scripts/rt1_drawer_visual_matching.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_drawer_visual_matching.sh index 77b78d2d..c302e827 --- a/scripts/rt1_drawer_visual_matching.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_drawer_visual_matching.sh @@ -1,12 +1,8 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - - +gpu_id=0 declare -a ckpt_paths=( -"./checkpoints/rt_1_tf_trained_for_000400120/" -"./checkpoints/rt_1_tf_trained_for_000058240/" -"./checkpoints/rt_1_x_tf_trained_for_002272480_step/" -"./checkpoints/rt_1_tf_trained_for_000001120/" +"CogACT/CogACT-Base" ) declare -a env_names=( @@ -27,7 +23,7 @@ EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_st EvalOverlay() { # A0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -38,7 +34,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # A1 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact 
--ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -49,7 +45,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # A2 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -60,7 +56,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # B0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -71,7 +67,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # B1 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -82,7 +78,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # B2 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -93,7 +89,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # C0 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -104,7 +100,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # C1 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -115,7 +111,7 @@ python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} ${EXTRA_ARGS} # C2 -python simpler_env/main_inference.py --policy-model rt1 --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 113 \ --env-name ${env_name} --scene-name dummy_drawer \ diff --git a/scripts/openvla_move_near_variant_agg.sh b/simpler_env/policies/sim_cogact/scripts/cogact_move_near_variant_agg.sh old mode 100755 new mode 100644 similarity 
index 90% rename from scripts/openvla_move_near_variant_agg.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_move_near_variant_agg.sh index e5ad488e..b5b11b1c --- a/scripts/openvla_move_near_variant_agg.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_move_near_variant_agg.sh @@ -1,7 +1,7 @@ - gpu_id=0 -declare -a arr=("openvla/openvla-7b") +declare -a arr=("CogACT/CogACT-Base") +# CogACT/CogACT-Large CogACT/CogACT-Small for ckpt_path in "${arr[@]}"; do echo "$ckpt_path"; done @@ -12,7 +12,7 @@ scene_name=google_pick_coke_can_1_v4 for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -27,7 +27,7 @@ done for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -48,7 +48,7 @@ for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -70,7 +70,7 @@ scene_name=google_pick_coke_can_1_v4 for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -78,7 +78,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 -0.09 -0.09 1 \ --additional-env-build-kwargs slightly_darker_lighting=True; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -102,7 +102,7 @@ for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -126,7 +126,7 @@ for env_name in "${env_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do 
CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/openvla_move_near_visual_matching.sh b/simpler_env/policies/sim_cogact/scripts/cogact_move_near_visual_matching.sh old mode 100755 new mode 100644 similarity index 92% rename from scripts/openvla_move_near_visual_matching.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_move_near_visual_matching.sh index 7e334d8c..833865c0 --- a/scripts/openvla_move_near_visual_matching.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_move_near_visual_matching.sh @@ -1,7 +1,7 @@ gpu_id=0 -declare -a arr=("openvla/openvla-7b") +declare -a arr=("CogACT/CogACT-Base") env_name=MoveNearGoogleBakedTexInScene-v0 # env_name=MoveNearGoogleBakedTexInScene-v1 @@ -18,7 +18,7 @@ for urdf_version in "${urdf_version_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/openvla_pick_coke_can_variant_agg.sh b/simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_variant_agg.sh old mode 100755 new mode 100644 similarity index 92% rename from scripts/openvla_pick_coke_can_variant_agg.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_variant_agg.sh index 72e91605..79d1a387 --- a/scripts/openvla_pick_coke_can_variant_agg.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_variant_agg.sh @@ -1,7 +1,6 @@ - gpu_id=0 -declare -a arr=("openvla/openvla-7b") +declare -a arr=("CogACT/CogACT-Base") # lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") @@ -18,7 +17,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -46,7 +45,7 @@ do for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -72,7 +71,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do 
CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -80,7 +79,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ --additional-env-build-kwargs ${coke_can_option}; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -107,7 +106,7 @@ do for scene_name in "${scene_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -132,7 +131,7 @@ for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -140,7 +139,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- --robot-init-rot-quat-center 0 0 0 1 --robot-init-rot-rpy-range 0 0 1 0 0 1 0 0 1 \ --additional-env-build-kwargs ${coke_can_option} slightly_darker_lighting=True; -CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ @@ -167,7 +166,7 @@ do for env_name in "${env_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/openvla_pick_coke_can_visual_matching.sh b/simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_visual_matching.sh old mode 100755 new mode 100644 similarity index 93% rename from scripts/openvla_pick_coke_can_visual_matching.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_visual_matching.sh index 083a71ee..3e5fbc25 --- a/scripts/openvla_pick_coke_can_visual_matching.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_pick_coke_can_visual_matching.sh @@ -1,8 +1,6 @@ - - gpu_id=0 -declare -a arr=("openvla/openvla-7b") +declare -a 
arr=("CogACT/CogACT-Base") # lr_switch=laying horizontally but flipped left-right to match real eval; upright=standing; laid_vertically=laying vertically declare -a coke_can_options_arr=("lr_switch=True" "upright=True" "laid_vertically=True") @@ -24,7 +22,7 @@ do for coke_can_option in "${coke_can_options_arr[@]}"; do for ckpt_path in "${arr[@]}"; -do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 80 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/openvla_put_in_drawer_variant_agg.sh b/simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_variant_agg.sh old mode 100755 new mode 100644 similarity index 91% rename from scripts/openvla_put_in_drawer_variant_agg.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_variant_agg.sh index fb8ce514..04d11634 --- a/scripts/openvla_put_in_drawer_variant_agg.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_variant_agg.sh @@ -1,8 +1,10 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth +gpu_id=0 - -declare -a ckpt_paths=("openvla/openvla-7b") +declare -a ckpt_paths=( +"CogACT/CogACT-Base" +) declare -a env_names=( PlaceIntoClosedTopDrawerCustomInScene-v0 @@ -17,7 +19,7 @@ scene_name=frl_apartment_stage_simple EvalSim() { echo ${ckpt_path} ${env_name} - python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ + CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name ${scene_name} \ diff --git a/scripts/openvla_put_in_drawer_visual_matching.sh b/simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_visual_matching.sh old mode 100755 new mode 100644 similarity index 84% rename from scripts/openvla_put_in_drawer_visual_matching.sh rename to simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_visual_matching.sh index 14dd46a4..c1b3d23e --- a/scripts/openvla_put_in_drawer_visual_matching.sh +++ b/simpler_env/policies/sim_cogact/scripts/cogact_put_in_drawer_visual_matching.sh @@ -1,6 +1,8 @@ # shader_dir=rt means that we turn on ray-tracing rendering; this is quite crucial for the open / close drawer task as policies often rely on shadows to infer depth - -declare -a ckpt_paths=("openvla/openvla-7b") +gpu_id=0 +declare -a ckpt_paths=( +"CogACT/CogACT-Base" +) declare -a env_names=( @@ -20,7 +22,7 @@ EXTRA_ARGS="--enable-raytracing --additional-env-build-kwargs station_name=mk_st EvalOverlay() { # A0 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -31,7 +33,7 @@ python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_p ${EXTRA_ARGS} # B0 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} 
python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name dummy_drawer \ @@ -42,7 +44,7 @@ python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_p ${EXTRA_ARGS} # C0 -python simpler_env/main_inference.py --policy-model openvla --ckpt-path ${ckpt_path} \ +CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model cogact --ckpt-path ${ckpt_path} \ --robot google_robot_static \ --control-freq 3 --sim-freq 513 --max-episode-steps 200 \ --env-name ${env_name} --scene-name dummy_drawer \ diff --git a/scripts/octo_bridge.sh b/simpler_env/policies/sim_cogact/scripts/octo_bridge.sh old mode 100755 new mode 100644 similarity index 92% rename from scripts/octo_bridge.sh rename to simpler_env/policies/sim_cogact/scripts/octo_bridge.sh index 607498b0..e4f4d686 --- a/scripts/octo_bridge.sh +++ b/simpler_env/policies/sim_cogact/scripts/octo_bridge.sh @@ -1,5 +1,3 @@ - - gpu_id=0 declare -a policy_models=( "octo-small" @@ -15,13 +13,13 @@ rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_real_eval_1.png robot_init_x=0.147 robot_init_y=0.028 -for init_rng in 0 2 4; +for init_rng in 0 2 4 6 8; do for policy_model in "${policy_models[@]}"; do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge --octo-init-rng ${init_rng} \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name StackGreenCubeOnYellowCubeBakedTexInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ @@ -30,7 +28,7 @@ do CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy- CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge --octo-init-rng ${init_rng} \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name PutCarrotOnPlateInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ @@ -39,7 +37,7 @@ CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-mod CUDA_VISIBLE_DEVICES=${gpu_id} python simpler_env/main_inference.py --policy-model ${policy_model} --ckpt-path ${ckpt_path} \ --robot ${robot} --policy-setup widowx_bridge --octo-init-rng ${init_rng} \ - --control-freq 5 --sim-freq 500 --max-episode-steps 60 \ + --control-freq 5 --sim-freq 500 --max-episode-steps 120 \ --env-name PutSpoonOnTableClothInScene-v0 --scene-name ${scene_name} \ --rgb-overlay-path ${rgb_overlay_path} \ --robot-init-x ${robot_init_x} ${robot_init_x} 1 --robot-init-y ${robot_init_y} ${robot_init_y} 1 --obj-variation-mode episode --obj-episode-range 0 24 \ @@ -60,7 +58,7 @@ rgb_overlay_path=ManiSkill2_real2sim/data/real_inpainting/bridge_sink.png robot_init_x=0.127 robot_init_y=0.06 -for init_rng in 0 2 4; +for init_rng in 0 2 4 6 8; do for policy_model in "${policy_models[@]}"; 
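For reference, a minimal sanity-check sketch of the AdaptiveEnsembler added above in simpler_env/policies/sim_cogact/adaptive_ensemble.py. This is not part of the patch; it assumes the repository root is on PYTHONPATH and that the CogACT `vla` dependency imported by the package __init__ is installed, and it uses synthetic action chunks:

    import numpy as np

    from simpler_env.policies.sim_cogact.adaptive_ensemble import AdaptiveEnsembler

    # a horizon of 7 mirrors the widowx_bridge default in cogact_policy.py
    ensembler = AdaptiveEnsembler(pred_action_horizon=7, adaptive_ensemble_alpha=0.1)
    for step in range(3):
        # synthetic (horizon, action_dim) chunk of 7-DoF actions for this control step
        chunk = np.full((7, 7), 0.1 * (step + 1))
        fused = ensembler.ensemble_action(chunk)
    print(fused.shape)  # (7,): one fused action vector per environment step

Each past chunk contributes the row aligned with the current control step, and the rows are averaged with weights proportional to exp(alpha * cos_sim(row, newest_row)). With adaptive_ensemble_alpha=0 this reduces to a uniform average over the overlapping predictions; a larger alpha up-weights past predictions that agree, by cosine similarity, with the newest one.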
diff --git a/simpler_env/simple_inference_visual_matching_prepackaged_envs.py b/simpler_env/simple_inference_visual_matching_prepackaged_envs.py
index 2e09cd22..8bf23b87 100644
--- a/simpler_env/simple_inference_visual_matching_prepackaged_envs.py
+++ b/simpler_env/simple_inference_visual_matching_prepackaged_envs.py
@@ -6,8 +6,6 @@
     --ckpt-path ./checkpoints/rt_1_tf_trained_for_000400120 --task google_robot_pick_coke_can --logging-root ./results_simple_eval/ --n-trajs 10
     python simpler_env/simple_inference_visual_matching_prepackaged_envs.py --policy octo-small \
     --ckpt-path None --task widowx_spoon_on_towel --logging-root ./results_simple_eval/ --n-trajs 10
-    python simpler_env/simple_inference_visual_matching_prepackaged_envs.py --policy openvla/openvla-7b \
-    --ckpt-path None --task google_robot_move_near_v1 --logging-root ./results_simple_eval/ --n-trajs 10
 """

import argparse
@@ -23,7 +21,7 @@

parser = argparse.ArgumentParser()
-parser.add_argument("--policy", default="rt1", choices=["rt1", "octo-base", "octo-small", "openvla/openvla-7b"])
+parser.add_argument("--policy", default="rt1", choices=["rt1", "octo-base", "octo-small"])
parser.add_argument(
    "--ckpt-path",
    type=str,
@@ -39,7 +37,7 @@
parser.add_argument("--n-trajs", type=int, default=10)

args = parser.parse_args()
-if args.policy in ["octo-base", "octo-small", "openvla/openvla-7b"]:
+if args.policy in ["octo-base", "octo-small"]:
    if args.ckpt_path in [None, "None"] or "rt_1_x" in args.ckpt_path:
        args.ckpt_path = args.policy
    if args.ckpt_path[-1] == "/":
@@ -77,10 +75,6 @@
    from simpler_env.policies.octo.octo_model import OctoInference

    model = OctoInference(model_type=args.ckpt_path, policy_setup=policy_setup, init_rng=0)
-elif "openvla" in args.policy:
-    from simpler_env.policies.openvla.openvla_model import OpenVLAInference
-
-    model = OpenVLAInference(saved_model_path=args.ckpt_path, policy_setup=policy_setup)
else:
    raise NotImplementedError()
diff --git a/simpler_env/utils/action/action_ensemble.py b/simpler_env/utils/action/action_ensemble.py
index 367ccf91..fc88fed2 100644
--- a/simpler_env/utils/action/action_ensemble.py
+++ b/simpler_env/utils/action/action_ensemble.py
@@ -21,7 +21,7 @@ def ensemble_action(self, cur_action):
            curr_act_preds = np.stack(
                [pred_actions[i] for (i, pred_actions) in zip(range(num_actions - 1, -1, -1), self.action_history)]
            )
-        # more recent predictions get exponentially *less* weight than older predictions
+        # if temp > 0, more recent predictions get exponentially *less* weight than older predictions
        weights = np.exp(-self.action_ensemble_temp * np.arange(num_actions))
        weights = weights / weights.sum()
        # compute the weighted average across all predictions for this timestep
diff --git a/simpler_env/utils/metrics.py b/simpler_env/utils/metrics.py
index 019a5a2f..6a7f207c 100644
--- a/simpler_env/utils/metrics.py
+++ b/simpler_env/utils/metrics.py
@@ -3,35 +3,8 @@
from typing import Sequence, Optional

import numpy as np

-REAL_PERF = {  # Real robot eval performance --> extract via: REAL_PERF[task][policy]
-    "google_robot_pick_coke_can_horizontal": {
-        "rt-2-x": 0.920,
-        "rt-1-converged": 0.960,
-        "rt-1-15pct": 1.000,
-        "rt-1-x": 0.880,
-        "rt-1-begin": 0.200,
-        "octo-base": 0.440,
-        "openvla-7b": 0.640,
-    },
-    "google_robot_pick_coke_can_vertical": {
-        "rt-2-x": 0.800,
-        "rt-1-converged": 0.880,
-        "rt-1-15pct": 0.960,
-        "rt-1-x": 0.560,
-        "rt-1-begin": 0.000,
-        "octo-base": 0.200,
-        "openvla-7b": 0.280,
-    },
-    "google_robot_pick_coke_can_standing": {
- "rt-2-x": 1.000, - "rt-1-converged": 0.720, - "rt-1-15pct": 0.800, - "rt-1-x": 0.840, - "rt-1-begin": 0.200, - "octo-base": 0.240, - "openvla-7b": 0.360, - }, +REAL_PERF = { # Real robot eval performance --> extract via: REAL_PERF[task][policy] "google_robot_pick_coke_can": { "rt-2-x": 0.907, "rt-1-converged": 0.853, @@ -39,7 +13,6 @@ "rt-1-x": 0.760, "rt-1-begin": 0.133, "octo-base": 0.293, - "openvla-7b": 0.427, }, "google_robot_move_near": { "rt-2-x": 0.733, @@ -48,7 +21,6 @@ "rt-1-x": 0.450, "rt-1-begin": 0.017, "octo-base": 0.350, - "openvla-7b": 0.667, }, "google_robot_open_drawer": { "rt-2-x": 0.333, @@ -57,7 +29,6 @@ "rt-1-x": 0.519, "rt-1-begin": 0.000, "octo-base": 0.148, - "openvla-7b": 0.111, }, "google_robot_close_drawer": { "rt-2-x": 0.630, @@ -66,16 +37,6 @@ "rt-1-x": 0.741, "rt-1-begin": 0.000, "octo-base": 0.519, - "openvla-7b": 0.148, - }, - "google_robot_drawer": { - "rt-2-x": 0.481, - "rt-1-converged": 0.870, - "rt-1-15pct": 0.796, - "rt-1-x": 0.630, - "rt-1-begin": 0.000, - "octo-base": 0.333, - "openvla-7b": 0.130, }, "google_robot_place_apple_in_closed_top_drawer": { "rt-2-x": 0.074, @@ -84,7 +45,6 @@ "rt-1-x": 0.407, "rt-1-begin": 0.000, "octo-base": 0.000, - "openvla-7b": 0.000, }, "widowx_spoon_on_towel": { "rt-1-x": 0.000, @@ -109,34 +69,7 @@ } -SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy] - "google_robot_pick_coke_can_horizontal": { - "rt-2-x": 0.740, - "rt-1-converged": 0.960, - "rt-1-15pct": 0.860, - "rt-1-x": 0.820, - "rt-1-begin": 0.050, - "octo-base": 0.210, - "openvla-7b": 0.310, - }, - "google_robot_pick_coke_can_vertical": { - "rt-2-x": 0.740, - "rt-1-converged": 0.900, - "rt-1-15pct": 0.790, - "rt-1-x": 0.330, - "rt-1-begin": 0.000, - "octo-base": 0.210, - "openvla-7b": 0.030, - }, - "google_robot_pick_coke_can_standing": { - "rt-2-x": 0.880, - "rt-1-converged": 0.710, - "rt-1-15pct": 0.480, - "rt-1-x": 0.550, - "rt-1-begin": 0.030, - "octo-base": 0.090, - "openvla-7b": 0.190, - }, +SIMPLER_PERF = { # SIMPLER simulated eval performance --> extract via: SIMPLER_PERF[task][policy] "google_robot_pick_coke_can": { "rt-2-x": 0.787, "rt-1-converged": 0.857, @@ -144,7 +77,6 @@ "rt-1-x": 0.567, "rt-1-begin": 0.027, "octo-base": 0.170, - "openvla-7b": 0.177, }, "google_robot_move_near": { "rt-2-x": 0.779, @@ -153,7 +85,6 @@ "rt-1-x": 0.317, "rt-1-begin": 0.050, "octo-base": 0.042, - "openvla-7b": 0.492, }, "google_robot_open_drawer": { "rt-2-x": 0.157, @@ -162,7 +93,6 @@ "rt-1-x": 0.296, "rt-1-begin": 0.000, "octo-base": 0.009, - "openvla-7b": 0.250, }, "google_robot_close_drawer": { "rt-2-x": 0.343, @@ -171,16 +101,6 @@ "rt-1-x": 0.891, "rt-1-begin": 0.278, "octo-base": 0.444, - "openvla-7b": 0.574, - }, - "google_robot_drawer": { - "rt-2-x": 0.250, - "rt-1-converged": 0.730, - "rt-1-15pct": 0.565, - "rt-1-x": 0.597, - "rt-1-begin": 0.139, - "octo-base": 0.227, - "openvla-7b": 0.412, }, "google_robot_place_apple_in_closed_top_drawer": { "rt-2-x": 0.037, @@ -189,7 +109,6 @@ "rt-1-x": 0.213, "rt-1-begin": 0.000, "octo-base": 0.000, - "openvla-7b": 0.000, }, "widowx_spoon_on_towel": { "rt-1-x": 0.000, @@ -213,6 +132,187 @@ }, } +REF = { + "ckpt_name": [ + "RT-1(Converged)", + "RT-1(15%)", + "RT-1-X", + "RT-2-X", + "Octo-Base", + "Octo-Small", + "RT-1(begin)", + "OpenVLA", + "RoboVLM", + ], + "coke_can/matching_avg": [0.857, 0.710, 0.567, 0.787, 0.170, "nan", 0.027, 0.163, 0.727], + "coke_can/variant_avg": [0.898, 0.813, 0.490, 0.823, 0.006, "nan", 0.022, 0.545, "nan"], + "coke_can/matching/horizontal": [ + 
0.960, + 0.860, + 0.820, + 0.740, + 0.210, + "nan", + 0.050, + 0.270, + 0.850, + ], + "coke_can/matching/vertical": [ + 0.900, + 0.790, + 0.330, + 0.740, + 0.210, + "nan", + 0.000, + 0.030, + 0.430, + ], + "coke_can/matching/standing": [ + 0.710, + 0.480, + 0.550, + 0.880, + 0.090, + "nan", + 0.030, + 0.190, + 0.900, + ], + "coke_can/variant/horizontal": [ + 0.969, + 0.920, + 0.569, + 0.822, + 0.005, + "nan", + 0.022, + 0.711, + "nan", + ], + "coke_can/variant/vertical": [ + 0.760, + 0.704, + 0.204, + 0.754, + 0.000, + "nan", + 0.013, + 0.271, + "nan", + ], + "coke_can/variant/standing": [ + 0.964, + 0.813, + 0.698, + 0.893, + 0.013, + "nan", + 0.031, + 0.653, + "nan", + ], + "move_near/variant": [0.500, 0.446, 0.323, 0.792, 0.031, "nan", 0.040, 0.477, "nan"], + "move_near/matching": [0.442, 0.354, 0.317, 0.779, 0.042, "nan", 0.050, 0.462, 0.663], + "drawer/matching_avg": [0.730, 0.565, 0.597, 0.250, 0.227, "nan", 0.139, 0.356, 0.268], + "drawer/variant_avg": [0.323, 0.267, 0.294, 0.353, 0.011, "nan", 0.069, 0.177, "nan"], + "drawer/matching/open": [0.601, 0.463, 0.296, 0.157, 0.009, "nan", 0.000, 0.194, 0.287], + "drawer/matching/close": [0.861, 0.667, 0.891, 0.343, 0.444, "nan", 0.278, 0.518, 0.25], + "drawer/variant/open": [0.270, 0.212, 0.069, 0.333, 0.000, "nan", 0.005, 0.158, "nan"], + "drawer/variant/close": [0.376, 0.323, 0.519, 0.372, 0.021, "nan", 0.132, 0.195, "nan"], + "apple_in_drawer/matching_avg": [0.065, 0.130, 0.213, 0.037, 0.000, 0.000, 0.000, "nan", 0.361], + "apple_in_drawer/variant_avg": [0.026, 0.021, 0.101, 0.206, 0.000, 0.000, 0.000, "nan", "nan"], + "put_spoon_on_tablecloth/matching_partial": [ + "nan", + "nan", + 0.167, + "nan", + 0.347, + 0.778, + "nan", + 0.041, + 0.375 + ], + "put_spoon_on_tablecloth/matching_entire": [ + "nan", + "nan", + 0.000, + "nan", + 0.125, + 0.472, + "nan", + 0.000, + 0.208 + ], + "put_carrot_on_plate/matching_partial": [ + "nan", + "nan", + 0.208, + "nan", + 0.528, + 0.278, + "nan", + 0.333, + 0.333 + ], + "put_carrot_on_plate/matching_entire": [ + "nan", + "nan", + 0.042, + "nan", + 0.083, + 0.097, + "nan", + 0.000, + 0.25 + ], + "stack_green_block_on_yellow_block/matching_partial": [ + "nan", + "nan", + 0.083, + "nan", + 0.319, + 0.403, + "nan", + 0.125, + 0.083 + ], + "stack_green_block_on_yellow_block/matching_entire": [ + "nan", + "nan", + 0.000, + "nan", + 0.000, + 0.042, + "nan", + 0.000, + 0.083 + ], + "put_eggplant_in_basket/matching_partial": [ + "nan", + "nan", + 0.000, + "nan", + 0.667, + 0.875, + "nan", + 0.083, + 0.000 + ], + "put_eggplant_in_basket/matching_entire": [ + "nan", + "nan", + 0.000, + "nan", + 0.431, + 0.569, + "nan", + 0.041, + 0.000 + ], +} + + def pearson_correlation(perf_sim: Sequence[float], perf_real: Sequence[float]) -> float: perf_sim, perf_real = np.array(perf_sim), np.array(perf_real) assert perf_sim.shape == perf_real.shape @@ -255,6 +355,7 @@ def print_all_kruskal_results( Prints out the Kruskal-Wallis test for each checkpoint """ from scipy.stats import kruskal + sim, real = np.array(sim), np.array(real) assert sim.shape == real.shape print(title) @@ -307,5 +408,4 @@ def get_dir_stats( results.append(1) elif succ_fail_pattern[1] in fname: results.append(0) - return results diff --git a/simpler_env/utils/visualization.py b/simpler_env/utils/visualization.py index 824f4c5b..15531a92 100644 --- a/simpler_env/utils/visualization.py +++ b/simpler_env/utils/visualization.py @@ -27,6 +27,20 @@ def write_video(path, images, fps=5): images_npy = images media.write_video(path, images_npy, fps=fps) +def 
write_interval_video(path, images, fps=5, interval=20):
+    # images: list of numpy arrays
+    root_dir = Path(path).parent
+    if not os.path.exists(root_dir):  # first video under this directory: create it, leave a placeholder file, and skip writing
+        os.makedirs(root_dir)
+        print("skip writing video D:")
+        os.system(f"touch {path}")
+        return
+
+    if not isinstance(images[0], np.ndarray):
+        images_npy = [image.numpy() for image in images]
+    else:
+        images_npy = images
+    media.write_video(path, images_npy[::interval], fps=fps)


def plot_pred_and_gt_action_trajectory(predicted_actions, gt_actions, stacked_images):
    """
diff --git a/tools/calc_metrics_evaluation_videos.py b/tools/calc_metrics_evaluation_videos.py
index 3ea1eff5..4af72010 100644
--- a/tools/calc_metrics_evaluation_videos.py
+++ b/tools/calc_metrics_evaluation_videos.py
@@ -1,6 +1,7 @@
import argparse
import glob
from pathlib import Path
+import pandas as pd

import numpy as np
from scipy.stats import kruskal
@@ -13,6 +14,7 @@
    print_all_kruskal_results,
)

+from simpler_env.utils.metrics import REF

# Calculate metrics for each task
@@ -21,31 +23,40 @@ def calc_pick_coke_can_stats(root_result_dir):
    # If you use a new checkpoint, please update the real evaluation results here
    coke_can_real_success = {
        "horizontal": {
-            "rt-2-x": 0.92,
-            "rt-1-converged": 0.96,
-            "rt-1-15pct": 1.0,
-            "rt-1-x": 0.88,
-            "rt-1-begin": 0.20,
-            "octo-base": 0.44,
-            "openvla-7b": 0.64,
+            # "rt-2-x": 0.92,
+            # "rt-1-converged": 0.96,
+            # "rt-1-15pct": 1.0,
+            # "rt-1-x": 0.88,
+            # "rt-1-begin": 0.20,
+            # "octo-base": 0.44,
+            # "openvla-7b": 0.44,
+            # "internvla-1_8b": 0.44,
+            # "internvla_diff-1_8b": 0.44,
+            "ours": 0.0,
        },
        "vertical": {
-            "rt-2-x": 0.80,
-            "rt-1-converged": 0.88,
-            "rt-1-15pct": 0.96,
-            "rt-1-x": 0.56,
-            "rt-1-begin": 0.00,
-            "octo-base": 0.20,
-            "openvla-7b": 0.28,
+            # "rt-2-x": 0.80,
+            # "rt-1-converged": 0.88,
+            # "rt-1-15pct": 0.96,
+            # "rt-1-x": 0.56,
+            # "rt-1-begin": 0.00,
+            # "octo-base": 0.20,
+            # "openvla-7b": 0.20,
+            # "internvla-1_8b": 0.44,
+            # "internvla_diff-1_8b": 0.44,
+            "ours": 0.0,
        },
        "standing": {
-            "rt-2-x": 1.00,
-            "rt-1-converged": 0.72,
-            "rt-1-15pct": 0.80,
-            "rt-1-x": 0.84,
-            "rt-1-begin": 0.20,
-            "octo-base": 0.24,
-            "openvla-7b": 0.36,
+            # "rt-2-x": 1.00,
+            # "rt-1-converged": 0.72,
+            # "rt-1-15pct": 0.80,
+            # "rt-1-x": 0.84,
+            # "rt-1-begin": 0.20,
+            # "octo-base": 0.24,
+            # "openvla-7b": 0.24,
+            # "internvla-1_8b": 0.44,
+            # "internvla_diff-1_8b": 0.44,
+            "ours": 0.0,
        },
    }
@@ -55,17 +66,18 @@ def calc_pick_coke_can_stats(root_result_dir):
        "vertical": "laid_vertically",
        "standing": "upright",
    }
-    n_orientations = len(coke_can_orientation_map_dict)  # number of coke can orientations
-    n_trials_per_ckpt_per_orientation = (
-        25  # number of trials per checkpoint per coke can orientation; update if it is different
-    )
+    n_orientations = len(
+        coke_can_orientation_map_dict
+    )  # number of coke can orientations
+    n_trials_per_ckpt_per_orientation = 25  # number of trials per checkpoint per coke can orientation; update if it is different
    # extra patterns required in file name; if you are using different visual matching overlay image, please update here
    extra_pattern_require_sim_variants = ["rgb_overlay_None"]
    extra_pattern_require_visual_matching = ["rgb_overlay_google_coke_can_real_eval_1"]

    # get simulation variant success
    coke_can_sim_variant_success = {
-        k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in coke_can_orientation_map_dict.keys()
+        k1: {k2: [] for k2 in ckpt_alias_keys}
+        for k1 in coke_can_orientation_map_dict.keys()
    }

    # hardcoded variant aggregation result dirs; if you have new variants, please
update here @@ -108,9 +120,15 @@ def calc_pick_coke_can_stats(root_result_dir): for coke_can_orientation in coke_can_orientation_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for variant in ( - base_variants + background_variants + lighting_variants + distractor_variants + table_texture_variants + base_variants + + background_variants + + lighting_variants + + distractor_variants + + table_texture_variants ): - variant = variant.format(coke_can_orientation_map_dict[coke_can_orientation]) + variant = variant.format( + coke_can_orientation_map_dict[coke_can_orientation] + ) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( @@ -120,8 +138,10 @@ def calc_pick_coke_can_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - coke_can_sim_variant_success[coke_can_orientation][ckpt_alias].append(avg_sim_success) - + coke_can_sim_variant_success[coke_can_orientation][ckpt_alias].append( + avg_sim_success + ) + coke_can_sim_variant_success[coke_can_orientation][ckpt_alias] = np.mean( coke_can_sim_variant_success[coke_can_orientation][ckpt_alias] ) @@ -160,29 +180,40 @@ def calc_pick_coke_can_stats(root_result_dir): avg_orientation_sim_variant_results[-1].append( coke_can_sim_variant_success[coke_can_orientation][ckpt_alias] ) - avg_orientation_real_results[-1].append(coke_can_real_success[coke_can_orientation][ckpt_alias]) - avg_orientation_sim_variant_results[-1] = np.mean(avg_orientation_sim_variant_results[-1]) + avg_orientation_real_results[-1].append( + coke_can_real_success[coke_can_orientation][ckpt_alias] + ) + avg_orientation_sim_variant_results[-1] = np.mean( + avg_orientation_sim_variant_results[-1] + ) avg_orientation_real_results[-1] = np.mean(avg_orientation_real_results[-1]) print("avg_orientation_sim_variant_results", avg_orientation_sim_variant_results) print("avg_orientation_real_results", avg_orientation_real_results) print( "mean_maximum_rank_violation(avg_orientation_sim_variant_results, avg_orientation_real_results)", - mean_maximum_rank_violation(avg_orientation_sim_variant_results, avg_orientation_real_results), + mean_maximum_rank_violation( + avg_orientation_sim_variant_results, avg_orientation_real_results + ), ) print( "pearson_correlation(avg_orientation_sim_variant_results, avg_orientation_real_results)", - pearson_correlation(avg_orientation_sim_variant_results, avg_orientation_real_results), + pearson_correlation( + avg_orientation_sim_variant_results, avg_orientation_real_results + ), ) print("-" * 20) # get visual matching success coke_can_sim_visual_matching_success = { - k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in coke_can_orientation_map_dict.keys() + k1: {k2: [] for k2 in ckpt_alias_keys} + for k1 in coke_can_orientation_map_dict.keys() } for coke_can_orientation in coke_can_orientation_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for variant in base_visual_matching_variants: - variant = variant.format(coke_can_orientation_map_dict[coke_can_orientation]) + variant = variant.format( + coke_can_orientation_map_dict[coke_can_orientation] + ) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( @@ -192,10 +223,18 @@ def calc_pick_coke_can_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias].append(avg_sim_success) - print(f"Orientation 
{coke_can_orientation}, ckpt {ckpt_alias} all robot arm visual matching success: {coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias]}") - coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias] = np.mean( - coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias] + coke_can_sim_visual_matching_success[coke_can_orientation][ + ckpt_alias + ].append(avg_sim_success) + print( + f"Orientation {coke_can_orientation}, ckpt {ckpt_alias} all robot arm visual matching success: {coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias]}" + ) + coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias] = ( + np.mean( + coke_can_sim_visual_matching_success[coke_can_orientation][ + ckpt_alias + ] + ) ) for coke_can_orientation in coke_can_orientation_map_dict.keys(): @@ -210,21 +249,27 @@ def calc_pick_coke_can_stats(root_result_dir): print( f"{coke_can_orientation} MMRV", mean_maximum_rank_violation( - list(coke_can_sim_visual_matching_success[coke_can_orientation].values()), + list( + coke_can_sim_visual_matching_success[coke_can_orientation].values() + ), list(coke_can_real_success[coke_can_orientation].values()), ), ) print( f"{coke_can_orientation} pearson correlation", pearson_correlation( - list(coke_can_sim_visual_matching_success[coke_can_orientation].values()), + list( + coke_can_sim_visual_matching_success[coke_can_orientation].values() + ), list(coke_can_real_success[coke_can_orientation].values()), ), ) print_all_kruskal_results( construct_unordered_trial_results( n_trials_per_ckpt_per_orientation, - list(coke_can_sim_visual_matching_success[coke_can_orientation].values()), + list( + coke_can_sim_visual_matching_success[coke_can_orientation].values() + ), ), construct_unordered_trial_results( n_trials_per_ckpt_per_orientation, @@ -242,8 +287,12 @@ def calc_pick_coke_can_stats(root_result_dir): avg_orientation_sim_visual_matching_results[-1].append( coke_can_sim_visual_matching_success[coke_can_orientation][ckpt_alias] ) - avg_orientation_real_results[-1].append(coke_can_real_success[coke_can_orientation][ckpt_alias]) - avg_orientation_sim_visual_matching_results[-1] = np.mean(avg_orientation_sim_visual_matching_results[-1]) + avg_orientation_real_results[-1].append( + coke_can_real_success[coke_can_orientation][ckpt_alias] + ) + avg_orientation_sim_visual_matching_results[-1] = np.mean( + avg_orientation_sim_visual_matching_results[-1] + ) avg_orientation_real_results[-1] = np.mean(avg_orientation_real_results[-1]) print( "avg_orientation_sim_visual_matching_results", @@ -252,11 +301,15 @@ def calc_pick_coke_can_stats(root_result_dir): print("avg_orientation_real_results", avg_orientation_real_results) print( "mean_maximum_rank_violation(avg_orientation_sim_visual_matching_results, avg_orientation_real_results)", - mean_maximum_rank_violation(avg_orientation_sim_visual_matching_results, avg_orientation_real_results), + mean_maximum_rank_violation( + avg_orientation_sim_visual_matching_results, avg_orientation_real_results + ), ) print( "pearson_correlation(avg_orientation_sim_visual_matching_results, avg_orientation_real_results)", - pearson_correlation(avg_orientation_sim_visual_matching_results, avg_orientation_real_results), + pearson_correlation( + avg_orientation_sim_visual_matching_results, avg_orientation_real_results + ), ) print_all_kruskal_results( construct_unordered_trial_results( @@ -273,19 +326,31 @@ def calc_pick_coke_can_stats(root_result_dir): print("*" * 20) for _ in range(3): print() + # process 
results and return + ret_dict = {} + ret_dict["coke_can/matching_avg"] = avg_orientation_sim_visual_matching_results + ret_dict["coke_can/variant_avg"] = avg_orientation_sim_variant_results + for key, val in coke_can_sim_visual_matching_success.items(): + ret_dict[f"coke_can/matching/{key}"] = [succ for _, succ in val.items()] + for key, val in coke_can_sim_variant_success.items(): + ret_dict[f"coke_can/variant/{key}"] = [succ for _, succ in val.items()] + return ret_dict def calc_move_near_stats(root_result_dir): print("***Move Near results***") # If you use a new checkpoint, please update the real evaluation results here move_near_real_success = { - "rt-2-x": 0.733, - "rt-1-converged": 0.633, - "rt-1-15pct": 0.583, - "rt-1-x": 0.45, - "rt-1-begin": 0.017, - "octo-base": 0.35, - "openvla-7b": 0.667, + # "rt-2-x": 0.733, + # "rt-1-converged": 0.633, + # "rt-1-15pct": 0.583, + # "rt-1-x": 0.45, + # "rt-1-begin": 0.017, + # "octo-base": 0.35, + # "openvla-7b": 0.35, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, } ckpt_alias_keys = list(move_near_real_success.keys()) @@ -335,14 +400,24 @@ def calc_move_near_stats(root_result_dir): for ckpt_alias in ckpt_alias_keys: for variant in ( - base_variants + background_variants + lighting_variants + distractor_variants + table_texture_variants + base_variants + + background_variants + + lighting_variants + + distractor_variants + + table_texture_variants ): variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" - avg_sim_success = np.mean(get_dir_stats(variant, extra_pattern_require=extra_pattern_require_sim_variants)) + avg_sim_success = np.mean( + get_dir_stats( + variant, extra_pattern_require=extra_pattern_require_sim_variants + ) + ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") move_near_sim_variant_success[ckpt_alias].append(avg_sim_success) - move_near_sim_variant_success[ckpt_alias] = np.mean(move_near_sim_variant_success[ckpt_alias]) + move_near_sim_variant_success[ckpt_alias] = np.mean( + move_near_sim_variant_success[ckpt_alias] + ) print("-" * 20) print("sim variant avg success", move_near_sim_variant_success) @@ -369,15 +444,20 @@ def calc_move_near_stats(root_result_dir): for variant in base_visual_matching_variants: variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( - get_dir_stats(variant, extra_pattern_require=extra_pattern_require_visual_matching) + get_dir_stats( + variant, extra_pattern_require=extra_pattern_require_visual_matching + ) ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") move_near_sim_visual_matching_success[ckpt_alias].append(avg_sim_success) - - print(f"Ckpt {ckpt_alias} all robot arm visual matching success: {move_near_sim_visual_matching_success[ckpt_alias]}") - move_near_sim_visual_matching_success[ckpt_alias] = np.mean(move_near_sim_visual_matching_success[ckpt_alias]) + print( + f"Ckpt {ckpt_alias} all robot arm visual matching success: {move_near_sim_visual_matching_success[ckpt_alias]}" + ) + move_near_sim_visual_matching_success[ckpt_alias] = np.mean( + move_near_sim_visual_matching_success[ckpt_alias] + ) print("sim visual matching success", move_near_sim_visual_matching_success) print("real success", move_near_real_success) @@ -396,8 +476,12 @@ def calc_move_near_stats(root_result_dir): ), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt, list(move_near_sim_visual_matching_success.values())), - 
construct_unordered_trial_results(n_trials_per_ckpt, list(move_near_real_success.values())),
+        construct_unordered_trial_results(
+            n_trials_per_ckpt, list(move_near_sim_visual_matching_success.values())
+        ),
+        construct_unordered_trial_results(
+            n_trials_per_ckpt, list(move_near_real_success.values())
+        ),
         "avg kruskal:",
     )
 
@@ -405,28 +489,46 @@
     for _ in range(3):
         print()
 
+    # process results and return
+    ret_dict = {
+        "move_near/variant": [
+            succ for _, succ in move_near_sim_variant_success.items()
+        ],
+        "move_near/matching": [
+            succ for _, succ in move_near_sim_visual_matching_success.items()
+        ],
+    }
+    return ret_dict
+
 
 def calc_drawer_stats(root_result_dir):
     print("***Drawer results***")
     # If you use a new checkpoint, please update the real evaluation results here
     drawer_real_success = {
         "open": {
-            "rt-2-x": 0.333,
-            "rt-1-converged": 0.815,
-            "rt-1-15pct": 0.704,
-            "rt-1-x": 0.519,
-            "rt-1-begin": 0.000,
-            "octo-base": 0.148,
-            "openvla-7b": 0.111,
+            # "rt-2-x": 0.333,
+            # "rt-1-converged": 0.815,
+            # "rt-1-15pct": 0.704,
+            # "rt-1-x": 0.519,
+            # "rt-1-begin": 0.000,
+            # "octo-base": 0.148,
+            # "openvla-7b": 0.148,
+            # "internvla-1_8b": 0.44,
+            # "internvla_diff-1_8b": 0.44,
+            "ours": 0.0,
         },
         "close": {
-            "rt-2-x": 0.630,
-            "rt-1-converged": 0.926,
-            "rt-1-15pct": 0.889,
-            "rt-1-x": 0.741,
-            "rt-1-begin": 0.000,
-            "octo-base": 0.519,
-            "openvla-7b": 0.148,
+            # "rt-2-x": 0.630,
+            # "rt-1-converged": 0.926,
+            # "rt-1-15pct": 0.889,
+            # "rt-1-x": 0.741,
+            # "rt-1-begin": 0.000,
+            # "octo-base": 0.519,
+            # "openvla-7b": 0.519,
+            # "internvla-1_8b": 0.44,
+            # "internvla_diff-1_8b": 0.44,
+            "ours": 0.0,
         },
     }
 
@@ -444,15 +546,15 @@ def calc_drawer_stats(root_result_dir):
         ],
     }
     n_tasks = len(drawer_task_map_dict)
-    n_trials_per_ckpt_per_task = (
-        27  # number of trials per checkpoint for all open / all close tasks; update if it is different
-    )
+    n_trials_per_ckpt_per_task = 27  # number of trials per checkpoint for all open / all close tasks; update if it is different
 
     # extra patterns required in file name; if you are using different visual matching overlay image, please update here
     extra_pattern_require_sim_variants = ["rgb_overlay_None"]
     extra_pattern_require_visual_matching = ["rgb_overlay_open_drawer"]
 
     # get simulation variant success
-    drawer_sim_variant_success = {k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys()}
+    drawer_sim_variant_success = {
+        k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys()
+    }
 
     # hardcoded variant aggregation result dirs; if you have new variants, please update here
     base_variants = [
@@ -487,7 +589,12 @@
     for drawer_task in drawer_task_map_dict.keys():
         for ckpt_alias in ckpt_alias_keys:
             for specific_task in drawer_task_map_dict[drawer_task]:
-                for variant in base_variants + background_variants + lighting_variants + table_texture_variants:
+                for variant in (
+                    base_variants
+                    + background_variants
+                    + lighting_variants
+                    + table_texture_variants
+                ):
                     variant = variant.format(specific_task)
                     variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}"
                     avg_sim_success = np.mean(
                         get_dir_stats(
                             variant,
                             extra_pattern_require=extra_pattern_require_sim_variants,
                         )
                     )
                     if np.isnan(avg_sim_success):
                         print(f"WARNING: avg_sim_success is nan for {variant}")
-
drawer_sim_variant_success[drawer_task][ckpt_alias].append(avg_sim_success) + drawer_sim_variant_success[drawer_task][ckpt_alias].append( + avg_sim_success + ) drawer_sim_variant_success[drawer_task][ckpt_alias] = np.mean( drawer_sim_variant_success[drawer_task][ckpt_alias] ) @@ -531,7 +640,9 @@ def calc_drawer_stats(root_result_dir): avg_sim_variant_results.append([]) avg_real_results.append([]) for drawer_task in drawer_task_map_dict.keys(): - avg_sim_variant_results[-1].append(drawer_sim_variant_success[drawer_task][ckpt_alias]) + avg_sim_variant_results[-1].append( + drawer_sim_variant_success[drawer_task][ckpt_alias] + ) avg_real_results[-1].append(drawer_real_success[drawer_task][ckpt_alias]) avg_sim_variant_results[-1] = np.mean(avg_sim_variant_results[-1]) avg_real_results[-1] = np.mean(avg_real_results[-1]) @@ -548,7 +659,9 @@ def calc_drawer_stats(root_result_dir): print("-" * 20) # get visual matching success - drawer_sim_visual_matching_success = {k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys()} + drawer_sim_visual_matching_success = { + k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys() + } for drawer_task in drawer_task_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for specific_task in drawer_task_map_dict[drawer_task]: @@ -563,13 +676,21 @@ def calc_drawer_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - drawer_sim_visual_matching_success[drawer_task][ckpt_alias].append(avg_sim_success) + drawer_sim_visual_matching_success[drawer_task][ckpt_alias].append( + avg_sim_success + ) tmp_variant_avg_each_robot_arm = [] for i in range(len(base_visual_matching_variants)): tmp_variant_avg_each_robot_arm.append( - np.mean(drawer_sim_visual_matching_success[drawer_task][ckpt_alias][i::len(drawer_task_map_dict[drawer_task])]) + np.mean( + drawer_sim_visual_matching_success[drawer_task][ckpt_alias][ + i :: len(drawer_task_map_dict[drawer_task]) + ] + ) ) - print(f"Drawer task {drawer_task}, ckpt {ckpt_alias} all robot arm visual matching success: {tmp_variant_avg_each_robot_arm}") + print( + f"Drawer task {drawer_task}, ckpt {ckpt_alias} all robot arm visual matching success: {tmp_variant_avg_each_robot_arm}" + ) drawer_sim_visual_matching_success[drawer_task][ckpt_alias] = np.mean( drawer_sim_visual_matching_success[drawer_task][ckpt_alias] ) @@ -612,9 +733,13 @@ def calc_drawer_stats(root_result_dir): avg_sim_visual_matching_results.append([]) avg_real_results.append([]) for drawer_task in drawer_task_map_dict.keys(): - avg_sim_visual_matching_results[-1].append(drawer_sim_visual_matching_success[drawer_task][ckpt_alias]) + avg_sim_visual_matching_results[-1].append( + drawer_sim_visual_matching_success[drawer_task][ckpt_alias] + ) avg_real_results[-1].append(drawer_real_success[drawer_task][ckpt_alias]) - avg_sim_visual_matching_results[-1] = np.mean(avg_sim_visual_matching_results[-1]) + avg_sim_visual_matching_results[-1] = np.mean( + avg_sim_visual_matching_results[-1] + ) avg_real_results[-1] = np.mean(avg_real_results[-1]) print("avg_sim_visual_matching_results", avg_sim_visual_matching_results) print("avg_real_results", avg_real_results) @@ -627,28 +752,46 @@ def calc_drawer_stats(root_result_dir): pearson_correlation(avg_sim_visual_matching_results, avg_real_results), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt_per_task * n_tasks, avg_sim_visual_matching_results), - 
construct_unordered_trial_results(n_trials_per_ckpt_per_task * n_tasks, avg_real_results), + construct_unordered_trial_results( + n_trials_per_ckpt_per_task * n_tasks, avg_sim_visual_matching_results + ), + construct_unordered_trial_results( + n_trials_per_ckpt_per_task * n_tasks, avg_real_results + ), "avg kruskal:", ) print("*" * 20) for _ in range(3): print() - - + + # TODO: process ret dict + ret_dict = { + "drawer/matching_avg": avg_sim_visual_matching_results, + "drawer/variant_avg": avg_sim_variant_results, + } + for key, val in drawer_sim_visual_matching_success.items(): + ret_dict[f"drawer/matching/{key}"] = [succ for _, succ in val.items()] + for key, val in drawer_sim_variant_success.items(): + ret_dict[f"drawer/variant/{key}"] = [succ for _, succ in val.items()] + return ret_dict + + def calc_long_horizon_apple_in_drawer_stats(root_result_dir): print("***Drawer results***") # If you use a new checkpoint, please update the real evaluation results here drawer_real_success = { "put_apple_into_top_drawer": { - "rt-2-x": 0.074, - "rt-1-converged": 0.185, - "rt-1-15pct": 0.185, - "rt-1-x": 0.407, - "rt-1-begin": 0.000, - "octo-base": 0.000, - "openvla-7b": 0.000, + # "rt-2-x": 0.074, + # "rt-1-converged": 0.185, + # "rt-1-15pct": 0.185, + # "rt-1-x": 0.407, + # "rt-1-begin": 0.000, + # "octo-base": 0.000, + # "openvla-7b": 0.000, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, }, } @@ -659,9 +802,7 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): ], } n_tasks = len(drawer_task_map_dict) - n_trials_per_ckpt_per_task = ( - 27 # number of trials per checkpoint for each key in drawer_task_map_dict; update if it is different - ) + n_trials_per_ckpt_per_task = 27 # number of trials per checkpoint for each key in drawer_task_map_dict; update if it is different # extra patterns required in file name; if you are using different visual matching overlay image, please update here extra_pattern_require_sim_variants = ["rgb_overlay_None", "apple"] extra_pattern_require_visual_matching = ["rgb_overlay_open_drawer", "apple"] @@ -669,7 +810,9 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): extra_log_str_visual_matching = "model_ids_baked_apple_v2" # get simulation variant success - drawer_sim_variant_success = {k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys()} + drawer_sim_variant_success = { + k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys() + } # hardcoded variant aggregation result dirs; if you have new variants, please update here base_variants = [ @@ -704,8 +847,15 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): for drawer_task in drawer_task_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for specific_task in drawer_task_map_dict[drawer_task]: - for variant in base_variants + background_variants + lighting_variants + table_texture_variants: - variant = variant.format(specific_task) + f"_{extra_log_str_variant_agg}" + for variant in ( + base_variants + + background_variants + + lighting_variants + + table_texture_variants + ): + variant = ( + variant.format(specific_task) + f"_{extra_log_str_variant_agg}" + ) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( @@ -715,7 +865,9 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - drawer_sim_variant_success[drawer_task][ckpt_alias].append(avg_sim_success) 
+ drawer_sim_variant_success[drawer_task][ckpt_alias].append( + avg_sim_success + ) drawer_sim_variant_success[drawer_task][ckpt_alias] = np.mean( drawer_sim_variant_success[drawer_task][ckpt_alias] ) @@ -748,7 +900,9 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): avg_sim_variant_results.append([]) avg_real_results.append([]) for drawer_task in drawer_task_map_dict.keys(): - avg_sim_variant_results[-1].append(drawer_sim_variant_success[drawer_task][ckpt_alias]) + avg_sim_variant_results[-1].append( + drawer_sim_variant_success[drawer_task][ckpt_alias] + ) avg_real_results[-1].append(drawer_real_success[drawer_task][ckpt_alias]) avg_sim_variant_results[-1] = np.mean(avg_sim_variant_results[-1]) avg_real_results[-1] = np.mean(avg_real_results[-1]) @@ -765,12 +919,17 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): print("-" * 20) # get visual matching success - drawer_sim_visual_matching_success = {k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys()} + drawer_sim_visual_matching_success = { + k1: {k2: [] for k2 in ckpt_alias_keys} for k1 in drawer_task_map_dict.keys() + } for drawer_task in drawer_task_map_dict.keys(): for ckpt_alias in ckpt_alias_keys: for specific_task in drawer_task_map_dict[drawer_task]: for variant in base_visual_matching_variants: - variant = variant.format(specific_task) + f"_{extra_log_str_visual_matching}" + variant = ( + variant.format(specific_task) + + f"_{extra_log_str_visual_matching}" + ) variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( @@ -780,13 +939,21 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): ) if np.isnan(avg_sim_success): print(f"WARNING: avg_sim_success is nan for {variant}") - drawer_sim_visual_matching_success[drawer_task][ckpt_alias].append(avg_sim_success) + drawer_sim_visual_matching_success[drawer_task][ckpt_alias].append( + avg_sim_success + ) tmp_variant_avg_each_robot_arm = [] for i in range(len(base_visual_matching_variants)): tmp_variant_avg_each_robot_arm.append( - np.mean(drawer_sim_visual_matching_success[drawer_task][ckpt_alias][i::len(drawer_task_map_dict[drawer_task])]) + np.mean( + drawer_sim_visual_matching_success[drawer_task][ckpt_alias][ + i :: len(drawer_task_map_dict[drawer_task]) + ] + ) ) - print(f"Drawer task {drawer_task}, ckpt {ckpt_alias} all robot arm visual matching success: {tmp_variant_avg_each_robot_arm}") + print( + f"Drawer task {drawer_task}, ckpt {ckpt_alias} all robot arm visual matching success: {tmp_variant_avg_each_robot_arm}" + ) drawer_sim_visual_matching_success[drawer_task][ckpt_alias] = np.mean( drawer_sim_visual_matching_success[drawer_task][ckpt_alias] ) @@ -829,9 +996,13 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): avg_sim_visual_matching_results.append([]) avg_real_results.append([]) for drawer_task in drawer_task_map_dict.keys(): - avg_sim_visual_matching_results[-1].append(drawer_sim_visual_matching_success[drawer_task][ckpt_alias]) + avg_sim_visual_matching_results[-1].append( + drawer_sim_visual_matching_success[drawer_task][ckpt_alias] + ) avg_real_results[-1].append(drawer_real_success[drawer_task][ckpt_alias]) - avg_sim_visual_matching_results[-1] = np.mean(avg_sim_visual_matching_results[-1]) + avg_sim_visual_matching_results[-1] = np.mean( + avg_sim_visual_matching_results[-1] + ) avg_real_results[-1] = np.mean(avg_real_results[-1]) print("avg_sim_visual_matching_results", avg_sim_visual_matching_results) print("avg_real_results", 
avg_real_results) @@ -844,14 +1015,29 @@ def calc_long_horizon_apple_in_drawer_stats(root_result_dir): pearson_correlation(avg_sim_visual_matching_results, avg_real_results), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt_per_task * n_tasks, avg_sim_visual_matching_results), - construct_unordered_trial_results(n_trials_per_ckpt_per_task * n_tasks, avg_real_results), + construct_unordered_trial_results( + n_trials_per_ckpt_per_task * n_tasks, avg_sim_visual_matching_results + ), + construct_unordered_trial_results( + n_trials_per_ckpt_per_task * n_tasks, avg_real_results + ), "avg kruskal:", ) print("*" * 20) for _ in range(3): print() + + # TODO: process ret dict + ret_dict = { + "apple_in_drawer/matching_avg": avg_sim_visual_matching_results, + "apple_in_drawer/variant_avg": avg_sim_variant_results, + } + # for key, val in drawer_sim_visual_matching_success.items(): + # ret_dict[f"drawer/matching/{key}"] = [succ for _, succ in val.items()] + # for key, val in drawer_sim_variant_success.items(): + # ret_dict[f"drawer/variant/{key}"] = [succ for _, succ in val.items()] + return ret_dict def calc_bridge_put_on_env_stats(root_result_dir): @@ -859,49 +1045,78 @@ def calc_bridge_put_on_env_stats(root_result_dir): # If you use a new checkpoint, please update the real evaluation results here real_partial_success_dict = { "put_spoon_on_tablecloth": { - "rt-1-x": 0.042, - "octo-base": 0.500, - "octo-small": 0.542, - "openvla-7b": 0.10, + # "rt-1-x": 0.042, + # "octo-base": 0.500, + # "openvla-7b": 0.500, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.542, }, "put_carrot_on_plate": { - "rt-1-x": 0.167, - "octo-base": 0.500, - "octo-small": 0.208, - "openvla-7b": 0.10, + # "rt-1-x": 0.167, + # "octo-base": 0.500, + # "openvla-7b": 0.500, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.208, }, "stack_green_block_on_yellow_block": { - "rt-1-x": 0.000, - "octo-base": 0.292, - "octo-small": 0.583, - "openvla-7b": 0.10, + # "rt-1-x": 0.000, + # "octo-base": 0.292, + # "openvla-7b": 0.292, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.583, }, "put_eggplant_in_basket": { - "rt-1-x": 0.000, - "octo-base": 0.400, - "octo-small": 0.600, - "openvla-7b": 0.10, + # "rt-1-x": 0.000, + # "octo-base": 0.400, + # "openvla-7b": 0.400, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.600, }, } real_success_dict = { "put_spoon_on_tablecloth": { - "rt-1-x": 0.000, - "octo-base": 0.333, - "octo-small": 0.417, - "openvla-7b": 0.10, + # "rt-1-x": 0.000, + # "octo-base": 0.333, + # "openvla-7b": 0.333, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.417, + }, + "put_carrot_on_plate": { + # "rt-1-x": 0.00, + # "octo-base": 0.25, + # "octo-small": 0.083, + # "openvla-7b": 0, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, }, - "put_carrot_on_plate": {"rt-1-x": 0.00, "octo-base": 0.25, "octo-small": 0.083}, "stack_green_block_on_yellow_block": { - "rt-1-x": 0.000, - "octo-base": 0.000, - "octo-small": 0.125, - "openvla-7b": 0.10, + # "rt-1-x": 0.000, + # "octo-base": 0.000, + # "openvla-7b": 0.000, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.125, }, "put_eggplant_in_basket": { - "rt-1-x": 0.000, - "octo-base": 0.250, - "octo-small": 0.400, - "openvla-7b": 0.10, + # "rt-1-x": 
0.000, + # "octo-base": 0.250, + # "openvla-7b": 0.250, + # "internvla-1_8b": 0.44, + # "internvla_diff-1_8b": 0.44, + "ours": 0.0, + # "octo-small": 0.400, }, } @@ -945,6 +1160,7 @@ def calc_bridge_put_on_env_stats(root_result_dir): # partial_succ_fail_pattern = ['consecutive_grasp_True', 'consecutive_grasp_False'] # get visual matching success + ret_dict = {} for task in tasks: real_success = real_success_dict[task] real_partial_success = real_partial_success_dict[task] @@ -958,32 +1174,49 @@ def calc_bridge_put_on_env_stats(root_result_dir): # we average octo performance over different random seeds tmp = [] for seed in octo_seed_range: - tmp.extend([f"{variant}_octo_init_rng_{seed}" for variant in base_visual_matching_variants]) + tmp.extend( + [ + f"{variant}_octo_init_rng_{seed}" + for variant in base_visual_matching_variants + ] + ) base_visual_matching_variants = tmp for variant in base_visual_matching_variants: variant = f"{root_result_dir}/{CKPT_MAPPING[ckpt_alias]}/{variant}" avg_sim_success = np.mean( get_dir_stats( variant, - extra_pattern_require=extra_pattern_require_visual_matching[task], + extra_pattern_require=extra_pattern_require_visual_matching[ + task + ], succ_fail_pattern=succ_fail_pattern, ) ) avg_sim_partial_success = np.mean( get_dir_stats( variant, - extra_pattern_require=extra_pattern_require_visual_matching[task], + extra_pattern_require=extra_pattern_require_visual_matching[ + task + ], succ_fail_pattern=partial_succ_fail_pattern, ) ) if np.isnan(avg_sim_success) or np.isnan(avg_sim_partial_success): print(f"WARNING: avg_sim_success is nan for {variant}") sim_visual_matching_success[ckpt_alias].append(avg_sim_success) - sim_visual_matching_partial_success[ckpt_alias].append(avg_sim_partial_success) - sim_visual_matching_success[ckpt_alias] = np.mean(sim_visual_matching_success[ckpt_alias]) - sim_visual_matching_partial_success[ckpt_alias] = np.mean(sim_visual_matching_partial_success[ckpt_alias]) + sim_visual_matching_partial_success[ckpt_alias].append( + avg_sim_partial_success + ) + sim_visual_matching_success[ckpt_alias] = np.mean( + sim_visual_matching_success[ckpt_alias] + ) + sim_visual_matching_partial_success[ckpt_alias] = np.mean( + sim_visual_matching_partial_success[ckpt_alias] + ) - print("sim visual matching partial success", sim_visual_matching_partial_success) + print( + "sim visual matching partial success", sim_visual_matching_partial_success + ) print("real partial success", real_partial_success) print( "visual matching MMRV (partial success)", @@ -1000,8 +1233,12 @@ def calc_bridge_put_on_env_stats(root_result_dir): ), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt, list(sim_visual_matching_partial_success.values())), - construct_unordered_trial_results(n_trials_per_ckpt, list(real_partial_success.values())), + construct_unordered_trial_results( + n_trials_per_ckpt, list(sim_visual_matching_partial_success.values()) + ), + construct_unordered_trial_results( + n_trials_per_ckpt, list(real_partial_success.values()) + ), "avg kruskal (partial success):", ) @@ -1009,15 +1246,23 @@ def calc_bridge_put_on_env_stats(root_result_dir): print("real success", real_success) print( "visual matching MMRV", - mean_maximum_rank_violation(list(sim_visual_matching_success.values()), list(real_success.values())), + mean_maximum_rank_violation( + list(sim_visual_matching_success.values()), list(real_success.values()) + ), ) print( "visual matching pearson correlation", - 
pearson_correlation(list(sim_visual_matching_success.values()), list(real_success.values())), + pearson_correlation( + list(sim_visual_matching_success.values()), list(real_success.values()) + ), ) print_all_kruskal_results( - construct_unordered_trial_results(n_trials_per_ckpt, list(sim_visual_matching_success.values())), - construct_unordered_trial_results(n_trials_per_ckpt, list(real_success.values())), + construct_unordered_trial_results( + n_trials_per_ckpt, list(sim_visual_matching_success.values()) + ), + construct_unordered_trial_results( + n_trials_per_ckpt, list(real_success.values()) + ), "avg kruskal:", ) @@ -1025,40 +1270,69 @@ def calc_bridge_put_on_env_stats(root_result_dir): for _ in range(3): print() + ret_dict[f"{task}/matching_partial"] = [ + succ for _, succ in sim_visual_matching_partial_success.items() + ] + ret_dict[f"{task}/matching_entire"] = [ + succ for _, succ in sim_visual_matching_success.items() + ] + + return ret_dict + # Define checkpoint alias-to-directory mapping; If you use a new checkpoint, please update the dict CKPT_MAPPING = { - "rt-2-x": "rt_2_x", - "rt-1-converged": "rt_1_tf_trained_for_000400120", - "rt-1-15pct": "rt_1_tf_trained_for_000058240", - "rt-1-x": "rt_1_x_tf_trained_for_002272480_step", - "rt-1-begin": "rt_1_tf_trained_for_000001120", - "octo-base": "octo-base", - "octo-small": "octo-small", - "octo-server": "octo-server", - "openvla-7b": "openvla-7b", + # "rt-2-x": "rt_2_x", + # "rt-1-converged": "rt_1_tf_trained_for_000400120", + # "rt-1-15pct": "rt_1_tf_trained_for_000058240", + # "rt-1-x": "rt_1_x_tf_trained_for_002272480_step", + # "rt-1-begin": "rt_1_tf_trained_for_000001120", + # "octo-base": "octo-base", + # "openvla-7b": "openvla-7b", + # "internvla-1_8b": "internvla-1_8b", + # "internvla_diff-1_8b": "internvla_diff-1_8b", + "ours": "TODO", + # "octo-small": "octo-small", + # "octo-server": "octo-server", } parser = argparse.ArgumentParser() -parser.add_argument("--task", type=str, default="pick_coke_can", help="task name") -parser.add_argument("--log-dir-root", type=str, default="./results/", help="log directory") - +parser.add_argument( + "--log-dir-root", type=str, default="./results/", help="log directory" +) args = parser.parse_args() -if args.task == "pick_coke_can": - calc_pick_coke_can_stats(args.log_dir_root) -elif args.task == "move_near": - calc_move_near_stats(args.log_dir_root) -elif args.task == "drawer": - calc_drawer_stats(args.log_dir_root) -elif args.task == "long_horizon_apple_in_drawer": - calc_long_horizon_apple_in_drawer_stats(args.log_dir_root) -elif args.task == "bridge_put_on": - calc_bridge_put_on_env_stats(args.log_dir_root) -else: - raise ValueError(f"Unknown task: {args.task}") +# NOTE: replace the CKPT_MAPPING with the actual checkpoint directories +CKPT_MAPPING["ours"] = Path(args.log_dir_root).name + +pick_coke_can_results = calc_pick_coke_can_stats(str(Path(args.log_dir_root).parent)) +move_near_real_results = calc_move_near_stats(str(Path(args.log_dir_root).parent)) +drawer_results = calc_drawer_stats(str(Path(args.log_dir_root).parent)) +long_horizon_apple_in_drawer_results = calc_long_horizon_apple_in_drawer_stats(str(Path(args.log_dir_root).parent)) +bridge_put_on_results = calc_bridge_put_on_env_stats( + str(Path(args.log_dir_root).parent) +) + +results = { + **pick_coke_can_results, + **move_near_real_results, + **drawer_results, + **bridge_put_on_results, + **long_horizon_apple_in_drawer_results, + "ckpt_name": ["ours"], +} + +df = pd.DataFrame(results) +df_ref = pd.DataFrame(REF) 
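+# NOTE: REF is assumed to hold the reference baseline results table defined
+# earlier in this file; concatenating it below lets "ours" be read side by
+# side with the published baselines.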
+results_df = pd.concat([df, df_ref], axis=0, ignore_index=True)
+results_df = results_df.transpose()
+results_df.to_markdown(f"{args.log_dir_root}/results.md", index=True)
+
+df.loc[0, "ckpt_name"] = CKPT_MAPPING["ours"]
+df.to_csv(f"{args.log_dir_root}/results.csv", index=False, float_format="%.3f")
+print(df)
 exit(0)
 
 """

From 0281e63167f7bb190b90575a0f04b1987e872f91 Mon Sep 17 00:00:00 2001
From: dlqu_0010
Date: Tue, 7 Jan 2025 15:06:09 +0800
Subject: [PATCH 11/13] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index c46c40d8..dd35d541 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-# SimplerEnv: Simulated Manipulation Policy Evaluation Environments for Real Robot Setups
+# SimplerEnv: Simulated Manipulation Policy Evaluation Environments for Real Robot Setups (Multi-model Support 🔥)
 
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/simpler-env/SimplerEnv/blob/main/example.ipynb)
 

From 0936b9e3d8531643584012c5198429b0ac3e901b Mon Sep 17 00:00:00 2001
From: dlqu_0010
Date: Thu, 6 Feb 2025 09:37:01 +0800
Subject: [PATCH 12/13] add spatialvla support, make sure transformers >= 4.47.0

---
 scripts/run_spatialvla.sh                     |  42 +++
 simpler_env/main_inference.py                 |   9 +-
 .../policies/spatialvla/spatialvla_model.py   | 240 ++++++++++++++++++
 3 files changed, 290 insertions(+), 1 deletion(-)
 create mode 100644 scripts/run_spatialvla.sh
 create mode 100644 simpler_env/policies/spatialvla/spatialvla_model.py

diff --git a/scripts/run_spatialvla.sh b/scripts/run_spatialvla.sh
new file mode 100644
index 00000000..370370e8
--- /dev/null
+++ b/scripts/run_spatialvla.sh
@@ -0,0 +1,42 @@
+model_name=spatialvla
+tasks=(
+    bridge.sh
+    drawer_variant_agg.sh
+    drawer_visual_matching.sh
+    move_near_variant_agg.sh
+    move_near_visual_matching.sh
+    pick_coke_can_variant_agg.sh
+    pick_coke_can_visual_matching.sh
+
+    # put_in_drawer_variant_agg.sh
+    # put_in_drawer_visual_matching.sh
+)
+
+ckpts=(
+    IPEC-COMMUNITY/spatialvla-4b-224-pt # or a local path
+)
+
+action_ensemble_temp=-0.8
+for ckpt_path in ${ckpts[@]}; do
+    # 🤗 NOTE: set the HF cache to avoid conflicts
+    # base_dir=$(dirname $ckpt_path)
+    # export HF_MODULES_CACHE=$base_dir/hf_cache/modules
+    # mkdir -p $HF_MODULES_CACHE
+    # logging_dir=$base_dir/simpler_env/$(basename $ckpt_path)${action_ensemble_temp}
+
+    logging_dir=results/$(basename $ckpt_path)${action_ensemble_temp}
+
+    mkdir -p $logging_dir
+    for i in ${!tasks[@]}; do
+        task=${tasks[$i]}
+        echo "🚀 running $task ..."
+        device=0
+        bash scripts/$task $ckpt_path $model_name $action_ensemble_temp $logging_dir $device
+    done
+
+    # compute statistics of the evaluation results
+    echo "🚀 all tasks DONE! Calculating metrics..."
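+    # the metrics script (patched above) writes results.md / results.csv under
+    # $logging_dir and prints a summary table, appended to total.metrics below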
+    python tools/calc_metrics_evaluation_videos.py \
+        --log-dir-root $logging_dir \
+        >>$logging_dir/total.metrics
+done

diff --git a/simpler_env/main_inference.py b/simpler_env/main_inference.py
index 4ac6eb36..97578936 100644
--- a/simpler_env/main_inference.py
+++ b/simpler_env/main_inference.py
@@ -15,7 +15,6 @@
 if __name__ == "__main__":
     args = get_args()
 
-    os.environ["DISPLAY"] = ""
     # prevent a single jax process from taking up all the GPU memory
     os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"
 
@@ -71,6 +70,14 @@
             action_model_type='DiT-L',
             cfg_scale=1.5  # cfg from 1.5 to 7 also performs well
         )
+    elif args.policy_model == "spatialvla":
+        assert args.ckpt_path is not None
+        from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference
+        model = SpatialVLAInference(
+            saved_model_path=args.ckpt_path,
+            policy_setup=args.policy_setup,
+            action_scale=args.action_scale,
+        )
     else:
         raise NotImplementedError()
 
diff --git a/simpler_env/policies/spatialvla/spatialvla_model.py b/simpler_env/policies/spatialvla/spatialvla_model.py
new file mode 100644
index 00000000..d1c2ba8f
--- /dev/null
+++ b/simpler_env/policies/spatialvla/spatialvla_model.py
@@ -0,0 +1,240 @@
+from typing import Optional, Sequence, List
+import os
+import matplotlib.pyplot as plt
+import numpy as np
+from transforms3d.euler import euler2axangle
+from transformers import AutoModel, AutoProcessor
+from collections import deque
+from PIL import Image
+import torch
+import cv2 as cv
+
+from simpler_env.utils.action.action_ensemble import ActionEnsembler
+
+
+class SpatialVLAInference:
+    def __init__(
+        self,
+        saved_model_path: str = "IPEC-COMMUNITY/spatialvla-4b-224-pt",
+        unnorm_key: Optional[str] = None,
+        policy_setup: str = "widowx_bridge",
+        exec_horizon: int = 1,
+        image_size: list[int] = [224, 224],
+        action_scale: float = 1.0,
+        action_ensemble_temp: float = -0.8,
+    ) -> None:
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+        if policy_setup == "widowx_bridge":
+            unnorm_key = "bridge_orig/1.0.0" if unnorm_key is None else unnorm_key
+            action_ensemble = True
+            self.sticky_gripper_num_repeat = 1
+        elif policy_setup == "google_robot":
+            unnorm_key = (
+                "fractal20220817_data/0.1.0" if unnorm_key is None else unnorm_key
+            )
+            action_ensemble = True
+            self.sticky_gripper_num_repeat = 10
+        else:
+            raise NotImplementedError(
+                f"Policy setup {policy_setup} not supported for SpatialVLA models. Other dataset unnorm keys can be found in the Hugging Face config.json file."
+ ) + self.policy_setup = policy_setup + self.unnorm_key = unnorm_key + + print(f"*** policy_setup: {policy_setup}, unnorm_key: {unnorm_key} ***") + self.processor = AutoProcessor.from_pretrained( + saved_model_path, trust_remote_code=True + ) + self.vla = ( + AutoModel.from_pretrained( + saved_model_path, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + ) + .eval() + .cuda() + ) + + self.image_size = image_size + self.action_scale = action_scale + self.obs_horizon = (self.processor.num_obs_steps - 1) * self.processor.obs_delta + 1 + self.obs_interval = self.processor.obs_delta + self.pred_action_horizon = self.processor.action_chunk_size + self.image_history = deque(maxlen=self.obs_horizon) + self.exec_horizon = exec_horizon + + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + self.action_ensemble = action_ensemble + self.action_ensemble_temp = action_ensemble_temp + + if self.action_ensemble: + self.action_ensembler = ActionEnsembler( + self.pred_action_horizon, self.action_ensemble_temp + ) + else: + self.action_ensembler = None + + self.task = None + self.task_description = None + + def reset(self, task_description: str) -> None: + self.image_history.clear() + if self.action_ensemble: + self.action_ensembler.reset() + self.task_description = task_description + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + self.previous_gripper_action = None + + def step( + self, image: np.ndarray, task_description: Optional[str] = None, *args, **kwargs + ) -> tuple[dict[str, np.ndarray], dict[str, np.ndarray]]: + """ + Input: + image: np.ndarray of shape (H, W, 3), uint8 + task_description: Optional[str], task description; if different from previous task description, policy state is reset + Output: + raw_action: dict; raw policy action output + action: dict; processed action to be sent to the maniskill2 environment, with the following keys: + - 'world_vector': np.ndarray of shape (3,), xyz translation of robot end-effector + - 'rot_axangle': np.ndarray of shape (3,), axis-angle representation of end-effector rotation + - 'gripper': np.ndarray of shape (1,), gripper action + - 'terminate_episode': np.ndarray of shape (1,), 1 if episode should be terminated, 0 otherwise + """ + if task_description is not None: + if task_description != self.task_description: + self.reset(task_description) + + assert image.dtype == np.uint8 + image = self._resize_image(image) + self._add_image_to_history(image) + images: List[Image.Image] = self._obtain_image_history() + prompt = task_description + + # predict action (7-dof; un-normalize for bridgev2) + inputs = self.processor(images=images, text=prompt, unnorm_key=self.unnorm_key, return_tensors="pt", do_normalize=False) + with torch.no_grad(): + if hasattr(self.processor, "action_tokenizer"): + generation_outputs = self.vla.predict_action(inputs) + raw_actions = self.processor.decode_actions( + generation_outputs=generation_outputs, + unnorm_key=self.unnorm_key, + )["actions"] + else: + raw_actions = self.vla.predict_action(**inputs)["actions"] + raw_actions = raw_actions.cpu().numpy() + + if self.action_ensemble: + raw_actions = self.action_ensembler.ensemble_action(raw_actions)[None] + + raw_action = { + "world_vector": np.array(raw_actions[0, :3]), + "rotation_delta": np.array(raw_actions[0, 3:6]), + "open_gripper": np.array( + raw_actions[0, 6:7] + ), # range [0, 1]; 1 = open; 0 = close + } + + # process 
raw_action to obtain the action to be sent to the maniskill2 environment + action = {} + action["world_vector"] = raw_action["world_vector"] * self.action_scale + action_rotation_delta = np.asarray( + raw_action["rotation_delta"], dtype=np.float64 + ) + roll, pitch, yaw = action_rotation_delta + action_rotation_ax, action_rotation_angle = euler2axangle(roll, pitch, yaw) + action_rotation_axangle = action_rotation_ax * action_rotation_angle + action["rot_axangle"] = action_rotation_axangle * self.action_scale + + if self.policy_setup == "google_robot": + action["gripper"] = 0 + current_gripper_action = raw_action["open_gripper"] + if self.previous_gripper_action is None: + relative_gripper_action = np.array([0]) + self.previous_gripper_action = current_gripper_action + else: + relative_gripper_action = self.previous_gripper_action - current_gripper_action + # fix a bug in the SIMPLER code here + # self.previous_gripper_action = current_gripper_action + + if np.abs(relative_gripper_action) > 0.5 and (not self.sticky_action_is_on): + self.sticky_action_is_on = True + self.sticky_gripper_action = relative_gripper_action + self.previous_gripper_action = current_gripper_action + + if self.sticky_action_is_on: + self.gripper_action_repeat += 1 + relative_gripper_action = self.sticky_gripper_action + + if self.gripper_action_repeat == self.sticky_gripper_num_repeat: + self.sticky_action_is_on = False + self.gripper_action_repeat = 0 + self.sticky_gripper_action = 0.0 + + action["gripper"] = relative_gripper_action + + elif self.policy_setup == "widowx_bridge": + action["gripper"] = 2.0 * (raw_action["open_gripper"] > 0.5) - 1.0 + + action["terminate_episode"] = np.array([0.0]) + return raw_action, action + + def _resize_image(self, image: np.ndarray) -> np.ndarray: + image = cv.resize(image, tuple(self.image_size), interpolation=cv.INTER_AREA) + return image + + def _add_image_to_history(self, image: np.ndarray) -> None: + if len(self.image_history) == 0: + self.image_history.extend([image] * self.obs_horizon) + else: + self.image_history.append(image) + + def _obtain_image_history(self) -> List[Image.Image]: + image_history = list(self.image_history) + images = image_history[:: self.obs_interval] + images = [Image.fromarray(image).convert("RGB") for image in images] + return images + + def visualize_epoch( + self, + predicted_raw_actions: Sequence[np.ndarray], + images: Sequence[np.ndarray], + save_path: str, + ) -> None: + images = [self._resize_image(image) for image in images] + ACTION_DIM_LABELS = ["x", "y", "z", "roll", "pitch", "yaw", "grasp"] + + img_strip = np.concatenate(np.array(images[::3]), axis=1) + + # set up plt figure + figure_layout = [["image"] * len(ACTION_DIM_LABELS), ACTION_DIM_LABELS] + plt.rcParams.update({"font.size": 12}) + fig, axs = plt.subplot_mosaic(figure_layout) + fig.set_size_inches([45, 10]) + + # plot actions + pred_actions = np.array( + [ + np.concatenate( + [a["world_vector"], a["rotation_delta"], a["open_gripper"]], axis=-1 + ) + for a in predicted_raw_actions + ] + ) + for action_dim, action_label in enumerate(ACTION_DIM_LABELS): + # actions have batch, horizon, dim, in this example we just take the first action for simplicity + axs[action_label].plot( + pred_actions[:, action_dim], label="predicted action" + ) + axs[action_label].set_title(action_label) + axs[action_label].set_xlabel("Time in one episode") + + axs["image"].imshow(img_strip) + axs["image"].set_xlabel("Time in one episode (subsampled)") + plt.legend() + plt.savefig(save_path) From 
8e9aaaae456f336d4db94171e53db25ca0905ab1 Mon Sep 17 00:00:00 2001
From: dlqu_0010
Date: Thu, 6 Feb 2025 09:43:39 +0800
Subject: [PATCH 13/13] add spatialvla support, transformers >= 4.47.0

---
 README.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/README.md b/README.md
index dd35d541..799dc18c 100644
--- a/README.md
+++ b/README.md
@@ -30,6 +30,14 @@ We hope that our work guides and inspires future real-to-sim evaluation efforts.
 - [Troubleshooting](#troubleshooting)
 - [Citation](#citation)
 
+## Models
+| Model Name | Support | Note |
+| ---------- | ------- | ---- |
+| Octo | ✅ | |
+| RT-1 | ✅ | |
+| OpenVLA | ✅ | |
+| CogACT | ✅ | OpenVLA-based |
+| SpatialVLA | ✅ | transformers >= 4.47.0 |
 
 ## Getting Started
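
For reference, the snippet below sketches how the new `SpatialVLAInference` wrapper plugs into a SimplerEnv rollout. It is a minimal sketch, not part of the patches above: it assumes the `simpler_env.make` / `get_language_instruction` / `get_image_from_maniskill2_obs_dict` helpers from the repository README, and the 7-DoF action concatenation used by `simpler_env/main_inference.py`; the environment name and checkpoint path are illustrative.

```python
import numpy as np
import simpler_env
from simpler_env.utils.env.observation_utils import get_image_from_maniskill2_obs_dict
from simpler_env.policies.spatialvla.spatialvla_model import SpatialVLAInference

# build a visual-matching task and the SpatialVLA policy (path is illustrative)
env = simpler_env.make("google_robot_pick_coke_can")
model = SpatialVLAInference(
    saved_model_path="IPEC-COMMUNITY/spatialvla-4b-224-pt",
    policy_setup="google_robot",
)

obs, reset_info = env.reset()
instruction = env.get_language_instruction()
model.reset(instruction)

done, truncated = False, False
while not (done or truncated):
    image = get_image_from_maniskill2_obs_dict(env, obs)  # (H, W, 3) uint8 RGB
    raw_action, action = model.step(image, instruction)
    # concatenate to the 7-DoF control expected by the ManiSkill2 env,
    # mirroring what main_inference.py does with the returned action dict
    obs, reward, done, truncated, info = env.step(
        np.concatenate(
            [
                action["world_vector"],
                action["rot_axangle"],
                np.atleast_1d(action["gripper"]),
            ]
        )
    )
    instruction = env.get_language_instruction()  # may advance on long-horizon tasks
print("episode success:", info.get("success"))
```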