diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..787002f --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*/__pycache__/ +__pycache__/ diff --git a/.gitmodules b/.gitmodules index 528b0dc..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +0,0 @@ -[submodule "third-party/DPVO"] - path = third-party/DPVO - url = https://github.com/princeton-vl/DPVO.git -[submodule "third-party/ViTPose"] - path = third-party/ViTPose - url = https://github.com/ViTAE-Transformer/ViTPose.git diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d707024 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,33 @@ +ARG PYTORCH="2.0.0" +ARG CUDA="11.7" +ARG CUDNN="8" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" +ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" + +RUN apt-get update && apt-get install -y wget git ninja-build unzip libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx\ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ENV FORCE_CUDA="1" +ENV CUDA_HOME=/usr/local/cuda +RUN pip install fvcore iopath +RUN wget https://github.com/NVIDIA/cub/archive/refs/tags/1.17.2.tar.gz && tar xzf 1.17.2.tar.gz +ENV CUB_HOME=/workspace/cub-1.17.2 +ENV TORCH_CUDA_ARCH_LIST="3.5;5.0;6.0;6.1;7.0;7.5;8.0;8.6+PTX" +RUN pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/py310_cu117_pyt200/download.html + +RUN MMCV_WITH_OPS=1 pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu117/torch2.0.0/index.html +RUN pip install mmdet==3.1.0 mmpose==1.3.0 mmengine==0.8.3 mmpretrain==1.2.0 + +RUN pip install https://data.pyg.org/whl/torch-2.0.0%2Bcu117/torch_scatter-2.1.2%2Bpt20cu117-cp310-cp310-linux_x86_64.whl +RUN git clone https://github.com/princeton-vl/DPVO.git && cd DPVO && git checkout 5833835 && wget https://gitlab.com/libeigen/eigen/-/archive/3.4.0/eigen-3.4.0.zip && unzip eigen-3.4.0.zip -d thirdparty 
&& rm -rf eigen-3.4.0.zip && pip install -e . + +COPY ./ /WHAM/ +WORKDIR /WHAM/ +RUN pip install -e . && pip install -r requirements.txt +ENV WHAM_ROOT='/WHAM/' + +RUN bash ./fetch_demo_data.sh diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..67800ac --- /dev/null +++ b/Makefile @@ -0,0 +1,17 @@ +# Variables +PYTHON := python3 +DOCKER := docker +PACKAGE_NAME := wham + +# Targets +.PHONY: install demo docker-image + +install: + $(PYTHON) -m pip install . + +docker-image: + $(DOCKER) build . -t $(PACKAGE_NAME) + +demo: + @echo Running on file $(video_name) in folder $(directory) + $(DOCKER) run -v $(directory):/input_data --rm --gpus all $(PACKAGE_NAME) bash /WHAM/run_demo.sh /input_data/$(video_name) \ No newline at end of file diff --git a/configs/DPVO/default.yaml b/configs/DPVO/default.yaml new file mode 100644 index 0000000..5e69ebc --- /dev/null +++ b/configs/DPVO/default.yaml @@ -0,0 +1,19 @@ +### DPVO Config File ### + +# VO config (increase for better accuracy) +PATCHES_PER_FRAME: 96 +REMOVAL_WINDOW: 22 +OPTIMIZATION_WINDOW: 10 +PATCH_LIFETIME: 13 + +# threshold for keyframe removal +KEYFRAME_THRESH: 15.0 + +# camera motion model +MOTION_MODEL: 'DAMPED_LINEAR' +MOTION_DAMPING: 0.5 + +# maybe use mixed precision for inference +MIXED_PRECISION: True + +GRADIENT_BIAS: False diff --git a/configs/VIT/coco.py b/configs/VIT/coco.py new file mode 100644 index 0000000..865a95b --- /dev/null +++ b/configs/VIT/coco.py @@ -0,0 +1,181 @@ +dataset_info = dict( + dataset_name='coco', + paper_info=dict( + author='Lin, Tsung-Yi and Maire, Michael and ' + 'Belongie, Serge and Hays, James and ' + 'Perona, Pietro and Ramanan, Deva and ' + r'Doll{\'a}r, Piotr and Zitnick, C Lawrence', + title='Microsoft coco: Common objects in context', + container='European conference on computer vision', + year='2014', + homepage='http://cocodataset.org/', + ), + keypoint_info={ + 0: + dict(name='nose', id=0, color=[51, 153, 255], type='upper', swap=''), + 1: + dict( + name='left_eye', + id=1, 
color=[51, 153, 255], + type='upper', + swap='right_eye'), + 2: + dict( + name='right_eye', + id=2, + color=[51, 153, 255], + type='upper', + swap='left_eye'), + 3: + dict( + name='left_ear', + id=3, + color=[51, 153, 255], + type='upper', + swap='right_ear'), + 4: + dict( + name='right_ear', + id=4, + color=[51, 153, 255], + type='upper', + swap='left_ear'), + 5: + dict( + name='left_shoulder', + id=5, + color=[0, 255, 0], + type='upper', + swap='right_shoulder'), + 6: + dict( + name='right_shoulder', + id=6, + color=[255, 128, 0], + type='upper', + swap='left_shoulder'), + 7: + dict( + name='left_elbow', + id=7, + color=[0, 255, 0], + type='upper', + swap='right_elbow'), + 8: + dict( + name='right_elbow', + id=8, + color=[255, 128, 0], + type='upper', + swap='left_elbow'), + 9: + dict( + name='left_wrist', + id=9, + color=[0, 255, 0], + type='upper', + swap='right_wrist'), + 10: + dict( + name='right_wrist', + id=10, + color=[255, 128, 0], + type='upper', + swap='left_wrist'), + 11: + dict( + name='left_hip', + id=11, + color=[0, 255, 0], + type='lower', + swap='right_hip'), + 12: + dict( + name='right_hip', + id=12, + color=[255, 128, 0], + type='lower', + swap='left_hip'), + 13: + dict( + name='left_knee', + id=13, + color=[0, 255, 0], + type='lower', + swap='right_knee'), + 14: + dict( + name='right_knee', + id=14, + color=[255, 128, 0], + type='lower', + swap='left_knee'), + 15: + dict( + name='left_ankle', + id=15, + color=[0, 255, 0], + type='lower', + swap='right_ankle'), + 16: + dict( + name='right_ankle', + id=16, + color=[255, 128, 0], + type='lower', + swap='left_ankle') + }, + skeleton_info={ + 0: + dict(link=('left_ankle', 'left_knee'), id=0, color=[0, 255, 0]), + 1: + dict(link=('left_knee', 'left_hip'), id=1, color=[0, 255, 0]), + 2: + dict(link=('right_ankle', 'right_knee'), id=2, color=[255, 128, 0]), + 3: + dict(link=('right_knee', 'right_hip'), id=3, color=[255, 128, 0]), + 4: + dict(link=('left_hip', 'right_hip'), id=4, color=[51, 153, 255]), 
+ 5: + dict(link=('left_shoulder', 'left_hip'), id=5, color=[51, 153, 255]), + 6: + dict(link=('right_shoulder', 'right_hip'), id=6, color=[51, 153, 255]), + 7: + dict( + link=('left_shoulder', 'right_shoulder'), + id=7, + color=[51, 153, 255]), + 8: + dict(link=('left_shoulder', 'left_elbow'), id=8, color=[0, 255, 0]), + 9: + dict( + link=('right_shoulder', 'right_elbow'), id=9, color=[255, 128, 0]), + 10: + dict(link=('left_elbow', 'left_wrist'), id=10, color=[0, 255, 0]), + 11: + dict(link=('right_elbow', 'right_wrist'), id=11, color=[255, 128, 0]), + 12: + dict(link=('left_eye', 'right_eye'), id=12, color=[51, 153, 255]), + 13: + dict(link=('nose', 'left_eye'), id=13, color=[51, 153, 255]), + 14: + dict(link=('nose', 'right_eye'), id=14, color=[51, 153, 255]), + 15: + dict(link=('left_eye', 'left_ear'), id=15, color=[51, 153, 255]), + 16: + dict(link=('right_eye', 'right_ear'), id=16, color=[51, 153, 255]), + 17: + dict(link=('left_ear', 'left_shoulder'), id=17, color=[51, 153, 255]), + 18: + dict( + link=('right_ear', 'right_shoulder'), id=18, color=[51, 153, 255]) + }, + joint_weights=[ + 1., 1., 1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, + 1.5 + ], + sigmas=[ + 0.026, 0.025, 0.025, 0.035, 0.035, 0.079, 0.079, 0.072, 0.072, 0.062, + 0.062, 0.107, 0.107, 0.087, 0.087, 0.089, 0.089 + ]) diff --git a/configs/VIT/default_runtime.py b/configs/VIT/default_runtime.py new file mode 100644 index 0000000..8a90d77 --- /dev/null +++ b/configs/VIT/default_runtime.py @@ -0,0 +1,54 @@ +default_scope = 'mmpose' + +# hooks +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', interval=10), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='PoseVisualizationHook', enable=False), + badcase=dict( + type='BadCaseAnalysisHook', + enable=False, + out_dir='badcase', + metric_type='loss', + 
badcase_thr=5)) + +# custom hooks +custom_hooks = [ + # Synchronize model buffers such as running_mean and running_var in BN + # at the end of each epoch + dict(type='SyncBuffersHook') +] + +# multi-processing backend +env_cfg = dict( + cudnn_benchmark=False, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) + +# visualizer +vis_backends = [ + dict(type='LocalVisBackend'), + # dict(type='TensorboardVisBackend'), + # dict(type='WandbVisBackend'), +] +visualizer = dict( + type='PoseLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# logger +log_processor = dict( + type='LogProcessor', window_size=50, by_epoch=True, num_digits=6) +log_level = 'INFO' +load_from = None +resume = False + +# file I/O backend +backend_args = dict(backend='local') + +# training/validation/testing progress +train_cfg = dict(by_epoch=True) +val_cfg = dict() +test_cfg = dict() \ No newline at end of file diff --git a/configs/VIT/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py b/configs/VIT/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py new file mode 100644 index 0000000..9601771 --- /dev/null +++ b/configs/VIT/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py @@ -0,0 +1,155 @@ +_base_ = ['default_runtime.py'] + +# runtime +train_cfg = dict(max_epochs=210, val_interval=10) + +# optimizer +custom_imports = dict( + imports=['mmpose.engine.optim_wrappers.layer_decay_optim_wrapper'], + allow_failed_imports=False) + +optim_wrapper = dict( + optimizer=dict( + type='AdamW', lr=5e-4, betas=(0.9, 0.999), weight_decay=0.1), + paramwise_cfg=dict( + num_layers=12, + layer_decay_rate=0.8, + custom_keys={ + 'bias': dict(decay_multi=0.0), + 'pos_embed': dict(decay_mult=0.0), + 'relative_position_bias_table': dict(decay_mult=0.0), + 'norm': dict(decay_mult=0.0), + }, + ), + constructor='LayerDecayOptimWrapperConstructor', + clip_grad=dict(max_norm=1., norm_type=2), +) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', begin=0, 
end=500, start_factor=0.001, + by_epoch=False), # warm-up + dict( + type='MultiStepLR', + begin=0, + end=210, + milestones=[170, 200], + gamma=0.1, + by_epoch=True) +] + +# automatically scaling LR based on the actual training batch size +auto_scale_lr = dict(base_batch_size=512) + +# hooks +default_hooks = dict( + checkpoint=dict(save_best='coco/AP', rule='greater', max_keep_ckpts=1)) + +# codec settings +codec = dict( + type='UDPHeatmap', input_size=(192, 256), heatmap_size=(48, 64), sigma=2) + +# model settings +model = dict( + type='TopdownPoseEstimator', + data_preprocessor=dict( + type='PoseDataPreprocessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True), + backbone=dict( + type='mmpretrain.VisionTransformer', + arch={ + 'embed_dims': 384, + 'num_layers': 12, + 'num_heads': 12, + 'feedforward_channels': 384 * 4 + }, + img_size=(256, 192), + patch_size=16, + qkv_bias=True, + drop_path_rate=0.1, + with_cls_token=False, + out_type='featmap', + patch_cfg=dict(padding=2), + init_cfg=dict( + type='Pretrained', + checkpoint='https://download.openmmlab.com/mmpose/' + 'v1/pretrained_models/mae_pretrain_vit_small_20230913.pth'), + ), + head=dict( + type='HeatmapHead', + in_channels=384, + out_channels=17, + deconv_out_channels=(256, 256), + deconv_kernel_sizes=(4, 4), + loss=dict(type='KeypointMSELoss', use_target_weight=True), + decoder=codec), + test_cfg=dict( + flip_test=True, + flip_mode='heatmap', + shift_heatmap=False, + )) + +# base dataset settings +data_root = 'data/coco/' +dataset_type = 'CocoDataset' +data_mode = 'topdown' + +# pipelines +train_pipeline = [ + dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='RandomFlip', direction='horizontal'), + dict(type='RandomHalfBody'), + dict(type='RandomBBoxTransform'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='GenerateTarget', encoder=codec), + dict(type='PackPoseInputs') +] +val_pipeline = [ + 
dict(type='LoadImage'), + dict(type='GetBBoxCenterScale'), + dict(type='TopdownAffine', input_size=codec['input_size'], use_udp=True), + dict(type='PackPoseInputs') +] + +# data loaders +train_dataloader = dict( + batch_size=64, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_train2017.json', + data_prefix=dict(img='train2017/'), + pipeline=train_pipeline, + )) +val_dataloader = dict( + batch_size=32, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False, round_up=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_mode=data_mode, + ann_file='annotations/person_keypoints_val2017.json', + bbox_file='data/coco/person_detection_results/' + 'COCO_val2017_detections_AP_H_56_person.json', + data_prefix=dict(img='val2017/'), + test_mode=True, + pipeline=val_pipeline, + )) +test_dataloader = val_dataloader + +# evaluators +val_evaluator = dict( + type='CocoMetric', + ann_file=data_root + 'annotations/person_keypoints_val2017.json') +test_evaluator = val_evaluator \ No newline at end of file diff --git a/configs/config.py b/configs/config.py index 1ee9c2d..47e6891 100644 --- a/configs/config.py +++ b/configs/config.py @@ -52,6 +52,20 @@ cfg.LOSS.LOSS_WEIGHT = 60. 
cfg.LOSS.CAM_LOSS_EPOCH = 5 +cfg.MMPOSE_CFG = CN() +cfg.MMPOSE_CFG.POSE_CONFIG='' +cfg.MMPOSE_CFG.POSE_CHECKPOINT='' +cfg.MMPOSE_CFG.DET_CONFIG='' +cfg.MMPOSE_CFG.DET_CHECKPOINT='' +cfg.MMPOSE_CFG.BBOX_CONF = 0.5 +cfg.MMPOSE_CFG.TRACKING_THR = 0.1 +cfg.MMPOSE_CFG.MINIMUM_FRMAES = 30 + +cfg.DPVO = CN() +cfg.DPVO.CFG = '' +cfg.DPVO.CKPT = '' + +cfg.FEATURES_EXTR_CKPT = '' def get_cfg_defaults(): """Get a yacs CfgNode object with default values for my_project.""" diff --git a/configs/yamls/demo.py b/configs/yamls/demo.py new file mode 100644 index 0000000..6ab60ce --- /dev/null +++ b/configs/yamls/demo.py @@ -0,0 +1,35 @@ +import os +from yacs.config import CfgNode as CN + +get_env_var = os.environ.get + +cfg = CN() + +cfg.LOGDIR='' +cfg.DEVICE='cuda' +cfg.EXP_NAME='demo' +cfg.OUTPUT_DIR='experiments/' +cfg.NUM_WORKERS=0 +cfg.MODEL_CONFIG=get_env_var('WHAM_CFG', './configs/yamls/model_base.yaml') + +cfg.TRAIN = CN() +cfg.TRAIN.STAGE = 'stage2' +cfg.TRAIN.CHECKPOINT = get_env_var('WHAM_CKPT', './checkpoints/wham_vit_bedlam_w_3dpw.pth.tar') + +cfg.MODEL = CN() +cfg.MODEL.BACKBONE = 'vit' + +cfg.MMPOSE_CFG = CN() +cfg.MMPOSE_CFG.POSE_CONFIG = get_env_var('POSE2D_CFG', './configs/VIT/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py') +cfg.MMPOSE_CFG.POSE_CHECKPOINT = get_env_var('POSE2D_CKPT', 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth') +cfg.MMPOSE_CFG.DET_CONFIG = get_env_var('DETECTOR_CFG', './configs/rtmdet_m_8xb32-300e_coco.py') +cfg.MMPOSE_CFG.DET_CHECKPOINT = get_env_var('DETECTOR_CKPT', './checkpoints/yolov8m.pt') +cfg.MMPOSE_CFG.BBOX_CONF = 0.5 +cfg.MMPOSE_CFG.TRACKING_THR = 0.1 +cfg.MMPOSE_CFG.MINIMUM_FRMAES = 30 + +cfg.DPVO = CN() +cfg.DPVO.CFG = get_env_var('DPVO_CFG', './configs/DPVO/default.yaml') +cfg.DPVO.CKPT = get_env_var('DPVO_CKPT', './checkpoints/dpvo.pth') + +cfg.FEATURES_EXTR_CKPT = get_env_var('FEATURES_EXTR_CKPT', './checkpoints/hmr2a.ckpt') diff 
--git a/configs/yamls/demo.yaml b/configs/yamls/demo.yaml index efe9a07..6f55030 100644 --- a/configs/yamls/demo.yaml +++ b/configs/yamls/demo.yaml @@ -10,4 +10,19 @@ TRAIN: CHECKPOINT: 'checkpoints/wham_vit_bedlam_w_3dpw.pth.tar' MODEL: - BACKBONE: 'vit' \ No newline at end of file + BACKBONE: 'vit' + +MMPOSE_CFG: + POSE_CONFIG: 'configs/VIT/td-hm_ViTPose-small_8xb64-210e_coco-256x192.py' + POSE_CHECKPOINT: 'https://download.openmmlab.com/mmpose/v1/body_2d_keypoint/topdown_heatmap/coco/td-hm_ViTPose-small_8xb64-210e_coco-256x192-62d7a712_20230314.pth' + DET_CONFIG: '' + DET_CHECKPOINT: 'checkpoints/yolov8m.pt' + BBOX_CONF: 0.5 + TRACKING_THR: 0.1 + MINIMUM_FRMAES: 30 + +DPVO: + CFG: 'configs/DPVO/default.yaml' + CKPT: 'checkpoints/dpvo.pth' + +FEATURES_EXTR_CKPT: 'checkpoints/hmr2a.ckpt' \ No newline at end of file diff --git a/demo.py b/demo.py index 23ce9a9..1e2810f 100644 --- a/demo.py +++ b/demo.py @@ -17,13 +17,13 @@ from progress.bar import Bar from configs.config import get_cfg_defaults -from lib.data._custom import CustomDataset -from lib.models import build_network, build_body_model -from lib.models.preproc.detector import DetectionModel -from lib.models.preproc.extractor import FeatureExtractor +from wham.data._custom import CustomDataset +from wham.models import build_network, build_body_model +from wham.models.preproc.detector import DetectionModel +from wham.models.preproc.extractor import FeatureExtractor try: - from lib.models.preproc.slam import SLAMModel + from wham.models.preproc.slam import SLAMModel _run_global = True except: logger.info('DPVO is not properly installed. 
Only estimate in local coordinates !') @@ -51,7 +51,7 @@ def run(cfg, osp.exists(osp.join(output_pth, 'slam_results.pth'))): detector = DetectionModel(cfg.DEVICE.lower()) - extractor = FeatureExtractor(cfg.DEVICE.lower()) + extractor = FeatureExtractor(cfg.FEATURES_EXTR_CKPT, cfg.DEVICE.lower()) if run_global: slam = SLAMModel(video, output_pth, width, height, calib) else: slam = None @@ -120,7 +120,7 @@ def run(cfg, # Visualize if visualize: - from lib.vis.run_vis import run_vis_on_demo + from wham.vis.run_vis import run_vis_on_demo run_vis_on_demo(cfg, video, results, output_pth, network.smpl, vis_global=run_global) if __name__ == '__main__': diff --git a/fetch_demo_data.sh b/fetch_demo_data.sh index edbc6f4..7c65b42 100644 --- a/fetch_demo_data.sh +++ b/fetch_demo_data.sh @@ -40,8 +40,7 @@ gdown "https://drive.google.com/uc?id=1i7kt9RlCCCNEW2aYaDWVr-G778JkLNcB&export=d gdown "https://drive.google.com/uc?id=19qkI-a6xuwob9_RFNSPWf1yWErwVVlks&export=download&confirm=t" -O 'checkpoints/wham_vit_bedlam_w_3dpw.pth.tar' gdown "https://drive.google.com/uc?id=1J6l8teyZrL0zFzHhzkC7efRhU0ZJ5G9Y&export=download&confirm=t" -O 'checkpoints/hmr2a.ckpt' gdown "https://drive.google.com/uc?id=1kXTV4EYb-BI3H7J-bkR3Bc4gT9zfnHGT&export=download&confirm=t" -O 'checkpoints/dpvo.pth' -gdown "https://drive.google.com/uc?id=1zJ0KP23tXD42D47cw1Gs7zE2BA_V_ERo&export=download&confirm=t" -O 'checkpoints/yolov8x.pt' -gdown "https://drive.google.com/uc?id=1xyF7F3I7lWtdq82xmEPVQ5zl4HaasBso&export=download&confirm=t" -O 'checkpoints/vitpose-h-multi-coco.pth' +wget "https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8m.pt" -O 'checkpoints/yolov8m.pt' # Demo videos gdown "https://drive.google.com/uc?id=1KjfODCcOUm_xIMLLR54IcjJtf816Dkc7&export=download&confirm=t" -O 'examples.tar.gz' diff --git a/lib/utils/utils.py b/lib/utils/utils.py deleted file mode 100644 index 0d4628e..0000000 --- a/lib/utils/utils.py +++ /dev/null @@ -1,252 +0,0 @@ -# -*- coding: utf-8 -*- - -# 
Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is -# holder of all proprietary rights on this computer program. -# You can only use this computer program if you have closed -# a license agreement with MPG or you get the right to use the computer -# program from someone who is authorized to grant you that right. -# Any use of the computer program without a valid license is prohibited and -# liable to prosecution. -# -# Copyright©2019 Max-Planck-Gesellschaft zur Förderung -# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute -# for Intelligent Systems. All rights reserved. -# -# Contact: ps-license@tuebingen.mpg.de - -import os -import yaml -import torch -import shutil -import logging -import operator -from tqdm import tqdm -from os import path as osp -from functools import reduce -from typing import List, Union -from collections import OrderedDict -from torch.optim.lr_scheduler import _LRScheduler - -class CustomScheduler(_LRScheduler): - def __init__(self, optimizer, lr_lambda): - self.lr_lambda = lr_lambda - super(CustomScheduler, self).__init__(optimizer) - - def get_lr(self): - return [base_lr * self.lr_lambda(self.last_epoch) - for base_lr in self.base_lrs] - -def lr_decay_fn(epoch): - if epoch == 0: return 1.0 - if epoch % big_epoch == 0: - return big_decay - else: - return small_decay - -def save_obj(v, f, file_name='output.obj'): - obj_file = open(file_name, 'w') - for i in range(len(v)): - obj_file.write('v ' + str(v[i][0]) + ' ' + str(v[i][1]) + ' ' + str(v[i][2]) + '\n') - for i in range(len(f)): - obj_file.write('f ' + str(f[i][0]+1) + '/' + str(f[i][0]+1) + ' ' + str(f[i][1]+1) + '/' + str(f[i][1]+1) + ' ' + str(f[i][2]+1) + '/' + str(f[i][2]+1) + '\n') - obj_file.close() - - -def check_data_pararell(train_weight): - new_state_dict = OrderedDict() - for k, v in train_weight.items(): - name = k[7:] if k.startswith('module') else k # remove `module.` - new_state_dict[name] = v - return new_state_dict - - -def 
get_from_dict(dict, keys): - return reduce(operator.getitem, keys, dict) - - -def tqdm_enumerate(iter): - i = 0 - for y in tqdm(iter): - yield i, y - i += 1 - - -def iterdict(d): - for k,v in d.items(): - if isinstance(v, dict): - d[k] = dict(v) - iterdict(v) - return d - - -def accuracy(output, target): - _, pred = output.topk(1) - pred = pred.view(-1) - - correct = pred.eq(target).sum() - - return correct.item(), target.size(0) - correct.item() - - -def lr_decay(optimizer, step, lr, decay_step, gamma): - lr = lr * gamma ** (step/decay_step) - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr - - -def step_decay(optimizer, step, lr, decay_step, gamma): - lr = lr * gamma ** (step / decay_step) - for param_group in optimizer.param_groups: - param_group['lr'] = lr - return lr - - -def read_yaml(filename): - return yaml.load(open(filename, 'r')) - - -def write_yaml(filename, object): - with open(filename, 'w') as f: - yaml.dump(object, f) - - -def save_dict_to_yaml(obj, filename, mode='w'): - with open(filename, mode) as f: - yaml.dump(obj, f, default_flow_style=False) - - -def save_to_file(obj, filename, mode='w'): - with open(filename, mode) as f: - f.write(obj) - - -def concatenate_dicts(dict_list, dim=0): - rdict = dict.fromkeys(dict_list[0].keys()) - for k in rdict.keys(): - rdict[k] = torch.cat([d[k] for d in dict_list], dim=dim) - return rdict - - -def bool_to_string(x: Union[List[bool],bool]) -> Union[List[str],str]: - """ - boolean to string conversion - :param x: list or bool to be converted - :return: string converted thing - """ - if isinstance(x, bool): - return [str(x)] - for i, j in enumerate(x): - x[i]=str(j) - return x - - -def checkpoint2model(checkpoint, key='gen_state_dict'): - state_dict = checkpoint[key] - print(f'Performance of loaded model on 3DPW is {checkpoint["performance"]:.2f}mm') - # del state_dict['regressor.mean_theta'] - return state_dict - - -def get_optimizer(cfg, model, optim_type, momentum, stage): - 
if stage == 'stage2': - param_list = [{'params': model.integrator.parameters()}] - for name, param in model.named_parameters(): - # if 'integrator' not in name and 'motion_encoder' not in name and 'trajectory_decoder' not in name: - if 'integrator' not in name: - param_list.append({'params': param, 'lr': cfg.TRAIN.LR_FINETUNE}) - else: - param_list = [{'params': model.parameters()}] - - if optim_type in ['sgd', 'SGD']: - opt = torch.optim.SGD(lr=cfg.TRAIN.LR, params=param_list, momentum=momentum) - elif optim_type in ['Adam', 'adam', 'ADAM']: - opt = torch.optim.Adam(lr=cfg.TRAIN.LR, params=param_list, weight_decay=cfg.TRAIN.WD, betas=(0.9, 0.999)) - else: - raise ModuleNotFoundError - - return opt - - -def create_logger(logdir, phase='train'): - os.makedirs(logdir, exist_ok=True) - - log_file = osp.join(logdir, f'{phase}_log.txt') - - head = '%(asctime)-15s %(message)s' - logging.basicConfig(filename=log_file, - format=head) - logger = logging.getLogger() - logger.setLevel(logging.INFO) - console = logging.StreamHandler() - logging.getLogger('').addHandler(console) - - return logger - - -class AverageMeter(object): - def __init__(self): - self.val = 0 - self.avg = 0 - self.sum = 0 - self.count = 0 - - def update(self, val, n=1): - self.val = val - self.sum += val * n - self.count += n - self.avg = self.sum / self.count - - -def prepare_output_dir(cfg, cfg_file): - - # ==== create logdir - logdir = osp.join(cfg.OUTPUT_DIR, cfg.EXP_NAME) - os.makedirs(logdir, exist_ok=True) - shutil.copy(src=cfg_file, dst=osp.join(cfg.OUTPUT_DIR, 'config.yaml')) - - cfg.LOGDIR = logdir - - # save config - save_dict_to_yaml(cfg, osp.join(cfg.LOGDIR, 'config.yaml')) - - return cfg - - -def prepare_groundtruth(batch, device): - groundtruths = dict() - gt_keys = ['pose', 'cam', 'betas', 'kp3d', 'mask', 'bbox', 'res', 'cam_intrinsics', 'init_root', 'cam_angvel'] - for gt_key in gt_keys: - if gt_key in batch.keys(): - dtype = torch.float32 if batch[gt_key].dtype == torch.float64 else 
batch[gt_key].dtype - groundtruths[gt_key] = batch[gt_key].to(dtype=dtype, device=device) - - return groundtruths - - -def prepare_input(batch, device, use_features): - # Input keypoints data - kp2d = batch['kp2d'].to(device).float() - - # Input features - if use_features and 'features' in batch.keys(): - features = batch['features'].to(device).float() - else: - features = None - - # Initial SMPL parameters - init_smpl = batch['init_pose'].to(device).float() - - # Initial keypoints - init_kp = torch.cat(( - batch['init_kp3d'], batch['init_kp2d'] - ), dim=-1).to(device).float() - - return kp2d, (init_kp, init_smpl), features - - -def prepare_batch(batch, device, use_features=True): - x, inits, features = prepare_input(batch, device, use_features) - groundtruths = prepare_groundtruth(batch, device) - - return x, inits, features, groundtruths \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index ca3855f..75696bf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ chumpy @ git+https://github.com/mattloper/chumpy -numpy==1.22.3 yacs joblib scikit-image @@ -10,12 +9,14 @@ tensorboard smplx progress einops -mmcv==1.3.9 timm==0.4.9 munkres xtcocotools>=1.8 loguru -setuptools==59.5.0 tqdm ultralytics -gdown==4.6.0 \ No newline at end of file +gdown==4.6.0 +mmcv +mmpose +mmpretrain +mmengine \ No newline at end of file diff --git a/run_demo.sh b/run_demo.sh new file mode 100644 index 0000000..c1c638b --- /dev/null +++ b/run_demo.sh @@ -0,0 +1,3 @@ +#! 
/bin/bash + +python ./demo.py --video "$1" --output_pth /input_data/ --visualize \ No newline at end of file diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..8af3ee5 --- /dev/null +++ b/setup.py @@ -0,0 +1,26 @@ +from setuptools import find_packages, setup + + +if __name__ == '__main__': + setup( + name='wham', + version='0.1.0', + description='', + author='', + author_email='', + keywords='computer vision', + packages=find_packages(exclude=('configs', 'tools')), + classifiers=[ + 'Development Status :: 4 - Beta', + 'Operating System :: OS Independent', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.8', + 'Programming Language :: Python :: 3.9', + 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', + ], + url='https://github.com/yohanshin/WHAM', + license='Apache License 2.0', + python_requires='>=3.8', + zip_safe=False) \ No newline at end of file diff --git a/third-party/DPVO b/third-party/DPVO deleted file mode 160000 index 5833835..0000000 --- a/third-party/DPVO +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 58338357611e7cf3563ed90f82168da7f8741d24 diff --git a/third-party/ViTPose b/third-party/ViTPose deleted file mode 160000 index d521645..0000000 --- a/third-party/ViTPose +++ /dev/null @@ -1 +0,0 @@ -Subproject commit d5216452796c90c6bc29f5c5ec0bdba94366768a diff --git a/lib/eval/evaluate_3dpw.py b/tools/evaluate_3dpw.py similarity index 100% rename from lib/eval/evaluate_3dpw.py rename to tools/evaluate_3dpw.py diff --git a/lib/eval/evaluate_emdb.py b/tools/evaluate_emdb.py similarity index 100% rename from lib/eval/evaluate_emdb.py rename to tools/evaluate_emdb.py diff --git a/lib/eval/evaluate_rich.py b/tools/evaluate_rich.py similarity index 100% rename from lib/eval/evaluate_rich.py rename to tools/evaluate_rich.py diff --git a/wham/config/default.py b/wham/config/default.py new file mode 100644 index 0000000..47e6891 --- 
/dev/null +++ b/wham/config/default.py @@ -0,0 +1,121 @@ +import argparse +from yacs.config import CfgNode as CN + +# Configuration variable +cfg = CN() + +cfg.TITLE = 'default' +cfg.OUTPUT_DIR = 'results' +cfg.EXP_NAME = 'default' +cfg.DEVICE = 'cuda' +cfg.DEBUG = False +cfg.EVAL = False +cfg.RESUME = False +cfg.LOGDIR = '' +cfg.NUM_WORKERS = 5 +cfg.SEED_VALUE = -1 +cfg.SUMMARY_ITER = 1 +cfg.MODEL_CONFIG = '' + +cfg.TRAIN = CN() +cfg.TRAIN.STAGE = 'stage1' +cfg.TRAIN.DATASET_EVAL = '3dpw' +cfg.TRAIN.CHECKPOINT = '' +cfg.TRAIN.BATCH_SIZE = 64 +cfg.TRAIN.START_EPOCH = 0 +cfg.TRAIN.END_EPOCH = 999 +cfg.TRAIN.PRETRAINED = '' +cfg.TRAIN.OPTIM = 'Adam' +cfg.TRAIN.LR = 3e-4 +cfg.TRAIN.LR_FINETUNE = 5e-5 +cfg.TRAIN.LR_PATIENCE = 5 +cfg.TRAIN.LR_DECAY_RATIO = 0.1 +cfg.TRAIN.WD = 0.0 +cfg.TRAIN.MOMENTUM = 0.9 + +cfg.DATASET = CN() +cfg.DATASET.SEQLEN = 81 +cfg.DATASET.RATIO = [1.0, 0, 0, 0, 0] + +cfg.MODEL = CN() +cfg.MODEL.BACKBONE = 'vit' + +cfg.LOSS = CN() +cfg.LOSS.SHAPE_LOSS_WEIGHT = 0.001 +cfg.LOSS.JOINT2D_LOSS_WEIGHT = 5. +cfg.LOSS.JOINT3D_LOSS_WEIGHT = 5. +cfg.LOSS.POSE_LOSS_WEIGHT = 1. +cfg.LOSS.CASCADED_LOSS_WEIGHT = 0.0 +cfg.LOSS.STATIONARY_LOSS_WEIGHT = 0.04 +cfg.LOSS.ROOT_LOSS_WEIGHT = 0.004 +cfg.LOSS.CAM_LOSS_WEIGHT = 0.04 +cfg.LOSS.LOSS_WEIGHT = 60. +cfg.LOSS.CAM_LOSS_EPOCH = 5 + +cfg.MMPOSE_CFG = CN() +cfg.MMPOSE_CFG.POSE_CONFIG='' +cfg.MMPOSE_CFG.POSE_CHECKPOINT='' +cfg.MMPOSE_CFG.DET_CONFIG='' +cfg.MMPOSE_CFG.DET_CHECKPOINT='' +cfg.MMPOSE_CFG.BBOX_CONF = 0.5 +cfg.MMPOSE_CFG.TRACKING_THR = 0.1 +cfg.MMPOSE_CFG.MINIMUM_FRMAES = 30 + +cfg.DPVO = CN() +cfg.DPVO.CFG = '' +cfg.DPVO.CKPT = '' + +cfg.FEATURES_EXTR_CKPT = '' + +def get_cfg_defaults(): + """Get a yacs CfgNode object with default values for my_project.""" + # Return a clone so that the defaults will not be altered + # This is for the "local variable" use pattern + return cfg.clone() + + +def get_cfg(args, test): + """ + Define configuration. 
+ """ + import os + + cfg = get_cfg_defaults() + if os.path.exists(args.cfg): + cfg.merge_from_file(args.cfg) + + cfg.merge_from_list(args.opts) + if test: + cfg.merge_from_list(['EVAL', True]) + + return cfg.clone() + + +def bool_arg(value): + is_true = value.lower() in ('yes', 'true', 't', 'y', '1') + if not is_true and value.lower() not in ('no', 'false', 'f', 'n', '0'): + raise argparse.ArgumentTypeError(f'Boolean value expected, got {value!r}') + return is_true + + +def parse_args(test=False): + parser = argparse.ArgumentParser() + parser.add_argument('-c', '--cfg', type=str, default='./configs/debug.yaml', help='cfg file path') + parser.add_argument( + "--eval-set", type=str, default='3dpw', help="Evaluation dataset") + parser.add_argument( + "--eval-split", type=str, default='test', help="Evaluation data split") + parser.add_argument('--render', default=False, type=bool_arg, + help='Render SMPL meshes after the evaluation') + parser.add_argument('--save-results', default=False, type=bool_arg, + help='Save SMPL parameters after the evaluation') + parser.add_argument( + "opts", default=None, nargs=argparse.REMAINDER, + help="Modify config options using the command-line") + + args = parser.parse_args() + print(args, end='\n\n') + cfg_file = args.cfg + cfg = get_cfg(args, test) + + return cfg, cfg_file, args \ No newline at end of file diff --git a/wham/constants.py b/wham/constants.py new file mode 100644 index 0000000..0fef269 --- /dev/null +++ b/wham/constants.py @@ -0,0 +1,49 @@ +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import os +import torch + +IMG_FEAT_DIM = { + 'resnet': 2048, + 'vit': 1024 +} + +N_JOINTS = 17 +root = os.path.join(os.getenv('WHAM_ROOT', '.'), 'dataset') +class PATHS: + PARSED_DATA = f'{root}/parsed_data' + THREEDPW_PTH = f'{root}/3DPW' + RICH_PTH = f'{root}/RICH' + EMDB_PTH = f'{root}/EMDB' + +class KEYPOINTS: + NUM_JOINTS = N_JOINTS + H36M_TO_J17 = [6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9] + H36M_TO_J14 = H36M_TO_J17[:14] + J17_TO_H36M 
= [14, 3, 4, 5, 2, 1, 0, 15, 12, 16, 13, 9, 10, 11, 8, 7, 6] + COCO_AUG_DICT = f'{root}/body_models/coco_aug_dict.pth' + TREE = [[5, 6], 0, 0, 1, 2, -1, -1, 5, 6, 7, 8, -1, -1, 11, 12, 13, 14, 15, 15, 15, 16, 16, 16] + + # STD scale for video noise + S_BIAS = 1e-1 + S_JITTERING = 5e-2 + S_PEAK = 3e-1 + S_PEAK_MASK = 5e-3 + S_MASK = 0.03 + + +class BMODEL: + MAIN_JOINTS = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] # reduced_joints + + FLDR = f'{root}/body_models/smpl/' + SMPLX2SMPL = f'{root}/body_models/smplx2smpl.pkl' + FACES = f'{root}/body_models/smpl_faces.npy' + MEAN_PARAMS = f'{root}/body_models/smpl_mean_params.npz' + JOINTS_REGRESSOR_WHAM = f'{root}/body_models/J_regressor_wham.npy' + JOINTS_REGRESSOR_H36M = f'{root}/body_models/J_regressor_h36m.npy' + JOINTS_REGRESSOR_EXTRA = f'{root}/body_models/J_regressor_extra.npy' + JOINTS_REGRESSOR_FEET = f'{root}/body_models/J_regressor_feet.npy' + PARENTS = torch.tensor([ + -1, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 9, 12, 13, 14, 16, 17, 18, 19, 20, 21]) \ No newline at end of file diff --git a/lib/data/__init__.py b/wham/data/__init__.py similarity index 100% rename from lib/data/__init__.py rename to wham/data/__init__.py diff --git a/lib/data/_custom.py b/wham/data/_custom.py similarity index 95% rename from lib/data/_custom.py rename to wham/data/_custom.py index b1b92a5..2692417 100644 --- a/lib/data/_custom.py +++ b/wham/data/_custom.py @@ -5,12 +5,11 @@ import torch import numpy as np -from configs import constants as _C from .normalizer import Normalizer -from lib.utils import transforms -from lib.models import build_body_model -from lib.utils.kp_utils import root_centering -from lib.utils.imutils import compute_cam_intrinsics +from ..utils import transforms +from ..models import build_body_model +from ..utils.kp_utils import root_centering +from ..utils.imutils import compute_cam_intrinsics KEYPOINTS_THR = 0.3 diff --git a/lib/data/_dataset.py b/wham/data/_dataset.py similarity 
index 95% rename from lib/data/_dataset.py rename to wham/data/_dataset.py index 301a4e4..fe7a75c 100644 --- a/lib/data/_dataset.py +++ b/wham/data/_dataset.py @@ -6,14 +6,13 @@ import numpy as np from skimage.util.shape import view_as_windows -from configs import constants as _C from .normalizer import Normalizer -from lib.utils.imutils import transform +from ..utils.imutils import transform class BaseDataset(torch.utils.data.Dataset): def __init__(self, cfg, training=True): super(BaseDataset, self).__init__() - self.n_joints = _C.KEYPOINTS.NUM_JOINTS + self.n_joints = cfg.KEYPOINTS.NUM_JOINTS self.epoch = 0 self.n_frames = cfg.DATASET.SEQLEN + 1 self.training = training diff --git a/lib/data/_dataset_eval.py b/wham/data/_dataset_eval.py similarity index 91% rename from lib/data/_dataset_eval.py rename to wham/data/_dataset_eval.py index 289e5cc..7f3df2c 100644 --- a/lib/data/_dataset_eval.py +++ b/wham/data/_dataset_eval.py @@ -7,11 +7,10 @@ import joblib import numpy as np -from configs import constants as _C -import lib.utils.data_utils as d_utils -from lib.utils import transforms -from lib.utils.kp_utils import root_centering -from lib.data._dataset import BaseDataset +from .. 
import utils as d_utils +from ..utils import transforms +from ..utils.kp_utils import root_centering +from ._dataset import BaseDataset FPS = 30 class EvalDataset(BaseDataset): @@ -20,7 +19,7 @@ def __init__(self, cfg, data, split, backbone): self.data = data - parsed_data_path = os.path.join(_C.PATHS.PARSED_DATA, f'{data}_{split}_{backbone}.pth') + parsed_data_path = os.path.join(cfg.PATHS.PARSED_DATA, f'{data}_{split}_{backbone}.pth') self.labels = joblib.load(parsed_data_path) def __getitem__(self, index): diff --git a/lib/data/dataloader.py b/wham/data/dataloader.py similarity index 91% rename from lib/data/dataloader.py rename to wham/data/dataloader.py index 389833b..4a0ab88 100644 --- a/lib/data/dataloader.py +++ b/wham/data/dataloader.py @@ -2,7 +2,7 @@ from __future__ import print_function from __future__ import division -from lib.utils.data_utils import make_collate_fn +from ..utils.data_utils import make_collate_fn from ._dataset_eval import EvalDataset import torch diff --git a/lib/data/normalizer.py b/wham/data/normalizer.py similarity index 97% rename from lib/data/normalizer.py rename to wham/data/normalizer.py index 825e6df..ea55455 100644 --- a/lib/data/normalizer.py +++ b/wham/data/normalizer.py @@ -1,8 +1,9 @@ import torch import random import numpy as np +import scipy.signal as signal -from lib.utils.imutils import transform_keypoints +from ..utils.imutils import transform_keypoints class Normalizer: def __init__(self, cfg): @@ -64,7 +65,6 @@ def normalize_keypoints_to_image(x, res): def compute_bbox_from_keypoints(X, do_augment=False, mask=None): def smooth_bbox(bb): # Smooth bounding box detection - import scipy.signal as signal smoothed = np.array([signal.medfilt(param, int(30 / 2)) for param in bb]) return smoothed diff --git a/lib/eval/eval_utils.py b/wham/eval/eval_utils.py similarity index 99% rename from lib/eval/eval_utils.py rename to wham/eval/eval_utils.py index f0ee437..9848aa4 100644 --- a/lib/eval/eval_utils.py +++ 
b/wham/eval/eval_utils.py @@ -6,6 +6,8 @@ import numpy as np from matplotlib import pyplot as plt +from ..models.smpl import SMPL_MODEL_DIR +from ..models.smpl import SMPL def compute_accel(joints): """ @@ -63,8 +65,6 @@ def compute_error_verts(pred_verts, target_verts=None, target_theta=None): """ if target_verts is None: - from lib.models.smpl import SMPL_MODEL_DIR - from lib.models.smpl import SMPL device = 'cpu' smpl = SMPL( SMPL_MODEL_DIR, diff --git a/lib/models/__init__.py b/wham/models/__init__.py similarity index 80% rename from lib/models/__init__.py rename to wham/models/__init__.py index ef11899..b9f39a4 100644 --- a/lib/models/__init__.py +++ b/wham/models/__init__.py @@ -1,10 +1,13 @@ -import os, sys -import yaml +import os +import sys + import torch +import yacs.config from loguru import logger -from configs import constants as _C +from .. import constants as _C from .smpl import SMPL +from .wham import Network def build_body_model(device, batch_size=1, gender='neutral', **kwargs): @@ -19,11 +22,13 @@ def build_body_model(device, batch_size=1, gender='neutral', **kwargs): def build_network(cfg, smpl): - from .wham import Network - + s = yacs.config.CfgNode() with open(cfg.MODEL_CONFIG, 'r') as f: - model_config = yaml.safe_load(f) + model_config = dict(s.load_cfg(f)) model_config.update({'d_feat': _C.IMG_FEAT_DIM[cfg.MODEL.BACKBONE]}) + model_config.update({'main_joints': _C.BMODEL.MAIN_JOINTS}) + model_config.update({'num_joints': _C.KEYPOINTS.NUM_JOINTS}) + network = Network(smpl, **model_config).to(cfg.DEVICE) diff --git a/lib/models/layers/__init__.py b/wham/models/layers/__init__.py similarity index 100% rename from lib/models/layers/__init__.py rename to wham/models/layers/__init__.py diff --git a/lib/models/layers/modules.py b/wham/models/layers/modules.py similarity index 96% rename from lib/models/layers/modules.py rename to wham/models/layers/modules.py index 8d147aa..c96e3d0 100644 --- a/lib/models/layers/modules.py +++ 
b/wham/models/layers/modules.py @@ -5,9 +5,8 @@ import torch import numpy as np from torch import nn -from configs import constants as _C from .utils import rollout_global_motion -from lib.utils.transforms import axis_angle_to_matrix +from ...utils.transforms import axis_angle_to_matrix class Regressor(nn.Module): @@ -178,13 +177,15 @@ class MotionDecoder(nn.Module): def __init__(self, d_embed, rnn_type, - n_layers): + n_layers, + main_joints): super().__init__() self.n_pose = 24 # SMPL pose initialization - self.neural_init = NeuralInitialization(len(_C.BMODEL.MAIN_JOINTS) * 6, d_embed, rnn_type, n_layers) + self.main_joints = main_joints + self.neural_init = NeuralInitialization(len(main_joints) * 6, d_embed, rnn_type, n_layers) # 3d keypoints regressor self.regressor = Regressor( @@ -195,7 +196,7 @@ def forward(self, x, init): """ b, f = x.shape[:2] - h0 = self.neural_init(init[:, :, _C.BMODEL.MAIN_JOINTS].reshape(b, 1, -1)) + h0 = self.neural_init(init[:, :, self.main_joints].reshape(b, 1, -1)) # Recursive prediction of SMPL parameters pred_pose_list = [init.reshape(b, 1, -1)] diff --git a/lib/models/layers/utils.py b/wham/models/layers/utils.py similarity index 98% rename from lib/models/layers/utils.py rename to wham/models/layers/utils.py index 8358a4d..d3c8f61 100644 --- a/lib/models/layers/utils.py +++ b/wham/models/layers/utils.py @@ -1,5 +1,5 @@ import torch -from lib.utils import transforms +from ...utils import transforms diff --git a/lib/models/preproc/backbone/hmr2.py b/wham/models/preproc/backbone/hmr2.py similarity index 100% rename from lib/models/preproc/backbone/hmr2.py rename to wham/models/preproc/backbone/hmr2.py diff --git a/lib/models/preproc/backbone/pose_transformer.py b/wham/models/preproc/backbone/pose_transformer.py similarity index 100% rename from lib/models/preproc/backbone/pose_transformer.py rename to wham/models/preproc/backbone/pose_transformer.py diff --git a/lib/models/preproc/backbone/smpl_head.py 
b/wham/models/preproc/backbone/smpl_head.py similarity index 96% rename from lib/models/preproc/backbone/smpl_head.py rename to wham/models/preproc/backbone/smpl_head.py index 9460d35..84af792 100644 --- a/lib/models/preproc/backbone/smpl_head.py +++ b/wham/models/preproc/backbone/smpl_head.py @@ -4,8 +4,8 @@ import numpy as np import einops -from configs import constants as _C -from lib.utils.transforms import axis_angle_to_matrix +from .... import constants as _C +from ....utils.transforms import axis_angle_to_matrix from .pose_transformer import TransformerDecoder def rot6d_to_rotmat(x: torch.Tensor) -> torch.Tensor: @@ -51,9 +51,8 @@ def __init__(self): transformer_args_from_cfg = dict( depth=6, heads=8, mlp_dim=1024, dim_head=64, dropout=0.0, emb_dropout=0.0, norm='layer', context_dim=1280 ) - transformer_args = (transformer_args | transformer_args_from_cfg) self.transformer = TransformerDecoder( - **transformer_args + **transformer_args, **transformer_args_from_cfg ) dim=transformer_args['dim'] self.decpose = nn.Linear(dim, npose) diff --git a/lib/models/preproc/backbone/t_cond_mlp.py b/wham/models/preproc/backbone/t_cond_mlp.py similarity index 100% rename from lib/models/preproc/backbone/t_cond_mlp.py rename to wham/models/preproc/backbone/t_cond_mlp.py diff --git a/lib/models/preproc/backbone/utils.py b/wham/models/preproc/backbone/utils.py similarity index 98% rename from lib/models/preproc/backbone/utils.py rename to wham/models/preproc/backbone/utils.py index 5276cff..94daa51 100644 --- a/lib/models/preproc/backbone/utils.py +++ b/wham/models/preproc/backbone/utils.py @@ -2,10 +2,6 @@ from __future__ import print_function from __future__ import division -import os -import os.path as osp -from collections import OrderedDict - import cv2 import numpy as np from skimage.filters import gaussian diff --git a/lib/models/preproc/backbone/vit.py b/wham/models/preproc/backbone/vit.py similarity index 100% rename from lib/models/preproc/backbone/vit.py rename to 
wham/models/preproc/backbone/vit.py diff --git a/lib/models/preproc/detector.py b/wham/models/preproc/detector.py similarity index 51% rename from lib/models/preproc/detector.py rename to wham/models/preproc/detector.py index 1bbbad9..943dca2 100644 --- a/lib/models/preproc/detector.py +++ b/wham/models/preproc/detector.py @@ -11,33 +11,46 @@ from progress.bar import Bar from ultralytics import YOLO -from mmpose.apis import ( - inference_top_down_pose_model, - init_pose_model, - get_track_id, - vis_pose_result, -) +from mmpose.apis import init_model, inference_topdown, _track_by_iou -ROOT_DIR = osp.abspath(f"{__file__}/../../../../") -VIT_DIR = osp.join(ROOT_DIR, "third-party/ViTPose") -BBOX_CONF = 0.5 -TRACKING_THR = 0.1 -MINIMUM_FRMAES = 30 +def get_track_id(results, results_last, next_id, min_keypoints=3, tracking_thr=0.3): + for result in results: + track_id, results_last, match_result = _track_by_iou(result, results_last, + tracking_thr) + if track_id == -1: + if np.count_nonzero(result.pred_instances.keypoints[0, :, 1]) > min_keypoints: + result.track_id = next_id + next_id += 1 + else: + # If the number of keypoints detected is small, + # delete that person instance. 
+ result.pred_instances.keypoints[0, :, :] = -10 + result.pred_instances.bboxes *= 0 + result.track_id = -1 + else: + result.track_id = track_id + del match_result + + return results, next_id + class DetectionModel(object): - def __init__(self, device): + def __init__(self, device, mmpose_cfg): # ViTPose - pose_model_cfg = osp.join(VIT_DIR, 'configs/body/2d_kpt_sview_rgb_img/topdown_heatmap/coco/ViTPose_huge_coco_256x192.py') - pose_model_ckpt = osp.join(ROOT_DIR, 'checkpoints', 'vitpose-h-multi-coco.pth') - self.pose_model = init_pose_model(pose_model_cfg, pose_model_ckpt, device=device.lower()) + self.mmpose_cfg = mmpose_cfg + pose_model_cfg = mmpose_cfg.POSE_CONFIG + pose_model_ckpt = mmpose_cfg.POSE_CHECKPOINT + self.pose_model = init_model(pose_model_cfg, pose_model_ckpt, device=device.lower()) # YOLO - bbox_model_ckpt = osp.join(ROOT_DIR, 'checkpoints', 'yolov8x.pt') + bbox_model_ckpt = mmpose_cfg.DET_CHECKPOINT self.bbox_model = YOLO(bbox_model_ckpt) self.device = device + self.track_thr = self.mmpose_cfg.TRACKING_THR + self.min_frames = self.mmpose_cfg.MINIMUM_FRMAES self.initialize_tracking() def initialize_tracking(self, ): @@ -56,44 +69,45 @@ def xyxy_to_cxcys(self, bbox): scale = max(bbox[2] - bbox[0], bbox[3] - bbox[1]) / 200 return np.array([[cx, cy, scale]]) - def track(self, img, fps, length): + def detect(self, img): - # bbox detection + # bbox detection, output list of np.array(float32) bboxes = self.bbox_model.predict( - img, device=self.device, classes=0, conf=BBOX_CONF, save=False, verbose=False)[0].boxes.xyxy.detach().cpu().numpy() - bboxes = [{'bbox': bbox} for bbox in bboxes] + img, device=self.device, classes=0, conf=self.mmpose_cfg.BBOX_CONF, save=False, verbose=False)[0].boxes.xyxy.detach().cpu().numpy() - # keypoints detection - pose_results, returned_outputs = inference_top_down_pose_model( + + pose_results = inference_topdown( self.pose_model, img, - person_results=bboxes, - format='xyxy', - return_heatmap=False, - outputs=None) - - # 
person identification + bboxes=bboxes, + bbox_format='xyxy') + return bboxes, pose_results + + def track_detections(self, pose_results): pose_results, self.next_id = get_track_id( pose_results, self.pose_results_last, self.next_id, - use_oks=False, - tracking_thr=TRACKING_THR, - use_one_euro=True, - fps=fps) - + tracking_thr=self.track_thr) for pose_result in pose_results: - _id = pose_result['track_id'] - xyxy = pose_result['bbox'] + _id = pose_result.track_id + xyxy = pose_result.pred_instances.bboxes[0] bbox = self.xyxy_to_cxcys(xyxy) self.tracking_results['id'].append(_id) self.tracking_results['frame_id'].append(self.frame_id) self.tracking_results['bbox'].append(bbox) - self.tracking_results['keypoints'].append(pose_result['keypoints'][None]) + kpts = np.zeros((1,17,3), dtype=float) + kpts[:,:,:2] = pose_result.pred_instances.keypoints + kpts[:,:,2] = pose_result.pred_instances.keypoint_scores + self.tracking_results['keypoints'].append(kpts) self.frame_id += 1 self.pose_results_last = pose_results + + def track(self, img, fps, length): + bboxes, pose_results = self.detect(img) + self.track_detections(pose_results) def process(self, fps): for key in ['id', 'frame_id']: @@ -113,7 +127,7 @@ def process(self, fps): # Smooth bounding box detection ids = list(output.keys()) for _id in ids: - if len(output[_id]['bbox']) < MINIMUM_FRMAES: + if len(output[_id]['bbox']) < self.min_frames: del output[_id] continue @@ -121,4 +135,11 @@ def process(self, fps): smoothed_bbox = np.array([signal.medfilt(param, kernel) for param in output[_id]['bbox'].T]).T output[_id]['bbox'] = smoothed_bbox - return output \ No newline at end of file + return output + + +class TrackingModel(DetectionModel): + def __init__(self, track_thr=.3, min_frames=10): + self.track_thr = track_thr + self.min_frames = min_frames + self.initialize_tracking() diff --git a/lib/models/preproc/extractor.py b/wham/models/preproc/extractor.py similarity index 91% rename from lib/models/preproc/extractor.py 
rename to wham/models/preproc/extractor.py index b28e473..65636ba 100644 --- a/lib/models/preproc/extractor.py +++ b/wham/models/preproc/extractor.py @@ -11,19 +11,15 @@ from progress.bar import Bar from scipy.ndimage.filters import gaussian_filter1d -from configs import constants as _C from .backbone.hmr2 import hmr2 from .backbone.utils import process_image -ROOT_DIR = osp.abspath(f"{__file__}/../../../../") - class FeatureExtractor(object): - def __init__(self, device, max_batch_size=64): + def __init__(self, ckpt, device, max_batch_size=64): self.device = device self.max_batch_size = max_batch_size - ckpt = osp.join(ROOT_DIR, 'checkpoints', 'hmr2a.ckpt') self.model = hmr2(ckpt).to(device).eval() def run(self, video, tracking_results, patch_h=256, patch_w=256): diff --git a/lib/models/preproc/slam.py b/wham/models/preproc/slam.py similarity index 80% rename from lib/models/preproc/slam.py rename to wham/models/preproc/slam.py index a58ecad..d38738c 100644 --- a/lib/models/preproc/slam.py +++ b/wham/models/preproc/slam.py @@ -11,22 +11,18 @@ from dpvo.utils import Timer from dpvo.dpvo import DPVO from dpvo.config import cfg -from dpvo.stream import image_stream, video_stream - -ROOT_DIR = osp.abspath(f"{__file__}/../../../../") -DPVO_DIR = osp.join(ROOT_DIR, "third-party/DPVO") +from dpvo.stream import video_stream class SLAMModel(object): - def __init__(self, video, output_pth, width, height, calib=None, stride=1, skip=0, buffer=2048): - - if calib == None or not osp.exists(calib): + def __init__(self, cfg, video, output_pth, width, height, calib=None, stride=1, skip=0, buffer=2048): + if (calib is None) or not osp.exists(calib): calib = osp.join(output_pth, 'calib.txt') if not osp.exists(calib): self.estimate_intrinsics(width, height, calib) - self.dpvo_cfg = osp.join(DPVO_DIR, 'config/default.yaml') - self.dpvo_ckpt = osp.join(ROOT_DIR, 'checkpoints', 'dpvo.pth') + self.dpvo_cfg = cfg.CFG + self.dpvo_ckpt = cfg.CKPT self.buffer = buffer self.times = [] diff 
--git a/lib/models/smpl.py b/wham/models/smpl.py similarity index 99% rename from lib/models/smpl.py rename to wham/models/smpl.py index ac8d14b..300970d 100644 --- a/lib/models/smpl.py +++ b/wham/models/smpl.py @@ -6,13 +6,13 @@ import torch import numpy as np -from lib.utils import transforms +from ..utils import transforms from smplx import SMPL as _SMPL from smplx.utils import SMPLOutput as ModelOutput from smplx.lbs import vertices2joints -from configs import constants as _C +from .. import constants as _C class SMPL(_SMPL): """ Extension of the official SMPL implementation to support more joints """ diff --git a/lib/models/wham.py b/wham/models/wham.py similarity index 95% rename from lib/models/wham.py rename to wham/models/wham.py index c18f88d..6eeca9f 100644 --- a/lib/models/wham.py +++ b/wham/models/wham.py @@ -5,25 +5,26 @@ import torch from torch import nn -from configs import constants as _C -from lib.utils import transforms -from lib.models.layers import (MotionEncoder, MotionDecoder, TrajectoryDecoder, TrajectoryRefiner, Integrator, +from ..utils import transforms +from .layers import (MotionEncoder, MotionDecoder, TrajectoryDecoder, TrajectoryRefiner, Integrator, rollout_global_motion, compute_camera_pose, reset_root_velocity, compute_camera_motion) class Network(nn.Module): def __init__(self, smpl, + main_joints, pose_dr=0.1, d_embed=512, n_layers=3, d_feat=2048, rnn_type='LSTM', + num_joints=17, **kwargs ): super().__init__() - n_joints = _C.KEYPOINTS.NUM_JOINTS + n_joints = num_joints self.smpl = smpl in_dim = n_joints * 2 + 3 d_context = d_embed + n_joints * 3 @@ -49,7 +50,8 @@ def __init__(self, # Module 4. Motion Decoder self.motion_decoder = MotionDecoder(d_embed=d_context, rnn_type=rnn_type, - n_layers=n_layers) + n_layers=n_layers, + main_joints=main_joints) # Module 5. 
Trajectory Refiner self.trajectory_refiner = TrajectoryRefiner(d_embed=d_context, diff --git a/lib/utils/data_utils.py b/wham/utils/data_utils.py similarity index 98% rename from lib/utils/data_utils.py rename to wham/utils/data_utils.py index bf2c6cd..e476663 100644 --- a/lib/utils/data_utils.py +++ b/wham/utils/data_utils.py @@ -5,7 +5,7 @@ import torch import numpy as np -from lib.utils import transforms +from . import transforms def make_collate_fn(): diff --git a/lib/utils/imutils.py b/wham/utils/imutils.py similarity index 100% rename from lib/utils/imutils.py rename to wham/utils/imutils.py diff --git a/lib/utils/kp_utils.py b/wham/utils/kp_utils.py similarity index 95% rename from lib/utils/kp_utils.py rename to wham/utils/kp_utils.py index 479cf0c..6759c4e 100644 --- a/lib/utils/kp_utils.py +++ b/wham/utils/kp_utils.py @@ -2,8 +2,6 @@ from __future__ import print_function from __future__ import division -from configs import constants as _C - import torch diff --git a/lib/utils/transforms.py b/wham/utils/transforms.py similarity index 100% rename from lib/utils/transforms.py rename to wham/utils/transforms.py diff --git a/wham/utils/utils.py b/wham/utils/utils.py new file mode 100644 index 0000000..4583e51 --- /dev/null +++ b/wham/utils/utils.py @@ -0,0 +1,80 @@ +# -*- coding: utf-8 -*- + +# Max-Planck-Gesellschaft zur Förderung der Wissenschaften e.V. (MPG) is +# holder of all proprietary rights on this computer program. +# You can only use this computer program if you have closed +# a license agreement with MPG or you get the right to use the computer +# program from someone who is authorized to grant you that right. +# Any use of the computer program without a valid license is prohibited and +# liable to prosecution. +# +# Copyright©2019 Max-Planck-Gesellschaft zur Förderung +# der Wissenschaften e.V. (MPG). acting on behalf of its Max Planck Institute +# for Intelligent Systems. All rights reserved. 
+# +# Contact: ps-license@tuebingen.mpg.de + +import os +import yaml +import torch +import shutil +from os import path as osp + + +def save_dict_to_yaml(obj, filename, mode='w'): + with open(filename, mode) as f: + yaml.dump(obj, f, default_flow_style=False) + + +def prepare_output_dir(cfg, cfg_file): + + # ==== create logdir + logdir = osp.join(cfg.OUTPUT_DIR, cfg.EXP_NAME) + os.makedirs(logdir, exist_ok=True) + shutil.copy(src=cfg_file, dst=osp.join(cfg.OUTPUT_DIR, 'config.yaml')) + + cfg.LOGDIR = logdir + + # save config + save_dict_to_yaml(cfg, osp.join(cfg.LOGDIR, 'config.yaml')) + + return cfg + + +def prepare_groundtruth(batch, device): + groundtruths = dict() + gt_keys = ['pose', 'cam', 'betas', 'kp3d', 'mask', 'bbox', 'res', 'cam_intrinsics', 'init_root', 'cam_angvel'] + for gt_key in gt_keys: + if gt_key in batch.keys(): + dtype = torch.float32 if batch[gt_key].dtype == torch.float64 else batch[gt_key].dtype + groundtruths[gt_key] = batch[gt_key].to(dtype=dtype, device=device) + + return groundtruths + + +def prepare_input(batch, device, use_features): + # Input keypoints data + kp2d = batch['kp2d'].to(device).float() + + # Input features + if use_features and 'features' in batch.keys(): + features = batch['features'].to(device).float() + else: + features = None + + # Initial SMPL parameters + init_smpl = batch['init_pose'].to(device).float() + + # Initial keypoints + init_kp = torch.cat(( + batch['init_kp3d'], batch['init_kp2d'] + ), dim=-1).to(device).float() + + return kp2d, (init_kp, init_smpl), features + + +def prepare_batch(batch, device, use_features=True): + x, inits, features = prepare_input(batch, device, use_features) + groundtruths = prepare_groundtruth(batch, device) + + return x, inits, features, groundtruths \ No newline at end of file diff --git a/lib/vis/renderer.py b/wham/vis/renderer.py similarity index 100% rename from lib/vis/renderer.py rename to wham/vis/renderer.py diff --git a/lib/vis/run_vis.py b/wham/vis/run_vis.py similarity 
index 98% rename from lib/vis/run_vis.py rename to wham/vis/run_vis.py index e6c2ec5..92bc216 100644 --- a/lib/vis/run_vis.py +++ b/wham/vis/run_vis.py @@ -7,7 +7,7 @@ import numpy as np from progress.bar import Bar -from lib.vis.renderer import Renderer, get_global_cameras +from .renderer import Renderer, get_global_cameras def run_vis_on_demo(cfg, video, results, output_pth, smpl, vis_global=True): # to torch tensor diff --git a/lib/vis/tools.py b/wham/vis/tools.py similarity index 100% rename from lib/vis/tools.py rename to wham/vis/tools.py