diff --git a/README.md b/README.md
index 8c39c073b..71b5ae5b8 100644
--- a/README.md
+++ b/README.md
@@ -235,6 +235,7 @@ The collection of pre-trained, state-of-the-art AI models.
 | [](hand_recognition/minimal-hand/) |[minimal-hand](/hand_recognition/minimal-hand/) | [Minimal Hand](https://github.com/CalciferZh/minimal-hand) | TensorFlow | 1.2.8 and later |
 | [](hand_recognition/v2v-posenet/) |[v2v-posenet](/hand_recognition/v2v-posenet/) | [V2V-PoseNet](https://github.com/mks0601/V2V-PoseNet_RELEASE) | Pytorch | 1.2.6 and later |
 | [](hands_recognition/hands_segmentation_pytorch/) |[hands_segmentation_pytorch](/hand_recognition/hands_segmentation_pytorch/) | [hands-segmentation-pytorch](https://github.com/guglielmocamporese/hands-segmentation-pytorch) | Pytorch | 1.2.10 and later |
+| [](hand_recognition/ego2hands/) |[ego2hands](/hand_recognition/ego2hands/) | [Ego2Hands](https://github.com/AlextheEngineer/Ego2Hands) | Pytorch | 1.2.16 and later |
 
 ## Image captioning
 
diff --git a/hand_recognition/ego2hands/README.md b/hand_recognition/ego2hands/README.md
new file mode 100644
index 000000000..877dc4e95
--- /dev/null
+++ b/hand_recognition/ego2hands/README.md
@@ -0,0 +1,75 @@
+# Ego2Hands - Egocentric Two-hand Segmentation and Detection
+## Input
+
+* **Image or Video**
+
+![sample image](sample_image.png)
+
+
+## Output
+
+* **Predicted mask**
+
+![result](output.png)
+
+Estimated mask of the hands (without the ```--overlay``` option),
+or a mask overlaid on the original image (with the ```--overlay``` option).
+
+The result is saved to ```./output.png``` by default; a different path can be specified with the ```-s``` option.
+
+* **Energy prediction**
+![result_energy](output_energy.png)
+
+The segmentation mask covers the whole region of both arms, whereas the "energy" prediction focuses on the position of the hands. It can also be used to derive a bounding box for each hand (see the sketch at the end of the Usage section).
+
+The energy result is saved next to the segmentation result, with "_energy" inserted before the file extension (e.g. ```output_energy.png``` for the default save path).
+
+## Usage
+An Internet connection is required when running the script for the first time, as the model files are downloaded automatically.
+
+Running the script generates the predicted mask of the hands in the input media.
+
+#### Example 1: Inference on the prepared demo image.
+```bash
+$ python3 ego2hands.py
+```
+
+#### Example 2: Specify the input path and the save path.
+```bash
+$ python3 ego2hands.py -i input.png -s output.png
+```
+The ```-i``` and ```-s``` options specify the input path and the save path, respectively.
+In this example, the segmentation result is saved to output.png and the energy result to output_energy.png.
+
+#### Example 3: Specify the image size and visualize the overlaid mask.
+```bash
+$ python3 ego2hands.py --width 256 --height 512 --overlay
+```
+Use the ```--width``` and ```--height``` options to specify the image size at which the model runs inference.
+The result is always resized back to the original size of the image.
+Use the ```--overlay``` option to visualize the result as a mask overlaid on the original image.
+
+#### Example 4: Inference on video.
+```bash
+$ python3 ego2hands.py -v 0
+```
+The argument after the ```-v``` option can be the device ID of a webcam,
+or the path to an input video file.
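+
+#### From energy map to bounding box
+As a minimal sketch (not part of the command-line interface), a per-hand bounding box can be derived from the energy output by thresholding followed by a simple morphological clean-up, mirroring the ```get_bounding_box_from_energy``` helper in ```ego2hands.py```. The function name below is illustrative; the 0.5 threshold and 15-pixel kernel restate that helper's defaults.
+
+```python
+import cv2
+import numpy as np
+
+def energy_to_bbox(energy, thresh=0.5, kernel_size=15):
+    """Return (row_min, row_max, col_min, col_max) of the thresholded energy map."""
+    positives = (energy > thresh).astype(np.uint8)
+    # Erode then dilate to suppress small spurious activations.
+    kernel = np.ones((kernel_size, kernel_size), np.uint8)
+    positives = cv2.dilate(cv2.erode(positives, kernel), kernel)
+    rows, cols = np.where(positives.astype(bool))
+    if rows.size == 0:
+        return np.array([0, 0, 0, 0])
+    return np.array([rows.min(), rows.max(), cols.min(), cols.max()])
+```
+
+A result of ```[0, 0, 0, 0]``` simply means no energy exceeded the threshold for that hand.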
+
+## Reference
+
+* [Ego2Hands](https://github.com/AlextheEngineer/Ego2Hands)
+
+## Framework
+
+Pytorch
+
+
+## Model Format
+
+ONNX opset=11
+
+## Netron
+
+- [ego2hands.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/ego2hands/ego2hands.onnx.prototxt)
\ No newline at end of file
diff --git a/hand_recognition/ego2hands/ego2hands.py b/hand_recognition/ego2hands/ego2hands.py
new file mode 100644
index 000000000..296f4b478
--- /dev/null
+++ b/hand_recognition/ego2hands/ego2hands.py
@@ -0,0 +1,244 @@
+import sys
+import time
+
+import ailia
+import cv2
+
+import numpy as np
+
+# import original modules
+sys.path.append('../../util')
+
+# logger
+from logging import getLogger  # noqa: E402
+
+import webcamera_utils  # noqa: E402
+from image_utils import imread  # noqa: E402
+from model_utils import check_and_download_models  # noqa: E402
+from arg_utils import get_base_parser, get_savepath, update_parser  # noqa: E402
+
+import matplotlib.pyplot as plt
+
+logger = getLogger(__name__)
+
+
+# ======================
+# Parameters
+# ======================
+
+MODEL_NAME = "ego2hands"
+WEIGHT_PATH = MODEL_NAME + ".onnx"
+MODEL_PATH = WEIGHT_PATH + '.prototxt'
+
+REMOTE_PATH = "https://storage.googleapis.com/ailia-models/" + MODEL_NAME + "/"
+
+DEFAULT_INPUT_PATH = 'sample_image.png'
+DEFAULT_SAVE_PATH = 'output.png'
+
+# ======================
+# Argument Parser Config
+# ======================
+parser = get_base_parser(
+    'Ego2Hands: Egocentric Two-hand Segmentation and Detection',
+    DEFAULT_INPUT_PATH, DEFAULT_SAVE_PATH
+)
+
+parser.add_argument(
+    '--height', type=int, default=None,
+    help='height of the image to run inference on'
+)
+
+parser.add_argument(
+    '--width', type=int, default=None,
+    help='width of the image to run inference on'
+)
+
+parser.add_argument(
+    '--overlay', action='store_true',
+    help='visualize the mask overlaid on the image'
+)
+
+args = update_parser(parser)
+
+# ======================
+# Helper functions
+# ======================
+
+def preprocess(image, h=None, w=None):
+    # Optionally resize, then stack the grayscale image with its Canny edges
+    # and normalize to a (2, H, W) input tensor.
+    if h is not None and w is not None:
+        image = cv2.resize(image, (w, h))
+
+    image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
+
+    img_edge = cv2.Canny(image, 25, 100).astype(np.float32)
+    img_real_test = np.stack((image, img_edge), -1)
+    img_real_test = (img_real_test - 128.0) / 256.0
+    return img_real_test.transpose(2, 0, 1)
+
+
+def postprocess(org_image, seg_output, energy_output):
+    # Resize the raw network outputs back to the original image size.
+    seg_output_final = cv2.resize(
+        seg_output[0].transpose(1, 2, 0),
+        dsize=(org_image.shape[1], org_image.shape[0]),
+        interpolation=cv2.INTER_LINEAR_EXACT,
+    )
+    seg_output_final = np.argmax(seg_output_final, axis=-1)
+    energy_l_final = cv2.resize(
+        energy_output[0, 1][None].transpose(1, 2, 0),
+        dsize=(org_image.shape[1], org_image.shape[0]),
+        interpolation=cv2.INTER_LINEAR_EXACT,
+    )
+    energy_r_final = cv2.resize(
+        energy_output[0, 2][None].transpose(1, 2, 0),
+        dsize=(org_image.shape[1], org_image.shape[0]),
+        interpolation=cv2.INTER_LINEAR_EXACT,
+    )
+    return seg_output_final, energy_l_final, energy_r_final
+
+
+def get_bounding_box_from_energy(energy, close_kernel_size=15, close_op=True):
+    # Threshold the energy map, clean it up with erosion/dilation,
+    # and return the bounding box of the remaining positive region.
+    energy_positives = (energy > 0.5).astype(np.uint8)
+    if close_op:
+        energy_positives = cv2.erode(energy_positives, np.ones((close_kernel_size, close_kernel_size)))
+        energy_positives = cv2.dilate(energy_positives, np.ones((close_kernel_size, close_kernel_size)))
+    coords = np.where(energy_positives.astype(bool))
+    if coords[0].size != 0:
+        row_min, row_max, col_min, col_max = np.min(coords[0]), np.max(coords[0]), np.min(coords[1]), np.max(coords[1])
+    else:
+        row_min, row_max, col_min, col_max = 0, 0, 0, 0
+    return np.array([row_min, row_max, col_min, col_max])
+
+
+def create_visualization(image, seg, energy_l, energy_r, overlay=False, savepath=None):
+    # visualize segmentation mask
+    if overlay:
+        mask = np.where((seg == 1)[:, :, None], image / 2 + np.array([0, 0, 128])[None, None], image)
+        mask = np.where((seg == 2)[:, :, None], image / 2 + np.array([128, 0, 0])[None, None], mask)
+    else:
+        mask = seg * 100
+        mask = np.tile(mask[:, :, None], (1, 1, 3))
+    mask = mask.astype(np.uint8)
+
+    # visualize energy map and bounding box
+    if overlay:
+        energy_vis = np.where((energy_l > 0.5)[:, :, None], image / 2 + np.array([0, 0, 128])[None, None], image)
+        energy_vis = np.where((energy_r > 0.5)[:, :, None], energy_vis / 2 + np.array([128, 0, 0])[None, None], energy_vis)
+    else:
+        energy_vis = np.tile(((energy_l > 0.5) * 100 + (energy_r > 0.5) * 200)[:, :, None], (1, 1, 3)).astype('uint8')
+
+    bbox_l = get_bounding_box_from_energy(energy_l)
+    bbox_r = get_bounding_box_from_energy(energy_r)
+    energy_vis = cv2.rectangle(energy_vis, (bbox_l[2], bbox_l[0]), (bbox_l[3], bbox_l[1]), (0, 255, 0), 2)
+    energy_vis = cv2.rectangle(energy_vis, (bbox_r[2], bbox_r[0]), (bbox_r[3], bbox_r[1]), (0, 255, 0), 2)
+    energy_vis = energy_vis.astype(np.uint8)
+    return mask, energy_vis
+
+
+def visualize_and_save(image, seg, energy_l, energy_r, overlay=False, savepath=None):
+    mask, energy_vis = create_visualization(image, seg, energy_l, energy_r, overlay=overlay)
+
+    plt.imshow(mask)
+    plt.show()
+
+    plt.imshow(energy_vis)
+    plt.show()
+
+    if savepath is not None:
+        logger.info(f'saving result to {savepath}')
+        mask = cv2.cvtColor(mask, cv2.COLOR_RGB2BGR)
+        cv2.imwrite(savepath, mask)
+
+        # Save the energy visualization next to the mask, with "_energy"
+        # inserted before the file extension.
+        energy_vis = cv2.cvtColor(energy_vis, cv2.COLOR_RGB2BGR)
+        energy_savepath = savepath.split('.')
+        energy_savepath[-2] += '_energy'
+        energy_savepath = '.'.join(energy_savepath)
+        cv2.imwrite(energy_savepath, energy_vis)
+
+
+def update_frame(image, mask, energy, frame):
+    vis = np.concatenate([mask, energy], axis=1).astype('uint8')
+    if frame is None:
+        frame = plt.imshow(vis)
+    else:
+        frame.set_data(vis)
+    plt.pause(0.1)
+    return frame
+
+
+# ======================
+# Main functions
+# ======================
+
+def recognize_from_image(model):
+    logger.info('Start inference...')
+
+    image_path = args.input[0]
+
+    # prepare input data
+    org_img = cv2.cvtColor(imread(image_path), cv2.COLOR_BGR2RGB).astype(np.uint8)
+    if args.height is not None and args.width is not None:
+        h = args.height
+        w = args.width
+    else:
+        h = org_img.shape[0]
+        w = org_img.shape[1]
+
+    image = preprocess(org_img, h=h, w=w)[None]
+
+    if args.benchmark and args.video is None:
+        logger.info('BENCHMARK mode')
+        for i in range(5):
+            start = int(round(time.time() * 1000))
+            _, _, seg_output_final, energy_output_final = model.predict([image])
+            end = int(round(time.time() * 1000))
+            logger.info(f'\tailia processing time {end - start} ms')
+    else:
+        _, _, seg_output_final, energy_output_final = model.predict([image])
+
+    seg, energy_l, energy_r = postprocess(org_img, seg_output_final, energy_output_final)
+
+    # visualize
+    visualize_and_save(org_img, seg, energy_l, energy_r, args.overlay, args.savepath)
+
+    logger.info('Script finished successfully.')
+
+
+def recognize_from_video(model):
+    capture = webcamera_utils.get_capture(args.video)
+
+    # Read one frame to determine the inference size.
+    _, t = capture.read()
+    if args.height is not None and args.width is not None:
+        h = args.height
+        w = args.width
+    else:
+        h = t.shape[0]
+        w = t.shape[1]
+
+    frame_shown = None
+
+    while True:
+        ret, frame = capture.read()
+        if (cv2.waitKey(1) & 0xFF == ord("q")) or not ret:
+            break
+        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB).astype(np.uint8)
+
+        # inference
+        image = preprocess(frame, h=h, w=w)[None]
+        _, _, seg_output_final, energy_output_final = model.predict([image])
+
+        seg, energy_l, energy_r = postprocess(frame, seg_output_final, energy_output_final)
+        mask_vis, energy_vis = create_visualization(frame, seg, energy_l, energy_r, overlay=args.overlay)
+
+        # visualize
+        frame_shown = update_frame(frame, mask_vis, energy_vis, frame_shown)
+        if not plt.get_fignums():
+            break
+
+    capture.release()
+    logger.info('Script finished successfully.')
+
+
+def main():
+    # model files check and download
+    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)
+
+    # net initialize
+    model = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=args.env_id)
+
+    if args.video is not None:
+        # video mode
+        recognize_from_video(model)
+    else:
+        # image mode
+        recognize_from_image(model)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/hand_recognition/ego2hands/output.png b/hand_recognition/ego2hands/output.png
new file mode 100644
index 000000000..978b39f85
Binary files /dev/null and b/hand_recognition/ego2hands/output.png differ
diff --git a/hand_recognition/ego2hands/output_energy.png b/hand_recognition/ego2hands/output_energy.png
new file mode 100644
index 000000000..a210a957e
Binary files /dev/null and b/hand_recognition/ego2hands/output_energy.png differ
diff --git a/hand_recognition/ego2hands/sample_image.png b/hand_recognition/ego2hands/sample_image.png
new file mode 100644
index 000000000..5b3c8c613
Binary files /dev/null and b/hand_recognition/ego2hands/sample_image.png differ
diff --git a/scripts/download_all_models.sh b/scripts/download_all_models.sh
index 76821420e..0e5a273c5 100755
--- a/scripts/download_all_models.sh
+++ b/scripts/download_all_models.sh
@@ -131,6 +131,7 @@ cd ../../hand_recognition/hand3d; python3 hand3d.py ${OPTION}
 cd ../../hand_recognition/minimal-hand; python3 minimal-hand.py ${OPTION}
 cd ../../hand_recognition/v2v-posenet; python3 v2v-posenet.py ${OPTION}
 cd ../../hand_recognition/hands_segmentation_pytorch; python3 hands_segmentation_pytorch.py ${OPTION}
+cd ../../hand_recognition/ego2hands; python3 ego2hands.py ${OPTION}
 cd ../../image_captioning/illustration2vec; python3 illustration2vec.py ${OPTION}
 cd ../../image_captioning/image_captioning_pytorch; python3 image_captioning_pytorch.py ${OPTION}
 cd ../../image_captioning/blip2; python3 blip2.py ${OPTION}