diff --git a/README.md b/README.md index 35c70529a..26b57d4a2 100644 --- a/README.md +++ b/README.md @@ -680,7 +680,7 @@ If you would like to try on your computer: | | Model | Reference | Exported From | Supported Ailia Version | Blog | |:-----------|------------:|:------------:|:------------:|:------------:|:------------:| | [](optical_flow_estimation/raft/) | [raft](/optical_flow_estimation/raft/) | [RAFT: Recurrent All Pairs Field Transforms for Optical Flow](https://github.com/princeton-vl/RAFT) | Pytorch | 1.2.6 and later | [EN](https://medium.com/axinc-ai/raft-a-machine-learning-model-for-estimating-optical-flow-6ab6d077e178) [JP](https://medium.com/axinc/raft-optical-flow%E3%82%92%E6%8E%A8%E5%AE%9A%E3%81%99%E3%82%8B%E6%A9%9F%E6%A2%B0%E5%AD%A6%E7%BF%92%E3%83%A2%E3%83%87%E3%83%AB-bf898965de05) | - +| [](optical_flow_estimation/cotracker3/) | [cotracker3](/optical_flow_estimation/cotracker3/) | [CoTracker3: Simpler and Better Point Tracking by Pseudo-Labelling Real Videos](https://github.com/facebookresearch/co-tracker) | Pytorch | 2.4 and later | | ## Point segmentation | | Model | Reference | Exported From | Supported Ailia Version | Blog | diff --git a/optical_flow_estimation/cotracker3/LICENSE.md b/optical_flow_estimation/cotracker3/LICENSE.md new file mode 100644 index 000000000..e395ca3e2 --- /dev/null +++ b/optical_flow_estimation/cotracker3/LICENSE.md @@ -0,0 +1,399 @@ +Attribution-NonCommercial 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. 
\ No newline at end of file diff --git a/optical_flow_estimation/cotracker3/README.md b/optical_flow_estimation/cotracker3/README.md new file mode 100644 index 000000000..ab6da444d --- /dev/null +++ b/optical_flow_estimation/cotracker3/README.md @@ -0,0 +1,51 @@ +# CoTracker3: Simpler and Better Point Tracking by Pseudo-Labelling Real Videos + +## Input + +![Input](input.gif) + +(Video from https://github.com/facebookresearch/co-tracker/blob/main/gradio_demo/videos/bear.mp4) + +Shape : (1, 3, 854, 480) + +## Output + +![Output](output.gif) + + +## Usage +Automatically downloads the onnx and prototxt files on the first run. +It is necessary to be connected to the Internet while downloading. + +For the sample video, +```bash +$ python3 cotracker3.py +``` + +If you want to specify the input video, put the video path after the `--input` option. +You can use `--savepath` option to change the name of the output file to save. + +```bash +$ python3 cotracker3.py --input VIDEO_PATH --savepath SAVE_VIDEO_PATH +``` + +By default, the ailia SDK is used. If you want to use ONNX Runtime, use the `--onnx` option. 
+```bash +$ python3 cotracker3.py --onnx +``` + +## Reference + +- [CoTracker3: Simpler and Better Point Tracking by Pseudo-Labelling Real Videos](https://github.com/facebookresearch/co-tracker) + +## Framework + +Pytorch 2.4 + +## Model Format + +ONNX opset=20 + +## Netron + +[cotracker3.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/cotracker3/cotracker3.onnx.prototxt) diff --git a/optical_flow_estimation/cotracker3/cotracker3.py b/optical_flow_estimation/cotracker3/cotracker3.py new file mode 100644 index 000000000..fc9552594 --- /dev/null +++ b/optical_flow_estimation/cotracker3/cotracker3.py @@ -0,0 +1,147 @@ +import sys +import cv2 +import time +import numpy as np + +import ailia +import onnxruntime as ort +from vis import Visualizer + +# import original modules +sys.path.append('../../util') +from arg_utils import get_base_parser, update_parser # noqa: E402 +from model_utils import check_and_download_models # noqa: E402 + +# logger +from logging import getLogger # noqa: E402 +logger = getLogger(__name__) + + +# ====================== +# Parameters +# ====================== +VIDEO_PATH = 'input.mp4' +SAVE_PATH = 'output.mp4' + +# ====================== +# Argument Parser Config +# ====================== +parser = get_base_parser( + 'CoTracker3: Simpler and Better Point Tracking by Pseudo-Labelling Real Videos', + VIDEO_PATH, + SAVE_PATH, +) + +parser.add_argument("--grid_size", type=int, default=10, help="Regular grid size") +parser.add_argument( + "--grid_query_frame", + type=int, + default=0, + help="Compute dense and grid tracks starting from this frame", +) +parser.add_argument( + "--backward_tracking", + action="store_true", + help="Compute tracks in both directions, not only forward", +) + +parser.add_argument('--onnx', action='store_true', help='execute onnxruntime version.') + +args = update_parser(parser) + +# ========================== +# MODEL AND OTHER PARAMETERS +# ========================== +WEIGHT_PATH = 
def read_video_from_path(path):
    """Decode every frame of a video into a single array.

    Args:
        path: Video source handed to ``cv2.VideoCapture``.

    Returns:
        The decoded frames stacked along a new leading axis with
        ``np.stack``, in the order they were read. ``None`` is returned only
        when constructing the capture object itself raises.

    Raises:
        ValueError: If the source cannot be opened or yields no frame.
    """
    try:
        cap = cv2.VideoCapture(path)
    except Exception as e:
        print("Error opening video file: ", e)
        return None

    frames = []
    try:
        # An unopenable source does not raise in the constructor; it simply
        # reports ``isOpened() == False`` and the loop body never runs.
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        # Release the capture even if decoding fails midway.
        cap.release()

    if not frames:
        # np.stack([]) would raise a ValueError with an unrelated message;
        # raise the same exception type with the actual cause instead.
        raise ValueError(f"No frames could be read from video: {path}")

    return np.stack(frames)


def compute(net, video):
    """Run the tracker once on ``video``.

    The grid size and the query frame are read from the module-level
    ``args`` and passed to the network as 0-d ``int64`` arrays.

    Args:
        net: Network object. When ``args.onnx`` is false it is called as
            ``net.run((video, grid_size, grid_query_frame))``; otherwise its
            first three inputs (as listed by ``net.get_inputs()``) are fed by
            name.
        video: Input array forwarded unchanged as the first network input.

    Returns:
        Whatever ``net.run`` returns.
    """
    grid_size = np.array(args.grid_size, dtype=np.int64)
    grid_query_frame = np.array(args.grid_query_frame, dtype=np.int64)

    if not args.onnx:
        return net.run((video, grid_size, grid_query_frame))

    inputs = net.get_inputs()
    feed = {
        inputs[0].name: video,
        inputs[1].name: grid_size,
        inputs[2].name: grid_query_frame,
    }
    return net.run([], feed)
def read_video_from_path(path):
    """Decode every frame of a video into a single array.

    Args:
        path: Video source handed to ``cv2.VideoCapture``.

    Returns:
        The decoded frames stacked along a new leading axis with
        ``np.stack``, in the order they were read. ``None`` is returned only
        when constructing the capture object itself raises.

    Raises:
        ValueError: If the source cannot be opened or yields no frame.
    """
    try:
        cap = cv2.VideoCapture(path)
    except Exception as e:
        print("Error opening video file: ", e)
        return None

    frames = []
    try:
        # An unopenable source does not raise in the constructor; it simply
        # reports ``isOpened() == False`` and the loop body never runs.
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)  # collect the decoded frame
    finally:
        # Release the capture even if decoding fails midway.
        cap.release()

    if not frames:
        # np.stack([]) would raise a ValueError with an unrelated message;
        # raise the same exception type with the actual cause instead.
        raise ValueError(f"No frames could be read from video: {path}")

    return np.stack(frames)


class Visualizer:
    """Draw tracked points on a video and write the result to a file."""

    def __init__(
        self,
        pad_value: int = 0,
        linewidth: int = 2,
        show_first_frame: int = 10,
        tracks_leave_trace: int = 0,  # -1 for infinite
    ):
        """Store the drawing configuration.

        Args:
            pad_value: Border width, in pixels, added around every frame
                before drawing; tracks are shifted by the same amount.
            linewidth: Thickness of trace lines; points are drawn with a
                radius of twice this value.
            show_first_frame: Number of times the first output frame is
                repeated at the start of the result (no repetition when 0).
            tracks_leave_trace: Number of past frames whose trajectory is
                drawn behind each point; 0 disables traces and a negative
                value draws the trajectory from the first frame.
        """
        # ``matplotlib.cm.get_cmap`` was removed in Matplotlib 3.9, whereas
        # ``pyplot.get_cmap`` is available in old and new releases alike.
        self.color_map = plt.get_cmap("gist_rainbow")

        self.show_first_frame = show_first_frame
        self.tracks_leave_trace = tracks_leave_trace
        self.pad_value = pad_value
        self.linewidth = linewidth

    def visualize(
        self,
        video,
        tracks,
        visibility=None,
        filename: str = "video",
        query_frame=0,
        opacity: float = 1.0,
    ):
        """Pad the video, draw the tracks on it and save the result.

        Args:
            video: Array unpacked as (B, T, C, H, W) with C == 3; only the
                first batch element is drawn.
            tracks: Array unpacked as (B, T, N, 2) holding (x, y) per point.
            visibility: Optional array indexed as ``[0, t, i]``; falsy
                entries are drawn as outlines instead of filled discs.
            filename: Path of the video file to write.
            query_frame: Frame whose point positions determine the colours.
            opacity: Scaled to 0-255 and appended to each colour tuple.

        Returns:
            The annotated video as a ``uint8`` array of shape
            (1, T', C, H', W').
        """
        video = pad(video, self.pad_value, 255)

        color_alpha = int(opacity * 255)
        # Shift the coordinates by the border that was just added.
        tracks = tracks + self.pad_value

        res_video = self.draw_tracks_on_video(
            video=video,
            tracks=tracks,
            visibility=visibility,
            query_frame=query_frame,
            color_alpha=color_alpha,
        )

        self.save_video(res_video, filename=filename)
        return res_video

    def save_video(self, video, filename):
        """Write ``video`` to ``filename`` as an ``mp4v`` file at 30 fps.

        Args:
            video: ``uint8`` array laid out as (1, T, C, H, W).
            filename: Destination path.

        Raises:
            OSError: If the video writer cannot be opened for ``filename``.
        """
        # (1, T, C, H, W) -> T frames laid out as (H, W, C)
        frames = np.transpose(video[0], (0, 2, 3, 1))
        height, width = frames.shape[1:3]

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # MP4 codec
        out = cv2.VideoWriter(filename, fourcc, 30, (width, height))
        if not out.isOpened():
            # Without this check a failed open is silent: nothing is
            # written, yet the success message below is still printed.
            raise OSError(f"Could not open video writer for: {filename}")

        try:
            # The first two and the last frame are skipped, exactly as the
            # code did before. NOTE(review): confirm the trimming is intended.
            for frame in frames[2:-1]:
                out.write(np.ascontiguousarray(frame))
        finally:
            out.release()

        print(f"Video saved to {filename}")

    def draw_tracks_on_video(
        self,
        video,
        tracks,
        visibility=None,
        query_frame=0,
        color_alpha: int = 255,
    ):
        """Draw every tracked point (and optional traces) on each frame.

        Args:
            video: Array unpacked as (B, T, C, H, W) with C == 3.
            tracks: Array unpacked as (B, T, N, 2); cast to ``int64``.
            visibility: Optional array indexed as ``[0, t, i]``.
            query_frame: Frame whose y coordinates are mapped to colours.
            color_alpha: Value appended to each colour tuple.

        Returns:
            ``uint8`` array of shape (1, T', C, H, W), where the first frame
            is repeated ``show_first_frame`` times when that is positive.
        """
        _, T, C, _, _ = video.shape
        _, _, N, D = tracks.shape

        assert D == 2
        assert C == 3

        frames = np.transpose(video[0], (0, 2, 3, 1)).astype(np.uint8)  # T, H, W, C
        tracks = tracks[0].astype(np.int64)  # T, N, 2

        res_video = [frame.copy() for frame in frames]

        # Colour each point by its y coordinate in the query frame and keep
        # that colour for the whole clip.
        query_y = tracks[query_frame, :, 1]
        norm = plt.Normalize(query_y.min(), query_y.max())
        point_colors = np.asarray(self.color_map(norm(query_y)))[:, :3] * 255
        vector_colors = np.repeat(point_colors[None], T, axis=0)  # T, N, 3

        # draw tracks
        if self.tracks_leave_trace != 0:
            for t in range(query_frame + 1, T):
                first_ind = (
                    max(0, t - self.tracks_leave_trace)
                    if self.tracks_leave_trace >= 0
                    else 0
                )
                res_video[t] = self._draw_pred_tracks(
                    res_video[t],
                    tracks[first_ind: t + 1],
                    vector_colors[first_ind: t + 1],
                )

        # draw points
        radius = int(self.linewidth * 2)
        for t in range(T):
            img = np.uint8(res_video[t])
            for i in range(N):
                coord = (tracks[t, i, 0], tracks[t, i, 1])
                visible = True if visibility is None else bool(visibility[0, t, i])
                if coord[0] != 0 and coord[1] != 0:
                    img = draw_circle(
                        img,
                        coord=coord,
                        radius=radius,
                        color=vector_colors[t, i].astype(int),
                        visible=visible,
                        color_alpha=color_alpha,
                    )
            res_video[t] = np.array(img)

        # construct the final rgb sequence
        if self.show_first_frame > 0:
            res_video = [res_video[0]] * self.show_first_frame + res_video[1:]
        return np.transpose(np.stack(res_video), (0, 3, 1, 2))[np.newaxis, ...].astype(np.uint8)

    def _draw_pred_tracks(self, rgb, tracks, vector_colors):
        """Draw the trajectory of every point onto one frame.

        This method is called by ``draw_tracks_on_video`` whenever
        ``tracks_leave_trace`` is non-zero.

        Args:
            rgb: Frame of shape (H, W, 3).
            tracks: Integer array (S, N, 2) with the positions of the N
                points over the S frames ending at the current one.
            vector_colors: Array (S, N, 3) with the colour of every point.

        Returns:
            A ``uint8`` frame with one line per consecutive pair of
            positions. For a positive ``tracks_leave_trace`` the frame is
            re-blended with its previous state after every step with weight
            ``(s / S) ** 2``, so older segments end up fainter than recent
            ones.
        """
        num_steps, num_points, _ = tracks.shape
        rgb = np.ascontiguousarray(rgb, dtype=np.uint8)

        for s in range(num_steps - 1):
            before = rgb.copy()
            for i in range(num_points):
                start = (int(tracks[s, i, 0]), int(tracks[s, i, 1]))
                end = (int(tracks[s + 1, i, 0]), int(tracks[s + 1, i, 1]))
                # Same "zero coordinate means skip" rule as for the points.
                if start[0] != 0 and start[1] != 0:
                    color = tuple(int(c) for c in vector_colors[s, i])
                    cv2.line(rgb, start, end, color, self.linewidth)
            if self.tracks_leave_trace > 0:
                alpha = (s / num_steps) ** 2
                rgb = cv2.addWeighted(rgb, alpha, before, 1 - alpha, 0)
        return rgb


def draw_ellipse(image, left_up_point, right_down_point, color, visible=True):
    """Draw the ellipse inscribed in a bounding box onto ``image`` in place.

    Args:
        image: Image passed to ``cv2.ellipse``.
        left_up_point: (x, y) of the top-left corner of the bounding box.
        right_down_point: (x, y) of the bottom-right corner.
        color: Colour components; each is converted to ``int``.
        visible: Filled when true, a 2-pixel outline otherwise.

    Returns:
        ``image``.
    """
    # Plain Python ints keep OpenCV's argument parsing independent of the
    # NumPy integer type of the incoming coordinates.
    center = (
        int((left_up_point[0] + right_down_point[0]) // 2),
        int((left_up_point[1] + right_down_point[1]) // 2),
    )
    axes = (
        int(abs(right_down_point[0] - left_up_point[0]) // 2),
        int(abs(right_down_point[1] - left_up_point[1]) // 2),
    )
    thickness = -1 if visible else 2
    color = tuple(map(int, color))
    cv2.ellipse(image, center, axes, 0, 0, 360, color, thickness)
    return image


def draw_circle(rgb, coord, radius, color=(255, 0, 0), visible=True, color_alpha=None):
    """Draw a circle of ``radius`` centred on ``coord``.

    Args:
        rgb: Image to draw on.
        coord: (x, y) centre of the circle.
        radius: Circle radius in pixels.
        color: Colour components of the circle.
        visible: Forwarded to ``draw_ellipse``: filled when true, outline
            otherwise.
        color_alpha: Value appended to ``color``; 255 when ``None``.

    Returns:
        The image returned by ``draw_ellipse``.
    """
    # Bounding box of the circle.
    left_up_point = (coord[0] - radius, coord[1] - radius)
    right_down_point = (coord[0] + radius, coord[1] + radius)
    color = tuple(list(color) + [color_alpha if color_alpha is not None else 255])

    # Forward ``visible`` instead of a hard-coded True so that points flagged
    # as not visible are drawn as outlines.
    return draw_ellipse(rgb, left_up_point, right_down_point, color, visible)


def pad(video, pad_value, constant_value=255):
    """Add a constant border to the last two axes of a 5-D array.

    Args:
        video: Array with five axes; the first three are left untouched.
        pad_value: Number of elements added before and after each of the
            last two axes.
        constant_value: Fill value of the border.

    Returns:
        The padded array.
    """
    padding = (
        (0, 0),
        (0, 0),
        (0, 0),
        (pad_value, pad_value),
        (pad_value, pad_value),
    )
    return np.pad(video, pad_width=padding, mode='constant', constant_values=constant_value)