diff --git a/classy_vision/dataset/classy_kinetics400.py b/classy_vision/dataset/classy_kinetics400.py
index 9ce1d042d0..5484c81cc1 100644
--- a/classy_vision/dataset/classy_kinetics400.py
+++ b/classy_vision/dataset/classy_kinetics400.py
@@ -45,6 +45,7 @@ def __init__(
         audio_channels: int,
         step_between_clips: int,
         frame_rate: Optional[int],
+        dilation: int,
         clips_per_video: int,
         video_dir: str,
         extensions: List[str],
@@ -72,6 +73,7 @@ def __init__(
             step_between_clips: Number of frames between each clip.
             frame_rate: desired video frame rate. If None, keep original video frame rate.
+            dilation: the spacing between adjacent sampled frames
             clips_per_video: Number of clips to sample from each video
             video_dir: path to video folder
             extensions: A list of file extensions, such as "avi" and "mp4". Only
@@ -104,6 +106,7 @@ def __init__(
             _video_min_dimension=video_min_dimension,
             _audio_samples=audio_samples,
             _audio_channels=audio_channels,
+            dilation=dilation,
         )
         metadata = dataset.metadata
         if metadata and not os.path.exists(metadata_filepath):
@@ -149,6 +152,7 @@ def from_config(cls, config: Dict[str, Any]) -> "Kinetics400Dataset":
             audio_samples,
             step_between_clips,
             frame_rate,
+            dilation,
             clips_per_video,
         ) = cls.parse_config(config)
         extensions = config.get("extensions", ("mp4"))
@@ -169,6 +173,7 @@ def from_config(cls, config: Dict[str, Any]) -> "Kinetics400Dataset":
             audio_channels,
             step_between_clips,
             frame_rate,
+            dilation,
             clips_per_video,
             config["video_dir"],
             extensions,
diff --git a/classy_vision/dataset/classy_video_dataset.py b/classy_vision/dataset/classy_video_dataset.py
index 9e8385696d..a1e3e02c09 100644
--- a/classy_vision/dataset/classy_video_dataset.py
+++ b/classy_vision/dataset/classy_video_dataset.py
@@ -125,6 +125,7 @@ def parse_config(cls, config: Dict[str, Any]):
         audio_samples = config.get("audio_samples", 0)
         step_between_clips = config.get("step_between_clips", 1)
         frame_rate = config.get("frame_rate", None)
+        dilation = config.get("dilation", 1)
         clips_per_video = config.get("clips_per_video", 1)

         (
@@ -157,6 +158,7 @@ def parse_config(cls, config: Dict[str, Any]):
             audio_samples,
             step_between_clips,
             frame_rate,
+            dilation,
             clips_per_video,
         )
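A quick illustration of the new dilation option (not part of the patch; the semantics below are inferred from the parameter description): a clip still contains frames_per_clip frames, but adjacent sampled frames sit dilation raw frames apart, so the clip's temporal footprint grows by the same factor.

    # Illustration only: assumed semantics of "dilation" (keep every
    # dilation-th frame). With frames_per_clip=8 and dilation=2, a clip
    # keeps 8 frames but spans 16 raw frames.
    frames_per_clip, dilation = 8, 2
    clip_indices = [i * dilation for i in range(frames_per_clip)]
    print(clip_indices)  # [0, 2, 4, 6, 8, 10, 12, 14]
    print(len(clip_indices), "frames spanning", clip_indices[-1] + 1, "raw frames")

This is distinct from step_between_clips, which (per the docstring above) spaces out whole clips rather than frames within a clip.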
diff --git a/classy_vision/generic/profiler.py b/classy_vision/generic/profiler.py
index 4b1044e54a..d6b19802fc 100644
--- a/classy_vision/generic/profiler.py
+++ b/classy_vision/generic/profiler.py
@@ -102,7 +102,7 @@ def _get_batchsize_per_replica(x: Union[Tuple, List, Dict]) -> int:
     return x.size()[0]


-def _layer_flops(layer: nn.Module, x: Any, y: Any) -> int:
+def _layer_flops(layer: nn.Module, x: Any, y: Any, verbose: bool = False) -> int:
     """
     Computes the number of FLOPs required for a single layer.
@@ -162,6 +162,36 @@ def flops(self, x):
             / layer.groups
         )

+    # 3D convolution
+    elif layer_type in ["Conv3d"]:
+        out_t = int(
+            (x.size()[2] + 2 * layer.padding[0] - layer.kernel_size[0])
+            // layer.stride[0]
+            + 1
+        )
+        out_h = int(
+            (x.size()[3] + 2 * layer.padding[1] - layer.kernel_size[1])
+            // layer.stride[1]
+            + 1
+        )
+        out_w = int(
+            (x.size()[4] + 2 * layer.padding[2] - layer.kernel_size[2])
+            // layer.stride[2]
+            + 1
+        )
+        flops = (
+            batchsize_per_replica
+            * layer.in_channels
+            * layer.out_channels
+            * layer.kernel_size[0]
+            * layer.kernel_size[1]
+            * layer.kernel_size[2]
+            * out_t
+            * out_h
+            * out_w
+            / layer.groups
+        )
+
     # learned group convolution:
     elif layer_type in ["LearnedGroupConv"]:
         conv = layer.conv
@@ -186,51 +216,36 @@ def flops(self, x):
         )
         flops = count1 + count2

-    # non-linearities:
+    # non-linearities are not considered in MAC counting
     elif layer_type in ["ReLU", "ReLU6", "Tanh", "Sigmoid", "Softmax"]:
-        flops = x.numel()
-
-    # 2D pooling layers:
-    elif layer_type in ["AvgPool2d", "MaxPool2d"]:
-        in_h = x.size()[2]
-        in_w = x.size()[3]
-        if isinstance(layer.kernel_size, int):
-            layer.kernel_size = (layer.kernel_size, layer.kernel_size)
-        kernel_ops = layer.kernel_size[0] * layer.kernel_size[1]
-        out_h = 1 + int(
-            (in_h + 2 * layer.padding - layer.kernel_size[0]) / layer.stride
-        )
-        out_w = 1 + int(
-            (in_w + 2 * layer.padding - layer.kernel_size[1]) / layer.stride
+        flops = 0
+
+    elif layer_type in [
+        "MaxPool1d",
+        "MaxPool2d",
+        "MaxPool3d",
+        "AdaptiveMaxPool1d",
+        "AdaptiveMaxPool2d",
+        "AdaptiveMaxPool3d",
+    ]:
+        flops = 0
+
+    elif layer_type in ["AvgPool1d", "AvgPool2d", "AvgPool3d"]:
+        kernel_ops = 1
+        flops = kernel_ops * y.numel()
+
+    elif layer_type in ["AdaptiveAvgPool1d", "AdaptiveAvgPool2d", "AdaptiveAvgPool3d"]:
+        assert isinstance(layer.output_size, (list, tuple))
+        kernel = torch.Tensor(list(x.shape[2:])) // torch.Tensor(
+            [list(layer.output_size)]
         )
-        flops = x.size()[0] * x.size()[1] * out_w * out_h * kernel_ops
-
-    # adaptive avg pool2d
-    # This is approximate and works only for downsampling without padding
-    # based on aten/src/ATen/native/AdaptiveAveragePooling.cpp
-    elif layer_type in ["AdaptiveAvgPool2d"]:
-        in_h = x.size()[2]
-        in_w = x.size()[3]
-        if isinstance(layer.output_size, int):
-            out_h, out_w = layer.output_size, layer.output_size
-        elif len(layer.output_size) == 1:
-            out_h, out_w = layer.output_size[0], layer.output_size[0]
-        else:
-            out_h, out_w = layer.output_size
-        if out_h > in_h or out_w > in_w:
-            raise ClassyProfilerNotImplementedError(layer)
-        batchsize_per_replica = x.size()[0]
-        num_channels = x.size()[1]
-        kh = in_h - out_h + 1
-        kw = in_w - out_w + 1
-        kernel_ops = kh * kw
-        flops = batchsize_per_replica * num_channels * out_h * out_w * kernel_ops
+        kernel_ops = torch.prod(kernel)
+        flops = kernel_ops * y.numel()

     # linear layer:
     elif layer_type in ["Linear"]:
         weight_ops = layer.weight.numel()
-        bias_ops = layer.bias.numel() if layer.bias is not None else 0
-        flops = x.size()[0] * (weight_ops + bias_ops)
+        flops = x.size()[0] * weight_ops

     # batch normalization / layer normalization:
     elif layer_type in [
@@ -240,94 +255,12 @@ def flops(self, x):
         "SyncBatchNorm",
         "LayerNorm",
     ]:
-        flops = 2 * x.numel()
-
-    # 3D convolution
-    elif layer_type in ["Conv3d"]:
-        out_t = int(
-            (x.size()[2] + 2 * layer.padding[0] - layer.kernel_size[0])
-            // layer.stride[0]
-            + 1
-        )
-        out_h = int(
-            (x.size()[3] + 2 * layer.padding[1] - layer.kernel_size[1])
-            // layer.stride[1]
-            + 1
-        )
-        out_w = int(
-            (x.size()[4] + 2 * layer.padding[2] - layer.kernel_size[2])
-            // layer.stride[2]
-            + 1
-        )
-        flops = (
-            batchsize_per_replica
-            * layer.in_channels
-            * layer.out_channels
-            * layer.kernel_size[0]
-            * layer.kernel_size[1]
-            * layer.kernel_size[2]
-            * out_t
-            * out_h
-            * out_w
-            / layer.groups
-        )
-
-    # 3D pooling layers
-    elif layer_type in ["AvgPool3d", "MaxPool3d"]:
-        in_t = x.size()[2]
-        in_h = x.size()[3]
-        in_w = x.size()[4]
-        if isinstance(layer.kernel_size, int):
-            layer.kernel_size = (
-                layer.kernel_size,
-                layer.kernel_size,
-                layer.kernel_size,
-            )
-        if isinstance(layer.padding, int):
-            layer.padding = (layer.padding, layer.padding, layer.padding)
-        if isinstance(layer.stride, int):
-            layer.stride = (layer.stride, layer.stride, layer.stride)
-        kernel_ops = layer.kernel_size[0] * layer.kernel_size[1] * layer.kernel_size[2]
-        out_t = 1 + int(
-            (in_t + 2 * layer.padding[0] - layer.kernel_size[0]) / layer.stride[0]
-        )
-        out_h = 1 + int(
-            (in_h + 2 * layer.padding[1] - layer.kernel_size[1]) / layer.stride[1]
-        )
-        out_w = 1 + int(
-            (in_w + 2 * layer.padding[2] - layer.kernel_size[2]) / layer.stride[2]
-        )
-        flops = batchsize_per_replica * x.size()[1] * out_t * out_h * out_w * kernel_ops
-
-    # adaptive avg pool3d
-    # This is approximate and works only for downsampling without padding
-    # based on aten/src/ATen/native/AdaptiveAveragePooling3d.cpp
-    elif layer_type in ["AdaptiveAvgPool3d"]:
-        in_t = x.size()[2]
-        in_h = x.size()[3]
-        in_w = x.size()[4]
-        out_t = layer.output_size[0]
-        out_h = layer.output_size[1]
-        out_w = layer.output_size[2]
-        if out_t > in_t or out_h > in_h or out_w > in_w:
-            raise ClassyProfilerNotImplementedError(layer)
-        batchsize_per_replica = x.size()[0]
-        num_channels = x.size()[1]
-        kt = in_t - out_t + 1
-        kh = in_h - out_h + 1
-        kw = in_w - out_w + 1
-        kernel_ops = kt * kh * kw
-        flops = (
-            batchsize_per_replica * num_channels * out_t * out_w * out_h * kernel_ops
-        )
+        # batchnorm can be merged into the conv op. Thus, count 0 FLOPs
+        flops = 0

     # dropout layer
     elif layer_type in ["Dropout"]:
-        # At test time, we do not drop values but scale the feature map by the
-        # dropout ratio
-        flops = 1
-        for dim_size in x.size():
-            flops *= dim_size
+        flops = 0

     elif layer_type == "Identity":
         flops = 0
@@ -351,11 +284,14 @@ def flops(self, x):
         f"params(M): {count_params(layer) / 1e6}",
         f"flops(M): {int(flops) / 1e6}",
     ]
-    logging.debug("\t".join(message))
+    if verbose:
+        logging.info("\t".join(message))
     return flops


-def _layer_activations(layer: nn.Module, x: Any, out: Any) -> int:
+def _layer_activations(
+    layer: nn.Module, x: Any, out: Any, verbose: bool = False
+) -> int:
     """
     Computes the number of activations produced by a single layer.
@@ -376,7 +312,8 @@ def activations(self, x, out):
         return 0

     message = [f"module: {typestr}", f"activations: {activations}"]
-    logging.debug("\t".join(message))
+    if verbose:
+        logging.info("\t".join(message))
     return activations


@@ -402,17 +339,19 @@ def summarize_profiler_info(prof: torch.autograd.profiler.profile) -> str:


 class ComplexityComputer:
-    def __init__(self, compute_fn: Callable, count_unique: bool):
+    def __init__(self, compute_fn: Callable, count_unique: bool, verbose: bool = False):
         self.compute_fn = compute_fn
         self.count_unique = count_unique
         self.count = 0
+        self.verbose = verbose
         self.seen_modules = set()

     def compute(self, layer: nn.Module, x: Any, out: Any, module_name: str):
         if self.count_unique and module_name in self.seen_modules:
             return
-        logging.debug(f"module name: {module_name}")
-        self.count += self.compute_fn(layer, x, out)
+        if self.verbose:
+            logging.info(f"module name: {module_name}")
+        self.count += self.compute_fn(layer, x, out, self.verbose)
         self.seen_modules.add(module_name)

     def reset(self):
@@ -498,6 +437,7 @@ def compute_complexity(
     input_key: Optional[Union[str, List[str]]] = None,
     patch_attr: str = None,
     compute_unique: bool = False,
+    verbose: bool = False,
 ) -> int:
     """
     Compute the complexity of a forward pass.
@@ -517,7 +457,7 @@ def compute_complexity(
     else:
         input = get_model_dummy_input(model, input_shape, input_key)

-    complexity_computer = ComplexityComputer(compute_fn, compute_unique)
+    complexity_computer = ComplexityComputer(compute_fn, compute_unique, verbose)

     # measure FLOPs:
     modify_forward(model, complexity_computer, patch_attr=patch_attr)
@@ -540,7 +480,7 @@ def compute_flops(
     Compute the number of FLOPs needed for a forward pass.
     """
     return compute_complexity(
-        model, _layer_flops, input_shape, input_key, patch_attr="flops"
+        model, _layer_flops, input_shape, input_key, patch_attr="flops", verbose=True
     )


@@ -553,7 +493,12 @@ def compute_activations(
     Compute the number of activations created in a forward pass.
     """
    return compute_complexity(
-        model, _layer_activations, input_shape, input_key, patch_attr="activations"
+        model,
+        _layer_activations,
+        input_shape,
+        input_key,
+        patch_attr="activations",
+        verbose=True,
     )
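To sanity-check the new counting rules, here is a standalone sketch (not part of the patch; the layer configuration is made up) that reproduces the Conv3d MAC formula and the adaptive-average-pool kernel estimate from the hunks above:

    import torch

    # Hypothetical Conv3d: 3 -> 64 channels, kernel 3x7x7, stride (1, 2, 2),
    # padding (1, 3, 3), groups 1, on a 1 x 3 x 8 x 224 x 224 input.
    batch, c_in, c_out, groups = 1, 3, 64, 1
    k, stride, pad = (3, 7, 7), (1, 2, 2), (1, 3, 3)
    in_thw = (8, 224, 224)

    out_thw = [(in_thw[i] + 2 * pad[i] - k[i]) // stride[i] + 1 for i in range(3)]
    macs = (
        batch * c_in * c_out * k[0] * k[1] * k[2]
        * out_thw[0] * out_thw[1] * out_thw[2] // groups
    )
    print(out_thw, macs)  # [8, 112, 112] -> 2,832,334,848 MACs (~2.83G)

    # Adaptive average pooling: the per-output kernel size is approximated by
    # elementwise floor division of the input grid by the output grid.
    x_shape, output_size = (8, 112, 112), (1, 7, 7)
    kernel = torch.Tensor(x_shape) // torch.Tensor(output_size)
    print(torch.prod(kernel).item())  # 8 * 16 * 16 = 2048 ops per output element

The formula counts multiply-accumulates, which is consistent with the rest of the patch: bias, batchnorm, max pooling, and non-linearities now all count as zero.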
diff --git a/classy_vision/optim/classy_optimizer.py b/classy_vision/optim/classy_optimizer.py
index 7be454bff7..9b61190c86 100644
--- a/classy_vision/optim/classy_optimizer.py
+++ b/classy_vision/optim/classy_optimizer.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.

+import copy
 from abc import ABC, abstractmethod
 from typing import Any, Callable, Dict, Optional, Union

@@ -221,7 +222,10 @@ def update_schedule_on_step(self, where: float) -> None:
     def _update_schedule(self) -> None:
         """Update the optimizer's parameters based on self.parameters."""
         for group in self.optimizer.param_groups:
-            group.update(self.parameters)
+            lr_scale = group.get("lr_scale", 1.0)
+            parameters = copy.deepcopy(self.parameters)
+            parameters["lr"] *= lr_scale
+            group.update(parameters)

         # Here there's an assumption that pytorch optimizer maintain the order
         # of param_groups and that frozen_param_groups were added before the
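Finally, a minimal sketch of the new per-group learning-rate scaling in _update_schedule, using a plain list of dicts to stand in for a torch.optim optimizer's param_groups (the group contents are invented for the example):

    import copy

    # Scheduled hyperparameters shared by all groups; "lr" gets scaled per group.
    parameters = {"lr": 0.1, "weight_decay": 1e-4}

    # e.g. one group with a per-group scale (such as a frozen group with
    # lr_scale=0.0) and one group without; missing "lr_scale" defaults to 1.0.
    param_groups = [{"lr_scale": 0.0}, {}]

    for group in param_groups:
        lr_scale = group.get("lr_scale", 1.0)
        params = copy.deepcopy(parameters)  # keep the shared dict unmutated
        params["lr"] *= lr_scale
        group.update(params)

    print([g["lr"] for g in param_groups])  # [0.0, 0.1]

The deepcopy matters: without it, the in-place `parameters["lr"] *= lr_scale` for the first group would corrupt the shared schedule seen by every later group.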