Source code for gluoncv.model_zoo.action_recognition.slowfast

"""SlowFast, implemented in Gluon.
Code adapted from an external reference implementation (the source link is missing from this copy)."""
# pylint: disable=line-too-long,too-many-lines,missing-docstring,arguments-differ,unused-argument

# Public API of this module: the network class plus one factory per released configuration.
__all__ = ['SlowFast', 'slowfast_4x16_resnet50_kinetics400', 'slowfast_8x8_resnet50_kinetics400',
           'slowfast_4x16_resnet101_kinetics400', 'slowfast_8x8_resnet101_kinetics400',
           'slowfast_16x8_resnet101_kinetics400', 'slowfast_16x8_resnet101_50_50_kinetics400',
           'slowfast_4x16_resnet50_custom']

from mxnet import init
from mxnet.context import cpu
from mxnet.gluon.block import HybridBlock
from mxnet.gluon import nn
from mxnet.gluon.nn import BatchNorm

class Bottleneck(HybridBlock):
    """Bottleneck building block for ResNet50, ResNet101 and ResNet152.

    The block applies three convolutions (head conv -> 1x3x3 conv -> 1x1x1 conv),
    each followed by a normalization layer, and adds the (optionally
    down-sampled) input back before the final ReLU.

    Parameters
    ----------
    inplanes : int.
        Input channels of each block.
    planes : int.
        Channels of the two inner convolutions; the block outputs
        ``planes * expansion`` channels.
    strides : int, default is 1.
        Spatial stride of the middle (1x3x3) convolution.
    downsample : HybridBlock or None, default is None.
        Optional layer applied to the input to produce the residual branch.
        When None the input itself is used as the residual.
    head_conv : int, default is 1.
        Determine whether the first convolution is 1x1x1 (``1``) or 3x1x1 (``3``).
    norm_layer : object
        Normalization layer used (default: :class:`mxnet.gluon.nn.BatchNorm`)
        Can be :class:`mxnet.gluon.nn.BatchNorm` or :class:`mxnet.gluon.contrib.nn.SyncBatchNorm`.
    norm_kwargs : dict
        Additional `norm_layer` arguments, for example `num_devices=4`
        for :class:`mxnet.gluon.contrib.nn.SyncBatchNorm`.
    layer_name : str, default is ''.
        Give a name to current block.

    Raises
    ------
    ValueError
        If ``head_conv`` is neither 1 nor 3.
    """
    expansion = 4

    def __init__(self,
                 inplanes,
                 planes,
                 strides=1,
                 downsample=None,
                 head_conv=1,
                 norm_layer=BatchNorm,
                 norm_kwargs=None,
                 layer_name=''):
        super(Bottleneck, self).__init__()

        # The HybridSequential is only used to provide a name scope (prefix) for
        # the layers created below; the layers themselves are attached to `self`.
        bottleneck = nn.HybridSequential(prefix=layer_name)
        with bottleneck.name_scope():
            if head_conv == 1:
                self.conv1 = nn.Conv3D(in_channels=inplanes, channels=planes, kernel_size=1, use_bias=False)
                self.bn1 = norm_layer(in_channels=planes, **({} if norm_kwargs is None else norm_kwargs))
            elif head_conv == 3:
                # Temporal 3x1x1 convolution; padding keeps the temporal length unchanged.
                self.conv1 = nn.Conv3D(in_channels=inplanes, channels=planes, kernel_size=(3, 1, 1),
                                       padding=(1, 0, 0), use_bias=False)
                self.bn1 = norm_layer(in_channels=planes, **({} if norm_kwargs is None else norm_kwargs))
            else:
                raise ValueError("Unsupported head_conv!")
            self.conv2 = nn.Conv3D(in_channels=planes, channels=planes, kernel_size=(1, 3, 3),
                                   strides=(1, strides, strides), padding=(0, 1, 1), use_bias=False)
            self.bn2 = norm_layer(in_channels=planes, **({} if norm_kwargs is None else norm_kwargs))
            self.conv3 = nn.Conv3D(in_channels=planes, channels=planes * self.expansion, kernel_size=1,
                                   strides=1, use_bias=False)
            # The last normalization layer starts with gamma = 0.
            self.bn3 = norm_layer(in_channels=planes * self.expansion, gamma_initializer='zeros',
                                  **({} if norm_kwargs is None else norm_kwargs))
            self.relu = nn.Activation('relu')
            self.downsample = downsample

    def hybrid_forward(self, F, x):
        """Hybrid forward of a ResNet bottleneck block"""
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        # Project the input when a down-sampling layer was supplied.
        if self.downsample is not None:
            identity = self.downsample(x)

        out = F.Activation(out + identity, act_type='relu')
        return out

class SlowFast(HybridBlock):
    """SlowFast networks (SlowFast) from
    `"SlowFast Networks for Video Recognition" <https://arxiv.org/abs/1812.03982>`_ paper.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    block : a HybridBlock.
        Building block of a ResNet, could be Basic or Bottleneck.
        NOTE: this constructor does not forward `block` to the stage builders,
        which therefore always use their own default (:class:`Bottleneck`).
    layers : a list or tuple, default is None.
        Number of blocks per stage in a ResNet, e.g., [3, 4, 6, 3] in ResNet50.
        Must be provided; four entries are read.
    num_block_temp_kernel_fast : int, default is None.
        Applied to the res4 stage of the fast pathway: blocks whose index is
        at least this value use a temporal kernel of 1.
    num_block_temp_kernel_slow : int, default is None.
        Applied to the res4 stage of the slow pathway: blocks whose index is
        at least this value use a temporal kernel of 1.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
        Not used by this constructor; loading is done by the factory functions.
    pretrained_base : bool or str, optional, default is False.
        Load pretrained base network, the extra layers are randomized.
        Not used by this constructor.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    bn_eval : bool.
        Whether to set BN layers to eval mode, namely, freeze
        running stats (mean and var). Not used by this constructor.
    bn_frozen : bool.
        Whether to freeze weight and bias of BN layers. Not used by this constructor.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
        Not used by this constructor.
    frozen_stages : int.
        Stages to be frozen (all param fixed). -1 means not freezing any parameters.
        Not used by this constructor.
    dropout_ratio : float, default is 0.5.
        The dropout rate of a dropout layer.
        The larger the value, the more strength to prevent overfitting.
    init_std : float, default is 0.01.
        Standard deviation value when initialize the dense layers.
    alpha : int, default is 8.
        Corresponds to the frame rate reduction ratio between the Slow and Fast pathways.
    beta_inv : int, default is 8.
        Corresponds to the inverse of the channel reduction ratio
        between the Slow and Fast pathways.
    fusion_conv_channel_ratio : int, default is 2.
        Ratio of channel dimensions between the Slow and Fast pathways.
    fusion_kernel_size : int, default is 5.
        Kernel dimension used for fusing information from Fast pathway to Slow pathway.
    width_per_group : int, default is 64.
        Width of each group (64 -> ResNet; 4 -> ResNeXt).
    num_groups : int, default is 1.
        Number of groups for the convolution.
        Num_groups=1 is for standard ResNet like networks,
        and num_groups>1 is for ResNeXt like networks.
    slow_temporal_stride : int, default 16.
        The temporal stride for sparse sampling of video frames in slow branch of a SlowFast network.
    fast_temporal_stride : int, default 2.
        The temporal stride for sparse sampling of video frames in fast branch of a SlowFast network.
    slow_frames : int, default 4.
        The number of frames used as input to a slow branch.
    fast_frames : int, default 32.
        The number of frames used as input to a fast branch.
    norm_layer : object
        Normalization layer used (default: :class:`mxnet.gluon.nn.BatchNorm`)
        Can be :class:`mxnet.gluon.nn.BatchNorm` or :class:`mxnet.gluon.contrib.nn.SyncBatchNorm`.
    norm_kwargs : dict
        Additional `norm_layer` arguments, for example `num_devices=4`
        for :class:`mxnet.gluon.contrib.nn.SyncBatchNorm`.
    ctx : Context, default None.
        The context passed to ``initialize`` for the freshly created parameters.
    """
    def __init__(self, nclass, block=Bottleneck, layers=None, num_block_temp_kernel_fast=None,
                 num_block_temp_kernel_slow=None, pretrained=False, pretrained_base=False,
                 feat_ext=False, num_segments=1, num_crop=1, bn_eval=True, bn_frozen=False,
                 partial_bn=False, frozen_stages=-1, dropout_ratio=0.5, init_std=0.01,
                 alpha=8, beta_inv=8, fusion_conv_channel_ratio=2, fusion_kernel_size=5,
                 width_per_group=64, num_groups=1, slow_temporal_stride=16,
                 fast_temporal_stride=2, slow_frames=4, fast_frames=32,
                 norm_layer=BatchNorm, norm_kwargs=None, ctx=None, **kwargs):
        super(SlowFast, self).__init__()
        if layers is None:
            # Indexing None below would raise an opaque TypeError; keep the
            # exception type but say what is actually wrong.
            raise TypeError("`layers` must be a sequence of four stage depths, e.g. [3, 4, 6, 3]")

        self.num_segments = num_segments
        self.num_crop = num_crop
        self.dropout_ratio = dropout_ratio
        self.init_std = init_std
        self.alpha = alpha
        self.beta_inv = beta_inv
        self.fusion_conv_channel_ratio = fusion_conv_channel_ratio
        self.fusion_kernel_size = fusion_kernel_size
        self.width_per_group = width_per_group
        self.num_groups = num_groups
        self.dim_inner = self.num_groups * self.width_per_group
        self.out_dim_ratio = self.beta_inv // self.fusion_conv_channel_ratio
        self.slow_temporal_stride = slow_temporal_stride
        self.fast_temporal_stride = fast_temporal_stride
        self.slow_frames = slow_frames
        self.fast_frames = fast_frames
        self.feat_ext = feat_ext

        with self.name_scope():
            # build fast pathway
            fast = nn.HybridSequential(prefix='fast_')
            with fast.name_scope():
                self.fast_conv1 = nn.Conv3D(in_channels=3,
                                            channels=self.width_per_group // self.beta_inv,
                                            kernel_size=(5, 7, 7),
                                            strides=(1, 2, 2),
                                            padding=(2, 3, 3),
                                            use_bias=False)
                self.fast_bn1 = norm_layer(in_channels=self.width_per_group // self.beta_inv,
                                           **({} if norm_kwargs is None else norm_kwargs))
                self.fast_relu = nn.Activation('relu')
                self.fast_maxpool = nn.MaxPool3D(pool_size=(1, 3, 3), strides=(1, 2, 2), padding=(0, 1, 1))
            self.fast_res2 = self._make_layer_fast(inplanes=self.width_per_group // self.beta_inv,
                                                   planes=self.dim_inner // self.beta_inv,
                                                   num_blocks=layers[0],
                                                   head_conv=3,
                                                   norm_layer=norm_layer,
                                                   norm_kwargs=norm_kwargs,
                                                   layer_name='fast_res2_')
            self.fast_res3 = self._make_layer_fast(inplanes=self.width_per_group * 4 // self.beta_inv,
                                                   planes=self.dim_inner * 2 // self.beta_inv,
                                                   num_blocks=layers[1],
                                                   strides=2,
                                                   head_conv=3,
                                                   norm_layer=norm_layer,
                                                   norm_kwargs=norm_kwargs,
                                                   layer_name='fast_res3_')
            self.fast_res4 = self._make_layer_fast(inplanes=self.width_per_group * 8 // self.beta_inv,
                                                   planes=self.dim_inner * 4 // self.beta_inv,
                                                   num_blocks=layers[2],
                                                   num_block_temp_kernel_fast=num_block_temp_kernel_fast,
                                                   strides=2,
                                                   head_conv=3,
                                                   norm_layer=norm_layer,
                                                   norm_kwargs=norm_kwargs,
                                                   layer_name='fast_res4_')
            self.fast_res5 = self._make_layer_fast(inplanes=self.width_per_group * 16 // self.beta_inv,
                                                   planes=self.dim_inner * 8 // self.beta_inv,
                                                   num_blocks=layers[3],
                                                   strides=2,
                                                   head_conv=3,
                                                   norm_layer=norm_layer,
                                                   norm_kwargs=norm_kwargs,
                                                   layer_name='fast_res5_')

            # build lateral connections (fast -> slow), one per fusion point
            self.lateral_p1 = self._make_lateral(self.width_per_group // self.beta_inv,
                                                 norm_layer, norm_kwargs, 'lateral_p1_')
            self.lateral_res2 = self._make_lateral(self.width_per_group * 4 // self.beta_inv,
                                                   norm_layer, norm_kwargs, 'lateral_res2_')
            self.lateral_res3 = self._make_lateral(self.width_per_group * 8 // self.beta_inv,
                                                   norm_layer, norm_kwargs, 'lateral_res3_')
            self.lateral_res4 = self._make_lateral(self.width_per_group * 16 // self.beta_inv,
                                                   norm_layer, norm_kwargs, 'lateral_res4_')

            # build slow pathway
            slow = nn.HybridSequential(prefix='slow_')
            with slow.name_scope():
                self.slow_conv1 = nn.Conv3D(in_channels=3,
                                            channels=self.width_per_group,
                                            kernel_size=(1, 7, 7),
                                            strides=(1, 2, 2),
                                            padding=(0, 3, 3),
                                            use_bias=False)
                self.slow_bn1 = norm_layer(in_channels=self.width_per_group,
                                           **({} if norm_kwargs is None else norm_kwargs))
                self.slow_relu = nn.Activation('relu')
                self.slow_maxpool = nn.MaxPool3D(pool_size=(1, 3, 3), strides=(1, 2, 2), padding=(0, 1, 1))
            # Each slow stage consumes its own features concatenated with the
            # lateral features, hence the "+ ... // out_dim_ratio" input widths.
            self.slow_res2 = self._make_layer_slow(
                inplanes=self.width_per_group + self.width_per_group // self.out_dim_ratio,
                planes=self.dim_inner,
                num_blocks=layers[0],
                head_conv=1,
                norm_layer=norm_layer,
                norm_kwargs=norm_kwargs,
                layer_name='slow_res2_')
            self.slow_res3 = self._make_layer_slow(
                inplanes=self.width_per_group * 4 + self.width_per_group * 4 // self.out_dim_ratio,
                planes=self.dim_inner * 2,
                num_blocks=layers[1],
                strides=2,
                head_conv=1,
                norm_layer=norm_layer,
                norm_kwargs=norm_kwargs,
                layer_name='slow_res3_')
            self.slow_res4 = self._make_layer_slow(
                inplanes=self.width_per_group * 8 + self.width_per_group * 8 // self.out_dim_ratio,
                planes=self.dim_inner * 4,
                num_blocks=layers[2],
                num_block_temp_kernel_slow=num_block_temp_kernel_slow,
                strides=2,
                head_conv=3,
                norm_layer=norm_layer,
                norm_kwargs=norm_kwargs,
                layer_name='slow_res4_')
            self.slow_res5 = self._make_layer_slow(
                inplanes=self.width_per_group * 16 + self.width_per_group * 16 // self.out_dim_ratio,
                planes=self.dim_inner * 8,
                num_blocks=layers[3],
                strides=2,
                head_conv=3,
                norm_layer=norm_layer,
                norm_kwargs=norm_kwargs,
                layer_name='slow_res5_')

            # build classifier
            self.avg = nn.GlobalAvgPool3D()
            self.dp = nn.Dropout(rate=self.dropout_ratio)
            self.feat_dim = self.width_per_group * 32 // self.beta_inv + self.width_per_group * 32
            self.fc = nn.Dense(in_units=self.feat_dim,
                               units=nclass,
                               weight_initializer=init.Normal(sigma=self.init_std),
                               use_bias=True)

            self.initialize(init.MSRAPrelu(), ctx=ctx)

    def hybrid_forward(self, F, x):
        """Hybrid forward of SlowFast network"""
        # The input packs both clips along axis 2: the first `fast_frames`
        # frames feed the fast pathway, the next `slow_frames` the slow pathway.
        fast_input = F.slice(x, begin=(None, None, 0, None, None),
                             end=(None, None, self.fast_frames, None, None))
        slow_input = F.slice(x, begin=(None, None, self.fast_frames, None, None),
                             end=(None, None, self.fast_frames + self.slow_frames, None, None))

        fast, lateral = self.FastPath(F, fast_input)
        slow = self.SlowPath(F, slow_input, lateral)
        x = F.concat(slow, fast, dim=1)                 # bx2304

        # segmental consensus
        x = F.reshape(x, shape=(-1, self.num_segments * self.num_crop, self.feat_dim))
        x = F.mean(x, axis=1)

        if self.feat_ext:
            return x

        x = self.dp(x)
        x = self.fc(x)                                  # bxnclass
        return x

    def SlowPath(self, F, x, lateral):
        """Hybrid forward of the slow branch"""
        x = self.slow_conv1(x)                          # bx64x4x112x112, input is bx3x4x224x224
        x = self.slow_bn1(x)
        x = self.slow_relu(x)
        pool1 = self.slow_maxpool(x)                    # bx64x4x56x56
        pool1_lat = F.concat(pool1, lateral[0], dim=1)  # bx80x4x56x56

        res2 = self.slow_res2(pool1_lat)                # bx256x4x56x56
        res2_lat = F.concat(res2, lateral[1], dim=1)    # bx320x4x56x56

        res3 = self.slow_res3(res2_lat)                 # bx512x4x28x28
        res3_lat = F.concat(res3, lateral[2], dim=1)    # bx640x4x28x28

        res4 = self.slow_res4(res3_lat)                 # bx1024x4x14x14
        res4_lat = F.concat(res4, lateral[3], dim=1)    # bx1280x4x14x14

        res5 = self.slow_res5(res4_lat)                 # bx2048x4x7x7
        out = self.avg(res5)                            # bx2048x1x1x1
        out = F.squeeze(out, axis=(2, 3, 4))            # bx2048
        return out

    def FastPath(self, F, x):
        """Hybrid forward of the fast branch"""
        lateral = []
        x = self.fast_conv1(x)                          # bx8x32x112x112, input is bx3x32x224x224
        x = self.fast_bn1(x)
        x = self.fast_relu(x)
        pool1 = self.fast_maxpool(x)                    # bx8x32x56x56
        lateral_p = self.lateral_p1(pool1)              # bx16x4x56x56
        lateral.append(lateral_p)

        res2 = self.fast_res2(pool1)                    # bx32x32x56x56
        lateral_res2 = self.lateral_res2(res2)          # bx64x4x56x56
        lateral.append(lateral_res2)

        res3 = self.fast_res3(res2)                     # bx64x32x28x28
        lateral_res3 = self.lateral_res3(res3)          # bx128x4x28x28
        lateral.append(lateral_res3)

        res4 = self.fast_res4(res3)                     # bx128x32x14x14
        lateral_res4 = self.lateral_res4(res4)          # bx256x4x14x14
        lateral.append(lateral_res4)

        res5 = self.fast_res5(res4)                     # bx256x32x7x7
        out = self.avg(res5)                            # bx256x1x1x1
        out = F.squeeze(out, axis=(2, 3, 4))            # bx256
        return out, lateral

    def _make_lateral(self, in_channels, norm_layer, norm_kwargs, prefix):
        """Build one fast-to-slow lateral connection (temporal conv -> norm -> ReLU)."""
        out_channels = in_channels * self.fusion_conv_channel_ratio
        lateral = nn.HybridSequential(prefix=prefix)
        with lateral.name_scope():
            # Temporal stride `alpha` reduces the fast pathway's frame count.
            lateral.add(nn.Conv3D(in_channels=in_channels,
                                  channels=out_channels,
                                  kernel_size=(self.fusion_kernel_size, 1, 1),
                                  strides=(self.alpha, 1, 1),
                                  padding=(self.fusion_kernel_size // 2, 0, 0),
                                  use_bias=False))
            lateral.add(norm_layer(in_channels=out_channels,
                                   **({} if norm_kwargs is None else norm_kwargs)))
            lateral.add(nn.Activation('relu'))
        return lateral

    def _make_stage(self, inplanes, planes, num_blocks, num_block_temp_kernel, block,
                    strides, head_conv, norm_layer, norm_kwargs, layer_name):
        """Build one residual stage; shared implementation for both pathways."""
        downsample = None
        # A projection shortcut is needed whenever the first block changes the
        # spatial resolution or the channel count.
        if strides != 1 or inplanes != planes * block.expansion:
            downsample = nn.HybridSequential(prefix=layer_name + 'downsample_')
            with downsample.name_scope():
                downsample.add(nn.Conv3D(in_channels=inplanes,
                                         channels=planes * block.expansion,
                                         kernel_size=1,
                                         strides=(1, strides, strides),
                                         use_bias=False))
                downsample.add(norm_layer(in_channels=planes * block.expansion,
                                          **({} if norm_kwargs is None else norm_kwargs)))

        layers = nn.HybridSequential(prefix=layer_name)
        with layers.name_scope():
            # Only the first block carries the stride and the projection shortcut.
            layers.add(block(inplanes=inplanes,
                             planes=planes,
                             strides=strides,
                             downsample=downsample,
                             head_conv=head_conv,
                             norm_layer=norm_layer,
                             norm_kwargs=norm_kwargs,
                             layer_name='block%d_' % 0))
            inplanes = planes * block.expansion
            for idx in range(1, num_blocks):
                # Blocks at or beyond `num_block_temp_kernel` fall back to a
                # temporal kernel of 1; with no limit every block keeps `head_conv`.
                keep_head_conv = num_block_temp_kernel is None or idx < num_block_temp_kernel
                layers.add(block(inplanes=inplanes,
                                 planes=planes,
                                 head_conv=head_conv if keep_head_conv else 1,
                                 norm_layer=norm_layer,
                                 norm_kwargs=norm_kwargs,
                                 layer_name='block%d_' % idx))
        return layers

    def _make_layer_fast(self, inplanes, planes, num_blocks, num_block_temp_kernel_fast=None,
                         block=Bottleneck, strides=1, head_conv=1, norm_layer=BatchNorm,
                         norm_kwargs=None, layer_name=''):
        """Build each stage of within the fast branch."""
        return self._make_stage(inplanes, planes, num_blocks, num_block_temp_kernel_fast, block,
                                strides, head_conv, norm_layer, norm_kwargs, layer_name)

    def _make_layer_slow(self, inplanes, planes, num_blocks, num_block_temp_kernel_slow=None,
                         block=Bottleneck, strides=1, head_conv=1, norm_layer=BatchNorm,
                         norm_kwargs=None, layer_name=''):
        """Build each stage of within the slow branch."""
        return self._make_stage(inplanes, planes, num_blocks, num_block_temp_kernel_slow, block,
                                strides, head_conv, norm_layer, norm_kwargs, layer_name)
def slowfast_4x16_resnet50_kinetics400(nclass=400, pretrained=False, pretrained_base=True,
                                       use_tsn=False, num_segments=1, num_crop=1,
                                       partial_bn=False, feat_ext=False,
                                       root='~/.mxnet/models', ctx=cpu(), **kwargs):
    r"""SlowFast 4x16 networks (SlowFast) with ResNet50 backbone trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    use_tsn : bool, default False.
        Not referenced by this function.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """
    model = SlowFast(nclass=nclass,
                     layers=[3, 4, 6, 3],
                     pretrained=pretrained,
                     pretrained_base=pretrained_base,
                     feat_ext=feat_ext,
                     num_segments=num_segments,
                     num_crop=num_crop,
                     partial_bn=partial_bn,
                     alpha=8,
                     beta_inv=8,
                     fusion_conv_channel_ratio=2,
                     fusion_kernel_size=5,
                     width_per_group=64,
                     num_groups=1,
                     slow_temporal_stride=16,
                     fast_temporal_stride=2,
                     slow_frames=4,
                     fast_frames=32,
                     ctx=ctx,
                     **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('slowfast_4x16_resnet50_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        # Attach the Kinetics400 class names to the pretrained model.
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model
def slowfast_8x8_resnet50_kinetics400(nclass=400, pretrained=False, pretrained_base=True,
                                      use_tsn=False, num_segments=1, num_crop=1,
                                      partial_bn=False, feat_ext=False,
                                      root='~/.mxnet/models', ctx=cpu(), **kwargs):
    r"""SlowFast 8x8 networks (SlowFast) with ResNet50 backbone trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    use_tsn : bool, default False.
        Not referenced by this function.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """
    model = SlowFast(nclass=nclass,
                     layers=[3, 4, 6, 3],
                     pretrained=pretrained,
                     pretrained_base=pretrained_base,
                     feat_ext=feat_ext,
                     num_segments=num_segments,
                     num_crop=num_crop,
                     partial_bn=partial_bn,
                     alpha=4,
                     beta_inv=8,
                     fusion_conv_channel_ratio=2,
                     fusion_kernel_size=7,
                     width_per_group=64,
                     num_groups=1,
                     slow_temporal_stride=8,
                     fast_temporal_stride=2,
                     slow_frames=8,
                     fast_frames=32,
                     ctx=ctx,
                     **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('slowfast_8x8_resnet50_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        # Attach the Kinetics400 class names to the pretrained model.
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model
def slowfast_4x16_resnet101_kinetics400(nclass=400, pretrained=False, pretrained_base=True,
                                        use_tsn=False, num_segments=1, num_crop=1,
                                        partial_bn=False, feat_ext=False,
                                        root='~/.mxnet/models', ctx=cpu(), **kwargs):
    r"""SlowFast 4x16 networks (SlowFast) with ResNet101 backbone trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    use_tsn : bool, default False.
        Not referenced by this function.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """
    model = SlowFast(nclass=nclass,
                     layers=[3, 4, 23, 3],
                     pretrained=pretrained,
                     pretrained_base=pretrained_base,
                     feat_ext=feat_ext,
                     num_segments=num_segments,
                     num_crop=num_crop,
                     partial_bn=partial_bn,
                     alpha=8,
                     beta_inv=8,
                     fusion_conv_channel_ratio=2,
                     fusion_kernel_size=5,
                     width_per_group=64,
                     num_groups=1,
                     slow_temporal_stride=16,
                     fast_temporal_stride=2,
                     slow_frames=4,
                     fast_frames=32,
                     ctx=ctx,
                     **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('slowfast_4x16_resnet101_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        # Attach the Kinetics400 class names to the pretrained model.
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model
def slowfast_8x8_resnet101_kinetics400(nclass=400, pretrained=False, pretrained_base=True,
                                       use_tsn=False, num_segments=1, num_crop=1,
                                       partial_bn=False, feat_ext=False,
                                       root='~/.mxnet/models', ctx=cpu(), **kwargs):
    r"""SlowFast 8x8 networks (SlowFast) with ResNet101 backbone trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    use_tsn : bool, default False.
        Not referenced by this function.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """
    model = SlowFast(nclass=nclass,
                     layers=[3, 4, 23, 3],
                     pretrained=pretrained,
                     pretrained_base=pretrained_base,
                     feat_ext=feat_ext,
                     num_segments=num_segments,
                     num_crop=num_crop,
                     partial_bn=partial_bn,
                     alpha=4,
                     beta_inv=8,
                     fusion_conv_channel_ratio=2,
                     fusion_kernel_size=5,
                     width_per_group=64,
                     num_groups=1,
                     slow_temporal_stride=8,
                     fast_temporal_stride=2,
                     slow_frames=8,
                     fast_frames=32,
                     ctx=ctx,
                     **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('slowfast_8x8_resnet101_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        # Attach the Kinetics400 class names to the pretrained model.
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model
def slowfast_16x8_resnet101_kinetics400(nclass=400, pretrained=False, pretrained_base=True,
                                        use_tsn=False, num_segments=1, num_crop=1,
                                        partial_bn=False, feat_ext=False,
                                        root='~/.mxnet/models', ctx=cpu(), **kwargs):
    r"""SlowFast 16x8 networks (SlowFast) with ResNet101 backbone trained on Kinetics400 dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    use_tsn : bool, default False.
        Not referenced by this function.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """
    model = SlowFast(nclass=nclass,
                     layers=[3, 4, 23, 3],
                     pretrained=pretrained,
                     pretrained_base=pretrained_base,
                     feat_ext=feat_ext,
                     num_segments=num_segments,
                     num_crop=num_crop,
                     partial_bn=partial_bn,
                     alpha=4,
                     beta_inv=8,
                     fusion_conv_channel_ratio=2,
                     fusion_kernel_size=5,
                     width_per_group=64,
                     num_groups=1,
                     slow_temporal_stride=8,
                     fast_temporal_stride=2,
                     slow_frames=16,
                     fast_frames=64,
                     ctx=ctx,
                     **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('slowfast_16x8_resnet101_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        # Attach the Kinetics400 class names to the pretrained model.
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model
def slowfast_16x8_resnet101_50_50_kinetics400(nclass=400, pretrained=False, pretrained_base=True,
                                              use_tsn=False, num_segments=1, num_crop=1,
                                              partial_bn=False, feat_ext=False,
                                              root='~/.mxnet/models', ctx=cpu(), **kwargs):
    r"""SlowFast 16x8 networks (SlowFast) with ResNet101 backbone trained on Kinetics400 dataset,
    but the temporal head is initialized with ResNet50 structure (3, 4, 6, 3).

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    use_tsn : bool, default False.
        Not referenced by this function.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    """
    # The temporal-kernel limits of 6 are the only difference from the plain
    # 16x8 ResNet101 configuration.
    model = SlowFast(nclass=nclass,
                     layers=[3, 4, 23, 3],
                     num_block_temp_kernel_fast=6,
                     num_block_temp_kernel_slow=6,
                     pretrained=pretrained,
                     pretrained_base=pretrained_base,
                     feat_ext=feat_ext,
                     num_segments=num_segments,
                     num_crop=num_crop,
                     partial_bn=partial_bn,
                     alpha=4,
                     beta_inv=8,
                     fusion_conv_channel_ratio=2,
                     fusion_kernel_size=5,
                     width_per_group=64,
                     num_groups=1,
                     slow_temporal_stride=8,
                     fast_temporal_stride=2,
                     slow_frames=16,
                     fast_frames=64,
                     ctx=ctx,
                     **kwargs)

    if pretrained:
        from ..model_store import get_model_file
        model.load_parameters(get_model_file('slowfast_16x8_resnet101_50_50_kinetics400',
                                             tag=pretrained, root=root), ctx=ctx)
        # Attach the Kinetics400 class names to the pretrained model.
        from ...data import Kinetics400Attr
        attrib = Kinetics400Attr()
        model.classes = attrib.classes
    model.collect_params().reset_ctx(ctx)

    return model
def slowfast_4x16_resnet50_custom(nclass=400, pretrained=False, pretrained_base=True,
                                  use_tsn=False, num_segments=1, num_crop=1,
                                  partial_bn=False, feat_ext=False, use_kinetics_pretrain=True,
                                  root='~/.mxnet/models', ctx=cpu(), **kwargs):
    r"""SlowFast 4x16 networks (SlowFast) with ResNet50 backbone. Customized for users's own dataset.

    Parameters
    ----------
    nclass : int.
        Number of categories in the dataset.
    pretrained : bool or str.
        Boolean value controls whether to load the default pretrained weights for model.
        String value represents the hashtag for a certain version of pretrained weights.
        In this function a truthy value only disables the Kinetics-400 weight
        transfer; no weights are loaded for it here.
    pretrained_base : bool or str, optional, default is True.
        Load pretrained base network, the extra layers are randomized. Note that
        if pretrained is `True`, this has no effect.
    use_tsn : bool, default False.
        Not referenced by this function.
    ctx : Context, default CPU.
        The context in which to load the pretrained weights.
    root : str, default $MXNET_HOME/models
        Location for keeping the model parameters.
    num_segments : int, default is 1.
        Number of segments used to evenly divide a video.
    num_crop : int, default is 1.
        Number of crops used during evaluation, choices are 1, 3 or 10.
    partial_bn : bool, default False.
        Freeze all batch normalization layers during training except the first layer.
    feat_ext : bool.
        Whether to extract features before dense classification layer or
        do a complete forward pass.
    use_kinetics_pretrain : bool.
        Whether to load Kinetics-400 pre-trained model weights.
    """
    model = SlowFast(nclass=nclass,
                     layers=[3, 4, 6, 3],
                     pretrained=pretrained,
                     pretrained_base=pretrained_base,
                     feat_ext=feat_ext,
                     num_segments=num_segments,
                     num_crop=num_crop,
                     partial_bn=partial_bn,
                     alpha=8,
                     beta_inv=8,
                     fusion_conv_channel_ratio=2,
                     fusion_kernel_size=5,
                     width_per_group=64,
                     num_groups=1,
                     slow_temporal_stride=16,
                     fast_temporal_stride=2,
                     slow_frames=4,
                     fast_frames=32,
                     ctx=ctx,
                     **kwargs)

    if use_kinetics_pretrain and not pretrained:
        from gluoncv.model_zoo import get_model
        # Honour the caller's `root` so the Kinetics-400 weights are looked up
        # in (and downloaded to) the requested directory.
        kinetics_model = get_model('slowfast_4x16_resnet50_kinetics400', nclass=400,
                                   pretrained=True, root=root)
        source_params = kinetics_model.collect_params()
        target_params = model.collect_params()
        assert len(source_params.keys()) == len(target_params.keys())

        # Parameters are matched by position, not by name.
        pretrained_weights = [source_params[layer_name].data()
                              for layer_name in source_params.keys()]

        num_source = len(source_params.keys())
        for i, layer_name in enumerate(target_params.keys()):
            if i + 2 == num_source:
                # skip the last dense layer
                break
            target_params[layer_name].set_data(pretrained_weights[i])
    model.collect_params().reset_ctx(ctx)

    return model