# Source code for gluoncv.nn.coder
# pylint: disable=arguments-differ, missing-docstring
"""Encoder and Decoder functions.
Encoders are used during training, which assign training targets.
Decoders are used during testing/validation, which convert predictions back to
normal boxes, etc.
"""
from __future__ import absolute_import
import numpy as np
from mxnet import gluon
from mxnet import nd
from .bbox import BBoxCornerToCenter, NumPyBBoxCornerToCenter
try:
import cython_bbox
except ImportError:
cython_bbox = None
class NumPyNormalizedBoxCenterEncoder(object):
    """Encode bounding boxes training target with normalized center offsets using numpy.

    Input bounding boxes are using corner type: `x_{min}, y_{min}, x_{max}, y_{max}`.

    Parameters
    ----------
    stds : array-like of size 4
        Std value to be divided from encoded values, default is (0.1, 0.1, 0.2, 0.2).
    means : array-like of size 4
        Mean value to be subtracted from encoded values, default is (0., 0., 0., 0.).

    """
    def __init__(self, stds=(0.1, 0.1, 0.2, 0.2), means=(0., 0., 0., 0.)):
        super(NumPyNormalizedBoxCenterEncoder, self).__init__()
        assert len(stds) == 4, "Box Encoder requires 4 std values."
        assert len(means) == 4, "Box Encoder requires 4 mean values."
        self._stds = stds
        self._means = means
        self.corner_to_center = NumPyBBoxCornerToCenter(split=True)

    def __call__(self, samples, matches, anchors, refs):
        """Encode the matched reference boxes relative to their anchors.

        Uses `cython_bbox.np_normalized_box_encoder` when the optional
        `cython_bbox` module was imported, otherwise a pure numpy fallback.

        Parameters
        ----------
        samples: (B, N) value +1 (positive), -1 (negative), 0 (ignore)
        matches: (B, N) value range [0, M), integer indices into `refs`
        anchors: (B, N, 4) encoded in corner
        refs: (B, M, 4) encoded in corner

        Returns
        -------
        targets: (B, N, 4) transform anchors to refs picked according to matches
        masks: (B, N, 4) only positive anchors has targets

        """
        if cython_bbox is not None:
            return cython_bbox.np_normalized_box_encoder(samples, matches, anchors, refs,
                                                         np.array(self._means, dtype=np.float32),
                                                         np.array(self._stds, dtype=np.float32))
        # refs [B, M, 4], matches [B, N] -> matched refs [B, N, 4].
        # Pair every row of `matches` with its own batch entry; indexing with
        # `[:, range(N), matches, :]` would broadcast across batches and give
        # a (B, B, N, 4) result whenever B > 1.
        batch_index = np.arange(refs.shape[0]).reshape((-1, 1))
        ref_boxes = refs[batch_index, matches]
        # g [B, N, 4], a [B, N, 4] -> encoded [B, N, 4]
        g = self.corner_to_center(ref_boxes)
        a = self.corner_to_center(anchors)
        t0 = ((g[0] - a[0]) / a[2] - self._means[0]) / self._stds[0]
        t1 = ((g[1] - a[1]) / a[3] - self._means[1]) / self._stds[1]
        t2 = (np.log(g[2] / a[2]) - self._means[2]) / self._stds[2]
        t3 = (np.log(g[3] / a[3]) - self._means[3]) / self._stds[3]
        encoded = np.concatenate((t0, t1, t2, t3), axis=2)
        # samples [B, N] -> [B, N, 1] -> [B, N, 4] -> boolean (True for positives)
        positive = np.tile(samples.reshape((samples.shape[0], -1, 1)), reps=(1, 1, 4)) > 0.5
        # fill targets and masks [B, N, 4]
        targets = np.where(positive, encoded, 0.0)
        masks = np.where(positive, 1.0, 0.0)
        return targets, masks
class NormalizedBoxCenterEncoder(gluon.HybridBlock):
    """Encode bounding boxes training target with normalized center offsets.

    Input bounding boxes are using corner type: `x_{min}, y_{min}, x_{max}, y_{max}`.

    Parameters
    ----------
    stds : array-like of size 4
        Std value to be divided from encoded values, default is (0.1, 0.1, 0.2, 0.2).
    means : array-like of size 4
        Mean value to be subtracted from encoded values, default is (0., 0., 0., 0.).

    """
    def __init__(self, stds=(0.1, 0.1, 0.2, 0.2), means=(0., 0., 0., 0.), **kwargs):
        super(NormalizedBoxCenterEncoder, self).__init__(**kwargs)
        assert len(stds) == 4, "Box Encoder requires 4 std values."
        assert len(means) == 4, "Box Encoder requires 4 mean values."
        self._means = means
        self._stds = stds
        with self.name_scope():
            self.corner_to_center = BBoxCornerToCenter(split=True)

    # pylint: disable=arguments-differ
    def hybrid_forward(self, F, samples, matches, anchors, refs):
        """Encode the matched reference boxes relative to their anchors.

        Parameters
        ----------
        samples: (B, N) value +1 (positive), -1 (negative), 0 (ignore)
        matches: (B, N) value range [0, M)
        anchors: (B, N, 4) encoded in corner
        refs: (B, M, 4) encoded in corner

        Returns
        -------
        targets: (B, N, 4) transform anchors to refs picked according to matches
        masks: (B, N, 4) only positive anchors has targets

        """
        # TODO(zhreshold): batch_pick, take multiple elements?
        # refs [B, M, 4], anchors [B, N, 4], samples [B, N], matches [B, N]
        # refs [B, M, 4] -> reshape [B, 1, M, 4] -> repeat [B, N, M, 4]
        ref_boxes = F.broadcast_like(refs.reshape((0, 1, -1, 4)), matches, lhs_axes=1, rhs_axes=1)
        # refs [B, N, M, 4] -> 4 * [B, N, M]
        ref_boxes = F.split(ref_boxes, axis=-1, num_outputs=4, squeeze_axis=True)
        # refs 4 * [B, N, M] -> pick from matches [B, N, 1] -> concat to [B, N, 4]
        picked = [F.pick(ref_boxes[i], matches, axis=2).reshape((0, -1, 1)) for i in range(4)]
        ref_boxes = F.concat(*picked, dim=2)
        # transform based on x, y, w, h
        # g [B, N, 4], a [B, N, 4] -> codecs [B, N, 4]
        g = self.corner_to_center(ref_boxes)
        a = self.corner_to_center(anchors)
        t0 = ((g[0] - a[0]) / a[2] - self._means[0]) / self._stds[0]
        t1 = ((g[1] - a[1]) / a[3] - self._means[1]) / self._stds[1]
        t2 = (F.log(g[2] / a[2]) - self._means[2]) / self._stds[2]
        t3 = (F.log(g[3] / a[3]) - self._means[3]) / self._stds[3]
        codecs = F.concat(t0, t1, t2, t3, dim=2)
        # samples [B, N] -> [B, N, 1] -> [B, N, 4] -> boolean
        temp = F.tile(samples.reshape((0, -1, 1)), reps=(1, 1, 4)) > 0.5
        # fill targets and masks [B, N, 4]
        targets = F.where(temp, codecs, F.zeros_like(codecs))
        masks = F.where(temp, F.ones_like(temp), F.zeros_like(temp))
        return targets, masks
class NormalizedPerClassBoxCenterEncoder(gluon.HybridBlock):
    """Encode bounding boxes training target with normalized center offsets.

    Input bounding boxes are using corner type: `x_{min}, y_{min}, x_{max}, y_{max}`.

    Parameters
    ----------
    num_class : int
        Number of classes; must be positive. One target/mask entry is produced per class.
    max_pos : int, default is 128
        Upper bound of Number of positive samples.
    per_device_batch_size : int, default is 1
        Per device batch size
    stds : array-like of size 4
        Std value to be divided from encoded values, default is (0.1, 0.1, 0.2, 0.2).
    means : array-like of size 4
        Mean value to be subtracted from encoded values, default is (0., 0., 0., 0.).

    """
    def __init__(self, num_class, max_pos=128, per_device_batch_size=1, stds=(0.1, 0.1, 0.2, 0.2),
                 means=(0., 0., 0., 0.)):
        super(NormalizedPerClassBoxCenterEncoder, self).__init__()
        assert len(stds) == 4, "Box Encoder requires 4 std values."
        assert num_class > 0, "Number of classes must be positive"
        self._num_class = num_class
        self._max_pos = max_pos
        self._batch_size = per_device_batch_size
        with self.name_scope():
            self.class_agnostic_encoder = NormalizedBoxCenterEncoder(stds=stds, means=means)
            # constants are only registered when the fused operator is available
            if 'box_encode' in nd.contrib.__dict__:
                self.means = self.params.get_constant('means', means)
                self.stds = self.params.get_constant('stds', stds)

    def hybrid_forward(self, F, samples, matches, anchors, labels, refs, means=None, stds=None):
        """Encode BBox One entry per category

        Parameters
        ----------
        samples: (B, N) value +1 (positive), -1 (negative), 0 (ignore)
        matches: (B, N) value range [0, M)
        anchors: (B, N, 4) encoded in corner
        labels: (B, M) value range [0, self._num_class), excluding background
        refs: (B, M, 4) encoded in corner

        Returns
        -------
        targets: (B, N_pos, C, 4) transform anchors to refs picked according to matches
        masks: (B, N_pos, C, 4) only positive anchors of the correct class has targets
        indices : (B, N_pos) positive sample indices

        """
        # class-agnostic encoding: targets [B, N, 4], masks [B, N, 4]
        if 'box_encode' in F.contrib.__dict__:
            targets, masks = F.contrib.box_encode(samples, matches, anchors, refs, means, stds)
        else:
            targets, masks = self.class_agnostic_encoder(samples, matches, anchors, refs)

        # labels [B, M] -> [B, N, M] -> pick by matches -> [B, N, 1]
        tiled_labels = F.broadcast_like(labels.reshape((0, 1, -1)), matches,
                                        lhs_axes=1, rhs_axes=1)
        matched_labels = F.pick(tiled_labels, matches, axis=2).reshape((0, -1)).expand_dims(2)
        # one-hot style comparison against every class id -> [B, N, C]
        class_ids = F.reshape(F.arange(self._num_class), shape=(1, 1, -1))
        same_cids = F.broadcast_equal(matched_labels, class_ids)

        # sort anchors by their mask value (descending) and keep the first max_pos
        first_mask_column = F.slice_axis(masks, axis=-1, begin=0, end=1)
        order = F.argsort(first_mask_column, axis=1, is_ascend=False)
        indices = F.slice_axis(F.reshape(order, (self._batch_size, -1)),
                               axis=1, begin=0, end=self._max_pos)

        def batch_item(array, b):
            # select batch entry `b` and drop the leading batch axis
            return F.slice_axis(array, axis=0, begin=b, end=b + 1).squeeze(axis=0)

        picked_targets, picked_masks, picked_cids = [], [], []
        for b in range(self._batch_size):
            keep = batch_item(indices, b)
            target_b = batch_item(targets, b)
            mask_b = batch_item(masks, b)
            cid_b = batch_item(same_cids, b)
            picked_targets.append(F.take(target_b, keep).expand_dims(axis=0))
            picked_masks.append(F.take(mask_b, keep).expand_dims(axis=0))
            picked_cids.append(F.take(cid_b, keep).expand_dims(axis=0))
        targets = F.concat(*picked_targets, dim=0)
        masks = F.concat(*picked_masks, dim=0)
        same_cids = F.concat(*picked_cids, dim=0).expand_dims(3)

        # targets, masks [B, N_pos, C, 4]
        all_targets = F.broadcast_axes(targets.expand_dims(2), axis=2, size=self._num_class)
        class_mask = F.broadcast_axes(same_cids, axis=3, size=4)
        all_masks = F.broadcast_mul(masks.expand_dims(2), class_mask)
        return all_targets, all_masks, indices
class NormalizedBoxCenterDecoder(gluon.HybridBlock):
    """Decode bounding boxes training target with normalized center offsets.

    This decoder must cooperate with NormalizedBoxCenterEncoder of same `stds`
    in order to get properly reconstructed bounding boxes.

    Returned bounding boxes are using corner type: `x_{min}, y_{min}, x_{max}, y_{max}`.

    Parameters
    ----------
    stds : array-like of size 4
        Std value to be divided from encoded values, default is (0.1, 0.1, 0.2, 0.2).
    convert_anchor : boolean, default is False
        Whether to convert anchor from corner to center format.
    clip : float, default is None
        If given (and larger than 0), bounding box target will be clipped to this value.
    minimal_opset : bool
        We sometimes add special operators to accelerate training/inference, however, for exporting
        to third party compilers we want to utilize most widely used operators.
        If `minimal_opset` is `True`, the network will use a minimal set of operators good
        for e.g., `TVM`.

    """
    def __init__(self, stds=(0.1, 0.1, 0.2, 0.2), convert_anchor=False, clip=None,
                 minimal_opset=False):
        super(NormalizedBoxCenterDecoder, self).__init__()
        assert len(stds) == 4, "Box Encoder requires 4 std values."
        self._stds = stds
        self._clip = clip
        if convert_anchor:
            self.corner_to_center = BBoxCornerToCenter(split=True)
        else:
            self.corner_to_center = None
        self._format = 'corner' if convert_anchor else 'center'
        self._minimal_opset = minimal_opset

    def hybrid_forward(self, F, x, anchors):
        """Decode box predictions `x` against `anchors` into corner boxes.

        Uses the fused `F.contrib.box_decode` operator when it exists and
        `minimal_opset` is False; otherwise decodes with basic operators.
        """
        if not self._minimal_opset and 'box_decode' in F.contrib.__dict__:
            x, anchors = F.amp_multicast(x, anchors, num_outputs=2, cast_narrow=True)
            # match the signature of c++ operator: -1 stands for "no clipping".
            # Kept in a local so that a forward pass never rewrites self._clip.
            clip = -1 if self._clip is None else self._clip
            return F.contrib.box_decode(x, anchors, self._stds[0], self._stds[1], self._stds[2],
                                        self._stds[3], clip=clip, format=self._format)
        if self.corner_to_center is not None:
            a = self.corner_to_center(anchors)
        else:
            a = anchors.split(axis=-1, num_outputs=4)
        p = F.split(x, axis=-1, num_outputs=4)
        ox = F.broadcast_add(F.broadcast_mul(p[0] * self._stds[0], a[2]), a[0])
        oy = F.broadcast_add(F.broadcast_mul(p[1] * self._stds[1], a[3]), a[1])
        dw = p[2] * self._stds[2]
        dh = p[3] * self._stds[3]
        # only a positive clip value enables clipping, same rule as the fused operator
        if self._clip is not None and self._clip > 0:
            dw = F.minimum(dw, self._clip)
            dh = F.minimum(dh, self._clip)
        dw = F.exp(dw)
        dh = F.exp(dh)
        ow = F.broadcast_mul(dw, a[2]) * 0.5
        oh = F.broadcast_mul(dh, a[3]) * 0.5
        return F.concat(ox - ow, oy - oh, ox + ow, oy + oh, dim=-1)
class MultiClassEncoder(gluon.HybridBlock):
    """Encode classification training target given matching results.

    This encoder will assign training target of matched bounding boxes to
    ground-truth label + 1 and negative samples with label 0.
    Ignored samples will be assigned with `ignore_label`, whose default is -1.

    Parameters
    ----------
    ignore_label : float
        Assigned to un-matched samples, they are neither positive or negative during
        training, and should be excluded in loss function. Default is -1.

    """
    def __init__(self, ignore_label=-1):
        super(MultiClassEncoder, self).__init__()
        self._ignore_label = ignore_label

    def hybrid_forward(self, F, samples, matches, refs):
        """HybridBlock, handle multi batch correctly

        Parameters
        ----------
        samples: (B, N), value +1 (positive), -1 (negative), 0 (ignore)
        matches: (B, N), value range [0, M)
        refs: (B, M), value range [0, num_fg_class), excluding background

        Returns
        -------
        targets: (B, N), value range [0, num_fg_class + 1), including background

        """
        # refs (B, M) -> (B, 1, M) -> tiled along the anchor axis -> (B, N, M)
        tiled_refs = F.broadcast_like(F.reshape(refs, (0, 1, -1)), matches,
                                      lhs_axes=1, rhs_axes=1)
        # matched label per anchor, shifted by one so 0 is left for background
        matched_ids = F.pick(tiled_refs, matches, axis=2) + 1
        ignored = F.ones_like(matched_ids) * self._ignore_label
        # positives keep their label, everything else starts out as ignore_label
        encoded = F.where(samples > 0.5, matched_ids, ignored)
        # negatives (samples == -1) become background (0)
        return F.where(samples < -0.5, F.zeros_like(encoded), encoded)
class MultiClassDecoder(gluon.HybridBlock):
    """Decode classification results.

    This decoder must work with `MultiClassEncoder` to reconstruct valid labels.
    The decoder expect results are after logits, e.g. Softmax.

    Parameters
    ----------
    axis : int
        Axis of class-wise results.
    thresh : float
        Confidence threshold for the post-softmax scores.
        Scores less than `thresh` are marked with `0`, corresponding `cls_id` is
        marked with invalid class id `-1`.

    """
    def __init__(self, axis=-1, thresh=0.01):
        super(MultiClassDecoder, self).__init__()
        self._axis = axis
        self._thresh = thresh

    def hybrid_forward(self, F, x):
        """Return the best foreground class id and its score for every position.

        Parameters
        ----------
        x : class scores whose `axis` dimension holds background (index 0)
            followed by the foreground classes.

        Returns
        -------
        cls_id : best foreground class id, -1 where the score is not above `thresh`
        scores : score of that class, 0 where the score is not above `thresh`

        """
        # drop the background entry, keep foreground scores only
        pos_x = x.slice_axis(axis=self._axis, begin=1, end=None)
        cls_id = F.argmax(pos_x, self._axis)
        # pick along the same axis argmax reduced over, not a hard-coded last axis
        scores = F.pick(pos_x, cls_id, axis=self._axis)
        mask = scores > self._thresh
        cls_id = F.where(mask, cls_id, F.ones_like(cls_id) * -1)
        scores = F.where(mask, scores, F.zeros_like(scores))
        return cls_id, scores
class MultiPerClassDecoder(gluon.HybridBlock):
    """Decode classification results.

    This decoder must work with `MultiClassEncoder` to reconstruct valid labels.
    The decoder expect results are after logits, e.g. Softmax.
    This version is different from
    :py:class:`gluoncv.nn.coder.MultiClassDecoder` with the following changes:

    For each position(anchor boxes), each foreground class can have their own
    results, rather than enforced to be the best one.
    For example, for a 5-class prediction with background(totaling 6 class), say
    (0.5, 0.1, 0.2, 0.1, 0.05, 0.05) as (bg, apple, orange, peach, grape, melon),
    `MultiClassDecoder` produce only one class id and score, that is (orange-0.2).
    `MultiPerClassDecoder` produce 5 results individually:
    (apple-0.1, orange-0.2, peach-0.1, grape-0.05, melon-0.05).

    Parameters
    ----------
    num_class : int
        Number of classes including background.
    axis : int
        Axis of class-wise results.
    thresh : float
        Confidence threshold for the post-softmax scores.
        Scores less than `thresh` are marked with `0`, corresponding `cls_id` is
        marked with invalid class id `-1`.

    """
    def __init__(self, num_class, axis=-1, thresh=0.01):
        super(MultiPerClassDecoder, self).__init__()
        self._fg_class = num_class - 1
        self._axis = axis
        self._thresh = thresh

    def hybrid_forward(self, F, x):
        """Return per-class ids and scores for every foreground class.

        Entries whose score is not above `thresh` get class id -1 and score 0.
        """
        # foreground scores only (background at index 0 removed): b x N x fg_class
        fg_scores = x.slice_axis(axis=self._axis, begin=1, end=None)
        # NOTE(review): the id template below is built on the last axis and a
        # 3-D layout regardless of `axis` -- confirm before using axis != -1.
        zeros = F.zeros_like(x.slice_axis(axis=-1, begin=0, end=1))
        class_range = F.reshape(F.arange(self._fg_class), shape=(1, 1, self._fg_class))
        ids = F.broadcast_add(zeros, class_range)
        keep = fg_scores > self._thresh
        invalid_ids = F.ones_like(ids) * -1
        cls_id = F.where(keep, ids, invalid_ids)
        scores = F.where(keep, fg_scores, F.zeros_like(fg_scores))
        return cls_id, scores
class SigmoidClassEncoder(object):
    """Encode class prediction labels for SigmoidCrossEntropy Loss."""
    def __init__(self, **kwargs):
        super(SigmoidClassEncoder, self).__init__(**kwargs)

    def __call__(self, samples):
        """Encode class prediction labels for SigmoidCrossEntropy Loss.

        Parameters
        ----------
        samples : np.array
            Sampling results with shape (B, N), 1:pos, 0:ignore, -1:negative

        Returns
        -------
        (numpy.ndarray, numpy.ndarray)
            (target, mask)
            target is the output label with shape (B, N), 1: pos, 0: negative, -1: ignore
            mask is the mask for label, -1(ignore) labels have mask 0, otherwise mask is 1.

        """
        magnitude = np.abs(samples)
        # map {+1, -1} onto {1, 0}; ignored entries (~0) are overwritten with -1
        rescaled = (samples + 1) / 2.
        target = np.where(magnitude < 1e-5, -1, rescaled)
        # every non-ignored entry contributes to the loss
        mask = np.where(magnitude > 1e-5, 1.0, 0.0)
        return target, mask
class CenterNetDecoder(gluon.HybridBlock):
    """Decorder for centernet.

    Parameters
    ----------
    topk : int
        Only keep `topk` results.
    scale : float, default is 4.0
        Downsampling scale for the network.

    """
    def __init__(self, topk=100, scale=4.0):
        super(CenterNetDecoder, self).__init__()
        self._topk = topk
        self._scale = scale

    def hybrid_forward(self, F, x, wh, reg):
        """Forward of decoder

        Picks the `topk` highest entries of `x`, gathers the matching entries
        of `reg` and `wh`, and returns (class ids, scores, corner boxes * scale).
        """
        # x is unpacked as a 4-D array; the last two extents are height and width
        _, _, out_h, out_w = x.shape_array().split(num_outputs=4, axis=0)
        plane = out_h * out_w
        # top-k over everything but the batch axis
        scores, flat_ids = x.reshape((0, -1)).topk(k=self._topk, ret_typ='both')
        flat_ids = F.cast(flat_ids, 'int64')
        # flat index = class * (h * w) + y * w + x
        topk_classes = F.cast(F.broadcast_div(flat_ids, plane), 'float32')
        topk_indices = F.broadcast_mod(flat_ids, plane)
        topk_ys = F.broadcast_div(topk_indices, out_w)
        topk_xs = F.broadcast_mod(topk_indices, out_w)
        # channel-last, flattened spatially: (B, h * w, 2)
        center = reg.transpose((0, 2, 3, 1)).reshape((0, -1, 2))
        wh = wh.transpose((0, 2, 3, 1)).reshape((0, -1, 2))
        # batch index per selected entry; arange(256) is cut down to the batch size
        batch_range = F.arange(256).slice_like(center, axes=0)
        batch_indices = F.cast(batch_range.expand_dims(-1).tile(reps=(1, self._topk)), 'int64')
        first_channel = F.zeros_like(batch_indices, dtype='int64')
        second_channel = F.ones_like(batch_indices, dtype='int64')
        # gather_nd index tables: rows are (batch, spatial position, channel)
        reg_xs = F.concat(batch_indices, topk_indices, first_channel, dim=0).reshape((3, -1))
        reg_ys = F.concat(batch_indices, topk_indices, second_channel, dim=0).reshape((3, -1))
        xs = F.cast(F.gather_nd(center, reg_xs).reshape((-1, self._topk)), 'float32')
        ys = F.cast(F.gather_nd(center, reg_ys).reshape((-1, self._topk)), 'float32')
        # integer grid position plus the gathered sub-cell offset
        topk_xs = F.cast(topk_xs, 'float32') + xs
        topk_ys = F.cast(topk_ys, 'float32') + ys
        w = F.cast(F.gather_nd(wh, reg_xs).reshape((-1, self._topk)), 'float32')
        h = F.cast(F.gather_nd(wh, reg_ys).reshape((-1, self._topk)), 'float32')
        half_w = w / 2
        half_h = h / 2
        corners = [topk_xs - half_w, topk_ys - half_h, topk_xs + half_w, topk_ys + half_h]
        boxes = F.concat(*[edge.expand_dims(-1) for edge in corners], dim=-1)
        return topk_classes, scores, boxes * self._scale