# Source code for gluoncv.data.transforms.image
"""Extended image transformations to `mxnet.image`."""
from __future__ import division
import random
import numpy as np
import mxnet as mx
from mxnet import nd
from mxnet.base import numeric_types
__all__ = ['imresize', 'resize_long', 'resize_short_within',
'random_pca_lighting', 'random_expand', 'random_flip',
'resize_contain', 'ten_crop']
def imresize(src, w, h, interp=1):
    """Resize image with OpenCV.

    This is a duplicate of mxnet.image.imresize for name space consistency.

    Parameters
    ----------
    src : mxnet.nd.NDArray
        Source image in HWC layout.
    w : int, required
        Width of resized image.
    h : int, required
        Height of resized image.
    interp : int, optional, default=1
        Interpolation method (default=cv2.INTER_LINEAR).

    Returns
    -------
    mxnet.nd.NDArray
        The resized image.

    Examples
    --------
    >>> import mxnet as mx
    >>> from gluoncv import data as gdata
    >>> img = mx.random.uniform(0, 255, (300, 300, 3)).astype('uint8')
    >>> print(img.shape)
    (300, 300, 3)
    >>> img = gdata.transforms.image.imresize(img, 200, 200)
    >>> print(img.shape)
    (200, 200, 3)
    """
    # Local import: _get_interp_method is a private mxnet helper that resolves
    # interp codes 9/10 (auto / random selection) based on old and new sizes.
    from mxnet.image.image import _get_interp_method as get_interp
    oh, ow, _ = src.shape
    return mx.image.imresize(src, w, h, interp=get_interp(interp, (oh, ow, h, w)))
def resize_long(src, size, interp=2):
    """Resizes longer edge to size.

    Note: `resize_long` uses OpenCV (not the CV2 Python library).
    MXNet must have been built with OpenCV for `resize_long` to work.
    Resizes the original image by setting the longer edge to size
    and setting the shorter edge accordingly. This will ensure the new image will
    fit into the `size` specified.
    Resizing function is called from OpenCV.

    Parameters
    ----------
    src : NDArray
        The original image.
    size : int
        The length to be set for the longer edge.
    interp : int, optional, default=2
        Interpolation method used for resizing the image.
        Possible values:
        0: Nearest Neighbors Interpolation.
        1: Bilinear interpolation.
        2: Area-based (resampling using pixel area relation). It may be a
        preferred method for image decimation, as it gives moire-free
        results. But when the image is zoomed, it is similar to the Nearest
        Neighbors method. (used by default).
        3: Bicubic interpolation over 4x4 pixel neighborhood.
        4: Lanczos interpolation over 8x8 pixel neighborhood.
        9: Cubic for enlarge, area for shrink, bilinear for others
        10: Random select from interpolation method mentioned above.
        Note:
        When shrinking an image, it will generally look best with AREA-based
        interpolation, whereas, when enlarging an image, it will generally look best
        with Bicubic (slow) or Bilinear (faster but still looks OK).
        More details can be found in the documentation of OpenCV, please refer to
        http://docs.opencv.org/master/da/d54/group__imgproc__transform.html.

    Returns
    -------
    NDArray
        An 'NDArray' containing the resized image.

    Example
    -------
    >>> with open("flower.jpeg", 'rb') as fp:
    ...     str_image = fp.read()
    ...
    >>> image = mx.img.imdecode(str_image)
    >>> image
    <NDArray 2321x3482x3 @cpu(0)>
    >>> size = 640
    >>> new_image = resize_long(image, size)
    >>> new_image
    <NDArray 426x640x3 @cpu(0)>
    """
    from mxnet.image.image import _get_interp_method as get_interp
    h, w, _ = src.shape
    # Pin the longer edge to `size`; scale the other edge proportionally.
    if h > w:
        new_h, new_w = size, size * w // h
    else:
        new_h, new_w = size * h // w, size
    return imresize(src, new_w, new_h, interp=get_interp(interp, (h, w, new_h, new_w)))
def resize_short_within(src, short, max_size, mult_base=1, interp=2):
    """Resizes shorter edge to size but make sure it's capped at maximum size.

    Note: `resize_short_within` uses OpenCV (not the CV2 Python library).
    MXNet must have been built with OpenCV for `resize_short_within` to work.
    Resizes the original image by setting the shorter edge to size
    and setting the longer edge accordingly. Also this function will ensure
    the new image will not exceed ``max_size`` even at the longer side.
    Resizing function is called from OpenCV.

    Parameters
    ----------
    src : NDArray
        The original image.
    short : int
        Resize shorter side to ``short``.
    max_size : int
        Make sure the longer side of new image is smaller than ``max_size``.
    mult_base : int, default is 1
        Width and height are rounded to multiples of `mult_base`.
    interp : int, optional, default=2
        Interpolation method used for resizing the image.
        Possible values:
        0: Nearest Neighbors Interpolation.
        1: Bilinear interpolation.
        2: Area-based (resampling using pixel area relation). It may be a
        preferred method for image decimation, as it gives moire-free
        results. But when the image is zoomed, it is similar to the Nearest
        Neighbors method. (used by default).
        3: Bicubic interpolation over 4x4 pixel neighborhood.
        4: Lanczos interpolation over 8x8 pixel neighborhood.
        9: Cubic for enlarge, area for shrink, bilinear for others
        10: Random select from interpolation method mentioned above.
        Note:
        When shrinking an image, it will generally look best with AREA-based
        interpolation, whereas, when enlarging an image, it will generally look best
        with Bicubic (slow) or Bilinear (faster but still looks OK).
        More details can be found in the documentation of OpenCV, please refer to
        http://docs.opencv.org/master/da/d54/group__imgproc__transform.html.

    Returns
    -------
    NDArray
        An 'NDArray' containing the resized image.

    Example
    -------
    >>> with open("flower.jpeg", 'rb') as fp:
    ...     str_image = fp.read()
    ...
    >>> image = mx.img.imdecode(str_image)
    >>> image
    <NDArray 2321x3482x3 @cpu(0)>
    >>> new_image = resize_short_within(image, short=800, max_size=1000)
    >>> new_image
    <NDArray 667x1000x3 @cpu(0)>
    >>> new_image = resize_short_within(image, short=800, max_size=1200)
    >>> new_image
    <NDArray 800x1200x3 @cpu(0)>
    >>> new_image = resize_short_within(image, short=800, max_size=1200, mult_base=32)
    >>> new_image
    <NDArray 800x1184x3 @cpu(0)>
    """
    from mxnet.image.image import _get_interp_method as get_interp
    h, w, _ = src.shape
    shorter, longer = min(w, h), max(w, h)
    # Scale so the shorter side becomes `short` ...
    scale = float(short) / float(shorter)
    # ... unless that would push the (rounded) longer side past max_size,
    # in which case shrink the scale to cap the longer side instead.
    if np.round(scale * longer / mult_base) * mult_base > max_size:
        scale = float(np.floor(max_size / mult_base) * mult_base) / float(longer)
    new_w = int(np.round(w * scale / mult_base) * mult_base)
    new_h = int(np.round(h * scale / mult_base) * mult_base)
    return imresize(src, new_w, new_h, interp=get_interp(interp, (h, w, new_h, new_w)))
def random_pca_lighting(src, alphastd, eigval=None, eigvec=None):
    """Apply random pca lighting noise to input image.

    Parameters
    ----------
    src : mxnet.nd.NDArray
        Input image with HWC format. Note: modified in place when noise
        is applied (``alphastd > 0``).
    alphastd : float
        Noise level [0, 1) for image with range [0, 255]. Values <= 0
        disable the augmentation and return ``src`` unchanged.
    eigval : list of floats.
        Eigen values, defaults to [55.46, 4.794, 1.148].
    eigvec : nested lists of floats
        Eigen vectors with shape (3, 3), defaults to
        [[-0.5675, 0.7192, 0.4009],
         [-0.5808, -0.0045, -0.8140],
         [-0.5836, -0.6948, 0.4203]].

    Returns
    -------
    mxnet.nd.NDArray
        Augmented image.
    """
    if alphastd <= 0:
        # Augmentation disabled: return input untouched.
        return src
    if eigval is None:
        eigval = np.array([55.46, 4.794, 1.148])
    if eigvec is None:
        eigvec = np.array([[-0.5675, 0.7192, 0.4009],
                           [-0.5808, -0.0045, -0.8140],
                           [-0.5836, -0.6948, 0.4203]])
    # Sample one alpha per channel, then project through the eigen basis to
    # obtain a per-channel RGB shift (AlexNet-style PCA lighting).
    alpha = np.random.normal(0, alphastd, size=(3,))
    rgb = np.dot(eigvec * alpha, eigval)
    src += nd.array(rgb, ctx=src.context)
    return src
def random_expand(src, max_ratio=4, fill=0, keep_ratio=True):
    """Random expand original image with borders, this is identical to placing
    the original image on a larger canvas.

    Parameters
    ----------
    src : mxnet.nd.NDArray
        The original image with HWC format.
    max_ratio : int or float
        Maximum ratio of the output image on both direction(vertical and horizontal)
    fill : int or float or array-like
        The value(s) for padded borders. If `fill` is numerical type, RGB channels
        will be padded with single value. Otherwise `fill` must have same length
        as image channels, which resulted in padding with per-channel values.
    keep_ratio : bool
        If `True`, will keep output image the same aspect ratio as input.

    Returns
    -------
    mxnet.nd.NDArray
        Augmented image.
    tuple
        Tuple of (offset_x, offset_y, new_width, new_height)

    Raises
    ------
    ValueError
        If `fill` is array-like and its length differs from the image channels.
    """
    if max_ratio <= 1:
        # No expansion possible: return input with a zero-offset full-size box.
        return src, (0, 0, src.shape[1], src.shape[0])

    h, w, c = src.shape
    ratio_x = random.uniform(1, max_ratio)
    if keep_ratio:
        ratio_y = ratio_x
    else:
        ratio_y = random.uniform(1, max_ratio)

    oh, ow = int(h * ratio_y), int(w * ratio_x)
    # Random placement of the original image inside the enlarged canvas.
    off_y = random.randint(0, oh - h)
    off_x = random.randint(0, ow - w)

    # make canvas
    if isinstance(fill, numeric_types):
        # BUGFIX: allocate the canvas on src's context (the per-channel branch
        # below already did), so src and dst always live on the same device.
        dst = nd.full(shape=(oh, ow, c), val=fill, ctx=src.context, dtype=src.dtype)
    else:
        fill = nd.array(fill, dtype=src.dtype, ctx=src.context)
        if not c == fill.size:
            raise ValueError("Channel and fill size mismatch, {} vs {}".format(c, fill.size))
        # Tile the (1, c) fill row over every pixel so channels stay interleaved.
        dst = nd.tile(fill.reshape((1, c)), reps=(oh * ow, 1)).reshape((oh, ow, c))

    dst[off_y:off_y+h, off_x:off_x+w, :] = src
    return dst, (off_x, off_y, ow, oh)
def random_flip(src, px=0, py=0, copy=False):
    """Randomly flip image along horizontal and vertical with probabilities.

    Parameters
    ----------
    src : mxnet.nd.NDArray
        Input image with HWC format.
    px : float
        Horizontal flip probability [0, 1].
    py : float
        Vertical flip probability [0, 1].
    copy : bool
        If `True`, return a copy of input

    Returns
    -------
    mxnet.nd.NDArray
        Augmented image.
    tuple
        Tuple of (flip_x, flip_y), records of whether flips are applied.
    """
    # Draw the two flip decisions independently with the given probabilities.
    do_flip_y = np.random.choice([False, True], p=[1 - py, py])
    do_flip_x = np.random.choice([False, True], p=[1 - px, px])
    if do_flip_y:
        src = nd.flip(src, axis=0)   # vertical: flip rows
    if do_flip_x:
        src = nd.flip(src, axis=1)   # horizontal: flip columns
    result = src.copy() if copy else src
    return result, (do_flip_x, do_flip_y)
def resize_contain(src, size, fill=0):
    """Resize the image to fit in the given area while keeping aspect ratio.

    If both the height and the width in `size` are larger than
    the height and the width of input image, the image is placed on
    the center with an appropriate padding to match `size`.
    Otherwise, the input image is scaled to fit in a canvas whose size
    is `size` while preserving aspect ratio.

    Parameters
    ----------
    src : mxnet.nd.NDArray
        The original image with HWC format.
    size : tuple
        Tuple of length 2 as (width, height).
    fill : int or float or array-like
        The value(s) for padded borders. If `fill` is numerical type, RGB channels
        will be padded with single value. Otherwise `fill` must have same length
        as image channels, which resulted in padding with per-channel values.

    Returns
    -------
    mxnet.nd.NDArray
        Augmented image.
    tuple
        Tuple of (offset_x, offset_y, scaled_x, scaled_y)

    Raises
    ------
    ValueError
        If `fill` is array-like and its length differs from the image channels.
    """
    h, w, c = src.shape
    ow, oh = size
    scale_h = oh / h
    scale_w = ow / w
    # Never upscale: scale <= 1, so small images are only padded, not enlarged.
    scale = min(min(scale_h, scale_w), 1)
    scaled_x = int(w * scale)
    scaled_y = int(h * scale)
    if scale < 1:
        src = mx.image.imresize(src, scaled_x, scaled_y)

    # Center the (possibly resized) image on the canvas.
    off_y = (oh - scaled_y) // 2 if scaled_y < oh else 0
    off_x = (ow - scaled_x) // 2 if scaled_x < ow else 0

    # make canvas
    if isinstance(fill, numeric_types):
        dst = nd.full(shape=(oh, ow, c), val=fill, ctx=src.context, dtype=src.dtype)
    else:
        fill = nd.array(fill, dtype=src.dtype, ctx=src.context)
        if not c == fill.size:
            raise ValueError("Channel and fill size mismatch, {} vs {}".format(c, fill.size))
        # BUGFIX: the previous nd.repeat(fill, repeats=oh*ow) produced
        # [r..r, g..g, b..b], which reshaped to (oh, ow, c) scrambles channels.
        # Tile the (1, c) row per pixel instead, matching random_expand.
        dst = nd.tile(fill.reshape((1, c)), reps=(oh * ow, 1)).reshape((oh, ow, c))

    dst[off_y:off_y+scaled_y, off_x:off_x+scaled_x, :] = src
    return dst, (off_x, off_y, scaled_x, scaled_y)
def ten_crop(src, size):
    """Crop 10 regions from an array.

    This is performed same as:
    http://chainercv.readthedocs.io/en/stable/reference/transforms.html#ten-crop

    This method crops 10 regions. All regions will be in shape
    :obj`size`. These regions consist of 1 center crop and 4 corner
    crops and horizontal flips of them.
    The crops are ordered in this order.

    * center crop
    * top-left crop
    * bottom-left crop
    * top-right crop
    * bottom-right crop
    * center crop (flipped horizontally)
    * top-left crop (flipped horizontally)
    * bottom-left crop (flipped horizontally)
    * top-right crop (flipped horizontally)
    * bottom-right crop (flipped horizontally)

    Parameters
    ----------
    src : mxnet.nd.NDArray
        Input image.
    size : tuple
        Tuple of length 2, as (width, height) of the cropped areas.

    Returns
    -------
    mxnet.nd.NDArray
        The cropped images with shape (10, size[1], size[0], C)
    """
    h, w, _ = src.shape
    ow, oh = size
    if h < oh or w < ow:
        raise ValueError(
            "Cannot crop area {} from image with size ({}, {})".format(str(size), h, w))

    # Top-left corners of the five crops; center uses floor((dim - crop) / 2).
    cy, cx = (h - oh) // 2, (w - ow) // 2
    center = src[cy:cy + oh, cx:cx + ow, :]
    tl = src[:oh, :ow, :]
    bl = src[h - oh:, :ow, :]
    tr = src[:oh, w - ow:, :]
    br = src[h - oh:, w - ow:, :]

    # Stack the five crops, then append their horizontal mirrors (axis 2 = width).
    crops = nd.stack(center, tl, bl, tr, br, axis=0)
    return nd.concat(crops, nd.flip(crops, axis=2), dim=0)