# Source code for vissl.data.ssl_transforms
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
from pathlib import Path
from typing import Any, Dict
import torchvision.transforms as pth_transforms
from classy_vision.dataset.transforms import build_transform, register_transform
from classy_vision.dataset.transforms.classy_transform import ClassyTransform
from classy_vision.generic.registry_utils import import_all_modules
# Below the transforms that require passing the labels as well. This is specifc
# to SSL only where we automatically generate the labels for training. All other
# transforms (including torchvision) require passing image only as input.
# Transforms that return (image, new_label): SSLTransformsWrapper.__call__
# appends the generated label to the sample's "label" list.
_TRANSFORMS_WITH_LABELS = ["ImgRotatePil", "ShuffleImgPatches"]
# Transforms that emit several copies of the input image (SimCLR / SwAV / MoCo
# style); the wrapper flattens their nested output into a flat "data" list.
_TRANSFORMS_WITH_COPIES = [
    "ImgReplicatePil",
    "ImgPilToPatchesAndImage",
    "ImgPilToMultiCrop",
]
# Transforms applied once on the whole multi-modal "data" list together,
# instead of separately per index.
_TRANSFORMS_WITH_GROUPING = ["ImgPilMultiCropRandomApply"]
# we wrap around transforms so that they work with the multimodal input
# we wrap around transforms so that they work with the multimodal input
@register_transform("SSLTransformsWrapper")
class SSLTransformsWrapper(ClassyTransform):
    """
    VISSL wraps around transforms so that they work with the multi-modal input.
    VISSL supports batches that come from several datasets and sources, hence
    the input batch (images, labels) is always a list.

    To apply the user-defined transforms, VISSL takes "indices" as input, which
    defines on which dataset/source data in the sample the transform should be
    applied. For example, assuming the input sample is::

        {
            "data": [dataset1_imgX, dataset2_imgY],
            "label": [dataset1_lblX, dataset2_lblY],
        }

    and the transform is::

        TRANSFORMS:
          - name: RandomGrayscale
            p: 0.2
            indices: 0

    then the transform is applied only on dataset1_imgX. If the indices are
    either not specified or set to 0, 1 then the transform is applied on both
    dataset1_imgX and dataset2_imgY.

    Since this structure of data is introduced by VISSL, the
    SSLTransformsWrapper takes care of dealing with the multi-modality input
    by wrapping the original transforms (pytorch transforms or custom
    transforms defined by the user) and calling each transform on each index.

    VISSL also supports _TRANSFORMS_WITH_LABELS transforms that modify the
    label or are used to generate the labels used in self-supervised learning
    tasks like Jigsaw. When a transform in _TRANSFORMS_WITH_LABELS is called,
    the new label is also returned besides the transformed image.

    VISSL also supports _TRANSFORMS_WITH_COPIES, which are transforms that
    generate several copies of the image (common in SimCLR, SwAV, MoCo etc.).
    When such a transform is used, SSLTransformsWrapper flattens the transform
    output. For example, applying ImgReplicatePil(num_times=2) to [img1]
    yields [img1_1, img1_2] instead of the nested [[img1_1, img1_2]], so the
    next set of transforms can operate on img1_1 and img1_2 directly.

    VISSL also supports _TRANSFORMS_WITH_GROUPING, meaning a single transform
    is applied on the full multi-modal input together instead of separately.
    This is a common transform used in BYOL. For example::

        SSLTransformsWrapper(
            ImgPilMultiCropRandomApply(RandomApply, prob=[0.0, 0.2]),
            [img1_1, img1_2],
        )

    applies RandomApply on img1_1 with prob=0.0 and on img1_2 with prob=0.2.
    """

    def __init__(self, indices, **args):
        """
        Args:
            indices (int or List[int]) (Optional): the indices on which the
                transform should be applied; the input is always a list
                (example: a minibatch of size=2 looks like [[img1], [img2]]).
                If indices is not specified (empty), the transform is applied
                to all the multi-modal input. A single int is accepted and
                treated as a one-element list (matches the ``indices: 0``
                YAML example in the class docstring).
            args (dict): the arguments that the transform takes; must contain
                the "name" key used to build the underlying transform.
        """
        # Accept the scalar YAML form `indices: 0`; set(0) would raise.
        if isinstance(indices, int):
            indices = [indices]
        self.indices = set(indices)
        self.name = args["name"]
        self.transform = build_transform(args)

    def _is_transform_with_labels(self):
        """
        Whether the wrapped transform also returns a generated label.
        _TRANSFORMS_WITH_LABELS = ["ImgRotatePil", "ShuffleImgPatches"]
        """
        return self.name in _TRANSFORMS_WITH_LABELS

    def _is_transform_with_copies(self):
        """
        Whether the wrapped transform outputs several copies of the image.
        _TRANSFORMS_WITH_COPIES = [
            "ImgReplicatePil",
            "ImgPilToPatchesAndImage",
            "ImgPilToMultiCrop",
        ]
        """
        return self.name in _TRANSFORMS_WITH_COPIES

    def _is_grouping_transform(self):
        """
        Whether the wrapped transform operates on the full multi-modal list.
        _TRANSFORMS_WITH_GROUPING = ["ImgPilMultiCropRandomApply"]
        """
        return self.name in _TRANSFORMS_WITH_GROUPING

    def __call__(self, sample):
        """
        Apply the wrapped transform on the specified indices of each entry in
        the input sample, handling label-generating, copy-making and grouping
        transforms as described in the class docstring.
        """
        # Run on all indices if empty set is passed.
        indices = self.indices if self.indices else set(range(len(sample["data"])))
        if self._is_grouping_transform():
            # The transform is applied to all the indices together. For
            # example: one might want to vary the intensity of a transform
            # across several crops of an image as in BYOL.
            output = self.transform(sample["data"])
            sample["data"] = output
        else:
            for idx in indices:
                output = self.transform(sample["data"][idx])
                if self._is_transform_with_labels():
                    # The transform returns (image, generated_label).
                    sample["data"][idx] = output[0]
                    sample["label"].append(output[1])
                else:
                    sample["data"][idx] = output
        if self._is_transform_with_copies():
            # The transform makes copies of the data: flatten the nested list
            # so the next set of transforms can operate on more indices.
            sample["data"] = [val for sublist in sample["data"] for val in sublist]
            # Now replicate the rest of the metadata as well.
            # NOTE(review): num_times is the flattened length, so with more
            # than one original entry the metadata grows by the full flattened
            # count — confirm this matches the intended single-entry usage.
            num_times = len(sample["data"])
            sample["label"] = sample["label"] * num_times
            sample["data_valid"] = sample["data_valid"] * num_times
            sample["data_idx"] = sample["data_idx"] * num_times
        return sample

    @classmethod
    def from_config(cls, config: Dict[str, Any]) -> "SSLTransformsWrapper":
        """
        Instantiates SSLTransformsWrapper from a configuration.

        Args:
            config: a dict with the transform "name", its arguments, and an
                optional "indices" entry.

        Returns:
            An SSLTransformsWrapper instance.
        """
        indices = config.get("indices", [])
        # Strip "indices" before forwarding: passing it both positionally and
        # inside **config would raise "got multiple values for argument
        # 'indices'" whenever the user actually sets indices in the config.
        transform_config = {k: v for k, v in config.items() if k != "indices"}
        return cls(indices, **transform_config)
def get_transform(input_transforms_list):
    """
    Given the list of user specified transforms, return the
    torchvision.transforms.Compose() version of the transforms. Each transform
    in the composition is an SSLTransformsWrapper, which wraps the original
    transform to handle the multi-modal nature of the input.
    """
    wrapped_transforms = [
        SSLTransformsWrapper.from_config(transform_config)
        for transform_config in input_transforms_list
    ]
    return pth_transforms.Compose(wrapped_transforms)
FILE_ROOT = Path(__file__).parent
# Import every module in this directory so that their @register_transform
# decorators run and the transform registry is fully populated on package
# import.
import_all_modules(FILE_ROOT, "vissl.data.ssl_transforms")
__all__ = ["SSLTransformsWrapper", "get_transform"]