Source code for vissl.data.disk_dataset

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import logging

from fvcore.common.file_io import PathManager
from PIL import Image
from torchvision.datasets import ImageFolder
from vissl.data.data_helper import QueueDataset, get_mean_image
from vissl.utils.io import load_file


class DiskImageDataset(QueueDataset):
    """
    Base Dataset class for loading images from Disk. Can load a predefined
    list of images or all images inside a folder.

    Inherits from the QueueDataset class in VISSL to provide better handling
    of invalid images, replacing them with valid, previously seen images.

    Args:
        cfg (AttrDict): configuration defined by user
        data_source (string): data source, either "disk_filelist" or "disk_folder"
        path (string): can be either of the following
            1. A .npy file containing a list of filepaths.
               In this case `data_source = "disk_filelist"`
            2. A folder such that folder/split contains images.
               In this case `data_source = "disk_folder"`
        split (string): specify split for the dataset. Usually train/val/test.
            Used to read images if reading from a folder `path` and to
            retrieve settings for that split from the config.
        dataset_name (string): name of dataset. For information only.

    NOTE: This dataset class only returns images (not labels or other
    metadata). To load labels you must specify them in `LABEL_SOURCES`
    (see `ssl_dataset.py`). LABEL_SOURCES follows a similar convention as
    the dataset and can be either a filelist or a torchvision-ImageFolder
    compatible folder:
        1. Store labels in a numpy file
        2. Store images in a nested directory structure so that the
           torchvision ImageFolder dataset can infer the labels.

    A usage sketch for both data sources follows the class definition at
    the end of this file.
    """

    def __init__(self, cfg, data_source, path, split, dataset_name):
        super(DiskImageDataset, self).__init__(
            queue_size=cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
        )
        assert data_source in [
            "disk_filelist",
            "disk_folder",
        ], "data_source must be either disk_filelist or disk_folder"
        if data_source == "disk_filelist":
            assert PathManager.isfile(path), f"File {path} does not exist"
        elif data_source == "disk_folder":
            assert PathManager.isdir(path), f"Directory {path} does not exist"
        self.cfg = cfg
        self.split = split
        self.dataset_name = dataset_name
        self.data_source = data_source
        self._path = path
        self.image_dataset = []
        self.is_initialized = False
        self._load_data(path)
        self._num_samples = len(self.image_dataset)
        if self.data_source == "disk_filelist":
            # Set dataset to null so that workers don't need to pickle this file.
            # This saves memory when disk_filelist is large, especially when
            # memory mapping.
            self.image_dataset = []
        # whether to use the QueueDataset class to handle invalid images or not
        self.enable_queue_dataset = cfg["DATA"][self.split]["ENABLE_QUEUE_DATASET"]

    def _load_data(self, path):
        if self.data_source == "disk_filelist":
            if self.cfg["DATA"][self.split].MMAP_MODE:
                self.image_dataset = load_file(path, mmap_mode="r")
            else:
                self.image_dataset = load_file(path)
        elif self.data_source == "disk_folder":
            self.image_dataset = ImageFolder(path)
            logging.info(f"Loaded {len(self.image_dataset)} samples from folder {path}")
            # mark as initialized.
            # Creating an ImageFolder dataset can be expensive because of the
            # repeated os.listdir calls. Avoid creating it over and over again.
            self.is_initialized = True

        if self.cfg["DATA"][self.split]["DATA_LIMIT"] > 0:
            limit = self.cfg["DATA"][self.split]["DATA_LIMIT"]
            if self.data_source == "disk_filelist":
                self.image_dataset = self.image_dataset[:limit]
            elif self.data_source == "disk_folder":
                self.image_dataset.samples = self.image_dataset.samples[:limit]
    def num_samples(self):
        """
        Size of the dataset
        """
        return self._num_samples
    def get_image_paths(self):
        """
        Get paths of all images in the dataset. See `_load_data()`.
        """
        self._load_data(self._path)
        return self.image_dataset
    def __len__(self):
        """
        Size of the dataset
        """
        return self.num_samples()
    def __getitem__(self, idx):
        """
        - We do delayed loading of data to reduce the memory size due to the
          pickling of the dataset across dataloader workers.
        - Loads the data if not already loaded.
        - Sets up and initializes the queue if not already initialized.
        - Depending on the data source (folder or filelist), get the image.
          If using the QueueDataset and the image is valid, save the image in
          the queue if the queue is not full. Otherwise return a valid,
          previously seen image from the queue if the queue is not empty.
        """
        if not self.is_initialized:
            self._load_data(self._path)
            self.is_initialized = True
        if not self.queue_init and self.enable_queue_dataset:
            self._init_queues()
        is_success = True
        image_path = self.image_dataset[idx]
        try:
            if self.data_source == "disk_filelist":
                with PathManager.open(image_path, "rb") as fopen:
                    img = Image.open(fopen).convert("RGB")
            elif self.data_source == "disk_folder":
                # ImageFolder returns an (image, label) tuple; keep the image only
                img = self.image_dataset[idx][0]
            if is_success and self.enable_queue_dataset:
                # `on_sucess` (sic) matches the method name defined by the
                # QueueDataset base class
                self.on_sucess(img)
        except Exception as e:
            logging.warning(
                f"Couldn't load: {self.image_dataset[idx]}. Exception: \n{e}"
            )
            is_success = False
            # if we have the queue dataset class enabled, we try to use it to
            # get a previously seen valid image
            if self.enable_queue_dataset:
                img, is_success = self.on_failure()
                if img is None:
                    img = get_mean_image(
                        self.cfg["DATA"][self.split].DEFAULT_GRAY_IMG_SIZE
                    )
            else:
                img = get_mean_image(self.cfg["DATA"][self.split].DEFAULT_GRAY_IMG_SIZE)
        return img, is_success
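
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the upstream module). It shows how
# a "disk_filelist" input might be prepared and consumed. The paths and the
# minimal config stand-in are hypothetical; in VISSL the real `cfg` is an
# AttrDict materialized from the YAML config, with the keys this class reads:
# BATCHSIZE_PER_REPLICA, MMAP_MODE, ENABLE_QUEUE_DATASET, DATA_LIMIT,
# DEFAULT_GRAY_IMG_SIZE.
#
#     import numpy as np
#
#     class _AttrDict(dict):
#         """Tiny stand-in for VISSL's AttrDict (dot access over a dict)."""
#         __getattr__ = dict.__getitem__
#
#     cfg = _AttrDict(
#         DATA=_AttrDict(
#             TRAIN=_AttrDict(
#                 BATCHSIZE_PER_REPLICA=32,
#                 MMAP_MODE=True,
#                 ENABLE_QUEUE_DATASET=True,
#                 DATA_LIMIT=0,
#                 DEFAULT_GRAY_IMG_SIZE=224,
#             )
#         )
#     )
#
#     # 1. Save a list of image paths as a .npy file (data_source="disk_filelist").
#     np.save(
#         "/tmp/train_images.npy",
#         np.array(["/data/img0.jpg", "/data/img1.jpg"]),
#     )
#
#     # 2. Construct the dataset and fetch one sample.
#     dataset = DiskImageDataset(
#         cfg=cfg,
#         data_source="disk_filelist",
#         path="/tmp/train_images.npy",
#         split="TRAIN",
#         dataset_name="demo_filelist",
#     )
#     img, is_success = dataset[0]  # PIL RGB image (or fallback) + validity flag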
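#
# The "disk_folder" variant instead points `path` at a directory laid out in
# the torchvision ImageFolder convention, i.e. one sub-directory per class
# (the directory names below are hypothetical):
#
#     /data/imagenet/train/n01440764/xxx.jpg
#     /data/imagenet/train/n01443537/yyy.jpg
#
#     dataset = DiskImageDataset(
#         cfg=cfg,
#         data_source="disk_folder",
#         path="/data/imagenet/train",
#         split="TRAIN",
#         dataset_name="demo_folder",
#     )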