Source code for vissl.data.disk_dataset

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

import logging

from fvcore.common.file_io import PathManager
from PIL import Image
from torchvision.datasets import ImageFolder
from vissl.data.data_helper import QueueDataset, get_mean_image
from vissl.utils.io import load_file


class DiskImageDataset(QueueDataset):
    """
    Base Dataset class for loading images from Disk. Can load a predefined
    list of images or all images inside a folder.

    Inherits from the QueueDataset class in VISSL to provide better handling
    of invalid images, replacing them with valid, previously seen images.

    Args:
        cfg (AttrDict): configuration defined by user
        data_source (string): data source, either "disk_filelist" or "disk_folder"
        path (string): can be either of the following
            1. A .npy file containing a list of filepaths.
               In this case `data_source = "disk_filelist"`
            2. A folder such that folder/split contains images.
               In this case `data_source = "disk_folder"`
        split (string): specify split for the dataset. Usually train/val/test.
            Used to read images if reading from a folder `path` and to
            retrieve settings for that split from the config.
        dataset_name (string): name of dataset. For information only.

    NOTE: This dataset class only returns images (not labels or other
    metadata). To load labels you must specify them in `LABEL_SOURCES`
    (see `ssl_dataset.py`). LABEL_SOURCES follows a similar convention as
    the dataset and can be either a filelist or a torchvision-ImageFolder
    compatible folder:
        1. Store labels in a numpy file
        2. Store images in a nested directory structure so that the
           torchvision ImageFolder dataset can infer the labels.

    A usage sketch for both data sources follows the class definition at
    the end of this file.
    """

    def __init__(self, cfg, data_source, path, split, dataset_name):
        super(DiskImageDataset, self).__init__(
            queue_size=cfg["DATA"][split]["BATCHSIZE_PER_REPLICA"]
        )
        assert data_source in [
            "disk_filelist",
            "disk_folder",
        ], "data_source must be either disk_filelist or disk_folder"
        if data_source == "disk_filelist":
            assert PathManager.isfile(path), f"File {path} does not exist"
        elif data_source == "disk_folder":
            assert PathManager.isdir(path), f"Directory {path} does not exist"
        self.cfg = cfg
        self.split = split
        self.dataset_name = dataset_name
        self.data_source = data_source
        self._path = path
        self.image_dataset = []
        self.is_initialized = False
        self._load_data(path)
        self._num_samples = len(self.image_dataset)
        if self.data_source == "disk_filelist":
            # Set dataset to null so that workers don't need to pickle this file.
            # This saves memory when disk_filelist is large, especially when
            # memory mapping.
            self.image_dataset = []
        # whether to use the QueueDataset class to handle invalid images or not
        self.enable_queue_dataset = cfg["DATA"][self.split]["ENABLE_QUEUE_DATASET"]

    def _load_data(self, path):
        if self.data_source == "disk_filelist":
            if self.cfg["DATA"][self.split].MMAP_MODE:
                self.image_dataset = load_file(path, mmap_mode="r")
            else:
                self.image_dataset = load_file(path)
        elif self.data_source == "disk_folder":
            self.image_dataset = ImageFolder(path)
            logging.info(f"Loaded {len(self.image_dataset)} samples from folder {path}")
            # mark as initialized.
            # Creating an ImageFolder dataset can be expensive because of the
            # repeated os.listdir calls. Avoid creating it over and over again.
            self.is_initialized = True

        if self.cfg["DATA"][self.split]["DATA_LIMIT"] > 0:
            limit = self.cfg["DATA"][self.split]["DATA_LIMIT"]
            if self.data_source == "disk_filelist":
                self.image_dataset = self.image_dataset[:limit]
            elif self.data_source == "disk_folder":
                self.image_dataset.samples = self.image_dataset.samples[:limit]
    def num_samples(self):
        """
        Size of the dataset
        """
        return self._num_samples
    def get_image_paths(self):
        """
        Get paths of all images in the dataset. See `_load_data()`.
        """
        self._load_data(self._path)
        return self.image_dataset
    def __len__(self):
        """
        Size of the dataset
        """
        return self.num_samples()
    def __getitem__(self, idx):
        """
        - We do delayed loading of data to reduce the memory size due to the
          pickling of the dataset across dataloader workers.
        - Loads the data if not already loaded.
        - Sets up and initializes the queue if not already initialized.
        - Depending on the data source (folder or filelist), get the image.
          If using the QueueDataset and the image is valid, save the image in
          the queue if the queue is not full. Otherwise return a valid,
          previously seen image from the queue if the queue is not empty.
        """
        if not self.is_initialized:
            self._load_data(self._path)
            self.is_initialized = True
        if not self.queue_init and self.enable_queue_dataset:
            self._init_queues()
        is_success = True
        image_path = self.image_dataset[idx]
        try:
            if self.data_source == "disk_filelist":
                with PathManager.open(image_path, "rb") as fopen:
                    img = Image.open(fopen).convert("RGB")
            elif self.data_source == "disk_folder":
                # ImageFolder returns an (image, label) tuple; keep the image only
                img = self.image_dataset[idx][0]
            if is_success and self.enable_queue_dataset:
                # `on_sucess` (sic) matches the method name defined by the
                # QueueDataset base class
                self.on_sucess(img)
        except Exception as e:
            logging.warning(
                f"Couldn't load: {self.image_dataset[idx]}. Exception: \n{e}"
            )
            is_success = False
            # if we have the queue dataset class enabled, we try to use it to
            # get a previously seen valid image
            if self.enable_queue_dataset:
                img, is_success = self.on_failure()
                if img is None:
                    img = get_mean_image(
                        self.cfg["DATA"][self.split].DEFAULT_GRAY_IMG_SIZE
                    )
            else:
                img = get_mean_image(self.cfg["DATA"][self.split].DEFAULT_GRAY_IMG_SIZE)
        return img, is_success
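
# ---------------------------------------------------------------------------
# Usage sketch (illustrative; not part of the upstream module). It shows how
# a "disk_filelist" input might be prepared and consumed. The paths and the
# minimal config stand-in are hypothetical; in VISSL the real `cfg` is an
# AttrDict materialized from the YAML config, with the keys this class reads:
# BATCHSIZE_PER_REPLICA, MMAP_MODE, ENABLE_QUEUE_DATASET, DATA_LIMIT,
# DEFAULT_GRAY_IMG_SIZE.
#
#     import numpy as np
#
#     class _AttrDict(dict):
#         """Tiny stand-in for VISSL's AttrDict (dot access over a dict)."""
#         __getattr__ = dict.__getitem__
#
#     cfg = _AttrDict(
#         DATA=_AttrDict(
#             TRAIN=_AttrDict(
#                 BATCHSIZE_PER_REPLICA=32,
#                 MMAP_MODE=True,
#                 ENABLE_QUEUE_DATASET=True,
#                 DATA_LIMIT=0,
#                 DEFAULT_GRAY_IMG_SIZE=224,
#             )
#         )
#     )
#
#     # 1. Save a list of image paths as a .npy file (data_source="disk_filelist").
#     np.save(
#         "/tmp/train_images.npy",
#         np.array(["/data/img0.jpg", "/data/img1.jpg"]),
#     )
#
#     # 2. Construct the dataset and fetch one sample.
#     dataset = DiskImageDataset(
#         cfg=cfg,
#         data_source="disk_filelist",
#         path="/tmp/train_images.npy",
#         split="TRAIN",
#         dataset_name="demo_filelist",
#     )
#     img, is_success = dataset[0]  # PIL RGB image (or fallback) + validity flag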
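#
# The "disk_folder" variant instead points `path` at a directory laid out in
# the torchvision ImageFolder convention, i.e. one sub-directory per class
# (the directory names below are hypothetical):
#
#     /data/imagenet/train/n01440764/xxx.jpg
#     /data/imagenet/train/n01443537/yyy.jpg
#
#     dataset = DiskImageDataset(
#         cfg=cfg,
#         data_source="disk_folder",
#         path="/data/imagenet/train",
#         split="TRAIN",
#         dataset_name="demo_folder",
#     )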