Source code for vissl.data.dataset_catalog

# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved

"""
Data and labels file for various datasets.
"""

import json
import logging
import os
from typing import List

import numpy as np
from fvcore.common.file_io import PathManager
from vissl.data.datasets import get_coco_imgs_labels_info, get_voc_images_labels_info
from vissl.utils.misc import get_json_data_catalog_file
from vissl.utils.slurm import get_slurm_dir


class VisslDatasetCatalog(object):
    """
    A catalog that stores information about the datasets and how to obtain them.

    It contains a mapping from strings (which are names that identify a dataset,
    e.g. "imagenet1k") to a `dict` which contains:

        1) mapping of various data splits (train, test, val) to the data source
           (a path on disk: either a folder path or a filelist)
        2) source of the data (disk_filelist | disk_folder)

    The purpose of having this catalog is to make it easy to choose different
    datasets, by just using the strings in the config.
    """

    __REGISTERED_DATASETS = {}
    @staticmethod
    def register_json(json_catalog_path):
        """
        Args:
            json_catalog_path (str): a .json filepath that contains the data to be registered
        """
        with PathManager.open(json_catalog_path) as fopen:
            data_catalog = json.load(fopen)
        for key, value in data_catalog.items():
            VisslDatasetCatalog.register_data(key, value)
    @staticmethod
    def register_dict(dict_catalog):
        """
        Args:
            dict_catalog (dict): a dict with a bunch of datasets to be registered
        """
        for key, value in dict_catalog.items():
            VisslDatasetCatalog.register_data(key, value)
    @staticmethod
    def register_data(name, data_dict):
        """
        Args:
            name (str): the name that identifies a dataset, e.g. "imagenet1k_folder".
            data_dict (dict): the information (data splits, data source) about the dataset.
        """
        assert isinstance(
            data_dict, dict
        ), "You must register a dictionary with VisslDatasetCatalog.register_data"
        assert (
            name not in VisslDatasetCatalog.__REGISTERED_DATASETS
        ), "Dataset '{}' is already registered!".format(name)
        VisslDatasetCatalog.__REGISTERED_DATASETS[name] = data_dict
    @staticmethod
    def get(name):
        """
        Get the registered dict and return it.

        Args:
            name (str): the name that identifies a dataset, e.g. "imagenet1k".

        Returns:
            dict: dataset information (paths, source)
        """
        try:
            info = VisslDatasetCatalog.__REGISTERED_DATASETS[name]
        except KeyError:
            raise KeyError(
                "Dataset '{}' is not registered! Available datasets are: {}".format(
                    name, ", ".join(VisslDatasetCatalog.__REGISTERED_DATASETS.keys())
                )
            )
        return info
    @staticmethod
    def list() -> List[str]:
        """
        List all registered datasets.

        Returns:
            list[str]
        """
        return list(VisslDatasetCatalog.__REGISTERED_DATASETS.keys())
    @staticmethod
    def clear():
        """
        Remove all registered datasets.
        """
        VisslDatasetCatalog.__REGISTERED_DATASETS.clear()
    @staticmethod
    def remove(name):
        """
        Remove the dataset registered by ``name``.
        """
        VisslDatasetCatalog.__REGISTERED_DATASETS.pop(name)
    @staticmethod
    def has_data(name):
        """
        Check whether the data with ``name`` exists.
        """
        data_found = name in VisslDatasetCatalog.__REGISTERED_DATASETS
        return data_found
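
# Example (illustrative): registering a dataset with the catalog and looking it
# up again. The dataset name and paths below are hypothetical.
#
#   VisslDatasetCatalog.register_data(
#       "my_dataset_folder",
#       {
#           "train": ["/path/to/my_dataset/train", "<lbl_path>"],
#           "val": ["/path/to/my_dataset/val", "<lbl_path>"],
#       },
#   )
#   assert VisslDatasetCatalog.has_data("my_dataset_folder")
#   info = VisslDatasetCatalog.get("my_dataset_folder")  # the dict above
#   VisslDatasetCatalog.list()  # ["my_dataset_folder"]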
def get_local_path(input_file, dest_dir):
    """
    If the user specified copying data to a local directory, get the local path
    where the data files were copied.

    - If input_file is just a file, we return dest_dir/filename.
    - If input_file is a directory, we check whether the environment is SLURM
      and use slurm_dir (otherwise dest_dir) to look for the copy_complete
      file. If it is available, we return the directory.
    - If both of the above fail, we return input_file as is.
    """
    out = ""
    if PathManager.isfile(input_file):
        out = os.path.join(dest_dir, os.path.basename(input_file))
    elif PathManager.isdir(input_file):
        data_name = input_file.strip("/").split("/")[-1]
        if "SLURM_JOBID" in os.environ:
            dest_dir = get_slurm_dir(dest_dir)
        dest_dir = os.path.join(dest_dir, data_name)
        complete_flag = os.path.join(dest_dir, "copy_complete")
        if PathManager.isfile(complete_flag):
            out = dest_dir
    if PathManager.exists(out):
        return out
    else:
        return input_file
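
# Example (illustrative, hypothetical paths): if "/datasets/train_images.npy"
# was copied to "/scratch/local", get_local_path returns
# "/scratch/local/train_images.npy" when that file exists; otherwise the
# original input path is returned unchanged.
#
#   get_local_path("/datasets/train_images.npy", "/scratch/local")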
def get_local_output_filepaths(input_files, dest_dir):
    """
    If we have copied the files to local disk as specified in the config,
    return those local paths. Otherwise return the original paths.
    """
    output_files = []
    for item in input_files:
        if isinstance(item, list):
            out = get_local_output_filepaths(item, dest_dir)
        else:
            out = get_local_path(item, dest_dir)
        output_files.append(out)
    return output_files
def check_data_exists(data_files):
    """
    Check that the input data files exist. If data_files is a list, we
    iteratively check each file in the list.
    """
    if isinstance(data_files, list):
        return np.all([PathManager.exists(item) for item in data_files])
    else:
        return PathManager.exists(data_files)
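
# Example (illustrative, hypothetical paths): a single path returns a bool;
# a list returns True only if every listed file exists.
#
#   check_data_exists("/data/train_images.npy")
#   check_data_exists(["/data/train_images.npy", "/data/train_labels.npy"])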
def register_pascal_voc():
    """
    Register PASCAL VOC 2007 and 2012 datasets to the data catalog. We first
    look up the paths of these datasets in the dataset catalog; if the paths
    exist, we register, otherwise we remove the voc_data from the catalog
    registry.
    """
    voc_datasets = ["voc2007_folder", "voc2012_folder"]
    for voc_data in voc_datasets:
        data_info = VisslDatasetCatalog.get(voc_data)
        data_folder = data_info["train"][0]
        if PathManager.exists(data_folder):
            train_data_info = get_voc_images_labels_info("train", data_folder)
            test_data_info = get_voc_images_labels_info("val", data_folder)
            data_info["train"] = train_data_info
            data_info["val"] = test_data_info
            VisslDatasetCatalog.remove(voc_data)
            VisslDatasetCatalog.register_data(voc_data, data_info)
        else:
            VisslDatasetCatalog.remove(voc_data)
def register_coco():
    """
    Register the COCO 2014 dataset to the data catalog. We first look up the
    path of this dataset in the dataset catalog; if the path exists, we
    register, otherwise we remove the coco2014_folder from the catalog
    registry.
    """
    data_info = VisslDatasetCatalog.get("coco2014_folder")
    data_folder = data_info["train"][0]
    if PathManager.exists(data_folder):
        train_data_info = get_coco_imgs_labels_info("train", data_folder)
        test_data_info = get_coco_imgs_labels_info("val", data_folder)
        data_info["train"] = train_data_info
        data_info["val"] = test_data_info
        VisslDatasetCatalog.remove("coco2014_folder")
        VisslDatasetCatalog.register_data("coco2014_folder", data_info)
    else:
        VisslDatasetCatalog.remove("coco2014_folder")
def register_datasets(json_catalog_path):
    """
    If the json dataset_catalog file is found, we register the datasets
    specified in the catalog with VISSL. If the catalog also specifies VOC or
    COCO datasets, we register them.

    Args:
        json_catalog_path (str): the path to the json dataset catalog
    """
    if PathManager.exists(json_catalog_path):
        logging.info(f"Registering datasets: {json_catalog_path}")
        VisslDatasetCatalog.clear()
        VisslDatasetCatalog.register_json(json_catalog_path)
    if VisslDatasetCatalog.has_data("voc2007_folder") or VisslDatasetCatalog.has_data(
        "voc2012_folder"
    ):
        register_pascal_voc()
    if VisslDatasetCatalog.has_data("coco2014_folder"):
        register_coco()
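
# A minimal sketch of what an entry in the dataset_catalog.json file could
# look like, assuming each split maps to a [data_path, label_path] pair as
# this module reads it (dataset names and paths are hypothetical):
#
#   {
#       "imagenet1k_folder": {
#           "train": ["/path/to/imagenet1k/train", "<lbl_path>"],
#           "val": ["/path/to/imagenet1k/val", "<lbl_path>"]
#       }
#   }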
def get_data_files(split, dataset_config):
    """
    Get the path to the dataset (images and labels).

    1. If the user has explicitly specified the data_sources, we simply use
       those and don't do lookup in the datasets registered with VISSL from
       the dataset catalog.
    2. If the user hasn't specified the path, look for the dataset in the
       datasets catalog registered with VISSL.

    For a given list of datasets and a given partition (train/test), we first
    verify that we have the dataset and the correct source as specified by the
    user. Then for each dataset in the list, we get the data path (make sure
    it exists, sources match). The label file is optional. Once we have the
    original dataset paths, we replace them with the local paths if the data
    was copied to local disk.
    """
    assert len(dataset_config[split].DATASET_NAMES) == len(
        dataset_config[split].DATA_SOURCES
    ), "len(data_sources) != len(dataset_names)"
    if len(dataset_config[split].DATA_PATHS) > 0:
        assert len(dataset_config[split].DATA_SOURCES) == len(
            dataset_config[split].DATA_PATHS
        ), "len(data_sources) != len(data_paths)"
    data_files, label_files = [], []
    data_names = dataset_config[split].DATASET_NAMES
    data_sources = dataset_config[split].DATA_SOURCES
    data_split = "train" if split == "TRAIN" else "val"
    for idx in range(len(data_sources)):
        # if there are synthetic data sources, we set the filepaths to none
        if data_sources[idx] == "synthetic":
            data_files.append("")
            continue
        # if the user has specified the data path explicitly, we use it
        elif len(dataset_config[split].DATA_PATHS) > 0:
            data_files.append(dataset_config[split].DATA_PATHS[idx])
        # otherwise retrieve from the catalog based on the dataset name
        else:
            data_info = VisslDatasetCatalog.get(data_names[idx])
            assert len(data_info[data_split]) > 0, "data paths list is empty"
            assert check_data_exists(
                data_info[data_split][0]
            ), f"Some data files don't exist: {data_info[data_split][0]}"
            data_files.append(data_info[data_split][0])
        # labels are optional and hence we append them only if we find them
        if len(dataset_config[split].LABEL_PATHS) > 0:
            if check_data_exists(dataset_config[split].LABEL_PATHS[idx]):
                label_files.append(dataset_config[split].LABEL_PATHS[idx])
        else:
            label_data_info = VisslDatasetCatalog.get(data_names[idx])
            if check_data_exists(label_data_info[data_split][1]):
                label_files.append(label_data_info[data_split][1])
    output = [data_files, label_files]
    if dataset_config[split].COPY_TO_LOCAL_DISK:
        dest_dir = dataset_config[split]["COPY_DESTINATION_DIR"]
        local_data_files = get_local_output_filepaths(data_files, dest_dir)
        local_label_files = get_local_output_filepaths(label_files, dest_dir)
        output = [local_data_files, local_label_files]
    return output
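
# Example (illustrative): a minimal sketch of how get_data_files is called,
# assuming `cfg` is a loaded VISSL config whose DATA section provides the
# TRAIN/TEST fields used above (DATASET_NAMES, DATA_SOURCES, DATA_PATHS, ...):
#
#   data_files, label_files = get_data_files("TRAIN", dataset_config=cfg.DATA)
#   # data_files -> e.g. ["/path/to/imagenet1k/train"]; label_files may be []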
# get the path to the dataset_catalog.json file
json_catalog_file = get_json_data_catalog_file()
# register the datasets specified in the catalog with VISSL
register_datasets(json_catalog_file)