#
# MIT License
#
# Copyright (c) 2023 Mike Heddes, Igor Nunes, Pere Vergés, Denis Kleyko, and Danny Abraham
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import os
import os.path
from typing import Callable, Optional, Tuple, List, NamedTuple, Generator, Dict
import torch
from torch import Tensor
from torch.utils import data
import tarfile
import numpy as np
import torchhd
from .utils import download_file_from_google_drive
class UCIClassificationBenchmark:
    """Class for iterating over all datasets used in `Do we Need Hundreds of Classifiers to Solve Real World Classification Problems? <https://jmlr.org/papers/v15/delgado14a.html>`_ from the `UCI Machine Learning Repository <https://archive.ics.uci.edu/ml/index.php>`_.

    Args:
        root (string): Root directory containing the files of the dataset.
        download (bool): If True, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.

    """

    # All datasets included in the collection
    dataset_names = [
        "Abalone",
        "AcuteInflammation",
        "AcuteNephritis",
        "Adult",
        "Annealing",
        "Arrhythmia",
        "AudiologyStd",
        "BalanceScale",
        "Balloons",
        "Bank",
        "Blood",
        "BreastCancer",
        "BreastCancerWisc",
        "BreastCancerWiscDiag",
        "BreastCancerWiscProg",
        "BreastTissue",
        "Car",
        "Cardiotocography10Clases",
        "Cardiotocography3Clases",
        "ChessKrvk",
        "ChessKrvkp",
        "CongressionalVoting",
        "ConnBenchSonarMinesRocks",
        "ConnBenchVowelDeterding",
        "Connect4",
        "Contrac",
        "CreditApproval",
        "CylinderBands",
        "Dermatology",
        "Echocardiogram",
        "Ecoli",
        "EnergyY1",
        "EnergyY2",
        "Fertility",
        "Flags",
        "Glass",
        "HabermanSurvival",
        "HayesRoth",
        "HeartCleveland",
        "HeartHungarian",
        "HeartSwitzerland",
        "HeartVa",
        "Hepatitis",
        "HillValley",
        "HorseColic",
        "IlpdIndianLiver",
        "ImageSegmentation",
        "Ionosphere",
        "Iris",
        "LedDisplay",
        "Lenses",
        "Letter",
        "Libras",
        "LowResSpect",
        "LungCancer",
        "Lymphography",
        "Magic",
        "Mammographic",
        "Miniboone",
        "MolecBiolPromoter",
        "MolecBiolSplice",
        "Monks1",
        "Monks2",
        "Monks3",
        "Mushroom",
        "Musk1",
        "Musk2",
        "Nursery",
        "OocytesMerlucciusNucleus4d",
        "OocytesMerlucciusStates2f",
        "OocytesTrisopterusNucleus2f",
        "OocytesTrisopterusStates5b",
        "Optical",
        "Ozone",
        "PageBlocks",
        "Parkinsons",
        "Pendigits",
        "Pima",
        "PittsburgBridgesMaterial",
        "PittsburgBridgesRelL",
        "PittsburgBridgesSpan",
        "PittsburgBridgesTOrD",
        "PittsburgBridgesType",
        "Planning",
        "PlantMargin",
        "PlantShape",
        "PlantTexture",
        "PostOperative",
        "PrimaryTumor",
        "Ringnorm",
        "Seeds",
        "Semeion",
        "Soybean",
        "Spambase",
        "Spect",
        "Spectf",
        "StatlogAustralianCredit",
        "StatlogGermanCredit",
        "StatlogHeart",
        "StatlogImage",
        "StatlogLandsat",
        "StatlogShuttle",
        "StatlogVehicle",
        "SteelPlates",
        "SyntheticControl",
        "Teaching",
        "Thyroid",
        "TicTacToe",
        "Titanic",
        "Trains",
        "Twonorm",
        "VertebralColumn2Clases",
        "VertebralColumn3Clases",
        "WallFollowing",
        "Waveform",
        "WaveformNoise",
        "Wine",
        "WineQualityRed",
        "WineQualityWhite",
        "Yeast",
        "Zoo",
    ]

    # Specify namedtuple format
    class DatasetEntry(NamedTuple):
        """One benchmark item: the dataset name with its train and test splits."""

        name: str
        train: data.Dataset
        test: data.Dataset

    def __init__(
        self,
        root: str,
        download: bool,
    ):
        super().__init__()
        self.root = root
        self.download = download
        # Per dataset: a flat list of metrics (train/test datasets) or, once the
        # first fold is reported, one list of metrics per fold (k-fold datasets).
        self.statistics = {key: [] for key in self.dataset_names}

    def datasets(self) -> Generator[DatasetEntry, None, None]:
        """Returns an iterator over all datasets in the benchmark.

        Datasets whose class defines ``num_folds`` yield one entry per fold;
        all other datasets yield a single train/test entry. The train split is
        built with the configured ``download`` flag, the test split always with
        ``download=False``.
        """
        # For all datasets in the collection
        for dataset_name in self.dataset_names:
            # Fetch the current dataset
            dataset = getattr(torchhd.datasets, dataset_name)

            # If no separate test dataset available - do k-fold cross-validation
            if hasattr(dataset, "num_folds"):
                for fold_id in range(dataset.num_folds):
                    # Set test and train datasets for the current fold
                    train_ds = dataset(
                        self.root, train=True, download=self.download, fold=fold_id
                    )
                    test_ds = dataset(
                        self.root, train=False, download=False, fold=fold_id
                    )
                    yield self.DatasetEntry(dataset_name, train_ds, test_ds)

            # Case of available test set
            else:
                # Set test and train datasets
                train_ds = dataset(self.root, train=True, download=self.download)
                test_ds = dataset(self.root, train=False, download=False)
                yield self.DatasetEntry(dataset_name, train_ds, test_ds)

    def report(self, dataset: DatasetEntry, metric: float) -> None:
        """Report the metric, e.g., accuracy, of the current dataset.

        Args:
            dataset (DatasetEntry): Entry obtained from :meth:`datasets`.
            metric (float): Value to record for this dataset (and fold, if any).

        Raises:
            KeyError: If ``dataset.name`` is not one of ``dataset_names``.
        """
        # Update statistics for the current run if the dataset uses cross-validation
        if hasattr(dataset.train, "num_folds"):
            num_folds = dataset.train.num_folds
            fold_idx = dataset.train.fold

            if not self.statistics[dataset.name]:
                # Create a new nested list for each fold
                self.statistics[dataset.name] = [[] for _ in range(num_folds)]

            self.statistics[dataset.name][fold_idx].append(metric)

        # Update statistics for the current run if the dataset has train/test split
        else:
            self.statistics[dataset.name].append(metric)

    def score(self) -> Dict[str, List[float]]:
        """Get the score on each dataset, averaged over cross-fold validation.

        Returns:
            Dict[str, List[float]]: For every dataset name, one value per
            repetition. For k-fold datasets each value is the mean over folds;
            repetitions for which not every fold has been reported are left out.
            The returned lists are copies, so modifying them does not alter the
            recorded statistics.
        """
        results = {}
        for key, history in self.statistics.items():
            # If applicable average over folds
            if history and isinstance(history[0], list):
                # zip stops at the shortest fold, so only complete repetitions remain
                group_by_repetition = zip(*history)

                # If division by zero occurs keep empty. Only that error is
                # tolerated; anything else (e.g. a non-numeric metric) propagates
                # instead of being silently turned into an empty result.
                try:
                    results[key] = [
                        sum(metrics) / len(metrics) for metrics in group_by_repetition
                    ]
                except ZeroDivisionError:
                    results[key] = []
            else:
                results[key] = list(history)

        return results
class CollectionDataset(data.Dataset):
    """Generic class for loading datasets used in `Do we Need Hundreds of Classifiers to Solve Real World Classification Problems? <https://jmlr.org/papers/v15/delgado14a.html>`_.

    Subclasses must set ``name`` and implement ``_check_integrity`` and
    ``_load_data`` (the latter is expected to populate ``data`` and ``targets``).

    Args:
        root (string): Root directory containing the files of the dataset.
        transform (callable, optional): A function/transform that takes in an torch.FloatTensor
            and returns a transformed version.
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        download (bool, optional): If True, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.

    Raises:
        RuntimeError: If the integrity check fails after the optional download.

    """

    # Name of the dataset. Used when extracting the corresponding files from archive; needs to be specified by subclasses
    name: str
    data: Tensor
    targets: Tensor

    def __init__(
        self,
        root: str,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ):
        # Every dataset lives in its own sub-directory named after the dataset
        root = os.path.join(root, self.name)
        root = os.path.expanduser(root)
        self.root = root
        os.makedirs(self.root, exist_ok=True)

        self.transform = transform
        self.target_transform = target_transform

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError(
                "Dataset not found or corrupted. You can try to specify download=True to download it"
            )

        self._load_data()

    def _check_integrity(self) -> bool:
        """Return True when all files required by the dataset are present.

        Must be overridden. Raising (instead of returning the truthy
        ``NotImplemented`` constant) keeps a missing override from being
        mistaken for a successful integrity check.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement _check_integrity()"
        )

    def _load_data(self):
        """Load the dataset files into ``self.data`` and ``self.targets``.

        Must be overridden by subclasses.
        """
        raise NotImplementedError(f"{type(self).__name__} must implement _load_data()")

    def __len__(self) -> int:
        return self.data.size(0)

    def __repr__(self) -> str:
        return f"{self.name}({len(self)})"

    def __getitem__(self, index: int) -> Tuple[torch.FloatTensor, torch.LongTensor]:
        """
        Args:
            index (int): Index

        Returns:
            Tuple[torch.FloatTensor, torch.LongTensor]: (sample, target) where target is the index of the target class
        """
        sample = self.data[index]
        label = self.targets[index]

        # Compare against None so that callables which happen to be falsy
        # (e.g. an empty container module) are still applied.
        if self.transform is not None:
            sample = self.transform(sample)

        if self.target_transform is not None:
            label = self.target_transform(label)

        return sample, label

    def download(self):
        """Download the data if it doesn't exist already."""
        if self._check_integrity():
            print("Files are already downloaded and verified")
            return

        # original data url:
        # http://persoal.citius.usc.es/manuel.fernandez.delgado/papers/jmlr/data.tar.gz
        # The archive is shared by all datasets, so it is stored one level
        # above the dataset-specific root directory.
        data_dir = os.path.join(self.root, os.pardir)
        archive_path = os.path.join(data_dir, "data_hundreds_classifiers.tar.gz")

        if os.path.isfile(archive_path):
            print("Archive file is already downloaded")
        else:
            download_file_from_google_drive(
                "1Z3tEzCmR-yTvn1ZlAXaeAuVB5a9oCAkk", archive_path
            )

        # The archive comes from the network: when this Python version ships
        # tarfile extraction filters, use the "data" filter so that members
        # cannot be written outside of data_dir.
        extract_kwargs = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

        # Extract only the requested dataset from the archive
        with tarfile.open(archive_path) as archive:
            for member in archive.getmembers():
                if member.name.startswith(self.name):
                    archive.extract(member, data_dir, **extract_kwargs)
class DatasetFourFold(CollectionDataset):
    """Generic class for loading datasets without separate test data that were used in `Do we Need Hundreds of Classifiers to Solve Real World Classification Problems? <https://jmlr.org/papers/v15/delgado14a.html>`_.

    Args:
        root (string): Root directory containing the files of the dataset.
        train (bool, optional): If True, returns the training subset, otherwise the test subset.
            Which rows make up the subset is determined by ``hyper_search`` and ``fold``.
            Since there is no separate test file, ``train = False`` requires either ``hyper_search = True`` or ``0 <= fold <= 3``.
        fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
            Values between 0 and 3 specify, which fold in ``conxuntos_kfold.dat`` to use. Relevant only if hyper_search is set to False and ``0 <= fold <= 3``.
            Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
        hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
            while the second row corresponds to test indices (used if ``train = False``).
        transform (callable, optional): A function/transform that takes in an torch.FloatTensor
            and returns a transformed version.
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        download (bool, optional): If True, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.

    Raises:
        ValueError: If ``fold`` is outside ``[-1, num_folds - 1]``, or if the test
            subset is requested without specifying a fold or ``hyper_search``.

    """

    # Number of folds for cross validation
    num_folds = 4

    def __init__(
        self,
        root: str,
        train: bool = True,
        fold: int = -1,
        hyper_search: bool = False,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ):
        if (fold < -1) or (fold >= self.num_folds):
            raise ValueError(
                f"Fold number {fold} is not available. Fold should be between -1 and {self.num_folds - 1}"
            )

        if not train and fold == -1 and not hyper_search:
            raise ValueError(
                "This dataset does not have a separate file for test data. Please check that fold is specified correctly."
            )

        # These must be set before the parent constructor runs, because it
        # calls _load_data(), which reads them.
        self.train = train
        self.fold = fold
        self.hyper_search = hyper_search

        super().__init__(root, transform, target_transform, download)

    def _check_integrity(self) -> bool:
        """Return True if the data file and both split-index files exist in root."""
        if not os.path.isdir(self.root):
            return False

        # Check if the root directory contains the required files
        required_files = (
            self.name + "_R.dat",
            "conxuntos_kfold.dat",
            "conxuntos.dat",
        )
        # TODO: Add more specific checks like an MD5 checksum
        return all(
            os.path.isfile(os.path.join(self.root, file_name))
            for file_name in required_files
        )

    @staticmethod
    def _read_split_indices(path: str, row: int) -> Tensor:
        """Parse line ``row`` of a split file into a tensor of int64 indices."""
        with open(path, "r") as file:
            lines = file.readlines()

        indices = np.fromstring(lines[row], sep=" ", dtype=np.int64)
        return torch.from_numpy(indices)

    def _load_data(self):
        """Read the data file and keep only the rows of the requested split."""
        data_path = os.path.join(self.root, self.name + "_R.dat")
        # The first line is skipped as a header. ndmin=2 keeps the table
        # two-dimensional even when the file holds a single sample, so the
        # column slicing below stays valid.
        table = np.loadtxt(data_path, skiprows=1, dtype=np.float32, ndmin=2)

        # Separate the targets from the data: the last column holds the
        # targets, the first column is dropped (presumably a row id - confirm).
        targets = torch.from_numpy(table[:, -1].astype(np.int64))
        samples = torch.from_numpy(table[:, 1:-1])

        # Files with pre-generated folds with indices
        indices = None
        if self.hyper_search:
            # Split used in hyperparameter search: row 0 is train, row 1 is test
            hyper_split_path = os.path.join(self.root, "conxuntos.dat")
            indices = self._read_split_indices(
                hyper_split_path, 0 if self.train else 1
            )
        elif self.fold != -1:
            # Each fold occupies two consecutive rows: train first, then test
            k_fold_path = os.path.join(self.root, "conxuntos_kfold.dat")
            row = self.fold * 2 + (0 if self.train else 1)
            indices = self._read_split_indices(k_fold_path, row)

        if indices is not None:
            samples = samples[indices]
            targets = targets[indices]

        self.data = samples
        self.targets = targets
class DatasetTrainTest(CollectionDataset):
    """Generic class for loading datasets with separate files for train and test data that were used in `Do we Need Hundreds of Classifiers to Solve Real World Classification Problems? <https://jmlr.org/papers/v15/delgado14a.html>`_.

    Args:
        root (string): Root directory containing the files of the dataset.
        train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by hyper_search variable.
            Otherwise returns a subset of train dataset if hyperparameter search is performed (``hyper_search = True``) if not (``hyper_search = False``) returns test set.
        hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
            while the second row corresponds to test indices (used if ``train = False``).
        transform (callable, optional): A function/transform that takes in an torch.FloatTensor
            and returns a transformed version.
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
        download (bool, optional): If True, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.

    """

    def __init__(
        self,
        root: str,
        train: bool = True,
        hyper_search: bool = False,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ):
        # These must be set before the parent constructor runs, because it
        # calls _check_integrity() and _load_data(), which read them.
        self.train = train
        self.hyper_search = hyper_search

        super().__init__(root, transform, target_transform, download)

    def _check_integrity(self) -> bool:
        """Return True if every file needed for the requested split exists in root.

        The train file and both split-index files are always required. The test
        file is additionally required when it is the file ``_load_data`` will
        read, so that a missing test file is reported as a missing/corrupted
        dataset rather than as a low-level file error during loading.
        """
        if not os.path.isdir(self.root):
            return False

        # Check if the root directory contains the required files
        required_files = [
            self.name + "_train_R.dat",
            "conxuntos_kfold.dat",
            "conxuntos.dat",
        ]
        if not self.train and not self.hyper_search:
            required_files.append(self.name + "_test_R.dat")

        # TODO: Add more specific checks like an MD5 checksum
        return all(
            os.path.isfile(os.path.join(self.root, file_name))
            for file_name in required_files
        )

    def _load_data(self):
        """Read the train or test file and apply the hyper-search split if requested."""
        # The hyperparameter-search split is always carved out of the train file
        if self.train or self.hyper_search:
            data_name = self.name + "_train_R.dat"
        else:
            data_name = self.name + "_test_R.dat"

        data_path = os.path.join(self.root, data_name)
        # The first line is skipped as a header. ndmin=2 keeps the table
        # two-dimensional even when the file holds a single sample, so the
        # column slicing below stays valid.
        table = np.loadtxt(data_path, skiprows=1, dtype=np.float32, ndmin=2)

        # Separate the targets from the data: the last column holds the
        # targets, the first column is dropped (presumably a row id - confirm).
        targets = torch.from_numpy(table[:, -1].astype(np.int64))
        samples = torch.from_numpy(table[:, 1:-1])

        # Load fold used in hyperparameter search if necessary
        if self.hyper_search:
            # Files with pre-generated folds with indices
            hyper_split_path = os.path.join(self.root, "conxuntos.dat")
            # Row 0 holds the train indices, row 1 the test indices
            line_idx = 0 if self.train else 1
            with open(hyper_split_path, "r") as file:
                lines = file.readlines()

            indices = np.fromstring(lines[line_idx], sep=" ", dtype=np.int64)
            indices = torch.from_numpy(indices)

            samples = samples[indices]
            targets = targets[indices]

        self.data = samples
        self.targets = targets