Source code for torchhd.datasets.european_languages

#
# MIT License
#
# Copyright (c) 2023 Mike Heddes, Igor Nunes, Pere Vergés, Denis Kleyko, and Danny Abraham
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import os
from typing import Callable, Optional, Tuple, List
import torch
import torch.utils.data as data

from .utils import download_file_from_google_drive, unzip_file


[docs] class EuropeanLanguages(data.Dataset): """European Languages dataset. As used in the paper `"A Robust and Energy-Efficient Classifier Using Brain-Inspired Hyperdimensional Computing" <https://iis-people.ee.ethz.ch/~arahimi/papers/ISLPED16.pdf>`_. The dataset contains sentences in 21 European languages, the training data was taken from `Wortschatz Corpora <https://wortschatz.uni-leipzig.de/en/download>`_ and the testing data from `Europarl Parallel Corpus <https://www.statmt.org/europarl/>`_. Args: root (string): Root directory of dataset where the training and testing samples are located. train (bool, optional): If True, creates dataset from Wortschatz Corpora, otherwise from Europarl Parallel Corpus. download (bool, optional): If True, downloads the dataset from the internet and puts it in root directory. If dataset is already downloaded, it is not downloaded again. transform (callable, optional): A function/transform that takes in an torch.LongTensor and returns a transformed version. target_transform (callable, optional): A function/transform that takes in the target and transforms it. """ classes: List[str] = [ "Bulgarian", "Czech", "Danish", "Dutch", "German", "English", "Estonian", "Finnish", "French", "Greek", "Hungarian", "Italian", "Latvian", "Lithuanian", "Polish", "Portuguese", "Romanian", "Slovak", "Slovenian", "Spanish", "Swedish", ] files: List[str] = [ "bul.txt", "ces.txt", "dan.txt", "nld.txt", "deu.txt", "eng.txt", "est.txt", "fin.txt", "fra.txt", "ell.txt", "hun.txt", "ita.txt", "lav.txt", "lit.txt", "pol.txt", "por.txt", "ron.txt", "slk.txt", "slv.txt", "spa.txt", "swe.txt", ] def __init__( self, root: str, train: bool = True, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None, download: bool = False, ): root = os.path.join(root, "language-recognition") root = os.path.expanduser(root) self.root = root os.makedirs(self.root, exist_ok=True) self.train = train self.transform = transform self.target_transform = target_transform if download: self.download() if not self._check_integrity(): raise RuntimeError( "Dataset not found or corrupted. You can use download=True to download it" ) self._load_data() def __len__(self) -> int: return self.targets.size(0) def __getitem__(self, index) -> Tuple[str, torch.LongTensor]: """ Args: index (int): Index Returns: Tuple[str, torch.LongTensor]: (sample, target) where target is the index of the target class """ sample = self.data[index] target = self.targets[index] if self.transform: sample = self.transform(sample) if self.target_transform: target = self.target_transform(target) return sample, target def _check_integrity(self) -> bool: if not os.path.isdir(self.root): return False train_dir = os.path.join(self.root, "training") has_train_dir = os.path.isdir(train_dir) test_dir = os.path.join(self.root, "testing") has_test_dir = os.path.isdir(test_dir) if not has_train_dir or not has_test_dir: return False for file in self.files: has_train_file = os.path.isfile(os.path.join(train_dir, file)) if not has_train_file: return False has_test_file = os.path.isfile(os.path.join(test_dir, file)) if not has_test_file: return False return True def _load_data(self): data_dir = os.path.join(self.root, "training" if self.train else "testing") data = [] targets = [] for class_label, filename in enumerate(self.files): with open(os.path.join(data_dir, filename), "r") as file: lines = file.readlines() lines = map(self._clean_line, lines) lines = filter(self._filter_line, lines) lines = list(lines) data += lines targets += [class_label] * len(lines) self.data = data self.targets = torch.tensor(targets, dtype=torch.long) def _clean_line(self, line): line = line.strip() # remove space at start and end line = " ".join(line.split()) # compact any whitespace to a single space return line def _filter_line(self, line): return line != "" def download(self): """Download the data if it doesn't exist already.""" if self._check_integrity(): print("Files already downloaded and verified") return zip_file_path = os.path.join(self.root, "data.zip") download_file_from_google_drive( "1zCvjPf0R5pOR46CNBNMM60b_LwQKvltI", zip_file_path ) unzip_file(zip_file_path, self.root) os.remove(zip_file_path)