# Source code for torchhd.datasets.european_languages

#
# MIT License
#
# Copyright (c) 2023 Mike Heddes, Igor Nunes, Pere Vergés, Denis Kleyko, and Danny Abraham
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
import os
from typing import Callable, Optional, Tuple, List
import torch
import torch.utils.data as data

from .utils import download_file_from_google_drive, unzip_file



# [docs]
class EuropeanLanguages(data.Dataset):
    """European Languages dataset.

    As used in the paper `"A Robust and Energy-Efficient Classifier Using
    Brain-Inspired Hyperdimensional Computing" <https://iis-people.ee.ethz.ch/~arahimi/papers/ISLPED16.pdf>`_.
    The dataset contains sentences in 21 European languages,
    the training data was taken from `Wortschatz Corpora <https://wortschatz.uni-leipzig.de/en/download>`_
    and the testing data from `Europarl Parallel Corpus <https://www.statmt.org/europarl/>`_.

    Each sample is one whitespace-normalized, non-empty line of text (a ``str``) and
    each target is the index of the sample's language in :attr:`classes`.

    Args:
        root (string): Root directory of the dataset. The training and testing samples
            are stored in (and read from) its ``language-recognition`` subdirectory.
        train (bool, optional): If True, creates dataset from Wortschatz Corpora,
            otherwise from Europarl Parallel Corpus.
        download (bool, optional): If True, downloads the dataset from the internet and
            puts it in root directory. If dataset is already downloaded, it is not
            downloaded again.
        transform (callable, optional): A function/transform that takes in a sample
            (a string containing one sentence) and returns a transformed version.
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.

    Raises:
        RuntimeError: If the expected directory layout or any of the language files
            is missing after the optional download step.
    """

    # Human-readable class names; position i corresponds to ``files[i]``.
    classes: List[str] = [
        "Bulgarian",
        "Czech",
        "Danish",
        "Dutch",
        "German",
        "English",
        "Estonian",
        "Finnish",
        "French",
        "Greek",
        "Hungarian",
        "Italian",
        "Latvian",
        "Lithuanian",
        "Polish",
        "Portuguese",
        "Romanian",
        "Slovak",
        "Slovenian",
        "Spanish",
        "Swedish",
    ]

    # File name per language; the position in this list is the class label.
    files: List[str] = [
        "bul.txt",
        "ces.txt",
        "dan.txt",
        "nld.txt",
        "deu.txt",
        "eng.txt",
        "est.txt",
        "fin.txt",
        "fra.txt",
        "ell.txt",
        "hun.txt",
        "ita.txt",
        "lav.txt",
        "lit.txt",
        "pol.txt",
        "por.txt",
        "ron.txt",
        "slk.txt",
        "slv.txt",
        "spa.txt",
        "swe.txt",
    ]

    def __init__(
        self,
        root: str,
        train: bool = True,
        transform: Optional[Callable] = None,
        target_transform: Optional[Callable] = None,
        download: bool = False,
    ):
        # All dataset files live in a dedicated subdirectory of the given root.
        root = os.path.join(root, "language-recognition")
        root = os.path.expanduser(root)
        self.root = root
        os.makedirs(self.root, exist_ok=True)

        self.train = train
        self.transform = transform
        self.target_transform = target_transform

        if download:
            self.download()

        if not self._check_integrity():
            raise RuntimeError(
                "Dataset not found or corrupted. You can use download=True to download it"
            )

        self._load_data()

    def __len__(self) -> int:
        """Return the number of samples in the selected split."""
        return self.targets.size(0)

    def __getitem__(self, index: int) -> Tuple[str, torch.LongTensor]:
        """
        Args:
            index (int): Index

        Returns:
            Tuple[str, torch.LongTensor]: (sample, target) where target is the index of the target class
        """
        sample = self.data[index]
        target = self.targets[index]

        if self.transform:
            sample = self.transform(sample)

        if self.target_transform:
            target = self.target_transform(target)

        return sample, target

    def _check_integrity(self) -> bool:
        """Return True if both split directories exist and contain every language file."""
        if not os.path.isdir(self.root):
            return False

        split_dirs = [
            os.path.join(self.root, split) for split in ("training", "testing")
        ]
        if not all(os.path.isdir(split_dir) for split_dir in split_dirs):
            return False

        # Every language must be present in both the training and the testing split.
        return all(
            os.path.isfile(os.path.join(split_dir, filename))
            for filename in self.files
            for split_dir in split_dirs
        )

    def _load_data(self) -> None:
        """Read the selected split into ``self.data`` and ``self.targets``.

        ``self.data`` becomes a list of cleaned, non-empty lines and ``self.targets``
        a ``torch.long`` tensor holding, for each line, the index of its source file
        in :attr:`files`.
        """
        split_dir = os.path.join(self.root, "training" if self.train else "testing")

        samples: List[str] = []
        targets: List[int] = []

        for class_label, filename in enumerate(self.files):
            # Decode explicitly as UTF-8 so that loading does not depend on the
            # platform's locale encoding.
            # NOTE(review): assumes the distributed files are UTF-8 (or ASCII).
            path = os.path.join(split_dir, filename)
            with open(path, "r", encoding="utf-8") as file:
                lines = [
                    line
                    for line in map(self._clean_line, file)
                    if self._filter_line(line)
                ]

            samples.extend(lines)
            targets.extend([class_label] * len(lines))

        self.data = samples
        self.targets = torch.tensor(targets, dtype=torch.long)

    def _clean_line(self, line: str) -> str:
        """Trim the line and collapse every run of whitespace into a single space."""
        # ``split()`` without arguments drops leading/trailing whitespace as well,
        # so no separate ``strip()`` is needed.
        return " ".join(line.split())

    def _filter_line(self, line: str) -> bool:
        """Return True for lines that should be kept, i.e. non-empty ones."""
        return line != ""

    def download(self):
        """Download the data if it doesn't exist already."""

        if self._check_integrity():
            print("Files already downloaded and verified")
            return

        # The archive is fetched into the dataset directory, extracted in place
        # and removed afterwards.
        zip_file_path = os.path.join(self.root, "data.zip")
        download_file_from_google_drive(
            "1zCvjPf0R5pOR46CNBNMM60b_LwQKvltI", zip_file_path
        )

        unzip_file(zip_file_path, self.root)
        os.remove(zip_file_path)