Source code for torchtime.datasets.ucruea

import os
import shutil
import warnings
import zipfile
from typing import Optional, Callable, Tuple, Any, List
from urllib.error import URLError

import torch
from torch import Tensor

from . import TimeSeriesDataset
from .utils import download_and_extract_archive
from ..exceptions import DataConversionWarning
from ..io.arff import load_from_arff_to_dataframe
from ..io.ts import TSFileLoader
from ..transforms import functional as F
from ..utils import stack_pad


class UCR(TimeSeriesDataset):
    """*UEA & UCR Time Series Classification Repository* :cite:`Dau2019UCR`.

    Args:
        root (str): Root directory of the dataset where ``UCR/<dataset_name>`` exists.
        name (str): The name of the dataset to load from the UCR archive.
        train (bool, optional): If True, creates the dataset from ``TRAIN.ts`` or
            ``TRAIN.arff``, otherwise from ``TEST.ts`` or ``TEST.arff``.
        download (bool, optional): If True, downloads the dataset from the internet and
            puts it in the root directory. If the dataset is already downloaded, it is
            not downloaded again.
        classes (list, optional): A list containing the class labels, where the index in
            the list indicates the numeric value a label is mapped to.
        transform (callable, optional): A function/transform that takes in a uni- or
            multivariate time series and returns a transformed version,
            e.g. ``transforms.NaN2Value``.
        target_transform (callable, optional): A function/transform that takes in the
            target and transforms it.
    """

    _repr_indent = 4

    mirrors = [
        'http://timeseriesclassification.com/aeon-toolkit/'
    ]

    univariate = [
        'AbnormalHeartbeat', 'ACSF1', 'Adiac', 'AllGestureWiimoteX', 'AllGestureWiimoteY',
        'AllGestureWiimoteZ', 'ArrowHead', 'Beef', 'BeetleFly', 'BirdChicken', 'BME', 'Car',
        'CBF', 'Chinatown', 'ChlorineConcentration', 'CinCECGTorso', 'Coffee', 'Computers',
        'CricketX', 'CricketY', 'CricketZ', 'Crop', 'DiatomSizeReduction',
        'DistalPhalanxOutlineAgeGroup', 'DistalPhalanxOutlineCorrect', 'DistalPhalanxTW',
        'DodgerLoopDay', 'DodgerLoopGame', 'DodgerLoopWeekend', 'Earthquakes', 'ECG200',
        'ECG5000', 'ECGFiveDays', 'ElectricDevices', 'EOGHorizontalSignal', 'EOGVerticalSignal',
        'EthanolLevel', 'FaceAll', 'FaceFour', 'FacesUCR', 'FiftyWords', 'Fish', 'FordA',
        'FordB', 'FreezerRegularTrain', 'FreezerSmallTrain', 'Fungi', 'GestureMidAirD1',
        'GestureMidAirD2', 'GestureMidAirD3', 'GesturePebbleZ1', 'GesturePebbleZ2', 'GunPoint',
        'GunPointAgeSpan', 'GunPointMaleVersusFemale', 'GunPointOldVersusYoung', 'Ham',
        'HandOutlines', 'Haptics', 'Herring', 'HouseTwenty', 'InlineSkate',
        'InsectEPGRegularTrain', 'InsectEPGSmallTrain', 'InsectWingbeatSound',
        'ItalyPowerDemand', 'LargeKitchenAppliances', 'Lightning2', 'Lightning7', 'Mallat',
        'Meat', 'MedicalImages', 'MelbournePedestrian', 'MiddlePhalanxOutlineAgeGroup',
        'MiddlePhalanxOutlineCorrect', 'MiddlePhalanxTW', 'MixedShapesRegularTrain',
        'MixedShapesSmallTrain', 'MoteStrain', 'NonInvasiveFetalECGThorax1',
        'NonInvasiveFetalECGThorax2', 'OliveOil', 'OSULeaf', 'PhalangesOutlinesCorrect',
        'Phoneme', 'PickupGestureWiimoteZ', 'PigAirwayPressure', 'PigArtPressure', 'PigCVP',
        'PLAID', 'Plane', 'PowerCons', 'ProximalPhalanxOutlineAgeGroup',
        'ProximalPhalanxOutlineCorrect', 'ProximalPhalanxTW', 'RefrigerationDevices', 'Rock',
        'ScreenType', 'SemgHandGenderCh2', 'SemgHandMovementCh2', 'SemgHandSubjectCh2',
        'ShakeGestureWiimoteZ', 'ShapeletSim', 'ShapesAll', 'SmallKitchenAppliances',
        'SmoothSubspace', 'SonyAIBORobotSurface1', 'SonyAIBORobotSurface2', 'StarLightCurves',
        'Strawberry', 'SwedishLeaf', 'Symbols', 'SyntheticControl', 'ToeSegmentation1',
        'ToeSegmentation2', 'Trace', 'TwoLeadECG', 'TwoPatterns', 'UMD',
        'UWaveGestureLibraryAll', 'UWaveGestureLibraryX', 'UWaveGestureLibraryY',
        'UWaveGestureLibraryZ', 'Wafer', 'Wine', 'WordSynonyms', 'Worms', 'WormsTwoClass',
        'Yoga'
    ]

    multivariate = [
        'ArticularyWordRecognition', 'AtrialFibrillation', 'BasicMotions',
        'CharacterTrajectories', 'Cricket', 'DuckDuckGeese', 'EigenWorms', 'Epilepsy', 'ERing',
        'EthanolConcentration', 'FaceDetection', 'FingerMovements', 'HandMovementDirection',
        'Handwriting', 'Heartbeat', 'InsectWingbeat', 'JapaneseVowels', 'Libras', 'LSST',
        'MotorImagery', 'NATOPS', 'PEMS-SF', 'PenDigits', 'PhonemeSpectra', 'RacketSports',
        'SelfRegulationSCP1', 'SelfRegulationSCP2', 'SpokenArabicDigits', 'StandWalkJump',
        'UWaveGestureLibrary'
    ]

    @property
    def data_dir(self) -> str:
        return os.path.join(self.root, self.__class__.__name__, self.name)

    @property
    def training_file(self) -> str:
        return self.name + "_TRAIN"

    @property
    def test_file(self) -> str:
        return self.name + "_TEST"

    archive_format = ".zip"

    @property
    def channels(self) -> int:
        return self.data.shape[1]

    def __init__(
        self,
        name: str,
        root: str,
        train: bool = True,
        download: bool = False,
        classes: Optional[List[Any]] = None,
        **kwargs
    ) -> None:
        super(UCR, self).__init__(root, **kwargs)
        self.name = name
        self.root = root
        self.train = train  # training set or test set

        if classes is not None and len(set(classes)) != len(classes):
            raise ValueError("Class mapping contains duplicate labels!")
        self.classes = classes

        if download:
            self.download()

        if not self._check_exists():
            raise RuntimeError("Dataset not found. You can use download=True to download it.")

        self.data, self.targets, self.classes = self._load_data()

    def _load_data(self) -> Tuple[Tensor, Tensor, List[Any]]:
        file_name = (self.training_file if self.train else self.test_file) + ".ts"
        abspath = os.path.join(self.data_dir, file_name)
        if os.path.exists(abspath):
            file_loader = TSFileLoader(abspath)
            data, targets = file_loader.as_tensor(return_targets=True)
            classes = file_loader.get_classes()
            self.dim = (file_loader.dim, file_loader.series_length)
        else:
            file_name = (self.training_file if self.train else self.test_file) + ".arff"
            abspath = os.path.join(self.data_dir, file_name)
            data, targets, classes = load_from_arff_to_dataframe(abspath, return_class_labels=True)
            data_ = []
            for i in range(data.shape[-1]):
                # to_time_series_dataset(data[f'dim_{i}'])
                # stack arrays even if they have different lengths
                data_.append(stack_pad(data[f'dim_{i}']))
            data = torch.permute(torch.stack(data_, dim=-1), (0, 2, 1))
            self.dim = data.size()[1:]
        # data = match_seq_len(data)

        if self.classes is not None and classes is not None:
            if len(self.classes) != len(classes):
                raise ValueError(
                    "Number of classes present in {} is different from the provided "
                    "class mapping.".format(abspath)
                )
            if self.classes != classes:
                warnings.warn("The classes provided do not match the classes from the dataset!")

        try:
            # targets = list(map(int, targets))
            targets = torch.as_tensor(targets)
            if classes is None:
                classes = torch.unique(targets).tolist()
        except (ValueError, TypeError):
            if self.classes is not None:
                _, targets = F.encode_labels(targets, self.classes)
            else:
                if classes is not None:
                    warnings.warn(
                        "Labels are not numeric and no explicit class mapping is provided. "
                        "Please pass an explicit class mapping to the constructor. "
                        "Using class labels parsed from dataset.",
                        DataConversionWarning, stacklevel=2
                    )
                    _, targets = F.encode_labels(targets, classes)
                else:
                    warnings.warn(
                        "Dataset did not include explicit class labels. "
                        "Inferring class labels from targets.",
                        DataConversionWarning, stacklevel=2
                    )
                    classes, targets = F.encode_labels(targets)
        self.classes = classes

        return data, targets, classes

    def _ensure_structure(self) -> None:
        """Remove nested data directory, which seems to exist for some UCR/UEA time
        series datasets."""
        with os.scandir(self.data_dir) as d:
            for entry in d:
                if entry.is_dir() and entry.name == self.name:
                    with os.scandir(os.path.join(self.data_dir, entry.name)) as nested_dir:
                        for f in nested_dir:
                            shutil.move(f.path, self.data_dir)
                    shutil.rmtree(os.path.join(self.data_dir, entry.name))

    def _check_exists(self) -> bool:
        # TODO: maybe get rid of multiple archive extractions and check ts/arff file checksum instead
        return os.path.exists(self.data_dir) and os.path.isdir(self.data_dir)
        # contents = [f for f in os.listdir(self.data_dir) if os.path.isfile(os.path.join(self.data_dir, f))]

    def download(self) -> None:
        """Download the UCR/UEA data if it doesn't exist already."""
        if self._check_exists():
            return

        os.makedirs(self.root, exist_ok=True)

        # download files
        filename = self.name + self.archive_format
        for mirror in self.mirrors:
            url = "{}{}".format(mirror, filename)
            try:
                print("Downloading {}".format(url))
                download_and_extract_archive(
                    url, download_root=self.data_dir, filename=filename, md5=None
                )
                self._ensure_structure()
            except URLError as error:
                print("Failed to download (trying next):\n{}".format(error))
                continue
            except zipfile.BadZipFile as e:
                os.remove(os.path.join(self.data_dir, filename))
                if os.path.exists(self.data_dir):
                    shutil.rmtree(self.data_dir)
                raise ValueError(
                    "Invalid dataset name '{}'! Please make sure the dataset is "
                    "available on http://timeseriesclassification.com/.".format(self.name)
                ) from e
            finally:
                print()
            break
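
    # Note (added for clarity; inferred from ``_load_data`` above): ``self.data``
    # holds an entire split as one ``(num_series, channels, length)`` tensor, with
    # shorter series padded by ``stack_pad``, so indexing in ``__getitem__`` below
    # yields a single ``(channels, length)`` series.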
    def __getitem__(self, index: int) -> Tuple[Any, Any]:
        """
        Args:
            index (int): The index of the sample to return.

        Returns:
            tuple: (series, target) where target is the index of the target class.
        """
        series, target = self.data[index], self.targets[index]

        if self.transforms is not None:
            series, target = self.transforms(series, target)

        return series, target
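
    # Note (added for clarity; an assumption, mirroring torchvision's
    # ``StandardTransform`` pattern): ``self.transforms`` used above is presumably
    # the joint series/target transform that the ``TimeSeriesDataset`` base class
    # assembles from the ``transform`` and ``target_transform`` constructor
    # arguments documented in the class docstring, e.g. ``transforms.NaN2Value``.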
    def __len__(self) -> int:
        return len(self.data)

    def extra_repr(self) -> str:
        return "Dataset: {}\n".format(self.name) + \
               "Split: {}".format("Train" if self.train is True else "Test")
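

# ---------------------------------------------------------------------------
# Usage sketch (illustrative addition, not part of the original module).
# ``GunPoint`` is just one name from the ``univariate`` list above and
# ``root="data"`` is an arbitrary example path; the first run needs network
# access to download the archive from timeseriesclassification.com.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    train_set = UCR(name="GunPoint", root="data", train=True, download=True)
    loader = DataLoader(train_set, batch_size=32, shuffle=True)

    series, target = next(iter(loader))
    print(series.shape, target.shape)  # series: (batch, channels, length)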