Skip to content

Commit 0ac7881

Browse files
denkle, github-actions[bot], mikeheddes
authored
Add 20 datasets from UCI repository (#103)
* Add new datasets * [github-action] formatting fixes * Add tqdm dependency Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: mikeheddes <[email protected]>
1 parent 996a71d commit 0ac7881

21 files changed

+667
-1
lines changed

docs/datasets.rst

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,24 @@ The Torchhd library provides many popular built-in datasets to work with.
2121
CyclePowerPlant
2222
Abalone
2323
Adult
24-
24+
AcuteInflammation
25+
AcuteNephritis
26+
Annealing
27+
Arrhythmia
28+
AudiologyStd
29+
BalanceScale
30+
Balloons
31+
Bank
32+
Blood
33+
BreastCancer
34+
BreastCancerWisc
35+
BreastCancerWiscDiag
36+
BreastCancerWiscProg
37+
BreastTissue
38+
Car
39+
Cardiotocography3Clases
40+
Cardiotocography10Clases
41+
ChessKrvk
2542

2643
Base classes
2744
------------------------

docs/requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ torch
22
torchvision
33
pandas
44
requests
5+
tqdm
56
numpy
67
sphinx
78
sphinx-rtd-theme

torchhd/datasets/__init__.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,24 @@
1111
from torchhd.datasets.dataset import DatasetTrainTest
1212
from torchhd.datasets.abalone import Abalone
1313
from torchhd.datasets.adult import Adult
14+
from torchhd.datasets.acute_inflammation import AcuteInflammation
15+
from torchhd.datasets.acute_nephritis import AcuteNephritis
16+
from torchhd.datasets.annealing import Annealing
17+
from torchhd.datasets.arrhythmia import Arrhythmia
18+
from torchhd.datasets.audiology_std import AudiologyStd
19+
from torchhd.datasets.balance_scale import BalanceScale
20+
from torchhd.datasets.balloons import Balloons
21+
from torchhd.datasets.bank import Bank
22+
from torchhd.datasets.blood import Blood
23+
from torchhd.datasets.breast_cancer import BreastCancer
24+
from torchhd.datasets.breast_cancer_wisc import BreastCancerWisc
25+
from torchhd.datasets.breast_cancer_wisc_diag import BreastCancerWiscDiag
26+
from torchhd.datasets.breast_cancer_wisc_prog import BreastCancerWiscProg
27+
from torchhd.datasets.breast_tissue import BreastTissue
28+
from torchhd.datasets.car import Car
29+
from torchhd.datasets.cardiotocography_3clases import Cardiotocography3Clases
30+
from torchhd.datasets.cardiotocography_10clases import Cardiotocography10Clases
31+
from torchhd.datasets.chess_krvk import ChessKrvk
1432

1533

1634
__all__ = [
@@ -27,4 +45,22 @@
2745
"DatasetTrainTest",
2846
"Abalone",
2947
"Adult",
48+
"AcuteInflammation",
49+
"AcuteNephritis",
50+
"Annealing",
51+
"Arrhythmia",
52+
"AudiologyStd",
53+
"BalanceScale",
54+
"Balloons",
55+
"Bank",
56+
"Blood",
57+
"BreastCancer",
58+
"BreastCancerWisc",
59+
"BreastCancerWiscDiag",
60+
"BreastCancerWiscProg",
61+
"BreastTissue",
62+
"Car",
63+
"Cardiotocography3Clases",
64+
"Cardiotocography10Clases",
65+
"ChessKrvk",
3066
]
torchhd/datasets/acute_inflammation.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from typing import List
2+
from torchhd.datasets import DatasetFourFold
3+
4+
5+
class AcuteInflammation(DatasetFourFold):
6+
"""`Acute Inflammation of urinary bladder <https://archive.ics.uci.edu/ml/datasets/Acute+Inflammations>`_ dataset.
7+
8+
Args:
9+
root (string): Root directory containing the files of the dataset.
10+
train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by fold and hyper_search variables.
11+
Otherwise returns a subset of the training dataset if a hyperparameter search is performed (``hyper_search = True``); if not (``hyper_search = False``), returns a subset of the training dataset
12+
as specified in ``conxuntos_kfold.dat`` if fold number is correct. Otherwise issues an error.
13+
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
14+
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if hyper_search is set to False and ``0 <= fold <= 3``.
15+
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
16+
hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
17+
while the second row corresponds to test indices (used if ``train = False``).
18+
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
19+
and returns a transformed version.
20+
target_transform (callable, optional): A function/transform that takes in the
21+
target and transforms it.
22+
download (bool, optional): If True, downloads the dataset from the internet and
23+
puts it in root directory. If dataset is already downloaded, it is not
24+
downloaded again.
25+
"""
26+
27+
name = "acute-inflammation"
28+
classes: List[str] = [
29+
"yes",
30+
"no",
31+
]

torchhd/datasets/acute_nephritis.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from typing import List
2+
from torchhd.datasets import DatasetFourFold
3+
4+
5+
class AcuteNephritis(DatasetFourFold):
6+
"""`Acute Nephritis of renal pelvis origin <https://archive.ics.uci.edu/ml/datasets/Acute+Inflammations>`_ dataset.
7+
8+
Args:
9+
root (string): Root directory containing the files of the dataset.
10+
train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by fold and hyper_search variables.
11+
Otherwise returns a subset of the training dataset if a hyperparameter search is performed (``hyper_search = True``); if not (``hyper_search = False``), returns a subset of the training dataset
12+
as specified in ``conxuntos_kfold.dat`` if fold number is correct. Otherwise issues an error.
13+
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
14+
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if hyper_search is set to False and ``0 <= fold <= 3``.
15+
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
16+
hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
17+
while the second row corresponds to test indices (used if ``train = False``).
18+
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
19+
and returns a transformed version.
20+
target_transform (callable, optional): A function/transform that takes in the
21+
target and transforms it.
22+
download (bool, optional): If True, downloads the dataset from the internet and
23+
puts it in root directory. If dataset is already downloaded, it is not
24+
downloaded again.
25+
"""
26+
27+
name = "acute-nephritis"
28+
classes: List[str] = [
29+
"yes",
30+
"no",
31+
]

torchhd/datasets/annealing.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
from typing import List
2+
from torchhd.datasets import DatasetTrainTest
3+
4+
5+
class Annealing(DatasetTrainTest):
6+
"""`Annealing <https://archive.ics.uci.edu/ml/datasets/Annealing>`_ dataset.
7+
8+
Args:
9+
root (string): Root directory containing the files of the dataset.
10+
train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by hyper_search variable.
11+
Otherwise returns a subset of the training dataset if a hyperparameter search is performed (``hyper_search = True``); if not (``hyper_search = False``), returns the test set.
12+
hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
13+
while the second row corresponds to test indices (used if ``train = False``).
14+
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
15+
and returns a transformed version.
16+
target_transform (callable, optional): A function/transform that takes in the
17+
target and transforms it.
18+
download (bool, optional): If True, downloads the dataset from the internet and
19+
puts it in root directory. If dataset is already downloaded, it is not
20+
downloaded again.
21+
"""
22+
23+
name = "annealing"
24+
classes: List[str] = [
25+
"1",
26+
"2",
27+
"3",
28+
"4",
29+
"5",
30+
]

torchhd/datasets/arrhythmia.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from typing import List
2+
from torchhd.datasets import DatasetFourFold
3+
4+
5+
class Arrhythmia(DatasetFourFold):
6+
"""`Arrhythmia <https://archive.ics.uci.edu/ml/datasets/arrhythmia>`_ dataset.
7+
8+
Args:
9+
root (string): Root directory containing the files of the dataset.
10+
train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by fold and hyper_search variables.
11+
Otherwise returns a subset of the training dataset if a hyperparameter search is performed (``hyper_search = True``); if not (``hyper_search = False``), returns a subset of the training dataset
12+
as specified in ``conxuntos_kfold.dat`` if fold number is correct. Otherwise issues an error.
13+
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
14+
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if hyper_search is set to False and ``0 <= fold <= 3``.
15+
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
16+
hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
17+
while the second row corresponds to test indices (used if ``train = False``).
18+
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
19+
and returns a transformed version.
20+
target_transform (callable, optional): A function/transform that takes in the
21+
target and transforms it.
22+
download (bool, optional): If True, downloads the dataset from the internet and
23+
puts it in root directory. If dataset is already downloaded, it is not
24+
downloaded again.
25+
"""
26+
27+
name = "arrhythmia"
28+
classes: List[str] = [
29+
"1 - normal",
30+
"2",
31+
"3",
32+
"4",
33+
"5",
34+
"6",
35+
"7",
36+
"8",
37+
"9",
38+
"10",
39+
"14",
40+
"15",
41+
"16 - unclassified",
42+
]

torchhd/datasets/audiology_std.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
from typing import List
2+
from torchhd.datasets import DatasetTrainTest
3+
4+
5+
class AudiologyStd(DatasetTrainTest):
6+
"""`Audiology (Standardized) <https://archive.ics.uci.edu/ml/datasets/Audiology+%28Standardized%29>`_ dataset.
7+
8+
Args:
9+
root (string): Root directory containing the files of the dataset.
10+
train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by hyper_search variable.
11+
Otherwise returns a subset of the training dataset if a hyperparameter search is performed (``hyper_search = True``); if not (``hyper_search = False``), returns the test set.
12+
hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
13+
while the second row corresponds to test indices (used if ``train = False``).
14+
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
15+
and returns a transformed version.
16+
target_transform (callable, optional): A function/transform that takes in the
17+
target and transforms it.
18+
download (bool, optional): If True, downloads the dataset from the internet and
19+
puts it in root directory. If dataset is already downloaded, it is not
20+
downloaded again.
21+
"""
22+
23+
name = "audiology-std"
24+
classes: List[str] = [
25+
"cochlear_age",
26+
"cochlear_age_and_noise",
27+
"cochlear_noise_and_heredity",
28+
"cochlear_poss_noise",
29+
"cochlear_unknown",
30+
"conductive_discontinuity",
31+
"conductive_fixation",
32+
"mixed_cochlear_age_otitis_media",
33+
"mixed_cochlear_age_s_om",
34+
"mixed_cochlear_unk_discontinuity",
35+
"mixed_cochlear_unk_fixation",
36+
"mixed_cochlear_unk_ser_om",
37+
"mixed_poss_noise_om",
38+
"normal_ear",
39+
"otitis_media",
40+
"possible_brainstem_disorder",
41+
"possible_menieres",
42+
"retrocochlear_unknown",
43+
]

torchhd/datasets/balance_scale.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from typing import List
2+
from torchhd.datasets import DatasetFourFold
3+
4+
5+
class BalanceScale(DatasetFourFold):
6+
"""`Balance Scale <https://archive.ics.uci.edu/ml/datasets/balance+scale>`_ dataset.
7+
8+
Args:
9+
root (string): Root directory containing the files of the dataset.
10+
train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by fold and hyper_search variables.
11+
Otherwise returns a subset of the training dataset if a hyperparameter search is performed (``hyper_search = True``); if not (``hyper_search = False``), returns a subset of the training dataset
12+
as specified in ``conxuntos_kfold.dat`` if fold number is correct. Otherwise issues an error.
13+
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
14+
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if hyper_search is set to False and ``0 <= fold <= 3``.
15+
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
16+
hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
17+
while the second row corresponds to test indices (used if ``train = False``).
18+
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
19+
and returns a transformed version.
20+
target_transform (callable, optional): A function/transform that takes in the
21+
target and transforms it.
22+
download (bool, optional): If True, downloads the dataset from the internet and
23+
puts it in root directory. If dataset is already downloaded, it is not
24+
downloaded again.
25+
"""
26+
27+
name = "balance-scale"
28+
classes: List[str] = [
29+
"B",
30+
"L",
31+
"R",
32+
]

torchhd/datasets/balloons.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from typing import List
2+
from torchhd.datasets import DatasetFourFold
3+
4+
5+
class Balloons(DatasetFourFold):
6+
"""`Balloons <https://archive.ics.uci.edu/ml/datasets/balloons>`_ dataset.
7+
8+
Args:
9+
root (string): Root directory containing the files of the dataset.
10+
train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by fold and hyper_search variables.
11+
Otherwise returns a subset of the training dataset if a hyperparameter search is performed (``hyper_search = True``); if not (``hyper_search = False``), returns a subset of the training dataset
12+
as specified in ``conxuntos_kfold.dat`` if fold number is correct. Otherwise issues an error.
13+
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
14+
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if hyper_search is set to False and ``0 <= fold <= 3``.
15+
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
16+
hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
17+
while the second row corresponds to test indices (used if ``train = False``).
18+
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
19+
and returns a transformed version.
20+
target_transform (callable, optional): A function/transform that takes in the
21+
target and transforms it.
22+
download (bool, optional): If True, downloads the dataset from the internet and
23+
puts it in root directory. If dataset is already downloaded, it is not
24+
downloaded again.
25+
"""
26+
27+
name = "balloons"
28+
classes: List[str] = [
29+
"inflated - F",
30+
"inflated - T",
31+
]

torchhd/datasets/bank.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from typing import List
2+
from torchhd.datasets import DatasetFourFold
3+
4+
5+
class Bank(DatasetFourFold):
6+
"""`Bank Marketing <https://archive.ics.uci.edu/ml/datasets/Bank+Marketing>`_ dataset.
7+
8+
Args:
9+
root (string): Root directory containing the files of the dataset.
10+
train (bool, optional): If True, returns training (sub)set from the file storing training data as further determined by fold and hyper_search variables.
11+
Otherwise returns a subset of the training dataset if a hyperparameter search is performed (``hyper_search = True``); if not (``hyper_search = False``), returns a subset of the training dataset
12+
as specified in ``conxuntos_kfold.dat`` if fold number is correct. Otherwise issues an error.
13+
fold (int, optional): Specifies which fold number to use. The default value of -1 returns all the training data from the corresponding file.
14+
Values between 0 and 3 specify which fold in ``conxuntos_kfold.dat`` to use. Relevant only if hyper_search is set to False and ``0 <= fold <= 3``.
15+
Indices in even rows (zero indexing) of ``conxuntos_kfold.dat`` correspond to train subsets while indices in odd rows correspond to test subsets.
16+
hyper_search (bool, optional): If True, creates dataset using indices in ``conxuntos.dat``. This split is used for hyperparameter search. The first row corresponds to train indices (used if ``train = True``)
17+
while the second row corresponds to test indices (used if ``train = False``).
18+
transform (callable, optional): A function/transform that takes in a torch.FloatTensor
19+
and returns a transformed version.
20+
target_transform (callable, optional): A function/transform that takes in the
21+
target and transforms it.
22+
download (bool, optional): If True, downloads the dataset from the internet and
23+
puts it in root directory. If dataset is already downloaded, it is not
24+
downloaded again.
25+
"""
26+
27+
name = "bank"
28+
classes: List[str] = [
29+
"no",
30+
"yes",
31+
]

0 commit comments

Comments
 (0)