lazy module loading #128

Merged
merged 17 commits into from
May 20, 2022
19 changes: 9 additions & 10 deletions Project.toml
@@ -1,11 +1,9 @@
name = "MLDatasets"
uuid = "eb30cadb-4394-5ae3-aed4-317e484a6458"
version = "0.6.0"
version = "0.7.0"

[deps]
BinDeps = "9e28174c-4ba2-5203-b857-d8d62c4213ee"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
@@ -14,9 +12,10 @@ FixedPointNumbers = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
GZip = "92fee26a-97fe-5a0c-ad85-20a5f3185b63"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
ImageCore = "a09fc81d-aa75-5fe9-8630-4744c3626534"
ImageShow = "4e3cecfd-b093-5904-9786-8bbb286a6a31"
JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
LazyModules = "8cdb02fc-e678-4876-92c5-9defec4f444e"
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
NPZ = "15e1cf62-19b3-5cfa-8e77-841668bca605"
@@ -26,29 +25,29 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"

[compat]
BinDeps = "1"
CSV = "0.10.2"
ColorTypes = "0.11"
DataDeps = "0.7"
DataFrames = "1.3"
FileIO = "1.13"
FileIO = "1.14"
FixedPointNumbers = "0.8"
GZip = "0.5"
Glob = "1.3"
HDF5 = "0.16.2"
ImageCore = "0.9"
ImageShow = "0.3"
JLD2 = "0.4.21"
JSON3 = "1"
LazyModules = "0.3"
MAT = "0.10"
MLUtils = "0.2.0"
Pickle = "0.3"
NPZ = "0.4.1"
Pickle = "0.3"
Requires = "1"
Tables = "1.6"
julia = "1.6"

[extras]
ColorTypes = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test"]
test = ["Test", "ColorTypes"]
2 changes: 1 addition & 1 deletion docs/make.jl
@@ -28,7 +28,7 @@ makedocs(
"Text" => "datasets/text.md",
"Vision" => "datasets/vision.md",
],
"Creating Datasets" => Any["containers/overview.md"],
"Creating Datasets" => Any["containers/overview.md"], # still experimental
"LICENSE.md",
],
strict = true,
15 changes: 10 additions & 5 deletions docs/src/containers/overview.md
@@ -4,11 +4,16 @@ MLDatasets.jl contains several reusable data containers for accessing datasets i

```@docs
FileDataset
TableDataset
HDF5Dataset
Base.close(::HDF5Dataset)
JLD2Dataset
Base.close(::JLD2Dataset)
CachedDataset
MLDatasets.make_cache
```

<!--
# TODO add back to docs when included again in the pkg
HDF5Dataset
Base.close(::HDF5Dataset)
TableDataset
JLD2Dataset
Base.close(::JLD2Dataset)
-->

33 changes: 32 additions & 1 deletion docs/src/index.md
@@ -25,7 +25,7 @@ Pkg.add("MLDatasets")

Datasets are grouped into different categories. Click on the links below for a full list of datasets available in each category.

- [Graphs Datasets](@ref) - datasets with an underlying graph structure: Cora, PubMed, CiteSeer, ...
- [Graph Datasets](@ref) - datasets with an underlying graph structure: Cora, PubMed, CiteSeer, ...
- [Miscellaneous Datasets](@ref) - datasets that do not fall into any of the other categories: Iris, BostonHousing, ...
- [Text Datasets](@ref) - datasets for language models.
- [Vision Datasets](@ref) - vision related datasets such as MNIST, CIFAR10, CIFAR100, ...
@@ -71,6 +71,8 @@ julia> summary(X_train)
Input features are commonly denoted by `features`, while classification labels and regression targets are denoted by `targets`.

```julia-repl
julia> using MLDatasets, DataFrames

julia> iris = Iris()
dataset Iris:
metadata => Dict{String, Any} with 4 entries
@@ -136,6 +138,35 @@ julia> iris.targets
MLDatasets.jl guarantees compatibility with the [getobs](https://juliaml.github.io/MLUtils.jl/dev/api/#MLUtils.getobs) and [numobs](https://juliaml.github.io/MLUtils.jl/dev/api/#MLUtils.numobs) interface defined in [MLUtils.jl](https://github.com/JuliaML/MLUtils.jl).
In practice, applying `getobs` and `numobs` on datasets is equivalent to applying indexing and `length`.
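This equivalence can be checked directly in the REPL; a minimal sketch, assuming the MNIST artifact has already been downloaded:

```julia-repl
julia> using MLDatasets, MLUtils

julia> d = MNIST(:train);

julia> numobs(d) == length(d)  # numobs falls back to length
true

julia> getobs(d, 1) == d[1]  # getobs falls back to indexing
true
```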

## Conditional module loading

MLDatasets.jl relies on many different packages to load and process the diverse types of datasets it supports. Most likely, any single user of the library will use only a limited subset of these functionalities.
In order to reduce the time taken by `using MLDatasets` in users' code,
we use a [lazy import system](https://github.com/johnnychen94/LazyModules.jl) that defers the import of packages inside MLDatasets.jl as much as possible.
For some packages, manual intervention from the user is needed.
As an example, the following code will produce an error:

```julia-repl
julia> using MLDatasets

julia> MNIST(); # fine, MNIST doesn't require DataFrames

julia> Iris() # ERROR: Add `import DataFrames` or `using DataFrames` to your code to unlock this functionality.
```

We can easily fix the error with an additional import as recommended by the error message:

```julia-repl
julia> using MLDatasets, DataFrames

julia> Iris()
dataset Iris:
metadata => Dict{String, Any} with 4 entries
features => 150×4 DataFrame
targets => 150×1 DataFrame
dataframe => 150×5 DataFrame
```
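Conceptually, a lazy import registers the package but only loads it on first access. A minimal sketch using a standard library module (the example below follows LazyModules.jl's `@lazy import Name="uuid"` syntax and is not part of MLDatasets itself):

```julia-repl
julia> using LazyModules

julia> @lazy import Statistics="10745b16-79ce-11e8-11f9-7d13ad32a3b2"

julia> Statistics.mean([1, 2, 3])  # Statistics is actually loaded here, on first use
2.0
```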

## Download location

MLDatasets.jl is built on top of the package
64 changes: 37 additions & 27 deletions src/MLDatasets.jl
@@ -2,28 +2,34 @@ module MLDatasets

using FixedPointNumbers
using SparseArrays
using DataFrames, Tables
using Glob
import ImageCore
using ColorTypes
using Tables
using DataDeps
import MLUtils
using MLUtils: getobs, numobs, AbstractDataContainer

### I/O imports
import NPZ
import Pickle
using MAT: matopen, matread
import CSV
using HDF5
using JLD2
import JSON3
using Glob
using DelimitedFiles: readdlm
##########
using FileIO
using LazyModules: @lazy

export getobs, numobs
export convert2image
include("require.jl") # export @require

# Use `@lazy import SomePkg` whenever the returned types are not its own types,
# since for methods applied to the returned types we would encounter world-age issues
# (see discussion in https://github.com/JuliaML/MLDatasets.jl/pull/128).
# Otherwise, use `@require import SomePkg` to force
# the user to import the package manually.

@require import JSON3="0f8b85d8-7281-11e9-16c2-39a750bddbf1"
@require import DataFrames="a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
@require import ImageShow="4e3cecfd-b093-5904-9786-8bbb286a6a31"
# @lazy import NPZ # lazy imported by FileIO
@lazy import Pickle="fbb45041-c46e-462f-888f-7c521cafbc2c"
@lazy import MAT="23992714-dd62-5051-b70f-ba57cb901cac"
@lazy import CSV="336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
@lazy import HDF5="f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
# @lazy import JLD2

export getobs, numobs # From MLUtils.jl

include("abstract_datasets.jl")
# export AbstractDataset,
@@ -33,22 +39,26 @@ include("utils.jl")
export convert2image

include("io.jl")
# export read_csv, read_npy
# export read_csv, read_npy, ...

include("download.jl")

include("containers/filedataset.jl")
export FileDataset
include("containers/tabledataset.jl")
export TableDataset
include("containers/hdf5dataset.jl")
export HDF5Dataset
include("containers/jld2dataset.jl")
export JLD2Dataset
include("containers/cacheddataset.jl")
export CachedDataset
# include("containers/tabledataset.jl")
# export TableDataset

## TODO add back when compatible with `@lazy` or `@require`
## which means that they cannot dispatch on types from JLD2 and HDF5
# include("containers/hdf5dataset.jl")
# export HDF5Dataset
# include("containers/jld2dataset.jl")
# export JLD2Dataset

## Misc.

# Misc.
include("datasets/misc/boston_housing.jl")
export BostonHousing
include("datasets/misc/iris.jl")
@@ -59,7 +69,7 @@ include("datasets/misc/titanic.jl")
export Titanic


# Vision
## Vision

include("datasets/vision/emnist.jl")
export EMNIST
@@ -74,11 +84,11 @@ export CIFAR10
include("datasets/vision/cifar100_reader/CIFAR100Reader.jl")
include("datasets/vision/cifar100.jl")
export CIFAR100

include("datasets/vision/svhn2.jl")
export SVHN2

# Text
## Text

include("datasets/text/ptblm.jl")
export PTBLM
include("datasets/text/udenglish.jl")
21 changes: 15 additions & 6 deletions src/abstract_datasets.jl
@@ -42,10 +42,12 @@ function leftalign(s::AbstractString, n::Int)
end
end

_summary(x) = x
_summary(x) = Tables.istable(x) ? summary(x) : x
_summary(x::Symbol) = ":$x"
_summary(x::Union{Dict, AbstractArray, DataFrame}) = summary(x)
_summary(x::Union{Tuple, NamedTuple}) = map(_summary, x)
_summary(x::Dict) = summary(x)
_summary(x::Tuple) = map(_summary, x)
_summary(x::NamedTuple) = map(_summary, x)
_summary(x::AbstractArray) = summary(x)
_summary(x::BitVector) = "$(count(x))-trues BitVector"

"""
@@ -58,11 +60,18 @@ `features` and `targets` fields.
abstract type SupervisedDataset <: AbstractDataset end


Base.length(d::SupervisedDataset) = numobs((d.features, d.targets))
Base.length(d::SupervisedDataset) = Tables.istable(d.features) ? numobs_table(d.features) :
numobs((d.features, d.targets))


# We return named tuples
Base.getindex(d::SupervisedDataset, ::Colon) = getobs((; d.features, d.targets))
Base.getindex(d::SupervisedDataset, i) = getobs((; d.features, d.targets), i)
Base.getindex(d::SupervisedDataset, ::Colon) = Tables.istable(d.features) ?
(features = d.features, targets=d.targets) :
getobs((; d.features, d.targets))

Base.getindex(d::SupervisedDataset, i) = Tables.istable(d.features) ?
(features = getobs_table(d.features, i), targets=getobs_table(d.targets, i)) :
getobs((; d.features, d.targets), i)

"""
UnsupervisedDataset <: AbstractDataset
40 changes: 24 additions & 16 deletions src/containers/tabledataset.jl
@@ -12,14 +12,15 @@ struct TableDataset{T} <: AbstractDataContainer
# TableDatasets must implement the Tables.jl interface
function TableDataset{T}(table::T) where {T}
Tables.istable(table) ||
throw(ArgumentError("TableDatasets must implement the Tabels.jl interface"))
throw(ArgumentError("The input must implement the Tables.jl interface"))

new{T}(table)
end
end

TableDataset(table::T) where {T} = TableDataset{T}(table)
TableDataset(path::AbstractString) = TableDataset(DataFrame(CSV.File(path)))
TableDataset(path::AbstractString) = TableDataset(read_csv(path))


# slow accesses based on Tables.jl
_getobs_row(x, i) = first(Iterators.peel(Iterators.drop(x, i - 1)))
@@ -29,37 +30,44 @@ function _getobs_column(x, i)

return NamedTuple{colnames}(rowvals)
end
function Base.getindex(dataset::TableDataset, i)
if Tables.rowaccess(dataset.table)
return _getobs_row(Tables.rows(dataset.table), i)
elseif Tables.columnaccess(dataset.table)
return _getobs_column(dataset.table, i)

function getobs_table(table, i)
if Tables.rowaccess(table)
return _getobs_row(Tables.rows(table), i)
elseif Tables.columnaccess(table)
return _getobs_column(table, i)
else
error("The Tables.jl implementation used should have either rowaccess or columnaccess.")
end
end
function Base.length(dataset::TableDataset)
if Tables.columnaccess(dataset.table)
return length(Tables.getcolumn(dataset.table, 1))
elseif Tables.rowaccess(dataset.table)

function numobs_table(table)
if Tables.columnaccess(table)
return length(Tables.getcolumn(table, 1))
elseif Tables.rowaccess(table)
# length might not be defined, but has to be for this to work.
return length(Tables.rows(dataset.table))
return length(Tables.rows(table))
else
error("The Tables.jl implementation used should have either rowaccess or columnaccess.")
end
end

Base.getindex(dataset::TableDataset, i) = getobs_table(dataset.table, i)
Base.length(dataset::TableDataset) = numobs_table(dataset.table)


# fast access for DataFrame
Base.getindex(dataset::TableDataset{<:DataFrame}, i) = dataset.table[i, :]
Base.length(dataset::TableDataset{<:DataFrame}) = nrow(dataset.table)
# Base.getindex(dataset::TableDataset{<:DataFrame}, i) = dataset.table[i, :]
# Base.length(dataset::TableDataset{<:DataFrame}) = nrow(dataset.table)

# fast access for CSV.File
Base.getindex(dataset::TableDataset{<:CSV.File}, i) = dataset.table[i]
Base.length(dataset::TableDataset{<:CSV.File}) = length(dataset.table)
# Base.getindex(dataset::TableDataset{<:CSV.File}, i) = dataset.table[i]
# Base.length(dataset::TableDataset{<:CSV.File}) = length(dataset.table)

## Tables.jl interface

Tables.istable(::TableDataset) = true

for fn in (:rowaccess, :rows, :columnaccess, :columns, :schema, :materializer)
@eval Tables.$fn(dataset::TableDataset) = Tables.$fn(dataset.table)
end
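As a usage sketch (not part of the diff), the generic accessors work on any Tables.jl-compatible source; here a `NamedTuple` of column vectors, which supports column access:

```julia-repl
julia> using MLDatasets

julia> nt = (a = [1, 2, 3], b = ["x", "y", "z"]);  # a columnaccess Tables.jl table

julia> td = TableDataset(nt);

julia> length(td)  # via numobs_table, i.e. the length of the first column
3

julia> td[2]  # via getobs_table, one row as a NamedTuple
(a = 2, b = "y")
```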
13 changes: 1 addition & 12 deletions src/datasets/graphs/planetoid.jl
@@ -83,23 +83,12 @@ function read_planetoid_data(DEPNAME; dir=nothing, reverse_edges=true)
return metadata, g
end

function read_pickle_file(filename, name)
out = Pickle.npyload(filename)
if name == "graph"
return out
end
if out isa SparseMatrixCSC
return Matrix(out)
end
return out
end

function read_planetoid_file(DEPNAME, name, dir)
filename = datafile(DEPNAME, name, dir)
if endswith(name, "test.index")
out = 1 .+ vec(readdlm(filename, Int))
else
out = read_pickle_file(filename, name)
out = read_pickle(filename)
if out isa SparseMatrixCSC
out = Matrix(out)
end