From e58f6283e4498ab56258daeef29b7b0225719ce4 Mon Sep 17 00:00:00 2001 From: edubov Date: Wed, 11 Jul 2018 14:19:48 +0300 Subject: [PATCH 01/50] added cython implementation of hellinger distance criterion compatible with sklearn --- .../hellinger_distance_criterion.pyx | 80 +++++++++++++++++++ imblearn/tree_split/setup.py | 13 +++ 2 files changed, 93 insertions(+) create mode 100644 imblearn/tree_split/hellinger_distance_criterion.pyx create mode 100644 imblearn/tree_split/setup.py diff --git a/imblearn/tree_split/hellinger_distance_criterion.pyx b/imblearn/tree_split/hellinger_distance_criterion.pyx new file mode 100644 index 000000000..1bffe7f12 --- /dev/null +++ b/imblearn/tree_split/hellinger_distance_criterion.pyx @@ -0,0 +1,80 @@ +from sklearn.tree._criterion cimport ClassificationCriterion +from sklearn.tree._criterion cimport SIZE_t + +import numpy as np +cdef double INFINITY = np.inf + +from libc.math cimport sqrt, pow +from libc.math cimport abs + + +cdef class HellingerDistanceCriterion(ClassificationCriterion): + + cdef double proxy_impurity_improvement(self) nogil: + cdef double impurity_left + cdef double impurity_right + + self.children_impurity(&impurity_left, &impurity_right) + + return impurity_right + impurity_left + + cdef double impurity_improvement(self, double impurity) nogil: + cdef double impurity_left + cdef double impurity_right + + self.children_impurity(&impurity_left, &impurity_right) + + return impurity_right + impurity_left + + cdef double node_impurity(self) nogil: + cdef SIZE_t* n_classes = self.n_classes + cdef double* sum_total = self.sum_total + cdef double hellinger = 0.0 + cdef double sq_count + cdef double count_k + cdef SIZE_t k + cdef SIZE_t c + + for k in range(self.n_outputs): + for c in range(n_classes[k]): + hellinger += 1.0 + + return hellinger / self.n_outputs + + cdef void children_impurity(self, double* impurity_left, + double* impurity_right) nogil: + cdef SIZE_t* n_classes = self.n_classes + cdef double* sum_left = self.sum_left + cdef double* sum_right = self.sum_right + cdef double hellinger_left = 0.0 + cdef double hellinger_right = 0.0 + cdef double count_k1 = 0.0 + cdef double count_k2 = 0.0 + + cdef SIZE_t k + cdef SIZE_t c + + # stop splitting in case reached pure node with 0 samples of second class + if sum_left[1] + sum_right[1] == 0: + impurity_left[0] = -INFINITY + impurity_right[0] = -INFINITY + return + + for k in range(self.n_outputs): + if(sum_left[0] + sum_right[0] > 0): + count_k1 = sqrt(sum_left[0] / (sum_left[0] + sum_right[0])) + if(sum_left[1] + sum_right[1] > 0): + count_k2 = sqrt(sum_left[1] / (sum_left[1] + sum_right[1])) + + hellinger_left += pow((count_k1 - count_k2),2) + + if(sum_left[0] + sum_right[0] > 0): + count_k1 = sqrt(sum_right[0] / (sum_left[0] + sum_right[0])) + if(sum_left[1] + sum_right[1] > 0): + count_k2 = sqrt(sum_right[1] / (sum_left[1] + sum_right[1])) + + hellinger_right += pow((count_k1 - count_k2),2) + + impurity_left[0] = hellinger_left / self.n_outputs + impurity_right[0] = hellinger_right / self.n_outputs + \ No newline at end of file diff --git a/imblearn/tree_split/setup.py b/imblearn/tree_split/setup.py new file mode 100644 index 000000000..bc25fd851 --- /dev/null +++ b/imblearn/tree_split/setup.py @@ -0,0 +1,13 @@ +from setuptools import setup +from Cython.Build import cythonize +import numpy + +setup( + name='hellinger-distance-criterion', + version=0.1, + url='github.com/EvgeniDubov/hellinger-random-forest', + author='Evgeni Dubov', + author_email='evgeni.dubov@gmail.com', + 
ext_modules = cythonize('hellinger_distance_criterion.pyx'), + include_dirs=[numpy.get_include()] +) \ No newline at end of file From 649e2044d12cda3e9e78b116f7c35a0b6be1c9d0 Mon Sep 17 00:00:00 2001 From: edubov Date: Wed, 11 Jul 2018 14:31:50 +0300 Subject: [PATCH 02/50] added usage example --- ...model_with_hellinger_distance_criterion.py | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 examples/tree_split/train_model_with_hellinger_distance_criterion.py diff --git a/examples/tree_split/train_model_with_hellinger_distance_criterion.py b/examples/tree_split/train_model_with_hellinger_distance_criterion.py new file mode 100644 index 000000000..368c642dd --- /dev/null +++ b/examples/tree_split/train_model_with_hellinger_distance_criterion.py @@ -0,0 +1,50 @@ +import numpy as np +import pandas as pd +from hellinger_distance_criterion import HellingerDistanceCriterion +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split +from sklearn.ensemble import RandomForestClassifier +from sklearn.datasets import load_breast_cancer +from sklearn.tree import DecisionTreeClassifier + +# Random Forest criterions comparison +def compare_rf(X_train, y_train, X_test, y_test): + clf = RandomForestClassifier(criterion='gini', max_depth=4, n_estimators=100) + clf.fit(X_train, y_train) + print('gini score: ', clf.score(X_test, y_test)) + + clf = RandomForestClassifier(criterion='entropy', max_depth=4, n_estimators=100) + clf.fit(X_train, y_train) + print('entropy score: ', clf.score(X_test, y_test)) + + hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) + clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100) + clf.fit(X_train, y_train) + print('hellinger distance score: ', clf.score(X_test, y_test)) + +# Decision Tree criterions comparison +def compare_dt(X_train, y_train, X_test, y_test): + clf = DecisionTreeClassifier(criterion='gini', max_depth=4) + clf.fit(X_train, y_train) + print('gini score: ', clf.score(X_test, y_test)) + + clf = DecisionTreeClassifier(criterion='entropy', max_depth=4) + clf.fit(X_train, y_train) + print('entropy score: ', clf.score(X_test, y_test)) + + hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) + clf = DecisionTreeClassifier(criterion=hdc, max_depth=4) + clf.fit(X_train, y_train) + print('hellinger distance score: ', clf.score(X_test, y_test)) + +# Comparison on breast cancer dataset +bc = load_breast_cancer() +X_train, X_test, y_train, y_test = train_test_split(bc.data, bc.target, test_size=0.3) +compare_rf(X_train, y_train, X_test, y_test) +compare_dt(X_train, y_train, X_test, y_test) + +# Comparison on imbalanced dataset +X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=1) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) +compare_rf(X_train, y_train, X_test, y_test) +compare_dt(X_train, y_train, X_test, y_test) From 689a41b35547cd7f74cc64ee5121f0de048b4c28 Mon Sep 17 00:00:00 2001 From: edubov Date: Wed, 11 Jul 2018 21:30:43 +0300 Subject: [PATCH 03/50] added README --- examples/tree_split/README.txt | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 examples/tree_split/README.txt diff --git a/examples/tree_split/README.txt b/examples/tree_split/README.txt new file mode 100644 index 000000000..78d2de242 --- /dev/null +++ b/examples/tree_split/README.txt @@ -0,0 +1,9 @@ +.. 
_tree_split_examples: + +Example using Hellinger Distance as tree split criterion +======================================================== + +Hellinger Distance is used to quantify the similarity between two probability distributions. +When used as Decision Tree split criterion it is skew insensitive and tackles the imbalance problem. +This is Cython implementation of Hellinger Distance as a criterion for decision tree split compatible with sklearn tree based classification models. + From e4e13a7e736f223cf7a7edc3f60c7dc8545bc621 Mon Sep 17 00:00:00 2001 From: edubov Date: Wed, 11 Jul 2018 21:57:39 +0300 Subject: [PATCH 04/50] update license --- imblearn/tree_split/hellinger_distance_criterion.pyx | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/imblearn/tree_split/hellinger_distance_criterion.pyx b/imblearn/tree_split/hellinger_distance_criterion.pyx index 1bffe7f12..340aced86 100644 --- a/imblearn/tree_split/hellinger_distance_criterion.pyx +++ b/imblearn/tree_split/hellinger_distance_criterion.pyx @@ -1,3 +1,7 @@ +# Author: Evgeni Dubov +# +# License: BSD 3 clause + from sklearn.tree._criterion cimport ClassificationCriterion from sklearn.tree._criterion cimport SIZE_t From e33d8a345424fcd62a1b3bc355e9129c3b052761 Mon Sep 17 00:00:00 2001 From: EvgeniDubov <32032278+EvgeniDubov@users.noreply.github.com> Date: Thu, 12 Jul 2018 10:33:23 +0300 Subject: [PATCH 05/50] Fixed pep8 issues in the example --- ...model_with_hellinger_distance_criterion.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/examples/tree_split/train_model_with_hellinger_distance_criterion.py b/examples/tree_split/train_model_with_hellinger_distance_criterion.py index 368c642dd..b2371e49c 100644 --- a/examples/tree_split/train_model_with_hellinger_distance_criterion.py +++ b/examples/tree_split/train_model_with_hellinger_distance_criterion.py @@ -7,21 +7,27 @@ from sklearn.datasets import load_breast_cancer from sklearn.tree import DecisionTreeClassifier + # Random Forest criterions comparison def compare_rf(X_train, y_train, X_test, y_test): - clf = RandomForestClassifier(criterion='gini', max_depth=4, n_estimators=100) + clf = RandomForestClassifier(criterion='gini', + max_depth=4, + n_estimators=100) clf.fit(X_train, y_train) print('gini score: ', clf.score(X_test, y_test)) - clf = RandomForestClassifier(criterion='entropy', max_depth=4, n_estimators=100) + clf = RandomForestClassifier(criterion='entropy', + max_depth=4, + n_estimators=100) clf.fit(X_train, y_train) print('entropy score: ', clf.score(X_test, y_test)) - hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) + hdc = HellingerDistanceCriterion(1, np.array([2], dtype='int64')) clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100) clf.fit(X_train, y_train) print('hellinger distance score: ', clf.score(X_test, y_test)) + # Decision Tree criterions comparison def compare_dt(X_train, y_train, X_test, y_test): clf = DecisionTreeClassifier(criterion='gini', max_depth=4) @@ -32,19 +38,25 @@ def compare_dt(X_train, y_train, X_test, y_test): clf.fit(X_train, y_train) print('entropy score: ', clf.score(X_test, y_test)) - hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) + hdc = HellingerDistanceCriterion(1, np.array([2], dtype='int64')) clf = DecisionTreeClassifier(criterion=hdc, max_depth=4) clf.fit(X_train, y_train) print('hellinger distance score: ', clf.score(X_test, y_test)) + # Comparison on breast cancer dataset bc = load_breast_cancer() -X_train, X_test, y_train, 
y_test = train_test_split(bc.data, bc.target, test_size=0.3) +X_train, X_test, y_train, y_test = train_test_split(bc.data, + bc.target, + test_size=0.3) compare_rf(X_train, y_train, X_test, y_test) compare_dt(X_train, y_train, X_test, y_test) + # Comparison on imbalanced dataset -X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=1) +X, y = make_classification(n_samples=10000, n_features=40, + n_informative=5, n_classes=2, + weights=[0.05, 0.95], random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) compare_rf(X_train, y_train, X_test, y_test) compare_dt(X_train, y_train, X_test, y_test) From f08534bb262bc8f3b7681020d9e745cbb675c1e9 Mon Sep 17 00:00:00 2001 From: EvgeniDubov <32032278+EvgeniDubov@users.noreply.github.com> Date: Thu, 12 Jul 2018 10:37:25 +0300 Subject: [PATCH 06/50] Fixed pep8 issues in the setup --- imblearn/tree_split/setup.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/imblearn/tree_split/setup.py b/imblearn/tree_split/setup.py index bc25fd851..b16642fed 100644 --- a/imblearn/tree_split/setup.py +++ b/imblearn/tree_split/setup.py @@ -1,13 +1,13 @@ -from setuptools import setup +from setuptools import setup from Cython.Build import cythonize import numpy setup( - name='hellinger-distance-criterion', - version=0.1, - url='github.com/EvgeniDubov/hellinger-random-forest', + name='hellinger-distance-criterion', + version=0.1, + url='github.com/EvgeniDubov/hellinger-distance-criterion', author='Evgeni Dubov', - author_email='evgeni.dubov@gmail.com', - ext_modules = cythonize('hellinger_distance_criterion.pyx'), - include_dirs=[numpy.get_include()] -) \ No newline at end of file + author_email='evgeni.dubov@gmail.com', + ext_modules=cythonize('hellinger_distance_criterion.pyx'), + include_dirs=[numpy.get_include()] +) From d655b6045c161e45dea76d49c63f014a066f8f43 Mon Sep 17 00:00:00 2001 From: edubov Date: Mon, 16 Jul 2018 14:11:03 +0300 Subject: [PATCH 07/50] added support for cython build based on https://github.com/jakevdp/cython_template --- MANIFEST.in | 2 +- build_tools/cython/cythonize.py | 198 ++++++++++++++++++++++++ imblearn/__check_build/README.md | 5 + imblearn/__check_build/__init__.py | 58 +++++++ imblearn/__check_build/_check_build.pyx | 4 + imblearn/__check_build/setup.py | 16 ++ imblearn/tree_split/__init__.py | 1 + imblearn/tree_split/setup.py | 33 ++-- setup.py | 86 ++++++++-- 9 files changed, 375 insertions(+), 28 deletions(-) create mode 100644 build_tools/cython/cythonize.py create mode 100644 imblearn/__check_build/README.md create mode 100644 imblearn/__check_build/__init__.py create mode 100644 imblearn/__check_build/_check_build.pyx create mode 100644 imblearn/__check_build/setup.py create mode 100644 imblearn/tree_split/__init__.py diff --git a/MANIFEST.in b/MANIFEST.in index 192436787..52494e272 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ - recursive-include doc * recursive-include examples * +include imblearn/tree_split *.pyx include AUTHORS.rst include CONTRIBUTING.ms include LICENSE diff --git a/build_tools/cython/cythonize.py b/build_tools/cython/cythonize.py new file mode 100644 index 000000000..b6398f7af --- /dev/null +++ b/build_tools/cython/cythonize.py @@ -0,0 +1,198 @@ +#!/usr/bin/env python +""" cythonize + +Cythonize pyx files into C files as needed. + +Usage: cythonize [root_dir] + +Checks pyx files to see if they have been changed relative to their +corresponding C files. 
If they have, then runs cython on these files to +recreate the C files. + +The script detects changes in the pyx/pxd files using checksums +[or hashes] stored in a database file + +Simple script to invoke Cython on all .pyx +files; while waiting for a proper build system. Uses file hashes to +figure out if rebuild is needed. + +It is called by ./setup.py sdist so that sdist package can be installed without +cython + +Originally written by Dag Sverre Seljebotn, and adapted from statsmodel 0.6.1 +(Modified BSD 3-clause) + +We copied it for scikit-learn. + +Note: this script does not check any of the dependent C libraries; it only +operates on the Cython .pyx files or their corresponding Cython header (.pxd) +files. +""" +# Author: Arthur Mensch +# Author: Raghav R V +# +# License: BSD 3 clause +# see http://github.com/scikit-learn/scikit-learn + + +from __future__ import division, print_function, absolute_import + +import os +import re +import sys +import hashlib +import subprocess + +HASH_FILE = 'cythonize.dat' + + +# WindowsError is not defined on unix systems +try: + WindowsError +except NameError: + WindowsError = None + + +def cythonize(cython_file, gen_file): + try: + from Cython.Compiler.Version import version as cython_version + from distutils.version import LooseVersion + if LooseVersion(cython_version) < LooseVersion('0.21'): + raise Exception('Building scikit-learn requires Cython >= 0.21') + + except ImportError: + pass + + flags = ['--fast-fail'] + if gen_file.endswith('.cpp'): + flags += ['--cplus'] + + try: + try: + rc = subprocess.call(['cython'] + + flags + ["-o", gen_file, cython_file]) + if rc != 0: + raise Exception('Cythonizing %s failed' % cython_file) + except OSError: + # There are ways of installing Cython that don't result in a cython + # executable on the path, see scipy issue gh-2397. 
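+            # Fall back to running Cython's compiler entry point through the
+            # current interpreter instead of the ``cython`` console script.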
+ rc = subprocess.call([sys.executable, '-c', + 'import sys; from Cython.Compiler.Main ' + 'import setuptools_main as main;' + ' sys.exit(main())'] + flags + + ["-o", gen_file, cython_file]) + if rc != 0: + raise Exception('Cythonizing %s failed' % cython_file) + except OSError: + raise OSError('Cython needs to be installed') + + +def load_hashes(filename): + """Load the hashes dict from the hashfile""" + # { filename : (sha1 of header if available or 'NA', + # sha1 of input, + # sha1 of output) } + + hashes = {} + try: + with open(filename, 'r') as cython_hash_file: + for hash_record in cython_hash_file: + (filename, header_hash, + cython_hash, gen_file_hash) = hash_record.split() + hashes[filename] = (header_hash, cython_hash, gen_file_hash) + except (KeyError, ValueError, AttributeError, IOError): + hashes = {} + return hashes + + +def save_hashes(hashes, filename): + """Save the hashes dict to the hashfile""" + with open(filename, 'w') as cython_hash_file: + for key, value in hashes.items(): + cython_hash_file.write("%s %s %s %s\n" + % (key, value[0], value[1], value[2])) + + +def sha1_of_file(filename): + h = hashlib.sha1() + with open(filename, "rb") as f: + h.update(f.read()) + return h.hexdigest() + + +def clean_path(path): + """Clean the path""" + path = path.replace(os.sep, '/') + if path.startswith('./'): + path = path[2:] + return path + + +def get_hash_tuple(header_path, cython_path, gen_file_path): + """Get the hashes from the given files""" + + header_hash = (sha1_of_file(header_path) + if os.path.exists(header_path) else 'NA') + from_hash = sha1_of_file(cython_path) + to_hash = (sha1_of_file(gen_file_path) + if os.path.exists(gen_file_path) else 'NA') + + return header_hash, from_hash, to_hash + + +def cythonize_if_unchanged(path, cython_file, gen_file, hashes): + full_cython_path = os.path.join(path, cython_file) + full_header_path = full_cython_path.replace('.pyx', '.pxd') + full_gen_file_path = os.path.join(path, gen_file) + + current_hash = get_hash_tuple(full_header_path, full_cython_path, + full_gen_file_path) + + if current_hash == hashes.get(clean_path(full_cython_path)): + print('%s has not changed' % full_cython_path) + return + + print('Processing %s' % full_cython_path) + cythonize(full_cython_path, full_gen_file_path) + + # changed target file, recompute hash + current_hash = get_hash_tuple(full_header_path, full_cython_path, + full_gen_file_path) + + # Update the hashes dict with the new hash + hashes[clean_path(full_cython_path)] = current_hash + + +def check_and_cythonize(root_dir): + print(root_dir) + hashes = load_hashes(HASH_FILE) + + for cur_dir, dirs, files in os.walk(root_dir): + for filename in files: + if filename.endswith('.pyx'): + gen_file_ext = '.c' + # Cython files with libcpp imports should be compiled to cpp + with open(os.path.join(cur_dir, filename), 'rb') as f: + data = f.read() + m = re.search(b"libcpp", data, re.I | re.M) + if m: + gen_file_ext = ".cpp" + cython_file = filename + gen_file = filename.replace('.pyx', gen_file_ext) + cythonize_if_unchanged(cur_dir, cython_file, gen_file, hashes) + + # Save hashes once per module. This prevents cythonizing prev. 
+ # files again when debugging broken code in a single file + save_hashes(hashes, HASH_FILE) + + +def main(root_dir): + check_and_cythonize(root_dir) + + +if __name__ == '__main__': + try: + root_dir_arg = sys.argv[1] + except IndexError: + raise ValueError("Usage: python cythonize.py ") + main(root_dir_arg) diff --git a/imblearn/__check_build/README.md b/imblearn/__check_build/README.md new file mode 100644 index 000000000..39ff68dc9 --- /dev/null +++ b/imblearn/__check_build/README.md @@ -0,0 +1,5 @@ +``__check_build`` +================= +The purpose of this submodule is to give the user a readable error when trying +to import the package from within the source tree. + diff --git a/imblearn/__check_build/__init__.py b/imblearn/__check_build/__init__.py new file mode 100644 index 000000000..7e7491f26 --- /dev/null +++ b/imblearn/__check_build/__init__.py @@ -0,0 +1,58 @@ +""" Module to give helpful messages to the user that did not +compile package properly, + +This code was adapted from scikit-learn's check_build utility. +""" +import os + +PACKAGE_NAME = 'cython_template' + +INPLACE_MSG = """ +It appears that you are importing {package} from within the source tree. +Please either use an inplace install or try from another location. +""".format(package=PACKAGE_NAME) + +STANDARD_MSG = """ +If you have used an installer, please check that it is suited for your +Python version, your operating system and your platform. +""" + +ERROR_TEMPLATE = """{error} +___________________________________________________________________________ +Contents of {local_dir}: +{contents} +___________________________________________________________________________ +It seems that the {package} has not been built correctly. + +If you have installed {package} from source, please do not forget +to build the package before using it: run `python setup.py install` +in the source directory. +{msg}""" + + +def raise_build_error(e): + # Raise a comprehensible error and list the contents of the + # directory to help debugging on the mailing list. 
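+    # Locate the package directory so its contents can be listed in the
+    # error message below.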
+ local_dir = os.path.split(__file__)[0] + msg = STANDARD_MSG + if local_dir == "megaman/__check_build": + # Picking up the local install: this will work only if the + # install is an 'inplace build' + msg = INPLACE_MSG + dir_content = list() + for i, filename in enumerate(os.listdir(local_dir)): + if ((i + 1) % 3): + dir_content.append(filename.ljust(26)) + else: + dir_content.append(filename + '\n') + contents = ''.join(dir_content).strip() + raise ImportError(ERROR_TEMPLATE.format(error=e, + local_dir=local_dir, + contents=contents, + package=PACKAGE_NAME, + msg=msg)) + +try: + from ._check_build import check_build +except ImportError as e: + raise_build_error(e) diff --git a/imblearn/__check_build/_check_build.pyx b/imblearn/__check_build/_check_build.pyx new file mode 100644 index 000000000..4fe9f3a8c --- /dev/null +++ b/imblearn/__check_build/_check_build.pyx @@ -0,0 +1,4 @@ +# Adapted from scikit-learn __check_build script (BSD-licensed) + +def check_build(): + return diff --git a/imblearn/__check_build/setup.py b/imblearn/__check_build/setup.py new file mode 100644 index 000000000..535e504cd --- /dev/null +++ b/imblearn/__check_build/setup.py @@ -0,0 +1,16 @@ +# Adapted from scikit-learn __check_build script (BSD-licensed) + +import numpy + + +def configuration(parent_package='', top_path=None): + from numpy.distutils.misc_util import Configuration + config = Configuration('__check_build', parent_package, top_path) + config.add_extension('_check_build', + sources=['_check_build.c']) + return config + + +if __name__ == '__main__': + from numpy.distutils.core import setup + setup(**configuration(top_path='').todict()) diff --git a/imblearn/tree_split/__init__.py b/imblearn/tree_split/__init__.py new file mode 100644 index 000000000..672474b78 --- /dev/null +++ b/imblearn/tree_split/__init__.py @@ -0,0 +1 @@ +from .hellinger_distance_criterion import HellingerDistanceCriterion diff --git a/imblearn/tree_split/setup.py b/imblearn/tree_split/setup.py index bc25fd851..9b836d087 100644 --- a/imblearn/tree_split/setup.py +++ b/imblearn/tree_split/setup.py @@ -1,13 +1,20 @@ -from setuptools import setup -from Cython.Build import cythonize -import numpy - -setup( - name='hellinger-distance-criterion', - version=0.1, - url='github.com/EvgeniDubov/hellinger-random-forest', - author='Evgeni Dubov', - author_email='evgeni.dubov@gmail.com', - ext_modules = cythonize('hellinger_distance_criterion.pyx'), - include_dirs=[numpy.get_include()] -) \ No newline at end of file +import os + +PACKAGE_NAME = 'imblearn/tree_split' + + +def configuration(parent_package='', top_path=None): + from numpy.distutils.misc_util import Configuration + + config = Configuration(PACKAGE_NAME, parent_package, top_path) + + config.add_extension('hellinger_distance_criterion', + sources=['hellinger_distance_criterion.c']) + config.li + + return config + + +if __name__ == '__main__': + from numpy.distutils.core import setup + setup(**configuration(top_path='').todict()) diff --git a/setup.py b/setup.py index 1aa948e72..ba13dd5cc 100755 --- a/setup.py +++ b/setup.py @@ -3,7 +3,8 @@ import codecs import os - +import subprocess +import sys from setuptools import find_packages, setup # get __version__ from _version.py @@ -21,6 +22,8 @@ LICENSE = 'MIT' DOWNLOAD_URL = 'https://github.com/scikit-learn-contrib/imbalanced-learn' VERSION = __version__ +TREE_SPLIT_PACKAGE = 'imblearn/tree_split' +CHECK_BUILD_PACKAGE = 'imblearn/__check_build' INSTALL_REQUIRES = ['numpy', 'scipy', 'scikit-learn'] CLASSIFIERS = ['Intended Audience :: 
Science/Research', 'Intended Audience :: Developers', @@ -37,17 +40,72 @@ 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6'] +def configuration(parent_package='', top_path=None): + from numpy.distutils.misc_util import Configuration + config = Configuration(None, parent_package, top_path) + config.set_options(ignore_setup_xxx_py=True, + assume_default_configuration=True, + delegate_options_to_subpackages=True, + quiet=True) + config.add_subpackage(CHECK_BUILD_PACKAGE) + config.add_subpackage(TREE_SPLIT_PACKAGE) + + return config + +def generate_cython(package): + """Cythonize all sources in the package""" + cwd = os.path.abspath(os.path.dirname(__file__)) + print("Cythonizing sources") + p = subprocess.call([sys.executable, + os.path.join(cwd, + 'build_tools/cython', + 'cythonize.py'), + package], + cwd=cwd) + if p != 0: + raise RuntimeError("Running cythonize failed!") + + +def setup_package(): + from numpy.distutils.core import setup + + old_path = os.getcwd() + local_path = os.path.dirname(os.path.abspath(sys.argv[0])) + src_path = local_path + + os.chdir(local_path) + sys.path.insert(0, local_path) + + old_path = os.getcwd() + os.chdir(src_path) + sys.path.insert(0, src_path) + + cwd = os.path.abspath(os.path.dirname(__file__)) + if not os.path.exists(os.path.join(cwd, 'PKG-INFO')): + generate_cython(CHECK_BUILD_PACKAGE) + generate_cython(TREE_SPLIT_PACKAGE) + + try: + setup(name=DISTNAME, + maintainer=MAINTAINER, + maintainer_email=MAINTAINER_EMAIL, + description=DESCRIPTION, + license=LICENSE, + url=URL, + version=VERSION, + download_url=DOWNLOAD_URL, + long_description=LONG_DESCRIPTION, + zip_safe=False, # the package can run out of an .egg file + classifiers=CLASSIFIERS, + configuration=configuration, + packages=find_packages(), + install_requires=INSTALL_REQUIRES) + finally: + del sys.path[0] + os.chdir(old_path) + + return + -setup(name=DISTNAME, - maintainer=MAINTAINER, - maintainer_email=MAINTAINER_EMAIL, - description=DESCRIPTION, - license=LICENSE, - url=URL, - version=VERSION, - download_url=DOWNLOAD_URL, - long_description=LONG_DESCRIPTION, - zip_safe=False, # the package can run out of an .egg file - classifiers=CLASSIFIERS, - packages=find_packages(), - install_requires=INSTALL_REQUIRES) +if __name__ == '__main__': + setup_package() From b94ed53c46cb960f23dc575e8bcf1f632f8edbbc Mon Sep 17 00:00:00 2001 From: edubov Date: Wed, 25 Jul 2018 13:16:26 +0300 Subject: [PATCH 08/50] updated 'whats new' --- doc/whats_new/v0.0.4.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst index 6546a57a0..e7017f24f 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.0.4.rst @@ -30,6 +30,9 @@ Enhancement - Add support for one-vs-all encoded target to support keras. :issue:`409` by :user:`Guillaume Lemaitre `. +- Add support for Hellinger Distance as sklearn classification tree split criterion. + By :user: `Evgeni Dubov `. + Bug fixes ......... 
From 3e265cecbcdf9ea4d7b0a6f93f7e4a0250271d87 Mon Sep 17 00:00:00 2001 From: edubov Date: Wed, 25 Jul 2018 13:17:56 +0300 Subject: [PATCH 09/50] updated the example --- examples/tree_split/README.txt | 2 +- ...model_with_hellinger_distance_criterion.py | 62 +++---------------- 2 files changed, 9 insertions(+), 55 deletions(-) diff --git a/examples/tree_split/README.txt b/examples/tree_split/README.txt index 78d2de242..9e4b228ab 100644 --- a/examples/tree_split/README.txt +++ b/examples/tree_split/README.txt @@ -4,6 +4,6 @@ Example using Hellinger Distance as tree split criterion ======================================================== Hellinger Distance is used to quantify the similarity between two probability distributions. -When used as Decision Tree split criterion it is skew insensitive and tackles the imbalance problem. +When used as split criterion in Decision Tree Classifier it makes it skew insensitive and helps tackle the imbalance problem. This is Cython implementation of Hellinger Distance as a criterion for decision tree split compatible with sklearn tree based classification models. diff --git a/examples/tree_split/train_model_with_hellinger_distance_criterion.py b/examples/tree_split/train_model_with_hellinger_distance_criterion.py index b2371e49c..dc4ed47fd 100644 --- a/examples/tree_split/train_model_with_hellinger_distance_criterion.py +++ b/examples/tree_split/train_model_with_hellinger_distance_criterion.py @@ -1,62 +1,16 @@ import numpy as np import pandas as pd -from hellinger_distance_criterion import HellingerDistanceCriterion + from sklearn.datasets import make_classification from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier -from sklearn.datasets import load_breast_cancer -from sklearn.tree import DecisionTreeClassifier - - -# Random Forest criterions comparison -def compare_rf(X_train, y_train, X_test, y_test): - clf = RandomForestClassifier(criterion='gini', - max_depth=4, - n_estimators=100) - clf.fit(X_train, y_train) - print('gini score: ', clf.score(X_test, y_test)) - - clf = RandomForestClassifier(criterion='entropy', - max_depth=4, - n_estimators=100) - clf.fit(X_train, y_train) - print('entropy score: ', clf.score(X_test, y_test)) - - hdc = HellingerDistanceCriterion(1, np.array([2], dtype='int64')) - clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100) - clf.fit(X_train, y_train) - print('hellinger distance score: ', clf.score(X_test, y_test)) - -# Decision Tree criterions comparison -def compare_dt(X_train, y_train, X_test, y_test): - clf = DecisionTreeClassifier(criterion='gini', max_depth=4) - clf.fit(X_train, y_train) - print('gini score: ', clf.score(X_test, y_test)) +from imblearn.tree_split import HellingerDistanceCriterion - clf = DecisionTreeClassifier(criterion='entropy', max_depth=4) - clf.fit(X_train, y_train) - print('entropy score: ', clf.score(X_test, y_test)) - - hdc = HellingerDistanceCriterion(1, np.array([2], dtype='int64')) - clf = DecisionTreeClassifier(criterion=hdc, max_depth=4) - clf.fit(X_train, y_train) - print('hellinger distance score: ', clf.score(X_test, y_test)) - - -# Comparison on breast cancer dataset -bc = load_breast_cancer() -X_train, X_test, y_train, y_test = train_test_split(bc.data, - bc.target, - test_size=0.3) -compare_rf(X_train, y_train, X_test, y_test) -compare_dt(X_train, y_train, X_test, y_test) - - -# Comparison on imbalanced dataset -X, y = make_classification(n_samples=10000, n_features=40, - n_informative=5, n_classes=2, - 
weights=[0.05, 0.95], random_state=1)
+X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05, 0.95], random_state=1)
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
-compare_rf(X_train, y_train, X_test, y_test)
-compare_dt(X_train, y_train, X_test, y_test)
+
+hdc = HellingerDistanceCriterion(1, np.array([2], dtype='int64'))
+clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100)
+clf.fit(X_train, y_train)
+print(clf.score(X_test, y_test))
\ No newline at end of file

From 097a58269b725a5b59a366d5a511772d03cffc1b Mon Sep 17 00:00:00 2001
From: edubov
Date: Wed, 25 Jul 2018 13:46:06 +0300
Subject: [PATCH 10/50] updated user guide and api

---
 doc/api.rst        | 17 +++++++++++++++++
 doc/tree_split.rst | 58 ++++++++++++++++++++++++++++++++++++++++++++++
 doc/user_guide.rst |  1 +
 3 files changed, 76 insertions(+)
 create mode 100644 doc/tree_split.rst

diff --git a/doc/api.rst b/doc/api.rst
index f9566146f..a910f3443 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -151,6 +151,23 @@ Imbalance-learn provides some fast-prototyping tools.
 
+.. _tree_split_ref:
+
+:mod:`imblearn.tree_split`: Tree split criterion
+================================================
+
+.. automodule:: imblearn.tree_split
+    :no-members:
+    :no-inherited-members:
+
+.. currentmodule:: imblearn
+
+.. autosummary::
+   :toctree: generated/
+   :template: class.rst
+
+   tree_split.HellingerDistanceCriterion
+
 .. _metrics_ref:
 
 :mod:`imblearn.metrics`: Metrics
 ================================
 
diff --git a/doc/tree_split.rst b/doc/tree_split.rst
new file mode 100644
index 000000000..f73e33594
--- /dev/null
+++ b/doc/tree_split.rst
@@ -0,0 +1,58 @@
+.. _tree-split:
+
+==========
+Tree-split
+==========
+
+.. currentmodule:: imblearn.tree_split
+
+.. _hellinger_distance_criterion:
+
+Hellinger Distance split
+========================
+
+The Hellinger distance is used to quantify the similarity between two
+probability distributions. When used as the split criterion of a decision
+tree classifier, it makes the tree skew-insensitive and helps tackle the
+class-imbalance problem.
+
+    >>> import numpy as np
+    >>> from sklearn.datasets import make_classification
+    >>> from sklearn.model_selection import train_test_split
+    >>> from sklearn.ensemble import RandomForestClassifier
+    >>> from imblearn.tree_split import HellingerDistanceCriterion
+    >>> X, y = make_classification(n_samples=10000, n_features=40,
+    ...                            n_informative=5, n_classes=2,
+    ...                            weights=[0.05, 0.95], random_state=1)
+    >>> X_train, X_test, y_train, y_test = train_test_split(X, y,
+    ...                                                     test_size=0.4)
+    >>> hdc = HellingerDistanceCriterion(1, np.array([2], dtype='int64'))
+    >>> clf = RandomForestClassifier(criterion=hdc, max_depth=4,
+    ...                              n_estimators=100)
+    >>> clf = clf.fit(X_train, y_train)
+    >>> print(clf.score(X_test, y_test))  # doctest: +ELLIPSIS
+    0.94...
+
+:class:`HellingerDistanceCriterion` is a Cython implementation of the
+Hellinger distance as a decision-tree split criterion, compatible with
+sklearn tree-based classification models.
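+
+For a binary problem, the quantity the criterion maximizes is the squared
+Hellinger distance between the two class-conditional distributions over the
+left and right children,
+:math:`(\sqrt{L_0/T_0} - \sqrt{L_1/T_1})^2 + (\sqrt{R_0/T_0} - \sqrt{R_1/T_1})^2`,
+where :math:`L_c` and :math:`R_c` count the class-:math:`c` samples routed to
+each child and :math:`T_c = L_c + R_c`. A plain NumPy sketch of this
+computation (illustrative only; the shipped criterion is the optimized Cython
+version above)::
+
+    import numpy as np
+
+    def hellinger_split_value(left_counts, right_counts):
+        # left_counts[c] / right_counts[c]: class-c samples in each child;
+        # assumes both classes are present at the parent node
+        left = np.asarray(left_counts, dtype=float)
+        right = np.asarray(right_counts, dtype=float)
+        total = left + right
+        p_left, p_right = np.sqrt(left / total), np.sqrt(right / total)
+        return (p_left[0] - p_left[1]) ** 2 + (p_right[0] - p_right[1]) ** 2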
diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 2b4ea6515..1a9698735 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -12,6 +12,7 @@ User Guide introduction.rst over_sampling.rst under_sampling.rst + tree_split.rst combine.rst ensemble.rst miscellaneous.rst From 05172760fed84aba083f2be44282e96906d20293 Mon Sep 17 00:00:00 2001 From: edubov Date: Wed, 25 Jul 2018 14:54:40 +0300 Subject: [PATCH 11/50] fixed LGTM issues --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index ba13dd5cc..20876573b 100755 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ import os import subprocess import sys -from setuptools import find_packages, setup +from setuptools import find_packages # get __version__ from _version.py ver_file = os.path.join('imblearn', '_version.py') @@ -69,7 +69,6 @@ def generate_cython(package): def setup_package(): from numpy.distutils.core import setup - old_path = os.getcwd() local_path = os.path.dirname(os.path.abspath(sys.argv[0])) src_path = local_path From a700cbd3e6d7bcf5bf451f030205cc0343fc8b71 Mon Sep 17 00:00:00 2001 From: edubov Date: Tue, 2 Oct 2018 15:28:42 +0300 Subject: [PATCH 12/50] Merged with https://github.com/glemaitre/imbalanced-learn/commit/27fffea721fe1edafc50d1cfc65d1b857292b6c1 --- .gitignore | 5 + .travis.yml | 8 +- README.rst | 5 +- appveyor.yml | 14 +- build_tools/circle/build_doc.sh | 2 +- build_tools/travis/install.sh | 17 +- conftest.py | 19 + doc/api.rst | 32 +- doc/combine.rst | 4 +- doc/ensemble.rst | 8 +- doc/install.rst | 2 + doc/introduction.rst | 8 +- doc/miscellaneous.rst | 113 +++- doc/over_sampling.rst | 48 +- doc/under_sampling.rst | 35 +- doc/whats_new/v0.0.4.rst | 57 ++ imblearn/__check_build/__init__.py | 2 +- imblearn/__init__.py | 6 + imblearn/base.py | 196 +++---- imblearn/keras/__init__.py | 8 + imblearn/keras/_generator.py | 230 ++++++++ imblearn/keras/tests/__init__.py | 0 imblearn/keras/tests/test_generator.py | 101 ++++ imblearn/setup.py | 41 ++ imblearn/tensorflow/__init__.py | 6 + imblearn/tensorflow/_generator.py | 151 +++++ imblearn/tensorflow/tests/test_generator.py | 89 +++ imblearn/tree/criterion.pxd | 0 .../criterion.pyx} | 84 +-- imblearn/tree/setup.py | 21 + imblearn/tree_split/__init__.py | 1 - imblearn/tree_split/setup.py | 20 - imblearn/under_sampling/__init__.py | 22 +- imblearn/utils/_validation.py | 525 ++++++++++++++++++ requirements.optional.txt | 2 + setup.cfg | 3 +- setup.py | 64 ++- {build_tools/cython => tools}/cythonize.py | 0 38 files changed, 1666 insertions(+), 283 deletions(-) create mode 100644 imblearn/keras/__init__.py create mode 100644 imblearn/keras/_generator.py create mode 100644 imblearn/keras/tests/__init__.py create mode 100644 imblearn/keras/tests/test_generator.py create mode 100644 imblearn/setup.py create mode 100644 imblearn/tensorflow/__init__.py create mode 100644 imblearn/tensorflow/_generator.py create mode 100644 imblearn/tensorflow/tests/test_generator.py create mode 100644 imblearn/tree/criterion.pxd rename imblearn/{tree_split/hellinger_distance_criterion.pyx => tree/criterion.pyx} (63%) create mode 100644 imblearn/tree/setup.py delete mode 100644 imblearn/tree_split/__init__.py delete mode 100644 imblearn/tree_split/setup.py create mode 100644 imblearn/utils/_validation.py create mode 100644 requirements.optional.txt rename {build_tools/cython => tools}/cythonize.py (100%) diff --git a/.gitignore b/.gitignore index 1194254fe..1bf7ee820 100644 --- a/.gitignore +++ b/.gitignore @@ -71,3 +71,8 @@ target/ # PyCharm 
.idea/ + +# Cython +*.c +*.cpp +cythonize.dat diff --git a/.travis.yml b/.travis.yml index 36b502320..650c14bb1 100644 --- a/.travis.yml +++ b/.travis.yml @@ -38,11 +38,11 @@ matrix: NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.19.0" - env: DISTRIB="conda" PYTHON_VERSION="3.6" NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.19.0" - - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="master" + - env: DISTRIB="conda" PYTHON_VERSION="3.7" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" allow_failures: - - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="master" + - env: DISTRIB="conda" PYTHON_VERSION="3.7" + NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh diff --git a/README.rst b/README.rst index 4191b844b..5987a9ca4 100644 --- a/README.rst +++ b/README.rst @@ -55,8 +55,11 @@ imbalanced-learn is tested to work under Python 2.7 and Python 3.5, and * scipy(>=0.13.3) * numpy(>=1.8.2) * scikit-learn(>=0.19.0) +* keras 2 (optional) +* tensorflow (optional) -Additionally, to run the examples, you need matplotlib(>=2.0.0). +Additionally, to run the examples, you need matplotlib(>=2.0.0) and +pandas(>=0.22). Installation ~~~~~~~~~~~~ diff --git a/appveyor.yml b/appveyor.yml index a09272080..6bb885553 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -10,30 +10,37 @@ environment: - PYTHON: "C:\\Miniconda-x64" PYTHON_VERSION: "2.7.x" PYTHON_ARCH: "64" + OPTIONAL_DEP: "pandas" - PYTHON: "C:\\Miniconda" PYTHON_VERSION: "2.7.x" PYTHON_ARCH: "32" + OPTIONAL_DEP: "pandas" - PYTHON: "C:\\Miniconda35-x64" PYTHON_VERSION: "3.5.x" PYTHON_ARCH: "64" + OPTIONAL_DEP: "pandas keras tensorflow" - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.6.x" PYTHON_ARCH: "64" + OPTIONAL_DEP: "pandas keras tensorflow" - PYTHON: "C:\\Miniconda36" PYTHON_VERSION: "3.6.x" PYTHON_ARCH: "32" + OPTIONAL_DEP: "pandas" install: # Prepend miniconda installed Python to the PATH of this build # Add Library/bin directory to fix issue # https://github.com/conda/conda/issues/1753 - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PYTHON%\\Library\\bin;%PATH%" - - conda install pip scipy numpy scikit-learn=0.19 pandas -y -q + - conda install pip scipy numpy scikit-learn=0.19 -y -q + - "conda install %OPTIONAL_DEP% -y -q" - conda install pytest pytest-cov -y -q + - pip install codecov - conda install nose -y -q # FIXME: remove this line when using sklearn > 0.19 - pip install . 
@@ -41,3 +48,8 @@ test_script: - mkdir for_test - cd for_test - pytest --pyargs imblearn --cov-report term-missing --cov=imblearn + +after_test: + - cp .coverage %APPVEYOR_BUILD_FOLDER% + - cd %APPVEYOR_BUILD_FOLDER% + - codecov diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index e49088ae6..a663327b5 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -92,7 +92,7 @@ conda create -n $CONDA_ENV_NAME --yes --quiet python=3 source activate $CONDA_ENV_NAME conda install --yes pip numpy scipy scikit-learn pillow matplotlib sphinx \ - sphinx_rtd_theme numpydoc + sphinx_rtd_theme numpydoc pandas keras nose pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git # Build and install imbalanced-learn in dev mode diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 415e4ce5d..3a56bac81 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,15 @@ if [[ "$DISTRIB" == "conda" ]]; then # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv - conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION pandas + conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION + + if [[ $PYTHON_VERSION == "3.6" ]]; then + conda install --yes pandas + conda install --yes -c conda-forge keras + KERAS_BACKEND=tensorflow + python -c "import keras.backend" + sed -i -e 's/"backend":[[:space:]]*"[^"]*/"backend":\ "'$KERAS_BACKEND'/g' ~/.keras/keras.json; + fi if [[ "$SKLEARN_VERSION" == "master" ]]; then conda install --yes cython @@ -59,8 +67,9 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then # Create a new virtualenv using system site packages for python, numpy virtualenv --system-site-packages testvenv source testvenv/bin/activate - pip install scikit-learn pandas nose nose-timer pytest pytest-cov codecov \ - sphinx numpydoc + pip install scikit-learn + pip install pandas keras tensorflow + pip install nose nose-timer pytest pytest-cov codecov sphinx numpydoc fi @@ -68,7 +77,7 @@ python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" -python setup.py develop +pip install -e . ccache --show-stats # Useful for debugging how ccache is used # cat $CCACHE_LOGFILE diff --git a/conftest.py b/conftest.py index 110fdd479..d3ff91025 100644 --- a/conftest.py +++ b/conftest.py @@ -7,8 +7,27 @@ # Set numpy array str/repr to legacy behaviour on numpy > 1.13 to make # the doctests pass +import os +import pytest import numpy as np + try: np.set_printoptions(legacy='1.13') except TypeError: pass + + +def pytest_runtest_setup(item): + fname = item.fspath.strpath + if (fname.endswith(os.path.join('keras', '_generator.py')) or + fname.endswith('miscellaneous.rst')): + try: + import keras + except ImportError: + pytest.skip('The keras package is not installed.') + elif (fname.endswith(os.path.join('tensorflow', '_generator.py')) or + fname.endswith('miscellaneous.rst')): + try: + import tensorflow + except ImportError: + pytest.skip('The tensorflow package is not installed.') diff --git a/doc/api.rst b/doc/api.rst index a910f3443..6151a3388 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -18,7 +18,7 @@ This is the full API documentation of the `imbalanced-learn` toolbox. Prototype generation -------------------- -.. automodule:: imblearn.under_sampling.prototype_generation +.. 
automodule:: imblearn.under_sampling._prototype_generation :no-members: :no-inherited-members: @@ -33,7 +33,7 @@ Prototype generation Prototype selection ------------------- -.. automodule:: imblearn.under_sampling.prototype_selection +.. automodule:: imblearn.under_sampling._prototype_selection :no-members: :no-inherited-members: @@ -110,6 +110,33 @@ Prototype selection ensemble.BalanceCascade ensemble.BalancedBaggingClassifier ensemble.EasyEnsemble + ensemble.EasyEnsembleClassifier +.. _keras_ref: +:mod:`imblearn.keras`: Batch generator for Keras +================================================ +.. automodule:: imblearn.keras + :no-members: + :no-inherited-members: +.. currentmodule:: imblearn +.. autosummary:: + :toctree: generated/ + :template: class.rst + keras.BalancedBatchGenerator +.. autosummary:: + :toctree: generated/ + :template: function.rst + keras.balanced_batch_generator +.. _tensorflow_ref: +:mod:`imblearn.tensorflow`: Batch generator for TensorFlow +========================================================== +.. automodule:: imblearn.tensorflow + :no-members: + :no-inherited-members: +.. currentmodule:: imblearn +.. autosummary:: + :toctree: generated/ + :template: function.rst + tensorflow.balanced_batch_generator .. _misc_ref: @@ -229,4 +256,3 @@ Imbalance-learn provides some fast-prototyping tools. utils.check_neighbors_object utils.check_ratio utils.check_sampling_strategy - utils.hash_X_y diff --git a/doc/combine.rst b/doc/combine.rst index 165fcc7f0..c8cd21ff9 100644 --- a/doc/combine.rst +++ b/doc/combine.rst @@ -33,12 +33,12 @@ to their former samplers:: [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.combine import SMOTEENN >>> smote_enn = SMOTEENN(random_state=0) - >>> X_resampled, y_resampled = smote_enn.fit_sample(X, y) + >>> X_resampled, y_resampled = smote_enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4060), (1, 4381), (2, 3502)] >>> from imblearn.combine import SMOTETomek >>> smote_tomek = SMOTETomek(random_state=0) - >>> X_resampled, y_resampled = smote_tomek.fit_sample(X, y) + >>> X_resampled, y_resampled = smote_tomek.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4499), (1, 4566), (2, 4413)] diff --git a/doc/ensemble.rst b/doc/ensemble.rst index 86a7ccc2b..d8a7ab45a 100644 --- a/doc/ensemble.rst +++ b/doc/ensemble.rst @@ -11,6 +11,10 @@ Ensemble of samplers Samplers -------- +.. warning:: + Note that those:class:`EasyEnsemble` is deprecated and you should use + :class:`EasyEnsembleClassifier` instead. :class:`EasyEnsembleClassifier` is + presented in the next section. An imbalanced data set can be balanced by creating several balanced subsets. The module :mod:`imblearn.ensemble` allows to create such sets. @@ -28,7 +32,7 @@ under-sampling the original set:: [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.ensemble import EasyEnsemble >>> ee = EasyEnsemble(random_state=0, n_subsets=10) - >>> X_resampled, y_resampled = ee.fit_sample(X, y) + >>> X_resampled, y_resampled = ee.fit_resample(X, y) >>> print(X_resampled.shape) (10, 192, 2) >>> print(sorted(Counter(y_resampled[0]).items())) @@ -50,7 +54,7 @@ parameter ``n_max_subset`` and an additional bootstraping can be activated with >>> bc = BalanceCascade(random_state=0, ... estimator=LogisticRegression(random_state=0), ... 
n_max_subset=4) - >>> X_resampled, y_resampled = bc.fit_sample(X, y) + >>> X_resampled, y_resampled = bc.fit_resample(X, y) >>> print(X_resampled.shape) (4, 192, 2) >>> print(sorted(Counter(y_resampled[0]).items())) diff --git a/doc/install.rst b/doc/install.rst index 6c64827cb..39c6f69da 100644 --- a/doc/install.rst +++ b/doc/install.rst @@ -10,6 +10,8 @@ The imbalanced-learn package requires the following dependencies: * numpy (>=1.8.2) * scipy (>=0.13.3) * scikit-learn (>=0.19.0) +* keras 2 (optional) +* tensorflow (optional) Our release policy is to follow the scikit-learn releases in order to synchronize the new feature. diff --git a/doc/introduction.rst b/doc/introduction.rst index 24c9aca36..6b8aa8cf3 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -18,15 +18,11 @@ and adding a sampling functionality through the ``sample`` method: estimator = obj.fit(data, targets) -:Sampler: +:Resampler: To resample a data sets, each sampler implements:: - data_resampled, targets_resampled = obj.sample(data, targets) - - Fitting and sampling can also be done in one step:: - - data_resampled, targets_resampled = obj.fit_sample(data, targets) + data_resampled, targets_resampled = obj.fit_resample(data, targets) Imbalanced-learn samplers accept the same inputs that in scikit-learn: diff --git a/doc/miscellaneous.rst b/doc/miscellaneous.rst index ef263a21b..9ec380ee4 100644 --- a/doc/miscellaneous.rst +++ b/doc/miscellaneous.rst @@ -28,7 +28,7 @@ to retain the 10 first elements of the array ``X`` and ``y``:: >>> def func(X, y): ... return X[:10], y[:10] >>> sampler = FunctionSampler(func=func) - >>> X_res, y_res = sampler.fit_sample(X, y) + >>> X_res, y_res = sampler.fit_resample(X, y) >>> np.all(X_res == X[:10]) True >>> np.all(y_res == y[:10]) @@ -38,3 +38,114 @@ We illustrate the use of such sampler to implement an outlier rejection estimator which can be easily used within a :class:`imblearn.pipeline.Pipeline`: :ref:`sphx_glr_auto_examples_plot_outlier_rejections.py` + +.. _generators: + +Custom generators +----------------- + +Imbalanced-learn provides specific generators for TensorFlow and Keras which +will generate balanced mini-batches. + +.. _tensorflow_generator: + +TensorFlow generator +~~~~~~~~~~~~~~~~~~~~ + +The :func:`imblearn.tensorflow.balanced_batch_generator` allow to generate +balanced mini-batches using an imbalanced-learn sampler which returns indices:: + + >>> X = X.astype(np.float32) + >>> from imblearn.under_sampling import RandomUnderSampler + >>> from imblearn.tensorflow import balanced_batch_generator + >>> training_generator, steps_per_epoch = balanced_batch_generator( + ... X, y, sample_weight=None, sampler=RandomUnderSampler(), + ... batch_size=10, random_state=42) + +The ``generator`` and ``steps_per_epoch`` is used during the training of the +Tensorflow model. We will illustrate how to use this generator. First, we can +define a logistic regression model which will be optimized by a gradient +descent:: + + >>> learning_rate, epochs = 0.01, 10 + >>> input_size, output_size = X.shape[1], 3 + >>> import tensorflow as tf + >>> def init_weights(shape): + ... return tf.Variable(tf.random_normal(shape, stddev=0.01)) + >>> def accuracy(y_true, y_pred): + ... 
return np.mean(np.argmax(y_pred, axis=1) == y_true) + >>> # input and output + >>> data = tf.placeholder("float32", shape=[None, input_size]) + >>> targets = tf.placeholder("int32", shape=[None]) + >>> # build the model and weights + >>> W = init_weights([input_size, output_size]) + >>> b = init_weights([output_size]) + >>> out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) + >>> # build the loss, predict, and train operator + >>> cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + ... logits=out_act, labels=targets) + >>> loss = tf.reduce_sum(cross_entropy) + >>> optimizer = tf.train.GradientDescentOptimizer(learning_rate) + >>> train_op = optimizer.minimize(loss) + >>> predict = tf.nn.softmax(out_act) + >>> # Initialization of all variables in the graph + >>> init = tf.global_variables_initializer() + +Once initialized, the model is trained by iterating on balanced mini-batches of +data and minimizing the loss previously defined:: + + >>> with tf.Session() as sess: + ... print('Starting training') + ... sess.run(init) + ... for e in range(epochs): + ... for i in range(steps_per_epoch): + ... X_batch, y_batch = next(training_generator) + ... sess.run([train_op, loss], feed_dict={data: X_batch, targets: y_batch}) + ... # For each epoch, run accuracy on train and test + ... feed_dict = dict() + ... predicts_train = sess.run(predict, feed_dict={data: X}) + ... print("epoch: {} train accuracy: {:.3f}" + ... .format(e, accuracy(y, predicts_train))) + ... # doctest: +ELLIPSIS + Starting training + [... + +.. _keras_generator: + +Keras generator +~~~~~~~~~~~~~~~ + +Keras provides an higher level API in which a model can be defined and train by +calling ``fit_generator`` method to train the model. To illustrate, we will +define a logistic regression model:: + + >>> import keras + >>> y = keras.utils.to_categorical(y, 3) + >>> model = keras.Sequential() + >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], + ... activation='softmax')) + >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', + ... metrics=['accuracy']) + +:func:`imblearn.keras.balanced_batch_generator` creates a balanced mini-batches +generator with the associated number of mini-batches which will be generated:: + + >>> from imblearn.keras import balanced_batch_generator + >>> training_generator, steps_per_epoch = balanced_batch_generator( + ... X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42) + +Then, ``fit_generator`` can be called passing the generator and the step:: + + >>> callback_history = model.fit_generator(generator=training_generator, + ... steps_per_epoch=steps_per_epoch, + ... epochs=10, verbose=0) + +The second possibility is to use +:class:`imblearn.keras.BalancedBatchGenerator`. Only an instance of this class +will be passed to ``fit_generator``:: + + >>> from imblearn.keras import BalancedBatchGenerator + >>> training_generator = BalancedBatchGenerator( + ... X, y, sampler=RandomUnderSampler(), batch_size=10, random_state=42) + >>> callback_history = model.fit_generator(generator=training_generator, + ... epochs=10, verbose=0) diff --git a/doc/over_sampling.rst b/doc/over_sampling.rst index 0d387cc44..4e7ea4d4d 100644 --- a/doc/over_sampling.rst +++ b/doc/over_sampling.rst @@ -27,7 +27,7 @@ randomly sampling with replacement the current available samples. The ... 
class_sep=0.8, random_state=0) >>> from imblearn.over_sampling import RandomOverSampler >>> ros = RandomOverSampler(random_state=0) - >>> X_resampled, y_resampled = ros.fit_sample(X, y) + >>> X_resampled, y_resampled = ros.fit_resample(X, y) >>> from collections import Counter >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] @@ -52,6 +52,22 @@ As a result, the majority class does not take over the other classes during the training process. Consequently, all classes are represented by the decision function. +In addition, :class:`RandomOverSampler` allows to sample heterogeneous data +(e.g. containing some strings):: + + >>> import numpy as np + >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], + ... dtype=np.object) + >>> y_hetero = np.array([0, 0, 1]) + >>> X_resampled, y_resampled = ros.fit_resample(X_hetero, y_hetero) + >>> print(X_resampled) + [['xxx' 1 1.0] + ['yyy' 2 2.0] + ['zzz' 3 3.0] + ['zzz' 3 3.0]] + >>> print(y_resampled) + [0 0 1 1] + See :ref:`sphx_glr_auto_examples_over-sampling_plot_random_over_sampling.py` for usage example. @@ -66,11 +82,11 @@ to over-sample minority classes: (i) the Synthetic Minority Oversampling Techniq can be used in the same manner:: >>> from imblearn.over_sampling import SMOTE, ADASYN - >>> X_resampled, y_resampled = SMOTE().fit_sample(X, y) + >>> X_resampled, y_resampled = SMOTE().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] >>> clf_smote = LinearSVC().fit(X_resampled, y_resampled) - >>> X_resampled, y_resampled = ADASYN().fit_sample(X, y) + >>> X_resampled, y_resampled = ADASYN().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4673), (1, 4662), (2, 4674)] >>> clf_adasyn = LinearSVC().fit(X_resampled, y_resampled) @@ -127,11 +143,11 @@ nearest neighbors class. Those variants are presented in the figure below. :align: center -The parameter ``kind`` is controlling this feature and the following types are -available: (i) ``'borderline1'``, (ii) ``'borderline2'``, and (iii) ``'svm'``:: +The :class:`BorderlineSMOTE` and :class:`SVMSMOTE` offer some variant of the SMOTE +algorithm:: - >>> from imblearn.over_sampling import SMOTE, ADASYN - >>> X_resampled, y_resampled = SMOTE(kind='borderline1').fit_sample(X, y) + >>> from imblearn.over_sampling import BorderlineSMOTE + >>> X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 4674), (1, 4674), (2, 4674)] @@ -168,12 +184,11 @@ interpolation will create a sample on the line between :math:`x_{i}` and Each SMOTE variant and ADASYN differ from each other by selecting the samples :math:`x_i` ahead of generating the new samples. -The **regular** SMOTE algorithm --- cf. to ``kind='regular'`` when -instantiating a :class:`SMOTE` object --- does not impose any rule and will -randomly pick-up all possible :math:`x_i` available. +The **regular** SMOTE algorithm --- cf. to the :class:`SMOTE` object --- does not +impose any rule and will randomly pick-up all possible :math:`x_i` available. -The **borderline** SMOTE --- cf. to ``kind='borderline1'`` and -``kind='borderline2'`` when instantiating a :class:`SMOTE` object --- will +The **borderline** SMOTE --- cf. to the :class:`BorderlineSMOTE` with the +parameters ``kind='borderline-1'`` and ``kind='borderline-2'`` --- will classify each sample :math:`x_i` to be (i) noise (i.e. 
all nearest-neighbors are
from a different class than the one of :math:`x_i`), (ii) in danger
(i.e. at least half of the nearest neighbors are from the same class as
@@ -184,10 +199,9 @@ samples *in danger* to generate new samples. In **Borderline-1** SMOTE,
 :math:`x_i`. On the contrary, **Borderline-2** SMOTE will consider
 :math:`x_{zi}` which can be from any class.
 
-**SVM** SMOTE --- cf. to ``kind='svm'`` when instantiating a :class:`SMOTE`
-object --- uses an SVM classifier to find support vectors and generate samples
-considering them. Note that the ``C`` parameter of the SVM classifier allows to
-select more or less support vectors.
+**SVM** SMOTE --- cf. to :class:`SVMSMOTE` --- uses an SVM classifier to find
+support vectors and generate samples considering them. Note that the ``C``
+parameter of the SVM classifier allows selecting more or fewer support vectors.
 
 For both borderline and SVM SMOTE, a neighborhood is defined using the
 parameter ``m_neighbors`` to decide if a sample is in danger, safe, or noise.
 
@@ -196,7 +210,7 @@ ADASYN works similarly to the regular SMOTE. However, the number of
 samples generated for each :math:`x_i` is proportional to the number of
 samples which are not from the same class as :math:`x_i` in a given
 neighborhood. Therefore, more samples will be generated in areas where the
-nearest neighbor rule is not respected. The parameter ``n_neighbors`` is
+nearest neighbor rule is not respected. The parameter ``m_neighbors`` is
 equivalent to ``k_neighbors`` in :class:`SMOTE`.
 
 Multi-class management
diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst
index f2412528e..c621d40cb 100644
--- a/doc/under_sampling.rst
+++ b/doc/under_sampling.rst
@@ -32,7 +32,7 @@ K-means method instead of the original samples::
    [(0, 64), (1, 262), (2, 4674)]
    >>> from imblearn.under_sampling import ClusterCentroids
    >>> cc = ClusterCentroids(random_state=0)
-   >>> X_resampled, y_resampled = cc.fit_sample(X, y)
+   >>> X_resampled, y_resampled = cc.fit_resample(X, y)
    >>> print(sorted(Counter(y_resampled).items()))
    [(0, 64), (1, 64), (2, 64)]
@@ -82,7 +82,7 @@ randomly selecting a subset of data for the targeted classes::
 
    >>> from imblearn.under_sampling import RandomUnderSampler
    >>> rus = RandomUnderSampler(random_state=0)
-   >>> X_resampled, y_resampled = rus.fit_sample(X, y)
+   >>> X_resampled, y_resampled = rus.fit_resample(X, y)
    >>> print(sorted(Counter(y_resampled).items()))
    [(0, 64), (1, 64), (2, 64)]
@@ -99,10 +99,23 @@ by considering independently each targeted class::
    >>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
    (192, 2)
    >>> rus = RandomUnderSampler(random_state=0, replacement=True)
-   >>> X_resampled, y_resampled = rus.fit_sample(X, y)
+   >>> X_resampled, y_resampled = rus.fit_resample(X, y)
    >>> print(np.vstack({tuple(row) for row in X_resampled}).shape)
    (181, 2)
 
+In addition, :class:`RandomUnderSampler` allows sampling heterogeneous data
+(e.g. containing some strings)::
+
+   >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]],
+   ...                     dtype=np.object)
+   >>> y_hetero = np.array([0, 0, 1])
+   >>> X_resampled, y_resampled = rus.fit_resample(X_hetero, y_hetero)
+   >>> print(X_resampled)
+   [['xxx' 1 1.0]
+    ['zzz' 3 3.0]]
+   >>> print(y_resampled)
+   [0 1]
+
 See :ref:`sphx_glr_auto_examples_plot_sampling_strategy_usage.py`,
 :ref:`sphx_glr_auto_examples_under-sampling_plot_comparison_under_sampling.py`,
 and :ref:`sphx_glr_auto_examples_under-sampling_plot_random_under_sampler.py`.
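+
+As with every sampler, a minimal sketch of end-to-end usage (assuming the
+``X`` and ``y`` built above) is to chain the sampler with a classifier in an
+imbalanced-learn pipeline, so that the resampling is only applied to the
+training data::
+
+   >>> from imblearn.pipeline import make_pipeline
+   >>> from sklearn.linear_model import LogisticRegression
+   >>> pipeline = make_pipeline(RandomUnderSampler(random_state=0),
+   ...                          LogisticRegression())
+   >>> pipeline = pipeline.fit(X, y)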
@@ -113,7 +126,7 @@ be selected with the parameter ``version``:: >>> from imblearn.under_sampling import NearMiss >>> nm1 = NearMiss(version=1) - >>> X_resampled_nm1, y_resampled = nm1.fit_sample(X, y) + >>> X_resampled_nm1, y_resampled = nm1.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] @@ -248,7 +261,7 @@ the sample inspected to keep it in the dataset:: [(0, 64), (1, 262), (2, 4674)] >>> from imblearn.under_sampling import EditedNearestNeighbours >>> enn = EditedNearestNeighbours() - >>> X_resampled, y_resampled = enn.fit_sample(X, y) + >>> X_resampled, y_resampled = enn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 213), (2, 4568)] @@ -262,7 +275,7 @@ Generally, repeating the algorithm will delete more data:: >>> from imblearn.under_sampling import RepeatedEditedNearestNeighbours >>> renn = RepeatedEditedNearestNeighbours() - >>> X_resampled, y_resampled = renn.fit_sample(X, y) + >>> X_resampled, y_resampled = renn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 208), (2, 4551)] @@ -272,7 +285,7 @@ internal nearest neighbors algorithm is increased at each iteration:: >>> from imblearn.under_sampling import AllKNN >>> allknn = AllKNN() - >>> X_resampled, y_resampled = allknn.fit_sample(X, y) + >>> X_resampled, y_resampled = allknn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 220), (2, 4601)] @@ -310,7 +323,7 @@ The :class:`CondensedNearestNeighbour` can be used in the following manner:: >>> from imblearn.under_sampling import CondensedNearestNeighbour >>> cnn = CondensedNearestNeighbour(random_state=0) - >>> X_resampled, y_resampled = cnn.fit_sample(X, y) + >>> X_resampled, y_resampled = cnn.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 24), (2, 115)] @@ -325,7 +338,7 @@ used as:: >>> from imblearn.under_sampling import OneSidedSelection >>> oss = OneSidedSelection(random_state=0) - >>> X_resampled, y_resampled = oss.fit_sample(X, y) + >>> X_resampled, y_resampled = oss.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 174), (2, 4403)] @@ -339,7 +352,7 @@ neighbors classifier. The class can be used as:: >>> from imblearn.under_sampling import NeighbourhoodCleaningRule >>> ncr = NeighbourhoodCleaningRule() - >>> X_resampled, y_resampled = ncr.fit_sample(X, y) + >>> X_resampled, y_resampled = ncr.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 234), (2, 4666)] @@ -367,7 +380,7 @@ removed. The class can be used as:: >>> from imblearn.under_sampling import InstanceHardnessThreshold >>> iht = InstanceHardnessThreshold(random_state=0, ... estimator=LogisticRegression()) - >>> X_resampled, y_resampled = iht.fit_sample(X, y) + >>> X_resampled, y_resampled = iht.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.0.4.rst index e7017f24f..e873b3d83 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.0.4.rst @@ -18,6 +18,21 @@ API - Enable to use a ``list`` for the cleaning methods to specify the class to sample. :issue:`411` by :user:`Guillaume Lemaitre `. +- Replace ``fit_sample`` by ``fit_resample``. An alias is still available for + backward compatibility. In addition, ``sample`` has been removed to avoid + resampling on different set of data. + :issue:`462` by :user:`Guillaume Lemaitre `. + +New features +............ 
+
+- Add ``keras`` and ``tensorflow`` modules to create balanced mini-batch
+  generators. :issue:`409` by :user:`Guillaume Lemaitre `.
+
+- Add :class:`imblearn.ensemble.EasyEnsembleClassifier` which creates a bag of
+  AdaBoost classifiers trained on balanced bootstrap samples.
+  :issue:`455` by :user:`Guillaume Lemaitre `.
+
 Enhancement
 ...........
 
@@ -30,6 +45,20 @@ Enhancement
 - Add support for one-vs-all encoded target to support keras. :issue:`409` by
   :user:`Guillaume Lemaitre `.
 
+- Add specific classes for borderline and SVM SMOTE using
+  :class:`BorderlineSMOTE` and :class:`SVMSMOTE`.
+  :issue:`440` by :user:`Guillaume Lemaitre `.
+
+- Allow :class:`imblearn.over_sampling.RandomOverSampler` to return indices
+  using the attribute ``return_indices``.
+  :issue:`439` by :user:`Hugo Gascon` and
+  :user:`Guillaume Lemaitre `.
+
+- Allow :class:`imblearn.under_sampling.RandomUnderSampler` and
+  :class:`imblearn.over_sampling.RandomOverSampler` to sample object arrays
+  containing strings.
+  :issue:`451` by :user:`Guillaume Lemaitre `.
+
 - Add support for Hellinger Distance as sklearn classification tree split
   criterion. By :user:`Evgeni Dubov `.
 
@@ -44,11 +73,28 @@ Bug fixes
   generating new samples.
   :issue:`354` by :user:`Guillaume Lemaitre `.
 
+- Fix bug to sort the ``sampling_strategy`` dictionary and thus obtain
+  deterministic results when using the same random state.
+  :issue:`447` by :user:`Guillaume Lemaitre `.
+
+- Force cloning of scikit-learn estimators passed as attributes to samplers.
+  :issue:`446` by :user:`Guillaume Lemaitre `.
+
+- Fix bug which was not preserving the dtype of X and y when generating
+  samples.
+  :issue:`450` by :user:`Guillaume Lemaitre `.
+
+- Add the option to pass a ``Memory`` object to :func:`make_pipeline` as in
+  the :class:`pipeline.Pipeline` class.
+  :issue:`458` by :user:`Christos Aridas `.
 
 Maintenance
 ...........
 
 - Remove deprecated parameters in 0.2 -
   :issue:`331` by :user:`Guillaume Lemaitre `.
 
+- Make some modules private.
+  :issue:`452` by :user:`Guillaume Lemaitre `.
+
+Documentation
+.............
+
+- Remove some docstrings which are not necessary.
+  :issue:`454` by :user:`Guillaume Lemaitre `.
 
 Deprecation
 ...........
@@ -66,3 +112,14 @@ Deprecation
   :class:`imblearn.under_sampling.NeighbourhoodCleaningRule`,
   :class:`imblearn.under_sampling.InstanceHardnessThreshold`,
   :class:`imblearn.under_sampling.CondensedNearestNeighbours`.
+
+- Deprecate ``kind``, ``out_step``, ``svm_estimator``, ``m_neighbors`` in
+  :class:`imblearn.over_sampling.SMOTE`. Users should use
+  :class:`imblearn.over_sampling.SVMSMOTE` and
+  :class:`imblearn.over_sampling.BorderlineSMOTE` instead.
+  :issue:`440` by :user:`Guillaume Lemaitre `.
+
+- Deprecate :class:`imblearn.ensemble.EasyEnsemble` in favor of the
+  meta-estimator :class:`imblearn.ensemble.EasyEnsembleClassifier` which
+  follows the exact algorithm described in the literature.
+  :issue:`455` by :user:`Guillaume Lemaitre `.
diff --git a/imblearn/__check_build/__init__.py b/imblearn/__check_build/__init__.py
index 7e7491f26..8d759d70f 100644
--- a/imblearn/__check_build/__init__.py
+++ b/imblearn/__check_build/__init__.py
@@ -5,7 +5,7 @@
 """
 import os
 
-PACKAGE_NAME = 'cython_template'
+PACKAGE_NAME = 'imblearn'
 
 INPLACE_MSG = """
It appears that you are importing {package} from within the source tree.
diff --git a/imblearn/__init__.py b/imblearn/__init__.py index 9f05adb1f..0cb3ca8fe 100644 --- a/imblearn/__init__.py +++ b/imblearn/__init__.py @@ -13,11 +13,17 @@ exceptions Module including custom warnings and error clases used across imbalanced-learn. +keras + Module which provides custom generator, layers for deep learning using + keras. metrics Module which provides metrics to quantified the classification performance with imbalanced dataset. over_sampling Module which provides methods to under-sample a dataset. +tensorflow + Module which provides custom generator, layers for deep learning using + tensorflow. under-sampling Module which provides methods to over-sample a dataset. utils diff --git a/imblearn/base.py b/imblearn/base.py index dbfe08070..661f928da 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -16,9 +16,8 @@ from sklearn.externals import six from sklearn.preprocessing import label_binarize from sklearn.utils import check_X_y -from sklearn.utils.validation import check_is_fitted -from .utils import check_sampling_strategy, check_target_type, hash_X_y +from .utils import check_sampling_strategy, check_target_type from .utils.deprecation import deprecate_parameter @@ -31,54 +30,33 @@ class SamplerMixin(six.with_metaclass(ABCMeta, BaseEstimator)): _estimator_type = 'sampler' - def _check_X_y(self, X, y): - """Private function to check that the X and y in fitting are the same - than in sampling.""" - X_hash, y_hash = hash_X_y(X, y) - if self.X_hash_ != X_hash or self.y_hash_ != y_hash: - raise RuntimeError("X and y need to be same array earlier fitted.") + def fit(self, X, y): + """Check inputs and statistics of the sampler. - def sample(self, X, y): - """Resample the dataset. + You should use ``fit_resample`` in all cases. Parameters ---------- X : {array-like, sparse matrix}, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. + Data array. y : array-like, shape (n_samples,) - Corresponding label for each sample in X. + Target array. Returns ------- - X_resampled : {ndarray, sparse matrix}, shape \ -(n_samples_new, n_features) - The array containing the resampled data. - - y_resampled : ndarray, shape (n_samples_new) - The corresponding label of `X_resampled` + self : object + Return the instance itself. """ - # Check the consistency of X and y - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - - check_is_fitted(self, 'sampling_strategy_') - self._check_X_y(X, y) - - output = self._sample(X, y) - - if binarize_y: - y_sampled = label_binarize(output[1], np.unique(y)) - if len(output) == 2: - return output[0], y_sampled - else: - return output[0], y_sampled, output[2] - else: - return output + self._deprecate_ratio() + X, y, _ = self._check_X_y(X, y) + self.sampling_strategy_ = check_sampling_strategy( + self.sampling_strategy, y, self._sampling_type) + return self - def fit_sample(self, X, y): - """Fit the statistics and resample the data directly. + def fit_resample(self, X, y): + """Resample the dataset. Parameters ---------- @@ -95,15 +73,32 @@ def fit_sample(self, X, y): The array containing the resampled data. y_resampled : array-like, shape (n_samples_new,) - The corresponding label of `X_resampled` + The corresponding label of `X_resampled`. 
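+
+        Examples
+        --------
+        A minimal sketch of the call pattern with a concrete sampler (a
+        random under-sampler is assumed here purely for illustration):
+
+        >>> import numpy as np
+        >>> from imblearn.under_sampling import RandomUnderSampler
+        >>> X = np.array([[1.], [2.], [3.], [4.], [5.], [6.]])
+        >>> y = np.array([0, 0, 0, 0, 1, 1])
+        >>> X_res, y_res = RandomUnderSampler().fit_resample(X, y)
+        >>> X_res.shape, y_res.shape
+        ((4, 1), (4,))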
""" + self._deprecate_ratio() - return self.fit(X, y).sample(X, y) + X, y, binarize_y = self._check_X_y(X, y) + + self.sampling_strategy_ = check_sampling_strategy( + self.sampling_strategy, y, self._sampling_type) + + output = self._fit_resample(X, y) + + if binarize_y: + y_sampled = label_binarize(output[1], np.unique(y)) + if len(output) == 2: + return output[0], y_sampled + return output[0], y_sampled, output[2] + return output + + # define an alias for back-compatibility + fit_sample = fit_resample @abstractmethod - def _sample(self, X, y): - """Resample the dataset. + def _fit_resample(self, X, y): + """Base method defined in each sampler to defined the sampling + strategy. Parameters ---------- @@ -125,18 +120,6 @@ def _sample(self, X, y): """ pass - def __getstate__(self): - """Prevent logger from being pickled.""" - object_dictionary = self.__dict__.copy() - del object_dictionary['logger'] - return object_dictionary - - def __setstate__(self, dict): - """Re-open the logger.""" - logger = logging.getLogger(self.__module__) - self.__dict__.update(dict) - self.logger = logger - class BaseSampler(SamplerMixin): """Base class for sampling algorithms. @@ -151,6 +134,12 @@ def __init__(self, sampling_strategy='auto', ratio=None): self.ratio = ratio self.logger = logging.getLogger(self.__module__) + @staticmethod + def _check_X_y(X, y): + y, binarize_y = check_target_type(y, indicate_one_vs_all=True) + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) + return X, y, binarize_y + @property def ratio_(self): # FIXME: remove in 0.6 @@ -165,38 +154,24 @@ def _deprecate_ratio(self): deprecate_parameter(self, '0.4', 'ratio', 'sampling_strategy') self.sampling_strategy = self.ratio - def fit(self, X, y): - """Find the classes statistics before to perform sampling. - - Parameters - ---------- - X : {array-like, sparse matrix}, shape (n_samples, n_features) - Matrix containing the data which have to be sampled. - - y : array-like, shape (n_samples,) - Corresponding label for each sample in X. - - Returns - ------- - self : object, - Return self. - - """ - self._deprecate_ratio() - y = check_target_type(y) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - self.X_hash_, self.y_hash_ = hash_X_y(X, y) - self.sampling_strategy_ = check_sampling_strategy( - self.sampling_strategy, y, self._sampling_type) + def __getstate__(self): + """Prevent logger from being pickled.""" + object_dictionary = self.__dict__.copy() + del object_dictionary['logger'] + return object_dictionary - return self + def __setstate__(self, dict): + """Re-open the logger.""" + logger = logging.getLogger(self.__module__) + self.__dict__.update(dict) + self.logger = logger def _identity(X, y): return X, y -class FunctionSampler(SamplerMixin): +class FunctionSampler(BaseSampler): """Construct a sampler from calling an arbitrary callable. Read more in the :ref:`User Guide `. @@ -235,7 +210,7 @@ class FunctionSampler(SamplerMixin): >>> def func(X, y): ... return X[:10], y[:10] >>> sampler = FunctionSampler(func=func) - >>> X_res, y_res = sampler.fit_sample(X, y) + >>> X_res, y_res = sampler.fit_resample(X, y) >>> np.all(X_res == X[:10]) True >>> np.all(y_res == y[:10]) @@ -246,68 +221,31 @@ class FunctionSampler(SamplerMixin): >>> from collections import Counter >>> from imblearn.under_sampling import RandomUnderSampler >>> def func(X, y, sampling_strategy, random_state): - ... return RandomUnderSampler(sampling_strategy=sampling_strategy, - ... random_state=random_state).fit_sample(X, y) + ... return RandomUnderSampler( + ... 
sampling_strategy=sampling_strategy, + ... random_state=random_state).fit_resample(X, y) >>> sampler = FunctionSampler(func=func, ... kw_args={'sampling_strategy': 'auto', ... 'random_state': 0}) - >>> X_res, y_res = sampler.fit_sample(X, y) + >>> X_res, y_res = sampler.fit_resample(X, y) >>> print('Resampled dataset shape {}'.format( ... sorted(Counter(y_res).items()))) Resampled dataset shape [(0, 100), (1, 100)] """ + _sampling_type = 'bypass' + def __init__(self, func=None, accept_sparse=True, kw_args=None): + super(FunctionSampler, self).__init__() self.func = func self.accept_sparse = accept_sparse self.kw_args = kw_args self.logger = logging.getLogger(__name__) - def fit(self, X, y): - y = check_target_type(y) - X, y = check_X_y( - X, - y, - accept_sparse=['csr', 'csc'] if self.accept_sparse else False) - self.X_hash_, self.y_hash_ = hash_X_y(X, y) - # when using a sampler, ratio_ is supposed to exist after fit - self.sampling_strategy_ = 'is_fitted' - - return self - - @property - def ratio_(self): - # FIXME: remove in 0.6 - warnings.warn("'ratio' and 'ratio_' are deprecated. Use " - "'sampling_strategy' and 'sampling_strategy_' instead.", - DeprecationWarning) - return self.sampling_strategy_ - - def _sample(self, X, y, func=None, kw_args=None): - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y( - X, - y, - accept_sparse=['csr', 'csc'] if self.accept_sparse else False) - check_is_fitted(self, 'sampling_strategy_') - X_hash, y_hash = hash_X_y(X, y) - if self.X_hash_ != X_hash or self.y_hash_ != y_hash: - raise RuntimeError("X and y need to be same array earlier fitted.") - - if func is None: - func = _identity - - output = func(X, y, **(kw_args if self.kw_args else {})) - - if binarize_y: - y_sampled = label_binarize(output[1], np.unique(y)) - if len(output) == 2: - return output[0], y_sampled - else: - return output[0], y_sampled, output[2] - else: - return output - - def sample(self, X, y): - return self._sample(X, y, func=self.func, kw_args=self.kw_args) + def _fit_resample(self, X, y): + X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'] + if self.accept_sparse else False) + func = _identity if self.func is None else self.func + output = func(X, y, **(self.kw_args if self.kw_args else {})) + return output diff --git a/imblearn/keras/__init__.py b/imblearn/keras/__init__.py new file mode 100644 index 000000000..407e0c7dd --- /dev/null +++ b/imblearn/keras/__init__.py @@ -0,0 +1,8 @@ +"""The :mod:`imblearn.keras` provides utilities to deal with imbalanced dataset +in keras.""" + +from ._generator import BalancedBatchGenerator +from ._generator import balanced_batch_generator + +__all__ = ['BalancedBatchGenerator', + 'balanced_batch_generator'] diff --git a/imblearn/keras/_generator.py b/imblearn/keras/_generator.py new file mode 100644 index 000000000..a92e8ea9d --- /dev/null +++ b/imblearn/keras/_generator.py @@ -0,0 +1,230 @@ +"""Implement generators for ``keras`` which will balance the data.""" +from __future__ import division + +# This is a trick to avoid an error during tests collection with pytest. We +# avoid the error when importing the package raise the error at the moment of +# creating the instance. 
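+# In other words: ``import imblearn.keras`` should succeed even when keras is
+# not installed; the ImportError is only raised later, when instantiating
+# ``BalancedBatchGenerator`` (see the ``HAS_KERAS`` flag below).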
+try: + import keras + ParentClass = keras.utils.Sequence + HAS_KERAS = True +except ImportError: + ParentClass = object + HAS_KERAS = False + +from scipy.sparse import issparse + +from sklearn.base import clone +from sklearn.utils import safe_indexing +from sklearn.utils import check_random_state +from sklearn.utils.testing import set_random_state + +from ..under_sampling import RandomUnderSampler +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring +from ..tensorflow import balanced_batch_generator as tf_bbg + + +class BalancedBatchGenerator(ParentClass): + """Create balanced batches when training a keras model. + + Create a keras ``Sequence`` which is given to ``fit_generator``. The + sampler defines the sampling strategy used to balance the dataset ahead of + creating the batch. The sampler should have an attribute + ``return_indices``. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Original imbalanced dataset. + + y : ndarray, shape (n_samples,) or (n_samples, n_classes) + Associated targets. + + sample_weight : ndarray, shape (n_samples,) + Sample weight. + + sampler : object or None, optional (default=RandomUnderSampler) + A sampler instance which has an attribute ``return_indices``. + By default, the sampler used is a + :class:`imblearn.under_sampling.RandomUnderSampler`. + + batch_size : int, optional (default=32) + Number of samples per gradient update. + + keep_sparse : bool, optional (default=False) + Either or not to conserve or not the sparsity of the input (i.e. ``X``, + ``y``, ``sample_weight``). By default, the returned batches will be + dense. + + random_state : int, RandomState instance or None, optional (default=None) + Control the randomization of the algorithm + - If int, ``random_state`` is the seed used by the random number + generator; + - If ``RandomState`` instance, random_state is the random number + generator; + - If ``None``, the random number generator is the ``RandomState`` + instance used by ``np.random``. + + Attributes + ---------- + sampler_ : object + The sampler used to balance the dataset. + + indices_ : ndarray, shape (n_samples, n_features) + The indices of the samples selected during sampling. + + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> iris = load_iris() + >>> from imblearn.datasets import make_imbalance + >>> class_dict = dict() + >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 + >>> X, y = make_imbalance(iris.data, iris.target, class_dict) + >>> import keras + >>> y = keras.utils.to_categorical(y, 3) + >>> model = keras.models.Sequential() + >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], + ... activation='softmax')) + >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', + ... metrics=['accuracy']) + >>> from imblearn.keras import BalancedBatchGenerator + >>> from imblearn.under_sampling import NearMiss + >>> training_generator = BalancedBatchGenerator( + ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) + >>> callback_history = model.fit_generator(generator=training_generator, + ... 
epochs=10, verbose=0) + + """ + def __init__(self, X, y, sample_weight=None, sampler=None, batch_size=32, + keep_sparse=False, random_state=None): + if not HAS_KERAS: + raise ImportError("'No module named 'keras'") + self.X = X + self.y = y + self.sample_weight = sample_weight + self.sampler = sampler + self.batch_size = batch_size + self.keep_sparse = keep_sparse + self.random_state = random_state + self._sample() + + def _sample(self): + random_state = check_random_state(self.random_state) + if self.sampler is None: + self.sampler_ = RandomUnderSampler(return_indices=True, + random_state=random_state) + else: + if not hasattr(self.sampler, 'return_indices'): + raise ValueError("'sampler' needs to return the indices of " + "the samples selected. Provide a sampler " + "which has an attribute 'return_indices'.") + self.sampler_ = clone(self.sampler) + self.sampler_.set_params(return_indices=True) + set_random_state(self.sampler_, random_state) + + _, _, self.indices_ = self.sampler_.fit_resample(self.X, self.y) + # shuffle the indices since the sampler are packing them by class + random_state.shuffle(self.indices_) + + def __len__(self): + return int(self.indices_.size // self.batch_size) + + def __getitem__(self, index): + X_resampled = safe_indexing( + self.X, self.indices_[index * self.batch_size: + (index + 1) * self.batch_size]) + y_resampled = safe_indexing( + self.y, self.indices_[index * self.batch_size: + (index + 1) * self.batch_size]) + if issparse(X_resampled) and not self.keep_sparse: + X_resampled = X_resampled.toarray() + if self.sample_weight is not None: + sample_weight_resampled = safe_indexing( + self.sample_weight, + self.indices_[index * self.batch_size: + (index + 1) * self.batch_size]) + + if self.sample_weight is None: + return X_resampled, y_resampled + else: + return X_resampled, y_resampled, sample_weight_resampled + + +@Substitution(random_state=_random_state_docstring) +def balanced_batch_generator(X, y, sample_weight=None, sampler=None, + batch_size=32, keep_sparse=False, + random_state=None): + """Create a balanced batch generator to train keras model. + + Returns a generator --- as well as the number of step per epoch --- which + is given to ``fit_generator``. The sampler defines the sampling strategy + used to balance the dataset ahead of creating the batch. The sampler should + have an attribute ``return_indices``. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Original imbalanced dataset. + + y : ndarray, shape (n_samples,) or (n_samples, n_classes) + Associated targets. + + sample_weight : ndarray, shape (n_samples,) + Sample weight. + + sampler : object or None, optional (default=RandomUnderSampler) + A sampler instance which has an attribute ``return_indices``. + By default, the sampler used is a + :class:`imblearn.under_sampling.RandomUnderSampler`. + + batch_size : int, optional (default=32) + Number of samples per gradient update. + + keep_sparse : bool, optional (default=False) + Either or not to conserve or not the sparsity of the input (i.e. ``X``, + ``y``, ``sample_weight``). By default, the returned batches will be + dense. + + {random_state} + + Returns + ------- + generator : generator of tuple + Generate batch of data. The tuple generated are either (X_batch, + y_batch) or (X_batch, y_batch, sampler_weight_batch). + + steps_per_epoch : int + The number of samples per epoch. Required by ``fit_generator`` in + keras. 
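+
+    Notes
+    -----
+    ``steps_per_epoch`` counts mini-batches rather than samples: the
+    underlying generator yields ``indices.size // batch_size`` batches per
+    epoch. As a sketch of the arithmetic, 60 resampled samples with
+    ``batch_size=10`` give 6 steps per epoch.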
+ + Examples + -------- + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> from imblearn.datasets import make_imbalance + >>> class_dict = dict() + >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 + >>> from imblearn.datasets import make_imbalance + >>> X, y = make_imbalance(X, y, class_dict) + >>> import keras + >>> y = keras.utils.to_categorical(y, 3) + >>> model = keras.models.Sequential() + >>> model.add(keras.layers.Dense(y.shape[1], input_dim=X.shape[1], + ... activation='softmax')) + >>> model.compile(optimizer='sgd', loss='categorical_crossentropy', + ... metrics=['accuracy']) + >>> from imblearn.keras import balanced_batch_generator + >>> from imblearn.under_sampling import NearMiss + >>> training_generator, steps_per_epoch = balanced_batch_generator( + ... X, y, sampler=NearMiss(), batch_size=10, random_state=42) + >>> callback_history = model.fit_generator(generator=training_generator, + ... steps_per_epoch=steps_per_epoch, + ... epochs=10, verbose=0) + + """ + + return tf_bbg(X=X, y=y, sample_weight=sample_weight, + sampler=sampler, batch_size=batch_size, + keep_sparse=keep_sparse, random_state=random_state) diff --git a/imblearn/keras/tests/__init__.py b/imblearn/keras/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/imblearn/keras/tests/test_generator.py b/imblearn/keras/tests/test_generator.py new file mode 100644 index 000000000..cbab74864 --- /dev/null +++ b/imblearn/keras/tests/test_generator.py @@ -0,0 +1,101 @@ +import pytest + +import numpy as np +from scipy import sparse + +from sklearn.datasets import load_iris + +keras = pytest.importorskip('keras') +from keras.models import Sequential +from keras.layers import Dense +from keras.utils import to_categorical + +from imblearn.datasets import make_imbalance +from imblearn.under_sampling import ClusterCentroids +from imblearn.under_sampling import NearMiss + +from imblearn.keras import BalancedBatchGenerator +from imblearn.keras import balanced_batch_generator + +iris = load_iris() +X, y = make_imbalance(iris.data, iris.target, {0: 30, 1: 50, 2: 40}) +y = to_categorical(y, 3) + + +def _build_keras_model(n_classes, n_features): + model = Sequential() + model.add(Dense(n_classes, input_dim=n_features, activation='softmax')) + model.compile(optimizer='sgd', loss='categorical_crossentropy', + metrics=['accuracy']) + return model + + +def test_balanced_batch_generator_class_no_return_indices(): + with pytest.raises(ValueError, match='needs to return the indices'): + BalancedBatchGenerator(X, y, sampler=ClusterCentroids(), batch_size=10) + + +@pytest.mark.parametrize( + "sampler, sample_weight", + [(None, None), + (NearMiss(), None), + (None, np.random.uniform(size=(y.shape[0])))] +) +def test_balanced_batch_generator_class(sampler, sample_weight): + model = _build_keras_model(y.shape[1], X.shape[1]) + training_generator = BalancedBatchGenerator(X, y, + sample_weight=sample_weight, + sampler=sampler, + batch_size=10, + random_state=42) + model.fit_generator(generator=training_generator, + epochs=10) + + +@pytest.mark.parametrize("keep_sparse", [True, False]) +def test_balanced_batch_generator_class_sparse(keep_sparse): + training_generator = BalancedBatchGenerator(sparse.csr_matrix(X), y, + batch_size=10, + keep_sparse=keep_sparse, + random_state=42) + for idx in range(len(training_generator)): + X_batch, y_batch = training_generator.__getitem__(idx) + if keep_sparse: + assert sparse.issparse(X_batch) + else: + assert not 
sparse.issparse(X_batch) + + +def test_balanced_batch_generator_function_no_return_indices(): + with pytest.raises(ValueError, match='needs to return the indices'): + balanced_batch_generator( + X, y, sampler=ClusterCentroids(), batch_size=10, random_state=42) + + +@pytest.mark.parametrize( + "sampler, sample_weight", + [(None, None), + (NearMiss(), None), + (None, np.random.uniform(size=(y.shape[0])))] +) +def test_balanced_batch_generator_function(sampler, sample_weight): + model = _build_keras_model(y.shape[1], X.shape[1]) + training_generator, steps_per_epoch = balanced_batch_generator( + X, y, sample_weight=sample_weight, sampler=sampler, batch_size=10, + random_state=42) + model.fit_generator(generator=training_generator, + steps_per_epoch=steps_per_epoch, + epochs=10) + + +@pytest.mark.parametrize("keep_sparse", [True, False]) +def test_balanced_batch_generator_function_sparse(keep_sparse): + training_generator, steps_per_epoch = balanced_batch_generator( + sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10, + random_state=42) + for idx in range(steps_per_epoch): + X_batch, y_batch = next(training_generator) + if keep_sparse: + assert sparse.issparse(X_batch) + else: + assert not sparse.issparse(X_batch) diff --git a/imblearn/setup.py b/imblearn/setup.py new file mode 100644 index 000000000..3ae6851a8 --- /dev/null +++ b/imblearn/setup.py @@ -0,0 +1,41 @@ +PACKAGE_NAME = 'imblearn' + + +def configuration(parent_package='', top_path=None): + from numpy.distutils.misc_util import Configuration + + config = Configuration(PACKAGE_NAME, parent_package, top_path) + + config.add_subpackage('__check_build') + + # pure python packages + config.add_subpackage('combine') + config.add_subpackage('combine/tests') + config.add_subpackage('datasets') + config.add_subpackage('datasets/tests') + config.add_subpackage('ensemble') + config.add_subpackage('ensemble/tests') + config.add_subpackage('keras') + config.add_subpackage('keras/tests') + config.add_subpackage('metrics') + config.add_subpackage('metrics/tests') + config.add_subpackage('tensorflow') + config.add_subpackage('tensorflow/tests') + config.add_subpackage('tests') + config.add_subpackage('under_sampling') + config.add_subpackage('under_sampling/_prototype_generation') + config.add_subpackage('under_sampling/_prototype_generation/tests') + config.add_subpackage('under_sampling/_prototype_selection') + config.add_subpackage('under_sampling/_prototype_selection/tests') + config.add_subpackage('utils') + config.add_subpackage('utils/tests') + + # packages that have their own setup.py -> cython files + config.add_subpackage('tree') + + return config + + +if __name__ == '__main__': + from numpy.distutils.core import setup + setup(**configuration(top_path='').todict()) diff --git a/imblearn/tensorflow/__init__.py b/imblearn/tensorflow/__init__.py new file mode 100644 index 000000000..3224a7db1 --- /dev/null +++ b/imblearn/tensorflow/__init__.py @@ -0,0 +1,6 @@ +"""The :mod:`imblearn.tensorflow` provides utilities to deal with imbalanced +dataset in tensorflow.""" + +from ._generator import balanced_batch_generator + +__all__ = ['balanced_batch_generator'] diff --git a/imblearn/tensorflow/_generator.py b/imblearn/tensorflow/_generator.py new file mode 100644 index 000000000..0f124e88c --- /dev/null +++ b/imblearn/tensorflow/_generator.py @@ -0,0 +1,151 @@ +"""Implement generators for ``tensorflow`` which will balance the data.""" + +from __future__ import division + +from scipy.sparse import issparse + +from sklearn.base import 
clone +from sklearn.utils import safe_indexing +from sklearn.utils import check_random_state +from sklearn.utils.testing import set_random_state + +from ..under_sampling import RandomUnderSampler +from ..utils import Substitution +from ..utils._docstring import _random_state_docstring + + +@Substitution(random_state=_random_state_docstring) +def balanced_batch_generator(X, y, sample_weight=None, sampler=None, + batch_size=32, keep_sparse=False, + random_state=None): + """Create a balanced batch generator to train keras model. + + Returns a generator --- as well as the number of step per epoch --- which + is given to ``fit_generator``. The sampler defines the sampling strategy + used to balance the dataset ahead of creating the batch. The sampler should + have an attribute ``return_indices``. + + Parameters + ---------- + X : ndarray, shape (n_samples, n_features) + Original imbalanced dataset. + + y : ndarray, shape (n_samples,) or (n_samples, n_classes) + Associated targets. + + sample_weight : ndarray, shape (n_samples,) + Sample weight. + + sampler : object or None, optional (default=RandomUnderSampler) + A sampler instance which has an attribute ``return_indices``. + By default, the sampler used is a + :class:`imblearn.under_sampling.RandomUnderSampler`. + + batch_size : int, optional (default=32) + Number of samples per gradient update. + + keep_sparse : bool, optional (default=False) + Either or not to conserve or not the sparsity of the input ``X``. By + default, the returned batches will be dense. + + {random_state} + + Returns + ------- + generator : generator of tuple + Generate batch of data. The tuple generated are either (X_batch, + y_batch) or (X_batch, y_batch, sampler_weight_batch). + + steps_per_epoch : int + The number of samples per epoch. + + Examples + -------- + >>> import numpy as np + >>> from sklearn.datasets import load_iris + >>> X, y = load_iris(return_X_y=True) + >>> class_dict = dict() + >>> class_dict[0] = 30; class_dict[1] = 50; class_dict[2] = 40 + >>> from imblearn.datasets import make_imbalance + >>> X, y = make_imbalance(X, y, class_dict) + >>> X = X.astype(np.float32) + >>> batch_size, learning_rate, epochs = 10, 0.01, 10 + >>> training_generator, steps_per_epoch = balanced_batch_generator( + ... X, y, sample_weight=None, sampler=None, + ... batch_size=batch_size, random_state=42) + >>> input_size, output_size = X.shape[1], 3 + >>> import tensorflow as tf + >>> def init_weights(shape): + ... return tf.Variable(tf.random_normal(shape, stddev=0.01)) + >>> def accuracy(y_true, y_pred): + ... return np.mean(np.argmax(y_pred, axis=1) == y_true) + >>> # input and output + >>> data = tf.placeholder("float32", shape=[None, input_size]) + >>> targets = tf.placeholder("int32", shape=[None]) + >>> # build the model and weights + >>> W = init_weights([input_size, output_size]) + >>> b = init_weights([output_size]) + >>> out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) + >>> # build the loss, predict, and train operator + >>> cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + ... logits=out_act, labels=targets) + >>> loss = tf.reduce_sum(cross_entropy) + >>> optimizer = tf.train.GradientDescentOptimizer(learning_rate) + >>> train_op = optimizer.minimize(loss) + >>> predict = tf.nn.softmax(out_act) + >>> # Initialization of all variables in the graph + >>> init = tf.global_variables_initializer() + >>> with tf.Session() as sess: + ... print('Starting training') + ... sess.run(init) + ... for e in range(epochs): + ... 
for i in range(steps_per_epoch): + ... X_batch, y_batch = next(training_generator) + ... feed_dict = dict() + ... feed_dict[data] = X_batch; feed_dict[targets] = y_batch + ... sess.run([train_op, loss], feed_dict=feed_dict) + ... # For each epoch, run accuracy on train and test + ... feed_dict = dict() + ... feed_dict[data] = X + ... predicts_train = sess.run(predict, feed_dict=feed_dict) + ... print("epoch: {{}} train accuracy: {{:.3f}}" + ... .format(e, accuracy(y, predicts_train))) + ... # doctest: +ELLIPSIS + Starting training + [... + + """ + + random_state = check_random_state(random_state) + if sampler is None: + sampler_ = RandomUnderSampler(return_indices=True, + random_state=random_state) + else: + if not hasattr(sampler, 'return_indices'): + raise ValueError("'sampler' needs to return the indices of " + "the samples selected. Provide a sampler " + "which has an attribute 'return_indices'.") + sampler_ = clone(sampler) + sampler_.set_params(return_indices=True) + set_random_state(sampler_, random_state) + + _, _, indices = sampler_.fit_resample(X, y) + # shuffle the indices since the sampler are packing them by class + random_state.shuffle(indices) + + def generator(X, y, sample_weight, indices, batch_size): + while True: + for index in range(0, len(indices), batch_size): + X_res = safe_indexing(X, indices[index:index + batch_size]) + y_res = safe_indexing(y, indices[index:index + batch_size]) + if issparse(X_res) and not keep_sparse: + X_res = X_res.toarray() + if sample_weight is None: + yield X_res, y_res + else: + sw_res = safe_indexing(sample_weight, + indices[index:index + batch_size]) + yield X_res, y_res, sw_res + + return (generator(X, y, sample_weight, indices, batch_size), + int(indices.size // batch_size)) diff --git a/imblearn/tensorflow/tests/test_generator.py b/imblearn/tensorflow/tests/test_generator.py new file mode 100644 index 000000000..78eda3b1d --- /dev/null +++ b/imblearn/tensorflow/tests/test_generator.py @@ -0,0 +1,89 @@ +from __future__ import division + +import pytest +import numpy as np +from scipy import sparse + +from sklearn.datasets import load_iris + +from imblearn.datasets import make_imbalance +from imblearn.under_sampling import NearMiss + +from imblearn.tensorflow import balanced_batch_generator + +tf = pytest.importorskip('tensorflow') + + +@pytest.mark.parametrize("sampler", [None, NearMiss()]) +def test_balanced_batch_generator(sampler): + X, y = load_iris(return_X_y=True) + X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40}) + X = X.astype(np.float32) + + batch_size = 10 + training_generator, steps_per_epoch = balanced_batch_generator( + X, y, sample_weight=None, sampler=sampler, + batch_size=batch_size, random_state=42) + + learning_rate = 0.01 + epochs = 10 + input_size = X.shape[1] + output_size = 3 + + # helper functions + def init_weights(shape): + return tf.Variable(tf.random_normal(shape, stddev=0.01)) + + def accuracy(y_true, y_pred): + return np.mean(np.argmax(y_pred, axis=1) == y_true) + + # input and output + data = tf.placeholder("float32", shape=[None, input_size]) + targets = tf.placeholder("int32", shape=[None]) + + # build the model and weights + W = init_weights([input_size, output_size]) + b = init_weights([output_size]) + out_act = tf.nn.sigmoid(tf.matmul(data, W) + b) + + # build the loss, predict, and train operator + cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=out_act, labels=targets) + loss = tf.reduce_sum(cross_entropy) + optimizer = tf.train.GradientDescentOptimizer(learning_rate) + 
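+    # running `train_op` applies one SGD update to W and b, minimizing the
+    # summed cross-entropy over the mini-batch fed through `feed_dict`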
train_op = optimizer.minimize(loss) + predict = tf.nn.softmax(out_act) + + # Initialization of all variables in the graph + init = tf.global_variables_initializer() + + with tf.Session() as sess: + sess.run(init) + + for e in range(epochs): + for i in range(steps_per_epoch): + X_batch, y_batch = next(training_generator) + sess.run([train_op, loss], + feed_dict={data: X_batch, targets: y_batch}) + + # For each epoch, run accuracy on train and test + predicts_train = sess.run(predict, feed_dict={data: X}) + print("epoch: {} train accuracy: {:.3f}" + .format(e, accuracy(y, predicts_train))) + + +@pytest.mark.parametrize("keep_sparse", [True, False]) +def test_balanced_batch_generator_function_sparse(keep_sparse): + X, y = load_iris(return_X_y=True) + X, y = make_imbalance(X, y, {0: 30, 1: 50, 2: 40}) + X = X.astype(np.float32) + + training_generator, steps_per_epoch = balanced_batch_generator( + sparse.csr_matrix(X), y, keep_sparse=keep_sparse, batch_size=10, + random_state=42) + for idx in range(steps_per_epoch): + X_batch, y_batch = next(training_generator) + if keep_sparse: + assert sparse.issparse(X_batch) + else: + assert not sparse.issparse(X_batch) diff --git a/imblearn/tree/criterion.pxd b/imblearn/tree/criterion.pxd new file mode 100644 index 000000000..e69de29bb diff --git a/imblearn/tree_split/hellinger_distance_criterion.pyx b/imblearn/tree/criterion.pyx similarity index 63% rename from imblearn/tree_split/hellinger_distance_criterion.pyx rename to imblearn/tree/criterion.pyx index 340aced86..256a73a24 100644 --- a/imblearn/tree_split/hellinger_distance_criterion.pyx +++ b/imblearn/tree/criterion.pyx @@ -2,42 +2,49 @@ # # License: BSD 3 clause +from libc.math cimport sqrt, pow +from libc.math cimport abs + +import numpy as np + from sklearn.tree._criterion cimport ClassificationCriterion from sklearn.tree._criterion cimport SIZE_t -import numpy as np cdef double INFINITY = np.inf -from libc.math cimport sqrt, pow -from libc.math cimport abs +cdef class HellingerDistanceCriterion(ClassificationCriterion): + """Hellinger distance criterion. 
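+
+    The impurity of a child node is measured with the squared Hellinger
+    distance between the per-class left/right split proportions. Because
+    the measure only depends on within-class rates, it is insensitive to
+    the class skew, which motivates its use on imbalanced problems (cf.
+    Cieslak and Chawla, "Learning Decision Trees for Unbalanced Data",
+    ECML PKDD 2008). Only binary targets are supported by the code below.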
+ + + """ -cdef class HellingerDistanceCriterion(ClassificationCriterion): - cdef double proxy_impurity_improvement(self) nogil: - cdef double impurity_left - cdef double impurity_right - + cdef: + double impurity_left + double impurity_right + self.children_impurity(&impurity_left, &impurity_right) - + return impurity_right + impurity_left - + cdef double impurity_improvement(self, double impurity) nogil: - cdef double impurity_left - cdef double impurity_right + cdef: + double impurity_left + double impurity_right self.children_impurity(&impurity_left, &impurity_right) return impurity_right + impurity_left - + cdef double node_impurity(self) nogil: - cdef SIZE_t* n_classes = self.n_classes - cdef double* sum_total = self.sum_total - cdef double hellinger = 0.0 - cdef double sq_count - cdef double count_k - cdef SIZE_t k - cdef SIZE_t c + cdef: + SIZE_t* n_classes = self.n_classes + double* sum_total = self.sum_total + double hellinger = 0.0 + double sq_count + double count_k + SIZE_t k, c for k in range(self.n_outputs): for c in range(n_classes[k]): @@ -47,38 +54,37 @@ cdef class HellingerDistanceCriterion(ClassificationCriterion): cdef void children_impurity(self, double* impurity_left, double* impurity_right) nogil: - cdef SIZE_t* n_classes = self.n_classes - cdef double* sum_left = self.sum_left - cdef double* sum_right = self.sum_right - cdef double hellinger_left = 0.0 - cdef double hellinger_right = 0.0 - cdef double count_k1 = 0.0 - cdef double count_k2 = 0.0 - - cdef SIZE_t k - cdef SIZE_t c - - # stop splitting in case reached pure node with 0 samples of second class + cdef: + SIZE_t* n_classes = self.n_classes + double* sum_left = self.sum_left + double* sum_right = self.sum_right + double hellinger_left = 0.0 + double hellinger_right = 0.0 + double count_k1 = 0.0 + double count_k2 = 0.0 + SIZE_t k, c + + # stop splitting in case reached pure node with 0 samples of second + # class if sum_left[1] + sum_right[1] == 0: impurity_left[0] = -INFINITY impurity_right[0] = -INFINITY return - + for k in range(self.n_outputs): if(sum_left[0] + sum_right[0] > 0): count_k1 = sqrt(sum_left[0] / (sum_left[0] + sum_right[0])) if(sum_left[1] + sum_right[1] > 0): count_k2 = sqrt(sum_left[1] / (sum_left[1] + sum_right[1])) - hellinger_left += pow((count_k1 - count_k2),2) - - if(sum_left[0] + sum_right[0] > 0): + hellinger_left += pow((count_k1 - count_k2), 2) + + if(sum_left[0] + sum_right[0] > 0): count_k1 = sqrt(sum_right[0] / (sum_left[0] + sum_right[0])) if(sum_left[1] + sum_right[1] > 0): count_k2 = sqrt(sum_right[1] / (sum_left[1] + sum_right[1])) - hellinger_right += pow((count_k1 - count_k2),2) - + hellinger_right += pow((count_k1 - count_k2), 2) + impurity_left[0] = hellinger_left / self.n_outputs impurity_right[0] = hellinger_right / self.n_outputs - \ No newline at end of file diff --git a/imblearn/tree/setup.py b/imblearn/tree/setup.py new file mode 100644 index 000000000..061cbc018 --- /dev/null +++ b/imblearn/tree/setup.py @@ -0,0 +1,21 @@ +import numpy + + +def configuration(parent_package='', top_path=None): + from numpy.distutils.misc_util import Configuration + config = Configuration('tree', parent_package, top_path) + libraries = [] + config.add_extension('criterion', + sources=['criterion.c'], + include_dirs=[numpy.get_include()], + libraries=libraries) + # extra_compile_args=["-O3", "-fopenmp"], + # extra_link_args=["-fopenmp"]) + # config.add_subpackage("tests") + + return config + + +if __name__ == "__main__": + from numpy.distutils.core import setup + 
setup(**configuration().todict()) \ No newline at end of file diff --git a/imblearn/tree_split/__init__.py b/imblearn/tree_split/__init__.py deleted file mode 100644 index 672474b78..000000000 --- a/imblearn/tree_split/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .hellinger_distance_criterion import HellingerDistanceCriterion diff --git a/imblearn/tree_split/setup.py b/imblearn/tree_split/setup.py deleted file mode 100644 index 9b836d087..000000000 --- a/imblearn/tree_split/setup.py +++ /dev/null @@ -1,20 +0,0 @@ -import os - -PACKAGE_NAME = 'imblearn/tree_split' - - -def configuration(parent_package='', top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration(PACKAGE_NAME, parent_package, top_path) - - config.add_extension('hellinger_distance_criterion', - sources=['hellinger_distance_criterion.c']) - config.li - - return config - - -if __name__ == '__main__': - from numpy.distutils.core import setup - setup(**configuration(top_path='').todict()) diff --git a/imblearn/under_sampling/__init__.py b/imblearn/under_sampling/__init__.py index 28df84eec..4324833e5 100644 --- a/imblearn/under_sampling/__init__.py +++ b/imblearn/under_sampling/__init__.py @@ -3,18 +3,18 @@ a dataset. """ -from .prototype_generation import ClusterCentroids +from ._prototype_generation import ClusterCentroids -from .prototype_selection import RandomUnderSampler -from .prototype_selection import TomekLinks -from .prototype_selection import NearMiss -from .prototype_selection import CondensedNearestNeighbour -from .prototype_selection import OneSidedSelection -from .prototype_selection import NeighbourhoodCleaningRule -from .prototype_selection import EditedNearestNeighbours -from .prototype_selection import RepeatedEditedNearestNeighbours -from .prototype_selection import AllKNN -from .prototype_selection import InstanceHardnessThreshold +from ._prototype_selection import RandomUnderSampler +from ._prototype_selection import TomekLinks +from ._prototype_selection import NearMiss +from ._prototype_selection import CondensedNearestNeighbour +from ._prototype_selection import OneSidedSelection +from ._prototype_selection import NeighbourhoodCleaningRule +from ._prototype_selection import EditedNearestNeighbours +from ._prototype_selection import RepeatedEditedNearestNeighbours +from ._prototype_selection import AllKNN +from ._prototype_selection import InstanceHardnessThreshold __all__ = [ 'ClusterCentroids', 'RandomUnderSampler', 'InstanceHardnessThreshold', diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py new file mode 100644 index 000000000..27120364c --- /dev/null +++ b/imblearn/utils/_validation.py @@ -0,0 +1,525 @@ +"""Utilities for input validation""" + +# Authors: Guillaume Lemaitre +# License: MIT +from __future__ import division + +import warnings +from collections import Counter +from collections import OrderedDict +from numbers import Integral, Real + +import numpy as np + +from sklearn.base import clone +from sklearn.neighbors.base import KNeighborsMixin +from sklearn.neighbors import NearestNeighbors +from sklearn.externals import six +from sklearn.utils.multiclass import type_of_target +from sklearn.utils.deprecation import deprecated + +from ..exceptions import raise_isinstance_error + +SAMPLING_KIND = ('over-sampling', 'under-sampling', 'clean-sampling', + 'ensemble', 'bypass') +TARGET_KIND = ('binary', 'multiclass', 'multilabel-indicator') + + +def check_neighbors_object(nn_name, nn_object, additional_neighbor=0): + """Check the 
objects is consistent to be a NN. + + Several methods in imblearn relies on NN. Until version 0.4, these + objects can be passed at initialisation as an integer or a + KNeighborsMixin. After only KNeighborsMixin will be accepted. This + utility allows for type checking and raise if the type is wrong. + + Parameters + ---------- + nn_name : str, + The name associated to the object to raise an error if needed. + + nn_object : int or KNeighborsMixin, + The object to be checked + + additional_neighbor : int, optional (default=0) + Sometimes, some algorithm need an additional neighbors. + + Returns + ------- + nn_object : KNeighborsMixin + The k-NN object. + """ + if isinstance(nn_object, Integral): + return NearestNeighbors(n_neighbors=nn_object + additional_neighbor) + elif isinstance(nn_object, KNeighborsMixin): + return clone(nn_object) + else: + raise_isinstance_error(nn_name, [int, KNeighborsMixin], nn_object) + + +def check_target_type(y, indicate_one_vs_all=False): + """Check the target types to be conform to the current samplers. + + The current samplers should be compatible with ``'binary'``, + ``'multilabel-indicator'`` and ``'multiclass'`` targets only. + + Parameters + ---------- + y : ndarray, + The array containing the target. + + indicate_one_vs_all : bool, optional + Either to indicate if the targets are encoded in a one-vs-all fashion. + + Returns + ------- + y : ndarray, + The returned target. + + is_one_vs_all : bool, optional + Indicate if the target was originally encoded in a one-vs-all fashion. + Only returned if ``indicate_multilabel=True``. + + """ + type_y = type_of_target(y) + if type_y not in TARGET_KIND: + # FIXME: perfectly we should raise an error but the sklearn API does + # not allow for it + warnings.warn("'y' should be of types {} only. 
Got {} instead.".format( + TARGET_KIND, type_of_target(y))) + + if indicate_one_vs_all: + return (y.argmax(axis=1) if type_y == 'multilabel-indicator' else y, + type_y == 'multilabel-indicator') + else: + return y.argmax(axis=1) if type_y == 'multilabel-indicator' else y + + +def _sampling_strategy_all(y, sampling_type): + """Returns sampling target by targeting all classes.""" + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in target_stats.items() + } + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + n_sample_minority = min(target_stats.values()) + sampling_strategy = { + key: n_sample_minority + for key in target_stats.keys() + } + else: + raise NotImplementedError + + return sampling_strategy + + +def _sampling_strategy_majority(y, sampling_type): + """Returns sampling target by targeting the majority class only.""" + if sampling_type == 'over-sampling': + raise ValueError("'sampling_strategy'='majority' cannot be used with" + " over-sampler.") + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + target_stats = Counter(y) + class_majority = max(target_stats, key=target_stats.get) + n_sample_minority = min(target_stats.values()) + sampling_strategy = { + key: n_sample_minority + for key in target_stats.keys() if key == class_majority + } + else: + raise NotImplementedError + + return sampling_strategy + + +def _sampling_strategy_not_majority(y, sampling_type): + """Returns sampling target by targeting all classes but not the + majority.""" + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in target_stats.items() if key != class_majority + } + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + n_sample_minority = min(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_minority + for key in target_stats.keys() if key != class_majority + } + else: + raise NotImplementedError + + return sampling_strategy + + +def _sampling_strategy_not_minority(y, sampling_type): + """Returns sampling target by targeting all classes but not the + minority.""" + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in target_stats.items() if key != class_minority + } + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + n_sample_minority = min(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_minority + for key in target_stats.keys() if key != class_minority + } + else: + raise NotImplementedError + + return sampling_strategy + + +def _sampling_strategy_minority(y, sampling_type): + """Returns sampling target by targeting the minority class only.""" + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + sampling_strategy = { + key: n_sample_majority - value + for (key, value) in 
target_stats.items() if key == class_minority + } + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + raise ValueError("'sampling_strategy'='minority' cannot be used with" + " under-sampler and clean-sampler.") + else: + raise NotImplementedError + + return sampling_strategy + + +def _sampling_strategy_auto(y, sampling_type): + """Returns sampling target auto for over-sampling and not-minority for + under-sampling.""" + if sampling_type == 'over-sampling': + return _sampling_strategy_not_majority(y, sampling_type) + elif (sampling_type == 'under-sampling' or + sampling_type == 'clean-sampling'): + return _sampling_strategy_not_minority(y, sampling_type) + + +def _sampling_strategy_dict(sampling_strategy, y, sampling_type): + """Returns sampling target by converting the dictionary depending of the + sampling.""" + target_stats = Counter(y) + # check that all keys in sampling_strategy are also in y + set_diff_sampling_strategy_target = ( + set(sampling_strategy.keys()) - set(target_stats.keys())) + if len(set_diff_sampling_strategy_target) > 0: + raise ValueError("The {} target class is/are not present in the" + " data.".format(set_diff_sampling_strategy_target)) + # check that there is no negative number + if any(n_samples < 0 for n_samples in sampling_strategy.values()): + raise ValueError("The number of samples in a class cannot be negative." + "'sampling_strategy' contains some negative value: {}" + .format(sampling_strategy)) + sampling_strategy_ = {} + if sampling_type == 'over-sampling': + n_samples_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + for class_sample, n_samples in sampling_strategy.items(): + if n_samples < target_stats[class_sample]: + raise ValueError("With over-sampling methods, the number" + " of samples in a class should be greater" + " or equal to the original number of samples." + " Originally, there is {} samples and {}" + " samples are asked.".format( + target_stats[class_sample], n_samples)) + if n_samples > n_samples_majority: + warnings.warn("After over-sampling, the number of samples ({})" + " in class {} will be larger than the number of" + " samples in the majority class (class #{} ->" + " {})".format(n_samples, class_sample, + class_majority, + n_samples_majority)) + sampling_strategy_[class_sample] = ( + n_samples - target_stats[class_sample]) + elif sampling_type == 'under-sampling': + for class_sample, n_samples in sampling_strategy.items(): + if n_samples > target_stats[class_sample]: + raise ValueError("With under-sampling methods, the number of" + " samples in a class should be less or equal" + " to the original number of samples." + " Originally, there is {} samples and {}" + " samples are asked.".format( + target_stats[class_sample], n_samples)) + sampling_strategy_[class_sample] = n_samples + elif sampling_type == 'clean-sampling': + # FIXME: Turn into an error in 0.6 + warnings.warn("'sampling_strategy' as a dict for cleaning methods is " + "deprecated and will raise an error in version 0.6. 
" + "Please give a list of the classes to be targeted by the" + " sampling.", DeprecationWarning) + # clean-sampling can be more permissive since those samplers do not + # use samples + for class_sample, n_samples in sampling_strategy.items(): + sampling_strategy_[class_sample] = n_samples + else: + raise NotImplementedError + + return sampling_strategy_ + + +def _sampling_strategy_list(sampling_strategy, y, sampling_type): + """With cleaning methods, sampling_strategy can be a list to target the + class of interest.""" + if sampling_type != 'clean-sampling': + raise ValueError("'sampling_strategy' cannot be a list for samplers " + "which are not cleaning methods.") + + target_stats = Counter(y) + # check that all keys in sampling_strategy are also in y + set_diff_sampling_strategy_target = ( + set(sampling_strategy) - set(target_stats.keys())) + if len(set_diff_sampling_strategy_target) > 0: + raise ValueError("The {} target class is/are not present in the" + " data.".format(set_diff_sampling_strategy_target)) + + return { + class_sample: min(target_stats.values()) + for class_sample in sampling_strategy + } + + +def _sampling_strategy_float(sampling_strategy, y, sampling_type): + """Take a proportion of the majority (over-sampling) or minority + (under-sampling) class in binary classification.""" + type_y = type_of_target(y) + if type_y != 'binary': + raise ValueError( + '"sampling_strategy" can be a float only when the type ' + 'of target is binary. For multi-class, use a dict.') + target_stats = Counter(y) + if sampling_type == 'over-sampling': + n_sample_majority = max(target_stats.values()) + class_majority = max(target_stats, key=target_stats.get) + sampling_strategy_ = { + key: int(n_sample_majority * sampling_strategy - value) + for (key, value) in target_stats.items() if key != class_majority + } + elif (sampling_type == 'under-sampling'): + n_sample_minority = min(target_stats.values()) + class_minority = min(target_stats, key=target_stats.get) + sampling_strategy_ = { + key: int(n_sample_minority / sampling_strategy) + for (key, value) in target_stats.items() if key != class_minority + } + else: + raise ValueError("'clean-sampling' methods do let the user " + "specify the sampling ratio.") + return sampling_strategy_ + + +def check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs): + """Sampling target validation for samplers. + + Checks that ``sampling_strategy`` is of consistent type and return a + dictionary containing each targeted class with its corresponding + number of sample. It is used in :class:`imblearn.base.BaseSampler`. + + Parameters + ---------- + sampling_strategy : float, str, dict, list or callable, + Sampling information to sample the data set. + + - When ``float``: + + For **under-sampling methods**, it corresponds to the ratio + :math:`\\alpha_{us}` defined by :math:`N_{rM} = \\alpha_{us} + \\times N_{m}` where :math:`N_{rM}` and :math:`N_{m}` are the + number of samples in the majority class after resampling and the + number of samples in the minority class, respectively; + + For **over-sampling methods**, it correspond to the ratio + :math:`\\alpha_{os}` defined by :math:`N_{rm} = \\alpha_{os} + \\times N_{m}` where :math:`N_{rm}` and :math:`N_{M}` are the + number of samples in the minority class after resampling and the + number of samples in the majority class, respectively. + + .. warning:: + ``float`` is only available for **binary** classification. An + error is raised for multi-class classification and with cleaning + samplers. 
+
+        - When ``str``, specify the class targeted by the resampling. For
+          **under- and over-sampling methods**, the number of samples in the
+          different classes will be equalized. For **cleaning methods**, the
+          number of samples will not be equal. Possible choices are:
+
+            ``'minority'``: resample only the minority class;
+
+            ``'majority'``: resample only the majority class;
+
+            ``'not minority'``: resample all classes but the minority class;
+
+            ``'not majority'``: resample all classes but the majority class;
+
+            ``'all'``: resample all classes;
+
+            ``'auto'``: for under-sampling methods, equivalent to ``'not
+            minority'`` and for over-sampling methods, equivalent to ``'not
+            majority'``.
+
+        - When ``dict``, the keys correspond to the targeted classes. The
+          values correspond to the desired number of samples for each targeted
+          class.
+
+        .. warning::
+           ``dict`` is available for both **under- and over-sampling
+           methods**. An error is raised with **cleaning methods**. Use a
+           ``list`` instead.
+
+        - When ``list``, the list contains the targeted classes. It is used
+          only for **cleaning methods**.
+
+        .. warning::
+           ``list`` is available for **cleaning methods**. An error is raised
+           with **under- and over-sampling methods**.
+
+        - When callable, a function taking ``y`` and returning a ``dict``. The
+          keys correspond to the targeted classes. The values correspond to
+          the desired number of samples for each class.
+
+    y : ndarray, shape (n_samples,)
+        The target array.
+
+    sampling_type : str,
+        The type of sampling. Can be either ``'over-sampling'``,
+        ``'under-sampling'``, or ``'clean-sampling'``.
+
+    kwargs : dict, optional
+        Dictionary of additional keyword arguments to pass to
+        ``sampling_strategy`` when this is a callable.
+
+    Returns
+    -------
+    sampling_strategy_converted : dict,
+        The converted and validated sampling target. Returns a dictionary with
+        the key being the class target and the value being the desired
+        number of samples.
+
+    """
+    if sampling_type not in SAMPLING_KIND:
+        raise ValueError("'sampling_type' should be one of {}. Got '{}'"
+                         " instead.".format(SAMPLING_KIND, sampling_type))
+
+    if np.unique(y).size <= 1:
+        raise ValueError("The target 'y' needs to have more than 1 class."
+                         " Got {} class instead.".format(np.unique(y).size))
+
+    if sampling_type in ('ensemble', 'bypass'):
+        return sampling_strategy
+
+    if isinstance(sampling_strategy, six.string_types):
+        if sampling_strategy not in SAMPLING_TARGET_KIND.keys():
+            raise ValueError("When 'sampling_strategy' is a string, it needs"
+                             " to be one of {}. Got '{}' instead.".format(
+                                 SAMPLING_TARGET_KIND, sampling_strategy))
+        return OrderedDict(sorted(
+            SAMPLING_TARGET_KIND[sampling_strategy](y, sampling_type).items()))
+    elif isinstance(sampling_strategy, dict):
+        return OrderedDict(sorted(
+            _sampling_strategy_dict(sampling_strategy, y, sampling_type)
+            .items()))
+    elif isinstance(sampling_strategy, list):
+        return OrderedDict(sorted(
+            _sampling_strategy_list(sampling_strategy, y, sampling_type)
+            .items()))
+    elif isinstance(sampling_strategy, Real):
+        if sampling_strategy <= 0 or sampling_strategy > 1:
+            raise ValueError(
+                "When 'sampling_strategy' is a float, it should be "
+                "in the range (0, 1]. Got {} instead."
+                .format(sampling_strategy))
+        return OrderedDict(sorted(
+            _sampling_strategy_float(sampling_strategy, y, sampling_type)
+            .items()))
+    elif callable(sampling_strategy):
+        sampling_strategy_ = sampling_strategy(y, **kwargs)
+        return OrderedDict(sorted(
+            _sampling_strategy_dict(sampling_strategy_, y, sampling_type)
+            .items()))
+
+
+SAMPLING_TARGET_KIND = {
+    'minority': _sampling_strategy_minority,
+    'majority': _sampling_strategy_majority,
+    'not minority': _sampling_strategy_not_minority,
+    'not majority': _sampling_strategy_not_majority,
+    'all': _sampling_strategy_all,
+    'auto': _sampling_strategy_auto
+}
+
+
+@deprecated("imblearn.utils.check_ratio was deprecated in favor of "
+            "imblearn.utils.check_sampling_strategy in 0.4. It will be "
+            "removed in 0.6.")
+def check_ratio(ratio, y, sampling_type, **kwargs):
+    """Sampling target validation for samplers.
+
+    Checks ``ratio`` for consistent type and returns a dictionary
+    containing each targeted class with its corresponding number of
+    samples.
+
+    .. deprecated:: 0.4
+       This function is deprecated in favor of
+       :func:`imblearn.utils.check_sampling_strategy`. It will be removed in
+       0.6.
+
+    Parameters
+    ----------
+    ratio : str, dict or callable,
+        Ratio to use for resampling the data set.
+
+        - If ``str``, has to be one of: (i) ``'minority'``: resample the
+          minority class; (ii) ``'majority'``: resample the majority class,
+          (iii) ``'not minority'``: resample all classes apart from the
+          minority class, (iv) ``'all'``: resample all classes, and (v)
+          ``'auto'``: corresponds to ``'all'`` for over-sampling methods and
+          ``'not minority'`` for under-sampling methods. The classes targeted
+          will be over-sampled or under-sampled to achieve an equal number of
+          samples with the majority or minority class.
+        - If ``dict``, the keys correspond to the targeted classes. The values
+          correspond to the desired number of samples.
+        - If callable, a function taking ``y`` and returning a ``dict``. The
+          keys correspond to the targeted classes. The values correspond to
+          the desired number of samples.
+
+    y : ndarray, shape (n_samples,)
+        The target array.
+
+    sampling_type : str,
+        The type of sampling. Can be either ``'over-sampling'`` or
+        ``'under-sampling'``.
+
+    kwargs : dict, optional
+        Dictionary of additional keyword arguments to pass to ``ratio``.
+
+    Returns
+    -------
+    ratio_converted : dict,
+        The converted and validated ratio. Returns a dictionary with
+        the key being the class target and the value being the desired
+        number of samples.
+
+    """
+    return check_sampling_strategy(ratio, y, sampling_type, **kwargs)
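The ``dict`` and ``float`` conversions validated above follow the same convention; a short sketch, again via the public entry point and assuming a binary toy target with class counts 8/2::

    >>> import numpy as np
    >>> from imblearn.utils import check_sampling_strategy
    >>> y = np.array([0] * 8 + [1] * 2)
    >>> check_sampling_strategy({1: 6}, y, 'over-sampling')
    OrderedDict([(1, 4)])
    >>> check_sampling_strategy(0.5, y, 'over-sampling')
    OrderedDict([(1, 2)])
    >>> check_sampling_strategy(0.5, y, 'under-sampling')
    OrderedDict([(0, 4)])

With a ``dict`` the values are absolute targets, so asking for 6 samples of class 1 means adding 4; with a ``float`` of 0.5 the minority class is over-sampled up to :math:`0.5 \times 8 = 4` samples, or the majority class under-sampled down to :math:`2 / 0.5 = 4` samples.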
diff --git a/requirements.optional.txt b/requirements.optional.txt
new file mode 100644
index 000000000..826277d5e
--- /dev/null
+++ b/requirements.optional.txt
@@ -0,0 +1,2 @@
+keras
+tensorflow
diff --git a/setup.cfg b/setup.cfg
index 56cfb932a..6d45a8020 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -33,6 +33,5 @@ doctest-extension = rst
 doctest-fixtures = _fixture
 
 [tool:pytest]
-addopts =
-    --doctest-modules
+addopts = --doctest-modules
diff --git a/setup.py b/setup.py
index 20876573b..6c044cf67 100755
--- a/setup.py
+++ b/setup.py
@@ -1,16 +1,15 @@
 #! /usr/bin/env python
 """Toolbox for imbalanced dataset in machine learning."""
-import codecs
+import io
+import re
 import os
-import subprocess
 import sys
+import subprocess
+import codecs
 
 from setuptools import find_packages
 
-# get __version__ from _version.py
-ver_file = os.path.join('imblearn', '_version.py')
-with open(ver_file) as f:
-    exec(f.read())
+PACKAGE_NAME = 'imblearn'
 
 DISTNAME = 'imbalanced-learn'
 DESCRIPTION = 'Toolbox for imbalanced dataset in machine learning.'
@@ -21,9 +20,6 @@
 URL = 'https://github.com/scikit-learn-contrib/imbalanced-learn'
 LICENSE = 'MIT'
 DOWNLOAD_URL = 'https://github.com/scikit-learn-contrib/imbalanced-learn'
-VERSION = __version__
-TREE_SPLIT_PACKAGE = 'imblearn/tree_split'
-CHECK_BUILD_PACKAGE = 'imblearn/__check_build'
 INSTALL_REQUIRES = ['numpy', 'scipy', 'scikit-learn']
 CLASSIFIERS = ['Intended Audience :: Science/Research',
                'Intended Audience :: Developers',
@@ -40,65 +36,79 @@
                'Programming Language :: Python :: 3.5',
                'Programming Language :: Python :: 3.6']
 
-def configuration(parent_package='', top_path=None):
-    from numpy.distutils.misc_util import Configuration
-    config = Configuration(None, parent_package, top_path)
-    config.set_options(ignore_setup_xxx_py=True,
-                       assume_default_configuration=True,
-                       delegate_options_to_subpackages=True,
-                       quiet=True)
-    config.add_subpackage(CHECK_BUILD_PACKAGE)
-    config.add_subpackage(TREE_SPLIT_PACKAGE)
-    return config
 
+def version(package, encoding='utf-8'):
+    """Obtain the package version from a python file, e.g. pkg/_version.py.
+
+    See the 'Single-sourcing the package version' recipe in the Python
+    Packaging User Guide.
+    """
+    path = os.path.join(os.path.dirname(__file__), package, '_version.py')
+    with io.open(path, encoding=encoding) as fp:
+        version_info = fp.read()
+    version_match = re.search(r"""^__version__ = ['"]([^'"]*)['"]""",
+                              version_info, re.M)
+    if not version_match:
+        raise RuntimeError("Unable to find version string.")
+    return version_match.group(1)
+
 
 def generate_cython(package):
     """Cythonize all sources in the package"""
     cwd = os.path.abspath(os.path.dirname(__file__))
     print("Cythonizing sources")
     p = subprocess.call([sys.executable,
-                         os.path.join(cwd,
-                                      'build_tools/cython',
-                                      'cythonize.py'),
+                         os.path.join(cwd, 'tools', 'cythonize.py'),
                          package],
                         cwd=cwd)
     if p != 0:
         raise RuntimeError("Running cythonize failed!")
 
+def configuration(parent_package='', top_path=None):
+    from numpy.distutils.misc_util import Configuration
+    config = Configuration(None, parent_package, top_path)
+    config.set_options(ignore_setup_xxx_py=True,
+                       assume_default_configuration=True,
+                       delegate_options_to_subpackages=True,
+                       quiet=True)
+    config.add_subpackage(PACKAGE_NAME)
+    return config
 
 def setup_package():
     from numpy.distutils.core import setup
+
     old_path = os.getcwd()
     local_path = os.path.dirname(os.path.abspath(sys.argv[0]))
     src_path = local_path
 
     os.chdir(local_path)
     sys.path.insert(0, local_path)
 
+    # Run build
     old_path = os.getcwd()
    os.chdir(src_path)
    sys.path.insert(0, src_path)
 
    cwd = os.path.abspath(os.path.dirname(__file__))
    if not os.path.exists(os.path.join(cwd, 'PKG-INFO')):
-        generate_cython(CHECK_BUILD_PACKAGE)
-        generate_cython(TREE_SPLIT_PACKAGE)
+        # Generate Cython sources, unless building from source release
+        generate_cython(PACKAGE_NAME)
 
    try:
        setup(name=DISTNAME,
+              author=MAINTAINER,
+              author_email=MAINTAINER_EMAIL,
              maintainer=MAINTAINER,
              maintainer_email=MAINTAINER_EMAIL,
              description=DESCRIPTION,
              license=LICENSE,
              url=URL,
-              version=VERSION,
+              version=version(PACKAGE_NAME),
              download_url=DOWNLOAD_URL,
              long_description=LONG_DESCRIPTION,
              zip_safe=False,  # the package can run
out of an .egg file classifiers=CLASSIFIERS, - configuration=configuration, packages=find_packages(), - install_requires=INSTALL_REQUIRES) + install_requires=INSTALL_REQUIRES, + configuration=configuration) finally: del sys.path[0] os.chdir(old_path) diff --git a/build_tools/cython/cythonize.py b/tools/cythonize.py similarity index 100% rename from build_tools/cython/cythonize.py rename to tools/cythonize.py From d8eb231b7af5d654e19119a2f72893e99b5d1c34 Mon Sep 17 00:00:00 2001 From: edubov Date: Tue, 2 Oct 2018 16:11:55 +0300 Subject: [PATCH 13/50] fixed setup --- setup.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.py b/setup.py index 1d5cac601..e6d225bba 100755 --- a/setup.py +++ b/setup.py @@ -21,8 +21,6 @@ LICENSE = 'MIT' DOWNLOAD_URL = 'https://github.com/scikit-learn-contrib/imbalanced-learn' -VERSION = __version__ - CLASSIFIERS = ['Intended Audience :: Science/Research', 'Intended Audience :: Developers', 'License :: OSI Approved', From 118bf23a9809087674b3d14ee2f26cbd4c6254d1 Mon Sep 17 00:00:00 2001 From: edubov Date: Sat, 6 Oct 2018 21:30:04 +0300 Subject: [PATCH 14/50] adding __init__.py to tree --- imblearn/tree/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 imblearn/tree/__init__.py diff --git a/imblearn/tree/__init__.py b/imblearn/tree/__init__.py new file mode 100644 index 000000000..e69de29bb From 01506e60bf7ceaf35527bbc8a118895fe01eac7a Mon Sep 17 00:00:00 2001 From: EvgeniDubov <32032278+EvgeniDubov@users.noreply.github.com> Date: Mon, 8 Oct 2018 09:07:44 +0300 Subject: [PATCH 15/50] added Cython as dependency in appveyor added the dependency due to the following appveyor error ModuleNotFoundError: No module named 'Cython' FileNotFoundError: [WinError 2] The system cannot find the file specified --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index b7e6298b5..8aa910fe5 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -20,7 +20,7 @@ environment: - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.6" PYTHON_ARCH: "64" - OPTIONAL_DEP: "pandas keras tensorflow" + OPTIONAL_DEP: "pandas keras tensorflow Cython" - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.7" From 5953be8682de9c88c1815b4761f5f5ca0dcd044b Mon Sep 17 00:00:00 2001 From: edubov Date: Sat, 13 Oct 2018 17:53:27 +0300 Subject: [PATCH 16/50] renamed tree_split to tree in documentation and example --- MANIFEST.in | 2 +- doc/api.rst | 6 +++--- doc/{tree_split.rst => tree.rst} | 4 ++-- doc/user_guide.rst | 2 +- examples/{tree_split => tree}/README.txt | 2 +- .../train_model_with_hellinger_distance_criterion.py | 2 +- imblearn/tree/criterion.pxd | 0 7 files changed, 9 insertions(+), 9 deletions(-) rename doc/{tree_split.rst => tree.rst} (92%) rename examples/{tree_split => tree}/README.txt (95%) rename examples/{tree_split => tree}/train_model_with_hellinger_distance_criterion.py (90%) delete mode 100644 imblearn/tree/criterion.pxd diff --git a/MANIFEST.in b/MANIFEST.in index 52494e272..fd510005c 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,6 @@ recursive-include doc * recursive-include examples * -include imblearn/tree_split *.pyx +include imblearn/tree *.pyx include AUTHORS.rst include CONTRIBUTING.ms include LICENSE diff --git a/doc/api.rst b/doc/api.rst index 603f0295f..e11878bb6 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -197,10 +197,10 @@ Imbalance-learn provides some fast-prototyping tools. .. 
_metrics_ref: -:mod:`imblearn.tree_split`: Tree split criterion +:mod:`imblearn.tree`: Tree split criterion ================================== -.. automodule:: imblearn.tree_split +.. automodule:: imblearn.tree :no-members: :no-inherited-members: @@ -210,7 +210,7 @@ Imbalance-learn provides some fast-prototyping tools. :toctree: generated/ :template: class.rst - tree_split.HellingerDistanceCriterion + tree.criterion.HellingerDistanceCriterion .. autosummary:: :toctree: generated/ diff --git a/doc/tree_split.rst b/doc/tree.rst similarity index 92% rename from doc/tree_split.rst rename to doc/tree.rst index f73e33594..4954cc730 100644 --- a/doc/tree_split.rst +++ b/doc/tree.rst @@ -4,7 +4,7 @@ Tree-split ============== -.. currentmodule:: imblearn.tree_split +.. currentmodule:: imblearn.tree .. _cluster_centroids: @@ -19,7 +19,7 @@ When used as split criterion in Decision Tree Classifier it makes it skew insens >>> from sklearn.datasets import make_classification >>> from sklearn.model_selection import train_test_split >>> from sklearn.ensemble import RandomForestClassifier - >>> from imblearn.tree_split import HellingerDistanceCriterion + >>> from imblearn.tree.criterion import HellingerDistanceCriterion >>> X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=1) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) >>> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) diff --git a/doc/user_guide.rst b/doc/user_guide.rst index 1a9698735..63914f6dd 100644 --- a/doc/user_guide.rst +++ b/doc/user_guide.rst @@ -12,7 +12,7 @@ User Guide introduction.rst over_sampling.rst under_sampling.rst - tree_split.rst + tree.rst combine.rst ensemble.rst miscellaneous.rst diff --git a/examples/tree_split/README.txt b/examples/tree/README.txt similarity index 95% rename from examples/tree_split/README.txt rename to examples/tree/README.txt index 9e4b228ab..a39794dbf 100644 --- a/examples/tree_split/README.txt +++ b/examples/tree/README.txt @@ -1,4 +1,4 @@ -.. _tree_split_examples: +.. 
_tree_examples: Example using Hellinger Distance as tree split criterion ======================================================== diff --git a/examples/tree_split/train_model_with_hellinger_distance_criterion.py b/examples/tree/train_model_with_hellinger_distance_criterion.py similarity index 90% rename from examples/tree_split/train_model_with_hellinger_distance_criterion.py rename to examples/tree/train_model_with_hellinger_distance_criterion.py index dc4ed47fd..c324efe91 100644 --- a/examples/tree_split/train_model_with_hellinger_distance_criterion.py +++ b/examples/tree/train_model_with_hellinger_distance_criterion.py @@ -5,7 +5,7 @@ from sklearn.model_selection import train_test_split from sklearn.ensemble import RandomForestClassifier -from imblearn.tree_split import HellingerDistanceCriterion +from imblearn.tree.criterion import HellingerDistanceCriterion X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=1) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) diff --git a/imblearn/tree/criterion.pxd b/imblearn/tree/criterion.pxd deleted file mode 100644 index e69de29bb..000000000 From 18960021ec08839813724733cb1fbaec775051bf Mon Sep 17 00:00:00 2001 From: edubov Date: Sat, 13 Oct 2018 17:57:56 +0300 Subject: [PATCH 17/50] restored criterion pxd --- imblearn/tree/criterion.pxd | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 imblearn/tree/criterion.pxd diff --git a/imblearn/tree/criterion.pxd b/imblearn/tree/criterion.pxd new file mode 100644 index 000000000..e69de29bb From 443a91b74187822313e766d4f1451f93ff822cf9 Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 08:25:46 +0200 Subject: [PATCH 18/50] doc update --- doc/tree.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/tree.rst b/doc/tree.rst index 4954cc730..cc51a4149 100644 --- a/doc/tree.rst +++ b/doc/tree.rst @@ -8,6 +8,7 @@ Tree-split .. 
_cluster_centroids: + Hellinger Distance split ==================== From 12df09812225d88ba5fde88b25a2496c080a3ade Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 09:40:38 +0200 Subject: [PATCH 19/50] add Cython to travis install list --- .travis.yml | 11 ++++++----- build_tools/travis/install.sh | 3 ++- build_tools/travis/test_script.sh | 1 + 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index f7800f289..819e22b32 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,6 +18,7 @@ addons: packages: - python-numpy - python-scipy + - python-Cython env: global: # Directory where tests are run from @@ -33,16 +34,16 @@ matrix: - env: DISTRIB="ubuntu" # Latest release - env: DISTRIB="conda" PYTHON_VERSION="2.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.20" + NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.20" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" SKLEARN_VERSION="0.20" + NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="master" allow_failures: - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="master" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 4997ed7ea..7af5fb385 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,7 @@ if [[ "$DISTRIB" == "conda" ]]; then # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv - conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION + conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION Cython=$CYTHON_VERSION if [[ $PYTHON_VERSION == "3.6" ]]; then @@ -79,6 +79,7 @@ fi python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" +python -c "import Cython; print('Cython %s' % Cython.__version__)" pip install -e . 
ccache --show-stats diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index ca0770902..35d1112d5 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -19,6 +19,7 @@ run_tests(){ python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" + python -c "import Cython; print('Cython %s' % Cython.__version__)" python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pytest --cov=$MODULE -r sx --pyargs $MODULE From 2bdc7ddcef855a764a471ff6b1ae348453f52652 Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 09:48:42 +0200 Subject: [PATCH 20/50] fixed travis config --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 819e22b32..54a4dd8e8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -18,7 +18,7 @@ addons: packages: - python-numpy - python-scipy - - python-Cython + env: global: # Directory where tests are run from From e06caeab313c8996f3fdfd8b5689ed928f19aa49 Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 10:02:34 +0200 Subject: [PATCH 21/50] added cython special install in travis conda --- .travis.yml | 10 +++++----- build_tools/travis/install.sh | 3 ++- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 54a4dd8e8..e87c32760 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,16 +34,16 @@ matrix: - env: DISTRIB="ubuntu" # Latest release - env: DISTRIB="conda" PYTHON_VERSION="2.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="0.20" + NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="0.20" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="0.20" + NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="*" SKLEARN_VERSION="master" allow_failures: - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29.1" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="*" SKLEARN_VERSION="master" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 7af5fb385..a0fc17e4a 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,8 @@ if [[ "$DISTRIB" == "conda" ]]; then # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv - conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION Cython=$CYTHON_VERSION + conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION + conda install -c anaconda cython=$CYTHON_VERSION if [[ $PYTHON_VERSION == "3.6" ]]; then From 55f67e70ae8c95c854ed748dca42d14d11dfc7df Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 10:22:24 +0200 Subject: [PATCH 22/50] added cython to travis ubuntu --- build_tools/travis/install.sh | 3 ++- 
1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index a0fc17e4a..586f57ad5 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -39,7 +39,7 @@ if [[ "$DISTRIB" == "conda" ]]; then conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION - conda install -c anaconda cython=$CYTHON_VERSION + conda install --yes -c anaconda cython=$CYTHON_VERSION if [[ $PYTHON_VERSION == "3.6" ]]; then @@ -73,6 +73,7 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then pip install scikit-learn pip install pandas keras tensorflow pip install pytest pytest-cov codecov sphinx numpydoc + pip install cython fi From 6049db4cb762e5d11b6ee4606b730891deaf0a60 Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 10:39:24 +0200 Subject: [PATCH 23/50] - fixed Hellinger tree example to pass travis - fixed undersampling example to pass travis --- doc/tree.rst | 3 +-- doc/under_sampling.rst | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/tree.rst b/doc/tree.rst index cc51a4149..6d6d00ac1 100644 --- a/doc/tree.rst +++ b/doc/tree.rst @@ -24,8 +24,7 @@ When used as split criterion in Decision Tree Classifier it makes it skew insens >>> X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=1) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) >>> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) - >>> clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100) - >>> clf.fit(X_train, y_train) + >>> clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100).fit(X_train, y_train) >>> print(clf.score(X_test, y_test)) [0.9465] diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index 821665756..efe2f46c4 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -388,7 +388,6 @@ removed [SMMG2014]_. The class can be used as:: >>> from sklearn.linear_model import LogisticRegression >>> from imblearn.under_sampling import InstanceHardnessThreshold >>> iht = InstanceHardnessThreshold(random_state=0, - ... estimator=LogisticRegression( ... 
solver='lbfgs', multi_class='auto')) From aa7348f2ce1ca8eeb3b556d6ede2570ad725ad02 Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 10:55:31 +0200 Subject: [PATCH 24/50] added pandas to travis install list --- .travis.yml | 10 +++++----- build_tools/travis/install.sh | 2 +- doc/tree.rst | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index e87c32760..f3f2a2077 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,16 +34,16 @@ matrix: - env: DISTRIB="ubuntu" # Latest release - env: DISTRIB="conda" PYTHON_VERSION="2.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" + NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" PANDAS_VERSION="0.23.4" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" + NUMPY_VERSION="*" SCIPY_VERSION="*" PANDAS_VERSION="0.23.4" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" + NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" PANDAS_VERSION="0.23.4" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="*" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" PANDAS_VERSION="0.23.4" CYTHON_VERSION="*" SKLEARN_VERSION="master" allow_failures: - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="*" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" PANDAS_VERSION="0.23.4" CYTHON_VERSION="*" SKLEARN_VERSION="master" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 586f57ad5..ea50a399f 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,7 @@ if [[ "$DISTRIB" == "conda" ]]; then # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv - conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION + conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION pandas=$PANDAS_VERSION conda install --yes -c anaconda cython=$CYTHON_VERSION if [[ $PYTHON_VERSION == "3.6" ]]; then diff --git a/doc/tree.rst b/doc/tree.rst index 6d6d00ac1..6abb9d327 100644 --- a/doc/tree.rst +++ b/doc/tree.rst @@ -26,6 +26,6 @@ When used as split criterion in Decision Tree Classifier it makes it skew insens >>> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) >>> clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100).fit(X_train, y_train) >>> print(clf.score(X_test, y_test)) - [0.9465] + 0.9465 :class:`HellingerDistanceCriterion` offers a Cython implementation of Hellinger Distance as a criterion for decision tree split compatible with sklearn tree based classification models. 
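The doctest above exercises the criterion without restating what it computes. For reference, the textbook definition it builds on — stated here from the standard literature, not from code in this series — is the Hellinger distance between two discrete distributions :math:`P` and :math:`Q`:

.. math::

    H(P, Q) = \frac{1}{\sqrt{2}} \sqrt{\sum_{k=1}^{K} \left(\sqrt{p_k} - \sqrt{q_k}\right)^2}

In the tree criterion, :math:`P` and :math:`Q` are taken as the per-class proportions of samples that a candidate split routes to each child, so the score measures how strongly the split separates the classes rather than how frequent each class is — which is what makes the resulting splits skew-insensitive.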
From 699ce53352c30bec50bf5180af7f4b744341b89e Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 11:15:07 +0200 Subject: [PATCH 25/50] added cython to appveyor --- appveyor.yml | 6 +++--- doc/tree.rst | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 48562a977..db92bced1 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -10,12 +10,12 @@ environment: - PYTHON: "C:\\Miniconda-x64" PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" - OPTIONAL_DEP: "pandas" + OPTIONAL_DEP: "pandas Cython" - PYTHON: "C:\\Miniconda" PYTHON_VERSION: "2.7" PYTHON_ARCH: "32" - OPTIONAL_DEP: "pandas" + OPTIONAL_DEP: "pandas Cython" - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.6" @@ -25,7 +25,7 @@ environment: - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.7" PYTHON_ARCH: "64" - OPTIONAL_DEP: "pandas" + OPTIONAL_DEP: "pandas Cython" - PYTHON: "C:\\Miniconda36" PYTHON_VERSION: "3.7" diff --git a/doc/tree.rst b/doc/tree.rst index 6abb9d327..3c1f22fa4 100644 --- a/doc/tree.rst +++ b/doc/tree.rst @@ -21,10 +21,10 @@ When used as split criterion in Decision Tree Classifier it makes it skew insens >>> from sklearn.model_selection import train_test_split >>> from sklearn.ensemble import RandomForestClassifier >>> from imblearn.tree.criterion import HellingerDistanceCriterion - >>> X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=1) + >>> X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=22) >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) >>> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) - >>> clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100).fit(X_train, y_train) + >>> clf = RandomForestClassifier(random_state=22, criterion=hdc, max_depth=4, n_estimators=100).fit(X_train, y_train) >>> print(clf.score(X_test, y_test)) 0.9465 From f4a9bfa268a893909047dad7805f61d64eca12b8 Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 11:37:57 +0200 Subject: [PATCH 26/50] - changed tree example to pass travus - added cython install to appveyor --- .travis.yml | 10 +++++----- appveyor.yml | 7 ++++--- build_tools/travis/install.sh | 2 +- doc/tree.rst | 10 ++-------- 4 files changed, 12 insertions(+), 17 deletions(-) diff --git a/.travis.yml b/.travis.yml index f3f2a2077..e87c32760 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,16 +34,16 @@ matrix: - env: DISTRIB="ubuntu" # Latest release - env: DISTRIB="conda" PYTHON_VERSION="2.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" PANDAS_VERSION="0.23.4" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" + NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.6" - NUMPY_VERSION="*" SCIPY_VERSION="*" PANDAS_VERSION="0.23.4" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" PANDAS_VERSION="0.23.4" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" + NUMPY_VERSION="1.13.1" SCIPY_VERSION="0.19.1" CYTHON_VERSION="0.29" SKLEARN_VERSION="0.20" - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" PANDAS_VERSION="0.23.4" CYTHON_VERSION="*" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="*" SKLEARN_VERSION="master" 
allow_failures: - env: DISTRIB="conda" PYTHON_VERSION="3.7" - NUMPY_VERSION="*" SCIPY_VERSION="*" PANDAS_VERSION="0.23.4" CYTHON_VERSION="*" SKLEARN_VERSION="master" + NUMPY_VERSION="*" SCIPY_VERSION="*" CYTHON_VERSION="*" SKLEARN_VERSION="master" install: source build_tools/travis/install.sh script: bash build_tools/travis/test_script.sh diff --git a/appveyor.yml b/appveyor.yml index db92bced1..cedb4495a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -10,12 +10,12 @@ environment: - PYTHON: "C:\\Miniconda-x64" PYTHON_VERSION: "2.7" PYTHON_ARCH: "64" - OPTIONAL_DEP: "pandas Cython" + OPTIONAL_DEP: "pandas" - PYTHON: "C:\\Miniconda" PYTHON_VERSION: "2.7" PYTHON_ARCH: "32" - OPTIONAL_DEP: "pandas Cython" + OPTIONAL_DEP: "pandas" - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.6" @@ -25,7 +25,7 @@ environment: - PYTHON: "C:\\Miniconda36-x64" PYTHON_VERSION: "3.7" PYTHON_ARCH: "64" - OPTIONAL_DEP: "pandas Cython" + OPTIONAL_DEP: "pandas" - PYTHON: "C:\\Miniconda36" PYTHON_VERSION: "3.7" @@ -42,6 +42,7 @@ install: - activate testenv - conda install scipy numpy -y -q - conda install scikit-learn -y -q + - conda install -c anaconda cython -y -q - conda install %OPTIONAL_DEP% -y -q - conda install pytest pytest-cov -y -q - pip install codecov diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index ea50a399f..586f57ad5 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,7 @@ if [[ "$DISTRIB" == "conda" ]]; then # provided versions conda create -n testenv --yes python=$PYTHON_VERSION pip source activate testenv - conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION pandas=$PANDAS_VERSION + conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION conda install --yes -c anaconda cython=$CYTHON_VERSION if [[ $PYTHON_VERSION == "3.6" ]]; then diff --git a/doc/tree.rst b/doc/tree.rst index 3c1f22fa4..889f87b6e 100644 --- a/doc/tree.rst +++ b/doc/tree.rst @@ -16,16 +16,10 @@ Hellinger Distance is used to quantify the similarity between two probability di When used as split criterion in Decision Tree Classifier it makes it skew insensitive and helps tackle the imbalance problem. >>> import numpy as np - >>> import pandas as pd - >>> from sklearn.datasets import make_classification - >>> from sklearn.model_selection import train_test_split >>> from sklearn.ensemble import RandomForestClassifier >>> from imblearn.tree.criterion import HellingerDistanceCriterion - >>> X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=22) - >>> X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) + >>> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64')) - >>> clf = RandomForestClassifier(random_state=22, criterion=hdc, max_depth=4, n_estimators=100).fit(X_train, y_train) - >>> print(clf.score(X_test, y_test)) - 0.9465 + >>> clf = RandomForestClassifier(criterion=hdc) :class:`HellingerDistanceCriterion` offers a Cython implementation of Hellinger Distance as a criterion for decision tree split compatible with sklearn tree based classification models. 
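With the doctest now trimmed to construction only, an end-to-end sketch in the spirit of the fuller example earlier in this series may be useful; the dataset is synthetic and the exact score depends on the split, so nothing is asserted::

    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from imblearn.tree.criterion import HellingerDistanceCriterion

    X, y = make_classification(n_samples=10000, n_features=40,
                               n_informative=5, n_classes=2,
                               weights=[0.05, 0.95], random_state=1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

    # constructor arguments as used throughout this series:
    # one output, two classes
    hdc = HellingerDistanceCriterion(1, np.array([2], dtype='int64'))
    clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100)
    clf.fit(X_train, y_train)
    print(clf.score(X_test, y_test))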
From f21c208d85dcc9ac8765eacb69da50417f69ffa0 Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 12:06:19 +0200 Subject: [PATCH 27/50] turned appveyor build on --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index cedb4495a..33bc16e4f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -build: false +build: true environment: # There is no need to run the build for all the Python version / From c022715a31bcdfd7a0a3f3054df9e4627692c126 Mon Sep 17 00:00:00 2001 From: edubov Date: Sun, 30 Dec 2018 12:52:16 +0200 Subject: [PATCH 28/50] turned appveyor build off --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 33bc16e4f..cedb4495a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -build: true +build: false environment: # There is no need to run the build for all the Python version / From 9a5b32b46454e7194cfa53c5ba0e9b5fa9880924 Mon Sep 17 00:00:00 2001 From: edubov Date: Mon, 27 May 2019 08:42:26 +0300 Subject: [PATCH 29/50] appveyor.yml - trying to fix appveyor errors by imblearn build install --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index cedb4495a..26c58c327 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -46,7 +46,7 @@ install: - conda install %OPTIONAL_DEP% -y -q - conda install pytest pytest-cov -y -q - pip install codecov - - pip install . + - python setup.py install test_script: - mkdir for_test From 0cd8d1fd9edecc469b3d5268208a304acabdd6d9 Mon Sep 17 00:00:00 2001 From: edubov Date: Mon, 27 May 2019 09:26:03 +0300 Subject: [PATCH 30/50] Revert appveyor change --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 26c58c327..cedb4495a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -46,7 +46,7 @@ install: - conda install %OPTIONAL_DEP% -y -q - conda install pytest pytest-cov -y -q - pip install codecov - - python setup.py install + - pip install . 
test_script: - mkdir for_test From 810498e10633a79749d4a6184b1bdcc59bbdf7d9 Mon Sep 17 00:00:00 2001 From: edubov Date: Thu, 15 Aug 2019 11:42:40 +0300 Subject: [PATCH 31/50] Synced __check_build\setup.py with sklearn script --- imblearn/__check_build/setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/imblearn/__check_build/setup.py b/imblearn/__check_build/setup.py index 535e504cd..2d76f5230 100644 --- a/imblearn/__check_build/setup.py +++ b/imblearn/__check_build/setup.py @@ -7,9 +7,10 @@ def configuration(parent_package='', top_path=None): from numpy.distutils.misc_util import Configuration config = Configuration('__check_build', parent_package, top_path) config.add_extension('_check_build', - sources=['_check_build.c']) - return config + sources=['_check_build.pyx'], + include_dirs=[numpy.get_include()]) + return config if __name__ == '__main__': from numpy.distutils.core import setup From 64db93ffc92cccd48a61da397fbfc247f334a542 Mon Sep 17 00:00:00 2001 From: edubov Date: Thu, 15 Aug 2019 12:09:15 +0300 Subject: [PATCH 32/50] aligned to master --- build_tools/circle/build_doc.sh | 1 - build_tools/travis/test_script.sh | 2 +- doc/api.rst | 2 - doc/under_sampling.rst | 3 -- imblearn/__check_build/README.md | 5 --- imblearn/__check_build/__init__.py | 58 ------------------------- imblearn/__check_build/_check_build.pyx | 4 -- imblearn/__check_build/setup.py | 17 -------- imblearn/base.py | 8 ---- imblearn/setup.py | 41 ----------------- setup.py | 14 +++--- 11 files changed, 10 insertions(+), 145 deletions(-) delete mode 100644 imblearn/__check_build/README.md delete mode 100644 imblearn/__check_build/__init__.py delete mode 100644 imblearn/__check_build/_check_build.pyx delete mode 100644 imblearn/__check_build/setup.py delete mode 100644 imblearn/setup.py diff --git a/build_tools/circle/build_doc.sh b/build_tools/circle/build_doc.sh index b384ba84e..3f30c27d4 100755 --- a/build_tools/circle/build_doc.sh +++ b/build_tools/circle/build_doc.sh @@ -94,7 +94,6 @@ source activate $CONDA_ENV_NAME conda install --yes pip numpy scipy pillow matplotlib sphinx \ sphinx_rtd_theme numpydoc pandas keras pip install --pre scikit-learn - pip install -U git+https://github.com/sphinx-gallery/sphinx-gallery.git # Build and install imbalanced-learn in dev mode diff --git a/build_tools/travis/test_script.sh b/build_tools/travis/test_script.sh index 35d1112d5..66135fa79 100755 --- a/build_tools/travis/test_script.sh +++ b/build_tools/travis/test_script.sh @@ -19,7 +19,7 @@ run_tests(){ python --version python -c "import numpy; print('numpy %s' % numpy.__version__)" python -c "import scipy; print('scipy %s' % scipy.__version__)" - python -c "import Cython; print('Cython %s' % Cython.__version__)" + python -c "import Cython; print('Cython %s' % Cython.__version__)" python -c "import multiprocessing as mp; print('%d CPUs' % mp.cpu_count())" pytest --cov=$MODULE -r sx --pyargs $MODULE diff --git a/doc/api.rst b/doc/api.rst index 3b7d35f46..e31e07ae8 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -116,7 +116,6 @@ Prototype selection ensemble.BalancedRandomForestClassifier ensemble.EasyEnsemble ensemble.EasyEnsembleClassifier - ensemble.RUSBoostClassifier .. _keras_ref: @@ -157,7 +156,6 @@ Prototype selection :toctree: generated/ :template: function.rst - tensorflow.balanced_batch_generator .. 
_misc_ref: diff --git a/doc/under_sampling.rst b/doc/under_sampling.rst index 0f2c0441f..a8fcdc109 100644 --- a/doc/under_sampling.rst +++ b/doc/under_sampling.rst @@ -106,8 +106,6 @@ by considering independently each targeted class:: In addition, :class:`RandomUnderSampler` allows to sample heterogeneous data (e.g. containing some strings):: - - >>> X_hetero = np.array([['xxx', 1, 1.0], ['yyy', 2, 2.0], ['zzz', 3, 3.0]], ... dtype=np.object) >>> y_hetero = np.array([0, 0, 1]) @@ -390,7 +388,6 @@ removed [SMMG2014]_. The class can be used as:: >>> iht = InstanceHardnessThreshold(random_state=0, ... estimator=LogisticRegression( ... solver='lbfgs', multi_class='auto')) - >>> X_resampled, y_resampled = iht.fit_resample(X, y) >>> print(sorted(Counter(y_resampled).items())) [(0, 64), (1, 64), (2, 64)] diff --git a/imblearn/__check_build/README.md b/imblearn/__check_build/README.md deleted file mode 100644 index 39ff68dc9..000000000 --- a/imblearn/__check_build/README.md +++ /dev/null @@ -1,5 +0,0 @@ -``__check_build`` -================= -The purpose of this submodule is to give the user a readable error when trying -to import the package from within the source tree. - diff --git a/imblearn/__check_build/__init__.py b/imblearn/__check_build/__init__.py deleted file mode 100644 index 8d759d70f..000000000 --- a/imblearn/__check_build/__init__.py +++ /dev/null @@ -1,58 +0,0 @@ -""" Module to give helpful messages to the user that did not -compile package properly, - -This code was adapted from scikit-learn's check_build utility. -""" -import os - -PACKAGE_NAME = 'imblearn' - -INPLACE_MSG = """ -It appears that you are importing {package} from within the source tree. -Please either use an inplace install or try from another location. -""".format(package=PACKAGE_NAME) - -STANDARD_MSG = """ -If you have used an installer, please check that it is suited for your -Python version, your operating system and your platform. -""" - -ERROR_TEMPLATE = """{error} -___________________________________________________________________________ -Contents of {local_dir}: -{contents} -___________________________________________________________________________ -It seems that the {package} has not been built correctly. - -If you have installed {package} from source, please do not forget -to build the package before using it: run `python setup.py install` -in the source directory. -{msg}""" - - -def raise_build_error(e): - # Raise a comprehensible error and list the contents of the - # directory to help debugging on the mailing list. 
- local_dir = os.path.split(__file__)[0] - msg = STANDARD_MSG - if local_dir == "megaman/__check_build": - # Picking up the local install: this will work only if the - # install is an 'inplace build' - msg = INPLACE_MSG - dir_content = list() - for i, filename in enumerate(os.listdir(local_dir)): - if ((i + 1) % 3): - dir_content.append(filename.ljust(26)) - else: - dir_content.append(filename + '\n') - contents = ''.join(dir_content).strip() - raise ImportError(ERROR_TEMPLATE.format(error=e, - local_dir=local_dir, - contents=contents, - package=PACKAGE_NAME, - msg=msg)) - -try: - from ._check_build import check_build -except ImportError as e: - raise_build_error(e) diff --git a/imblearn/__check_build/_check_build.pyx b/imblearn/__check_build/_check_build.pyx deleted file mode 100644 index 4fe9f3a8c..000000000 --- a/imblearn/__check_build/_check_build.pyx +++ /dev/null @@ -1,4 +0,0 @@ -# Adapted from scikit-learn __check_build script (BSD-licensed) - -def check_build(): - return diff --git a/imblearn/__check_build/setup.py b/imblearn/__check_build/setup.py deleted file mode 100644 index 2d76f5230..000000000 --- a/imblearn/__check_build/setup.py +++ /dev/null @@ -1,17 +0,0 @@ -# Adapted from scikit-learn __check_build script (BSD-licensed) - -import numpy - - -def configuration(parent_package='', top_path=None): - from numpy.distutils.misc_util import Configuration - config = Configuration('__check_build', parent_package, top_path) - config.add_extension('_check_build', - sources=['_check_build.pyx'], - include_dirs=[numpy.get_include()]) - - return config - -if __name__ == '__main__': - from numpy.distutils.core import setup - setup(**configuration(top_path='').todict()) diff --git a/imblearn/base.py b/imblearn/base.py index 1af736e0f..ab554aa83 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -137,12 +137,6 @@ def _check_X_y(X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) return X, y, binarize_y - @staticmethod - def _check_X_y(X, y): - y, binarize_y = check_target_type(y, indicate_one_vs_all=True) - X, y = check_X_y(X, y, accept_sparse=['csr', 'csc']) - return X, y, binarize_y - @property def ratio_(self): # FIXME: remove in 0.6 @@ -158,7 +152,6 @@ def _deprecate_ratio(self): self.sampling_strategy = self.ratio - def _identity(X, y): return X, y @@ -234,7 +227,6 @@ def __init__(self, func=None, accept_sparse=True, kw_args=None): self.accept_sparse = accept_sparse self.kw_args = kw_args - def _fit_resample(self, X, y): X, y = check_X_y(X, y, accept_sparse=['csr', 'csc'] if self.accept_sparse else False) diff --git a/imblearn/setup.py b/imblearn/setup.py deleted file mode 100644 index 3ae6851a8..000000000 --- a/imblearn/setup.py +++ /dev/null @@ -1,41 +0,0 @@ -PACKAGE_NAME = 'imblearn' - - -def configuration(parent_package='', top_path=None): - from numpy.distutils.misc_util import Configuration - - config = Configuration(PACKAGE_NAME, parent_package, top_path) - - config.add_subpackage('__check_build') - - # pure python packages - config.add_subpackage('combine') - config.add_subpackage('combine/tests') - config.add_subpackage('datasets') - config.add_subpackage('datasets/tests') - config.add_subpackage('ensemble') - config.add_subpackage('ensemble/tests') - config.add_subpackage('keras') - config.add_subpackage('keras/tests') - config.add_subpackage('metrics') - config.add_subpackage('metrics/tests') - config.add_subpackage('tensorflow') - config.add_subpackage('tensorflow/tests') - config.add_subpackage('tests') - config.add_subpackage('under_sampling') - 
config.add_subpackage('under_sampling/_prototype_generation') - config.add_subpackage('under_sampling/_prototype_generation/tests') - config.add_subpackage('under_sampling/_prototype_selection') - config.add_subpackage('under_sampling/_prototype_selection/tests') - config.add_subpackage('utils') - config.add_subpackage('utils/tests') - - # packages that have their own setup.py -> cython files - config.add_subpackage('tree') - - return config - - -if __name__ == '__main__': - from numpy.distutils.core import setup - setup(**configuration(top_path='').todict()) diff --git a/setup.py b/setup.py index fdeeabb21..a59271ff3 100755 --- a/setup.py +++ b/setup.py @@ -3,14 +3,19 @@ import io import re +import codecs import os import sys import subprocess -import codecs from setuptools import find_packages PACKAGE_NAME = 'imblearn' +# get __version__ from _version.py +ver_file = os.path.join('imblearn', '_version.py') +with open(ver_file) as f: + exec(f.read()) + DISTNAME = 'imbalanced-learn' DESCRIPTION = 'Toolbox for imbalanced dataset in machine learning.' with codecs.open('README.rst', encoding='utf-8-sig') as f: @@ -20,7 +25,7 @@ URL = 'https://github.com/scikit-learn-contrib/imbalanced-learn' LICENSE = 'MIT' DOWNLOAD_URL = 'https://github.com/scikit-learn-contrib/imbalanced-learn' - +VERSION = __version__ CLASSIFIERS = ['Intended Audience :: Science/Research', 'Intended Audience :: Developers', 'License :: OSI Approved', @@ -115,20 +120,19 @@ def setup_package(): try: setup(name=DISTNAME, - author=MAINTAINER, - author_email=MAINTAINER_EMAIL, maintainer=MAINTAINER, maintainer_email=MAINTAINER_EMAIL, description=DESCRIPTION, license=LICENSE, url=URL, - version=version(PACKAGE_NAME), + version=VERSION, download_url=DOWNLOAD_URL, long_description=LONG_DESCRIPTION, zip_safe=False, # the package can run out of an .egg file classifiers=CLASSIFIERS, packages=find_packages(), install_requires=INSTALL_REQUIRES, + extras_require=EXTRAS_REQUIRE, configuration=configuration) finally: del sys.path[0] From e72f2d6ce1addba9681692521c231ec605ae4ca8 Mon Sep 17 00:00:00 2001 From: edubov Date: Thu, 15 Aug 2019 12:29:24 +0300 Subject: [PATCH 33/50] =?UTF-8?q?Fixed=20travis=20issue=20according=20to?= =?UTF-8?q?=20[MRG]=20=F0=9F=91=BD=20Maintenance=20for=20`imblearn.show=5F?= =?UTF-8?q?versions()`,=20fix=20Travis=20build=20#591?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- build_tools/travis/install.sh | 5 ++++- imblearn/utils/_show_versions.py | 21 ++++----------------- imblearn/utils/tests/test_show_versions.py | 9 +-------- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 03ebf0c20..b5920e813 100755 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -28,7 +28,10 @@ if [[ "$DISTRIB" == "conda" ]]; then MINICONDA_PATH=/home/travis/miniconda chmod +x miniconda.sh && ./miniconda.sh -b -p $MINICONDA_PATH export PATH=$MINICONDA_PATH/bin:$PATH - conda install --yes conda=4.6 + conda config --set always_yes yes --set changeps1 no + conda install conda=4.6 + conda update -q conda + # Configure the conda environment and put it in the path using the # provided versions diff --git a/imblearn/utils/_show_versions.py b/imblearn/utils/_show_versions.py index 7a6efb84f..2a0e6efd0 100644 --- a/imblearn/utils/_show_versions.py +++ b/imblearn/utils/_show_versions.py @@ -59,40 +59,31 @@ def show_versions(github=False): If true, wrap system info with GitHub markup. 
""" - from sklearn.utils._show_versions import ( - _get_sys_info, - _get_blas_info, - ) + from sklearn.utils._show_versions import _get_sys_info _sys_info = _get_sys_info() - _blas_info = _get_blas_info() _deps_info = _get_deps_info() _github_markup = ( "
" - "System, BLAS, and Dependencies\n\n" + "System, Dependency Information\n\n" "**System Information**\n\n" "{0}\n" - "**BLAS**\n\n" - "{1}\n" "**Python Dependencies**\n\n" - "{2}\n" + "{1}\n" "
" ) if github: _sys_markup = "" - _blas_markup = "" _deps_markup = "" for k, stat in _sys_info.items(): _sys_markup += "* {k:<10}: `{stat}`\n".format(k=k, stat=stat) - for k, stat in _blas_info.items(): - _blas_markup += "* {k:<10}: `{stat}`\n".format(k=k, stat=stat) for k, stat in _deps_info.items(): _deps_markup += "* {k:<10}: `{stat}`\n".format(k=k, stat=stat) - print(_github_markup.format(_sys_markup, _blas_markup, _deps_markup)) + print(_github_markup.format(_sys_markup, _blas_markup)) else: @@ -100,10 +91,6 @@ def show_versions(github=False): for k, stat in _sys_info.items(): print("{k:>11}: {stat}".format(k=k, stat=stat)) - print("\nBLAS:") - for k, stat in _blas_info.items(): - print("{k:>11}: {stat}".format(k=k, stat=stat)) - print("\nPython dependencies:") for k, stat in _deps_info.items(): print("{k:>11}: {stat}".format(k=k, stat=stat)) diff --git a/imblearn/utils/tests/test_show_versions.py b/imblearn/utils/tests/test_show_versions.py index c53b2b85b..3b59146bc 100644 --- a/imblearn/utils/tests/test_show_versions.py +++ b/imblearn/utils/tests/test_show_versions.py @@ -24,9 +24,6 @@ def test_show_versions_default(capsys): assert "python" in out assert "executable" in out assert "machine" in out - assert "macros" in out - assert "lib_dirs" in out - assert "cblas_libs" in out assert "pip" in out assert "setuptools" in out assert "imblearn" in out @@ -42,15 +39,11 @@ def test_show_versions_default(capsys): def test_show_versions_github(capsys): show_versions(github=True) out, err = capsys.readouterr() - assert "
+    assert "<details><summary>System, Dependency Information</summary>" in out
     assert "**System Information**" in out
     assert "* python" in out
     assert "* executable" in out
     assert "* machine" in out
-    assert "**BLAS**" in out
-    assert "* macros" in out
-    assert "* lib_dirs" in out
-    assert "* cblas_libs" in out
     assert "**Python Dependencies**" in out
     assert "* pip" in out
     assert "* setuptools" in out
From 5a4e53ebcea78d4cd9d8ad0d18eeb0cf3b03a3b9 Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 15 Aug 2019 13:32:45 +0300
Subject: [PATCH 34/50] fixed versions

---
 imblearn/utils/_show_versions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/utils/_show_versions.py b/imblearn/utils/_show_versions.py
index 2a0e6efd0..5b2b964e3 100644
--- a/imblearn/utils/_show_versions.py
+++ b/imblearn/utils/_show_versions.py
@@ -59,7 +59,7 @@ def show_versions(github=False):
         If true, wrap system info with GitHub markup.
 
     """
-    from sklearn.utils._show_versions import  _get_sys_info
+    from sklearn.utils._show_versions import _get_sys_info
 
     _sys_info = _get_sys_info()
     _deps_info = _get_deps_info()
From e1316b2bdf00189eeb3bd60f3217b3a6655ffdb0 Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 15 Aug 2019 13:46:25 +0300
Subject: [PATCH 35/50] fixed travis

---
 imblearn/utils/_show_versions.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/imblearn/utils/_show_versions.py b/imblearn/utils/_show_versions.py
index 5b2b964e3..8b580e24f 100644
--- a/imblearn/utils/_show_versions.py
+++ b/imblearn/utils/_show_versions.py
@@ -83,7 +83,7 @@ def show_versions(github=False):
         for k, stat in _deps_info.items():
             _deps_markup += "* {k:<10}: `{stat}`\n".format(k=k, stat=stat)
 
-        print(_github_markup.format(_sys_markup, _blas_markup))
+        print(_github_markup.format(_sys_markup, _deps_markup))
 
     else:
From 4af0af8e6897aebaae6897e3d278457f3b4a98db Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 15 Aug 2019 14:43:33 +0300
Subject: [PATCH 36/50] commented out hellinger usage example to narrow down
 travis failure root cause

---
 doc/tree.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/tree.rst b/doc/tree.rst
index 889f87b6e..d9e2dd506 100644
--- a/doc/tree.rst
+++ b/doc/tree.rst
@@ -15,11 +15,11 @@ Hellinger Distance split
 Hellinger Distance is used to quantify the similarity between two probability distributions.
 When used as split criterion in Decision Tree Classifier it makes it skew insensitive and helps tackle the imbalance problem.
 
-    >>> import numpy as np
-    >>> from sklearn.ensemble import RandomForestClassifier
-    >>> from imblearn.tree.criterion import HellingerDistanceCriterion
+    >> import numpy as np
+    >> from sklearn.ensemble import RandomForestClassifier
+    >> from imblearn.tree.criterion import HellingerDistanceCriterion
 
-    >>> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64'))
-    >>> clf = RandomForestClassifier(criterion=hdc)
+    >> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64'))
+    >> clf = RandomForestClassifier(criterion=hdc)
 
 :class:`HellingerDistanceCriterion` offers a Cython implementation of Hellinger Distance as a criterion for decision tree split compatible with sklearn tree based classification models. 
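A quick smoke test for the ``show_versions`` repairs in the patches above — a sketch only, since the report is environment-specific and so the output is skipped::

    >>> from imblearn import show_versions
    >>> show_versions(github=True)  # doctest: +SKIP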
From 97ef77f2ef85b5c09cd5e20bb5e1f69b3c7bee5b Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 15 Aug 2019 16:15:30 +0300
Subject: [PATCH 37/50] added hellinger usage example to tree.rst

---
 doc/tree.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/tree.rst b/doc/tree.rst
index d9e2dd506..889f87b6e 100644
--- a/doc/tree.rst
+++ b/doc/tree.rst
@@ -15,11 +15,11 @@ Hellinger Distance split
 Hellinger Distance is used to quantify the similarity between two probability distributions.
 When used as split criterion in Decision Tree Classifier it makes it skew insensitive and helps tackle the imbalance problem.

-  >> import numpy as np
-  >> from sklearn.ensemble import RandomForestClassifier
-  >> from imblearn.tree.criterion import HellingerDistanceCriterion
+  >>> import numpy as np
+  >>> from sklearn.ensemble import RandomForestClassifier
+  >>> from imblearn.tree.criterion import HellingerDistanceCriterion

-  >> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64'))
-  >> clf = RandomForestClassifier(criterion=hdc)
+  >>> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64'))
+  >>> clf = RandomForestClassifier(criterion=hdc)

 :class:`HellingerDistanceCriterion` offers a Cython implementation of Hellinger Distance as a criterion for decision tree split compatible with sklearn tree based classification models.

From a7855a73a67506a5b40dcbf1e854ee75dff36fbd Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 10 Oct 2019 08:38:59 +0300
Subject: [PATCH 38/50] - added Cython temp files to git ignore - added
 Hellinger pyd file to MANIFEST - update cython version requirements in
 hellinger cython code

---
 .gitignore                  | 3 ---
 MANIFEST.in                 | 3 ++-
 imblearn/tree/criterion.pxd | 0
 imblearn/tree/criterion.pyx | 2 ++
 4 files changed, 4 insertions(+), 4 deletions(-)
 delete mode 100644 imblearn/tree/criterion.pxd

diff --git a/.gitignore b/.gitignore
index 47e55743f..3f0bb7a1d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -74,9 +74,6 @@ target/
 .idea/

 # Cython
-*.c
-*.cpp
-cythonize.dat
 *.pyc
 *.pyo
 __pycache__
diff --git a/MANIFEST.in b/MANIFEST.in
index fd510005c..1ce498de9 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,7 +1,8 @@
 recursive-include doc *
 recursive-include examples *
 include imblearn/tree *.pyx
+include imblearn/tree *.pyd
 include AUTHORS.rst
-include CONTRIBUTING.ms
+include CONTRIBUTING.md
 include LICENSE
 include README.rst
\ No newline at end of file
diff --git a/imblearn/tree/criterion.pxd b/imblearn/tree/criterion.pxd
deleted file mode 100644
index e69de29bb..000000000
diff --git a/imblearn/tree/criterion.pyx b/imblearn/tree/criterion.pyx
index 256a73a24..1325ec2e9 100644
--- a/imblearn/tree/criterion.pyx
+++ b/imblearn/tree/criterion.pyx
@@ -2,6 +2,8 @@
 #
 # License: BSD 3 clause

+#cython: language_level=3, boundscheck=False
+
 from libc.math cimport sqrt, pow
 from libc.math cimport abs

From 21e6909cfbf79d283172773573a6c0de7561b03e Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 10 Oct 2019 08:51:09 +0300
Subject: [PATCH 39/50] documentation update

---
 doc/tree.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/tree.rst b/doc/tree.rst
index 889f87b6e..ceb3dc305 100644
--- a/doc/tree.rst
+++ b/doc/tree.rst
@@ -22,4 +22,5 @@
   >>> hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64'))
   >>> clf = RandomForestClassifier(criterion=hdc)

-:class:`HellingerDistanceCriterion` offers a Cython implementation of Hellinger Distance as a criterion for decision tree split compatible with sklearn tree based classification models.
+:class:`HellingerDistanceCriterion` offers a Cython implementation of Hellinger Distance
+as a criterion for decision tree split compatible with sklearn tree based classification models.

From 9268ee989a02e5d57d7300aa1b2e5387c59b01de Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 10 Oct 2019 09:10:49 +0300
Subject: [PATCH 40/50] added cython installation to travis

---
 .travis.yml                   | 8 ++++----
 build_tools/travis/install.sh | 4 +++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 60fa30fd3..ae6e365a9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -33,17 +33,17 @@ matrix:
     - env: DISTRIB="ubuntu"
     # Latest release
     - env: DISTRIB="conda" PYTHON_VERSION="3.6"
-           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.21.2"
+           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.21.2" CYTHON_VERSION="*"
            OPTIONAL_DEPS="true"
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
-           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.21.2"
+           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="0.21.2" CYTHON_VERSION="*"
            OPTIONAL_DEPS="false"
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
-           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master"
+           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" CYTHON_VERSION="*"
            OPTIONAL_DEPS="false"
   allow_failures:
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
-           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master"
+           NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" CYTHON_VERSION="*"
            OPTIONAL_DEPS="false"

diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh
index 342a56883..6f7b9c9d5 100755
--- a/build_tools/travis/install.sh
+++ b/build_tools/travis/install.sh
@@ -37,7 +37,7 @@ if [[ "$DISTRIB" == "conda" ]]; then
     # provided versions
     conda create -n testenv --yes python=$PYTHON_VERSION pip
     source activate testenv
-    conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION
+    conda install --yes numpy=$NUMPY_VERSION scipy=$SCIPY_VERSION cython=$CYTHON_VERSION

     if [[ "$OPTIONAL_DEPS" == "true" ]]; then
         conda install --yes pandas keras tensorflow
@@ -68,12 +68,14 @@ elif [[ "$DISTRIB" == "ubuntu" ]]; then
     pip3 install scikit-learn
     pip3 install pandas keras tensorflow
     pip3 install pytest pytest-cov codecov sphinx numpydoc
+    pip3 install cython
 fi

 python --version
 python -c "import numpy; print('numpy %s' % numpy.__version__)"
 python -c "import scipy; print('scipy %s' % scipy.__version__)"
+python -c "import Cython; print('Cython %s' % Cython.__version__)"

 pip install -e .
 ccache --show-stats

From e4c536086bcfece0b17e8e72025092ef52839cf8 Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 10 Oct 2019 09:16:55 +0300
Subject: [PATCH 41/50] fix few LGTM issues

---
 examples/tree/train_model_with_hellinger_distance_criterion.py | 1 -
 setup.py                                                        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/examples/tree/train_model_with_hellinger_distance_criterion.py b/examples/tree/train_model_with_hellinger_distance_criterion.py
index c324efe91..5cff76da7 100644
--- a/examples/tree/train_model_with_hellinger_distance_criterion.py
+++ b/examples/tree/train_model_with_hellinger_distance_criterion.py
@@ -1,5 +1,4 @@
 import numpy as np
-import pandas as pd

 from sklearn.datasets import make_classification
 from sklearn.model_selection import train_test_split
diff --git a/setup.py b/setup.py
index a59271ff3..0ce164eb4 100755
--- a/setup.py
+++ b/setup.py
@@ -101,7 +101,6 @@ def configuration(parent_package='', top_path=None):
 def setup_package():
     from numpy.distutils.core import setup

-    old_path = os.getcwd()
     local_path = os.path.dirname(os.path.abspath(sys.argv[0]))
     src_path = local_path

From fc9e483a7256148b2e51466a352158a387f5ade0 Mon Sep 17 00:00:00 2001
From: edubov
Date: Thu, 10 Oct 2019 09:42:14 +0300
Subject: [PATCH 42/50] fix LGTM issue

---
 tools/cythonize.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/cythonize.py b/tools/cythonize.py
index b6398f7af..286f584ea 100644
--- a/tools/cythonize.py
+++ b/tools/cythonize.py
@@ -77,9 +77,7 @@ def cythonize(cython_file, gen_file):
         # There are ways of installing Cython that don't result in a cython
         # executable on the path, see scipy issue gh-2397.
         rc = subprocess.call([sys.executable, '-c',
-                             'import sys; from Cython.Compiler.Main '
-                             'import setuptools_main as main;'
-                             ' sys.exit(main())'] + flags +
+                             'import sys; from Cython.Compiler.Main import setuptools_main as main; sys.exit(main())'] + flags +
                              ["-o", gen_file, cython_file])
         if rc != 0:
             raise Exception('Cythonizing %s failed' % cython_file)

From a3cfa7defba701f62003559e21e5da5a4f055fbc Mon Sep 17 00:00:00 2001
From: edubov
Date: Mon, 23 Dec 2019 10:42:58 +0200
Subject: [PATCH 43/50] travis fix

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 0176f814a..144e42623 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,7 +37,7 @@ matrix:
            OPTIONAL_DEPS="keras" TEST_DOC="true" TEST_NUMPYDOC="false"
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
            NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" CYTHON_VERSION="*"
-           OPTIONAL_DEPS="tensorflow" TEST_DOC="true" TEST_NUMPYDOC="false" CYTHON_VERSION="*"
+           OPTIONAL_DEPS="tensorflow" TEST_DOC="true" TEST_NUMPYDOC="false"
     - env: DISTRIB="conda" PYTHON_VERSION="3.7"
            NUMPY_VERSION="*" SCIPY_VERSION="*" SKLEARN_VERSION="master" CYTHON_VERSION="*"
            OPTIONAL_DEPS="false" TEST_DOC="false" TEST_NUMPYDOC="true"

From 0bb474ce63f49c9e38ac53d238a8623c39dc36cc Mon Sep 17 00:00:00 2001
From: edubov
Date: Mon, 23 Dec 2019 10:44:25 +0200
Subject: [PATCH 44/50] fix appveyor

---
 appveyor.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/appveyor.yml b/appveyor.yml
index e741930de..42a7e97b0 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -10,7 +10,7 @@ environment:
   - PYTHON: "C:\\Miniconda36-x64"
     PYTHON_VERSION: "3.6"
     PYTHON_ARCH: "64"
-    OPTIONAL_DEP: "pandas keras tensorflow=1 Cython"
+    OPTIONAL_DEP: "pandas keras tensorflow=1"

   - PYTHON: "C:\\Miniconda36-x64"
     PYTHON_VERSION: "3.6"

From 2fc250e016d7d8ecf6fd3b053d6173884fb3d5ff Mon Sep 17 00:00:00 2001
From: edubov
Date: Mon, 23 Dec 2019 10:45:22 +0200
Subject: [PATCH 45/50] updated MANIFEST.in

---
 MANIFEST.in | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MANIFEST.in b/MANIFEST.in
index 1ce498de9..0637c065e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -3,6 +3,6 @@ recursive-include examples *
 include imblearn/tree *.pyx
 include imblearn/tree *.pyd
 include AUTHORS.rst
-include CONTRIBUTING.md
+include CONTRIBUTING.ms
 include LICENSE
 include README.rst
\ No newline at end of file

From 008b8082a1f437a080869da61d7800379ba89376 Mon Sep 17 00:00:00 2001
From: edubov
Date: Mon, 23 Dec 2019 10:46:26 +0200
Subject: [PATCH 46/50] aligned setup file to master

---
 setup.py | 102 ++++++++-----------------------------------------------
 1 file changed, 15 insertions(+), 87 deletions(-)

diff --git a/setup.py b/setup.py
index 8b5654cdd..b3cab4f09 100755
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,10 @@
 #! /usr/bin/env python
 """Toolbox for imbalanced dataset in machine learning."""

-import io
-import re
 import codecs
 import os
-import sys
-import subprocess

-from setuptools import find_packages
-PACKAGE_NAME = 'imblearn'
+from setuptools import find_packages, setup

 # get __version__ from _version.py
 ver_file = os.path.join('imblearn', '_version.py')
@@ -62,84 +57,17 @@
 }

-
-def version(package, encoding='utf-8'):
-    """Obtain the packge version from a python file e.g. pkg/_version.py
-    See .
-    """
-    path = os.path.join(os.path.dirname(__file__), package, '_version.py')
-    with io.open(path, encoding=encoding) as fp:
-        version_info = fp.read()
-    version_match = re.search(r"""^__version__ = ['"]([^'"]*)['"]""",
-                              version_info, re.M)
-    if not version_match:
-        raise RuntimeError("Unable to find version string.")
-    return version_match.group(1)
-
-
-def generate_cython(package):
-    """Cythonize all sources in the package"""
-    cwd = os.path.abspath(os.path.dirname(__file__))
-    print("Cythonizing sources")
-    p = subprocess.call([sys.executable,
-                         os.path.join(cwd, 'tools', 'cythonize.py'),
-                         package],
-                        cwd=cwd)
-    if p != 0:
-        raise RuntimeError("Running cythonize failed!")
-
-
-def configuration(parent_package='', top_path=None):
-    from numpy.distutils.misc_util import Configuration
-    config = Configuration(None, parent_package, top_path)
-    config.set_options(ignore_setup_xxx_py=True,
-                       assume_default_configuration=True,
-                       delegate_options_to_subpackages=True,
-                       quiet=True)
-    config.add_subpackage(PACKAGE_NAME)
-    return config
-
-
-def setup_package():
-    from numpy.distutils.core import setup
-
-    local_path = os.path.dirname(os.path.abspath(sys.argv[0]))
-    src_path = local_path
-
-    os.chdir(local_path)
-    sys.path.insert(0, local_path)
-
-    # Run build
-    old_path = os.getcwd()
-    os.chdir(src_path)
-    sys.path.insert(0, src_path)
-
-    cwd = os.path.abspath(os.path.dirname(__file__))
-    if not os.path.exists(os.path.join(cwd, 'PKG-INFO')):
-        # Generate Cython sources, unless building from source release
-        generate_cython(PACKAGE_NAME)
-
-    try:
-        setup(name=DISTNAME,
-              maintainer=MAINTAINER,
-              maintainer_email=MAINTAINER_EMAIL,
-              description=DESCRIPTION,
-              license=LICENSE,
-              url=URL,
-              version=VERSION,
-              download_url=DOWNLOAD_URL,
-              long_description=LONG_DESCRIPTION,
-              zip_safe=False,  # the package can run out of an .egg file
-              classifiers=CLASSIFIERS,
-              packages=find_packages(),
-              install_requires=INSTALL_REQUIRES,
-              extras_require=EXTRAS_REQUIRE,
-              configuration=configuration)
-    finally:
-        del sys.path[0]
-        os.chdir(old_path)
-
-    return
-
-
-if __name__ == '__main__':
-    setup_package()
-
+setup(name=DISTNAME,
+      maintainer=MAINTAINER,
+      maintainer_email=MAINTAINER_EMAIL,
+      description=DESCRIPTION,
+      license=LICENSE,
+      url=URL,
+      version=VERSION,
+      download_url=DOWNLOAD_URL,
+      long_description=LONG_DESCRIPTION,
+      zip_safe=False,  # the package can run out of an .egg file
+      classifiers=CLASSIFIERS,
+      packages=find_packages(),
+      install_requires=INSTALL_REQUIRES,
+      extras_require=EXTRAS_REQUIRE)

From 6e67b969af5711a1ba282a716b5068eaa4d3c6e7 Mon Sep 17 00:00:00 2001
From: edubov
Date: Mon, 23 Dec 2019 10:53:36 +0200
Subject: [PATCH 47/50] fixed lint issues

---
 doc/ensemble.rst       | 2 --
 doc/miscellaneous.rst  | 2 --
 imblearn/tree/setup.py | 3 ---
 3 files changed, 7 deletions(-)

diff --git a/doc/ensemble.rst b/doc/ensemble.rst
index d78cab0df..410c302e3 100644
--- a/doc/ensemble.rst
+++ b/doc/ensemble.rst
@@ -6,13 +6,11 @@ Ensemble of samplers

 .. currentmodule:: imblearn.ensemble

-
 .. _ensemble_meta_estimators:

 Classifier including inner balancing samplers
 =============================================

-
 .. _bagging:

 Bagging classifier
diff --git a/doc/miscellaneous.rst b/doc/miscellaneous.rst
index 780ce50f6..412f2b822 100644
--- a/doc/miscellaneous.rst
+++ b/doc/miscellaneous.rst
@@ -167,8 +167,6 @@ will be passed to ``fit_generator``::

     >>> callback_history = model.fit_generator(generator=training_generator,
     ...                                        epochs=10, verbose=0)

-
 .. topic:: References

    * :ref:`sphx_glr_auto_examples_applications_porto_seguro_keras_under_sampling.py`
-
diff --git a/imblearn/tree/setup.py b/imblearn/tree/setup.py
index 061cbc018..071484190 100644
--- a/imblearn/tree/setup.py
+++ b/imblearn/tree/setup.py
@@ -9,9 +9,6 @@ def configuration(parent_package='', top_path=None):
                          sources=['criterion.c'],
                          include_dirs=[numpy.get_include()],
                          libraries=libraries)
-    # extra_compile_args=["-O3", "-fopenmp"],
-    # extra_link_args=["-fopenmp"])
-    # config.add_subpackage("tests")

     return config

From 3afdbb45367016bab9a718b84097892e556b2af7 Mon Sep 17 00:00:00 2001
From: edubov
Date: Mon, 23 Dec 2019 10:57:36 +0200
Subject: [PATCH 48/50] fix lint issues

---
 imblearn/tree/setup.py | 2 +-
 tools/cythonize.py     | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/imblearn/tree/setup.py b/imblearn/tree/setup.py
index 071484190..311c3e0d0 100644
--- a/imblearn/tree/setup.py
+++ b/imblearn/tree/setup.py
@@ -15,4 +15,4 @@ def configuration(parent_package='', top_path=None):
 if __name__ == "__main__":
     from numpy.distutils.core import setup

-    setup(**configuration().todict())
\ No newline at end of file
+    setup(**configuration().todict())
diff --git a/tools/cythonize.py b/tools/cythonize.py
index 286f584ea..1ba5840cd 100644
--- a/tools/cythonize.py
+++ b/tools/cythonize.py
@@ -77,8 +77,10 @@ def cythonize(cython_file, gen_file):
         # There are ways of installing Cython that don't result in a cython
         # executable on the path, see scipy issue gh-2397.
         rc = subprocess.call([sys.executable, '-c',
-                             'import sys; from Cython.Compiler.Main import setuptools_main as main; sys.exit(main())'] + flags +
-                             ["-o", gen_file, cython_file])
+                             'import sys; from Cython.Compiler.Main \
+                             import setuptools_main as main; sys.exit(main())']
+                             + flags
+                             + ["-o", gen_file, cython_file])
         if rc != 0:
             raise Exception('Cythonizing %s failed' % cython_file)
     except OSError:

From 806cc7b62a8db9a04f3bb69185df2ac2394edc64 Mon Sep 17 00:00:00 2001
From: edubov
Date: Mon, 23 Dec 2019 11:02:24 +0200
Subject: [PATCH 49/50] fixed lint issues

---
 tools/cythonize.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/cythonize.py b/tools/cythonize.py
index 1ba5840cd..38fcd94fd 100644
--- a/tools/cythonize.py
+++ b/tools/cythonize.py
@@ -76,11 +76,12 @@ def cythonize(cython_file, gen_file):
     except OSError:
         # There are ways of installing Cython that don't result in a cython
         # executable on the path, see scipy issue gh-2397.
-        rc = subprocess.call([sys.executable, '-c',
-                             'import sys; from Cython.Compiler.Main \
-                             import setuptools_main as main; sys.exit(main())']
-                             + flags
-                             + ["-o", gen_file, cython_file])
+        rc = subprocess.call(
+            [sys.executable, '-c',
+             'import sys; from Cython.Compiler.Main \
+             import setuptools_main as main; sys.exit(main())']
+            + flags
+            + ["-o", gen_file, cython_file])
         if rc != 0:
             raise Exception('Cythonizing %s failed' % cython_file)
     except OSError:

From 7acac1f0cb177af7184ff107ba94fcd37b461d36 Mon Sep 17 00:00:00 2001
From: edubov
Date: Mon, 23 Dec 2019 11:28:55 +0200
Subject: [PATCH 50/50] fix lint issues

---
 .../tree/train_model_with_hellinger_distance_criterion.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/tree/train_model_with_hellinger_distance_criterion.py b/examples/tree/train_model_with_hellinger_distance_criterion.py
index 5cff76da7..3bfebe33d 100644
--- a/examples/tree/train_model_with_hellinger_distance_criterion.py
+++ b/examples/tree/train_model_with_hellinger_distance_criterion.py
@@ -6,10 +6,12 @@
 from imblearn.tree.criterion import HellingerDistanceCriterion

-X, y = make_classification(n_samples=10000, n_features=40, n_informative=5, n_classes=2, weights=[0.05,0.95], random_state=1)
+X, y = make_classification(
+    n_samples=10000, n_features=40, n_informative=5,
+    n_classes=2, weights=[0.05, 0.95], random_state=1)
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

-hdc = HellingerDistanceCriterion(1, np.array([2],dtype='int64'))
+hdc = HellingerDistanceCriterion(1, np.array([2], dtype='int64'))
 clf = RandomForestClassifier(criterion=hdc, max_depth=4, n_estimators=100)
 clf.fit(X_train, y_train)
-print(clf.score(X_test, y_test))
\ No newline at end of file
+print(clf.score(X_test, y_test))
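A closing caveat on the example touched by this last patch: clf.score reports plain accuracy, and with weights=[0.05, 0.95] a constant majority-class prediction already scores roughly 0.95, which can hide the effect of the skew-insensitive split. A possible follow-up check, sketched with standard scikit-learn metrics rather than anything added by this series (clf, X_test and y_test are the objects defined in the example above):

from sklearn.metrics import balanced_accuracy_score, roc_auc_score

# Balanced accuracy averages recall over the two classes, so the rare
# class counts as much as the frequent one.
print(balanced_accuracy_score(y_test, clf.predict(X_test)))

# ROC AUC scores the ranking induced by the predicted probability of
# the positive class; it is likewise insensitive to class skew.
print(roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))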