
Commit 9e3aa88

imacocco authored and AldoGl committed
overall compatibility fix between functions and tests
1 parent a8e1389 commit 9e3aa88

13 files changed: +204 -560 lines

dadapy/_utils/id_estimation.py (+35 -113)
@@ -15,15 +15,13 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+
 rng = np.random.default_rng()
 
 from scipy.stats import epps_singleton_2samp as es2s
-from scipy.stats import ks_2samp as ks2s
-from scipy.stats import cramervonmises_2samp as cvm2s
-from scipy.stats import chisquare as x2
-from scipy.stats import binom
-from scipy.stats import kstwo
-import os
+
+from ..plot import plot_cdf
+
 
 def box_counting(
     data,
@@ -73,26 +71,13 @@ def box_counting(
         offsets = np.linspace(0, scale, n_offsets + 1)[:-1]
         # search over all offsets
         for offset in offsets:
-            # bin_edges = np.array([np.hstack([inf - 1e-5 - offset, x - 1e-5 - offset])
-            #                       for x in bin_temp[:, 1:]])
             bin_edges = np.array(
                 [
                     np.hstack([inf - scale + offset + 1e-5, x + offset + 1e-5])
                     for x in bin_temp
                 ]
             )
-            # ind = np.where((bin_edges[0] < sup - 1) == False)[0][0]
-            # bin_edges = bin_edges[:, : ind + 1]
-            # print(bin_edges)
-
             H1, e = np.histogramdd(data, bins=bin_edges)
-            # print(e[0],e[0][0],e[0][-1])
-            # if verb:
-            #     print(H1.sum())
-            # if int(H1.sum()) != array.shape[0]:
-            #     print('for scale ', scale, ' and offset ', offset, ' the covering is badly shaped')
-            #     touched.append(np.inf)
-            #     continue
             touched.append(np.sum(H1 > 0))
         Ns.append(touched)
     Ns = np.array(Ns)
@@ -134,7 +119,7 @@ def box_counting(
             np.log(1 / scales),
             fitted_y_vals,
             "k--",
-            label=f"Fit: {np.round(coeffs[0],3)}X+{coeffs[1]}",
+            label=f"Fit: {np.round(coeffs[0], 3)}X+{coeffs[1]}",
         )
         ax.legend()
         plt.show()
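
Note: the counting step these hunks tidy up is plain box counting: overlay grids of boxes of side `scale` (shifted over several offsets), count occupied boxes with `np.histogramdd`, and read the dimension off a linear fit of log N versus log(1/scale). A minimal self-contained sketch of the same idea, for orientation only; it uses a simple floored grid instead of dadapy's offset-searched covering, and `demo_box_dimension` is a hypothetical helper, not library code:

    import numpy as np

    def demo_box_dimension(data, scales):
        """Slope of log N(s) vs log(1/s), i.e. a box-counting dimension."""
        counts = []
        for s in scales:
            # label each point by the box of side s it falls into,
            # then count the distinct occupied boxes
            boxes = np.floor(data / s).astype(int)
            counts.append(len(np.unique(boxes, axis=0)))
        coeffs = np.polyfit(np.log(1.0 / np.asarray(scales)), np.log(counts), deg=1)
        return coeffs[0]

    rng = np.random.default_rng(0)
    points = rng.random((5000, 2))  # uniform 2D data: dimension should be close to 2
    print(demo_box_dimension(points, [0.5, 0.25, 0.125, 0.0625]))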
@@ -201,120 +186,57 @@ def correlation_integral(dists, scales, cond=False, plot=True):
 
     return ids, scales[1:], CI
 
+
 # --------------------------------------------------------------------------------------
 
 
-def _binomial_model_validation(k, n, p, artificial_samples=100000, k_bootstrap=100, plot=True):
+def _binomial_model_validation(
+    k, n, p, artificial_samples=100000, k_bootstrap=1, plot=False
+):
     """Perform the model validation for the binomial estimator. To this aim, an artificial set of binomially distributed
-    points is extracted and compared to the observed ones. The quantitative test is performed by means of the .... test.
-    The associated statistics and p-value is returned
+    points is extracted and compared to the observed ones. The quantitative test is performed by means of the 2-sample Epps-Singleton test.
+    The associated p-value is returned.
+    Initially, the 2-sample Kolmogorov-Smirnov test was used but, although it gives reasonable results, it is designed for continuous distributions.
 
     Args:
         k (int or np.ndarray(int)): Observed points in outer shells.
         n (np.ndarray(int)): Observed points in the inner shells.
         p (float): Tested Binomial parameter
         artificial_samples (int, default=100000): number of theoretical samples to be compared with the observed ones.
+        k_bootstrap (int, default=1): number of bootstrap resamplings performed to obtain more reliable p-values
+        plot (bool, default=False): flag that, if set to True, plots the observed vs theoretical distributions
 
     Returns:
-        statistics (float): test statistics
         p_value (float): p-value obtained from the test
     """
     assert 0 < p < 1, "The binomial parameter must be in the interval (0,1)"
     # sample an artificial ensemble of binomially distributed points to be compared with the observed ones
     if n.shape[0] < artificial_samples:
         if isinstance(k, np.ndarray):
             replicas = int(artificial_samples / n.shape[0])
-            n_samp = np.array([rng.binomial(ki, p, size=replicas) for ki in (k-1)]).reshape(-1)
+            n_samp = np.array(
+                [rng.binomial(ki, p, size=replicas) for ki in (k - 1)]
+            ).reshape(-1)
         else:
-            n_samp = rng.binomial(k-1, p, size=artificial_samples)
+            n_samp = rng.binomial(k - 1, p, size=artificial_samples)
     else:
         if isinstance(k, np.ndarray):
-            n_samp = rng.binomial(k-1, p)
+            n_samp = rng.binomial(k - 1, p)
         else:
-            n_samp = rng.binomial(k-1, p, size=n.shape[0])
-
-    # compute the theoretical probabilities
-    # if isinstance(k, np.ndarray):
-    #     k_sup = k.max()
-    #     p_k = np.array([sum(k-1 == i) for i in range(0, k_sup)]) / len(k)
-    #     p_theo = np.sum([ binom.pmf(range(0, k_sup), ki, p)*p_k[ki] for ki in range(0, k_sup) ], axis=0)
-    # else:
-    #     k_sup = k
-    #     p_theo = binom.pmf(range(0, k), k-1, p)
-
-    # observed probability distribution
-    #p_obs = np.array([sum(n-1 == i) for i in range(0, k_sup)]) / len(n)
-    # commented out as it takes long time and is not needed at the moment
-    # print('computing p_samp...')
-    # p_samp = np.array([sum(n_samp == i) for i in range(0,k_sup)]) / len(n_samp)
-
-    #if plot:
-    #    print('plotting...')
-    #    plt.figure()
-    #    plt.plot(p_theo, label='p theo | id')
-    #    plt.plot(p_samp, label='p samp | id')
-    #    plt.plot(p_obs, label='p obs')
-    #    plt.xlabel('n',fontsize=14)
-    #    plt.ylabel('p(n)',fontsize=14)
-    #    plt.yscale('log')
-    #    plt.show()
-
-    #x2_d1, x2_pv1 = x2(p_obs, p_theo, ddof=4)
-    #x2_d = np.array([x2(p_obs, p_theo, ddof=ki) for ki in range(1, k_sup)])
-
-    #x2_d, x2_pv = x2(len(n)*p_obs, len(n)*p_theo, ddof=1)
-    #f_theo = len(n)*p_theo
-    #mask = f_theo > 7
-    #chi2 = sum((p_obs-p_theo)**2/p_theo)*len(n)
-    #chi2 = sum((f_obs[mask]-f_theo[mask])**2/f_theo[mask])
-
-    # test against sampled distribution using bootstrap (cramer von mises is left out)
-    pvs = np.zeros((k_bootstrap,2))#3
-    for ki in range(k_bootstrap):
-        n_temp = rng.choice(n,size=len(n),replace=True)
-        print(ki,end='\r')
-        #p_temp = np.array([sum(n_temp-1 == i) for i in range(0, k_sup)]) / len(n)
-
-        #if plot:
-        #    if ki==0:
-        #        plt.plot(p_temp, color='grey',alpha=0.25,zorder=-1,label='p bootstrap')
-        #    else:
-        #        plt.plot(p_temp, color='grey',alpha=0.25,zorder=-1)
-
-        ks_d, ks_pv = ks2s(n_samp, n_temp-1)
-        es_d, es_pv = es2s(n_samp, n_temp-1)#, t=(0.45, 0.55))
-        #appo = cvm2s(n_samp, n_temp-1)
-        #cvm_d, cvm_pv = appo.statistic,appo.pvalue
-        pvs[ki] = (ks_pv,es_pv)#,cvm_pv)
-
-    #if plot:
-    #    plt.legend()
-    #    plt.show()
-
-    ave = np.array([pvs[:i*5].mean(axis=0) for i in range(1,k_bootstrap//5+1)])
-    median = np.array([np.median(pvs[:i*5],axis=0) for i in range(1,k_bootstrap//5+1)])
-
+            n_samp = rng.binomial(k - 1, p, size=n.shape[0])
+
     if plot:
-        labels=['ks','es','cvm']
-        plt.figure()
-        [plt.plot(range(5,k_bootstrap+5,5),ave[:,j],label=labels[j]) for j in range(2)]
-        [plt.plot(range(5,k_bootstrap+5,5),median[:,j],label=labels[j]) for j in range(2)]
-        plt.xlabel('# of bootstrap',fontsize=14)
-        plt.ylabel('average pv',fontsize=14)
-        plt.yscale('log')
-        plt.legend()
-        plt.show()
-
-    # test against theoretical distribution
-    #ks_stat = max(abs(np.cumsum(p_obs)-np.cumsum(p_theo)))
-    #ks_p_value = kstwo.sf(ks_stat, len(n))
-
-
-    #print("KS\t", ks_d, ks_pv)
-    #print("KS_hm\t", ks_stat, p_value)
-    #print("X2 auto\t", x2_d, x2_pv)
-    #print("X2 hm\t", x2_d1, x2_pv1)
-    # print("X2:\t", x2_d, x2_pv)
-    # print("ES:\t", es_d, es_pv)
-    # return ks_d, ks_pv
-    return ave[-1,0],ave[-1,1], median[-1,0],median[-1,1]
+        plot_cdf(n - 1, n_samp)
+
+    es_d, es_pv = es2s(n_samp, n - 1)
+
+    # possibly test against re-sampled distribution using bootstrap
+    if k_bootstrap > 1:
+        pvs = [es_pv]
+        for ki in range(k_bootstrap):
+            n_temp = rng.choice(n, size=len(n), replace=True)
+            es_d, es_pv = es2s(n_samp, n_temp - 1)
+            pvs.append(es_pv)
+        return np.mean(pvs)
+    else:
+        return es_pv
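
With this refactor the validation flow reduces to: draw a binomially distributed reference sample at the tested parameter, optionally bootstrap the observed counts, and compare the two samples with scipy's Epps-Singleton test (which, unlike Kolmogorov-Smirnov, is applicable to discrete distributions). A minimal sketch of that flow on toy inputs; all numbers below are illustrative, not taken from the library:

    import numpy as np
    from scipy.stats import epps_singleton_2samp as es2s

    rng = np.random.default_rng(0)

    k, p = 20, 0.4                                  # neighbours per shell, tested parameter
    n_obs = rng.binomial(k - 1, p, size=2000) + 1   # toy "observed" inner-shell counts
    n_samp = rng.binomial(k - 1, p, size=100_000)   # artificial reference ensemble

    # single-shot test, as in the k_bootstrap == 1 branch
    stat, pv = es2s(n_samp, n_obs - 1)

    # bootstrap branch: resample the observations and average the p-values
    pvs = [pv]
    for _ in range(100):
        n_tmp = rng.choice(n_obs, size=len(n_obs), replace=True)
        pvs.append(es2s(n_samp, n_tmp - 1).pvalue)
    print(np.mean(pvs))  # a large p-value means the binomial model is not rejected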

dadapy/data.py (+21 -87)
@@ -30,7 +30,7 @@
 from dadapy.density_advanced import DensityAdvanced
 from dadapy.feature_weighting import FeatureWeighting
 from dadapy.metric_comparisons import MetricComparisons
-from scipy.stats import ks_2samp as KS
+
 rng = np.random.default_rng()
 
 cores = multiprocessing.cpu_count()
@@ -147,121 +147,55 @@ def return_ids_kstar_gride(
         return ids, ids_err, kstars, log_likelihoods
 
     def return_ids_kstar_binomial(
-        self, initial_id=None, n_iter=5, Dthr=23.92812698, r='opt'
+        self, initial_id=None, n_iter=5, Dthr=23.92812698, r=None, plot_mv=False
     ):
         """Return the id estimates of the binomial algorithm coupled with the kstar estimation of the scale.
 
         Args:
             initial_id (float): initial estimate of the id; default uses 2NN
             n_iter (int): number of iterations
             Dthr (float): threshold value for the kstar test
-            r (float): parameter of binomial estimator, 0 < r < 1
-        Returns:
-            ids, ids_err, kstars, log_likelihoods
-        """
# start with an initial estimate of the ID
163-
if initial_id is None:
164-
self.compute_id_2NN(algorithm='base')
165-
else:
166-
self.compute_distances()
167-
self.set_id(initial_id)
168-
169-
ids = np.zeros(n_iter)
170-
ids_err = np.zeros(n_iter)
171-
kstars = np.zeros((n_iter, self.N), dtype=int)
172-
log_likelihoods = np.zeros(n_iter)
173-
ks_stats = np.zeros(n_iter)
174-
p_values = np.zeros(n_iter)
175-
176-
for i in range(n_iter):
177-
# compute kstar
178-
self.compute_kstar(Dthr)
179-
print("iteration ", i)
180-
print("id ", self.intrinsic_dim)
181-
182-
# set new ratio
183-
r_eff = min(0.95,0.2032**(1./self.intrinsic_dim)) if r == 'opt' else r
184-
# compute neighbourhoods shells from k_star
185-
rk = np.array([dd[self.kstar[j]] for j, dd in enumerate(self.distances)])
186-
rn = rk * r_eff
187-
n = np.sum([dd < rn[j] for j, dd in enumerate(self.distances)], axis=1)
188-
# compute id
189-
id = np.log((n.mean() - 1) / (self.kstar.mean() - 1)) / np.log(r_eff)
190-
# compute id error
191-
id_err = ut._compute_binomial_cramerrao(id, self.kstar-1, r_eff, self.N)
192-
# compute likelihood
193-
log_lik = ut.binomial_loglik(id, self.kstar - 1, n - 1, r_eff)
194-
# model validation through KS test
195-
n_model = rng.binomial(self.kstar-1, r_eff**id, size=len(n))
196-
ks, pv = KS(n-1, n_model)
197-
# set new id
198-
self.set_id(id)
158+
r (float, default=None): parameter of binomial estimator, 0 < r < 1. If None, the optimal, adaptive one is
159+
used
160+
plot_mv (bool, default=False): if True, plots the observed and the theoretical distributions compared by
161+
means of the Epps-Singleton test
199162
200-
ids[i] = id
201-
ids_err[i] = id_err
202-
kstars[i] = self.kstar
203-
log_likelihoods[i] = log_lik
204-
#ks_stats[i] = ks
205-
p_values[i] = pv
206-
207-
self.intrinsic_dim = id
208-
self.intrinsic_dim_err = id_err
209-
self.intrinsic_dim_scale = 0.5 * (rn.mean() + rk.mean())
210-
211-
return ids, ids_err, kstars, log_likelihoods, ks_stats, p_values
212-
213-
def return_ids_kstar_binomial_func(
214-
self, initial_id=None, n_iter=5, Dthr=23.92812698, r='opt', verb=True
215-
):
216-
"""Return the id estimates of the binomial algorithm coupled with the kstar estimation of the scale.
217-
218-
Args:
219-
initial_id (float): initial estimate of the id default uses 2NN
220-
n_iter (int): number of iteration
221-
Dthr (float): threshold value for the kstar test
222-
r (float): parameter of binomial estimator, 0 < r < 1
223163
Returns:
224164
ids, ids_err, kstars, log_likelihoods
225165
"""
226-
# start with an initial estimate of the ID
166+
# start with an initial estimate of the ID and the associated k*
227167
if initial_id is None:
228-
self.compute_id_2NN(algorithm='base')
168+
self.compute_id_2NN(algorithm="base")
229169
else:
230170
self.compute_distances()
231171
self.set_id(initial_id)
172+
self.compute_kstar(Dthr)
232173

233174
ids = np.zeros(n_iter)
234175
ids_err = np.zeros(n_iter)
235176
kstars = np.zeros((n_iter, self.N), dtype=int)
236-
ks_pv = np.zeros(n_iter)
237177
es_pv = np.zeros(n_iter)
238-
ks_pv1 = np.zeros(n_iter)
239-
es_pv1 = np.zeros(n_iter)
240-
241178

242179
for i in range(n_iter):
243-
# compute kstar
244-
self.compute_kstar(Dthr)
245-
if verb:
180+
if self.verb:
246181
print("iteration ", i)
247182
print("id ", self.intrinsic_dim)
248183

249184
# set new ratio
250-
r_eff = min(0.95,0.2032**(1./self.intrinsic_dim)) if r == 'opt' else r
185+
r_eff = min(0.95, 0.2032 ** (1.0 / self.intrinsic_dim)) if r is None else r
251186
# compute id using the k*
252-
ide, id_err, scale, ks, es, med1, med2 = self.compute_id_binomial_k(self.kstar, r_eff, bayes=False)
187+
ide, id_err, scale, pv = self.compute_id_binomial_k(
188+
self.kstar, r_eff, bayes=False, plot_mv=plot_mv
189+
)
253190
# compute likelihood
254-
# log_lik = ut.binomial_loglik(id, self.kstar - 1, n - 1, r_eff)
255-
191+
# n = self._fix_k(self.kstar, r_eff)
192+
# log_lik = ut.binomial_loglik(ide, self.kstar - 1, n - 1, r_eff)
193+
# update the k*
194+
self.compute_kstar(Dthr)
195+
# store the obtained values
256196
ids[i] = ide
257197
ids_err[i] = id_err
258198
kstars[i] = self.kstar
259-
ks_pv[i] = ks
260-
es_pv[i] = es
261-
ks_pv1[i] = med1
262-
es_pv1[i] = med2
263-
264-
self.intrinsic_dim = id
265-
self.intrinsic_dim_err = id_err
199+
es_pv[i] = pv
266200

267-
return ids, ids_err, kstars, ks_pv, es_pv, ks_pv1, es_pv1
201+
return ids, ids_err, kstars, es_pv
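
After this change the method returns four arrays and exposes the Epps-Singleton model-validation p-value directly, with r=None replacing the old r='opt' sentinel for the adaptive ratio. A usage sketch; the dataset and parameter values are made up, and it assumes a dadapy Data object built as in the library's docs:

    import numpy as np
    from dadapy import Data

    X = np.random.default_rng(0).standard_normal((1000, 3))  # toy 3D dataset
    d = Data(X)

    # r=None picks the adaptive ratio min(0.95, 0.2032 ** (1 / id));
    # es_pv holds one Epps-Singleton p-value per iteration
    ids, ids_err, kstars, es_pv = d.return_ids_kstar_binomial(
        n_iter=5, Dthr=23.92812698, r=None, plot_mv=False
    )
    print(ids[-1], ids_err[-1], es_pv[-1])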
