
Commit 9e3aa88

imacocco authored and AldoGl committed
overall compatibility fix between functions and tests
1 parent a8e1389 commit 9e3aa88

13 files changed: +204 -560 lines

dadapy/_utils/id_estimation.py (+35 -113)
@@ -15,15 +15,13 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
+
 rng = np.random.default_rng()
 
 from scipy.stats import epps_singleton_2samp as es2s
-from scipy.stats import ks_2samp as ks2s
-from scipy.stats import cramervonmises_2samp as cvm2s
-from scipy.stats import chisquare as x2
-from scipy.stats import binom
-from scipy.stats import kstwo
-import os
+
+from ..plot import plot_cdf
+
 
 def box_counting(
     data,
@@ -73,26 +71,13 @@ def box_counting(
         offsets = np.linspace(0, scale, n_offsets + 1)[:-1]
         # search over all offsets
         for offset in offsets:
-            # bin_edges = np.array([np.hstack([inf - 1e-5 - offset, x - 1e-5 - offset])
-            #                       for x in bin_temp[:, 1:]])
             bin_edges = np.array(
                 [
                     np.hstack([inf - scale + offset + 1e-5, x + offset + 1e-5])
                     for x in bin_temp
                 ]
             )
-            # ind = np.where((bin_edges[0] < sup - 1) == False)[0][0]
-            # bin_edges = bin_edges[:, : ind + 1]
-            # print(bin_edges)
-
             H1, e = np.histogramdd(data, bins=bin_edges)
-            # print(e[0],e[0][0],e[0][-1])
-            # if verb:
-            #     print(H1.sum())
-            # if int(H1.sum()) != array.shape[0]:
-            #     print('for scale ', scale, ' and offset ', offset, ' the covering is badly shaped')
-            #     touched.append(np.inf)
-            #     continue
             touched.append(np.sum(H1 > 0))
         Ns.append(touched)
     Ns = np.array(Ns)
@@ -134,7 +119,7 @@ def box_counting(
             np.log(1 / scales),
             fitted_y_vals,
             "k--",
-            label=f"Fit: {np.round(coeffs[0],3)}X+{coeffs[1]}",
+            label=f"Fit: {np.round(coeffs[0], 3)}X+{coeffs[1]}",
         )
         ax.legend()
         plt.show()
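
Note: the counting step these hunks tidy up is plain box counting: overlay grids of boxes of side `scale` (shifted over several offsets), count occupied boxes with `np.histogramdd`, and read the dimension off a linear fit of log N versus log(1/scale). A minimal self-contained sketch of the same idea, for orientation only; it uses a simple floored grid instead of dadapy's offset-searched covering, and `demo_box_dimension` is a hypothetical helper, not library code:

    import numpy as np

    def demo_box_dimension(data, scales):
        """Slope of log N(s) vs log(1/s), i.e. a box-counting dimension."""
        counts = []
        for s in scales:
            # label each point by the box of side s it falls into,
            # then count the distinct occupied boxes
            boxes = np.floor(data / s).astype(int)
            counts.append(len(np.unique(boxes, axis=0)))
        coeffs = np.polyfit(np.log(1.0 / np.asarray(scales)), np.log(counts), deg=1)
        return coeffs[0]

    rng = np.random.default_rng(0)
    points = rng.random((5000, 2))  # uniform 2D data: dimension should be close to 2
    print(demo_box_dimension(points, [0.5, 0.25, 0.125, 0.0625]))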
@@ -201,120 +186,57 @@ def correlation_integral(dists, scales, cond=False, plot=True):
 
     return ids, scales[1:], CI
 
+
 # --------------------------------------------------------------------------------------
 
 
-def _binomial_model_validation(k, n, p, artificial_samples=100000, k_bootstrap=100, plot=True):
+def _binomial_model_validation(
+    k, n, p, artificial_samples=100000, k_bootstrap=1, plot=False
+):
     """Perform the model validation for the binomial estimator. To this aim, an artificial set of binomially distributed
-    points is extracted and compared to the observed ones. The quantitative test is performed by means of the .... test.
-    The associated statistics and p-value is returned
+    points is extracted and compared to the observed ones. The quantitative test is performed by means of the 2-sample Epps-Singleton test.
+    The associated p-value is returned.
+    Initially, the 2-sample Kolmogorov-Smirnov test was used but, although it gives reasonable results, it is designed for continuous distributions.
 
     Args:
         k (int or np.ndarray(int)): Observed points in outer shells.
         n (np.ndarray(int)): Observed points in the inner shells.
         p (float): Tested Binomial parameter
         artificial_samples (int, default=100000): number of theoretical samples to be compared with the observed ones.
+        k_bootstrap (int, default=1): number of bootstrap resamplings performed to obtain more reliable p-values
+        plot (bool, default=False): flag that, if set to True, plots the observed vs theoretical distributions
 
     Returns:
-        statistics (float): test statistics
         p_value (float): p-value obtained from the test
     """
     assert 0 < p < 1, "The binomial parameter must be in the interval (0,1)"
     # sample an artificial ensemble of binomially distributed points to be compared with the observed ones
     if n.shape[0] < artificial_samples:
         if isinstance(k, np.ndarray):
             replicas = int(artificial_samples / n.shape[0])
-            n_samp = np.array([rng.binomial(ki, p, size=replicas) for ki in (k-1)]).reshape(-1)
+            n_samp = np.array(
+                [rng.binomial(ki, p, size=replicas) for ki in (k - 1)]
+            ).reshape(-1)
         else:
-            n_samp = rng.binomial(k-1, p, size=artificial_samples)
+            n_samp = rng.binomial(k - 1, p, size=artificial_samples)
     else:
         if isinstance(k, np.ndarray):
-            n_samp = rng.binomial(k-1, p)
+            n_samp = rng.binomial(k - 1, p)
         else:
-            n_samp = rng.binomial(k-1, p, size=n.shape[0])
-
-    # compute the theoretical probabilities
-    # if isinstance(k, np.ndarray):
-    #     k_sup = k.max()
-    #     p_k = np.array([sum(k-1 == i) for i in range(0, k_sup)]) / len(k)
-    #     p_theo = np.sum([ binom.pmf(range(0, k_sup), ki, p)*p_k[ki] for ki in range(0, k_sup) ], axis=0)
-    # else:
-    #     k_sup = k
-    #     p_theo = binom.pmf(range(0, k), k-1, p)
-
-    # observed probability distribution
-    #p_obs = np.array([sum(n-1 == i) for i in range(0, k_sup)]) / len(n)
-    # commented out as it takes long time and is not needed at the moment
-    # print('computing p_samp...')
-    # p_samp = np.array([sum(n_samp == i) for i in range(0,k_sup)]) / len(n_samp)
-
-    #if plot:
-    #    print('plotting...')
-    #    plt.figure()
-    #    plt.plot(p_theo, label='p theo | id')
-    #    plt.plot(p_samp, label='p samp | id')
-    #    plt.plot(p_obs, label='p obs')
-    #    plt.xlabel('n',fontsize=14)
-    #    plt.ylabel('p(n)',fontsize=14)
-    #    plt.yscale('log')
-    #    plt.show()
-
-    #x2_d1, x2_pv1 = x2(p_obs, p_theo, ddof=4)
-    #x2_d = np.array([x2(p_obs, p_theo, ddof=ki) for ki in range(1, k_sup)])
-
-    #x2_d, x2_pv = x2(len(n)*p_obs, len(n)*p_theo, ddof=1)
-    #f_theo = len(n)*p_theo
-    #mask = f_theo > 7
-    #chi2 = sum((p_obs-p_theo)**2/p_theo)*len(n)
-    #chi2 = sum((f_obs[mask]-f_theo[mask])**2/f_theo[mask])
-
-    # test against sampled distribution using bootstrap (cramer von mises is left out)
-    pvs = np.zeros((k_bootstrap,2))#3
-    for ki in range(k_bootstrap):
-        n_temp = rng.choice(n,size=len(n),replace=True)
-        print(ki,end='\r')
-        #p_temp = np.array([sum(n_temp-1 == i) for i in range(0, k_sup)]) / len(n)
-
-        #if plot:
-        #    if ki==0:
-        #        plt.plot(p_temp, color='grey',alpha=0.25,zorder=-1,label='p bootstrap')
-        #    else:
-        #        plt.plot(p_temp, color='grey',alpha=0.25,zorder=-1)
-
-        ks_d, ks_pv = ks2s(n_samp, n_temp-1)
-        es_d, es_pv = es2s(n_samp, n_temp-1)#, t=(0.45, 0.55))
-        #appo = cvm2s(n_samp, n_temp-1)
-        #cvm_d, cvm_pv = appo.statistic,appo.pvalue
-        pvs[ki] = (ks_pv,es_pv)#,cvm_pv)
-
-    #if plot:
-    #    plt.legend()
-    #    plt.show()
-
-    ave = np.array([pvs[:i*5].mean(axis=0) for i in range(1,k_bootstrap//5+1)])
-    median = np.array([np.median(pvs[:i*5],axis=0) for i in range(1,k_bootstrap//5+1)])
-
+            n_samp = rng.binomial(k - 1, p, size=n.shape[0])
+
     if plot:
-        labels=['ks','es','cvm']
-        plt.figure()
-        [plt.plot(range(5,k_bootstrap+5,5),ave[:,j],label=labels[j]) for j in range(2)]
-        [plt.plot(range(5,k_bootstrap+5,5),median[:,j],label=labels[j]) for j in range(2)]
-        plt.xlabel('# of bootstrap',fontsize=14)
-        plt.ylabel('average pv',fontsize=14)
-        plt.yscale('log')
-        plt.legend()
-        plt.show()
-
-    # test against theoretical distribution
-    #ks_stat = max(abs(np.cumsum(p_obs)-np.cumsum(p_theo)))
-    #ks_p_value = kstwo.sf(ks_stat, len(n))
-
-
-    #print("KS\t", ks_d, ks_pv)
-    #print("KS_hm\t", ks_stat, p_value)
-    #print("X2 auto\t", x2_d, x2_pv)
-    #print("X2 hm\t", x2_d1, x2_pv1)
-    # print("X2:\t", x2_d, x2_pv)
-    # print("ES:\t", es_d, es_pv)
-    # return ks_d, ks_pv
-    return ave[-1,0],ave[-1,1], median[-1,0],median[-1,1]
+        plot_cdf(n - 1, n_samp)
+
+    es_d, es_pv = es2s(n_samp, n - 1)
+
+    # possibly test against re-sampled distribution using bootstrap
+    if k_bootstrap > 1:
+        pvs = [es_pv]
+        for ki in range(k_bootstrap):
+            n_temp = rng.choice(n, size=len(n), replace=True)
+            es_d, es_pv = es2s(n_samp, n_temp - 1)
+            pvs.append(es_pv)
+        return np.mean(pvs)
+    else:
+        return es_pv
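
With this refactor the validation flow reduces to: draw a binomially distributed reference sample at the tested parameter, optionally bootstrap the observed counts, and compare the two samples with scipy's Epps-Singleton test (which, unlike Kolmogorov-Smirnov, is applicable to discrete distributions). A minimal sketch of that flow on toy inputs; all numbers below are illustrative, not taken from the library:

    import numpy as np
    from scipy.stats import epps_singleton_2samp as es2s

    rng = np.random.default_rng(0)

    k, p = 20, 0.4                                  # neighbours per shell, tested parameter
    n_obs = rng.binomial(k - 1, p, size=2000) + 1   # toy "observed" inner-shell counts
    n_samp = rng.binomial(k - 1, p, size=100_000)   # artificial reference ensemble

    # single-shot test, as in the k_bootstrap == 1 branch
    stat, pv = es2s(n_samp, n_obs - 1)

    # bootstrap branch: resample the observations and average the p-values
    pvs = [pv]
    for _ in range(100):
        n_tmp = rng.choice(n_obs, size=len(n_obs), replace=True)
        pvs.append(es2s(n_samp, n_tmp - 1).pvalue)
    print(np.mean(pvs))  # a large p-value means the binomial model is not rejected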

dadapy/data.py (+21 -87)
@@ -30,7 +30,7 @@
 from dadapy.density_advanced import DensityAdvanced
 from dadapy.feature_weighting import FeatureWeighting
 from dadapy.metric_comparisons import MetricComparisons
-from scipy.stats import ks_2samp as KS
+
 rng = np.random.default_rng()
 
 cores = multiprocessing.cpu_count()
@@ -147,121 +147,55 @@ def return_ids_kstar_gride(
         return ids, ids_err, kstars, log_likelihoods
 
     def return_ids_kstar_binomial(
-        self, initial_id=None, n_iter=5, Dthr=23.92812698, r='opt'
+        self, initial_id=None, n_iter=5, Dthr=23.92812698, r=None, plot_mv=False
     ):
         """Return the id estimates of the binomial algorithm coupled with the kstar estimation of the scale.
 
         Args:
             initial_id (float): initial estimate of the id; default uses 2NN
             n_iter (int): number of iterations
             Dthr (float): threshold value for the kstar test
-            r (float): parameter of binomial estimator, 0 < r < 1
-        Returns:
-            ids, ids_err, kstars, log_likelihoods
-        """
# start with an initial estimate of the ID
163-
if initial_id is None:
164-
self.compute_id_2NN(algorithm='base')
165-
else:
166-
self.compute_distances()
167-
self.set_id(initial_id)
168-
169-
ids = np.zeros(n_iter)
170-
ids_err = np.zeros(n_iter)
171-
kstars = np.zeros((n_iter, self.N), dtype=int)
172-
log_likelihoods = np.zeros(n_iter)
173-
ks_stats = np.zeros(n_iter)
174-
p_values = np.zeros(n_iter)
175-
176-
for i in range(n_iter):
177-
# compute kstar
178-
self.compute_kstar(Dthr)
179-
print("iteration ", i)
180-
print("id ", self.intrinsic_dim)
181-
182-
# set new ratio
183-
r_eff = min(0.95,0.2032**(1./self.intrinsic_dim)) if r == 'opt' else r
184-
# compute neighbourhoods shells from k_star
185-
rk = np.array([dd[self.kstar[j]] for j, dd in enumerate(self.distances)])
186-
rn = rk * r_eff
187-
n = np.sum([dd < rn[j] for j, dd in enumerate(self.distances)], axis=1)
188-
# compute id
189-
id = np.log((n.mean() - 1) / (self.kstar.mean() - 1)) / np.log(r_eff)
190-
# compute id error
191-
id_err = ut._compute_binomial_cramerrao(id, self.kstar-1, r_eff, self.N)
192-
# compute likelihood
193-
log_lik = ut.binomial_loglik(id, self.kstar - 1, n - 1, r_eff)
194-
# model validation through KS test
195-
n_model = rng.binomial(self.kstar-1, r_eff**id, size=len(n))
196-
ks, pv = KS(n-1, n_model)
197-
# set new id
198-
self.set_id(id)
158+
r (float, default=None): parameter of binomial estimator, 0 < r < 1. If None, the optimal, adaptive one is
159+
used
160+
plot_mv (bool, default=False): if True, plots the observed and the theoretical distributions compared by
161+
means of the Epps-Singleton test
199162
200-
ids[i] = id
201-
ids_err[i] = id_err
202-
kstars[i] = self.kstar
203-
log_likelihoods[i] = log_lik
204-
#ks_stats[i] = ks
205-
p_values[i] = pv
206-
207-
self.intrinsic_dim = id
208-
self.intrinsic_dim_err = id_err
209-
self.intrinsic_dim_scale = 0.5 * (rn.mean() + rk.mean())
210-
211-
return ids, ids_err, kstars, log_likelihoods, ks_stats, p_values
212-
213-
def return_ids_kstar_binomial_func(
214-
self, initial_id=None, n_iter=5, Dthr=23.92812698, r='opt', verb=True
215-
):
216-
"""Return the id estimates of the binomial algorithm coupled with the kstar estimation of the scale.
217-
218-
Args:
219-
initial_id (float): initial estimate of the id default uses 2NN
220-
n_iter (int): number of iteration
221-
Dthr (float): threshold value for the kstar test
222-
r (float): parameter of binomial estimator, 0 < r < 1
223163
Returns:
224164
ids, ids_err, kstars, log_likelihoods
225165
"""
226-
# start with an initial estimate of the ID
166+
# start with an initial estimate of the ID and the associated k*
227167
if initial_id is None:
228-
self.compute_id_2NN(algorithm='base')
168+
self.compute_id_2NN(algorithm="base")
229169
else:
230170
self.compute_distances()
231171
self.set_id(initial_id)
172+
self.compute_kstar(Dthr)
232173

233174
ids = np.zeros(n_iter)
234175
ids_err = np.zeros(n_iter)
235176
kstars = np.zeros((n_iter, self.N), dtype=int)
236-
ks_pv = np.zeros(n_iter)
237177
es_pv = np.zeros(n_iter)
238-
ks_pv1 = np.zeros(n_iter)
239-
es_pv1 = np.zeros(n_iter)
240-
241178

242179
for i in range(n_iter):
243-
# compute kstar
244-
self.compute_kstar(Dthr)
245-
if verb:
180+
if self.verb:
246181
print("iteration ", i)
247182
print("id ", self.intrinsic_dim)
248183

249184
# set new ratio
250-
r_eff = min(0.95,0.2032**(1./self.intrinsic_dim)) if r == 'opt' else r
185+
r_eff = min(0.95, 0.2032 ** (1.0 / self.intrinsic_dim)) if r is None else r
251186
# compute id using the k*
252-
ide, id_err, scale, ks, es, med1, med2 = self.compute_id_binomial_k(self.kstar, r_eff, bayes=False)
187+
ide, id_err, scale, pv = self.compute_id_binomial_k(
188+
self.kstar, r_eff, bayes=False, plot_mv=plot_mv
189+
)
253190
# compute likelihood
254-
# log_lik = ut.binomial_loglik(id, self.kstar - 1, n - 1, r_eff)
255-
191+
# n = self._fix_k(self.kstar, r_eff)
192+
# log_lik = ut.binomial_loglik(ide, self.kstar - 1, n - 1, r_eff)
193+
# update the k*
194+
self.compute_kstar(Dthr)
195+
# store the obtained values
256196
ids[i] = ide
257197
ids_err[i] = id_err
258198
kstars[i] = self.kstar
259-
ks_pv[i] = ks
260-
es_pv[i] = es
261-
ks_pv1[i] = med1
262-
es_pv1[i] = med2
263-
264-
self.intrinsic_dim = id
265-
self.intrinsic_dim_err = id_err
199+
es_pv[i] = pv
266200

267-
return ids, ids_err, kstars, ks_pv, es_pv, ks_pv1, es_pv1
201+
return ids, ids_err, kstars, es_pv
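
After this change the method returns four arrays and exposes the Epps-Singleton model-validation p-value directly, with r=None replacing the old r='opt' sentinel for the adaptive ratio. A usage sketch; the dataset and parameter values are made up, and it assumes a dadapy Data object built as in the library's docs:

    import numpy as np
    from dadapy import Data

    X = np.random.default_rng(0).standard_normal((1000, 3))  # toy 3D dataset
    d = Data(X)

    # r=None picks the adaptive ratio min(0.95, 0.2032 ** (1 / id));
    # es_pv holds one Epps-Singleton p-value per iteration
    ids, ids_err, kstars, es_pv = d.return_ids_kstar_binomial(
        n_iter=5, Dthr=23.92812698, r=None, plot_mv=False
    )
    print(ids[-1], ids_err[-1], es_pv[-1])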
