15 | 15 |
16 | 16 | import matplotlib.pyplot as plt
17 | 17 | import numpy as np
   | 18 | +
18 | 19 | rng = np.random.default_rng()
19 | 20 |
20 | 21 | from scipy.stats import epps_singleton_2samp as es2s
21 |    | -from scipy.stats import ks_2samp as ks2s
22 |    | -from scipy.stats import cramervonmises_2samp as cvm2s
23 |    | -from scipy.stats import chisquare as x2
24 |    | -from scipy.stats import binom
25 |    | -from scipy.stats import kstwo
26 |    | -import os
   | 22 | +
   | 23 | +from ..plot import plot_cdf
   | 24 | +
27 | 25 |
28 | 26 | def box_counting(
29 | 27 |     data,
@@ -73,26 +71,13 @@ def box_counting(
73 | 71 |         offsets = np.linspace(0, scale, n_offsets + 1)[:-1]
74 | 72 |         # search over all offsets
75 | 73 |         for offset in offsets:
76 |    | -            # bin_edges = np.array([np.hstack([inf - 1e-5 - offset, x - 1e-5 - offset])
77 |    | -            #                       for x in bin_temp[:, 1:]])
78 | 74 |             bin_edges = np.array(
79 | 75 |                 [
80 | 76 |                     np.hstack([inf - scale + offset + 1e-5, x + offset + 1e-5])
81 | 77 |                     for x in bin_temp
82 | 78 |                 ]
83 | 79 |             )
84 |    | -            # ind = np.where((bin_edges[0] < sup - 1) == False)[0][0]
85 |    | -            # bin_edges = bin_edges[:, : ind + 1]
86 |    | -            # print(bin_edges)
87 |    | -
88 | 80 |             H1, e = np.histogramdd(data, bins=bin_edges)
89 |    | -            # print(e[0],e[0][0],e[0][-1])
90 |    | -            # if verb:
91 |    | -            #     print(H1.sum())
92 |    | -            # if int(H1.sum()) != array.shape[0]:
93 |    | -            #     print('for scale ', scale, ' and offset ', offset, ' the covering is badly shaped')
94 |    | -            #     touched.append(np.inf)
95 |    | -            #     continue
96 | 81 |             touched.append(np.sum(H1 > 0))
97 | 82 |         Ns.append(touched)
98 | 83 |     Ns = np.array(Ns)
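
Note: for readers of this hunk, the inner loop counts, for each scale and offset, how many grid boxes contain at least one data point. A minimal standalone sketch of that counting step (all values and names below are hypothetical, not part of this module):

    import numpy as np

    rng = np.random.default_rng(0)
    data = rng.random((1000, 2))              # toy 2-d point cloud in the unit square

    scale, offset = 0.1, 0.025                # hypothetical box size and grid shift
    edges = np.arange(-scale, 1.0 + scale, scale) + offset  # grid edges shifted by the offset
    H, _ = np.histogramdd(data, bins=[edges] * data.shape[1])
    n_touched = np.sum(H > 0)                 # boxes containing at least one point
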
@@ -134,7 +119,7 @@ def box_counting(
134 | 119 |             np.log(1 / scales),
135 | 120 |             fitted_y_vals,
136 | 121 |             "k--",
137 |     | -            label=f"Fit: {np.round(coeffs[0],3)}X+{coeffs[1]}",
    | 122 | +            label=f"Fit: {np.round(coeffs[0], 3)}X+{coeffs[1]}",
138 | 123 |         )
139 | 124 |         ax.legend()
140 | 125 |         plt.show()
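
Note: the fit whose label is reformatted above is a least-squares line through the points (log(1/scale), log N(scale)); its slope is the box-counting dimension estimate. A hedged sketch of that step, with made-up counts for illustration:

    import numpy as np

    # toy counts: Ns[i] = occupied boxes at scales[i] (illustrative values only)
    scales = np.array([0.4, 0.2, 0.1, 0.05])
    Ns = np.array([7, 25, 95, 370])

    coeffs = np.polyfit(np.log(1 / scales), np.log(Ns), deg=1)
    dim_estimate = coeffs[0]                  # slope ~ box-counting dimension
    fitted_y_vals = np.polyval(coeffs, np.log(1 / scales))
    print(f"Fit: {np.round(coeffs[0], 3)}X+{coeffs[1]}")
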
@@ -201,120 +186,57 @@ def correlation_integral(dists, scales, cond=False, plot=True):
201 | 186 |
202 | 187 |     return ids, scales[1:], CI
203 | 188 |
    | 189 | +
204 | 190 | # --------------------------------------------------------------------------------------
205 | 191 |
206 | 192 |
207 |     | -def _binomial_model_validation(k, n, p, artificial_samples=100000, k_bootstrap=100, plot=True):
    | 193 | +def _binomial_model_validation(
    | 194 | +    k, n, p, artificial_samples=100000, k_bootstrap=1, plot=False
    | 195 | +):
208 | 196 |     """Perform the model validation for the binomial estimator. To this aim, an artificial set of binomially distributed
209 |     | -    points is extracted and compared to the observed ones. The quantitative test is performed by means of the .... test.
210 |     | -    The associated statistics and p-value is returned
    | 197 | +    points is extracted and compared to the observed ones. The quantitative test is performed by means of the two-sample Epps-Singleton test.
    | 198 | +    The associated p-value is returned.
    | 199 | +    Initially, the two-sample Kolmogorov-Smirnov test was used; although it gave reasonable results, it assumes continuous distributions.
211 | 200 |
212 | 201 |     Args:
213 | 202 |         k (int or np.ndarray(int)): Observed points in the outer shells.
214 | 203 |         n (np.ndarray(int)): Observed points in the inner shells.
215 | 204 |         p (float): Tested binomial parameter.
216 | 205 |         artificial_samples (int, default=100000): number of theoretical samples to be compared with the observed ones.
    | 206 | +        k_bootstrap (int, default=1): number of bootstrap resamplings used to obtain more reliable p-values.
    | 207 | +        plot (bool, default=False): if set to True, plots the observed vs theoretical distributions.
217 | 208 |
218 | 209 |     Returns:
219 |     | -        statistics (float): test statistics
220 | 210 |         p_value (float): p-value obtained from the test
221 | 211 |     """
222 | 212 |     assert 0 < p < 1, "The binomial parameter must be in the interval (0,1)"
223 | 213 |     # sample an artificial ensemble of binomially distributed points to be compared with the observed ones
224 | 214 |     if n.shape[0] < artificial_samples:
225 | 215 |         if isinstance(k, np.ndarray):
226 | 216 |             replicas = int(artificial_samples / n.shape[0])
227 |     | -            n_samp = np.array([rng.binomial(ki, p, size=replicas) for ki in (k-1)]).reshape(-1)
    | 217 | +            n_samp = np.array(
    | 218 | +                [rng.binomial(ki, p, size=replicas) for ki in (k - 1)]
    | 219 | +            ).reshape(-1)
228 | 220 |         else:
229 |     | -            n_samp = rng.binomial(k-1, p, size=artificial_samples)
    | 221 | +            n_samp = rng.binomial(k - 1, p, size=artificial_samples)
230 | 222 |     else:
231 | 223 |         if isinstance(k, np.ndarray):
232 |     | -            n_samp = rng.binomial(k-1, p)
    | 224 | +            n_samp = rng.binomial(k - 1, p)
233 | 225 |         else:
234 |     | -            n_samp = rng.binomial(k-1, p, size=n.shape[0])
235 |     | -
236 |     | -    # compute the theoretical probabilities
237 |     | -#    if isinstance(k, np.ndarray):
238 |     | -#        k_sup = k.max()
239 |     | -#        p_k = np.array([sum(k-1 == i) for i in range(0, k_sup)]) / len(k)
240 |     | -#        p_theo = np.sum([ binom.pmf(range(0, k_sup), ki, p)*p_k[ki] for ki in range(0, k_sup) ], axis=0)
241 |     | -#    else:
242 |     | -#        k_sup = k
243 |     | -#        p_theo = binom.pmf(range(0, k), k-1, p)
244 |     | -
245 |     | -    # observed probability distribution
246 |     | -    #p_obs = np.array([sum(n-1 == i) for i in range(0, k_sup)]) / len(n)
247 |     | -#    commented out as it takes long time and is not needed at the moment
248 |     | -#    print('computing p_samp...')
249 |     | -#    p_samp = np.array([sum(n_samp == i) for i in range(0,k_sup)]) / len(n_samp)
250 |     | -
251 |     | -    #if plot:
252 |     | -    #    print('plotting...')
253 |     | -    #    plt.figure()
254 |     | -    #    plt.plot(p_theo, label='p theo | id')
255 |     | -    #    plt.plot(p_samp, label='p samp | id')
256 |     | -    #    plt.plot(p_obs, label='p obs')
257 |     | -    #    plt.xlabel('n',fontsize=14)
258 |     | -    #    plt.ylabel('p(n)',fontsize=14)
259 |     | -    #plt.yscale('log')
260 |     | -    #plt.show()
261 |     | -
262 |     | -    #x2_d1, x2_pv1 = x2(p_obs, p_theo, ddof=4)
263 |     | -    #x2_d = np.array([x2(p_obs, p_theo, ddof=ki) for ki in range(1, k_sup)])
264 |     | -
265 |     | -    #x2_d, x2_pv = x2(len(n)*p_obs, len(n)*p_theo, ddof=1)
266 |     | -    #f_theo = len(n)*p_theo
267 |     | -    #mask = f_theo > 7
268 |     | -    #chi2 = sum((p_obs-p_theo)**2/p_theo)*len(n)
269 |     | -    #chi2 = sum((f_obs[mask]-f_theo[mask])**2/f_theo[mask])
270 |     | -
271 |     | -    # test against sampled distribution using bootstrap (cramer von mises is left out)
272 |     | -    pvs = np.zeros((k_bootstrap,2))#3
273 |     | -    for ki in range(k_bootstrap):
274 |     | -        n_temp = rng.choice(n,size=len(n),replace=True)
275 |     | -        print(ki,end='\r')
276 |     | -        #p_temp = np.array([sum(n_temp-1 == i) for i in range(0, k_sup)]) / len(n)
277 |     | -
278 |     | -        #if plot:
279 |     | -        #    if ki==0:
280 |     | -        #        plt.plot(p_temp, color='grey',alpha=0.25,zorder=-1,label='p bootstrap')
281 |     | -        #    else:
282 |     | -        #        plt.plot(p_temp, color='grey',alpha=0.25,zorder=-1)
283 |     | -
284 |     | -        ks_d, ks_pv = ks2s(n_samp, n_temp-1)
285 |     | -        es_d, es_pv = es2s(n_samp, n_temp-1)#, t=(0.45, 0.55))
286 |     | -        #appo = cvm2s(n_samp, n_temp-1)
287 |     | -        #cvm_d, cvm_pv = appo.statistic,appo.pvalue
288 |     | -        pvs[ki] = (ks_pv,es_pv)#,cvm_pv)
289 |     | -
290 |     | -    #if plot:
291 |     | -    #    plt.legend()
292 |     | -    #    plt.show()
293 |     | -
294 |     | -    ave = np.array([pvs[:i*5].mean(axis=0) for i in range(1,k_bootstrap//5+1)])
295 |     | -    median = np.array([np.median(pvs[:i*5],axis=0) for i in range(1,k_bootstrap//5+1)])
296 |     | -
    | 226 | +            n_samp = rng.binomial(k - 1, p, size=n.shape[0])
    | 227 | +
297 | 228 |     if plot:
298 |     | -        labels=['ks','es','cvm']
299 |     | -        plt.figure()
300 |     | -        [plt.plot(range(5,k_bootstrap+5,5),ave[:,j],label=labels[j]) for j in range(2)]
301 |     | -        [plt.plot(range(5,k_bootstrap+5,5),median[:,j],label=labels[j]) for j in range(2)]
302 |     | -        plt.xlabel('# of bootstrap',fontsize=14)
303 |     | -        plt.ylabel('average pv',fontsize=14)
304 |     | -        plt.yscale('log')
305 |     | -        plt.legend()
306 |     | -        plt.show()
307 |     | -
308 |     | -    # test against theoretical distribution
309 |     | -    #ks_stat = max(abs(np.cumsum(p_obs)-np.cumsum(p_theo)))
310 |     | -    #ks_p_value = kstwo.sf(ks_stat, len(n))
311 |     | -
312 |     | -
313 |     | -    #print("KS\t", ks_d, ks_pv)
314 |     | -    #print("KS_hm\t", ks_stat, p_value)
315 |     | -    #print("X2 auto\t", x2_d, x2_pv)
316 |     | -    #print("X2 hm\t", x2_d1, x2_pv1)
317 |     | -    # print("X2:\t", x2_d, x2_pv)
318 |     | -    # print("ES:\t", es_d, es_pv)
319 |     | -    # return ks_d, ks_pv
320 |     | -    return ave[-1,0],ave[-1,1], median[-1,0],median[-1,1]
    | 229 | +        plot_cdf(n - 1, n_samp)
    | 230 | +
    | 231 | +    es_d, es_pv = es2s(n_samp, n - 1)
    | 232 | +
    | 233 | +    # possibly test against re-sampled distribution using bootstrap
    | 234 | +    if k_bootstrap > 1:
    | 235 | +        pvs = [es_pv]
    | 236 | +        for ki in range(k_bootstrap):
    | 237 | +            n_temp = rng.choice(n, size=len(n), replace=True)
    | 238 | +            es_d, es_pv = es2s(n_samp, n_temp - 1)
    | 239 | +            pvs.append(es_pv)
    | 240 | +        return np.mean(pvs)
    | 241 | +    else:
    | 242 | +        return es_pv
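
Note: since the docstring motivates the switch from Kolmogorov-Smirnov to Epps-Singleton (the ES test does not assume a continuous distribution, so it suits these integer counts), here is a self-contained sketch of the validation logic in this hunk; the data and parameters below are made up for illustration and are not part of the module:

    import numpy as np
    from scipy.stats import epps_singleton_2samp as es2s

    rng = np.random.default_rng(0)

    k, p = 20, 0.3                                # hypothetical shell size and tested parameter
    n_obs = rng.binomial(k - 1, p, size=5000)     # stand-in for the observed counts n - 1
    n_samp = rng.binomial(k - 1, p, size=100000)  # artificial binomial ensemble

    # direct two-sample Epps-Singleton test: observed vs artificial counts
    es_stat, es_pv = es2s(n_samp, n_obs)

    # optional bootstrap over the observed sample, averaging the p-values
    pvs = [es_pv]
    for _ in range(10):
        n_temp = rng.choice(n_obs, size=len(n_obs), replace=True)
        pvs.append(es2s(n_samp, n_temp)[1])
    print(np.mean(pvs))  # a large p-value is expected here, since n_obs is truly binomial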