Skip to content

Commit 59e06fd

Browse files
Merge pull request #591 from ICB-DCM/develop
Release 0.12.8
2 parents b92b4fb + 98a51b7 commit 59e06fd

File tree

13 files changed

+225
-46
lines changed

13 files changed

+225
-46
lines changed

CHANGELOG.rst

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,22 @@ Release Notes
88
...........
99

1010

11+
0.12.8 (2022-11-16)
12+
-------------------
13+
14+
* Fix look-ahead implementation in case of biased proposals (#568)
15+
16+
Minor:
17+
18+
* Remove bokeh in test env as distributed #7227 got fixed
19+
* Remove obsolete two gaussians test
20+
* Fix Mixin random seed (set it via /dev/urandom)
21+
* Update visserver to bokeh >= 3.0.1 (different import of TabPanel, Tabs)
22+
(all #589)
23+
* Fix sqlalchemy warning "SAWarning: TypeDecorator BytesStorage()
24+
will not produce a cache key" (#590)
25+
26+
1127
0.12.7 (2022-10-30)
1228
-------------------
1329

pyabc/inference/smc.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import copy
44
import logging
55
from datetime import datetime, timedelta
6-
from typing import Callable, List, TypeVar, Union
6+
from typing import Callable, List, Tuple, TypeVar, Union
77

88
import numpy as np
99

@@ -497,7 +497,9 @@ def get_initial_records():
497497
acceptor_config=self.acceptor.get_epsilon_config(t),
498498
)
499499

500-
def _get_initial_population(self, t: int) -> (List[float], List[dict]):
500+
def _get_initial_population(
501+
self, t: int
502+
) -> Tuple[List[float], List[dict]]:
501503
"""
502504
Get initial samples, either from the last population stored in history,
503505
or via sampling sum stats from the prior. This can be used to calibrate

pyabc/inference_util.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ def evaluate_preliminary_particle(
536536
if acc_res.accept:
537537
weight = sampling_weight * acc_res.weight
538538
else:
539-
weight = 0
539+
weight = 0.0
540540

541541
# return the evaluated particle
542542
return Particle(

pyabc/sampler/eps_mixin.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Client submission interface."""
22

3+
import random
34
from abc import ABC, abstractmethod
45
from time import sleep
56
from typing import Union
@@ -65,6 +66,9 @@ def _full_submit_function_pickle(self, job_id):
6566
# Unpickle function
6667
simulate_one = pickle.loads(self._simulate_accept_one)
6768

69+
random.seed()
70+
np.random.seed()
71+
6872
# Run batch_size evaluations and create list of tuples
6973
result_batch = []
7074
for j in range(self.batch_size):
@@ -91,6 +95,8 @@ def sample_until_n_accepted(
9195
else:
9296
# For advanced pickling, e.g. cloudpickle
9397
def full_submit_function(job_id):
98+
random.seed()
99+
np.random.seed()
94100
# Run batch_size evaluations and create list of tuples
95101
result_batch = []
96102
for j in range(self.batch_size):
@@ -168,6 +174,11 @@ def full_submit_function(job_id):
168174
if result[2] == nth_accepted_id:
169175
break
170176

177+
if sample.n_accepted != n:
178+
raise AssertionError(
179+
f"Got {sample.n_accepted} accepted particles but expected {n}"
180+
)
181+
171182
self.nr_evaluations_ = next_job_id
172183

173184
return sample

pyabc/sampler/mapping.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def map_function(self, simulate_one, _):
8686

8787
np.random.seed()
8888
random.seed()
89+
8990
nr_simulations = 0
9091
sample = self._create_empty_sample()
9192

pyabc/sampler/redis_eps/sampler.py

Lines changed: 60 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@ class RedisEvalParallelSampler(RedisSamplerBase):
181181
lead to a worse performance, especially if evaluation is costly
182182
compared to simulation, because evaluation happens sequentially on the
183183
main thread.
184-
Only effective if `look_ahead=True`.
184+
Only effective if `look_ahead is True`.
185185
max_n_eval_look_ahead_factor:
186186
In delayed evaluation, only this factor times the previous number of
187187
samples are generated, afterwards the workers wait.
@@ -194,6 +194,21 @@ class RedisEvalParallelSampler(RedisSamplerBase):
194194
If not, then the sampler only waits for all simulations that were
195195
started prior to the last started particle of the first `n`
196196
acceptances.
197+
Waiting for all should not be needed, this is for studying purposes.
198+
adapt_look_ahead_proposal:
199+
In look-ahead mode, adapt the preliminary proposal based on previous
200+
acceptances.
201+
In theory, as long as proposal >> prior, everything is fine.
202+
However, in practice, given a finite sample size, in some cases the
203+
preliminary proposal may be biased towards earlier-accepted particles,
204+
which can induce a similar bias in the next accepted population.
205+
Thus, if any parameter dependent simulation time heterogeneity is to be
206+
expected, i.e. if different plausible parameter space regions come
207+
with different simulation times, then this flag should be set to False.
208+
If no such heterogeneity is to be expected, this flag can be set to
209+
True, which can result in improved performance due to a more tailored
210+
proposal distribution.
211+
Only effective if `look_ahead is True`.
197212
log_file:
198213
A file for a dedicated sampler history. Updated in each iteration.
199214
This log file is complementary to the logging realized via the
@@ -210,6 +225,7 @@ def __init__(
210225
look_ahead_delay_evaluation: bool = True,
211226
max_n_eval_look_ahead_factor: float = 10.0,
212227
wait_for_all_samples: bool = False,
228+
adapt_look_ahead_proposal: bool = False,
213229
log_file: str = None,
214230
):
215231
super().__init__(
@@ -220,6 +236,7 @@ def __init__(
220236
self.look_ahead_delay_evaluation: bool = look_ahead_delay_evaluation
221237
self.max_n_eval_look_ahead_factor: float = max_n_eval_look_ahead_factor
222238
self.wait_for_all_samples: bool = wait_for_all_samples
239+
self.adapt_look_ahead_proposal: bool = adapt_look_ahead_proposal
223240

224241
def sample_until_n_accepted(
225242
self,
@@ -546,6 +563,7 @@ def maybe_start_next_generation(
546563
t=t + 1,
547564
population=population,
548565
delay_evaluation=self.look_ahead_delay_evaluation,
566+
adapt_proposal=self.adapt_look_ahead_proposal,
549567
ana_vars=ana_vars,
550568
)
551569

@@ -591,6 +609,12 @@ def create_sample(self, id_results: List[Tuple], n: int) -> Sample:
591609
for j in range(n):
592610
sample += results[j]
593611

612+
# check number of acceptances
613+
if (n_accepted := sample.n_accepted) != n:
614+
raise AssertionError(
615+
f"Expected {n} accepted particles but got {n_accepted}"
616+
)
617+
594618
return sample
595619

596620
def check_analysis_variables(
@@ -604,7 +628,7 @@ def check_analysis_variables(
604628
# nothing to be done
605629
return
606630

607-
def check_bad(var):
631+
def _check_bad(var):
608632
"""Check whether a component is incompatible."""
609633
# do not check for `requires_calibration()`, because in the first
610634
# iteration we do not look ahead
@@ -615,15 +639,16 @@ def check_bad(var):
615639
"sampler's `look_ahead_delay_evaluation` flag."
616640
)
617641

618-
check_bad(acceptor)
619-
check_bad(distance_function)
620-
check_bad(eps)
642+
_check_bad(acceptor)
643+
_check_bad(distance_function)
644+
_check_bad(eps)
621645

622646

623647
def create_preliminary_simulate_one(
624648
t,
625649
population,
626650
delay_evaluation: bool,
651+
adapt_proposal: bool,
627652
ana_vars: AnalysisVars,
628653
) -> Callable:
629654
"""Create a preliminary simulate_one function for generation `t`.
@@ -636,24 +661,36 @@ def create_preliminary_simulate_one(
636661
637662
Parameters
638663
----------
639-
t: The time index for which to create the function (i.e. call with t+1).
640-
population: The preliminary population.
641-
delay_evaluation: Whether to delay evaluation.
642-
ana_vars: The analysis variables.
664+
t:
665+
The time index for which to create the function (i.e. call with t+1).
666+
population:
667+
The preliminary population.
668+
delay_evaluation:
669+
Whether to delay evaluation.
670+
adapt_proposal:
671+
Whether to fit the proposal distribution to the new population.
672+
ana_vars:
673+
The analysis variables.
643674
644675
Returns
645676
-------
646677
simulate_one: The preliminary sampling function.
647678
"""
648679
model_probabilities = population.get_model_probabilities()
649680

650-
# create deep copy of the transition function
651-
transitions = copy.deepcopy(ana_vars.transitions)
652-
653-
# fit transition
654-
for m in population.get_alive_models():
655-
parameters, w = population.get_distribution(m)
656-
transitions[m].fit(parameters, w)
681+
# set proposal distribution
682+
transitions = ana_vars.transitions
683+
if adapt_proposal:
684+
# create deep copy of the transition function
685+
transitions = copy.deepcopy(transitions)
686+
# fit transitions
687+
for m in population.get_alive_models():
688+
parameters, w = population.get_distribution(m)
689+
transitions[m].fit(parameters, w)
690+
elif t == 1:
691+
# at t=0, the prior is used for sampling
692+
# (and the transition not fitted yet)
693+
transitions = ana_vars.parameter_priors
657694

658695
return create_simulate_function(
659696
t=t,
@@ -753,6 +790,13 @@ def post_check_acceptance(
753790
def self_normalize_within_subpopulations(sample: Sample, n: int) -> Sample:
754791
"""Applies subpopulation-wise self-normalization of samples, in-place.
755792
793+
The weights are adjusted per proposal id, such that all particles
794+
belonging to one proposal id have a total weight proportional to the
795+
effective sample size of the sub-population.
796+
This defines the relative importances of all particles in the accepted
797+
population in a reasonable manner.
798+
Conceptually, also other normalizations are possible.
799+
756800
Parameters
757801
----------
758802
sample: The population to be returned by the sampler.

pyabc/sampler/redis_eps/server_starter.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,9 @@ def __init__(
109109
batch_size: int = 1,
110110
wait_for_all_samples: bool = False,
111111
look_ahead: bool = False,
112-
look_ahead_delay_evaluation=True,
112+
look_ahead_delay_evaluation: bool = True,
113113
max_n_eval_look_ahead_factor: float = 10.0,
114+
adapt_look_ahead_proposal: bool = False,
114115
workers: int = 2,
115116
processes_per_worker: int = 1,
116117
daemon: bool = True,
@@ -134,6 +135,7 @@ def __init__(
134135
look_ahead=look_ahead,
135136
look_ahead_delay_evaluation=look_ahead_delay_evaluation,
136137
max_n_eval_look_ahead_factor=max_n_eval_look_ahead_factor,
138+
adapt_look_ahead_proposal=adapt_look_ahead_proposal,
137139
log_file=log_file,
138140
)
139141

pyabc/storage/db_model.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,18 @@
3434

3535

3636
class BytesStorage(types.TypeDecorator):
37+
"""Bytes storage.
38+
39+
See https://docs.sqlalchemy.org/en/14/core/custom_types.html.
40+
"""
41+
42+
# Type
3743
impl = LargeBinary
3844

45+
# Safe to be used as part of a cache key, see https://sqlalche.me/e/14/cprf
46+
# (guaranteed to produce the same bind/result behavior every time)
47+
cache_ok = True
48+
3949
def process_bind_param(self, value, dialect): # pylint: disable=R0201
4050
return to_bytes(value)
4151

pyabc/storage/history.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -862,7 +862,7 @@ def get_model_probabilities(
862862
else:
863863
p_models_df = (
864864
pd.DataFrame(p_models, columns=["p", "m", "t"])
865-
.pivot("t", "m", "p")
865+
.pivot(index="t", columns="m", values="p")
866866
.fillna(0)
867867
)
868868
return p_models_df

pyabc/version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '0.12.7'
1+
__version__ = '0.12.8'

0 commit comments

Comments
 (0)