import torch
import torch.nn as nn
from torch import Tensor


class GroupEncoder(nn.Module):
    """Infer per-group rate posteriors q(log τ_k) from local encoder features.

    Works in log-space: q(log τ_k) = Normal(μ_k, σ_k²), then τ_k = exp(log τ_k).
    This avoids Gamma rsample instabilities when τ → 0.

    Architecture (DeepSets):
        x_i → φ(x_i) → mean-pool by group → ρ(·) → (μ_k, logvar_k)

    Parameters
    ----------
    encoder_out : int
        Dimension of per-reflection encoder features.
    hidden_dim : int
        Width of φ and ρ hidden layers.
    log_tau_init : float
        Initial bias of head_mu; should match the prior mean of log τ (e.g. -6.9).
    """

    def __init__(
        self,
        encoder_out: int,
        hidden_dim: int = 64,
        log_tau_init: float = -6.9,
    ):
        super().__init__()

        # φ: per-element transform (before pooling)
        self.phi = nn.Sequential(
            nn.Linear(encoder_out, hidden_dim),
            nn.SiLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
        )

        # ρ: per-group transform (after pooling); depth of φ and ρ is assumed here
        self.rho = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.SiLU(),
        )

        self.head_mu = nn.Linear(hidden_dim, 1)
        self.head_logvar = nn.Linear(hidden_dim, 1)

        nn.init.zeros_(self.head_mu.weight)
        nn.init.constant_(self.head_mu.bias, log_tau_init)
        nn.init.zeros_(self.head_logvar.weight)
        nn.init.constant_(self.head_logvar.bias, -2.0)

    def forward(
        self,
        x: Tensor,
        group_labels: Tensor,
    ) -> tuple[Tensor, Tensor, Tensor]:
        """
        Parameters
        ----------
        x : (B, encoder_out)
            Per-reflection encoder features.
        group_labels : (B,)
            Integer group index of each reflection.

        Returns
        -------
        mu : (n_groups,)
            Posterior mean of log τ_k.
        logvar : (n_groups,)
            Posterior log-variance of log τ_k.
        tau_per_refl : (B, 1)
            Sampled τ_k = exp(log τ_k) broadcast to each reflection.
        """
        # φ: transform each reflection
        z = self.phi(x)  # (B, hidden_dim)

        # Mean-pool by group (simple loop — K is small)
        unique_groups = torch.unique(group_labels)

        group_means = []
        for k in unique_groups:
            # average φ features over the reflections belonging to group k
            mask = group_labels == k
            group_means.append(z[mask].mean(dim=0))

        group_features = torch.stack(group_means)  # (n_groups, hidden_dim)

        # ρ: per-group transform → Normal params in log-space
        h = self.rho(group_features)  # (n_groups, hidden_dim)

        mu = self.head_mu(h).squeeze(-1)  # (n_groups,)
        logvar = self.head_logvar(h).squeeze(-1).clamp(-10.0, 4.0)  # (n_groups,)

        # Reparameterized sample: log τ_k = μ_k + σ_k * ε, ε ~ N(0, 1)
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        log_tau = mu + std * eps  # (n_groups,)
        tau_group = torch.exp(log_tau)  # (n_groups,), always positive

        # Map back: unique_groups is sorted (from torch.unique), so use searchsorted
        indices = torch.searchsorted(unique_groups, group_labels)
        tau_per_refl = tau_group[indices].unsqueeze(1)  # (B, 1)

        return mu, logvar, tau_per_refl
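

# A minimal usage sketch. The batch/group sizes and the prior std over log τ below
# are illustrative assumptions; only the prior mean -6.9 comes from the module's
# defaults. It shows the closed-form KL(q(log τ_k) || N(prior_mu, prior_std²)) that
# such a log-normal posterior would typically contribute to an ELBO.
if __name__ == "__main__":
    torch.manual_seed(0)

    B, D, K = 128, 32, 4                      # reflections, feature dim, groups (assumed)
    x = torch.randn(B, D)                     # stand-in per-reflection encoder features
    group_labels = torch.randint(0, K, (B,))  # integer group index per reflection

    enc = GroupEncoder(encoder_out=D, hidden_dim=64, log_tau_init=-6.9)
    mu, logvar, tau_per_refl = enc(x, group_labels)

    # KL( N(mu, var) || N(prior_mu, prior_std²) ) per group, in closed form
    prior_mu, prior_std = -6.9, 1.0           # prior std is an assumed value
    var = logvar.exp()
    kl = (
        0.5 * ((var + (mu - prior_mu) ** 2) / prior_std**2 - 1.0)
        + torch.log(torch.as_tensor(prior_std))
        - 0.5 * logvar
    )

    print(tau_per_refl.shape, kl.sum().item())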