feat: add trivariate accuracy metric (#179)

mplatzer · lukaszkolodziejczyk · web-flow · commit 7b7fa4991c71 · 2025-05-09T15:44:43.000+02:00
Co-authored-by: Lukasz Kolodziejczyk <lukasz.kolodziejczyk@mostly.ai>
diff --git a/mostlyai/qa/_accuracy.py b/mostlyai/qa/_accuracy.py
@@ -16,6 +16,7 @@
 import hashlib
 import logging
 import math
+import time
 from typing import Any, Literal
 from collections.abc import Callable, Iterable
 
@@ -30,6 +31,7 @@
     CHARTS_COLORS,
     CHARTS_FONTS,
     EMPTY_BIN,
+    MAX_TRIVARIATES,
     NA_BIN,
     MIN_RARE_CAT_PROTECTION,
     OTHER_BIN,
@@ -58,9 +60,9 @@ def calculate_univariates(
     """
     Calculates univariate accuracies for all target columns.
     """
-    _LOG.info("calculate univariates")
-
+    t0 = time.time()
     tgt_cols = [c for c in ori_bin.columns if c.startswith(TGT_COLUMN)]
+
     accuracies = pd.DataFrame({"column": tgt_cols})
     with parallel_config("loky", n_jobs=min(16, max(1, cpu_count() - 1))):
         results = Parallel()(
@@ -71,6 +73,9 @@ def calculate_univariates(
             for _, row in accuracies.iterrows()
         )
         accuracies["accuracy"], accuracies["accuracy_max"] = zip(*results)
+
+    _LOG.info(f"calculated univariates for {len(tgt_cols)} columns in {time.time() - t0:.2f} seconds")
+
     return accuracies
 
 
@@ -87,7 +92,7 @@ def calculate_bivariates(
     For each such column pair, value pair frequencies
     are calculated both for training and synthetic data.
     """
-    _LOG.info("calculate bivariates")
+    t0 = time.time()
 
     # the result for symmetric pairs is the same, so we only calculate one of them
     # later, we append copy results for symmetric pairs
@@ -107,7 +112,6 @@ def calculate_bivariates(
     else:
         # enforce consistent columns
         accuracies[["accuracy", "accuracy_max"]] = None
-        # ensure required number of progress messages are sent
 
     accuracies = pd.concat(
         [
@@ -117,6 +121,8 @@ def calculate_bivariates(
         axis=0,
     ).reset_index(drop=True)
 
+    _LOG.info(f"calculated bivariate accuracies for {len(accuracies)} combinations in {time.time() - t0:.2f} seconds")
+
     return accuracies
 
 
@@ -169,6 +175,49 @@ def calculate_bivariate_columns(ori_bin: pd.DataFrame, append_symetric: bool = T
     return columns_df
 
 
+def calculate_trivariates(ori_bin: pd.DataFrame, syn_bin: pd.DataFrame) -> pd.DataFrame:
+    """
+    Calculates accuracies for the column-triples from `calculate_trivariate_columns`, as `accuracy` / `accuracy_max`.
+    """
+    t0 = time.time()
+
+    accuracies = calculate_trivariate_columns(ori_bin)
+
+    # calculate trivariates if there is at least one triple
+    if len(accuracies) > 0:
+        with parallel_config("loky", n_jobs=min(16, max(1, cpu_count() - 1))):
+            results = Parallel()(
+                delayed(calculate_accuracy)(
+                    ori_bin_cols=ori_bin[[row["col1"], row["col2"], row["col3"]]],
+                    syn_bin_cols=syn_bin[[row["col1"], row["col2"], row["col3"]]],
+                )
+                for _, row in accuracies.iterrows()
+            )
+            accuracies["accuracy"], accuracies["accuracy_max"] = zip(*results)
+    else:
+        # no triples: still add the result columns so the returned frame has consistent columns
+        accuracies[["accuracy", "accuracy_max"]] = None
+
+    _LOG.info(f"calculated trivariate accuracies for {len(accuracies)} combinations in {time.time() - t0:.2f} seconds")
+
+    return accuracies
+
+
+def calculate_trivariate_columns(ori_bin: pd.DataFrame) -> pd.DataFrame:
+    """
+    Creates DataFrame with up to MAX_TRIVARIATES column-triples (col1 < col2 < col3), shuffled without a fixed seed.
+    """
+    tgt_cols = [c for c in ori_bin.columns if c.startswith(TGT_COLUMN_PREFIX)]
+    columns_df = pd.DataFrame({"col1": tgt_cols})
+    columns_df = pd.merge(columns_df, pd.DataFrame({"col2": tgt_cols}), how="cross")
+    # keep only ordered pairs before the second cross join, so the intermediate frame stays well below n^3 rows
+    columns_df = columns_df.loc[columns_df.col1 < columns_df.col2]
+    columns_df = pd.merge(columns_df, pd.DataFrame({"col3": tgt_cols}), how="cross")
+    columns_df = columns_df.loc[columns_df.col2 < columns_df.col3]  # together with col1 < col2 implies col1 < col3
+    columns_df = columns_df.sample(frac=1).head(n=MAX_TRIVARIATES)
+    return columns_df
+
+
 def calculate_expected_l1_multinomial(probs: list[float], n_1: int, n_2: int) -> np.float64:
     """
     Calculate expected L1 distance for two multinomial samples of size `n_1` and `n_2` that follow `probs`.
@@ -349,7 +398,7 @@ def calculate_bin_counts(
     """
     Calculates counts of unique values in each bin.
     """
-    _LOG.info("calculate bin counts")
+    t0 = time.time()
     with parallel_config("loky", n_jobs=min(16, max(1, cpu_count() - 1))):
         results = Parallel()(
             delayed(bin_count_uni)(
@@ -359,8 +408,10 @@ def calculate_bin_counts(
             for col, values in binned.items()
         )
         bin_cnts_uni = dict(results)
+    _LOG.info(f"calculated univariate bin counts for {len(binned.columns)} columns in {time.time() - t0:.2f} seconds")
 
-    biv_cols = calculate_bivariate_columns(binned)
+    t0 = time.time()
+    biv_cols = calculate_bivariate_columns(binned, append_symetric=True)
     with parallel_config("loky", n_jobs=min(16, max(1, cpu_count() - 1))):
         results = Parallel()(
             delayed(bin_count_biv)(
@@ -372,6 +423,7 @@ def calculate_bin_counts(
             for _, row in biv_cols.iterrows()
         )
         bin_cnts_biv = dict(results)
+    _LOG.info(f"calculated bivariate bin counts for {len(biv_cols)} combinations in {time.time() - t0:.2f} seconds")
 
     return bin_cnts_uni, bin_cnts_biv
 
diff --git a/mostlyai/qa/_common.py b/mostlyai/qa/_common.py
@@ -28,6 +28,7 @@
 MAX_BIVARIATE_TGT_PLOTS = 300
 MAX_BIVARIATE_CTX_PLOTS = 60
 MAX_BIVARIATE_NXT_PLOTS = 60
+MAX_TRIVARIATES = 10_000
 
 NA_BIN = "(n/a)"
 OTHER_BIN = "(other)"
diff --git a/mostlyai/qa/_filesystem.py b/mostlyai/qa/_filesystem.py
@@ -110,6 +110,7 @@ def __init__(self, path: str | Path):
         self.bins_dir = self.path / "bins"
         self.univariate_accuracies_path = self.path / "univariate_accuracies.parquet"
         self.bivariate_accuracies_path = self.path / "bivariate_accuracies.parquet"
+        self.trivariate_accuracies_path = self.path / "trivariate_accuracies.parquet"
         self.numeric_kdes_uni_dir = self.path / "numeric_kdes_uni"
         self.categorical_counts_uni_dir = self.path / "categorical_counts_uni"
         self.bin_counts_uni_path = self.path / "bin_counts_uni.parquet"
@@ -203,6 +204,15 @@ def load_bivariate_accuracies(self) -> pd.DataFrame:
         df["col2"] = df["col2"].str.replace(_OLD_COL_PREFIX, _NEW_COL_PREFIX, regex=True)
         return df
 
+    def store_trivariate_accuracies(self, trivariates: pd.DataFrame) -> None:
+        trivariates.to_parquet(self.trivariate_accuracies_path)  # persist as parquet at the path set in __init__
+
+    def load_trivariate_accuracies(self) -> pd.DataFrame:
+        if not self.trivariate_accuracies_path.exists():  # nothing stored: fall back to an empty frame
+            return pd.DataFrame(columns=["col1", "col2", "col3", "accuracy", "accuracy_max"])
+        df = pd.read_parquet(self.trivariate_accuracies_path)  # returned as stored, no column renaming
+        return df
+
     def store_numeric_uni_kdes(self, trn_kdes: dict[str, pd.Series]) -> None:
         trn_kdes = pd.DataFrame(
             [(column, list(xy.index), list(xy.values)) for column, xy in trn_kdes.items()],
diff --git a/mostlyai/qa/_html_report.py b/mostlyai/qa/_html_report.py
@@ -73,6 +73,7 @@ def store_report(
     meta: dict,
     acc_uni: pd.DataFrame,
     acc_biv: pd.DataFrame,
+    acc_triv: pd.DataFrame,
     acc_cats_per_seq: pd.DataFrame,
     acc_seqs_per_cat: pd.DataFrame,
     corr_trn: pd.DataFrame,
@@ -82,7 +83,9 @@ def store_report(
     """
 
     # summarize accuracies by column for overview table
-    accuracy_table_by_column = summarize_accuracies_by_column(acc_uni, acc_biv, acc_cats_per_seq, acc_seqs_per_cat)
+    accuracy_table_by_column = summarize_accuracies_by_column(
+        acc_uni, acc_biv, acc_triv, acc_cats_per_seq, acc_seqs_per_cat
+    )
     accuracy_table_by_column = accuracy_table_by_column.sort_values("univariate", ascending=False)
 
     acc_uni = filter_uni_acc_for_plotting(acc_uni)
@@ -131,27 +134,48 @@ def store_report(
 
 
 def summarize_accuracies_by_column(
-    acc_uni: pd.DataFrame, acc_biv: pd.DataFrame, acc_cats_per_seq: pd.DataFrame, acc_seqs_per_cat: pd.DataFrame
+    acc_uni: pd.DataFrame,
+    acc_biv: pd.DataFrame,
+    acc_triv: pd.DataFrame,
+    acc_cats_per_seq: pd.DataFrame,
+    acc_seqs_per_cat: pd.DataFrame,
 ) -> pd.DataFrame:
     """
-    Calculates DataFrame that stores per-column univariate, bivariate and coherence accuracies.
+    Calculates DataFrame that stores per-column univariate, bivariate, trivariate and coherence accuracies.
     """
 
     tbl_acc_uni = acc_uni.rename(columns={"accuracy": "univariate", "accuracy_max": "univariate_max"})
+    tbl_acc = tbl_acc_uni
+
     tbl_acc_biv = (
-        acc_biv.loc[acc_biv.type != "nxt"]
-        .groupby("col1")
-        .mean(["accuracy", "accuracy_max"])
+        acc_biv.melt(value_vars=["col1", "col2"], value_name="column", id_vars=["accuracy", "accuracy_max"])
+        .groupby("column")[["accuracy", "accuracy_max"]]
+        .mean()
         .reset_index()
         .rename(
             columns={
-                "col1": "column",
                 "accuracy": "bivariate",
                 "accuracy_max": "bivariate_max",
             }
         )
     )
-    tbl_acc = tbl_acc_uni.merge(tbl_acc_biv, how="left")
+    if not tbl_acc_biv.empty:
+        tbl_acc = tbl_acc_uni.merge(tbl_acc_biv, how="left")
+
+    tbl_acc_triv = (
+        acc_triv.melt(value_vars=["col1", "col2", "col3"], value_name="column", id_vars=["accuracy", "accuracy_max"])
+        .groupby("column")[["accuracy", "accuracy_max"]]
+        .mean()
+        .reset_index()
+        .rename(
+            columns={
+                "accuracy": "trivariate",
+                "accuracy_max": "trivariate_max",
+            }
+        )
+    )
+    if not tbl_acc_triv.empty:
+        tbl_acc = tbl_acc.merge(tbl_acc_triv, how="left")
 
     acc_nxt = acc_biv.loc[acc_biv.type == "nxt"]
     if not all((acc_nxt.empty, acc_cats_per_seq.empty, acc_seqs_per_cat.empty)):
diff --git a/mostlyai/qa/assets/html/report_template.html b/mostlyai/qa/assets/html/report_template.html
@@ -75,24 +75,28 @@ <h1 id="summary"><span>{{ meta.report_title }}</span>{{ meta.report_subtitle }}<
                 <table class='table'>
                   <tr><td>Univariate</td>
                     <td align="right">
-                      {{ "{:.1%}".format(metrics.accuracy.univariate) }}<br />
-                      <small class="muted-text">({{ "{:.1%}".format(metrics.accuracy.univariate_max) }})</small>
+                      {{ "{:.1%}".format(metrics.accuracy.univariate) }}
                     </td>
                   </tr>
                   {% if 'bivariate' in accuracy_table_by_column %}
                   <tr><td>Bivariate</td>
                     <td align="right">
-                      {{ "{:.1%}".format(metrics.accuracy.bivariate) }}<br />
-                      <small class="muted-text">({{ "{:.1%}".format(metrics.accuracy.bivariate_max) }})</small>
+                      {{ "{:.1%}".format(metrics.accuracy.bivariate) }}
+                    </td>
+                  </tr>
+                  {% endif %}
+                  {% if 'trivariate' in accuracy_table_by_column %}
+                  <tr><td>Trivariate</td>
+                    <td align="right">
+                      {{ "{:.1%}".format(metrics.accuracy.trivariate) }}
                     </td>
                   </tr>
                   {% endif %}
                   {% if 'coherence' in accuracy_table_by_column %}
                   <tr>
                     <td>Coherence</td>
                     <td align="right">
-                      {{ "{:.1%}".format(metrics.accuracy.coherence).replace('nan%', '-') }}<br />
-                      <small class="muted-text">({{ "{:.1%}".format(metrics.accuracy.coherence_max).replace('nan%', '-') }})</small>
+                      {{ "{:.1%}".format(metrics.accuracy.coherence).replace('nan%', '-') }}
                     </td>
                   </tr>
                   {% endif %}
@@ -305,6 +309,9 @@ <h2 id="accuracy" class="anchor">Accuracy</h2>
             {% if 'bivariate' in accuracy_table_by_column %}
             <th>Bivariate</th>
             {% endif %}
+            {% if 'trivariate' in accuracy_table_by_column %}
+            <th>Trivariate</th>
+            {% endif %}
             {% if 'coherence' in accuracy_table_by_column %}
             <th>Coherence</th>
             {% endif %}
@@ -318,6 +325,9 @@ <h2 id="accuracy" class="anchor">Accuracy</h2>
             {% if 'bivariate' in accuracy_table_by_column %}
             <td>{{ "{:.1%}".format(row['bivariate']) }}</td>
             {% endif %}
+            {% if 'trivariate' in accuracy_table_by_column %}
+            <td>{{ "{:.1%}".format(row['trivariate']) }}</td>
+            {% endif %}
             {% if 'coherence' in accuracy_table_by_column %}
             <td>{{ "{:.1%}".format(row['coherence']).replace('nan%', '-') }}</td>
             {% endif %}
@@ -327,12 +337,15 @@ <h2 id="accuracy" class="anchor">Accuracy</h2>
           <thead>
           <tr>
             <th>Total</th>
-            <th>{{ "{:.1%}".format(metrics.accuracy.univariate) }}</th>
+            <th>{{ "{:.1%}".format(metrics.accuracy.univariate) }} <small class="muted-text">({{ "{:.1%}".format(metrics.accuracy.univariate_max) }})</small></th>
             {% if 'bivariate' in accuracy_table_by_column %}
-            <th>{{ "{:.1%}".format(metrics.accuracy.bivariate) }}</th>
+            <th>{{ "{:.1%}".format(metrics.accuracy.bivariate) }} <small class="muted-text">({{ "{:.1%}".format(metrics.accuracy.bivariate_max) }})</small></th>
+            {% endif %}
+            {% if 'trivariate' in accuracy_table_by_column %}
+            <th>{{ "{:.1%}".format(metrics.accuracy.trivariate) }} <small class="muted-text">({{ "{:.1%}".format(metrics.accuracy.trivariate_max) }})</small></th>
             {% endif %}
             {% if 'coherence' in accuracy_table_by_column %}
-            <th>{{ "{:.1%}".format(metrics.accuracy.coherence) }}</th>
+            <th>{{ "{:.1%}".format(metrics.accuracy.coherence) }} <small class="muted-text">({{ "{:.1%}".format(metrics.accuracy.coherence_max) }})</small></th>
             {% endif %}
           </tr>
           </thead>
@@ -355,7 +368,7 @@ <h2 id="accuracy" class="anchor">Accuracy</h2>
       <div class="explainer-body">
         Accuracy of synthetic data is assessed by comparing the distributions of the synthetic (shown in green) and the original data (shown in gray).
         For each distribution plot we sum up the deviations across all categories, to get the so-called total variation distance (TVD). The reported accuracy is then simply reported as 100% - TVD.
-        These accuracies are calculated for all univariate and bivariate distributions. A final accuracy score is then calculated as the average across all of these.
+        These accuracies are calculated for all univariate, bivariate and trivariate distributions. A final accuracy score is then calculated as the average across all of these.
       </div>
     </div>
   </div>
diff --git a/mostlyai/qa/metrics.py b/mostlyai/qa/metrics.py
@@ -27,7 +27,8 @@ class Accuracy(CustomBaseModel):
     1. **Univariate Accuracy**: The accuracy of the univariate distributions for all target columns.
     2. **Bivariate Accuracy**: The accuracy of all pair-wise distributions for target columns, as well as for target
     columns with respect to the context columns.
-    3. **Coherence Accuracy**: The accuracy of the auto-correlation for all target columns.
+    3. **Trivariate Accuracy**: The accuracy of all three-way distributions for target columns.
+    4. **Coherence Accuracy**: The accuracy of the auto-correlation for all target columns.
 
     Accuracy is defined as 100% - [Total Variation Distance](https://en.wikipedia.org/wiki/Total_variation_distance_of_probability_measures) (TVD),
     whereas TVD is half the sum of the absolute differences of the relative frequencies of the corresponding
@@ -60,6 +61,12 @@ class Accuracy(CustomBaseModel):
         ge=0.0,
         le=1.0,
     )
+    trivariate: float | None = Field(
+        default=None,
+        description="Average accuracy of discretized trivariate distributions.",
+        ge=0.0,
+        le=1.0,
+    )
     coherence: float | None = Field(
         default=None,
         description="Average accuracy of discretized coherence distributions. Only applicable for sequential data.",
@@ -87,6 +94,13 @@ class Accuracy(CustomBaseModel):
         ge=0.0,
         le=1.0,
     )
+    trivariate_max: float | None = Field(
+        default=None,
+        alias="trivariateMax",
+        description="Expected trivariate accuracy of a same-sized holdout. Serves as a reference for `trivariate`.",
+        ge=0.0,
+        le=1.0,
+    )
     coherence_max: float | None = Field(
         default=None,
         alias="coherenceMax",
diff --git a/mostlyai/qa/reporting.py b/mostlyai/qa/reporting.py
diff --git a/mostlyai/qa/reporting_from_statistics.py b/mostlyai/qa/reporting_from_statistics.py
diff --git a/tests/unit/test_html_report.py b/tests/unit/test_html_report.py