Merge pull request #330 from iiasa/project/ssp/transport/2025-04-08

khaeru · web-flow · commit 818623b16e4e · 2025-04-09T13:00:14.000+02:00
Two adjustments to `.project.ssp.transport`
diff --git a/doc/whatsnew.rst b/doc/whatsnew.rst
@@ -44,9 +44,10 @@ SSP :ref:`ssp-2024`/ScenarioMIP
 
 Improve :mod:`.ssp.transport`:
 
-- Add :func:`~.ssp.transport.method_B` and make this the default (:pull:`259`).
-- Add :func:`~.ssp.transport.method_C` (:issue:`305`, :pull:`325`).
-- Add :func:`~.ssp.transport.process_df` (:pull:`303`).
+- Add :func:`~.ssp.transport.method_B` and make this the default (:pull:`259`, :pull:`330`).
+- Add :func:`~.ssp.transport.method_C` (:issue:`305`, :pull:`325`, :pull:`330`).
+- Add :func:`~.ssp.transport.process_df` (:pull:`303`);
+  handle data frames containing :py:`np.nan` (:pull:`330`).
 - Adapt to revised ‘variable’ codes (:pull:`309`, :issue:`304`).
 
 Transport
diff --git a/message_ix_models/model/transport/operator.py b/message_ix_models/model/transport/operator.py
@@ -2,7 +2,7 @@
 
 import logging
 import re
-from collections.abc import Mapping, Sequence
+from collections.abc import Sequence
 from functools import partial
 from itertools import product
 from operator import gt, le, lt
@@ -26,7 +26,6 @@
     MappingAdapter,
     datetime_now_with_tz,
     minimum_version,
-    nodes_ex_world,
     show_versions,
 )
 from message_ix_models.util.genno import as_quantity
@@ -71,8 +70,6 @@
     "max",
     "maybe_select",
     "min",
-    "nodes_ex_world",  # Re-export from message_ix_models.util TODO do this upstream
-    "nodes_world_agg",
     "price_units",
     "quantity_from_config",
     "relabel2",
@@ -832,41 +829,6 @@ def indexers_usage(technologies: list[Code]) -> dict:
     }
 
 
-def nodes_world_agg(config, dim: Hashable = "nl") -> dict[Hashable, Mapping]:
-    """Mapping to aggregate e.g. nl="World" from values for child nodes of "World".
-
-    This mapping should be used with :func:`.genno.operator.aggregate`, giving the
-    argument ``keep=False``. It includes 1:1 mapping from each region name to itself.
-
-    .. todo:: move to :mod:`message_ix_models.report.operator`.
-    """
-    result = {}
-
-    cl = get_codelist(f"node/{config['regions']}")
-    for n in cl:
-        # "World" node should have be top-level (its parent is the `cl` itself) and have
-        # some children. Countries (from pycountry) that are omitted from a mapping have
-        # no children.
-        if n.parent is cl and len(n.child):
-            name = str(n)
-
-            # FIXME Remove. This is a hack to suit the legacy reporting, which expects
-            #       global aggregates at *_GLB rather than "World".
-            new_name = f"{config['regions']}_GLB"
-            log.info(f"Aggregates for {n!r} will be labelled {new_name!r}")
-            name = new_name
-
-            # Global total as aggregate of child nodes
-            result = {name: list(map(str, n.child))}
-
-            # Also add "no-op" aggregates e.g. "R12_AFR" is the sum of ["R12_AFR"]
-            result.update({c: [c] for c in map(str, n.child)})
-
-            return {dim: result}
-
-    raise RuntimeError("Failed to identify the World node")
-
-
 def price_units(qty: "AnyQuantity") -> "AnyQuantity":
     """Forcibly adjust price units, if necessary."""
     target = "USD_2010 / km"
diff --git a/message_ix_models/project/ssp/transport.py b/message_ix_models/project/ssp/transport.py
@@ -265,6 +265,7 @@ def get_computer(
     context = Context(model=ModelConfig(regions="R12"))
     # Store in `c` for reference by other operations
     c.add("context", context)
+    c.graph["config"].update(regions="R12")
 
     # Store a model name and scenario name from a single row of the data
     model_name, scenario_name = row0[["Model", "Scenario"]]
@@ -413,11 +414,15 @@ def method_B(c: "Computer") -> None:
     c.add(fe.iea[1], "aggregate", fe.iea[0], g, keep=False)
 
     # Rename dimensions
-    c.add(fe.cnt, "rename_dims", fe.iea[1], name_dict=dict(flow="t", product="c"))
+    c.add(fe.cnt[0], "rename_dims", fe.iea[1], name_dict=dict(flow="t", product="c"))
 
-    # Compute ratio
-    c.add(fe.share[0], "select", fe.cnt, indexers=dict(t="_1"), drop=True)
-    c.add(fe.share[1], "select", fe.cnt, indexers=dict(t="_2"), drop=True)
+    # Global total
+    c.add("n::world agg", "nodes_world_agg", "config", dim="n", name=None)
+    c.add(fe.cnt[1], "aggregate", fe.cnt[0], "n::world agg", keep=False)
+
+    # Ratio of _1 (DOMESAIR - AVBUNK) to _2 (TOTTRANS - AVBUNK)
+    c.add(fe.share[0], "select", fe.cnt[1], indexers=dict(t="_1"), drop=True)
+    c.add(fe.share[1], "select", fe.cnt[1], indexers=dict(t="_2"), drop=True)
     c.add(fe.share, "div", fe.share[0], fe.share[1])
 
     # Prepare remaining calculations
@@ -467,11 +472,11 @@ def method_BC_common(c: "Computer", k_fe_share: "Key") -> None:
 
     # Relabel:
     # - c[ommodity]: 'Liquids|Oil' (IAMC 'variable' component) → 'lightoil'
-    # - n[ode]: 'AFR' → 'R12_AFR' etc.
-    labels = dict(
-        c={"Liquids|Oil": "lightoil"},
-        n={n.id.partition("_")[2]: n.id for n in get_codelist("node/R12")},
-    )
+    # - n[ode]: "AFR" → "R12_AFR" etc. "World" is not changed.
+    cl = get_codelist("node/R12")
+    labels = dict(c={"Liquids|Oil": "lightoil"}, n={})
+    for n in filter(lambda n: len(n.child) and n.id != "World", cl):
+        labels["n"][n.id.partition("_")[2]] = n.id
     c.add(k.fe_in[2] / "UNIT", "relabel", k.fe_in[1] / "UNIT", labels=labels)
 
     ### Compute estimate of emissions
@@ -496,7 +501,7 @@ def method_BC_common(c: "Computer", k_fe_share: "Key") -> None:
     c.add(k.units, e_UNIT, "e::codelist")
     c.add(K.emi[2], "mul", k.emi0[1], k.units, K.bcast)
 
-    # Change labels: restore e.g. "AFR" given "R12_AFR"
+    # Restore labels: "R12_AFR" → "AFR" etc. "World" is not changed.
     labels = dict(n={v: k for k, v in labels["n"].items()})
     c.add(K.emi, "relabel", K.emi[2], labels=labels)
 
@@ -527,15 +532,20 @@ def method_C(c: "Computer") -> None:
     # Prepare `c` to compute the final energy share for aviation
     k = Keys(
         # Added by .transport.base.prepare_reporter()
-        base="in:nl-t-ya-c:transport+units+0",
+        base="in:nl-t-ya-c:transport+units",
         share0=f"fe share:c-nl-ya:{L}",
         share1=f"fe share:c-n-y:{L}",
     )
 
+    # Relabel "R12_GLB" (added by .report.transport.aggregate()) to "World"
+    labels = {"nl": {"R12_GLB": "World"}}
+    c.add(k.base[1], "relabel", k.base[0], labels=labels, sums=True)
+
     # Select the numerator
-    c.add(k.share0["num"], "select", k.base, indexers=dict(t=["AIR"]), drop=True)
-    # Compute the ratio
-    c.add(k.share0, "div", k.share0["num"], k.base / "t")
+    c.add(k.share0["num"], "select", k.base[1], indexers=dict(t=["AIR"]), drop=True)
+    # Ratio of AIR to the total
+    c.add(k.share0, "div", k.share0["num"], k.base[1] / "t")
+
     # Rename dimensions as expected by method_BC_common
     c.add(k.share1, "rename_dims", k.share0, name_dict={"nl": "n", "ya": "y"})
 
@@ -559,8 +569,18 @@ def process_df(
     # Prepare all other tasks
     c = get_computer(data.iloc[0, :], method, platform_name=platform_name)
 
-    # Input data: convert `data` to a Quantity with the appropriate structure
-    c.add(K.input, to_quantity, data, **IAMC_KW)
+    def fillna(df: pd.DataFrame) -> pd.DataFrame:
+        """Replace :py:`np.nan` with 0.0 in certain rows and columns."""
+        mask = df.Variable.str.fullmatch(
+            r"Emissions\|[^\|]+\|Energy\|Demand\|(Bunkers|Transportation).*"
+        )
+        to_fill = {c: 0.0 for c in df.columns if str(c).isnumeric() and int(c) >= 2020}
+        return df.where(~mask, df.fillna(to_fill))
+
+    # Input data: replace NaN with 0
+    c.add(K.input[0], fillna, data)
+    # Convert `data` to a Quantity with the appropriate structure
+    c.add(K.input, to_quantity, K.input[0], **IAMC_KW)
 
     # Compute and return the result
     return c.get("target")
diff --git a/message_ix_models/report/operator.py b/message_ix_models/report/operator.py
@@ -2,7 +2,7 @@
 
 import logging
 import re
-from collections.abc import Callable, Hashable, Mapping, Sequence
+from collections.abc import Callable, Hashable, Mapping, MutableMapping, Sequence
 from functools import reduce
 from itertools import filterfalse, product
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -48,6 +48,7 @@ def __lt__(self, __other: Any) -> bool: ...
     "merge_data",
     "model_periods",
     "nodes_ex_world",
+    "nodes_world_agg",
     "quantity_from_iamc",
     "remove_ts",
     "select_expand",
@@ -244,6 +245,42 @@ def model_periods(y: list[int], cat_year: pd.DataFrame) -> list[int]:
     return list(filter(lambda year: y0 <= year, y))
 
 
+def nodes_world_agg(
+    config: dict, *, dim: Hashable = "nl", name: Optional[str] = "{}_GLB"
+) -> Mapping[Hashable, Mapping[Hashable, list[str]]]:
+    """Mapping to aggregate e.g. nl="World" from values for child nodes of "World".
+
+    This mapping should be used with :func:`.genno.operator.aggregate`, giving the
+    argument ``keep=False``. It includes a 1:1 mapping from each region name to itself.
+    """
+    from message_ix_models.model.structure import get_codelist
+
+    cl = get_codelist(f"node/{config['regions']}")
+
+    # "World" node should be top-level (its parent is the `cl` itself) and have
+    # some children. Countries (from pycountry) that are omitted from a mapping have no
+    # children.
+    try:
+        node = next(filter(lambda n: n.parent is cl and len(n.child), cl))
+    except StopIteration:
+        raise RuntimeError("Failed to identify a 'World' node")
+
+    if name:
+        # FIXME Remove. This is a hack to suit the legacy reporting, which expects
+        #       global aggregates at *_GLB rather than "World".
+        name = name.format(config["regions"])
+        log.info(f"Aggregates for {node!r} will be labelled {name!r}")
+    else:
+        name = node.id
+
+    # Global total as aggregate of child nodes
+    result: MutableMapping = {name: list(map(str, node.child))}
+    # Also add "no-op" aggregates e.g. "R12_AFR" is the sum of ["R12_AFR"]
+    result.update({c: [c] for c in map(str, node.child)})
+
+    return {dim: result}
+
+
 def remove_ts(
     scenario: ixmp.Scenario,
     config: Optional[dict] = None,
diff --git a/message_ix_models/tests/project/ssp/test_transport.py b/message_ix_models/tests/project/ssp/test_transport.py
@@ -1,5 +1,7 @@
+from collections.abc import Callable, Hashable
 from typing import TYPE_CHECKING
 
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -98,24 +100,31 @@ def _to_long(df):
 
     # Diff data:
     # - Outer merge.
+    # - Fill NaNs resulting from insert_nans()
     # - Compute diff and select rows where diff is larger than a certain value
-    df = df_in.merge(df_out, how="outer", on=dims, suffixes=("_in", "_out")).query(
-        "abs(value_out - value_in) > 1e-16"
+    df = (
+        df_in.merge(df_out, how="outer", on=dims, suffixes=("_in", "_out"))
+        .fillna(0)
+        .query("abs(value_out - value_in) > 1e-16")
     )
 
     # Identify the directory from which IEA EWEB data is read
     iea_eweb_dir = web.dir_fallback(web.FILES[("IEA", "2024")][0])
     # True if the fuzzed test data are being used
     iea_eweb_test_data = iea_eweb_dir.match("message_ix_models/data/test/iea/web")
 
+    # All regions and "World" have modified values
+    N_reg = {METHOD.A: 13, METHOD.B: 9, METHOD.C: 13}[method]
+    assert N_reg <= len(df["Region"].unique())
+
     # Number of modified values
     N_exp = {
         (METHOD.A, False): 10280,
         (METHOD.A, True): 10280,
-        (METHOD.B, False): 4660,
-        (METHOD.B, True): 3060,
-        (METHOD.C, False): 3220,
-        (METHOD.C, True): 3220,
+        (METHOD.B, False): 5060,
+        (METHOD.B, True): 3460,
+        (METHOD.C, False): 3500,
+        (METHOD.C, True): 3500,
     }[(method, iea_eweb_test_data)]
 
     if N_exp != len(df):
@@ -155,6 +164,22 @@ def expected_variables(flag: int, method: METHOD) -> set[str]:
     return result
 
 
+def insert_nans(
+    df: pd.DataFrame, variable_expr: str, year_cond: Callable[[Hashable], bool]
+) -> pd.DataFrame:
+    """Replace zeros with :py:`np.nan` in `df`.
+
+    This occurs only where:
+
+    1. The 'Variable' column contains a string that matches `variable_expr`.
+    2. The `year_cond` returns :any:`True` for the column name.
+    """
+    return df.where(
+        ~df.Variable.str.fullmatch(variable_expr),
+        df.replace({c: {0: np.nan} for c in filter(year_cond, df.columns)}),
+    )
+
+
 @get_computer.minimum_version
 def test_cli(tmp_path, mix_models_cli, test_context, input_xlsx_path) -> None:
     """Code can be invoked from the command-line."""
@@ -218,7 +243,13 @@ def test_get_scenario_code(expected_id, model_name, scenario_name) -> None:
 @get_computer.minimum_version
 @pytest.mark.parametrize("method", METHOD_PARAM)
 def test_process_df(test_context, input_csv_path, method) -> None:
-    df_in = pd.read_csv(input_csv_path)
+    # - Read input data
+    # - Replace some 0 values with np.nan to replicate conditions in calling code.
+    df_in = pd.read_csv(input_csv_path).pipe(
+        insert_nans,
+        r"Emissions\|.*\|International Aviation",
+        lambda c: str(c).isnumeric() and int(c) >= 2020,
+    )
 
     # Code runs
     df_out = process_df(df_in, method=method)