|
| 1 | +import numpy as np |
| 2 | +import pandas as pd |
| 3 | +from beartype import beartype |
| 4 | +from beartype.typing import Sequence |
| 5 | +from scipy.stats import gmean |
| 6 | + |
| 7 | +from eis_toolkit.exceptions import InvalidColumnException, InvalidCompositionException, InvalidParameterValueException |
| 8 | +from eis_toolkit.utilities.checks.compositional import check_in_simplex_sample_space |
| 9 | +from eis_toolkit.utilities.checks.dataframe import check_columns_valid |
| 10 | +from eis_toolkit.utilities.checks.parameter import check_lists_overlap, check_numeric_value_sign |
| 11 | + |
| 12 | + |
@beartype
def _calculate_ilr_scaling_factor(c1: int, c2: int) -> np.float64:
    """
    Compute the normalizing coefficient applied to an ILR balance.

    The coefficient is sqrt(c1 * c2 / (c1 + c2)).

    Args:
        c1: Number of components in the first subcomposition.
        c2: Number of components in the second subcomposition.

    Returns:
        The scaling factor as a numpy float.

    Raises:
        InvalidParameterValueException: Either cardinality fails the sign check
            (i.e. is zero or negative).
    """
    # Reject the input as soon as one of the cardinalities fails the sign check.
    if not check_numeric_value_sign(c1) or not check_numeric_value_sign(c2):
        raise InvalidParameterValueException("Input values must both be positive integers.")

    product = c1 * c2
    # Casting the sum to float64 makes the division (and the result) a numpy float.
    total = np.float64(c1 + c2)
    return np.sqrt(product / total)
| 32 | + |
| 33 | + |
@beartype
def _geometric_mean_logratio(
    row: pd.Series, subcomposition_1: Sequence[str], subcomposition_2: Sequence[str]
) -> np.float64:
    """
    Compute the log of the ratio between the geometric means of two groups of row entries.

    Args:
        row: A single observation, indexed by column name.
        subcomposition_1: Labels of the entries whose geometric mean forms the numerator.
        subcomposition_2: Labels of the entries whose geometric mean forms the denominator.

    Returns:
        The natural logarithm of gmean(numerator entries) / gmean(denominator entries).
    """
    ratio = gmean(row[subcomposition_1]) / gmean(row[subcomposition_2])
    return np.log(ratio)
| 42 | + |
| 43 | + |
@beartype
def _single_ilr_transform(
    df: pd.DataFrame, subcomposition_1: Sequence[str], subcomposition_2: Sequence[str]
) -> pd.Series:
    """
    Compute one ILR balance per row of the dataframe.

    Each balance is the log of the ratio between the row-wise geometric means of
    the two subcompositions, multiplied by the ILR scaling factor.

    Args:
        df: Dataframe holding the compositional data.
        subcomposition_1: Names of the columns in the numerator part of the ratio.
        subcomposition_2: Names of the columns in the denominator part of the ratio.

    Returns:
        A float series with one balance per row, carrying the same index as `df`.

    Raises:
        InvalidParameterValueException: Propagated from `_calculate_ilr_scaling_factor`.
    """
    # Materialise as lists so that any sequence type (e.g. a tuple) selects
    # several columns instead of being interpreted as a single column key.
    numerator_columns = list(subcomposition_1)
    denominator_columns = list(subcomposition_2)

    scaling_factor = _calculate_ilr_scaling_factor(len(numerator_columns), len(denominator_columns))

    # Row-wise geometric means, computed for all rows at once.
    numerator = gmean(df[numerator_columns].to_numpy(dtype=np.float64), axis=1)
    denominator = gmean(df[denominator_columns].to_numpy(dtype=np.float64), axis=1)

    # Reuse the input index: filling a default-indexed series by the labels of
    # `df` would enlarge it (or overwrite entries) whenever `df` does not have
    # a plain 0..N-1 index.
    return pd.Series(scaling_factor * np.log(numerator / denominator), index=df.index)
| 63 | + |
| 64 | + |
@beartype
def single_ilr_transform(
    df: pd.DataFrame, subcomposition_1: Sequence[str], subcomposition_2: Sequence[str]
) -> pd.Series:
    """
    Compute a single isometric logratio (ILR) balance between two subcompositions.

    The input is validated first; the balance for each row is then calculated
    with the first subcomposition as the numerator and the second as the
    denominator of the ratio.

    Args:
        df: A dataframe of shape [N, D] of compositional data.
        subcomposition_1: Names of the columns in the numerator part of the ratio.
        subcomposition_2: Names of the columns in the denominator part of the ratio.

    Returns:
        A series of length N containing the transforms.

    Raises:
        InvalidColumnException: One or more subcomposition columns are not found in the input dataframe.
        InvalidCompositionException: Data is not normalized to the expected value or
            one or more columns are found in both subcompositions.
        InvalidParameterValueException: At least one subcomposition provided was empty.
        NumericValueSignException: Data contains zeros or negative values.
    """
    # The data itself is checked before any of the subcomposition arguments.
    check_in_simplex_sample_space(df)

    if not subcomposition_1 or not subcomposition_2:
        raise InvalidParameterValueException("A subcomposition should contain at least one column.")

    # Validate the numerator columns first, then the denominator columns.
    for column_names in (subcomposition_1, subcomposition_2):
        if not check_columns_valid(df, column_names):
            raise InvalidColumnException("Not all of the input columns were found in the input dataframe.")

    if check_lists_overlap(subcomposition_1, subcomposition_2):
        raise InvalidCompositionException("The subcompositions overlap.")

    return _single_ilr_transform(df, subcomposition_1, subcomposition_2)
0 commit comments