fixed a problem with multi index caused by the default value of groupkey #1917

Merged
merged 11 commits on May 13, 2025
8 changes: 8 additions & 0 deletions README.md
@@ -462,6 +462,14 @@ python run_all_model.py run 10

It also provides the API to run specific models at once. For more use cases, please refer to the file's [docstrings](examples/run_all_model.py).

### Breaking Change
In `pandas`, `group_keys` is one of the parameters of the `groupby` method. Between `pandas` 1.5 and 2.0 its default value changed from `no_default` to `True`, which causes qlib to raise errors at runtime. We therefore pass `group_keys=False` explicitly, but this does not guarantee that every program runs correctly; the following may still be affected (a minimal illustration of the behavior change is shown after this list):
* qlib/examples/rl_order_execution/scripts/gen_training_orders.py
* qlib/examples/benchmarks/TRA/src/dataset.MTSDatasetH.py
* qlib/examples/benchmarks/TFT/tft.py
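
As a minimal sketch of the underlying behavior change (the DataFrame below is hypothetical, not taken from qlib), `groupby(...).apply(...)` with the pandas >= 2.0 default `group_keys=True` prepends the group label as an extra index level, while `group_keys=False` keeps the original index shape that the code paths listed above rely on:

```python
import pandas as pd

df = pd.DataFrame({"instrument": ["A", "A", "B", "B"], "LABEL0": [1.0, 2.0, 3.0, 4.0]})

# pandas >= 2.0: group_keys defaults to True, so apply() prepends the group
# label as an extra index level and the result no longer aligns with df.
with_keys = df.groupby("instrument", group_keys=True)[["LABEL0"]].apply(lambda g: g.shift(1))
print(with_keys.index.nlevels)  # 2 -> MultiIndex of (instrument, original row)

# group_keys=False keeps the original index, which is what the callers expect.
without_keys = df.groupby("instrument", group_keys=False)[["LABEL0"]].apply(lambda g: g.shift(1))
print(without_keys.index.nlevels)  # 1 -> original RangeIndex
```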



## [Adapting to Market Dynamics](examples/benchmarks_dynamic)

Due to the non-stationary nature of the financial market environment, the data distribution may change across periods, which makes the performance of models built on the training data decay on future test data.
4 changes: 2 additions & 2 deletions examples/benchmarks/TFT/libs/tft_model.py
@@ -599,7 +599,7 @@ def _batch_sampled_data(self, data, max_samples):
print("Getting valid sampling locations.")
valid_sampling_locations = []
split_data_map = {}
for identifier, df in data.groupby(id_col):
for identifier, df in data.groupby(id_col, group_keys=False):
print("Getting locations for {}".format(identifier))
num_entries = len(df)
if num_entries >= self.time_steps:
@@ -678,7 +678,7 @@ def _batch_single_entity(input_data):
input_cols = [tup[0] for tup in self.column_definition if tup[2] not in {InputTypes.ID, InputTypes.TIME}]

data_map = {}
for _, sliced in data.groupby(id_col):
for _, sliced in data.groupby(id_col, group_keys=False):
col_mappings = {"identifier": [id_col], "time": [time_col], "outputs": [target_col], "inputs": input_cols}

for k in col_mappings:
6 changes: 4 additions & 2 deletions examples/benchmarks/TFT/tft.py
@@ -78,13 +78,15 @@


def get_shifted_label(data_df, shifts=5, col_shift="LABEL0"):
return data_df[[col_shift]].groupby("instrument").apply(lambda df: df.shift(shifts))
return data_df[[col_shift]].groupby("instrument", group_keys=False).apply(lambda df: df.shift(shifts))


def fill_test_na(test_df):
test_df_res = test_df.copy()
feature_cols = ~test_df_res.columns.str.contains("label", case=False)
test_feature_fna = test_df_res.loc[:, feature_cols].groupby("datetime").apply(lambda df: df.fillna(df.mean()))
test_feature_fna = (
test_df_res.loc[:, feature_cols].groupby("datetime", group_keys=False).apply(lambda df: df.fillna(df.mean()))
)
test_df_res.loc[:, feature_cols] = test_feature_fna
return test_df_res

2 changes: 1 addition & 1 deletion examples/benchmarks/TRA/src/dataset.py
@@ -29,7 +29,7 @@ def _create_ts_slices(index, seq_len):
assert index.is_lexsorted(), "index should be sorted"

# number of dates for each code
sample_count_by_codes = pd.Series(0, index=index).groupby(level=0).size().values
sample_count_by_codes = pd.Series(0, index=index).groupby(level=0, group_keys=False).size().values

# start_index for each code
start_index_of_codes = np.roll(np.cumsum(sample_count_by_codes), 1)
6 changes: 3 additions & 3 deletions examples/highfreq/highfreq_ops.py
@@ -25,7 +25,7 @@ class DayLast(ElemOperator):
def _load_internal(self, instrument, start_index, end_index, freq):
_calendar = get_calendar_day(freq=freq)
series = self.feature.load(instrument, start_index, end_index, freq)
return series.groupby(_calendar[series.index]).transform("last")
return series.groupby(_calendar[series.index], group_keys=False).transform("last")


class FFillNan(ElemOperator):
@@ -44,7 +44,7 @@ class FFillNan(ElemOperator):

def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
return series.fillna(method="ffill")
return series.ffill()


class BFillNan(ElemOperator):
@@ -63,7 +63,7 @@ class BFillNan(ElemOperator):

def _load_internal(self, instrument, start_index, end_index, freq):
series = self.feature.load(instrument, start_index, end_index, freq)
return series.fillna(method="bfill")
return series.bfill()


class Date(ElemOperator):
4 changes: 2 additions & 2 deletions examples/rl_order_execution/scripts/gen_training_orders.py
@@ -19,9 +19,9 @@ def generate_order(stock: str, start_idx: int, end_idx: int) -> bool:

df["date"] = df["datetime"].dt.date.astype("datetime64")
df = df.set_index(["instrument", "datetime", "date"])
df = df.groupby("date").take(range(start_idx, end_idx)).droplevel(level=0)
df = df.groupby("date", group_keys=False).take(range(start_idx, end_idx)).droplevel(level=0)

order_all = pd.DataFrame(df.groupby(level=(2, 0)).mean().dropna())
order_all = pd.DataFrame(df.groupby(level=(2, 0), group_keys=False).mean().dropna())
order_all["amount"] = np.random.lognormal(-3.28, 1.14) * order_all["$volume0"]
order_all = order_all[order_all["amount"] > 0.0]
order_all["order_type"] = 0
5 changes: 4 additions & 1 deletion pyproject.toml
@@ -26,7 +26,7 @@ readme = {file = "README.md", content-type = "text/markdown"}
dependencies = [
"pyyaml",
"numpy",
"pandas",
"pandas>=0.24",
"mlflow",
"filelock>=3.16.0",
"redis",
@@ -67,10 +67,13 @@ lint = [
"flake8",
"nbqa",
]
# snowballstemmer, a dependency of sphinx, released version 3.0.0 on 2025-05-08,
# which breaks the docs build, so we pin the version below 3.0 for now.
docs = [
"sphinx",
"sphinx_rtd_theme",
"readthedocs_sphinx_ext",
"snowballstemmer<3.0",
]
package = [
"twine",
4 changes: 2 additions & 2 deletions qlib/backtest/high_performance_ds.py
@@ -104,7 +104,7 @@ class PandasQuote(BaseQuote):
def __init__(self, quote_df: pd.DataFrame, freq: str) -> None:
super().__init__(quote_df=quote_df, freq=freq)
quote_dict = {}
for stock_id, stock_val in quote_df.groupby(level="instrument"):
for stock_id, stock_val in quote_df.groupby(level="instrument", group_keys=False):
quote_dict[stock_id] = stock_val.droplevel(level="instrument")
self.data = quote_dict

@@ -137,7 +137,7 @@ def __init__(self, quote_df: pd.DataFrame, freq: str, region: str = "cn") -> Non
"""
super().__init__(quote_df=quote_df, freq=freq)
quote_dict = {}
for stock_id, stock_val in quote_df.groupby(level="instrument"):
for stock_id, stock_val in quote_df.groupby(level="instrument", group_keys=False):
quote_dict[stock_id] = idd.MultiData(stock_val.droplevel(level="instrument"))
quote_dict[stock_id].sort_index() # To support more flexible slicing, we must sort data first
self.data = quote_dict
2 changes: 1 addition & 1 deletion qlib/backtest/position.py
@@ -311,7 +311,7 @@ def fill_stock_value(self, start_time: Union[str, pd.Timestamp], freq: str, last
freq=freq,
disk_cache=True,
).dropna()
price_dict = price_df.groupby(["instrument"]).tail(1).reset_index(level=1, drop=True)["$close"].to_dict()
price_dict = price_df.groupby(["instrument"], group_keys=False).tail(1)["$close"].to_dict()

if len(price_dict) < len(stock_list):
lack_stock = set(stock_list) - set(price_dict)
6 changes: 5 additions & 1 deletion qlib/backtest/report.py
@@ -114,7 +114,11 @@ def _cal_benchmark(benchmark_config: Optional[dict], freq: str) -> Optional[pd.S
_temp_result, _ = get_higher_eq_freq_feature(_codes, fields, start_time, end_time, freq=freq)
if len(_temp_result) == 0:
raise ValueError(f"The benchmark {_codes} does not exist. Please provide the right benchmark")
return _temp_result.groupby(level="datetime")[_temp_result.columns.tolist()[0]].mean().fillna(0)
return (
_temp_result.groupby(level="datetime", group_keys=False)[_temp_result.columns.tolist()[0]]
.mean()
.fillna(0)
)

def _sample_benchmark(
self,
2 changes: 1 addition & 1 deletion qlib/contrib/data/dataset.py
@@ -32,7 +32,7 @@ def _create_ts_slices(index, seq_len):
assert index.is_monotonic_increasing, "index should be sorted"

# number of dates for each instrument
sample_count_by_insts = index.to_series().groupby(level=0).size().values
sample_count_by_insts = index.to_series().groupby(level=0, group_keys=False).size().values

# start index for each instrument
start_index_of_insts = np.roll(np.cumsum(sample_count_by_insts), 1)
34 changes: 24 additions & 10 deletions qlib/contrib/data/processor.py
@@ -55,14 +55,18 @@ def _feature_norm(x):

# Label
cols = df_focus.columns[df_focus.columns.str.contains("^LABEL")]
df_focus[cols] = df_focus[cols].groupby(level="datetime").apply(_label_norm)
df_focus[cols] = df_focus[cols].groupby(level="datetime", group_keys=False).apply(_label_norm)

# Features
cols = df_focus.columns[df_focus.columns.str.contains("^KLEN|^KLOW|^KUP")]
df_focus[cols] = df_focus[cols].apply(lambda x: x**0.25).groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = (
df_focus[cols].apply(lambda x: x**0.25).groupby(level="datetime", group_keys=False).apply(_feature_norm)
)

cols = df_focus.columns[df_focus.columns.str.contains("^KLOW2|^KUP2")]
df_focus[cols] = df_focus[cols].apply(lambda x: x**0.5).groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = (
df_focus[cols].apply(lambda x: x**0.5).groupby(level="datetime", group_keys=False).apply(_feature_norm)
)

_cols = [
"KMID",
@@ -88,25 +92,35 @@ def _feature_norm(x):
]
pat = "|".join(["^" + x for x in _cols])
cols = df_focus.columns[df_focus.columns.str.contains(pat) & (~df_focus.columns.isin(["HIGH0", "LOW0"]))]
df_focus[cols] = df_focus[cols].groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = df_focus[cols].groupby(level="datetime", group_keys=False).apply(_feature_norm)

cols = df_focus.columns[df_focus.columns.str.contains("^STD|^VOLUME|^VMA|^VSTD")]
df_focus[cols] = df_focus[cols].apply(np.log).groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = df_focus[cols].apply(np.log).groupby(level="datetime", group_keys=False).apply(_feature_norm)

cols = df_focus.columns[df_focus.columns.str.contains("^RSQR")]
df_focus[cols] = df_focus[cols].fillna(0).groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = df_focus[cols].fillna(0).groupby(level="datetime", group_keys=False).apply(_feature_norm)

cols = df_focus.columns[df_focus.columns.str.contains("^MAX|^HIGH0")]
df_focus[cols] = df_focus[cols].apply(lambda x: (x - 1) ** 0.5).groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = (
df_focus[cols]
.apply(lambda x: (x - 1) ** 0.5)
.groupby(level="datetime", group_keys=False)
.apply(_feature_norm)
)

cols = df_focus.columns[df_focus.columns.str.contains("^MIN|^LOW0")]
df_focus[cols] = df_focus[cols].apply(lambda x: (1 - x) ** 0.5).groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = (
df_focus[cols]
.apply(lambda x: (1 - x) ** 0.5)
.groupby(level="datetime", group_keys=False)
.apply(_feature_norm)
)

cols = df_focus.columns[df_focus.columns.str.contains("^CORR|^CORD")]
df_focus[cols] = df_focus[cols].apply(np.exp).groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = df_focus[cols].apply(np.exp).groupby(level="datetime", group_keys=False).apply(_feature_norm)

cols = df_focus.columns[df_focus.columns.str.contains("^WVMA")]
df_focus[cols] = df_focus[cols].apply(np.log1p).groupby(level="datetime").apply(_feature_norm)
df_focus[cols] = df_focus[cols].apply(np.log1p).groupby(level="datetime", group_keys=False).apply(_feature_norm)

df[selected_cols] = df_focus.values

22 changes: 12 additions & 10 deletions qlib/contrib/eva/alpha.py
@@ -39,31 +39,33 @@ def calc_long_short_prec(
long precision and short precision in time level
"""
if is_alpha:
label = label - label.mean(level=date_col)
label = label - label.groupby(level=date_col, group_keys=False).mean()
if int(1 / quantile) >= len(label.index.get_level_values(1).unique()):
raise ValueError("Need more instruments to calculate precision")

df = pd.DataFrame({"pred": pred, "label": label})
if dropna:
df.dropna(inplace=True)

group = df.groupby(level=date_col)
group = df.groupby(level=date_col, group_keys=False)

def N(x):
return int(len(x) * quantile)

# find the top/low quantile of prediction and treat them as long and short target
long = group.apply(lambda x: x.nlargest(N(x), columns="pred").label).reset_index(level=0, drop=True)
short = group.apply(lambda x: x.nsmallest(N(x), columns="pred").label).reset_index(level=0, drop=True)
long = group.apply(lambda x: x.nlargest(N(x), columns="pred").label)
short = group.apply(lambda x: x.nsmallest(N(x), columns="pred").label)

groupll = long.groupby(date_col)
groupll = long.groupby(date_col, group_keys=False)
l_dom = groupll.apply(lambda x: x > 0)
l_c = groupll.count()

groups = short.groupby(date_col)
groups = short.groupby(date_col, group_keys=False)
s_dom = groups.apply(lambda x: x < 0)
s_c = groups.count()
return (l_dom.groupby(date_col).sum() / l_c), (s_dom.groupby(date_col).sum() / s_c)
return (l_dom.groupby(date_col, group_keys=False).sum() / l_c), (
s_dom.groupby(date_col, group_keys=False).sum() / s_c
)


def calc_long_short_return(
@@ -100,7 +102,7 @@ def calc_long_short_return(
df = pd.DataFrame({"pred": pred, "label": label})
if dropna:
df.dropna(inplace=True)
group = df.groupby(level=date_col)
group = df.groupby(level=date_col, group_keys=False)

def N(x):
return int(len(x) * quantile)
@@ -173,8 +175,8 @@ def calc_ic(pred: pd.Series, label: pd.Series, date_col="datetime", dropna=False
ic and rank ic
"""
df = pd.DataFrame({"pred": pred, "label": label})
ic = df.groupby(date_col).apply(lambda df: df["pred"].corr(df["label"]))
ric = df.groupby(date_col).apply(lambda df: df["pred"].corr(df["label"], method="spearman"))
ic = df.groupby(date_col, group_keys=False).apply(lambda df: df["pred"].corr(df["label"]))
ric = df.groupby(date_col, group_keys=False).apply(lambda df: df["pred"].corr(df["label"], method="spearman"))
if dropna:
return ic.dropna(), ric.dropna()
else:
4 changes: 2 additions & 2 deletions qlib/contrib/meta/data_selection/dataset.py
@@ -106,7 +106,7 @@ def setup(self, trainer=TrainerR, trainer_kwargs={}):

def _calc_perf(self, pred, label):
df = pd.DataFrame({"pred": pred, "label": label})
df = df.groupby("datetime").corr(method="spearman")
df = df.groupby("datetime", group_keys=False).corr(method="spearman")
corr = df.loc(axis=0)[:, "pred"]["label"].droplevel(axis=0, level=-1)
return corr

@@ -161,7 +161,7 @@ def __init__(self, task: dict, meta_info: pd.DataFrame, mode: str = MetaTask.PRO
raise ValueError(f"Most of samples are dropped. Please check this task: {task}")

assert (
d_test.groupby("datetime").size().shape[0] >= 5
d_test.groupby("datetime", group_keys=False).size().shape[0] >= 5
), "In this segment, this trading dates is less than 5, you'd better check the data."

sample_time_belong = np.zeros((d_train.shape[0], time_perf.shape[1]))
6 changes: 5 additions & 1 deletion qlib/contrib/meta/data_selection/model.py
@@ -125,7 +125,11 @@ def run_epoch(self, phase, task_list, epoch, opt, loss_l, ignore_weight=False):
loss_l.setdefault(phase, []).append(running_loss)

pred_y_all = pd.concat(pred_y_all)
ic = pred_y_all.groupby("datetime").apply(lambda df: df["pred"].corr(df["label"], method="spearman")).mean()
ic = (
pred_y_all.groupby("datetime", group_keys=False)
.apply(lambda df: df["pred"].corr(df["label"], method="spearman"))
.mean()
)

R.log_metrics(**{f"loss/{phase}": running_loss, "step": epoch})
R.log_metrics(**{f"ic/{phase}": ic, "step": epoch})
2 changes: 1 addition & 1 deletion qlib/contrib/model/double_ensemble.py
@@ -166,7 +166,7 @@ def sample_reweight(self, loss_curve, loss_values, k_th):

# calculate weights
h["bins"] = pd.cut(h["h_value"], self.bins_sr)
h_avg = h.groupby("bins")["h_value"].mean()
h_avg = h.groupby("bins", group_keys=False, observed=False)["h_value"].mean()
weights = pd.Series(np.zeros(N, dtype=float))
for b in h_avg.index:
weights[h["bins"] == b] = 1.0 / (self.decay**k_th * h_avg[b] + 0.1)
10 changes: 8 additions & 2 deletions qlib/contrib/model/highfreq_gdbt_model.py
@@ -90,8 +90,14 @@ def _prepare_data(self, dataset: DatasetH):
if y_train.values.ndim == 2 and y_train.values.shape[1] == 1:
l_name = df_train["label"].columns[0]
# Convert label into alpha
df_train["label"][l_name] = df_train["label"][l_name] - df_train["label"][l_name].mean(level=0)
df_valid["label"][l_name] = df_valid["label"][l_name] - df_valid["label"][l_name].mean(level=0)
df_train.loc[:, ("label", l_name)] = (
df_train.loc[:, ("label", l_name)]
- df_train.loc[:, ("label", l_name)].groupby(level=0, group_keys=False).mean()
)
df_valid.loc[:, ("label", l_name)] = (
df_valid.loc[:, ("label", l_name)]
- df_valid.loc[:, ("label", l_name)].groupby(level=0, group_keys=False).mean()
)

def mapping_fn(x):
return 0 if x < 0 else 1
6 changes: 4 additions & 2 deletions qlib/contrib/model/pytorch_adarnn.py
@@ -214,8 +214,10 @@ def train_AdaRNN(self, train_loader_list, epoch, dist_old=None, weight_mat=None)
def calc_all_metrics(pred):
"""pred is a pandas dataframe that has two attributes: score (pred) and label (real)"""
res = {}
ic = pred.groupby(level="datetime").apply(lambda x: x.label.corr(x.score))
rank_ic = pred.groupby(level="datetime").apply(lambda x: x.label.corr(x.score, method="spearman"))
ic = pred.groupby(level="datetime", group_keys=False).apply(lambda x: x.label.corr(x.score))
rank_ic = pred.groupby(level="datetime", group_keys=False).apply(
lambda x: x.label.corr(x.score, method="spearman")
)
res["ic"] = ic.mean()
res["icir"] = ic.mean() / ic.std()
res["ric"] = rank_ic.mean()
6 changes: 3 additions & 3 deletions qlib/contrib/model/pytorch_add.py
@@ -226,7 +226,7 @@ def loss_rec(self, x, rec_x, record=None):

def get_daily_inter(self, df, shuffle=False):
# organize the train data into daily batches
daily_count = df.groupby(level=0).size().values
daily_count = df.groupby(level=0, group_keys=False).size().values
daily_index = np.roll(np.cumsum(daily_count), 1)
daily_index[0] = 0
if shuffle:
@@ -349,15 +349,15 @@ def bootstrap_fit(self, x_train, y_train, m_train, x_valid, y_valid, m_valid):
return best_score

def gen_market_label(self, df, raw_label):
market_label = raw_label.groupby("datetime").mean().squeeze()
market_label = raw_label.groupby("datetime", group_keys=False).mean().squeeze()
bins = [-np.inf, self.lo, self.hi, np.inf]
market_label = pd.cut(market_label, bins, labels=False)
market_label.name = ("market_return", "market_return")
df = df.join(market_label)
return df

def fit_thresh(self, train_label):
market_label = train_label.groupby("datetime").mean().squeeze()
market_label = train_label.groupby("datetime", group_keys=False).mean().squeeze()
self.lo, self.hi = market_label.quantile([1 / 3, 2 / 3])

def fit(
Expand Down
Loading
Loading