Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
da5ff67
fix: Pandas 3 compatibility - robust dtype checks and test fixes
ankitlade12 Jan 28, 2026
e0c3292
fix: Remove whitespace before colon in slice notation (flake8 E203)
ankitlade12 Jan 28, 2026
ccbfa05
feat: finalize Pandas 3 compatibility fixes and test updates
ankitlade12 Jan 28, 2026
fd43124
style: fix flake8 line length and linting issues
ankitlade12 Jan 28, 2026
8367d4a
style: fix remaining flake8 C416 issue
ankitlade12 Jan 28, 2026
3225500
Fix Pandas 3 regressions in check_y, _check_contains_inf, and StringS…
ankitlade12 Jan 28, 2026
bde0b9b
Fix E501 line too long in dataframe_checks.py
ankitlade12 Jan 28, 2026
dedf500
Fix StringSimilarityEncoder NaN issues and fragile test assertions
ankitlade12 Jan 28, 2026
765e102
fix: Pandas 3 stability - mock datasets and fix FutureWarnings
ankitlade12 Jan 28, 2026
28894c5
style: fix flake8 linting errors E501, E302, E305, SIM102
ankitlade12 Jan 28, 2026
08821a6
test: improve patch coverage for Pandas 3 stability fixes
ankitlade12 Jan 28, 2026
972a4b7
style: fix E501 line too long in similarity encoder tests
ankitlade12 Jan 28, 2026
d141332
style: revert unrelated flake8 and formatting changes
ankitlade12 Feb 3, 2026
05ca43c
fix: restore Pandas 3 test logic and silence Pandas4Warning
ankitlade12 Feb 3, 2026
36c2232
style: move numpy import to top of math_features.py
ankitlade12 Feb 3, 2026
0fd811a
style: fix spacing in MatchVariables verbose error message
ankitlade12 Feb 3, 2026
d98c8d7
test: revert dynamic std values to hardcoded values in MathFeatures t…
ankitlade12 Feb 3, 2026
b02ec5e
style: combine imports in _variable_type_checks.py
ankitlade12 Feb 3, 2026
66ff38b
refactor: centralize is_object function and use it across the codebase
ankitlade12 Feb 3, 2026
b45c51f
refactor: further simplify check_y dtype checks using is_object
ankitlade12 Feb 3, 2026
1ba6b44
revert: remove unnecessary complexity in _check_contains_inf and asso…
ankitlade12 Feb 3, 2026
50eec35
docs: rename _normalize_func to _map_unnamed_func_to_str and add comm…
ankitlade12 Feb 3, 2026
94d2771
perf: optimize casting logic in SimilarityEncoder
ankitlade12 Feb 3, 2026
0cb335b
fix: address remaining code review feedback - follow sklearn conventi…
ankitlade12 Feb 4, 2026
c25d5bd
style: fix linting and follow sklearn convention for MathFeatures
ankitlade12 Feb 4, 2026
8b50186
revert: remove california housing mock from conftest.py
ankitlade12 Feb 4, 2026
4f9b146
revert: restore original error message assertion in DatetimeFeatures …
ankitlade12 Feb 4, 2026
09a1ece
fix: use robust datetime normalization and flexible error assertions …
ankitlade12 Feb 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 59 additions & 22 deletions feature_engine/creation/math_features.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Any, List, Optional, Union

import numpy as np
import pandas as pd

from feature_engine._docstrings.fit_attributes import (
Expand Down Expand Up @@ -140,7 +141,6 @@ def __init__(
missing_values: str = "raise",
drop_original: bool = False,
) -> None:

if (
not isinstance(variables, list)
or not all(isinstance(var, (int, str)) for var in variables)
Expand All @@ -157,16 +157,15 @@ def __init__(
"func does not work with dictionaries in this transformer."
)

if new_variables_names is not None:
if (
not isinstance(new_variables_names, list)
or not all(isinstance(var, str) for var in new_variables_names)
or len(set(new_variables_names)) != len(new_variables_names)
):
raise ValueError(
"new_variable_names should be None or a list of unique strings. "
f"Got {new_variables_names} instead."
)
if new_variables_names is not None and (
not isinstance(new_variables_names, list)
or not all(isinstance(var, str) for var in new_variables_names)
or len(set(new_variables_names)) != len(new_variables_names)
):
raise ValueError(
"new_variable_names should be None or a list of unique strings. "
f"Got {new_variables_names} instead."
)

if new_variables_names is not None:
if isinstance(func, list):
Expand All @@ -175,19 +174,57 @@ def __init__(
"The number of new feature names must coincide with the number "
"of functions."
)
else:
if len(new_variables_names) != 1:
raise ValueError(
"The number of new feature names must coincide with the number "
"of functions."
)
elif len(new_variables_names) != 1:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Neat! Thank you!

raise ValueError(
"The number of new feature names must coincide with the number "
"of functions."
)

super().__init__(missing_values, drop_original)

self.variables = variables
self.func = func
self.new_variables_names = new_variables_names

def _map_unnamed_func_to_str(self, func: Any) -> Any:
if isinstance(func, list):
return [self._map_unnamed_func_to_str(f) for f in func]

# We map certain numpy functions to their string alias.
# This serves two purposes:
# 1) It avoids a FutureWarning in pandas 2.1+ which recommends
# using the string alias for better performance and future-proofing.
# 2) It ensures consistent column naming (e.g. "sum_x1_x2")
# regardless of how the function was passed (np.sum vs "sum").
map_dict = {
np.sum: "sum",
np.mean: "mean",
np.std: "std",
np.min: "min",
np.max: "max",
np.median: "median",
np.prod: "prod",
}
return map_dict.get(func, func)

def fit(self, X: pd.DataFrame, y=None):
    """
    This method does not learn any parameters. It just stores the normalized
    function representation.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y: pandas Series, or np.array. Defaults to None.
        It is not needed in this transformer. You can pass y or None.
    """
    # Base-class fit performs the shared input validation / bookkeeping.
    super().fit(X, y)
    # Normalize func to func_ (sklearn convention: don't modify init params)
    # so transform() can pass string aliases to pandas agg, avoiding the
    # pandas 2.1+ FutureWarning about numpy callables.
    self.func_ = self._map_unnamed_func_to_str(self.func)
    return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Create and add new variables.
Expand All @@ -207,9 +244,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
new_variable_names = self._get_new_features_name()

if len(new_variable_names) == 1:
X[new_variable_names[0]] = X[self.variables].agg(self.func, axis=1)
X[new_variable_names[0]] = X[self.variables].agg(self.func_, axis=1)
else:
X[new_variable_names] = X[self.variables].agg(self.func, axis=1)
X[new_variable_names] = X[self.variables].agg(self.func_, axis=1)

if self.drop_original:
X.drop(columns=self.variables, inplace=True)
Expand All @@ -226,14 +263,14 @@ def _get_new_features_name(self) -> List:
else:
varlist = [f"{var}" for var in self.variables_]

if isinstance(self.func, list):
if isinstance(self.func_, list):
functions = [
fun if type(fun) is str else fun.__name__ for fun in self.func
fun if type(fun) is str else fun.__name__ for fun in self.func_
]
feature_names = [
f"{function}_{'_'.join(varlist)}" for function in functions
]
else:
feature_names = [f"{self.func}_{'_'.join(varlist)}"]
feature_names = [f"{self.func_}_{'_'.join(varlist)}"]

return feature_names
8 changes: 5 additions & 3 deletions feature_engine/dataframe_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from scipy.sparse import issparse
from sklearn.utils.validation import _check_y, check_consistent_length, column_or_1d

from feature_engine.variable_handling._variable_type_checks import is_object


def check_X(X: Union[np.generic, np.ndarray, pd.DataFrame]) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -121,10 +123,10 @@ def check_y(
elif isinstance(y, pd.Series):
if y.isnull().any():
raise ValueError("y contains NaN values.")
if y.dtype != "O" and not np.isfinite(y).all():
if not is_object(y) and not np.isfinite(y).all():
raise ValueError("y contains infinity values.")
if y_numeric and y.dtype == "O":
y = y.astype("float")
if y_numeric and is_object(y):
y = y.astype("float64")
y = y.copy()

elif isinstance(y, pd.DataFrame):
Expand Down
47 changes: 33 additions & 14 deletions feature_engine/encoding/similarity_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,12 +232,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
X = check_X(X)
variables_ = self._check_or_select_variables(X)

if self.keywords:
if not all(item in variables_ for item in self.keywords.keys()):
raise ValueError(
"There are variables in keywords that are not present "
"in the dataset."
)
if self.keywords and not all(
item in variables_ for item in self.keywords.keys()
):
raise ValueError(
"There are variables in keywords that are not present "
"in the dataset."
)

# if data contains nan, fail before running any logic
if self.missing_values == "raise":
Expand All @@ -262,10 +263,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
)
elif self.missing_values == "impute":
for var in cols_to_iterate:
series = X[var]
self.encoder_dict_[var] = (
X[var]
.astype(str)
.replace("nan", "")
series.astype(str)
.mask(series.isna(), "")
.value_counts()
.head(self.top_categories)
.index.tolist()
Expand All @@ -276,7 +277,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
X[var]
.astype(str)
.value_counts(dropna=True)
.drop("nan", errors="ignore")
.drop(["nan", "<NA>"], errors="ignore")
.head(self.top_categories)
.index.tolist()
)
Expand Down Expand Up @@ -316,13 +317,31 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
new_values = []
for var in self.variables_:
if self.missing_values == "impute":
X[var] = X[var].astype(str).replace("nan", "")
categories = X[var].dropna().astype(str).unique()
series = X[var]
series = series.astype(str).mask(series.isna(), "")
else:
series = X[var].astype(str)

categories = series.unique()
column_encoder_dict = {
x: _gpm_fast_vec(x, self.encoder_dict_[var]) for x in categories
}
column_encoder_dict["nan"] = [np.nan] * len(self.encoder_dict_[var])
encoded = np.vstack(X[var].astype(str).map(column_encoder_dict).values)
# Ensure map result is always an array of the correct size.
# Missing values in categories or unknown categories will map to NaN.
default_nan = [np.nan] * len(self.encoder_dict_[var])
if "nan" not in column_encoder_dict:
column_encoder_dict["nan"] = default_nan
if "<NA>" not in column_encoder_dict:
column_encoder_dict["<NA>"] = default_nan

encoded_series = series.map(column_encoder_dict)

# Robust stacking: replace any float NaNs (from unknown values) with arrays
encoded_list = [
v if isinstance(v, (list, np.ndarray)) else default_nan
for v in encoded_series
]
encoded = np.vstack(encoded_list)
if self.missing_values == "ignore":
encoded[X[var].isna(), :] = np.nan
new_values.append(encoded)
Expand Down
9 changes: 7 additions & 2 deletions feature_engine/preprocessing/match_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def __init__(

if not isinstance(verbose, bool):
raise ValueError(
"verbose takes only booleans True and False." f"Got '{verbose} instead."
f"verbose takes only booleans True and False. Got '{verbose} instead."
)

# note: np.nan is an instance of float!!!
Expand Down Expand Up @@ -262,7 +262,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:

X = X.drop(_columns_to_drop, axis=1)

X = X.reindex(columns=self.feature_names_in_, fill_value=self.fill_value)
# Add missing columns one at a time to avoid Pandas 3 StringDtype reindex issue
for col in _columns_to_add:
X[col] = self.fill_value

# Reorder columns to match training set, without fill_value to avoid issues
X = X[self.feature_names_in_]

if self.match_dtypes:
_current_dtypes = X.dtypes.to_dict()
Expand Down
4 changes: 2 additions & 2 deletions feature_engine/timeseries/forecasting/lag_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = X[self.variables_].shift(
Expand All @@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = X[self.variables_].shift(
Expand Down
2 changes: 1 addition & 1 deletion feature_engine/timeseries/forecasting/window_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
.shift(periods=self.periods, freq=self.freq)
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = (
Expand Down
36 changes: 20 additions & 16 deletions feature_engine/variable_handling/_variable_type_checks.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
import pandas as pd
from pandas.api.types import is_string_dtype as is_object
from pandas.api.types import is_object_dtype, is_string_dtype
from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime
from pandas.core.dtypes.common import is_numeric_dtype as is_numeric


def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:
# check for datetime only if object cannot be cast as numeric because
# if it could pd.to_datetime would convert it to datetime regardless
if is_object(column):
is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column)
def is_object(s) -> bool:
    """Return True when the input has the classic ``object`` dtype or a
    pandas string dtype (covers the string dtype used by newer pandas)."""
    if is_object_dtype(s):
        return True
    return is_string_dtype(s)


def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:
    """Return True when the column holds categorical-like values that do
    not represent datetimes.

    The span as rendered interleaved the removed and added diff lines
    (duplicated Categorical branch), so this is the reconstructed
    post-change function: Categorical dtypes are inspected first, then
    object/string columns.
    """
    is_cat = False

    # check for datetime only if the type of the categories is not numeric
    # because pd.to_datetime throws an error when it is an integer
    if isinstance(column.dtype, pd.CategoricalDtype):
        is_cat = _is_categories_num(column) or not _is_convertible_to_dt(column)

    # check for datetime only if object cannot be cast as numeric because
    # if it could pd.to_datetime would convert it to datetime regardless
    elif is_object(column):
        is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column)

    return is_cat


Expand All @@ -26,7 +31,7 @@ def _is_convertible_to_dt(column: pd.Series) -> bool:
try:
var = pd.to_datetime(column, utc=True)
return is_datetime(var)
except:
except Exception:
return False


Expand All @@ -39,16 +44,15 @@ def _is_convertible_to_num(column: pd.Series) -> bool:


def _is_categorical_and_is_datetime(column: pd.Series) -> bool:
    """Return True when the column holds categorical-like values that do
    represent datetimes.

    The span as rendered interleaved the removed and added diff lines
    (old object-first branch mixed with the new Categorical-first one),
    so this is the reconstructed post-change function, mirroring
    ``_is_categorical_and_is_not_datetime``.
    """
    is_dt = False

    # check for datetime only if the type of the categories is not numeric
    # because pd.to_datetime throws an error when it is an integer
    if isinstance(column.dtype, pd.CategoricalDtype):
        is_dt = not _is_categories_num(column) and _is_convertible_to_dt(column)

    # check for datetime only if object cannot be cast as numeric because
    # if it could pd.to_datetime would convert it to datetime regardless
    elif is_object(column):
        is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(column)

    return is_dt
Loading