Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
da5ff67
fix: Pandas 3 compatibility - robust dtype checks and test fixes
ankitlade12 Jan 28, 2026
e0c3292
fix: Remove whitespace before colon in slice notation (flake8 E203)
ankitlade12 Jan 28, 2026
ccbfa05
feat: finalize Pandas 3 compatibility fixes and test updates
ankitlade12 Jan 28, 2026
fd43124
style: fix flake8 line length and linting issues
ankitlade12 Jan 28, 2026
8367d4a
style: fix remaining flake8 C416 issue
ankitlade12 Jan 28, 2026
3225500
Fix Pandas 3 regressions in check_y, _check_contains_inf, and StringS…
ankitlade12 Jan 28, 2026
bde0b9b
Fix E501 line too long in dataframe_checks.py
ankitlade12 Jan 28, 2026
dedf500
Fix StringSimilarityEncoder NaN issues and fragile test assertions
ankitlade12 Jan 28, 2026
765e102
fix: Pandas 3 stability - mock datasets and fix FutureWarnings
ankitlade12 Jan 28, 2026
28894c5
style: fix flake8 linting errors E501, E302, E305, SIM102
ankitlade12 Jan 28, 2026
08821a6
test: improve patch coverage for Pandas 3 stability fixes
ankitlade12 Jan 28, 2026
972a4b7
style: fix E501 line too long in similarity encoder tests
ankitlade12 Jan 28, 2026
d141332
style: revert unrelated flake8 and formatting changes
ankitlade12 Feb 3, 2026
05ca43c
fix: restore Pandas 3 test logic and silence Pandas4Warning
ankitlade12 Feb 3, 2026
36c2232
style: move numpy import to top of math_features.py
ankitlade12 Feb 3, 2026
0fd811a
style: fix spacing in MatchVariables verbose error message
ankitlade12 Feb 3, 2026
d98c8d7
test: revert dynamic std values to hardcoded values in MathFeatures t…
ankitlade12 Feb 3, 2026
b02ec5e
style: combine imports in _variable_type_checks.py
ankitlade12 Feb 3, 2026
66ff38b
refactor: centralize is_object function and use it across the codebase
ankitlade12 Feb 3, 2026
b45c51f
refactor: further simplify check_y dtype checks using is_object
ankitlade12 Feb 3, 2026
1ba6b44
revert: remove unnecessary complexity in _check_contains_inf and asso…
ankitlade12 Feb 3, 2026
50eec35
docs: rename _normalize_func to _map_unnamed_func_to_str and add comm…
ankitlade12 Feb 3, 2026
94d2771
perf: optimize casting logic in SimilarityEncoder
ankitlade12 Feb 3, 2026
0cb335b
fix: address remaining code review feedback - follow sklearn conventi…
ankitlade12 Feb 4, 2026
c25d5bd
style: fix linting and follow sklearn convention for MathFeatures
ankitlade12 Feb 4, 2026
8b50186
revert: remove california housing mock from conftest.py
ankitlade12 Feb 4, 2026
4f9b146
revert: restore original error message assertion in DatetimeFeatures …
ankitlade12 Feb 4, 2026
09a1ece
fix: use robust datetime normalization and flexible error assertions …
ankitlade12 Feb 4, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 59 additions & 22 deletions feature_engine/creation/math_features.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from typing import Any, List, Optional, Union

import numpy as np
import pandas as pd

from feature_engine._docstrings.fit_attributes import (
Expand Down Expand Up @@ -140,7 +141,6 @@ def __init__(
missing_values: str = "raise",
drop_original: bool = False,
) -> None:

if (
not isinstance(variables, list)
or not all(isinstance(var, (int, str)) for var in variables)
Expand All @@ -157,16 +157,15 @@ def __init__(
"func does not work with dictionaries in this transformer."
)

if new_variables_names is not None:
if (
not isinstance(new_variables_names, list)
or not all(isinstance(var, str) for var in new_variables_names)
or len(set(new_variables_names)) != len(new_variables_names)
):
raise ValueError(
"new_variable_names should be None or a list of unique strings. "
f"Got {new_variables_names} instead."
)
if new_variables_names is not None and (
not isinstance(new_variables_names, list)
or not all(isinstance(var, str) for var in new_variables_names)
or len(set(new_variables_names)) != len(new_variables_names)
):
raise ValueError(
"new_variable_names should be None or a list of unique strings. "
f"Got {new_variables_names} instead."
)

if new_variables_names is not None:
if isinstance(func, list):
Expand All @@ -175,19 +174,57 @@ def __init__(
"The number of new feature names must coincide with the number "
"of functions."
)
else:
if len(new_variables_names) != 1:
raise ValueError(
"The number of new feature names must coincide with the number "
"of functions."
)
elif len(new_variables_names) != 1:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Neat! Thank you!

raise ValueError(
"The number of new feature names must coincide with the number "
"of functions."
)

super().__init__(missing_values, drop_original)

self.variables = variables
self.func = func
self.new_variables_names = new_variables_names

def _map_unnamed_func_to_str(self, func: Any) -> Any:
if isinstance(func, list):
return [self._map_unnamed_func_to_str(f) for f in func]

# We map certain numpy functions to their string alias.
# This serves two purposes:
# 1) It avoids a FutureWarning in pandas 2.1+ which recommends
# using the string alias for better performance and future-proofing.
# 2) It ensures consistent column naming (e.g. "sum_x1_x2")
# regardless of how the function was passed (np.sum vs "sum").
map_dict = {
np.sum: "sum",
np.mean: "mean",
np.std: "std",
np.min: "min",
np.max: "max",
np.median: "median",
np.prod: "prod",
}
return map_dict.get(func, func)

def fit(self, X: pd.DataFrame, y=None):
    """
    This method does not learn any parameters. It just stores the normalized
    function representation.

    Parameters
    ----------
    X: pandas dataframe of shape = [n_samples, n_features]
        The training input samples.

    y: pandas Series, or np.array. Defaults to None.
        It is not needed in this transformer. You can pass y or None.
    """
    # Base-class fit performs the shared input validation / bookkeeping.
    super().fit(X, y)
    # Normalize func to func_ (sklearn convention: don't modify init params)
    # so transform() can pass string aliases to pandas agg, avoiding the
    # pandas 2.1+ FutureWarning about numpy callables.
    self.func_ = self._map_unnamed_func_to_str(self.func)
    return self

def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""
Create and add new variables.
Expand All @@ -207,9 +244,9 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
new_variable_names = self._get_new_features_name()

if len(new_variable_names) == 1:
X[new_variable_names[0]] = X[self.variables].agg(self.func, axis=1)
X[new_variable_names[0]] = X[self.variables].agg(self.func_, axis=1)
else:
X[new_variable_names] = X[self.variables].agg(self.func, axis=1)
X[new_variable_names] = X[self.variables].agg(self.func_, axis=1)

if self.drop_original:
X.drop(columns=self.variables, inplace=True)
Expand All @@ -226,14 +263,14 @@ def _get_new_features_name(self) -> List:
else:
varlist = [f"{var}" for var in self.variables_]

if isinstance(self.func, list):
if isinstance(self.func_, list):
functions = [
fun if type(fun) is str else fun.__name__ for fun in self.func
fun if type(fun) is str else fun.__name__ for fun in self.func_
]
feature_names = [
f"{function}_{'_'.join(varlist)}" for function in functions
]
else:
feature_names = [f"{self.func}_{'_'.join(varlist)}"]
feature_names = [f"{self.func_}_{'_'.join(varlist)}"]

return feature_names
8 changes: 5 additions & 3 deletions feature_engine/dataframe_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from scipy.sparse import issparse
from sklearn.utils.validation import _check_y, check_consistent_length, column_or_1d

from feature_engine.variable_handling._variable_type_checks import is_object


def check_X(X: Union[np.generic, np.ndarray, pd.DataFrame]) -> pd.DataFrame:
"""
Expand Down Expand Up @@ -121,10 +123,10 @@ def check_y(
elif isinstance(y, pd.Series):
if y.isnull().any():
raise ValueError("y contains NaN values.")
if y.dtype != "O" and not np.isfinite(y).all():
if not is_object(y) and not np.isfinite(y).all():
raise ValueError("y contains infinity values.")
if y_numeric and y.dtype == "O":
y = y.astype("float")
if y_numeric and is_object(y):
y = y.astype("float64")
y = y.copy()

elif isinstance(y, pd.DataFrame):
Expand Down
47 changes: 33 additions & 14 deletions feature_engine/encoding/similarity_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,12 +232,13 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
X = check_X(X)
variables_ = self._check_or_select_variables(X)

if self.keywords:
if not all(item in variables_ for item in self.keywords.keys()):
raise ValueError(
"There are variables in keywords that are not present "
"in the dataset."
)
if self.keywords and not all(
item in variables_ for item in self.keywords.keys()
):
raise ValueError(
"There are variables in keywords that are not present "
"in the dataset."
)

# if data contains nan, fail before running any logic
if self.missing_values == "raise":
Expand All @@ -262,10 +263,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
)
elif self.missing_values == "impute":
for var in cols_to_iterate:
series = X[var]
self.encoder_dict_[var] = (
X[var]
.astype(str)
.replace("nan", "")
series.astype(str)
.mask(series.isna(), "")
.value_counts()
.head(self.top_categories)
.index.tolist()
Expand All @@ -276,7 +277,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
X[var]
.astype(str)
.value_counts(dropna=True)
.drop("nan", errors="ignore")
.drop(["nan", "<NA>"], errors="ignore")
.head(self.top_categories)
.index.tolist()
)
Expand Down Expand Up @@ -316,13 +317,31 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
new_values = []
for var in self.variables_:
if self.missing_values == "impute":
X[var] = X[var].astype(str).replace("nan", "")
categories = X[var].dropna().astype(str).unique()
series = X[var]
series = series.astype(str).mask(series.isna(), "")
else:
series = X[var].astype(str)

categories = series.unique()
column_encoder_dict = {
x: _gpm_fast_vec(x, self.encoder_dict_[var]) for x in categories
}
column_encoder_dict["nan"] = [np.nan] * len(self.encoder_dict_[var])
encoded = np.vstack(X[var].astype(str).map(column_encoder_dict).values)
# Ensure map result is always an array of the correct size.
# Missing values in categories or unknown categories will map to NaN.
default_nan = [np.nan] * len(self.encoder_dict_[var])
if "nan" not in column_encoder_dict:
column_encoder_dict["nan"] = default_nan
if "<NA>" not in column_encoder_dict:
column_encoder_dict["<NA>"] = default_nan

encoded_series = series.map(column_encoder_dict)

# Robust stacking: replace any float NaNs (from unknown values) with arrays
encoded_list = [
v if isinstance(v, (list, np.ndarray)) else default_nan
for v in encoded_series
]
encoded = np.vstack(encoded_list)
if self.missing_values == "ignore":
encoded[X[var].isna(), :] = np.nan
new_values.append(encoded)
Expand Down
9 changes: 7 additions & 2 deletions feature_engine/preprocessing/match_columns.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def __init__(

if not isinstance(verbose, bool):
raise ValueError(
"verbose takes only booleans True and False." f"Got '{verbose} instead."
f"verbose takes only booleans True and False. Got '{verbose} instead."
)

# note: np.nan is an instance of float!!!
Expand Down Expand Up @@ -262,7 +262,12 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:

X = X.drop(_columns_to_drop, axis=1)

X = X.reindex(columns=self.feature_names_in_, fill_value=self.fill_value)
# Add missing columns one at a time to avoid Pandas 3 StringDtype reindex issue
for col in _columns_to_add:
X[col] = self.fill_value

# Reorder columns to match training set, without fill_value to avoid issues
X = X[self.feature_names_in_]

if self.match_dtypes:
_current_dtypes = X.dtypes.to_dict()
Expand Down
4 changes: 2 additions & 2 deletions feature_engine/timeseries/forecasting/lag_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = X[self.variables_].shift(
Expand All @@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
axis=0,
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = X[self.variables_].shift(
Expand Down
2 changes: 1 addition & 1 deletion feature_engine/timeseries/forecasting/window_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
.shift(periods=self.periods, freq=self.freq)
)
df_ls.append(tmp)
tmp = pd.concat(df_ls, axis=1)
tmp = pd.concat(df_ls, axis=1, sort=False)

else:
tmp = (
Expand Down
36 changes: 20 additions & 16 deletions feature_engine/variable_handling/_variable_type_checks.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,25 @@
import pandas as pd
from pandas.api.types import is_string_dtype as is_object
from pandas.api.types import is_object_dtype, is_string_dtype
from pandas.core.dtypes.common import is_datetime64_any_dtype as is_datetime
from pandas.core.dtypes.common import is_numeric_dtype as is_numeric


def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:
# check for datetime only if object cannot be cast as numeric because
# if it could pd.to_datetime would convert it to datetime regardless
if is_object(column):
is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column)
def is_object(s) -> bool:
    """Return True when the input has the classic ``object`` dtype or a
    pandas string dtype (covers the string dtype used by newer pandas)."""
    if is_object_dtype(s):
        return True
    return is_string_dtype(s)


def _is_categorical_and_is_not_datetime(column: pd.Series) -> bool:
    """Return True when the column holds categorical-like values that do
    not represent datetimes.

    The span as rendered interleaved the removed and added diff lines
    (duplicated Categorical branch), so this is the reconstructed
    post-change function: Categorical dtypes are inspected first, then
    object/string columns.
    """
    is_cat = False

    # check for datetime only if the type of the categories is not numeric
    # because pd.to_datetime throws an error when it is an integer
    if isinstance(column.dtype, pd.CategoricalDtype):
        is_cat = _is_categories_num(column) or not _is_convertible_to_dt(column)

    # check for datetime only if object cannot be cast as numeric because
    # if it could pd.to_datetime would convert it to datetime regardless
    elif is_object(column):
        is_cat = _is_convertible_to_num(column) or not _is_convertible_to_dt(column)

    return is_cat


Expand All @@ -26,7 +31,7 @@ def _is_convertible_to_dt(column: pd.Series) -> bool:
try:
var = pd.to_datetime(column, utc=True)
return is_datetime(var)
except:
except Exception:
return False


Expand All @@ -39,16 +44,15 @@ def _is_convertible_to_num(column: pd.Series) -> bool:


def _is_categorical_and_is_datetime(column: pd.Series) -> bool:
    """Return True when the column holds categorical-like values that do
    represent datetimes.

    The span as rendered interleaved the removed and added diff lines
    (old object-first branch mixed with the new Categorical-first one),
    so this is the reconstructed post-change function, mirroring
    ``_is_categorical_and_is_not_datetime``.
    """
    is_dt = False

    # check for datetime only if the type of the categories is not numeric
    # because pd.to_datetime throws an error when it is an integer
    if isinstance(column.dtype, pd.CategoricalDtype):
        is_dt = not _is_categories_num(column) and _is_convertible_to_dt(column)

    # check for datetime only if object cannot be cast as numeric because
    # if it could pd.to_datetime would convert it to datetime regardless
    elif is_object(column):
        is_dt = not _is_convertible_to_num(column) and _is_convertible_to_dt(column)

    return is_dt
Loading