Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/BENCH-CONFIG-SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ Configs have the three highest parameter keys:
| `data`:`id` | None | | OpenML data id for `fetch_openml` source. |
| `data`:`preprocessing_kwargs`:`replace_nan` | `median` | `median`, `mean` | Value to replace NaNs in preprocessed data. |
| `data`:`preprocessing_kwargs`:`category_encoding` | `ordinal` | `ordinal`, `onehot`, `drop`, `ignore` | How to encode categorical features in preprocessed data. |
| `data`:`preprocessing_kwargs`:`normalize` | False | | Enables normalization of preprocessed data. |
| `data`:`preprocessing_kwargs`:`normalize` | None | None, `mean`, `minmax`, `standard` | Normalization method applied to preprocessed data; `None` disables normalization. |
| `data`:`preprocessing_kwargs`:`force_for_sparse` | True | | Forces preprocessing for sparse data formats. |
| `data`:`split_kwargs` | Empty `dict` or default split from dataset description | | Data split parameters for `train_test_split` function. |
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
Expand Down
2 changes: 1 addition & 1 deletion configs/common/knn.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
}
},
"data": {
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
Comment thread
david-cortes-intel marked this conversation as resolved.
}
},
"sklearn knn parameters": {
Expand Down
2 changes: 1 addition & 1 deletion configs/common/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"max_iter": 10000
}
},
"data": { "preprocessing_kwargs": { "normalize": true } }
"data": { "preprocessing_kwargs": { "normalize": "standard" } }
},
"svm clsf parameters": {
"algorithm": { "estimator_params": { "random_state": 42 } }
Expand Down
4 changes: 2 additions & 2 deletions configs/regular/dbscan.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
"PARAMETERS_SETS": {
"dbscan datasets": {
"data": [
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 } },
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 }, "preprocessing_kwargs": { "normalize": "mean" } },
{ "dataset": "mnist", "split_kwargs": { "train_size": 40000 } },
{ "dataset": "sensit", "split_kwargs": { "ignore": true } },
{ "dataset": "susy", "split_kwargs": { "train_size": 100000 } },
{
"dataset": "skin_segmentation",
"split_kwargs": { "train_size": 100000 },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"source": "make_blobs",
Expand Down
14 changes: 10 additions & 4 deletions configs/regular/kmeans.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@
{
"dataset": "covtype",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": ["mnist", "gisette"],
"dataset": ["mnist"],
"split_kwargs": { "ignore": true }
},
{
"dataset" : "gisette",
"split_kwargs" : {"ignore" : true},
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": "cifar",
"split_kwargs": { "train_size": 10000, "test_size": null }
"split_kwargs": { "train_size": 10000, "test_size": null },
"preprocessing_kwargs": { "normalize": "mean" }
}
]
},
Expand All @@ -28,7 +34,7 @@
"shuffle": true,
"random_state": 42
},
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": {
"estimator_params": { "n_clusters": [2, 50] }
Expand Down
2 changes: 1 addition & 1 deletion configs/regular/knn.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"data": [
{ "dataset": "susy", "split_kwargs": { "train_size": 80000, "test_size": 20000 } },
{ "dataset": "connect" },
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": null } }
]
},
"kd_tree knn classification datasets": {
Expand Down
2 changes: 1 addition & 1 deletion configs/regular/linear_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
{
"data": {
"dataset": "year_prediction_msd",
"preprocessing_kwargs": { "normalize": true },
"preprocessing_kwargs": { "normalize": "standard" },
"split_kwargs": { "train_size": 0.5, "test_size": 0.5 }
}
},
Expand Down
6 changes: 4 additions & 2 deletions configs/regular/logreg.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,16 @@
{
"data": {
"dataset": "cifar",
"split_kwargs": { "train_size": 0.1, "test_size": null }
"split_kwargs": { "train_size": 0.1, "test_size": null },
"preprocessing_kwargs": { "normalize": "mean" }
},
"algorithm": { "estimator_params": {"C": 1e-9} }
},
{
"data": {
"dataset": "gisette",
"split_kwargs": { "train_size": 2000, "test_size": null }
"split_kwargs": { "train_size": 2000, "test_size": null },
"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": { "estimator_params": {"C": 1e1} }
}
Expand Down
10 changes: 5 additions & 5 deletions configs/regular/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": {
"estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] }
}
Expand All @@ -30,7 +30,7 @@
"data": {
"dataset": "mnist",
"split_kwargs": { "train_size": 20000, "test_size": null },
"preprocessing_kwargs": { "normalize": false }
"preprocessing_kwargs": { "normalize" : null }
},
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
}
Expand All @@ -45,7 +45,7 @@
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
},
{
Expand Down Expand Up @@ -75,7 +75,7 @@
"algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } }
}
],
Expand All @@ -89,7 +89,7 @@
"algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } }
},
{
Expand Down
2 changes: 1 addition & 1 deletion configs/testing/azure-pipelines-ci.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"random_state": 42
},
"preprocessing_kwargs": {
"normalize": true
"normalize": "standard"
}
},
"bench": { "n_runs": 5 },
Expand Down
11 changes: 8 additions & 3 deletions configs/weekly/dbscan.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,19 @@
"high-load dbscan datasets": {
"data": [
{
"dataset": ["cifar", "road_network", "covtype"],
"dataset": "cifar",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "mean" }
},
{
"dataset": ["road_network", "covtype"],
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": "susy",
"split_kwargs": { "train_size": 800000 },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"source": "make_blobs",
Expand Down
5 changes: 3 additions & 2 deletions configs/weekly/kmeans.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"data": {
"dataset": ["susy", "hepmass"],
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
}
},
{
Expand All @@ -37,7 +37,8 @@
{
"data": {
"dataset": "cifar",
"split_kwargs": { "ignore": true }
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": "mean" }
}
}
]
Expand Down
2 changes: 1 addition & 1 deletion configs/weekly/linear_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"susy"
],
"preprocessing_kwargs": {
"normalize": true
"normalize": "standard"
},
"split_kwargs": { "ignore": true }
}
Expand Down
2 changes: 1 addition & 1 deletion configs/weekly/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"data": {
"dataset": "mnist",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": false }
"preprocessing_kwargs": { "normalize": null }
},
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
}
Expand Down
9 changes: 8 additions & 1 deletion configs/weekly/tsne.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,16 @@
},
{
"data": {
"dataset": ["sensit", "mnist", "cifar"],
"dataset": ["sensit", "mnist"],
"split_kwargs": { "ignore": true }
}
},
{
"data": {
"dataset" : "cifar",
"split_kwargs": { "ignore" : true },
"preprocessing_kwargs": { "normalize": "mean" }
}
}
]
},
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ pandas
tabulate
fastparquet
h5py
openml
Comment thread
david-cortes-intel marked this conversation as resolved.
openpyxl
tqdm
psutil
Expand Down
45 changes: 45 additions & 0 deletions sklbench/benchmarks/sklearn_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,48 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0


def validate_estimator_params(estimator_class, estimator_params: Dict) -> Dict:
    """Filter ``estimator_params`` down to those accepted by ``estimator_class``.

    Args:
        estimator_class: Estimator class whose ``__init__`` signature is inspected.
        estimator_params: Candidate constructor parameters to validate.

    Returns:
        Dictionary containing only the parameters the estimator accepts.
        Unsupported parameters are dropped with a warning. If the class
        accepts ``**kwargs``, or its signature cannot be inspected, the
        input dictionary is returned unchanged.
    """
    try:
        parameters = inspect.signature(estimator_class.__init__).parameters

        # A VAR_KEYWORD (**kwargs) parameter means the estimator accepts
        # arbitrary keyword arguments, so there is nothing to filter.
        accepts_kwargs = any(
            p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters.values()
        )
        if accepts_kwargs:
            return estimator_params

        supported = {name for name in parameters if name != "self"}

        # Warn once per unsupported parameter before dropping it.
        for name in estimator_params:
            if name not in supported:
                logger.warning(
                    f"Parameter '{name}' is not supported by "
                    f"{estimator_class.__name__} and will be ignored"
                )

        return {k: v for k, v in estimator_params.items() if k in supported}

    except Exception as e:
        # Best-effort: if introspection fails, pass parameters through untouched.
        logger.debug(f"Could not validate parameters for {estimator_class.__name__}: {e}")
        return estimator_params


def create_online_function(method_instance, data_args, batch_size):
n_batches = data_args[0].shape[0] // batch_size

Expand Down Expand Up @@ -491,6 +533,9 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
bench_case, "algorithm:estimator_params", dict()
)

# validate and filter estimator parameters
estimator_params = validate_estimator_params(estimator_class, estimator_params)

# get estimator methods for measurement
estimator_methods = get_estimator_methods(bench_case)

Expand Down
22 changes: 18 additions & 4 deletions sklbench/datasets/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import (
MinMaxScaler,
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
)

from ..utils.custom_types import Array
from ..utils.logger import logger
Expand Down Expand Up @@ -167,7 +172,7 @@ def preprocess_x(
x: Array,
replace_nan="auto",
category_encoding="ordinal",
normalize=False,
normalize=None,
force_for_sparse=True,
**kwargs,
) -> Array:
Expand Down Expand Up @@ -219,9 +224,18 @@ def preprocess_x(
pass
else:
logger.warning(f'Unknown "{category_encoding}" category encoding type.')
# Mean-Standard normalization
# Normalization
if normalize:
x = (x - x.mean()) / x.std()
if normalize == "standard":
scaler = StandardScaler(with_mean=True, with_std=True)
elif normalize == "mean":
scaler = StandardScaler(with_mean=True, with_std=False)
elif normalize == "minmax":
scaler = MinMaxScaler(feature_range=(0, 1))
else:
logger.warning(f'Unknown "{normalize}" normalization type.')
if scaler is not None and return_type == pd.DataFrame:
return pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't this make it ignore return_type == np.ndarray?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently it works correctly for all return_types as intermediate data is always represented in pandas format. However, this conversion is indeed redundant if return_type is not a pandas dataframe

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that because it then goes through train_test_split? Isn't that step optional?

if return_type == np.ndarray:
return x.values
else:
Expand Down
6 changes: 2 additions & 4 deletions sklbench/datasets/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
make_moons,
make_regression,
)
from sklearn.preprocessing import StandardScaler

from .common import cache, load_data_description, load_data_from_cache, preprocess
from .downloaders import download_and_read_csv, load_openml, retrieve
Expand Down Expand Up @@ -369,6 +368,7 @@ def load_epsilon(
return {"x": x, "y": y}, data_desc


@preprocess
@cache
def load_gisette(
data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
Expand Down Expand Up @@ -419,8 +419,6 @@ def convert_y(y, n_samples):
x = np.vstack([x_train, x_test])
y = np.hstack([y_train, y_test])

x = StandardScaler(with_mean=True, with_std=True).fit_transform(x)

data_desc = {
"n_classes": 2,
"default_split": {
Expand Down Expand Up @@ -545,6 +543,7 @@ def transform_x_y(x, y):
return {"x": x, "y": y}, data_desc


@preprocess
@cache
def load_cifar(
data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
Expand All @@ -558,7 +557,6 @@ def load_cifar(
Classification task. n_classes = 10.
"""
x, y = load_openml(40927, raw_data_cache)
x = StandardScaler(with_mean=True, with_std=False).fit_transform(x)
binary = dataset_params.get("binary", False)
if binary:
y = (y > 0).astype(int)
Expand Down
Loading