Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion configs/BENCH-CONFIG-SPEC.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ Configs have the three highest parameter keys:
| `data`:`id` | None | | OpenML data id for `fetch_openml` source. |
| `data`:`preprocessing_kwargs`:`replace_nan` | `median` | `median`, `mean` | Value to replace NaNs in preprocessed data. |
| `data`:`preprocessing_kwargs`:`category_encoding` | `ordinal` | `ordinal`, `onehot`, `drop`, `ignore` | How to encode categorical features in preprocessed data. |
| `data`:`preprocessing_kwargs`:`normalize` | False | | Enables normalization of preprocessed data. |
| `data`:`preprocessing_kwargs`:`normalize` | None | None, `mean`, `minmax`, `standard` | Normalization method applied to preprocessed data; `None` disables normalization. |
| `data`:`preprocessing_kwargs`:`force_for_sparse` | True | | Forces preprocessing for sparse data formats. |
| `data`:`split_kwargs` | Empty `dict` or default split from dataset description | | Data split parameters for `train_test_split` function. |
| `data`:`format` | `pandas` | `pandas`, `numpy`, `cudf` | Data format to use in benchmark. |
Expand Down
2 changes: 1 addition & 1 deletion configs/common/knn.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
}
},
"data": {
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
Comment thread
david-cortes-intel marked this conversation as resolved.
}
},
"sklearn knn parameters": {
Expand Down
2 changes: 1 addition & 1 deletion configs/common/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@
"max_iter": 10000
}
},
"data": { "preprocessing_kwargs": { "normalize": true } }
"data": { "preprocessing_kwargs": { "normalize": "standard" } }
},
"svm clsf parameters": {
"algorithm": { "estimator_params": { "random_state": 42 } }
Expand Down
4 changes: 2 additions & 2 deletions configs/regular/dbscan.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@
"PARAMETERS_SETS": {
"dbscan datasets": {
"data": [
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 } },
{ "dataset": "cifar", "split_kwargs": { "train_size": 15000 }, "preprocessing_kwargs": { "normalize": "mean" } },
{ "dataset": "mnist", "split_kwargs": { "train_size": 40000 } },
{ "dataset": "sensit", "split_kwargs": { "ignore": true } },
{ "dataset": "susy", "split_kwargs": { "train_size": 100000 } },
{
"dataset": "skin_segmentation",
"split_kwargs": { "train_size": 100000 },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"source": "make_blobs",
Expand Down
14 changes: 10 additions & 4 deletions configs/regular/kmeans.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,21 @@
{
"dataset": "covtype",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": ["mnist", "gisette"],
"dataset": ["mnist"],
"split_kwargs": { "ignore": true }
},
{
"dataset" : "gisette",
"split_kwargs" : {"ignore" : true},
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": "cifar",
"split_kwargs": { "train_size": 10000, "test_size": null }
"split_kwargs": { "train_size": 10000, "test_size": null },
"preprocessing_kwargs": { "normalize": "mean" }
}
]
},
Expand All @@ -28,7 +34,7 @@
"shuffle": true,
"random_state": 42
},
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": {
"estimator_params": { "n_clusters": [2, 50] }
Expand Down
2 changes: 1 addition & 1 deletion configs/regular/knn.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
"data": [
{ "dataset": "susy", "split_kwargs": { "train_size": 80000, "test_size": 20000 } },
{ "dataset": "connect" },
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } }
{ "dataset": "gisette", "preprocessing_kwargs": { "normalize": null } }
]
},
"kd_tree knn classification datasets": {
Expand Down
2 changes: 1 addition & 1 deletion configs/regular/linear_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
{
"data": {
"dataset": "year_prediction_msd",
"preprocessing_kwargs": { "normalize": true },
"preprocessing_kwargs": { "normalize": "standard" },
"split_kwargs": { "train_size": 0.5, "test_size": 0.5 }
}
},
Expand Down
6 changes: 4 additions & 2 deletions configs/regular/logreg.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,16 @@
{
"data": {
"dataset": "cifar",
"split_kwargs": { "train_size": 0.1, "test_size": null }
"split_kwargs": { "train_size": 0.1, "test_size": null },
"preprocessing_kwargs": { "normalize": "mean" }
},
"algorithm": { "estimator_params": {"C": 1e-9} }
},
{
"data": {
"dataset": "gisette",
"split_kwargs": { "train_size": 2000, "test_size": null }
"split_kwargs": { "train_size": 2000, "test_size": null },
"preprocessing_kwargs": { "normalize": "standard" }
},
"algorithm": { "estimator_params": {"C": 1e1} }
}
Expand Down
10 changes: 5 additions & 5 deletions configs/regular/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"algorithm": { "estimator_params": { "C": 100.0, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": {
"estimator_params": { "C": 100.0, "kernel": ["linear", "poly", "rbf"] }
}
Expand All @@ -30,7 +30,7 @@
"data": {
"dataset": "mnist",
"split_kwargs": { "train_size": 20000, "test_size": null },
"preprocessing_kwargs": { "normalize": false }
"preprocessing_kwargs": { "normalize" : null }
},
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
}
Expand All @@ -45,7 +45,7 @@
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["linear", "poly", "rbf"] } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
},
{
Expand Down Expand Up @@ -75,7 +75,7 @@
"algorithm": { "estimator_params": { "nu": 0.1, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "nu": 0.9, "kernel": ["linear", "rbf"] } }
}
],
Expand All @@ -89,7 +89,7 @@
"algorithm": { "estimator_params": { "nu": 0.8, "C": 2.0, "kernel": "rbf" } }
},
{
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": false } },
"data": { "dataset": "gisette", "preprocessing_kwargs": { "normalize": "standard" } },
"algorithm": { "estimator_params": { "nu": 0.9, "C": 1.0, "kernel": "rbf" } }
},
{
Expand Down
2 changes: 1 addition & 1 deletion configs/testing/azure-pipelines-ci.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
"random_state": 42
},
"preprocessing_kwargs": {
"normalize": true
"normalize": "standard"
}
},
"bench": { "n_runs": 5 },
Expand Down
11 changes: 8 additions & 3 deletions configs/weekly/dbscan.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,19 @@
"high-load dbscan datasets": {
"data": [
{
"dataset": ["cifar", "road_network", "covtype"],
"dataset": "cifar",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "mean" }
},
{
"dataset": ["road_network", "covtype"],
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"dataset": "susy",
"split_kwargs": { "train_size": 800000 },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
},
{
"source": "make_blobs",
Expand Down
5 changes: 3 additions & 2 deletions configs/weekly/kmeans.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"data": {
"dataset": ["susy", "hepmass"],
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": true }
"preprocessing_kwargs": { "normalize": "standard" }
}
},
{
Expand All @@ -37,7 +37,8 @@
{
"data": {
"dataset": "cifar",
"split_kwargs": { "ignore": true }
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": "mean" }
}
}
]
Expand Down
2 changes: 1 addition & 1 deletion configs/weekly/linear_model.json
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"susy"
],
"preprocessing_kwargs": {
"normalize": true
"normalize": "standard"
},
"split_kwargs": { "ignore": true }
}
Expand Down
2 changes: 1 addition & 1 deletion configs/weekly/svm.json
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
"data": {
"dataset": "mnist",
"split_kwargs": { "ignore": true },
"preprocessing_kwargs": { "normalize": false }
"preprocessing_kwargs": { "normalize": null }
},
"algorithm": { "estimator_params": { "C": 1.0, "kernel": ["poly", "rbf"] } }
}
Expand Down
9 changes: 8 additions & 1 deletion configs/weekly/tsne.json
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,16 @@
},
{
"data": {
"dataset": ["sensit", "mnist", "cifar"],
"dataset": ["sensit", "mnist"],
"split_kwargs": { "ignore": true }
}
},
{
"data": {
"dataset" : "cifar",
"split_kwargs": { "ignore" : true },
"preprocessing_kwargs": { "normalize": "mean" }
}
}
]
},
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ pandas
tabulate
fastparquet
h5py
openml
Comment thread
david-cortes-intel marked this conversation as resolved.
openpyxl
tqdm
psutil
Expand Down
45 changes: 45 additions & 0 deletions sklbench/benchmarks/sklearn_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,48 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
return acceleration_lines > 0 and fallback_lines == 0


def validate_estimator_params(estimator_class, estimator_params: Dict) -> Dict:
    """Filter ``estimator_params`` down to those accepted by ``estimator_class``.

    Args:
        estimator_class: Estimator class whose ``__init__`` signature is inspected.
        estimator_params: Candidate constructor parameters to validate.

    Returns:
        Dictionary containing only the parameters the estimator accepts.
        Unsupported parameters are dropped with a warning. If the class
        accepts ``**kwargs``, or its signature cannot be inspected, the
        input dictionary is returned unchanged.
    """
    try:
        parameters = inspect.signature(estimator_class.__init__).parameters

        # A VAR_KEYWORD (**kwargs) parameter means the estimator accepts
        # arbitrary keyword arguments, so there is nothing to filter.
        accepts_kwargs = any(
            p.kind is inspect.Parameter.VAR_KEYWORD for p in parameters.values()
        )
        if accepts_kwargs:
            return estimator_params

        supported = {name for name in parameters if name != "self"}

        # Warn once per unsupported parameter before dropping it.
        for name in estimator_params:
            if name not in supported:
                logger.warning(
                    f"Parameter '{name}' is not supported by "
                    f"{estimator_class.__name__} and will be ignored"
                )

        return {k: v for k, v in estimator_params.items() if k in supported}

    except Exception as e:
        # Best-effort: if introspection fails, pass parameters through untouched.
        logger.debug(f"Could not validate parameters for {estimator_class.__name__}: {e}")
        return estimator_params


def create_online_function(method_instance, data_args, batch_size):
n_batches = data_args[0].shape[0] // batch_size

Expand Down Expand Up @@ -491,6 +533,9 @@ def main(bench_case: BenchCase, filters: List[BenchCase]):
bench_case, "algorithm:estimator_params", dict()
)

# validate and filter estimator parameters
estimator_params = validate_estimator_params(estimator_class, estimator_params)

# get estimator methods for measurement
estimator_methods = get_estimator_methods(bench_case)

Expand Down
22 changes: 18 additions & 4 deletions sklbench/datasets/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,12 @@
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import (
MinMaxScaler,
OneHotEncoder,
OrdinalEncoder,
StandardScaler,
)

from ..utils.custom_types import Array
from ..utils.logger import logger
Expand Down Expand Up @@ -167,7 +172,7 @@ def preprocess_x(
x: Array,
replace_nan="auto",
category_encoding="ordinal",
normalize=False,
normalize=None,
force_for_sparse=True,
**kwargs,
) -> Array:
Expand Down Expand Up @@ -219,9 +224,18 @@ def preprocess_x(
pass
else:
logger.warning(f'Unknown "{category_encoding}" category encoding type.')
# Mean-Standard normalization
# Normalization
if normalize:
x = (x - x.mean()) / x.std()
if normalize == "standard":
scaler = StandardScaler(with_mean=True, with_std=True)
elif normalize == "mean":
scaler = StandardScaler(with_mean=True, with_std=False)
elif normalize == "minmax":
scaler = MinMaxScaler(feature_range=(0, 1))
else:
logger.warning(f'Unknown "{normalize}" normalization type.')
if scaler is not None and return_type == pd.DataFrame:
return pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't this make it ignore return_type == np.ndarray?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently it works correctly for all return_types as intermediate data is always represented in pandas format. However, this conversion is indeed redundant if return_type is not a pandas dataframe

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is that because it then goes through train_test_split? Isn't that step optional?

if return_type == np.ndarray:
return x.values
else:
Expand Down
6 changes: 2 additions & 4 deletions sklbench/datasets/loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
make_moons,
make_regression,
)
from sklearn.preprocessing import StandardScaler

from .common import cache, load_data_description, load_data_from_cache, preprocess
from .downloaders import download_and_read_csv, load_openml, retrieve
Expand Down Expand Up @@ -369,6 +368,7 @@ def load_epsilon(
return {"x": x, "y": y}, data_desc


@preprocess
@cache
def load_gisette(
data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
Expand Down Expand Up @@ -419,8 +419,6 @@ def convert_y(y, n_samples):
x = np.vstack([x_train, x_test])
y = np.hstack([y_train, y_test])

x = StandardScaler(with_mean=True, with_std=True).fit_transform(x)

data_desc = {
"n_classes": 2,
"default_split": {
Expand Down Expand Up @@ -545,6 +543,7 @@ def transform_x_y(x, y):
return {"x": x, "y": y}, data_desc


@preprocess
@cache
def load_cifar(
data_name: str, data_cache: str, raw_data_cache: str, dataset_params: Dict
Expand All @@ -558,7 +557,6 @@ def load_cifar(
Classification task. n_classes = 10.
"""
x, y = load_openml(40927, raw_data_cache)
x = StandardScaler(with_mean=True, with_std=False).fit_transform(x)
binary = dataset_params.get("binary", False)
if binary:
y = (y > 0).astype(int)
Expand Down
Loading