diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index d4e9439a..c079ccc7 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -20,16 +20,16 @@ jobs: # this is to make the CI run on different sklearn versions include: - python: "3.9" - sklearn_version: "1.1" + sklearn_version: ">=1.1,<1.2" numpy_version: "numpy<2" - python: "3.10" - sklearn_version: "1.2" + sklearn_version: ">=1.2,<1.3" numpy_version: "numpy" - python: "3.11" - sklearn_version: "1.4" + sklearn_version: ">=1.4,<1.5" numpy_version: "numpy" - python: "3.12" - sklearn_version: "1.5" + sklearn_version: ">=1.5,<1.6" numpy_version: "numpy" - python: "3.13" sklearn_version: "nightly" @@ -59,20 +59,22 @@ jobs: - name: Install dependencies run: | + set -x python -m pip install -U pip if [ "${{ matrix.os }}" == "macos-latest" ]; then brew install libomp fi pip install "pytest<8" pip install "${{ matrix.numpy_version }}" - if [ ${{ matrix.sklearn_version }} == "nightly" ]; - then pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn; - else pip install "scikit-learn~=${{ matrix.sklearn_version }}"; + if [ ${{ matrix.sklearn_version }} == "nightly" ]; then + pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn + pip install .[docs,tests] + else + pip install .[docs,tests] "scikit-learn${{ matrix.sklearn_version }}" fi - pip install .[docs,tests] pip install black=="23.9.1" ruff=="0.0.292" mypy=="1.6.0" - if [ ${{ matrix.os }} == "ubuntu-latest" ]; - then sudo apt install pandoc && pandoc --version; + if [ ${{ matrix.os }} == "ubuntu-latest" ]; then + sudo apt install pandoc && pandoc --version; fi python --version pip --version @@ -98,7 +100,7 @@ jobs: - name: Inference tests (conditional) if: contains(env.PR_COMMIT_MESSAGE, '[CI inference]') run: | - python -m pytest -s -v -m "inference" skops/ + python -m pytest -l -s -v -m "inference" skops/ - name: Upload coverage to Codecov uses: codecov/codecov-action@v5 diff --git a/pyproject.toml b/pyproject.toml index f697b327..b5e47d5b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,14 @@ filterwarnings = [ "ignore:DataFrameGroupBy.apply operated on the grouping columns.:DeprecationWarning", # Ignore Pandas 2.2 warning on PyArrow. It might be reverted in a later release. 
"ignore:\\s*Pyarrow will become a required dependency of pandas.*:DeprecationWarning", + # LightGBM sklearn 1.6 deprecation warning, fixed in the next release + "ignore:'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.:FutureWarning", + # RandomForestQuantileRegressor tags deprecation warning in sklearn 1.7 + "ignore:The RandomForestQuantileRegressor or classes from which it inherits use `_get_tags` and `_more_tags`:FutureWarning", + # ExtraTreesQuantileRegressor tags deprecation warning in sklearn 1.7 + "ignore:The ExtraTreesQuantileRegressor or classes from which it inherits use `_get_tags` and `_more_tags`:FutureWarning", + # BaseEstimator._validate_data deprecation warning in sklearn 1.6 #TODO can be removed when a new release of quantile-forest is out + "ignore:`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7:FutureWarning", ] markers = [ "network: marks tests as requiring internet (deselect with '-m \"not network\"')", diff --git a/scripts/check_file_size.py b/scripts/check_file_size.py index c88bf11e..e9aca319 100644 --- a/scripts/check_file_size.py +++ b/scripts/check_file_size.py @@ -20,7 +20,6 @@ from zipfile import ZIP_DEFLATED, ZipFile import pandas as pd -from sklearn.utils._tags import _safe_tags from sklearn.utils._testing import set_random_state import skops.io as sio @@ -29,6 +28,7 @@ _tested_estimators, get_input, ) +from skops.utils._fixes import get_tags TOPK = 10 # number of largest estimators reported MAX_ALLOWED_SIZE = 1024 # maximum allowed file size in kb @@ -46,8 +46,7 @@ def check_file_size() -> None: set_random_state(estimator, random_state=0) X, y = get_input(estimator) - tags = _safe_tags(estimator) - if tags.get("requires_fit", True): + if get_tags(estimator).requires_fit: with warnings.catch_warnings(): warnings.filterwarnings("ignore", module="sklearn") if y is not None: diff --git a/scripts/check_persistence_performance.py b/scripts/check_persistence_performance.py index 2b1a6c2e..a8b04279 100644 --- a/scripts/check_persistence_performance.py +++ b/scripts/check_persistence_performance.py @@ -15,7 +15,6 @@ from typing import Any import pandas as pd -from sklearn.utils._tags import _safe_tags from sklearn.utils._testing import set_random_state import skops.io as sio @@ -24,6 +23,7 @@ _tested_estimators, get_input, ) +from skops.utils._fixes import get_tags ATOL = 1 # seconds absolute difference allowed at max NUM_REPS = 10 # number of times the check is repeated @@ -43,8 +43,7 @@ def check_persist_performance() -> None: set_random_state(estimator, random_state=0) X, y = get_input(estimator) - tags = _safe_tags(estimator) - if tags.get("requires_fit", True): + if get_tags(estimator).requires_fit: with warnings.catch_warnings(): warnings.filterwarnings("ignore", module="sklearn") if y is not None: diff --git a/skops/_min_dependencies.py b/skops/_min_dependencies.py index 58f12c4c..c90b3716 100644 --- a/skops/_min_dependencies.py +++ b/skops/_min_dependencies.py @@ -33,7 +33,9 @@ # required for persistence tests of external libraries "lightgbm": ("3", "tests", None), "xgboost": ("1.6", "tests", None), - "catboost": ("1.0", "tests", None), + # remove python constraint when catboost supports 3.13 + # https://github.com/catboost/catboost/issues/2748 + "catboost": ("1.0", "tests", 'python_version < "3.13"'), "fairlearn": ("0.7.0", "docs, tests", None), "rich": ("12", "tests, rich", None), } diff --git a/skops/io/_sklearn.py b/skops/io/_sklearn.py index 9f6058a1..10300757 100644 --- 
a/skops/io/_sklearn.py +++ b/skops/io/_sklearn.py @@ -3,44 +3,94 @@ from typing import Any, Optional, Sequence, Type from sklearn.cluster import Birch +from sklearn.tree._tree import Tree -from ._general import TypeNode +from ._audit import Node, get_tree +from ._general import TypeNode, unsupported_get_state from ._protocol import PROTOCOL +from ._utils import LoadContext, SaveContext, get_module, get_state, gettype +from .exceptions import UnsupportedTypeException try: # TODO: remove once support for sklearn<1.2 is dropped. See #187 from sklearn.covariance._graph_lasso import _DictWithDeprecatedKeys except ImportError: _DictWithDeprecatedKeys = None + from sklearn.linear_model._sgd_fast import ( EpsilonInsensitive, Hinge, - Huber, - Log, - LossFunction, ModifiedHuber, SquaredEpsilonInsensitive, SquaredHinge, - SquaredLoss, ) -from sklearn.tree._tree import Tree -from ._audit import Node, get_tree -from ._general import unsupported_get_state -from ._utils import LoadContext, SaveContext, get_module, get_state, gettype -from .exceptions import UnsupportedTypeException - -ALLOWED_SGD_LOSSES = { - ModifiedHuber, - Hinge, - SquaredHinge, - Log, - SquaredLoss, - Huber, +ALLOWED_LOSSES = { EpsilonInsensitive, + Hinge, + ModifiedHuber, SquaredEpsilonInsensitive, + SquaredHinge, } +try: + # TODO: remove once support for sklearn<1.6 is dropped. + from sklearn.linear_model._sgd_fast import ( + Huber, + Log, + SquaredLoss, + ) + + ALLOWED_LOSSES |= { + Huber, + Log, + SquaredLoss, + } +except ImportError: + pass + +try: + # sklearn>=1.6 + from sklearn._loss._loss import ( + CyAbsoluteError, + CyExponentialLoss, + CyHalfBinomialLoss, + CyHalfGammaLoss, + CyHalfMultinomialLoss, + CyHalfPoissonLoss, + CyHalfSquaredError, + CyHalfTweedieLoss, + CyHalfTweedieLossIdentity, + CyHuberLoss, + CyPinballLoss, + ) + + ALLOWED_LOSSES |= { + CyAbsoluteError, + CyExponentialLoss, + CyHalfBinomialLoss, + CyHalfGammaLoss, + CyHalfMultinomialLoss, + CyHalfPoissonLoss, + CyHalfSquaredError, + CyHalfTweedieLoss, + CyHalfTweedieLossIdentity, + CyHuberLoss, + CyPinballLoss, + } +except ImportError: + pass + +# This import is for the parent class of all loss functions, which is used to +# set the dispatch function for all loss functions. +try: + # From sklearn>=1.6 + from sklearn._loss._loss import CyLossFunction as ParentLossClass +except ImportError: + # sklearn<1.6 + from sklearn.linear_model._sgd_fast import LossFunction as ParentLossClass + + UNSUPPORTED_TYPES = {Birch} @@ -163,13 +213,13 @@ def __init__( super().__init__(state, load_context, constructor=Tree, trusted=self.trusted) -def sgd_loss_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: +def loss_get_state(obj: Any, save_context: SaveContext) -> dict[str, Any]: state = reduce_get_state(obj, save_context) - state["__loader__"] = "SGDNode" + state["__loader__"] = "LossNode" return state -class SGDNode(ReduceNode): +class LossNode(ReduceNode): def __init__( self, state: dict[str, Any], @@ -178,7 +228,7 @@ def __init__( ) -> None: # TODO: make sure trusted here makes sense and used. self.trusted = self._get_trusted( - trusted, [get_module(x) + "." + x.__name__ for x in ALLOWED_SGD_LOSSES] + trusted, [get_module(x) + "." 
+ x.__name__ for x in ALLOWED_LOSSES] ) super().__init__( state, @@ -240,15 +290,16 @@ def _construct(self): # tuples of type and function that gets the state of that type GET_STATE_DISPATCH_FUNCTIONS = [ - (LossFunction, sgd_loss_get_state), + (ParentLossClass, loss_get_state), (Tree, tree_get_state), ] + for type_ in UNSUPPORTED_TYPES: GET_STATE_DISPATCH_FUNCTIONS.append((type_, unsupported_get_state)) # tuples of type and function that creates the instance of that type -NODE_TYPE_MAPPING = { - ("SGDNode", PROTOCOL): SGDNode, +NODE_TYPE_MAPPING: dict[tuple[str, int], Any] = { + ("LossNode", PROTOCOL): LossNode, ("TreeNode", PROTOCOL): TreeNode, } diff --git a/skops/io/tests/_utils.py b/skops/io/tests/_utils.py index b1f41c25..66dbf47f 100644 --- a/skops/io/tests/_utils.py +++ b/skops/io/tests/_utils.py @@ -44,36 +44,38 @@ def _is_steps_like(obj): return True -def _assert_generic_objects_equal(val1, val2): +def _assert_generic_objects_equal(val1, val2, path=""): def _is_builtin(val): # Check if value is a builtin type return getattr(getattr(val, "__class__", {}), "__module__", None) == "builtins" if isinstance(val1, (list, tuple, np.ndarray)): - assert len(val1) == len(val2) + assert len(val1) == len(val2), f"Path: len({path})" for subval1, subval2 in zip(val1, val2): - _assert_generic_objects_equal(subval1, subval2) + _assert_generic_objects_equal(subval1, subval2, path=f"{path}[]") return - assert type(val1) == type(val2) + assert type(val1) == type(val2), f"Path: type({path})" if hasattr(val1, "__dict__"): - assert_params_equal(val1.__dict__, val2.__dict__) + assert_params_equal(val1.__dict__, val2.__dict__, path=f"{path}.__dict__") elif _is_builtin(val1): - assert val1 == val2 + assert val1 == val2, f"Path: {path}" else: # not a normal Python class, could be e.g. a Cython class - _assert_tuples_equal(val1.__reduce__(), val2.__reduce__()) + _assert_tuples_equal( + val1.__reduce__(), val2.__reduce__(), path=f"{path}.__reduce__" + ) -def _assert_tuples_equal(val1, val2): - assert len(val1) == len(val2) +def _assert_tuples_equal(val1, val2, path=""): + assert len(val1) == len(val2), f"Path: len({path})" for subval1, subval2 in zip(val1, val2): - _assert_vals_equal(subval1, subval2) + _assert_vals_equal(subval1, subval2, path=f"{path}[]") -def _assert_vals_equal(val1, val2): +def _assert_vals_equal(val1, val2, path=""): if isinstance(val1, type): # e.g. could be np.int64 - assert val1 is val2 + assert val1 is val2, f"Path: {path}" elif hasattr(val1, "__getstate__") and (val1.__getstate__() is not None): # This includes BaseEstimator since they implement __getstate__ and # that returns the parameters as well. @@ -82,53 +84,59 @@ def _assert_vals_equal(val1, val2): # Some objects return a tuple of parameters, others a dict. 
state1 = val1.__getstate__() state2 = val2.__getstate__() - assert type(state1) == type(state2) + assert type(state1) == type(state2), f"Path: {path}" if isinstance(state1, tuple): - _assert_tuples_equal(state1, state2) + _assert_tuples_equal(state1, state2, path=path) else: - assert_params_equal(val1.__getstate__(), val2.__getstate__()) + assert_params_equal( + val1.__getstate__(), val2.__getstate__(), path=f"{path}.__getstate__()" + ) elif sparse.issparse(val1): - assert sparse.issparse(val2) and ((val1 - val2).nnz == 0) + assert sparse.issparse(val2) and ((val1 - val2).nnz == 0), f"Path: {path}" elif isinstance(val1, (np.ndarray, np.generic)): if len(val1.dtype) == 0: # for arrays with at least 2 dimensions, check that contiguity is # preserved if val1.squeeze().ndim > 1: - assert val1.flags["C_CONTIGUOUS"] is val2.flags["C_CONTIGUOUS"] - assert val1.flags["F_CONTIGUOUS"] is val2.flags["F_CONTIGUOUS"] + assert ( + val1.flags["C_CONTIGUOUS"] is val2.flags["C_CONTIGUOUS"] + ), f"Path: {path}.flags" + assert ( + val1.flags["F_CONTIGUOUS"] is val2.flags["F_CONTIGUOUS"] + ), f"Path: {path}.flags" if val1.dtype == object: - assert val2.dtype == object - assert val1.shape == val2.shape + assert val2.dtype == object, f"Path: {path}.dtype" + assert val1.shape == val2.shape, f"Path: {path}.shape" for subval1, subval2 in zip(val1, val2): - _assert_generic_objects_equal(subval1, subval2) + _assert_generic_objects_equal(subval1, subval2, path=f"{path}[]") else: # simple comparison of arrays with simple dtypes, almost all # arrays are of this sort. - np.testing.assert_array_equal(val1, val2) + np.testing.assert_array_equal(val1, val2, err_msg=f"Path: {path}") elif len(val1.shape) == 1: # comparing arrays with structured dtypes, but they have to be 1D # arrays. This is what we get from the Tree's state. - assert np.all([x == y for x, y in zip(val1, val2)]) + assert np.all([x == y for x, y in zip(val1, val2)]), f"Path: {path}" else: # we don't know what to do with these values, for now. - assert False + assert False, f"Path: {path}" elif isinstance(val1, (tuple, list)): - _assert_tuples_equal(val1, val2) + _assert_tuples_equal(val1, val2, path=path) elif isinstance(val1, float) and np.isnan(val1): - assert np.isnan(val2) + assert np.isnan(val2), f"Path: {path}" elif isinstance(val1, dict): # dictionaries are compared by comparing their values recursively. 
- assert set(val1.keys()) == set(val2.keys()) + assert set(val1.keys()) == set(val2.keys()), f"Path: {path}.keys()" for key in val1: - _assert_vals_equal(val1[key], val2[key]) + _assert_vals_equal(val1[key], val2[key], path=f"{path}[{key}]") elif hasattr(val1, "__dict__") and hasattr(val2, "__dict__"): - _assert_vals_equal(val1.__dict__, val2.__dict__) + _assert_vals_equal(val1.__dict__, val2.__dict__, path=f"{path}.__dict__") elif isinstance(val1, np.ufunc): - assert val1 == val2 + assert val1 == val2, f"Path: {path}" elif val1.__class__.__module__ == "builtins": - assert val1 == val2 + assert val1 == val2, f"Path: {path}" else: - _assert_generic_objects_equal(val1, val2) + _assert_generic_objects_equal(val1, val2, path=path) def _clean_params(params): @@ -144,34 +152,35 @@ def _clean_params(params): return params -def assert_params_equal(params1, params2): +def assert_params_equal(params1, params2, path=""): # helper function to compare estimator dictionaries of parameters if params1 is None and params2 is None: return params1, params2 = _clean_params(params1), _clean_params(params2) - assert len(params1) == len(params2) - assert set(params1.keys()) == set(params2.keys()) + assert len(params1) == len(params2), f"Path: len({path})" + assert set(params1.keys()) == set(params2.keys()), f"Path: {path}.keys()" for key in params1: with warnings.catch_warnings(): # this is to silence the deprecation warning from _DictWithDeprecatedKeys warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn") val1, val2 = params1[key], params2[key] - assert type(val1) == type(val2) + subpath = f"{path}[{key}]" + assert type(val1) == type(val2), f"Path: type({subpath})" if _is_steps_like(val1): # Deal with Pipeline.steps, FeatureUnion.transformer_list, etc. 
- assert _is_steps_like(val2) + assert _is_steps_like(val2), f"Path: {subpath}" val1, val2 = dict(val1), dict(val2) if isinstance(val1, (tuple, list)): assert len(val1) == len(val2) for subval1, subval2 in zip(val1, val2): - _assert_vals_equal(subval1, subval2) + _assert_vals_equal(subval1, subval2, path=f"{subpath}[]") elif isinstance(val1, dict): - assert_params_equal(val1, val2) + assert_params_equal(val1, val2, path=subpath) else: - _assert_vals_equal(val1, val2) + _assert_vals_equal(val1, val2, path=subpath) def assert_method_outputs_equal(estimator, loaded, X): diff --git a/skops/io/tests/test_external.py b/skops/io/tests/test_external.py index d9fa7916..63ee0a78 100644 --- a/skops/io/tests/test_external.py +++ b/skops/io/tests/test_external.py @@ -289,6 +289,15 @@ def test_ranker(self, xgboost, rank_data, trusted, booster, tree_method): class TestCatboost: """Tests for CatBoostClassifier, CatBoostRegressor, and CatBoostRanker""" + @pytest.fixture(autouse=True) + def catboost(self): + """Skip all tests in this class if catboost is not available.""" + try: + catboost = pytest.importorskip("catboost") + except (ImportError, ValueError): # ValueError for numpy2 incompatibility + pytest.skip("Catboost not available or incompatible") + return catboost + @pytest.fixture(autouse=True) def capture_stdout(self): # Mock print and rich.print so that running these tests with pytest -s @@ -317,15 +326,6 @@ def cb_rank_data(self, rank_data): group_id = sum([[i] * n for i, n in enumerate(group)], []) return X, y, group_id - @pytest.fixture(autouse=True) - def catboost(self): - try: - catboost = pytest.importorskip("catboost") - except ValueError: # TODO(numpy2) remove when catboost supports numpy2 - pytest.skip("Catboost not supporting numpy2 yet") - - return catboost - @pytest.fixture def trusted(self): # TODO: adjust once more types are trusted by default diff --git a/skops/io/tests/test_persist.py b/skops/io/tests/test_persist.py index 34b4dae6..649577e9 100644 --- a/skops/io/tests/test_persist.py +++ b/skops/io/tests/test_persist.py @@ -13,6 +13,7 @@ import joblib import numpy as np import pytest +import sklearn from scipy import sparse, special from sklearn.base import BaseEstimator, is_regressor from sklearn.compose import ColumnTransformer @@ -42,13 +43,8 @@ StandardScaler, ) from sklearn.utils import all_estimators, check_random_state -from sklearn.utils._tags import _safe_tags from sklearn.utils._testing import SkipTest, set_random_state -from sklearn.utils.estimator_checks import ( - _construct_instance, - _enforce_estimator_tags_y, - _get_check_estimator_ids, -) +from sklearn.utils.estimator_checks import _get_check_estimator_ids from sklearn.utils.fixes import parse_version, sp_version import skops @@ -66,9 +62,15 @@ from skops.io._utils import LoadContext, SaveContext, _get_state, get_state, gettype from skops.io.exceptions import UnsupportedTypeException, UntrustedTypesFoundException from skops.io.tests._utils import assert_method_outputs_equal, assert_params_equal +from skops.utils._fixes import ( + _enforce_estimator_tags_X, + _enforce_estimator_tags_y, + construct_instances, + get_tags, +) # Default settings for X -N_SAMPLES = 50 +N_SAMPLES = 120 N_FEATURES = 20 @@ -130,6 +132,7 @@ def _tested_estimators(type_filter=None): for name, Estimator in all_estimators(type_filter=type_filter): if Estimator in UNSUPPORTED_TYPES: continue + try: # suppress warnings here for skipped estimators. 
with warnings.catch_warnings(): @@ -145,19 +148,30 @@ def _tested_estimators(type_filter=None): # scikit-learn < 1.4.0) is not available in scipy >= 1.11.0. The # default solver will be "highs" from scikit-learn >= 1.4.0. # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.QuantileRegressor.html - estimator = _construct_instance(partial(Estimator, solver="highs")) + estimators = construct_instances(partial(Estimator, solver="highs")) else: - estimator = _construct_instance(Estimator) - # with the kind of data we pass, it needs to be 1 for the few - # estimators which have this. - if "n_components" in estimator.get_params(): - estimator.set_params(n_components=1) - # Then n_best needs to be <= n_components - if "n_best" in estimator.get_params(): - estimator.set_params(n_best=1) - if "patch_size" in estimator.get_params(): - # set patch size to fix PatchExtractor test. - estimator.set_params(patch_size=(3, 3)) + estimators = construct_instances(Estimator) + + for estimator in estimators: + # with the kind of data we pass, it needs to be 1 for the few + # estimators which have this. + if "n_components" in estimator.get_params(): + estimator.set_params(n_components=1) + # Then n_best needs to be <= n_components + if "n_best" in estimator.get_params(): + estimator.set_params(n_best=1) + if "patch_size" in estimator.get_params(): + # set patch size to fix PatchExtractor test. + estimator.set_params(patch_size=(3, 3)) + if "skewedness" in estimator.get_params(): + # prevent data generation errors for SkewedChi2Sampler + estimator.set_params(skewedness=20) + if estimator.__class__.__name__ == "GraphicalLasso": + # prevent data generation errors + estimator.set_params(alpha=1) + if estimator.__class__.__name__ == "GraphicalLassoCV": + # prevent data generation errors + estimator.set_params(alphas=[1, 2]) except SkipTest: continue @@ -267,17 +281,19 @@ def _unsupported_estimators(type_filter=None): category=SkipTestWarning, message="Can't instantiate estimator", ) - estimator = _construct_instance(Estimator) + # Get the first instance directly from the generator + estimators = construct_instances(Estimator) # with the kind of data we pass, it needs to be 1 for the few # estimators which have this. - if "n_components" in estimator.get_params(): - estimator.set_params(n_components=1) - # Then n_best needs to be <= n_components - if "n_best" in estimator.get_params(): - estimator.set_params(n_best=1) - if "patch_size" in estimator.get_params(): - # set patch size to fix PatchExtractor test. - estimator.set_params(patch_size=(3, 3)) + for estimator in estimators: + if "n_components" in estimator.get_params(): + estimator.set_params(n_components=1) + # Then n_best needs to be <= n_components + if "n_best" in estimator.get_params(): + estimator.set_params(n_best=1) + if "patch_size" in estimator.get_params(): + # set patch size to fix PatchExtractor test. 
+ estimator.set_params(patch_size=(3, 3)) except SkipTest: continue @@ -311,37 +327,45 @@ def get_input(estimator): n_samples=N_SAMPLES, n_features=N_FEATURES, random_state=0 ) y = _enforce_estimator_tags_y(estimator, y) - tags = _safe_tags(estimator) + X = _enforce_estimator_tags_X(estimator, X) - if tags["pairwise"] is True: - return np.random.rand(N_FEATURES, N_FEATURES), None + tags = get_tags(estimator) - if "2darray" in tags["X_types"]: + if tags.input_tags.pairwise: + # return a square matrix of size N_FEATURES x N_FEATURES and positive values + return np.abs(X[:N_FEATURES, :N_FEATURES]), y[:N_FEATURES] + + if tags.input_tags.positive_only: # Some models require positive X return np.abs(X), y - if "1darray" in tags["X_types"]: + if tags.input_tags.two_d_array: + return X, y + + if tags.input_tags.one_d_array: + if X.ndim == 1: + return X, y return X[:, 0], y - if "3darray" in tags["X_types"]: + if tags.input_tags.three_d_array: return load_sample_images().images[1], None - if "1dlabels" in tags["X_types"]: + if tags.target_tags.one_d_labels: # model only expects y return y, None - if "2dlabels" in tags["X_types"]: + if tags.target_tags.two_d_labels: return [(1, 2), (3,)], None - if "categorical" in tags["X_types"]: + if tags.input_tags.categorical: X = [["Male", 1], ["Female", 3], ["Female", 2]] - y = y[: len(X)] if tags["requires_y"] else None + y = y[: len(X)] if tags.target_tags.required else None return X, y - if "dict" in tags["X_types"]: + if tags.input_tags.dict: return [{"foo": 1, "bar": 2}, {"foo": 3, "baz": 1}], None - if "string" in tags["X_types"]: + if tags.input_tags.string: return [ "This is the first document.", "This document is the second document.", @@ -349,11 +373,11 @@ def get_input(estimator): "Is this the first document?", ], None - if tags["X_types"] == "sparse": + if tags.input_tags.sparse: # TfidfTransformer in sklearn 0.24 needs this return sparse.csr_matrix(X), y - raise ValueError(f"Unsupported X type for estimator: {tags['X_types']}") + raise ValueError(f"Unsupported X type for estimator: {tags.input_tags}") @pytest.mark.parametrize( @@ -363,9 +387,27 @@ def test_can_persist_fitted(estimator): """Check that fitted estimators can be persisted and return the right results.""" set_random_state(estimator, random_state=0) + # A list of estimators which fail on sklearn versions indicated in the list. + xfail = [ + # These are related to loss classes not having the right __reduce__ method. 
+ ("PassiveAggressiveClassifier", ["1.4", "1.5"]), + ("SGDClassifier", ["1.4", "1.5"]), + ("SGDOneClassSVM", ["1.4", "1.5"]), + ("TweedieRegressor", ["1.4", "1.5"]), + ] + + if any( + estimator.__class__.__name__ == name and sklearn.__version__.startswith(version) + for name, versions in xfail + for version in versions + ): + pytest.xfail( + f"Known issue with {estimator.__class__.__name__} on sklearn version" + f" {sklearn.__version__}" + ) + X, y = get_input(estimator) - tags = _safe_tags(estimator) - if tags.get("requires_fit", True): + if get_tags(estimator).requires_fit: with warnings.catch_warnings(): warnings.filterwarnings("ignore", module="sklearn") if y is not None: @@ -415,10 +457,8 @@ def test_can_trust_types(type_): def test_unsupported_type_raises(estimator): """Estimators that are known to fail should raise an error""" set_random_state(estimator, random_state=0) - X, y = get_input(estimator) - tags = _safe_tags(estimator) - if tags.get("requires_fit", True): + if get_tags(estimator).requires_fit: with warnings.catch_warnings(): warnings.filterwarnings("ignore", module="sklearn") if y is not None: diff --git a/skops/utils/_fixes.py b/skops/utils/_fixes.py index 212cb08f..2c16f7ee 100644 --- a/skops/utils/_fixes.py +++ b/skops/utils/_fixes.py @@ -1,3 +1,101 @@ +from __future__ import annotations + +import sys +from dataclasses import dataclass, field + +try: + # new in sklearn 1.1 + from sklearn.utils.estimator_checks import ( + _enforce_estimator_tags_X, + _enforce_estimator_tags_y, + ) +except ImportError: + import numpy as np + from sklearn.metrics.pairwise import linear_kernel, pairwise_distances + + def _enforce_estimator_tags_y(estimator, y): + # Estimators with a `requires_positive_y` tag only accept strictly positive + # data + tags = get_tags(estimator) + if tags.target_tags.positive_only: + # Create strictly positive y. The minimal increment above 0 is 1, as + # y could be of integer dtype. + y += 1 + abs(y.min()) + if ( + tags.classifier_tags is not None + and not tags.classifier_tags.multi_class + and y.size > 0 + ): + y = np.where(y == y.flat[0], y, y.flat[0] + 1) + # Estimators in mono_output_task_error raise ValueError if y is of 1-D + # Convert into a 2-D y for those estimators. + if tags.target_tags.multi_output and not tags.target_tags.single_output: + return np.reshape(y, (-1, 1)) + return y + + def _enforce_estimator_tags_X(estimator, X, X_test=None, kernel=linear_kernel): + def _is_pairwise_metric(estimator): + """Returns True if estimator accepts pairwise metric. + + Parameters + ---------- + estimator : object + Estimator object to test. + + Returns + ------- + out : bool + True if _pairwise is set to True and False otherwise. 
+ """ + metric = getattr(estimator, "metric", None) + + return bool(metric == "precomputed") + + # Estimators with `1darray` in `X_types` tag only accept + # X of shape (`n_samples`,) + if get_tags(estimator).input_tags.one_d_array: + X = X[:, 0] + if X_test is not None: + X_test = X_test[:, 0] # pragma: no cover + # Estimators with a `requires_positive_X` tag only accept + # strictly positive data + if get_tags(estimator).input_tags.positive_only: + X = X - X.min() + if X_test is not None: + X_test = X_test - X_test.min() # pragma: no cover + if get_tags(estimator).input_tags.categorical: + dtype = np.float64 if get_tags(estimator).input_tags.allow_nan else np.int32 + X = np.round((X - X.min())).astype(dtype) + if X_test is not None: + X_test = np.round((X_test - X_test.min())).astype( + dtype + ) # pragma: no cover + + if estimator.__class__.__name__ == "SkewedChi2Sampler": + # SkewedChi2Sampler requires X > -skewdness in transform + X = X - X.min() + if X_test is not None: + X_test = X_test - X_test.min() # pragma: no cover + + X_res = X + + # Pairwise estimators only accept + # X of shape (`n_samples`, `n_samples`) + if _is_pairwise_metric(estimator): + X_res = pairwise_distances(X, metric="euclidean") + if X_test is not None: + X_test = pairwise_distances( + X_test, X, metric="euclidean" + ) # pragma: no cover + elif get_tags(estimator).input_tags.pairwise: + X_res = kernel(X, X) + if X_test is not None: + X_test = kernel(X_test, X) # pragma: no cover + if X_test is not None: + return X_res, X_test + return X_res + + def boxplot(ax, *, tick_labels, **kwargs): """A function to handle labels->tick_labels deprecation. labels is deprecated in 3.9 and removed in 3.11. @@ -6,3 +104,368 @@ def boxplot(ax, *, tick_labels, **kwargs): return ax.boxplot(tick_labels=tick_labels, **kwargs) except TypeError: return ax.boxplot(labels=tick_labels, **kwargs) + + +def construct_instances(estimator): + """Create a test instance of an estimator for compatibility testing. + + This function provides compatibility between different scikit-learn versions + (before and after 1.6) for creating test instances of estimators. It handles + the API change where _construct_instances was moved from estimator_checks to + instance_generator. + """ + try: + from sklearn.utils._test_common.instance_generator import _construct_instances + + return list(_construct_instances(estimator)) + + except ImportError: + from sklearn.utils.estimator_checks import _construct_instance + + return [_construct_instance(estimator)] + + +""" +Estimator Tags +-------------- +The following code implements a tag system for scikit-learn estimators that provides +metadata about their capabilities and requirements. This includes support for both +the new Tags dataclass format (sklearn >= 1.6) and backwards compatibility with +the dictionary format (sklearn < 1.6). + +Most of the code below is copied from scikit-learn: + `link to commit `_ + +This code can be removed when support for scikit-learn < 1.6 is dropped. +""" + + +def get_tags(estimator): + """Get estimator tags in a consistent format across different sklearn versions. + + This function provides compatibility between sklearn versions before and after 1.6. + It returns either a Tags object (sklearn >= 1.6) or a converted Tags object from + the dictionary format (sklearn < 1.6) containing metadata about the estimator's + requirements and capabilities. + + Parameters + ---------- + estimator : estimator object + A scikit-learn estimator instance. 
+ + Returns + ------- + tags : Tags + An object containing metadata about the estimator's requirements and + capabilities (e.g., input types, fitting requirements, classifier/regressor + specific tags). + """ + try: + from sklearn.utils._tags import get_tags + + return get_tags(estimator) + except ImportError: + from sklearn.utils._tags import _safe_tags + + return _to_new_tags(_safe_tags(estimator), estimator) + + +def _dataclass_args(): + if sys.version_info < (3, 10): + return {} + return {"slots": True} + + +@dataclass(**_dataclass_args()) +class InputTags: + """Tags for the input data. + + Parameters + ---------- + one_d_array : bool, default=False + Whether the input can be a 1D array. + + two_d_array : bool, default=True + Whether the input can be a 2D array. Note that most common + tests currently run only if this flag is set to ``True``. + + three_d_array : bool, default=False + Whether the input can be a 3D array. + + sparse : bool, default=False + Whether the input can be a sparse matrix. + + categorical : bool, default=False + Whether the input can be categorical. + + string : bool, default=False + Whether the input can be an array-like of strings. + + dict : bool, default=False + Whether the input can be a dictionary. + + positive_only : bool, default=False + Whether the estimator requires positive X. + + allow_nan : bool, default=False + Whether the estimator supports data with missing values encoded as `np.nan`. + + pairwise : bool, default=False + This boolean attribute indicates whether the data (`X`), + :term:`fit` and similar methods consists of pairwise measures + over samples rather than a feature representation for each + sample. It is usually `True` where an estimator has a + `metric` or `affinity` or `kernel` parameter with value + 'precomputed'. Its primary purpose is to support a + :term:`meta-estimator` or a cross validation procedure that + extracts a sub-sample of data intended for a pairwise + estimator, where the data needs to be indexed on both axes. + Specifically, this tag is used by + `sklearn.utils.metaestimators._safe_split` to slice rows and + columns. + """ + + one_d_array: bool = False + two_d_array: bool = True + three_d_array: bool = False + sparse: bool = False + categorical: bool = False + string: bool = False + dict: bool = False + positive_only: bool = False + allow_nan: bool = False + pairwise: bool = False + + +@dataclass(**_dataclass_args()) +class TargetTags: + """Tags for the target data. + + Parameters + ---------- + required : bool + Whether the estimator requires y to be passed to `fit`, + `fit_predict` or `fit_transform` methods. The tag is ``True`` + for estimators inheriting from `~sklearn.base.RegressorMixin` + and `~sklearn.base.ClassifierMixin`. + + one_d_labels : bool, default=False + Whether the input is a 1D labels (y). + + two_d_labels : bool, default=False + Whether the input is a 2D labels (y). + + positive_only : bool, default=False + Whether the estimator requires a positive y (only applicable + for regression). + + multi_output : bool, default=False + Whether a regressor supports multi-target outputs or a classifier supports + multi-class multi-output. + + single_output : bool, default=True + Whether the target can be single-output. This can be ``False`` if the + estimator supports only multi-output cases. 
+ """ + + required: bool + one_d_labels: bool = False + two_d_labels: bool = False + positive_only: bool = False + multi_output: bool = False + single_output: bool = True + + +@dataclass(**_dataclass_args()) +class TransformerTags: + """Tags for the transformer. + + Parameters + ---------- + preserves_dtype : list[str], default=["float64"] + Applies only on transformers. It corresponds to the data types + which will be preserved such that `X_trans.dtype` is the same + as `X.dtype` after calling `transformer.transform(X)`. If this + list is empty, then the transformer is not expected to + preserve the data type. The first value in the list is + considered as the default data type, corresponding to the data + type of the output when the input data type is not going to be + preserved. + """ + + preserves_dtype: list[str] = field(default_factory=lambda: ["float64"]) + + +@dataclass(**_dataclass_args()) +class ClassifierTags: + """Tags for the classifier. + + Parameters + ---------- + poor_score : bool, default=False + Whether the estimator fails to provide a "reasonable" test-set + score, which currently for classification is an accuracy of + 0.83 on ``make_blobs(n_samples=300, random_state=0)``. The + datasets and values are based on current estimators in scikit-learn + and might be replaced by something more systematic. + + multi_class : bool, default=True + Whether the classifier can handle multi-class + classification. Note that all classifiers support binary + classification. Therefore this flag indicates whether the + classifier is a binary-classifier-only or not. + + multi_label : bool, default=False + Whether the classifier supports multi-label output. + """ + + poor_score: bool = False + multi_class: bool = True + multi_label: bool = False + + +@dataclass(**_dataclass_args()) +class RegressorTags: + """Tags for the regressor. + + Parameters + ---------- + poor_score : bool, default=False + Whether the estimator fails to provide a "reasonable" test-set + score, which currently for regression is an R2 of 0.5 on + ``make_regression(n_samples=200, n_features=10, + n_informative=1, bias=5.0, noise=20, random_state=42)``. The + dataset and values are based on current estimators in scikit-learn + and might be replaced by something more systematic. + + multi_label : bool, default=False + Whether the regressor supports multilabel output. + """ + + poor_score: bool = False + multi_label: bool = False + + +@dataclass(**_dataclass_args()) +class Tags: + """Tags for the estimator. + + See :ref:`estimator_tags` for more information. + + Parameters + ---------- + estimator_type : str or None + The type of the estimator. Can be one of: + - "classifier" + - "regressor" + - "transformer" + - "clusterer" + - "outlier_detector" + - "density_estimator" + + target_tags : :class:`TargetTags` + The target(y) tags. + + transformer_tags : :class:`TransformerTags` or None + The transformer tags. + + classifier_tags : :class:`ClassifierTags` or None + The classifier tags. + + regressor_tags : :class:`RegressorTags` or None + The regressor tags. + + array_api_support : bool, default=False + Whether the estimator supports Array API compatible inputs. + + no_validation : bool, default=False + Whether the estimator skips input-validation. This is only meant for + stateless and dummy transformers! + + non_deterministic : bool, default=False + Whether the estimator is not deterministic given a fixed ``random_state``. 
+ + requires_fit : bool, default=True + Whether the estimator requires to be fitted before calling one of + `transform`, `predict`, `predict_proba`, or `decision_function`. + + _skip_test : bool, default=False + Whether to skip common tests entirely. Don't use this unless + you have a *very good* reason. + + input_tags : :class:`InputTags` + The input data(X) tags. + """ + + estimator_type: str | None + target_tags: TargetTags + transformer_tags: TransformerTags | None = None + classifier_tags: ClassifierTags | None = None + regressor_tags: RegressorTags | None = None + array_api_support: bool = False + no_validation: bool = False + non_deterministic: bool = False + requires_fit: bool = True + _skip_test: bool = False + input_tags: InputTags = field(default_factory=InputTags) + + +def _to_new_tags(old_tags, estimator=None): + """Utility function convert old tags (dictionary) to new tags (dataclass).""" + input_tags = InputTags( + one_d_array="1darray" in old_tags["X_types"], + two_d_array="2darray" in old_tags["X_types"], + three_d_array="3darray" in old_tags["X_types"], + sparse="sparse" in old_tags["X_types"], + categorical="categorical" in old_tags["X_types"], + string="string" in old_tags["X_types"], + dict="dict" in old_tags["X_types"], + positive_only=old_tags["requires_positive_X"], + allow_nan=old_tags["allow_nan"], + pairwise=old_tags["pairwise"], + ) + target_tags = TargetTags( + required=old_tags["requires_y"], + one_d_labels="1dlabels" in old_tags["X_types"], + two_d_labels="2dlabels" in old_tags["X_types"], + positive_only=old_tags["requires_positive_y"], + multi_output=old_tags["multioutput"] or old_tags["multioutput_only"], + single_output=not old_tags["multioutput_only"], + ) + if estimator is not None and ( + hasattr(estimator, "transform") or hasattr(estimator, "fit_transform") + ): + transformer_tags = TransformerTags( + preserves_dtype=old_tags["preserves_dtype"], + ) + else: + transformer_tags = None + estimator_type = getattr(estimator, "_estimator_type", None) + if estimator_type == "classifier": + classifier_tags = ClassifierTags( + poor_score=old_tags["poor_score"], + multi_class=not old_tags["binary_only"], + multi_label=old_tags["multilabel"], + ) + else: + classifier_tags = None + if estimator_type == "regressor": + regressor_tags = RegressorTags( + poor_score=old_tags["poor_score"], + multi_label=old_tags["multilabel"], + ) + else: + regressor_tags = None + return Tags( + estimator_type=estimator_type, + target_tags=target_tags, + transformer_tags=transformer_tags, + classifier_tags=classifier_tags, + regressor_tags=regressor_tags, + input_tags=input_tags, + array_api_support=old_tags["array_api_support"], + no_validation=old_tags["no_validation"], + non_deterministic=old_tags["non_deterministic"], + requires_fit=old_tags["requires_fit"], + _skip_test=old_tags["_skip_test"], + ) diff --git a/spaces/skops_model_card_creator/app.py b/spaces/skops_model_card_creator/app.py index 446ddd62..dd6cac34 100644 --- a/spaces/skops_model_card_creator/app.py +++ b/spaces/skops_model_card_creator/app.py @@ -11,10 +11,11 @@ from typing import Literal import streamlit as st -from create import create_repo_input_form -from edit import edit_input_form -from gethelp import help_page -from start import start_input_form + +from .create import create_repo_input_form +from .edit import edit_input_form +from .help import help_page +from .start import start_input_form # Change cwd to a temporary path if "work_dir" not in st.session_state: diff --git 
a/spaces/skops_model_card_creator/create.py b/spaces/skops_model_card_creator/create.py index 7ac069c1..33022b94 100644 --- a/spaces/skops_model_card_creator/create.py +++ b/spaces/skops_model_card_creator/create.py @@ -2,10 +2,11 @@ from pathlib import Path import streamlit as st -from utils import get_rendered_model_card from skops import hub_utils +from .utils import get_rendered_model_card + def _add_back_button(): def fn(): diff --git a/spaces/skops_model_card_creator/edit.py b/spaces/skops_model_card_creator/edit.py index 6d0f5bb7..1d736c33 100644 --- a/spaces/skops_model_card_creator/edit.py +++ b/spaces/skops_model_card_creator/edit.py @@ -32,7 +32,11 @@ import streamlit as st from huggingface_hub import hf_hub_download -from tasks import ( + +from skops import card +from skops.card._model_card import PlotSection, split_subsection_names + +from .tasks import ( AddFigureTask, AddMetricsTask, AddSectionTask, @@ -42,15 +46,12 @@ UpdateFigureTitleTask, UpdateSectionTask, ) -from utils import ( +from .utils import ( get_rendered_model_card, iterate_key_section_content, process_card_for_rendering, ) -from skops import card -from skops.card._model_card import PlotSection, split_subsection_names - arepr = reprlib.Repr() arepr.maxstring = 24 tmp_path = Path(mkdtemp(prefix="skops-")) # temporary files diff --git a/spaces/skops_model_card_creator/requirements.txt b/spaces/skops_model_card_creator/requirements.txt index e1747269..bbbe7a10 100644 --- a/spaces/skops_model_card_creator/requirements.txt +++ b/spaces/skops_model_card_creator/requirements.txt @@ -1,4 +1,6 @@ -catboost +# remove python constraint when catboost supports 3.13 +# https://github.com/catboost/catboost/issues/2748 +catboost; python_version < "3.13" huggingface_hub lightgbm pandas
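
A minimal usage sketch (not part of the patch itself) of the compatibility helpers added in `skops/utils/_fixes.py` above. It assumes a scikit-learn installation (either <1.6 or >=1.6) and uses `LogisticRegression` purely as an example estimator; `get_tags` and `construct_instances` pick the matching scikit-learn implementation at import time, which is what lets the scripts and tests in this patch drop the private `_safe_tags` / `_construct_instance` helpers that changed in scikit-learn 1.6.

```python
# Sketch only: assumes the skops.utils._fixes module from this patch is importable.
from sklearn.linear_model import LogisticRegression

from skops.utils._fixes import construct_instances, get_tags

# construct_instances always returns a list of estimator instances, regardless of
# whether sklearn provides _construct_instances (>=1.6) or _construct_instance (<1.6).
for estimator in construct_instances(LogisticRegression):
    # get_tags returns a Tags dataclass on every supported sklearn version,
    # either sklearn's own (>=1.6) or the backport defined in _fixes.py (<1.6).
    tags = get_tags(estimator)
    if tags.requires_fit:
        print(f"{estimator.__class__.__name__} must be fitted before persisting")
    if tags.input_tags.sparse:
        print("accepts scipy sparse input")
    if tags.target_tags.required:
        print("requires y to be passed to fit")
```

The same pattern replaces `_safe_tags(estimator).get("requires_fit", True)` in `scripts/check_file_size.py`, `scripts/check_persistence_performance.py`, and `skops/io/tests/test_persist.py` earlier in this diff.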