diff --git a/src/datamop/column_scaler.py b/src/datamop/column_scaler.py index 3f8b93a..4c742fc 100644 --- a/src/datamop/column_scaler.py +++ b/src/datamop/column_scaler.py @@ -1,4 +1,4 @@ -# Formula in this function is adapted from Scikit Learn Documentation +# Formula in this function is adapted from Scikit Learn documentation # https://scikit-learn.org/1.5/modules/generated/sklearn.preprocessing.MinMaxScaler.html # https://scikit-learn.org/1.6/modules/generated/sklearn.preprocessing.StandardScaler.html @@ -33,8 +33,11 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T Returns -------- pandas.DataFrame - A copy of the DataFrame with the scaled column replacing the original column if `inplace` is set to `True`. - If `inplace` is set to `False`, the copy of DataFrame is returned with the new scaled column added, keeping the original column. + A copy of the DataFrame with the scaled column + replacing the original column if `inplace` is set to `True`. + If `inplace` is set to `False`, + the copy of DataFrame is returned with the new scaled column added, + keeping the original column. Raises ------ @@ -47,7 +50,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T If the `method` is not `minmax` or `standard`. If the `new_min` value is greater or equal to the `new_max` when using `minmax` method. - Examples -------- >>> import pandas as pd @@ -58,7 +60,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T 0.0 0.5 1.0 - """ # Check input is pd.DataFrame if not isinstance(data, pd.DataFrame): @@ -66,7 +67,10 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T # Empty df warning if data.empty: - warnings.warn("Empty DataFrame detected. Empty DataFrame will be returned.", UserWarning) + warnings.warn( + "Empty DataFrame detected. Empty DataFrame will be returned.", + UserWarning + ) return data.copy() # Error handling @@ -79,12 +83,28 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T # Edge case warning if data[column].isna().any(): - warnings.warn("NaN value detected in column '{column}'. They will be unchanged", UserWarning) + warnings.warn( + f"NaN value detected in column '{column}'. They will be unchanged", + UserWarning + ) if data[column].nunique() == 1: - warnings.warn("Single-value column detected. All values will be scaled to the midpoint of the `new_min` and `new_max`.", UserWarning) - midpoint = (new_min + new_max) / 2 - scaled_column = pd.Series([midpoint] * len(data), index=data.index) + if method == "minmax": + warnings.warn( + "Single-value column detected. " + "All values will be scaled to the midpoint of the `new_min` and `new_max`.", + UserWarning + ) + midpoint = (new_min + new_max) / 2 + scaled_column = pd.Series([midpoint] * len(data), index=data.index) + + elif method == "standard": + warnings.warn( + "Standard deviation is zero. " + "All values are set to 0 to prevent division by zero.", + UserWarning + ) + scaled_column = pd.Series([0] * len(data), index=data.index) # Scale the column else: @@ -94,7 +114,12 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T raise ValueError("`new_min` cannot be greater than `new_max`.") min_value = data[column].min() max_value = data[column].max() - scaled_column = ((data[column] - min_value) / (max_value - min_value)) * (new_max - new_min) + new_min + scaled_column = ( + ((data[column] - min_value) / (max_value - min_value)) + * (new_max - new_min) + + new_min + ) + # standard scaling elif method == "standard": mean_value = data[column].mean() diff --git a/tests/test_column_scaler.py b/tests/test_column_scaler.py index 2b01adb..5c3bf99 100644 --- a/tests/test_column_scaler.py +++ b/tests/test_column_scaler.py @@ -9,6 +9,11 @@ def one_column_df(): """Return DataFrame with one column of numeric values. Used for testing.""" return pd.DataFrame({"price": [25, 50, 75]}) +@pytest.fixture +def one_column_df_float(): + """Return DataFrame with one column of floating values. Used for testing.""" + return pd.DataFrame({"price": [25.0, 50.0, 75.0]}) + @pytest.fixture def single_val_df(): """Return DataFrame with one column with single repeated value. Used for testing.""" @@ -26,11 +31,17 @@ def non_numeric_df(): # Expected use case tests def test_minmax_scaling_default(one_column_df): - """Test min-max scaling with default new_min=0 and new_max=1.""" + """Test min-max scaling with default new_min=0 and new_max=1. Use float values.""" scaled_df = column_scaler(one_column_df, column="price", method="minmax") expected = [0.0, 0.5, 1.0] assert scaled_df["price"].tolist() == expected +def test_minmax_scaling_default_float(one_column_df_float): + """Test min-max scaling with default new_min=0 and new_max=1.""" + scaled_df = column_scaler(one_column_df_float, column="price", method="minmax") + expected = [0.0, 0.5, 1.0] + assert scaled_df["price"].tolist() == expected + def test_minmax_scaling_custom(one_column_df): """Test min-max scaling with custom new_min=10 and new_max=20.""" scaled_df = column_scaler(one_column_df, column="price", method="minmax", new_min=10, new_max=20) @@ -58,6 +69,15 @@ def test_single_value_column_minmax(single_val_df): expected = [15.0, 15.0, 15.0] assert scaled_df["price"].tolist() == expected +def test_single_value_column_standard(single_val_df): + """Test standard scaling with column with single repeated values to prevent division by zero.""" + with pytest.warns(UserWarning, + match="Standard deviation is zero"): + scaled_df = column_scaler(single_val_df, column="price", method="standard") + + expected = [0, 0, 0] + assert scaled_df["price"].tolist() == expected + def test_empty_dataframe(empty_df): """Test scaling on empty DataFrame.""" with pytest.warns(UserWarning, match="Empty DataFrame detected"): @@ -73,6 +93,7 @@ def test_column_with_nan(): expected = [0.0, np.nan, 1.0] assert np.allclose(scaled_df["price"], expected, equal_nan=True) + # Erroneous case tests def test_non_numeric_column(non_numeric_df):