Skip to content

Commit

Permalink
Merge pull request #57 from UBC-MDS/fix_feat_format
Browse files: browse the repository at this point in the history
Fix feat format
  • Loading branch information
davyxuximin authored Feb 3, 2025
2 parents 5b7a220 + 00ab86d commit 0e8128a
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 10 deletions.
Binary file removed .coverage
Binary file not shown.
15 changes: 7 additions & 8 deletions src/datamop/column_encoder.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import pandas as pd


def column_encoder(df, columns, method='one-hot', order=None):
def column_encoder(data, columns, method='one-hot', order=None):
"""
Encodes categorical columns using one-hot or ordinal encoding based on user input.
Parameters:
-----------
df : pandas.DataFrame
data : pandas.DataFrame
The input DataFrame containing the dataset.
columns : list
The names of the columns to be encoded.
Expand Down Expand Up @@ -40,20 +40,20 @@ def column_encoder(df, columns, method='one-hot', order=None):
Examples:
---------
>>> import pandas as pd
>>> df = pd.DataFrame({
>>> data = pd.DataFrame({
... 'Sport': ['Tennis', 'Basketball', 'Football', 'Badminton'],
... 'Level': ['A', 'B', 'C', 'D']
... })
>>> encoded_df_onehot = column_encoder(df, columns=['Sport'], method='one-hot')
>>> encoded_df_onehot = column_encoder(data, columns=['Sport'], method='one-hot')
>>> print(encoded_df_onehot)
Level Sport_Badminton Sport_Basketball Sport_Football Sport_Tennis
A 0 0 0 1
B 0 1 0 0
C 0 0 1 0
D 1 0 0 0
>>> encoded_df_ordinal = column_encoder(df, columns=['Level'], method='ordinal', order={'Level': ['A', 'B', 'C', 'D']})
>>> encoded_df_ordinal = column_encoder(data, columns=['Level'], method='ordinal', order={'Level': ['A', 'B', 'C', 'D']})
>>> print(encoded_df_ordinal)
Sport Level
Tennis 0
Expand All @@ -63,15 +63,15 @@ def column_encoder(df, columns, method='one-hot', order=None):
"""
#check input type
if not isinstance(df, pd.DataFrame):
if not isinstance(data, pd.DataFrame):
raise TypeError("Input must be a pandas DataFrame")
if not isinstance(columns, list) or not all(isinstance(col, str) for col in columns):
raise TypeError("Columns parameter must be a list of strings")
if not isinstance(method, str):
raise TypeError("Method parameter must be a string")
if method == 'ordinal' and order is not None and not isinstance(order, dict):
raise TypeError("Order parameter must be a dictionary")
encoded_df = df.copy()
encoded_df = data.copy()

if method == 'one-hot':
#check if order is input
Expand Down Expand Up @@ -103,7 +103,6 @@ def column_encoder(df, columns, method='one-hot', order=None):
if column not in columns:
raise ValueError(f"The column '{column}' specified in order is not in the column list")


custom_order = order[column]
unique_values = encoded_df[column].unique()
#check that the order matches the values present in the column
Expand Down
2 changes: 0 additions & 2 deletions src/datamop/column_scaler.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T
return data.copy()

# Error handling

if column not in data.columns:
raise KeyError("Column not found in the DataFrame.")
if not pd.api.types.is_numeric_dtype(data[column]):
Expand All @@ -87,7 +86,6 @@ def column_scaler(data, column, method="minmax", new_min=0, new_max=1, inplace=T
midpoint = (new_min + new_max) / 2
scaled_column = pd.Series([midpoint] * len(data), index=data.index)


# Scale the column
else:
# minmax scaling
Expand Down

0 comments on commit 0e8128a

Please sign in to comment.