Note
Click here to download the full example code
Feature Engineering Tasks#
We’ll define the relevant feature engineering tasks to clean up the SQLite data.
First, let’s import the required libraries.
There are a specific set of columns for which imputation isn’t required. We ignore them.
NO_IMPUTATION_COLS = [
"Hospital Number",
"surgery",
"Age",
"outcome",
"surgical lesion",
"timestamp",
]
We define a mean_median_imputer
task to fill in the missing values of the dataset, for which we use the
SimpleImputer class from the scikit-learn
library.
@task(cache=True, cache_version="1.0")
def mean_median_imputer(
dataframe: pd.DataFrame,
imputation_method: str,
) -> FlyteSchema:
dataframe = dataframe.replace("?", np.nan)
if imputation_method not in ["median", "mean"]:
raise ValueError("imputation_method takes only values 'median' or 'mean'")
imputer = SimpleImputer(missing_values=np.nan, strategy=imputation_method)
imputer = imputer.fit(
dataframe[dataframe.columns[~dataframe.columns.isin(NO_IMPUTATION_COLS)]]
)
dataframe[
dataframe.columns[~dataframe.columns.isin(NO_IMPUTATION_COLS)]
] = imputer.transform(
dataframe[dataframe.columns[~dataframe.columns.isin(NO_IMPUTATION_COLS)]]
)
return dataframe
Let’s define the other task called univariate_selection
that does feature selection.
The SelectKBest method removes all
but the highest scoring features (DataFrame columns).
@task(cache=True, cache_version="1.0")
def univariate_selection(
dataframe: pd.DataFrame, num_features: int, data_class: str
) -> pd.DataFrame:
# Remove ``timestamp`` and ``Hospital Number`` columns as they ought to be present in the dataset
dataframe = dataframe.drop(["event_timestamp", "Hospital Number"], axis=1)
if num_features > 9:
raise ValueError(
f"Number of features must be <= 9; you've given {num_features}"
)
X = dataframe.iloc[:, dataframe.columns != data_class]
y = dataframe.loc[:, data_class]
test = SelectKBest(score_func=f_classif, k=num_features)
fit = test.fit(X, y)
indices = sort((-fit.scores_).argsort()[:num_features])
column_names = list(map(X.columns.__getitem__, indices))
column_names.extend([data_class])
features = fit.transform(X)
return pd.DataFrame(np.c_[features, y.to_numpy()], columns=column_names)
Total running time of the script: ( 0 minutes 0.000 seconds)