Source code for EDAhelper.EDAhelper

import pandas as pd
import numpy as np


def preprocess(path, method=None, fill_value=None, read_func=pd.read_csv, **kwarg):
    """
    Load a data file and impute missing values in its numeric columns.

    Only columns selected by ``select_dtypes(include='number')`` are imputed;
    all other columns are returned exactly as ``read_func`` produced them.

    Parameters
    ----------
    path : str
        The path of the data file.
    method : {None, 'most_frequent', 'mean', 'median', 'constant'}, default=None
        The imputation method.
        If None, the table is returned as read (missing values stay NaN).
        If 'mean', replace missing values using the mean along each column.
        If 'median', replace missing values using the median along each column.
        If 'most_frequent', replace missing values using the most frequent
        value along each column. If there is more than one such value, only
        the smallest is used. A column with no observed values has no mode
        and is left unchanged.
        If 'constant', replace missing values with `fill_value`.
    fill_value : {None, numerical values}, default=None
        When method='constant', `fill_value` is used to replace all
        occurrences of missing values. If left to the default, 0 is used.
    read_func : pandas.read_* function, default=pandas.read_csv
        Any pandas reader (e.g. read_csv, read_fwf, read_pickle). It is
        called as ``read_func(path, **kwarg)``.
    **kwarg : arbitrary keyword arguments
        Forwarded unchanged to `read_func`.

    Returns
    -------
    pandas.DataFrame
        The processed table.

    Raises
    ------
    Exception
        If `path` is not a str; if method='constant' and `fill_value` is
        neither None nor an int/float; or if `method` is not one of the
        supported names and a numeric column with missing values is
        encountered.

    Examples
    --------
    >>> from EDAhelper import EDAhelper
    >>> file_path = 'https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv'
    >>> EDAhelper.preprocess(file_path)
    """
    # Input checks.
    if not isinstance(path, str):
        raise Exception('Err msg: wrong path input')
    if (
        method == 'constant'
        and fill_value is not None
        and not isinstance(fill_value, (float, int))
    ):
        raise Exception("Err msg: wrong fill_value input when method = 'constant'")

    # Errors raised by the reader propagate to the caller unchanged.
    df = read_func(path, **kwarg)

    if method is None:
        return df

    for col in df.select_dtypes(include='number').columns:
        missing = df[col].isnull()
        if not missing.any():
            continue

        if method == 'mean':
            replacement = df[col].mean()
        elif method == 'median':
            replacement = df[col].median()
        elif method == 'most_frequent':
            modes = df[col].mode()
            if modes.empty:
                # Column has no observed values, hence no mode; leave it as
                # NaN (same outcome 'mean'/'median' give for such a column).
                continue
            # mode() returns its values sorted, so the first is the smallest.
            replacement = modes.iloc[0]
        elif method == 'constant':
            replacement = 0 if fill_value is None else fill_value
        else:
            # NOTE: an unknown method is only rejected once a column actually
            # needs imputing; kept this way for backward compatibility.
            raise Exception('Err msg: wrong method input')

        df.loc[missing, col] = replacement

    return df