Source code for EDAhelper.column_stats

import pandas as pd
import numpy as np
import statistics

def column_stats(data, columns):
    """
    Obtain summary statistics of column(s) including count, mean, median,
    mode, Q1, Q3, variance and standard deviation, together with the
    correlation and covariance matrices of the selected columns.

    Parameters
    -------------
    data: pandas.DataFrame
        The data set from which columns will be selected.
    columns: list, tuple or numpy.ndarray of str
        Names of the columns for which to obtain summary statistics, the
        correlation matrix and the covariance matrix.

    Returns
    -------------
    tuple of (pandas.DataFrame, pandas.DataFrame, pandas.DataFrame)
        ``(summary, correlation, covariance)`` where ``summary`` has one row
        per requested column with the fields 'Column', 'Count', 'Mean',
        'Median', 'Mode', 'Q1', 'Q3', 'Var' and 'Stdev' (numeric values
        rounded to 3 decimals, except 'Mode'), and the other two are the
        results of ``.corr()`` and ``.cov()`` on the selected columns.

    Raises
    -------------
    TypeError
        If ``columns`` is not a list, tuple or array, if any item of
        ``columns`` is not a string, if ``data`` is not a DataFrame, or if
        any value in a selected column is a string.

    Examples
    -------------
    >>> summary, corr, cov = column_stats(
    ...     iris, ('sepal_width', 'petal_length'))  # doctest: +SKIP
    """
    # Validation order is deliberate: callers see the same error for the same
    # bad input as before (columns container, then its items, then data).
    if not isinstance(columns, (list, tuple, np.ndarray)):
        raise TypeError("'columns' should be a list, tuple, or array")

    for column in columns:
        if not isinstance(column, str):
            raise TypeError("item in columns should be of type string")

    if not isinstance(data, pd.DataFrame):
        raise TypeError("data should be a DataFrame")

    # Only string cells are rejected here; other non-numeric values are not
    # screened by this check.
    for column in columns:
        if any(isinstance(value, str) for value in data[column]):
            raise TypeError("values should be of type integer")

    statsdict = {
        'Column': [],
        'Count': [],
        'Mean': [],
        'Median': [],
        'Mode': [],
        'Q1': [],
        'Q3': [],
        'Var': [],
        'Stdev': [],
    }

    for column in columns:
        series = data[column]
        # describe() is computed once per column and reused for every field
        # that is read from it.
        summary = series.describe()

        statsdict['Column'].append(column)
        statsdict['Count'].append(round(float(summary.loc['count']), 3))
        statsdict['Mean'].append(round(float(summary.loc['mean']), 3))
        statsdict['Median'].append(round(float(summary.loc['50%']), 3))
        statsdict['Mode'].append(statistics.mode(series))
        statsdict['Q1'].append(round(float(summary.loc['25%']), 3))
        statsdict['Q3'].append(round(float(summary.loc['75%']), 3))
        statsdict['Var'].append(round(series.var(), 3))
        statsdict['Stdev'].append(round(series.std(), 3))

    # One subset frame serves both the correlation and covariance matrices.
    selected = pd.DataFrame(data, columns=list(columns))

    return pd.DataFrame.from_dict(statsdict), selected.corr(), selected.cov()