import pandas as pd
import matplotlib.pyplot as plt
import ds_charts as ds

from pandas.plotting import register_matplotlib_converters

# Register pandas' datetime converters with matplotlib before any plotting.
register_matplotlib_converters()

# Load the dataset, using the 'date' column as a parsed datetime index and
# treating empty strings as missing values.
# NOTE(review): `infer_datetime_format` is deprecated as of pandas 2.0 --
# confirm the pandas version in use before dropping it.
filename = 'data/algae.csv'
data = pd.read_csv(
    filename,
    index_col='date',
    na_values='',
    parse_dates=True,
    infer_datetime_format=True,
)
# Bare expression: (rows, columns) of the loaded frame.
data.shape

(200, 11)


# Bar chart comparing the number of records with the number of variables.
nr_records, nr_variables = data.shape
values = {'nr records': nr_records, 'nr variables': nr_variables}

plt.figure(figsize=(4, 2))
ds.bar_chart(
    list(values),
    list(values.values()),
    title='Nr of records vs nr variables',
)
plt.savefig('images/records_variables.png')
plt.show()


# Bare expression: evaluates to the dtype pandas assigned to each column.
data.dtypes

pH                float64
Oxygen            float64
Chloride          float64
Nitrates          float64
Ammonium          float64
Orthophosphate    float64
Phosphate         float64
Chlorophyll       float64
fluid_velocity     object
river_depth        object
season             object
dtype: object


# Convert every object-typed column to pandas' 'category' dtype, in place.
cat_vars = data.select_dtypes(include='object')
as_category = cat_vars.apply(lambda col: col.astype('category'))
data[cat_vars.columns] = as_category
# Bare expression: dtypes after the conversion.
data.dtypes

pH                 float64
Oxygen             float64
Chloride           float64
Nitrates           float64
Ammonium           float64
Orthophosphate     float64
Phosphate          float64
Chlorophyll        float64
fluid_velocity    category
river_depth       category
season            category
dtype: object


from numpy import isnan
from datetime import datetime
def get_variable_types(df, nr_symbols=10):
    """Group the columns of ``df`` by inferred variable type.

    Classification rules, applied in order to each column.

    Columns without missing values:
      1. exactly two distinct values      -> 'binary'
      2. datetime dtype                   -> 'date'
      3. fewer than ``nr_symbols`` values -> 'symbolic'
      4. anything else                    -> 'numeric'

    Columns with missing values (missing entries are ignored):
      1. exactly two distinct values      -> 'binary'
      2. every distinct value is a string -> 'symbolic'
         (this also covers a column that is entirely missing)
      3. datetime dtype, or every distinct value is a ``datetime``
                                          -> 'date'
      4. anything else                    -> 'numeric'

    The data frame is only read; no column is converted or modified.

    Args:
        df: pandas DataFrame to profile.
        nr_symbols: exclusive upper bound on the number of distinct values
            for a fully populated column to count as symbolic.

    Returns:
        dict with keys 'binary', 'numeric', 'date' and 'symbolic', each
        mapping to the list of column labels of that type, in column order.
    """
    variable_types = {'binary': [], 'numeric': [], 'date': [], 'symbolic': []}
    for c in df.columns:
        column = df[c]
        # Test the dtype rather than comparing it with the string
        # 'datetime64': pandas stores unit-specific dtypes such as
        # datetime64[ns], and time-zone aware columns use their own dtype.
        is_date = pd.api.types.is_datetime64_any_dtype(column)

        if not column.isna().any():
            nr_uniques = len(column.unique())
            if nr_uniques == 2:
                kind = 'binary'
            elif is_date:
                kind = 'date'
            elif nr_uniques < nr_symbols:
                kind = 'symbolic'
            else:
                kind = 'numeric'
        else:
            # dropna() handles NaN, None and NaT alike; numpy.isnan raises
            # TypeError on strings, which broke every symbolic column that
            # had missing values.
            uniques = list(column.dropna().unique())
            if len(uniques) == 2:
                kind = 'binary'
            elif all(isinstance(v, str) for v in uniques):
                kind = 'symbolic'
            elif is_date or all(isinstance(v, datetime) for v in uniques):
                kind = 'date'
            else:
                kind = 'numeric'
        variable_types[kind].append(c)
    return variable_types


# Bar chart with the number of variables found for each variable type.
variable_types = ds.get_variable_types(data)
counts = {tp: len(columns) for tp, columns in variable_types.items()}

plt.figure(figsize=(4, 2))
ds.bar_chart(
    list(counts),
    list(counts.values()),
    title='Nr of variables per type',
)
plt.savefig('images/variable_types.png')
plt.show()


# Bar chart with the number of missing values per variable; variables
# without any missing value are left out.
nr_missing = {var: data[var].isna().sum() for var in data}
mv = {var: nr for var, nr in nr_missing.items() if nr > 0}

plt.figure()
ds.bar_chart(
    list(mv),
    list(mv.values()),
    title='Nr of missing values per variable',
    xlabel='variables',
    ylabel='nr missing values',
    rotation=True,
)
plt.savefig('images/mv.png')
plt.show()

Lab 1: Data Profiling

Data Dimensionality

Variable Types

Missing values