We start by loading the data, converting object variables to symbolic ones as seen in the previous lab, and keeping the numeric variables separated from the symbolic ones, so that each can be dealt with using the right tools.
import pandas as pd
import numpy as np
from pandas.plotting import register_matplotlib_converters
import matplotlib.pyplot as plt
import ds_charts as ds
register_matplotlib_converters()
file = 'algae'
filename = 'data/algae.csv'
data = pd.read_csv(filename, index_col='date', na_values='', parse_dates=True, infer_datetime_format=True)
variable_types = ds.get_variable_types(data)
numeric_vars = variable_types['numeric']
symbolic_vars = variable_types['symbolic']
boolean_vars = variable_types['binary']
Missing values are a kind of plague in data science, in particular when using scikit-learn, since its estimators are not able to deal with them. A missing value corresponds to a variable without any value for a given record. Let's recover the procedure to find the variables with missing values from the data dimensionality lab:
mv = {}
plt.figure()
for var in data:
    nr = data[var].isna().sum()
    if nr > 0:
        mv[var] = nr
ds.bar_chart(list(mv.keys()), list(mv.values()), title='Nr of missing values per variable',
             xlabel='variables', ylabel='nr missing values', rotation=True)
plt.savefig(f'images/{file}_missing_values.png')
mv
{'pH': 1, 'Oxygen': 2, 'Chloride': 10, 'Nitrates': 2, 'Ammonium': 2, 'Orthophosphate': 2, 'Phosphate': 2, 'Chlorophyll': 12}
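Before deciding what to do with these variables, it can help to look at missing values as a fraction of the dataset rather than as absolute counts, since the thresholds used below are defined as fractions. A minimal sketch on a toy DataFrame (the toy data is illustrative, not the algae dataset):

```python
import numpy as np
import pandas as pd

# toy frame standing in for the algae data
toy = pd.DataFrame({
    'pH': [7.0, np.nan, 8.1, 7.9],
    'Chloride': [np.nan, np.nan, 31.0, 56.9],
    'season': ['winter', 'spring', None, 'summer'],
})

# isna() gives booleans; their mean is the fraction of missing values per variable
mv_ratio = toy.isna().mean()
print(mv_ratio)
```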
There are two main situations to consider. The first one is when a column has a significant number of missing values. It's difficult to establish a threshold, since the number of records remaining plays an important part: if the remaining records are enough to characterize the data, we can keep the column; otherwise we discard the entire column.
Since our dataset has just 200 records we will discard the columns that have more than 90% of missing values, as follows.
# defines the number of records to discard entire columns
threshold = data.shape[0] * 0.90
missings = [c for c in mv.keys() if mv[c]>threshold]
df = data.drop(columns=missings, inplace=False)
df.to_csv(f'data/{file}_drop_mv.csv', index=False)
print('Dropped variables', missings)
Dropped variables []
Note that we made a copy of the original data, setting the inplace parameter to False, so as not to impact the following approaches.
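The same column-level filter can also be written directly with pandas' dropna, using axis=1 and the thresh parameter, which counts the minimum number of non-missing values a column must have to survive. A hedged sketch on toy data (the column names are made up for illustration):

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'mostly_full': [1.0, 2.0, np.nan, 4.0],
    'mostly_empty': [np.nan, np.nan, np.nan, 4.0],
})

# keep only columns with at least 50% non-missing values
min_non_missing = int(toy.shape[0] * 0.5)
kept = toy.dropna(axis=1, thresh=min_non_missing)
print(list(kept.columns))
```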
The second situation arises when individual records have a majority of their variables without values. In this case, we prefer to discard those records instead of dropping all the columns involved. For this we use the dropna method.
# defines the number of variables to discard entire records
threshold = data.shape[1] * 0.50
df = data.dropna(thresh=threshold, inplace=False)
df.to_csv(f'data/{file}_dropna_mv.csv', index=False)
print(df.shape)
(198, 11)
As we can see, we didn't discard any variable and only dropped two records.
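Keep in mind that thresh counts the non-missing values a record must have in order to survive, not the missing ones. A small sketch on toy data makes the semantics concrete:

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    'a': [1.0, np.nan, 3.0],
    'b': [np.nan, np.nan, 2.0],
    'c': [5.0, np.nan, 1.0],
})

# keep rows with at least 2 non-missing values (out of 3 columns);
# the all-missing middle row is dropped
kept = toy.dropna(thresh=2)
print(kept.index.tolist())
```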
The simplest imputer provided by sklearn.impute is the SimpleImputer. First, it is created, defining the strategy to follow; then it is fitted to the data (fit method). After that, it is possible to apply it to the data through the transform method. Using the fit_transform method, we are able to apply both steps in just one call; keeping fit and transform separate, however, lets us reuse the same fitted imputer on other data.
It uses a simple strategy to fill any missing value with a new value, which we need to define through the strategy parameter. We can choose among:
- constant: the constant value chosen depends on the type of variable (usually NaN, -1 or 0 for numeric, 'NA' for symbolic and False for boolean)
- mean: only applicable for numeric variables
- median: only applicable for numeric variables
- most_frequent: mostly applicable for symbolic variables

There is also an IterativeImputer, which considers all the variables to estimate missing values, but it is out of the scope of this tutorial.
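The fit/transform split matters when the same imputation has to be reused, for instance on a test set: the statistics are learned once from the training data and then applied everywhere. A minimal sketch with made-up train/test frames:

```python
import numpy as np
from pandas import DataFrame
from sklearn.impute import SimpleImputer

train = DataFrame({'x': [1.0, 3.0, np.nan]})
test = DataFrame({'x': [np.nan, 5.0]})

imp = SimpleImputer(strategy='mean', missing_values=np.nan)
imp.fit(train)                  # learns the mean (2.0) from the training data
train_f = imp.transform(train)  # fills the training gap with 2.0
test_f = imp.transform(test)    # reuses the training mean, not the test mean
print(test_f[:, 0])
```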
So let's impute a constant value for each distinct type of variable and join the results.
from sklearn.impute import SimpleImputer
from pandas import concat, DataFrame
tmp_nr, tmp_sb, tmp_bool = None, None, None
if len(numeric_vars) > 0:
    imp = SimpleImputer(strategy='constant', fill_value=0, missing_values=np.nan, copy=True)
    tmp_nr = DataFrame(imp.fit_transform(data[numeric_vars]), columns=numeric_vars)
if len(symbolic_vars) > 0:
    imp = SimpleImputer(strategy='constant', fill_value='NA', missing_values=np.nan, copy=True)
    tmp_sb = DataFrame(imp.fit_transform(data[symbolic_vars]), columns=symbolic_vars)
if len(boolean_vars) > 0:
    imp = SimpleImputer(strategy='constant', fill_value=False, missing_values=np.nan, copy=True)
    tmp_bool = DataFrame(imp.fit_transform(data[boolean_vars]), columns=boolean_vars)
data = concat([tmp_nr, tmp_sb, tmp_bool], axis=1)
data.to_csv(f'data/{file}_mv_constant.csv', index=False)
data.describe(include='all')
| pH | Oxygen | Chloride | Nitrates | Ammonium | Orthophosphate | Phosphate | Chlorophyll | fluid_velocity | river_depth | season |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 200.000000 | 200.000000 | 200.00000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200 | 200 | 200 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 | 3 | 4 |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | high | medium | winter |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 84 | 84 | 62 |
mean | 7.971700 | 9.026600 | 41.45500 | 3.250000 | 152.903000 | 82.492650 | 110.435300 | 12.728550 | NaN | NaN | NaN |
std | 0.822866 | 2.547113 | 46.62442 | 3.771674 | 179.765702 | 116.491727 | 102.306051 | 20.082659 | NaN | NaN | NaN |
min | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | NaN | NaN |
25% | 7.700000 | 7.675000 | 9.04500 | 1.272500 | 34.247500 | 15.367500 | 18.822500 | 1.300000 | NaN | NaN | NaN |
50% | 8.055000 | 9.800000 | 31.04500 | 2.655000 | 98.255000 | 40.150000 | 83.220000 | 4.400000 | NaN | NaN | NaN |
75% | 8.400000 | 10.800000 | 56.97750 | 4.422500 | 199.850000 | 102.082500 | 179.140000 | 17.200000 | NaN | NaN | NaN |
max | 9.700000 | 13.400000 | 391.50000 | 45.650000 | 931.830000 | 771.600000 | 558.750000 | 110.460000 | NaN | NaN | NaN |
Be aware that filling missing values with already existing values, such as 0, -1 or False, changes the data distribution. For this reason, it is usual to apply the mean and mode instead.
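The distortion is easy to see on a toy series: filling with 0 drags the mean down, while filling with the mean keeps it unchanged (sketch with made-up numbers):

```python
import numpy as np
import pandas as pd

s = pd.Series([8.0, 9.0, np.nan, 10.0])  # mean of the observed values is 9.0

filled_zero = s.fillna(0)        # constant fill distorts the distribution
filled_mean = s.fillna(s.mean()) # mean fill preserves the mean

print(filled_zero.mean())  # pulled towards 0
print(filled_mean.mean())  # unchanged
```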
# reload the original data, since the previous step replaced every missing value with a constant
data = pd.read_csv(filename, index_col='date', na_values='', parse_dates=True, infer_datetime_format=True)
tmp_nr, tmp_sb, tmp_bool = None, None, None
if len(numeric_vars) > 0:
    imp = SimpleImputer(strategy='mean', missing_values=np.nan, copy=True)
    tmp_nr = DataFrame(imp.fit_transform(data[numeric_vars]), columns=numeric_vars)
if len(symbolic_vars) > 0:
    imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan, copy=True)
    tmp_sb = DataFrame(imp.fit_transform(data[symbolic_vars]), columns=symbolic_vars)
if len(boolean_vars) > 0:
    imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan, copy=True)
    tmp_bool = DataFrame(imp.fit_transform(data[boolean_vars]), columns=boolean_vars)
data = concat([tmp_nr, tmp_sb, tmp_bool], axis=1)
data.to_csv(f'data/{file}_mv_most_frequent.csv', index=False)
data.describe(include='all')
| pH | Oxygen | Chloride | Nitrates | Ammonium | Orthophosphate | Phosphate | Chlorophyll | fluid_velocity | river_depth | season |
---|---|---|---|---|---|---|---|---|---|---|---|
count | 200.000000 | 200.000000 | 200.00000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200.000000 | 200 | 200 | 200 |
unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3 | 3 | 4 |
top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | high | medium | winter |
freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 84 | 84 | 62 |
mean | 7.971700 | 9.026600 | 41.45500 | 3.250000 | 152.903000 | 82.492650 | 110.435300 | 12.728550 | NaN | NaN | NaN |
std | 0.822866 | 2.547113 | 46.62442 | 3.771674 | 179.765702 | 116.491727 | 102.306051 | 20.082659 | NaN | NaN | NaN |
min | 0.000000 | 0.000000 | 0.00000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | NaN | NaN | NaN |
25% | 7.700000 | 7.675000 | 9.04500 | 1.272500 | 34.247500 | 15.367500 | 18.822500 | 1.300000 | NaN | NaN | NaN |
50% | 8.055000 | 9.800000 | 31.04500 | 2.655000 | 98.255000 | 40.150000 | 83.220000 | 4.400000 | NaN | NaN | NaN |
75% | 8.400000 | 10.800000 | 56.97750 | 4.422500 | 199.850000 | 102.082500 | 179.140000 | 17.200000 | NaN | NaN | NaN |
max | 9.700000 | 13.400000 | 391.50000 | 45.650000 | 931.830000 | 771.600000 | 558.750000 | 110.460000 | NaN | NaN | NaN |
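Whatever strategy is chosen, it is worth checking afterwards that no missing values survived the imputation, since a single leftover NaN is enough to break a scikit-learn estimator. A one-line sanity check, sketched here on a toy frame:

```python
import numpy as np
import pandas as pd

toy = pd.DataFrame({'x': [1.0, np.nan], 'y': ['a', None]})

# fill each column with a value appropriate to its type
filled = toy.fillna({'x': toy['x'].mean(), 'y': 'NA'})

# total count of remaining missing values; should be zero after imputation
remaining = int(filled.isna().sum().sum())
print(remaining)
```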