Classification
Random Forests
Random Forests, implemented through the RandomForestClassifier in the sklearn.ensemble package, are one of the most powerful classification techniques, while remaining simple and easy to apply.
Parameters study
It trains a set of decision trees that are combined into an ensemble, whose size is set by the n_estimators parameter. Each tree, however, is trained over a different sample of the original training data, and each split considers only a subset of k variables describing the data, with k determined by the max_features parameter. Among many other parameters, we can also limit the maximum size of each tree through the max_depth parameter.
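To make these parameters concrete, the following minimal sketch fits a single forest with explicit values for the three parameters just described; it uses a synthetic dataset from make_classification purely for illustration, not the dataset studied below.
# Minimal illustrative sketch (synthetic data, not the stroke dataset used below):
# a single forest with explicit n_estimators, max_depth and max_features values.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_features=10, random_state=0)
forest = RandomForestClassifier(n_estimators=100, max_depth=5, max_features=0.3)
forest.fit(X, y)
print(forest.score(X, y))  # mean accuracy on the data used for training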
Next, we can see the results achieved by a set of parameter combinations.
from numpy import array, ndarray
from matplotlib.pyplot import subplots, figure, savefig, show
from sklearn.ensemble import RandomForestClassifier
from dslabs_functions import (
    CLASS_EVAL_METRICS,
    DELTA_IMPROVE,
    read_train_test_from_files,
)
from dslabs_functions import HEIGHT, plot_evaluation_results, plot_multiline_chart
def random_forests_study(
    trnX: ndarray,
    trnY: array,
    tstX: ndarray,
    tstY: array,
    nr_max_trees: int = 2500,
    lag: int = 500,
    metric: str = "accuracy",
) -> tuple[RandomForestClassifier | None, dict]:
    n_estimators: list[int] = [100] + [i for i in range(500, nr_max_trees + 1, lag)]
    max_depths: list[int] = [2, 5, 7]
    max_features: list[float] = [0.1, 0.3, 0.5, 0.7, 0.9]
    best_model: RandomForestClassifier | None = None
    best_params: dict = {"name": "RF", "metric": metric, "params": ()}
    best_performance: float = 0.0

    values: dict = {}
    cols: int = len(max_depths)
    # one chart per max_depth value, each with one line per max_features value
    _, axs = subplots(1, cols, figsize=(cols * HEIGHT, HEIGHT), squeeze=False)
    for i in range(len(max_depths)):
        d: int = max_depths[i]
        values = {}
        for f in max_features:
            y_tst_values: list[float] = []
            for n in n_estimators:
                clf = RandomForestClassifier(
                    n_estimators=n, max_depth=d, max_features=f
                )
                clf.fit(trnX, trnY)
                prdY: array = clf.predict(tstX)
                eval: float = CLASS_EVAL_METRICS[metric](tstY, prdY)
                y_tst_values.append(eval)
                # keep the model that improves the chosen metric by more than DELTA_IMPROVE
                if eval - best_performance > DELTA_IMPROVE:
                    best_performance = eval
                    best_params["params"] = (d, f, n)
                    best_model = clf
                    # print(f'RF d={d} f={f} n={n}')
            values[f] = y_tst_values
        plot_multiline_chart(
            n_estimators,
            values,
            ax=axs[0, i],
            title=f"Random Forests with max_depth={d}",
            xlabel="nr estimators",
            ylabel=metric,
            percentage=True,
        )
    print(
        f'RF best for {best_params["params"][2]} trees (d={best_params["params"][0]} and f={best_params["params"][1]})'
    )
    return best_model, best_params
file_tag = "stroke"
train_filename = "data/stroke_train_smote.csv"
test_filename = "data/stroke_test.csv"
target = "stroke"
eval_metric = "accuracy"
trnX, tstX, trnY, tstY, labels, vars = read_train_test_from_files(
    train_filename, test_filename, target
)
print(f"Train#={len(trnX)} Test#={len(tstX)}")
print(f"Labels={labels}")
figure()
best_model, params = random_forests_study(
    trnX,
    trnY,
    tstX,
    tstY,
    nr_max_trees=1000,
    lag=250,
    metric=eval_metric,
)
savefig(f"images/{file_tag}_rf_{eval_metric}_study.png")
show()
Train#=6806 Test#=1533
Labels=[0, 1]
RF best for 1000 trees (d=7 and f=0.3)
Along with the plot, we can see the parameters for which the best results were achieved. Let's now look at the performance of that best model in terms of the remaining evaluation metrics.
prd_trn: array = best_model.predict(trnX)
prd_tst: array = best_model.predict(tstX)
figure()
plot_evaluation_results(params, trnY, prd_trn, tstY, prd_tst, labels)
savefig(f'images/{file_tag}_rf_{params["name"]}_best_{params["metric"]}_eval.png')
show()
Random forests have the particularity of providing the importance of each variable in the global model. In order to obtain those importances, we just need to collect the feature_importances_ attribute from the learnt model, as below.
from numpy import std, argsort
from dslabs_functions import plot_horizontal_bar_chart
stdevs: list[float] = list(
    std([tree.feature_importances_ for tree in best_model.estimators_], axis=0)
)
importances = best_model.feature_importances_
indices: list[int] = argsort(importances)[::-1]
elems: list[str] = []
imp_values: list[float] = []
for f in range(len(vars)):
    elems += [vars[indices[f]]]
    imp_values.append(importances[indices[f]])
    print(f"{f+1}. {elems[f]} ({importances[indices[f]]})")
figure()
plot_horizontal_bar_chart(
    elems,
    imp_values,
    error=stdevs,
    title="RF variables importance",
    xlabel="importance",
    ylabel="variables",
    percentage=True,
)
savefig(f"images/{file_tag}_rf_{eval_metric}_vars_ranking.png")
1. age (0.29372908248146307)
2. smoking_status (0.16640747607952225)
3. ever_married (0.13732227139962788)
4. heart_disease (0.07864907108381088)
5. Residence_type (0.07248622765418535)
6. hypertension (0.06495619872569218)
7. gender (0.06182747028165247)
8. work_type (0.05608753446694439)
9. avg_glucose_level (0.03612315366159387)
10. bmi (0.03241151416550768)
Overfitting study
For Random Forests, the simplest parameter with which to induce more specialized models is the number of estimators allowed: the larger the number of estimators, the higher the complexity of the model.
d_max: int = params["params"][0]
feat: float = params["params"][1]
nr_estimators: list[int] = [i for i in range(2, 2501, 500)]
y_tst_values: list[float] = []
y_trn_values: list[float] = []
acc_metric: str = "accuracy"
for n in nr_estimators:
    clf = RandomForestClassifier(n_estimators=n, max_depth=d_max, max_features=feat)
    clf.fit(trnX, trnY)
    prd_tst_Y: array = clf.predict(tstX)
    prd_trn_Y: array = clf.predict(trnX)
    y_tst_values.append(CLASS_EVAL_METRICS[acc_metric](tstY, prd_tst_Y))
    y_trn_values.append(CLASS_EVAL_METRICS[acc_metric](trnY, prd_trn_Y))
figure()
plot_multiline_chart(
    nr_estimators,
    {"Train": y_trn_values, "Test": y_tst_values},
    title=f"RF overfitting study for d={d_max} and f={feat}",
    xlabel="nr_estimators",
    ylabel=str(eval_metric),
    percentage=True,
)
savefig(f"images/{file_tag}_rf_{eval_metric}_overfitting.png")
Given the properties of random forests, overfitting is a very unusual occurrence: since each tree is trained over a different sample of the data and the ensemble averages their predictions, adding more estimators tends to reduce variance rather than increase it. In this case, it is clear that there is no overfitting, since the accuracy over both datasets barely changes.