sklearn doesn't provide a pattern mining package, so we use the mlxtend.frequent_patterns package instead.
You can access its documentation at http://rasbt.github.io/mlxtend/api_subpackages/mlxtend.frequent_patterns/
Pattern mining only applies to symbolic data, so numerical variables have to be discretized before mining.
Regardless of the kind of data, the apriori algorithm works over transaction datasets, which requires a further transformation through the TransactionEncoder available in the mlxtend.preprocessing package.
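For instance, a numerical variable can be discretized with pandas' cut, and raw transactions can be converted into the one-hot format that apriori expects through the TransactionEncoder. A minimal sketch, where the age values, bin labels, and grocery items are purely illustrative (not part of the dataset used below):

import pandas as pd
from mlxtend.preprocessing import TransactionEncoder

# discretize a hypothetical numerical variable into 3 symbolic bins
ages = pd.cut([23, 35, 51, 47, 62], bins=3, labels=['young', 'adult', 'senior'])

# each transaction is the list of items (symbols) observed in one record
transactions = [['bread', 'milk'], ['bread', 'butter'], ['milk', 'butter', 'bread']]
encoder = TransactionEncoder()
encoded = encoder.fit(transactions).transform(transactions)
# one boolean column per distinct item, one row per transaction
trans_df = pd.DataFrame(encoded, columns=encoder.columns_)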
import pandas as pd
import matplotlib.pyplot as plt
import ds_charts as ds
import mlxtend.frequent_patterns as pm

data: pd.DataFrame = pd.read_csv('data/breast_cancer.csv')
# one-hot encode every variable, so that each column becomes a binary item
data = ds.dummify(data, data.columns)
data.shape

MIN_SUP: float = 0.001
# support thresholds to study: 0.2, 0.1, and then 0.1 down to 0.01 in steps of 0.01
var_min_sup = [0.2, 0.1] + [i*MIN_SUP for i in range(100, 0, -10)]

# mine all patterns once, at the lowest support threshold
patterns: pd.DataFrame = pm.apriori(data, min_support=MIN_SUP, use_colnames=True, verbose=True)
print(len(patterns), 'patterns')
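apriori returns the itemsets as frozensets in an 'itemsets' column, so the result can be queried like any DataFrame. A small sketch, following a recipe from the mlxtend documentation, that keeps only the patterns with at least two items and shows the most frequent ones:

lengths = patterns['itemsets'].apply(len)
print(patterns[lengths >= 2].nlargest(5, 'support'))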
# every pattern with support >= sup is already in patterns (mined at MIN_SUP <= sup),
# so counting the patterns above each threshold only requires filtering
nr_patterns = []
for sup in var_min_sup:
    pat = patterns[patterns['support'] >= sup]
    nr_patterns.append(len(pat))

plt.figure(figsize=(6, 4))
ds.plot_line(var_min_sup, nr_patterns, title='Nr Patterns x Support', xlabel='support', ylabel='Nr Patterns')
plt.show()
MIN_CONF: float = 0.1
# min_threshold = 5 * MIN_CONF, i.e. keep only rules with confidence of at least 0.5
rules = pm.association_rules(patterns, metric='confidence', min_threshold=MIN_CONF*5, support_only=False)
print(f'\tfound {len(rules)} rules')
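Besides confidence, association_rules reports several interestingness measures per rule (support, lift, leverage, conviction), so the strongest rules can be inspected directly. A quick sketch that prints the five rules with the highest lift:

top = rules.nlargest(5, 'lift')
for _, rule in top.iterrows():
    print(f"{set(rule['antecedents'])} ==> {set(rule['consequents'])} "
          f"(conf={rule['confidence']:.2f}, lift={rule['lift']:.2f})")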
def plot_top_rules(rules: pd.DataFrame, metric: str, per_metric: str) -> None:
    _, ax = plt.subplots(figsize=(6, 3))
    ax.grid(False)
    ax.set_axis_off()
    ax.set_title(f'TOP 10 per Min {per_metric} - {metric}', fontweight="bold")
    text = ''
    cols = ['antecedents', 'consequents']
    # convert the frozensets returned by association_rules into tuples for readable printing
    rules[cols] = rules[cols].applymap(lambda x: tuple(x))
    for i in range(len(rules)):
        rule = rules.iloc[i]
        text += f"{rule['antecedents']} ==> {rule['consequents']} "
        text += f"(s: {rule['support']:.2f}, c: {rule['confidence']:.2f}, lift: {rule['lift']:.2f})\n"
    ax.text(0, 0, text)
    plt.show()
def analyse_per_metric(rules: pd.DataFrame, metric: str, metric_values: list) -> list:
    print(f'Analyse per {metric}...')
    conf = {'avg': [], 'top25%': [], 'top10': []}
    lift = {'avg': [], 'top25%': [], 'top10': []}
    top_conf = []
    top_lift = []
    nr_rules = []
    for m in metric_values:
        rs = rules[rules[metric] >= m]
        nr_rules.append(len(rs))
        # average quality over all rules above the threshold
        conf['avg'].append(rs['confidence'].mean(axis=0))
        lift['avg'].append(rs['lift'].mean(axis=0))
        # average quality over the top 25% of rules
        top_conf = rs.nlargest(int(0.25*len(rs)), 'confidence')
        conf['top25%'].append(top_conf['confidence'].mean(axis=0))
        top_lift = rs.nlargest(int(0.25*len(rs)), 'lift')
        lift['top25%'].append(top_lift['lift'].mean(axis=0))
        # average quality over the 10 best rules
        top_conf = rs.nlargest(10, 'confidence')
        conf['top10'].append(top_conf['confidence'].mean(axis=0))
        top_lift = rs.nlargest(10, 'lift')
        lift['top10'].append(top_lift['lift'].mean(axis=0))
    _, axs = plt.subplots(1, 2, figsize=(10, 5), squeeze=False)
    ds.multiple_line_chart(metric_values, conf, ax=axs[0, 0], title=f'Avg Confidence x {metric}',
                           xlabel=metric, ylabel='Avg confidence')
    ds.multiple_line_chart(metric_values, lift, ax=axs[0, 1], title=f'Avg Lift x {metric}',
                           xlabel=metric, ylabel='Avg lift')
    plt.show()
    plot_top_rules(top_conf, 'confidence', metric)
    plot_top_rules(top_lift, 'lift', metric)
    return nr_rules
nr_rules_sp = analyse_per_metric(rules, 'support', var_min_sup)
ds.plot_line(var_min_sup, nr_rules_sp, title='Nr rules x Support', xlabel='support', ylabel='Nr. rules', percentage=False)
# confidence thresholds from 1.0 down to 0.6, in steps of 0.1
var_min_conf = [i*MIN_CONF for i in range(10, 5, -1)]
nr_rules_cf = analyse_per_metric(rules, 'confidence', var_min_conf)
ds.plot_line(var_min_conf, nr_rules_cf, title='Nr Rules x Confidence', xlabel='confidence', ylabel='Nr Rules', percentage=False)
How does the number of patterns discovered change with the support threshold?
How does the number of rules discovered change with the confidence threshold?
How does rule quality change with the support threshold?
How does the mean quality of rules relate to the mean quality of the top best rules?
What information were you able to discover?