A practical guide to using the popmon package

Introduction

Our clients often ask us what the expiration date of a predictive model is. Unfortunately, there is no general rule of thumb you can apply here. The expiration date of a model completely depends on the changes in the world around the model, a.k.a. the model drift, which can be split into concept drift and data drift, as my colleague Jurriaan Nagelkerke explains in detail in this very interesting article. There you can not only learn about model drift, concept drift and data drift, but also why (automatic) retraining isn't always the best solution and, better yet, what you should do to keep your models alive and kicking! I highly recommend reading that article before reading this one. Here, I assume you already know about model drift and are looking for a practical example of how to implement monitoring of models in production.

To be able to monitor model drift, ING has created their own Python package: popmon. We love (their work on) this package, but it did take us a little while to figure out how to use it in a way that would help us get a sense of our model drift. That's why we decided to help all of you newbies to popmon who are also interested in monitoring model drift. The beauty of this package lies in the fact that ING definitely applied the KISS (keep it simple, stupid) principle when developing it, meaning we only need a couple of smart functions to reach our goals. Unfortunately for us Data Scientists, we always want to grasp exactly what we are doing, so this also meant a deep dive into the package and into what these functions do. So, let us take you on a (short and simple) ride through popmon.

Our use case

Since there are multiple use cases for popmon, we want to start by describing the use case we will be focusing on in this article: a Data Scientist/Analyst created a predictive model, which will be taken into production, and therefore model drift monitoring needs to be set up for this model.

To be more precise: we want to keep track of the data that is used to score our model and continuously check whether this data is not too different from the data the model was trained on. This is a very common use case, although this monitoring is often not set up.

Installation of the package

Before we can really start, we need to install popmon, which is as simple as:

!pip install popmon


Initialize the notebook

In [1]:
# import packages
import pandas as pd
import pickle
import popmon
from popmon import resources
In [2]:
# make sure all output in each cell shows, without explicitly printing them
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
In [3]:
# adjust default display settings of pandas dataframes
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)


The data

The example dataset contains multiple features. For the sake of simplicity, we only keep a couple of them and pretend we used these features to predict a column named isActive. Since this is the dependent variable, which we can't monitor in production because it is unknown at scoring time, we've removed it from our training set as well.

Import our example data
In [4]:
df = pd.read_csv(resources.data("test.csv.gz"), parse_dates=["date"])
df = df[["age", "balance", "eyeColor", "favoriteFruit", "gender", "latitude", "transaction", "currency", "date"]]
df["date"] = df["date"].dt.to_period("M")
In [5]:
df.head()
Out[5]:
age balance eyeColor favoriteFruit gender latitude transaction currency date
0 16 $1,333.24 grey banana female 18.821230 1131.03 DOL 2015-05
1 57 $3,777.44 brown banana female -82.134610 -896.55 DOL 2015-06
2 26 $1,248.97 blue strawberry female 62.383017 1224.04 DOL 2015-10
3 63 $1,049.77 green strawberry male -26.913326 541.08 DOL 2015-06
4 74 $2,942.64 red strawberry male -2.777228 -372.62 DOL 2015-03
In [6]:
_ = df['date'].value_counts().sort_index().plot(kind='bar')



Create the different batches we would have had if this model had been in production
Because we're using a single example dataset for now, we need to split it into a couple of different batches to recreate our example of taking a model into production and scoring new data regularly:

  • our traindata
  • the first batch of scoring data when the model is in production
  • the second batch of scoring data when the model is in production
  • the third batch of scoring data when the model is in production

Let's say we want to check our model scoring data for changes on a monthly basis.

In [7]:
# our training data consists of the first 8 months of 2015
df_train = df[df.date < "2015-09"]
In [8]:
# batch_1 would be the first batch to be fed to the model after this has been put into production
batch_1 = df[df.date == "2015-09"]
In [9]:
# batch_2 would be the second batch to be fed to the model after this has been put into production
# we won't use this dataset in this example, but you can use it when playing around with this notebook yourself
batch_2 = df[df.date == "2015-10"]
In [10]:
# batch_3 would be the third batch to be fed to the model after this has been put into production
# we won't use this dataset in this example, but you can use it when playing around with this notebook yourself
batch_3 = df[df.date == "2015-11"]


Initializing the monitoring

We will use the training data to teach popmon what our data looks like, already dividing the data into the different months, so popmon knows what it should expect from a new month. This function uses another package built by ING: histogrammar. A key strength of popmon (using histogrammar) is that it saves everything needed to monitor each feature in a histogram object. Histograms are small in terms of storage and safe in terms of privacy risks, as no individual values need to be stored. Also, histograms can easily be compared to other histogram objects, resulting in fast processing once the histograms are created. These histograms are used to calculate all the metrics that drive the alerts. Popmon generates a whole load of metrics; we've given a short overview and explanation of all of these metrics in this article.

In [11]:
hists = df_train.pm_make_histograms(time_axis="date")
hists
2022-04-15 08:12:51,536 INFO [histogram_filler_base]: Filling 8 specified histograms. auto-binning.
100%|█████████████████████████████████████████████████████████████████| 8/8 [00:02<00:00,  3.60it/s]
Out[11]:
{'date:age': <Categorize values=SparselyBin size=8,
 'date:balance': <Categorize values=Categorize size=8,
 'date:currency': <Categorize values=Categorize size=8,
 'date:eyeColor': <Categorize values=Categorize size=8,
 'date:favoriteFruit': <Categorize values=Categorize size=8,
 'date:gender': <Categorize values=Categorize size=8,
 'date:latitude': <Categorize values=SparselyBin size=8,
 'date:transaction': <Categorize values=SparselyBin size=8}




We can get the binning specifications from the histogrammar objects. This way we can make sure that the histograms created on a new batch will be binned in exactly the same way.

In [12]:
bin_specs = popmon.get_bin_specs(hists)
bin_specs
Out[12]:
{'date:age': [{}, {'binWidth': 2.0, 'origin': 9.5}],
 'date:balance': [{}, {}],
 'date:currency': [{}, {}],
 'date:eyeColor': [{}, {}],
 'date:favoriteFruit': [{}, {}],
 'date:gender': [{}, {}],
 'date:latitude': [{},
  {'binWidth': 4.347556988888889, 'origin': -85.24212737777779}],
 'date:transaction': [{},
  {'binWidth': 62.90755555555556, 'origin': -1019.9291111111112}]}




Now we're ready to compare new periods to the reference (train) period. This comparison is done using the histograms we've created; the output consists of other metrics calculated from the histogram metrics, which is why you won't see any histograms in the next piece of code. Popmon provides us with a lot of useful output and will guide us in evaluating whether new data is similar (enough) to the reference data. We will use that information without looking at the histograms.

Before we look at more popmon output, it's good to briefly introduce some key concepts of popmon. For more detail, we refer to the package documentation, but these concepts are essential to understand:

  • profiles: per batch (time period), popmon calculates several profile statistics for each feature, like the mean, min, max, number of missings and distinct value count.
  • comparisons: each new batch is compared to the specified reference (see the next concept) and several comparison statistics are calculated, mainly significance tests (chi-square, Kolmogorov-Smirnov) that evaluate the difference between the histogram distribution of the new batch and that of the reference.
  • references: what to compare with? popmon offers four options: an external reference - a specified external source, such as the training data; rolling - a rolling window over preceding time periods; prev1 - the preceding time period; and expanding - all preceding time periods. A sketch of these options follows after this list.
  • alerts: traffic light style alerting based on the profiles and comparisons, to only warn when something important (enough) is different in the data compared to the reference. Using customizable thresholds, green, yellow and red traffic lights are defined for each feature and each metric.
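To make these reference options concrete, here is a minimal sketch of how each could be requested. This assumes popmon's pm_stability_report DataFrame accessor and its reference_type and window parameters; prev1 is expressed here as a rolling window of size 1, and exact parameter names and defaults may differ per popmon version.

# external: compare each new batch against a fixed external reference,
# e.g. histograms made on the training data
report = popmon.stability_report(hists=new_hists, reference_type="external", reference=hists)

# rolling: compare each batch against a rolling window of preceding batches
report = df.pm_stability_report(time_axis="date", reference_type="rolling", window=3)

# prev1: compare each batch against only the preceding batch
# (expressed as a rolling window of size 1)
report = df.pm_stability_report(time_axis="date", reference_type="rolling", window=1)

# expanding: compare each batch against all preceding batches
report = df.pm_stability_report(time_axis="date", reference_type="expanding")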

We won't go into detail for now, but it is useful to know that it is possible to change the monitoring rules popmon uses to create the alerts. We can change the rules for a single metric (which is then applied to all features) or for a (group of) feature(s). Below you can find a code example of how to change the monitoring rules, where we, for instance, set the boundaries for age to 18 and 100 for a "yellow" alert and to 0 and 120 for a "red" alert, meaning that an alert is triggered if a value of the feature age falls outside these boundaries. Popmon uses very sensible monitoring rules by default, so most of the time it probably won't be necessary to change them at all.

In [13]:
# each rule lists four boundaries, from high to low: [red_upper, yellow_upper, yellow_lower, red_lower]
monitoring_rules = {
    "*_pull": [9, 3, -3, -9],
    "*_zscore": [7, 4, -4, -7],
    "[!p]*_unknown_labels": [0.5, 0.5, 0, 0],
    "age:min": [120, 100, 18, 0],
    "age:max": [120, 100, 18, 0],
}

Oh, and one more thing, because there is one term you will often see in the output that might not be immediately clear to you: the pull. Pull refers to 'the normalized residual of a value of interest with respect to the selected reference'. Not sure if that made things clearer yet... In our own words, the pull is the standardized version of each statistic, so that all statistics are on the same scale and we can use the same thresholds for different statistics. The pull indicates to what extent a value of a statistic should be interpreted as an actual difference between the new data and the reference data. It's calculated for every profile of every feature like this:

$pull = \frac{value - reference_{mean}}{reference_{std}}$
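As a quick illustration, here is the pull computed by hand for a single statistic; the numbers are made up and only serve to show the mechanics:

# hypothetical numbers: mean of `age` in the new batch vs. the reference
ref_mean = 48.9   # mean of the statistic over the reference batches
ref_std = 3.2     # standard deviation of the statistic over the reference batches
new_value = 55.1  # value of the statistic in the new batch

pull = (new_value - ref_mean) / ref_std  # ~1.94
# with our "*_pull" rule [9, 3, -3, -9]: |pull| <= 3 is green,
# 3 < |pull| <= 9 is yellow, |pull| > 9 is red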


Ready to calculate the monitoring metrics and alerts

To illustrate the type of objects popmon creates, we start with calculating all the metrics (and alerts) on the training data. Do note that this is not necessary when we want to see whether new scoring data is similar (enough) to the training data to be used for scoring.

The training data contains multiple periods/batches of data (the time_axis parameter we used to make the histograms). The datastore object we're creating here is essentially a dictionary holding all calculated metrics and, more importantly, the alerts that were generated based on these metrics. It is recommended to explore this dictionary to see what you can expect and, even better, to decide how you want to use this information to create your own alerts. You can use these values to trigger the messages (for instance via Slack or email) you want to receive, making the monitoring completely and easily adjustable to the preferred way of working in your company.
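For example, a minimal sketch of such a custom alert, assuming a hypothetical send_slack_message() helper that wraps your messaging tool of choice (this helper is not part of popmon):

def notify_on_red_alerts(datastore, feature):
    # the alerts object holds, per period, the number of red/yellow/green lights
    alerts = datastore["alerts"][feature]
    n_red = alerts["n_red"].sum()
    if n_red > 0:
        # send_slack_message() is a hypothetical helper, not part of popmon
        send_slack_message(f"popmon: {int(n_red)} red alert(s) for feature '{feature}'")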

In [14]:
# calculate the metrics based on all histograms and the monitoring rules we adjusted
# (when no adjustments are made, you can omit this parameter and popmon will use its own defaults)
datastore = popmon.stability_metrics(hists=hists, monitoring_rules=monitoring_rules)
2022-04-15 08:12:53,842 INFO [hist_splitter]: Splitting histograms "hists" as "split_hists"
2022-04-15 08:12:53,860 INFO [hist_comparer]: Comparing "split_hists" with rolling sum of 1 previous histogram(s).
2022-04-15 08:12:56,736 INFO [hist_comparer]: Comparing "split_hists" with reference "split_hists"
2022-04-15 08:12:57,252 INFO [pull_calculator]: Comparing "comparisons" with median/mad of reference "comparisons"
2022-04-15 08:12:57,332 INFO [hist_profiler]: Profiling histograms "split_hists" as "profiles"
2022-04-15 08:12:57,412 INFO [pull_calculator]: Comparing "profiles" with median/mad of reference "profiles"
2022-04-15 08:12:57,821 INFO [apply_func]: Computing significance of (rolling) trend in means of features
2022-04-15 08:12:57,880 INFO [compute_tl_bounds]: Calculating static bounds for "profiles"
2022-04-15 08:12:58,174 INFO [compute_tl_bounds]: Calculating static bounds for "comparisons"
2022-04-15 08:12:58,203 INFO [compute_tl_bounds]: Calculating traffic light alerts for "profiles"
2022-04-15 08:12:58,285 INFO [compute_tl_bounds]: Calculating traffic light alerts for "comparisons"
2022-04-15 08:12:58,325 INFO [apply_func]: Generating traffic light alerts summary.
2022-04-15 08:12:58,363 INFO [alerts_summary]: Combining alerts into artificial variable "_AGGREGATE_"

Since we've asked popmon to calculate stability_metrics on the set of training period histograms without specifying a reference, it will check, for each period within the training data, whether its values and distributions are similar to those of all other periods within the training data.

In [15]:
# let's look at what content can be found in the datastore
datastore.keys()
Out[15]:
dict_keys(['hists', 'split_hists', 'comparisons', 'profiles', '08546e4a-2d6d-4e81-b81c-671ae2806d58', 'dynamic_bounds', '1323e273-0562-4924-93d5-dc54d7966970', 'dynamic_bounds_comparisons', 'static_bounds', '70e37164-47bd-4027-8807-173461b65def', 'traffic_lights', 'static_bounds_comparisons', 'd043a793-9735-4d7a-8693-4f1eff423a3d', 'alerts'])
In [16]:
# you can access the datastore like you would any other dictionary
# we will just show you one random example here
datastore['comparisons']['age']
Out[16]:
prev1_ks prev1_ks_zscore prev1_ks_pvalue prev1_pearson prev1_chi2 prev1_chi2_norm prev1_chi2_zscore prev1_chi2_pvalue prev1_chi2_max_residual prev1_chi2_spike_count prev1_max_prob_diff prev1_unknown_labels ref_ks ref_ks_zscore ref_ks_pvalue ref_pearson ref_chi2 ref_chi2_norm ref_chi2_zscore ref_chi2_pvalue ref_chi2_max_residual ref_chi2_spike_count ref_max_prob_diff ref_unknown_labels ref_max_prob_diff_std ref_max_prob_diff_mean ref_max_prob_diff_pull mean_trend10_zscore
date
2015-01 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.442418 -2.312559 0.989627 0.446020 29.987816 0.749695 -1.153065 0.875558 2.368508 0.0 0.065737 NaN 0.0251 0.077203 -0.456798 NaN
2015-02 0.946779 0.435895 0.331457 -5.061864e-17 33.504079 1.015275 0.143809 0.442826 2.001351 0.0 0.100000 NaN 0.918138 0.336679 0.368180 0.258062 30.261057 0.756526 -1.116126 0.867816 2.732871 0.0 0.080237 NaN 0.0251 0.077203 0.120885 NaN
2015-03 0.665432 -0.731410 0.767735 -7.380564e-02 31.669907 0.959694 -0.083522 0.533282 2.510413 0.0 0.148148 NaN 0.462763 -2.118823 0.982947 0.469252 27.002309 0.675058 -1.572555 0.942089 2.296013 0.0 0.104670 NaN 0.0251 0.077203 1.094311 NaN
2015-04 0.583586 -1.201607 0.885242 2.000875e-02 26.657297 0.952046 -0.092784 0.536962 2.115063 0.0 0.148148 NaN 0.907939 0.300592 0.381863 0.346271 29.532473 0.738312 -1.215136 0.887843 2.379007 0.0 0.083427 NaN 0.0251 0.077203 0.247989 NaN
2015-05 0.557784 -1.370834 0.914787 3.796117e-01 17.787931 0.684151 -1.191912 0.883352 1.414657 0.0 0.102217 NaN 0.652802 -0.798327 0.787660 0.487455 22.740538 0.568513 -2.230893 0.987156 2.489760 0.0 0.102358 NaN 0.0251 0.077203 1.002200 NaN
2015-06 1.332077 1.575995 0.057514 -4.175666e-02 27.576876 0.919229 -0.234877 0.592848 2.237766 0.0 0.137931 NaN 1.183783 1.168678 0.121267 0.580993 22.905891 0.572647 -2.203816 0.986231 1.824449 0.0 0.074169 NaN 0.0251 0.077203 -0.120885 NaN
2015-07 0.720158 -0.460633 0.677469 9.555846e-02 26.051765 0.814118 -0.710531 0.761312 1.940285 0.0 0.117647 NaN 0.586110 -1.185666 0.882123 0.114513 28.046507 0.701163 -1.422374 0.922541 2.154558 0.0 0.054809 NaN 0.0251 0.077203 -0.892202 NaN
2015-08 0.656204 -0.780122 0.782340 -2.902757e-01 29.590741 0.896689 -0.352139 0.637633 1.517574 0.0 0.074074 NaN 0.321042 -3.893301 0.999951 0.464718 20.608440 0.515211 -2.593056 0.995244 1.940474 0.0 0.042454 NaN 0.0251 0.077203 -1.384456 NaN


Depending on the way your company prefers to work, you can store these objects as (pickle) files, or write them to a database. This way you can retrieve and expand them every time you use the model on a new batch of data.

In [17]:
# save the objects as pickles (we could also create JSONs and/or store them in a database)
pickle.dump(hists, open("all_hist.pkl", "wb"))
pickle.dump(monitoring_rules, open("monitoring_rules.pkl", "wb"))
pickle.dump(datastore, open("datastore.pkl", "wb"))
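Later, in the scoring pipeline, these objects can be loaded back the same way; a minimal sketch:

# retrieve the stored histograms and monitoring rules when a new batch arrives
hists = pickle.load(open("all_hist.pkl", "rb"))
monitoring_rules = pickle.load(open("monitoring_rules.pkl", "rb"))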


Now let's try to use the monitoring on our first new batch of data

Earlier, we prepared the scoring batches. Using those batches of data, we are ready to check for drift against the histograms we made from the training data. So let's start doing that now.

Create the histograms of the new batch, using the same bin specifications
In [18]:
# start by generating the histograms on the new dataset, using the same bin specifications to make the histograms comparable
new_hists = batch_1.pm_make_histograms(time_axis="date", bin_specs=bin_specs)
new_hists
2022-04-15 08:12:58,655 INFO [histogram_filler_base]: Filling 8 specified histograms. auto-binning.
100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 149.55it/s]
Out[18]:
{'date:age': <Categorize values=SparselyBin size=1,
 'date:balance': <Categorize values=Categorize size=1,
 'date:currency': <Categorize values=Categorize size=1,
 'date:eyeColor': <Categorize values=Categorize size=1,
 'date:favoriteFruit': <Categorize values=Categorize size=1,
 'date:gender': <Categorize values=Categorize size=1,
 'date:latitude': <Categorize values=SparselyBin size=1,
 'date:transaction': <Categorize values=SparselyBin size=1}


Now that we have histogram objects for the new scoring batch, and we had already created those for the training data, we can calculate the metrics of the new batch, using our training data as the reference.

In [19]:
batch_datastore = popmon.stability_metrics(hists=new_hists, monitoring_rules=monitoring_rules, reference_type='external', reference=hists)
2022-04-15 08:12:58,786 INFO [hist_splitter]: Splitting histograms "hists" as "split_hists"
2022-04-15 08:12:58,796 INFO [hist_comparer]: Comparing "split_hists" with rolling sum of 1 previous histogram(s).
2022-04-15 08:12:58,851 INFO [hist_splitter]: Splitting histograms "ref_hists" as "split_ref_hists"
2022-04-15 08:12:58,865 INFO [hist_comparer]: Comparing "split_hists" with reference "split_ref_hists"
2022-04-15 08:12:59,019 INFO [pull_calculator]: Comparing "comparisons" with median/mad of reference "comparisons"
2022-04-15 08:12:59,102 INFO [hist_profiler]: Profiling histograms "split_hists" as "profiles"
2022-04-15 08:12:59,154 INFO [hist_profiler]: Profiling histograms "split_ref_hists" as "ref_profiles"
2022-04-15 08:12:59,274 INFO [pull_calculator]: Comparing "profiles" with reference "ref_profiles"
2022-04-15 08:12:59,547 INFO [apply_func]: Computing significance of (rolling) trend in means of features
2022-04-15 08:12:59,575 INFO [compute_tl_bounds]: Calculating static bounds for "profiles"
2022-04-15 08:12:59,765 INFO [compute_tl_bounds]: Calculating static bounds for "comparisons"
2022-04-15 08:12:59,803 INFO [compute_tl_bounds]: Calculating traffic light alerts for "profiles"
2022-04-15 08:12:59,933 INFO [compute_tl_bounds]: Calculating traffic light alerts for "comparisons"
2022-04-15 08:12:59,973 INFO [apply_func]: Generating traffic light alerts summary.
2022-04-15 08:13:00,003 INFO [alerts_summary]: Combining alerts into artificial variable "_AGGREGATE_"

To get a deeper understanding of what we end up with, let's have a closer look at four objects in the batch_datastore we just created:

  • profiles We introduced these earlier on: this object contains a lot of statistics on the distribution of the features in the new batch: the number of values, mean, min, max, etc. And since the data contains multiple observations, it also contains information on the deviation of each of these statistics: the standard deviation of the mean, of the number of values, of the min, ... Finally, it contains a comparison of these statistics with those in the reference data, resulting in all the _pull metrics in the profiles. These are used to determine which profile statistics differ markedly from the reference data and get a yellow or red traffic light.
  • comparisons We also introduced these earlier: this object contains actual comparisons between the distributions in the new batch and those in the reference data. It holds many test statistics, like Pearson, chi-square and Kolmogorov-Smirnov, and the statistical significance of their values. For alerting, _pull metrics are added to the comparisons as well, to determine which differences are big enough to get a yellow or red traffic light.
  • alerts This object summarizes the findings and tells us whether there are significant issues with the new data. The monitoring_rules we specified earlier are used to evaluate whether thresholds for pulls, z-scores or custom threshold values, like the minimum or maximum age, are exceeded.
  • traffic_lights This object indicates which of the pull, z-score or custom thresholds for feature statistics triggered a traffic light. In this overview of all values considered for alerting, a 0 indicates no alert (a 'green' traffic light), a 1 indicates a yellow traffic light and a 2 is the most severe: a red traffic light.

Let's explore the profiles, comparisons and alerts now for one feature, age:

In [20]:
# let's checkout the same element in the datastore as we saw before
batch_datastore['profiles']['age']
Out[20]:
histogram filled overflow underflow distinct nan count most_probable_value mean std min max p01 p05 p16 p50 p84 p95 p99 filled_std overflow_std underflow_std distinct_std nan_std count_std most_probable_value_std mean_std std_std min_std max_std p01_std p05_std p16_std p50_std p84_std p95_std p99_std filled_mean overflow_mean underflow_mean distinct_mean nan_mean count_mean most_probable_value_mean mean_mean std_mean min_mean max_mean p01_mean p05_mean p16_mean p50_mean p84_mean p95_mean p99_mean filled_pull overflow_pull underflow_pull distinct_pull nan_pull count_pull most_probable_value_pull mean_pull std_pull min_pull max_pull p01_pull p05_pull p16_pull p50_pull p84_pull p95_pull p99_pull
date
2015-09 <SparselyBin binWidth=2.0 bins=Count nanflow=C... 38.0 0 0 24 0.0 38.0 32.5 49.078947 25.616228 10.5 88.5 10.5 13.033333 22.564 50.5 79.92 87.1 88.5 4.768058 0.0 0.0 2.0 0.0 4.768058 22.681215 3.169404 1.311268 1.0 1.561249 1.0 1.58015 4.268252 4.193249 3.33033 1.484152 1.561249 31.625 0.0 0.0 23.0 0.0 31.625 44.25 48.889361 21.438507 11.5 88.25 11.5 14.125 25.094167 48.833333 73.27 85.891667 88.25 1.337022 NaN NaN 0.5 NaN 1.337022 -0.51805 0.059818 3.186017 -1.0 0.160128 -1.0 -0.690863 -0.592788 0.397464 1.996799 0.814157 0.160128
In [21]:
# let's checkout the same element in the datastore as we saw before
batch_datastore['comparisons']['age']
Out[21]:
prev1_ks prev1_ks_zscore prev1_ks_pvalue prev1_pearson prev1_chi2 prev1_chi2_norm prev1_chi2_zscore prev1_chi2_pvalue prev1_chi2_max_residual prev1_chi2_spike_count prev1_max_prob_diff prev1_unknown_labels ref_ks ref_ks_zscore ref_ks_pvalue ref_pearson ref_chi2 ref_chi2_norm ref_chi2_zscore ref_chi2_pvalue ref_chi2_max_residual ref_chi2_spike_count ref_max_prob_diff ref_unknown_labels ref_max_prob_diff_std ref_max_prob_diff_mean ref_max_prob_diff_pull mean_trend10_zscore
date
2015-09 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.981093 0.550987 0.290821 0.092822 39.948714 0.998718 0.068895 0.472537 2.368508 0.0 0.063137 NaN 0.0 0.063137 NaN NaN
In [22]:
# let's look at the alerting summary: age has 2 yellow traffic lights.
# since green is coded as 0, yellow as 1 and red as 2, the worst value is 1, hence yellow.
batch_datastore['alerts']['age']
Out[22]:
worst n_red n_yellow n_green
date
2015-09 1.0 0.0 2.0 25.0
In [23]:
# the traffic_lights object tells us what triggered the yellow traffic lights:
# - the min value was in the 0-18 yellow range we specified ourselves
# - the pull value of the standard deviation indicates a difference in spread in the new data
batch_datastore['traffic_lights']['age']
Out[23]:
min max filled_pull overflow_pull underflow_pull distinct_pull nan_pull count_pull most_probable_value_pull mean_pull std_pull min_pull max_pull p01_pull p05_pull p16_pull p50_pull p84_pull p95_pull p99_pull ref_max_prob_diff_pull prev1_ks_zscore prev1_chi2_zscore ref_ks_zscore ref_chi2_zscore mean_trend10_zscore ref_unknown_labels
date
2015-09 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
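Putting these objects to work, here is a minimal sketch that scans the alerts for all features and flags the batch when any red traffic light occurs. The _AGGREGATE_ key is popmon's own summary variable (see the log output above), which we skip here:

# collect the worst traffic light per feature (0 = green, 1 = yellow, 2 = red)
worst_per_feature = {
    feature: batch_datastore["alerts"][feature]["worst"].max()
    for feature in batch_datastore["alerts"]
    if feature != "_AGGREGATE_"
}
if max(worst_per_feature.values()) >= 2:
    print("Red alert: investigate this batch before trusting the model scores!")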


In the same way, we can create a monitoring report that gives us more insightful information

Since the datastore can be a bit overwhelming, you can also create a report that presents all the information visually, making it possible to get a better overview and to click around the features to get a sense of all the possibilities.

In [24]:
batch_monitoring_report = popmon.stability_report(hists=new_hists, monitoring_rules=monitoring_rules, reference_type='external', reference=hists)
2022-04-15 08:13:00,164 INFO [hist_splitter]: Splitting histograms "hists" as "split_hists"
2022-04-15 08:13:00,181 INFO [hist_comparer]: Comparing "split_hists" with rolling sum of 1 previous histogram(s).
2022-04-15 08:13:00,278 INFO [hist_splitter]: Splitting histograms "ref_hists" as "split_ref_hists"
2022-04-15 08:13:00,286 INFO [hist_comparer]: Comparing "split_hists" with reference "split_ref_hists"
2022-04-15 08:13:00,470 INFO [pull_calculator]: Comparing "comparisons" with median/mad of reference "comparisons"
2022-04-15 08:13:00,556 INFO [hist_profiler]: Profiling histograms "split_hists" as "profiles"
2022-04-15 08:13:00,604 INFO [hist_profiler]: Profiling histograms "split_ref_hists" as "ref_profiles"
2022-04-15 08:13:00,712 INFO [pull_calculator]: Comparing "profiles" with reference "ref_profiles"
2022-04-15 08:13:01,004 INFO [apply_func]: Computing significance of (rolling) trend in means of features
2022-04-15 08:13:01,044 INFO [compute_tl_bounds]: Calculating static bounds for "profiles"
2022-04-15 08:13:01,224 INFO [compute_tl_bounds]: Calculating static bounds for "comparisons"
2022-04-15 08:13:01,261 INFO [compute_tl_bounds]: Calculating traffic light alerts for "profiles"
2022-04-15 08:13:01,314 INFO [compute_tl_bounds]: Calculating traffic light alerts for "comparisons"
2022-04-15 08:13:01,358 INFO [apply_func]: Generating traffic light alerts summary.
2022-04-15 08:13:01,396 INFO [alerts_summary]: Combining alerts into artificial variable "_AGGREGATE_"
2022-04-15 08:13:01,404 INFO [report_pipelines]: Generating report "html_report".
2022-04-15 08:13:01,404 INFO [histogram_section]: Generating section "Histograms".
100%|█████████████████████████████████████████████████████████████████| 8/8 [00:05<00:00,  1.47it/s]
2022-04-15 08:13:06,859 INFO [traffic_light_section_generator]: Generating section "Traffic Lights". skip empty plots: True
100%|████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 125.59it/s]
2022-04-15 08:13:06,927 INFO [alert_section_generator]: Generating section "Alerts". skip empty plots: True
  0%|                                                                         | 0/9 [00:00<?, ?it/s]C:\ProgramData\Miniconda3\envs\popmon_env\lib\site-packages\popmon\visualization\utils.py:186: RuntimeWarning: invalid value encountered in long_scalars
  a = data[c1][c2] / row_max
100%|████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 342.60it/s]
2022-04-15 08:13:06,953 INFO [section_generator]: Generating section "Comparisons". skip empty plots: True
100%|█████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.08it/s]
2022-04-15 08:13:10,813 INFO [section_generator]: Generating section "Profiles". skip empty plots: True
100%|█████████████████████████████████████████████████████████████████| 8/8 [00:03<00:00,  2.03it/s]
In [25]:
batch_monitoring_report
Out[25]:
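In a notebook, the report renders inline as an interactive HTML page (hence the empty output above). To share it outside the notebook, it can also be written to disk; as far as we know, the report object offers a to_file method for this:

# write the interactive report to an HTML file, e.g. to share with colleagues
batch_monitoring_report.to_file("batch_monitoring_report.html")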

Next steps

We know we've only scratched the surface of all the beautiful and smart things popmon has to offer, but this article was just meant to give you a quick start in using the package. The possible next steps, as we see them, would be:

  • finding out which metrics are used and what they do/mean, which we explained here
  • finding out if the default settings of these metrics apply to your model

This last step is a difficult one. In our experience, the only reason we've found so far to deviate from popmon's default values was to set fixed minimum and maximum values for a feature like age; other than that, we haven't found a reason to adjust the default settings yet. But please let us know if you do, and why!

We hope this article helped you get started with using popmon for monitoring model drift. If you are looking for more example notebooks, ING created some really nice ones as well:

  • Basic tutorial
  • Advanced tutorial
  • Incremental datasets
  • Report interpretation

You can also check out our other two articles in this series:

  • Model drift, automatic retraining and how not to ruin your models
  • The statistics underlying the popmon hood

Good luck, and more importantly, have fun!