Recently, I have frequently found myself visualizing time-series with various annotations, and in cycle plots, to better understand their seasonality. I am writing this blog post as a reference for my future self — but also for you.
Therefore, please excuse that at times there is almost no text, only code.
First I will generate some dummy data and show my existing plots.
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.dates as mdates
aut_locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
aut_formatter = mdates.ConciseDateFormatter(aut_locator)
import random
random_seed = 47
np.random.seed(random_seed)
random.seed(random_seed)
Populating the interactive namespace from numpy and matplotlib
def generate_df_for_device(n_observations, n_metrics, device_id, geo_id, topology_id, cohort_id):
    """Build one device's dummy frame: hourly random metric columns plus constant id columns."""
    hourly_index = pd.date_range('2020', freq='H', periods=n_observations)
    frame = pd.DataFrame(np.random.randn(n_observations, n_metrics), index=hourly_index)
    frame.columns = [f'metrik_{col}' for col in frame.columns]
    # Attach the id metadata; each is a scalar broadcast over all rows of this device.
    frame['geospatial_id'] = geo_id
    frame['topology_id'] = topology_id
    frame['cohort_id'] = cohort_id
    frame['device_id'] = device_id
    return frame
def generate_multi_device(n_observations, n_metrics, n_devices, cohort_levels, topo_levels):
    """Stack dummy frames for n_devices devices, each with random geo/cohort/topology ids."""
    frames = []
    for device_id in range(1, n_devices + 1):
        # Draw the ids in the same order for each device (geo, cohort, topology)
        # so the stdlib RNG stream stays stable for a given seed.
        geo_id = random.randrange(1, n_devices)
        cohort_id = random.randrange(1, cohort_levels)
        topology_id = random.randrange(1, topo_levels)
        frames.append(generate_df_for_device(n_observations, n_metrics, device_id, geo_id, topology_id, cohort_id))
    return pd.concat(frames)
# hourly data, 1 week of data
n_observations = 7 * 24
n_metrics = 3
n_devices = 20
cohort_levels = 3
topo_levels = 5
# Build the combined frame, order it by time, and expose the timestamp as a column.
df = (
    generate_multi_device(n_observations, n_metrics, n_devices, cohort_levels, topo_levels)
    .sort_index()
    .reset_index()
    .rename(columns={'index': 'hour'})
)
# Calendar date of each observation — used later to group lines per day.
df['dt'] = df.hour.dt.date
df.head()
hour | metrik_0 | metrik_1 | metrik_2 | geospatial_id | topology_id | cohort_id | device_id | dt | |
---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | -0.848009 | 1.305906 | 0.924208 | 12 | 4 | 1 | 1 | 2020-01-01 |
1 | 2020-01-01 | -0.516120 | 0.617011 | 0.623065 | 8 | 3 | 1 | 9 | 2020-01-01 |
2 | 2020-01-01 | 0.762399 | -0.359898 | -0.905238 | 19 | 3 | 2 | 13 | 2020-01-01 |
3 | 2020-01-01 | 0.708512 | -1.502019 | -2.677056 | 8 | 4 | 2 | 8 | 2020-01-01 |
4 | 2020-01-01 | 0.249475 | 0.590983 | -0.677694 | 11 | 3 | 1 | 12 | 2020-01-01 |
marker_labels = pd.DataFrame({'cohort_id':[1,1, 1], 'marker_type':['a', 'b', 'a'], 'start':['2020-01-2', '2020-01-04 05', '2020-01-06'], 'end':[np.nan, '2020-01-05 16', np.nan]})
marker_labels['start'] = pd.to_datetime(marker_labels['start'])
marker_labels['end'] = pd.to_datetime(marker_labels['end'])
marker_labels.loc[marker_labels['end'].isnull(), 'end'] = marker_labels.start + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)
marker_labels
cohort_id | marker_type | start | end | |
---|---|---|---|---|
0 | 1 | a | 2020-01-02 00:00:00 | 2020-01-02 23:59:59 |
1 | 1 | b | 2020-01-04 05:00:00 | 2020-01-05 16:00:00 |
2 | 1 | a | 2020-01-06 00:00:00 | 2020-01-06 23:59:59 |
We want to only consider the labels we have and figure out if we see a valid match there. But what does this say about precision?
Between JOIN https://stackoverflow.com/questions/44106304/merging-two-pandas-dataframes-by-interval
joined = a.merge(b,on='id')
joined = joined[joined.ts.between(joined.ts1,joined.ts2)]
other options with SQLITE and SQL BETWEEN operator would exist but are more complex to integrate (for now) https://stackoverflow.com/questions/30627968/merge-pandas-dataframes-where-one-value-is-between-two-others
# Interval join: attach each marker label to the observations whose timestamp
# falls inside [start, end] for the same cohort.
merged_res = (df.reset_index()
# left-join every marker of the cohort onto every observation of that cohort ...
.merge(marker_labels, on='cohort_id', how='left')
# ... then keep only the rows whose hour lies inside the marker interval
.query('start <= hour <= end')
# restore the original row index and bring back the rows that matched no marker
.set_index('index')
.reindex(df.index)
)
# Rows without a marker are all-NaN after reindex; fill them back from df.
merged_res = merged_res.combine_first(df)
# Sanity check: the interval join must not change the number of observations.
print(df.shape[0])
merged_res.shape[0]
3360
3360
# One matplotlib figure per cohort: metrik_0 over time with marker intervals shaded.
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    fig, ax = plt.subplots(figsize=(25, 9))
    ax = sns.lineplot(x='hour', y='metrik_0', data=df[df.cohort_id == cohort_id], ax=ax)
    ax.xaxis.set_major_locator(aut_locator)
    ax.xaxis.set_major_formatter(aut_formatter)
    plt.title(f'cohort_id: {cohort_id}', fontsize=45)
    plt.xlabel('')
    plt.ylabel('metrik_0', fontsize=35)
    # Shade each labelled interval: type 'b' in gray, every other type in orange.
    for _, row in marker_labels.iterrows():
        if row.marker_type == 'b':
            span_color, span_alpha = 'gray', 0.2
        else:
            span_color, span_alpha = 'orange', 0.5
        ax.axvspan(row.start, row.end, color=span_color, alpha=span_alpha)
    plt.show()
1
2
Variant 1: multiple metrics
# Names of all metric columns (metrik_0 .. metrik_n).
metrik_columns = [c for c in df.columns if 'metrik' in c]
metrik_columns
['metrik_0', 'metrik_1', 'metrik_2']
width=1200
height=500
import holoviews as hv
import hvplot.pandas
# Interactive variant 1: all metric columns in one hvplot per cohort.
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    cohort_data = df[df.cohort_id == cohort_id].set_index(['hour'])[metrik_columns]
    current_plot = cohort_data.hvplot(width=width, height=height).opts(active_tools=['box_zoom'])
    # Overlay one vertical span per marker interval.
    for _, row in marker_labels.iterrows():
        vspan = hv.VSpan(row.start, row.end)
        if row.marker_type == 'b':
            current_plot = current_plot * vspan.opts(color='grey', alpha=0.2)
        else:
            current_plot = current_plot * vspan.opts(color='orange', alpha=0.4)
    display(current_plot)
1
2
Variant 2: single metrik for the different devices
# Interactive variant 2: metrik_0 broken out per device within each cohort.
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    cohort_data = df[df.cohort_id == cohort_id].set_index(['hour'])[['metrik_0', 'device_id']]
    current_plot = cohort_data.hvplot(by='device_id', width=width, height=height).opts(active_tools=['box_zoom'])
    # Overlay the marker intervals, same color scheme as variant 1.
    for _, row in marker_labels.iterrows():
        vspan = hv.VSpan(row.start, row.end)
        span_opts = {'color': 'grey', 'alpha': 0.2} if row.marker_type == 'b' else {'color': 'orange', 'alpha': 0.4}
        current_plot = current_plot * vspan.opts(**span_opts)
    display(current_plot)
1
2
Question 1:
How can I move the legend of the interactive plot down and also display more than a single column? I could not get it to work so far https://github.com/holoviz/holoviews/issues/3780
# Cycle plot: metrik_0 against hour-of-day, one line per (device, day) combination.
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    fig, ax = plt.subplots(figsize=(25, 9))
    a1 = sns.lineplot(x=df['hour'].dt.hour, y='metrik_0', hue='device_id', units='dt', style='dt', estimator=None, data=df[(df.cohort_id == cohort_id)], ax=ax)
    # Drop seaborn's automatic legend title entry and move the legend below the axes.
    handles, labels = a1.get_legend_handles_labels()
    a1.legend(handles=handles[1:], labels=labels[1:], loc='center', bbox_to_anchor=(0.5, -0.25), ncol=6, fontsize=20)
    plt.title(f'cohort_id: {cohort_id}', fontsize=35)
    plt.xlabel('hour of the day', fontsize=35)
    plt.ylabel('metrik_0', fontsize=35)
    plt.show()
1
2
merged_res.marker_type = merged_res.marker_type.fillna('no_labels_reported')
How to plot the line separately?
# Same cycle plot as before, but colored by marker_type instead of device.
for cohort_id in sorted(merged_res.cohort_id.unique()):
    print(cohort_id)
    fig, ax = plt.subplots(figsize=(25, 9))
    a1 = sns.lineplot(x=merged_res['hour'].dt.hour, y='metrik_0', hue='marker_type', units='dt', style='dt', estimator=None, data=merged_res[(merged_res.cohort_id == cohort_id)], ax=ax)
    # Strip the automatic legend title and anchor the legend under the plot.
    handles, labels = a1.get_legend_handles_labels()
    a1.legend(handles=handles[1:], labels=labels[1:], loc='center', bbox_to_anchor=(0.5, -0.25), ncol=6, fontsize=20)
    plt.title(f'cohort_id: {cohort_id}', fontsize=35)
    plt.xlabel('hour of the day', fontsize=35)
    plt.ylabel('metrik_0', fontsize=35)
    plt.show()
1.0
2.0
merged_res.head()
cohort_id | device_id | dt | end | geospatial_id | hour | marker_type | metrik_0 | metrik_1 | metrik_2 | start | topology_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 2020-01-01 | NaT | 12.0 | 2020-01-01 | no_labels_reported | -0.848009 | 1.305906 | 0.924208 | NaT | 4.0 |
1 | 1.0 | 9.0 | 2020-01-01 | NaT | 8.0 | 2020-01-01 | no_labels_reported | -0.516120 | 0.617011 | 0.623065 | NaT | 3.0 |
2 | 2.0 | 13.0 | 2020-01-01 | NaT | 19.0 | 2020-01-01 | no_labels_reported | 0.762399 | -0.359898 | -0.905238 | NaT | 3.0 |
3 | 2.0 | 8.0 | 2020-01-01 | NaT | 8.0 | 2020-01-01 | no_labels_reported | 0.708512 | -1.502019 | -2.677056 | NaT | 4.0 |
4 | 1.0 | 12.0 | 2020-01-01 | NaT | 11.0 | 2020-01-01 | no_labels_reported | 0.249475 | 0.590983 | -0.677694 | NaT | 3.0 |
# Interactive view: metrik_0 per marker_type, with the interval shading overlaid.
for cohort_id in sorted(merged_res.cohort_id.unique()):
    print(cohort_id)
    cohort_data = merged_res[merged_res.cohort_id == cohort_id].set_index(['hour'])[['metrik_0', 'marker_type']]
    current_plot = cohort_data.hvplot(by='marker_type', width=width, height=height).opts(active_tools=['box_zoom'])
    for _, row in marker_labels.iterrows():
        vspan = hv.VSpan(row.start, row.end)
        if row.marker_type == 'b':
            current_plot = current_plot * vspan.opts(color='grey', alpha=0.2)
        else:
            current_plot = current_plot * vspan.opts(color='orange', alpha=0.4)
    display(current_plot)
1.0
2.0
# Hour of day for the cycle view; device_id as string so hvplot treats it as categorical.
merged_res['hour_time'] = merged_res['hour'].dt.hour
# Bracket indexing rather than attribute assignment: attribute-style column writes
# are ambiguous (they can create an instance attribute) and discouraged by pandas.
merged_res['device_id'] = merged_res['device_id'].astype(str)
for cohort_id in sorted(merged_res.cohort_id.unique()):
    print(cohort_id)
    # One curve per marker_type over the hour of day, with day/device in the hover tooltip.
    current_plot = merged_res[merged_res.cohort_id == cohort_id].set_index(['hour_time'])[['metrik_0', 'marker_type', 'device_id', 'dt']].hvplot(
        by=['marker_type'], hover_cols=['dt', 'device_id'], width=width, height=height
    ).opts(active_tools=['box_zoom'])
    display(current_plot)
1.0
2.0
import datashader as ds
import datashader.transfer_functions as tf
import holoviews as hv
from holoviews.operation.datashader import datashade
hv.extension('bokeh')
from datashader.colors import Sets1to3
lab_s = df.cohort_id.unique()
color_key = [(name,color) for name,color in zip(lab_s, Sets1to3)]
color_points = hv.NdOverlay({n: hv.Points([0,0], label=str(n)).opts(style=dict(color=c)) for n,c in color_key})
datashade(hv.Points(df, kdims=['metrik_0', 'metrik_1'], vdims=['cohort_id']), aggregator=ds.count_cat('cohort_id') , color_key=Sets1to3).opts(width=width, height=height) * color_points
Though this is not necessarily very indicative on the dummy data, it could still be worthwhile on a full (real) dataset.
Now build on (3) and create something for plotting periodicities, interactivity but scalable for the full (real) dataset:
merged_res.head()
cohort_id | device_id | dt | end | geospatial_id | hour | marker_type | metrik_0 | metrik_1 | metrik_2 | start | topology_id | hour_time | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.0 | 1.0 | 2020-01-01 | NaT | 12.0 | 2020-01-01 | no_labels_reported | -0.848009 | 1.305906 | 0.924208 | NaT | 4.0 | 0 |
1 | 1.0 | 9.0 | 2020-01-01 | NaT | 8.0 | 2020-01-01 | no_labels_reported | -0.516120 | 0.617011 | 0.623065 | NaT | 3.0 | 0 |
2 | 2.0 | 13.0 | 2020-01-01 | NaT | 19.0 | 2020-01-01 | no_labels_reported | 0.762399 | -0.359898 | -0.905238 | NaT | 3.0 | 0 |
3 | 2.0 | 8.0 | 2020-01-01 | NaT | 8.0 | 2020-01-01 | no_labels_reported | 0.708512 | -1.502019 | -2.677056 | NaT | 4.0 | 0 |
4 | 1.0 | 12.0 | 2020-01-01 | NaT | 11.0 | 2020-01-01 | no_labels_reported | 0.249475 | 0.590983 | -0.677694 | NaT | 3.0 | 0 |
lab_s = merged_res.marker_type.unique()
color_key = [(name,color) for name,color in zip(lab_s, Sets1to3)]
color_points = hv.NdOverlay({n: hv.Points([0,0], label=str(n)).opts(style=dict(color=c)) for n,c in color_key})
datashade(hv.Points(merged_res, kdims=['hour_time', 'metrik_0'], vdims=['marker_type']), aggregator=ds.count_cat('marker_type') , color_key=Sets1to3).opts(width=width, height=height) * color_points
Plotting individual points does not work well in matplotlib here — there are simply too many observations in the data set. Instead, we can aggregate the data first and then plot the aggregates.
# Aggregate instead of plotting raw points: mean metrik_0 per hour-of-day and marker type.
agged = merged_res.groupby(['hour_time', 'marker_type']).metrik_0.mean().unstack()
display(agged.head())
# Enlarge tick labels globally for the big figure.
plt.rc('ytick', labelsize=20)
plt.rc('xtick', labelsize=20)
fig, ax = plt.subplots(figsize=(25, 9))
agged.plot.bar(ax=ax)
plt.xlabel('hour of the day', fontsize=35)
plt.ylabel('mean of metrik_0', fontsize=35)
plt.show()
marker_type | a | b | no_labels_reported |
---|---|---|---|
hour_time | |||
0 | 0.256931 | -0.147816 | -0.027035 |
1 | -0.033618 | -0.334960 | 0.103872 |
2 | 0.362046 | 0.522553 | 0.081811 |
3 | 0.033785 | -0.196713 | 0.154278 |
4 | 0.031332 | 0.217341 | -0.173423 |