Recently, I have frequently been visualizing time series with various annotations, and in cycle plots, to better understand their seasonality. I am writing this blog post as a reference for my future self, but also for you.
Therefore, please excuse me if at times there is almost no text, only code.
First I will generate some dummy data and show my existing plots.
# Notebook setup: IPython magic, data/plotting imports, shared date-axis helpers and RNG seeding.
# NOTE(review): `%pylab inline` is an IPython magic, so this cell only runs inside IPython/Jupyter.
# The `plt` name used by the plotting code further down is presumably provided by it - confirm.
%pylab inline
import pandas as pd
import numpy as np
import seaborn as sns; sns.set()
import matplotlib.dates as mdates
# Date-axis tick locator (between 3 and 7 major ticks) and a concise formatter built on it.
aut_locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
aut_formatter = mdates.ConciseDateFormatter(aut_locator)
import random
# Seed both NumPy's global RNG and the standard-library RNG with the same fixed value.
random_seed = 47
np.random.seed(random_seed)
random.seed(random_seed)
Populating the interactive namespace from numpy and matplotlib
def generate_df_for_device(n_observations, n_metrics, device_id, geo_id, topology_id, cohort_id):
    """Build a frame of random hourly metrics for a single device.

    Parameters
    ----------
    n_observations : int
        Number of hourly rows; the index starts at 2020-01-01 00:00.
    n_metrics : int
        Number of metric columns, named ``metrik_0`` ... ``metrik_{n_metrics - 1}``.
    device_id, geo_id, topology_id, cohort_id
        Constant values written to the ``device_id``, ``geospatial_id``,
        ``topology_id`` and ``cohort_id`` columns of every row.

    Returns
    -------
    pandas.DataFrame
        Standard-normal samples drawn from NumPy's global RNG (seed it
        beforehand for reproducible output), plus the four id columns.
    """
    # Lowercase 'h' is the hourly alias; the uppercase 'H' spelling is
    # deprecated in recent pandas releases.
    hours = pd.date_range('2020', freq='h', periods=n_observations)
    df = pd.DataFrame(np.random.randn(n_observations, n_metrics), index=hours)
    df.columns = [f'metrik_{c}' for c in df.columns]
    df['geospatial_id'] = geo_id
    df['topology_id'] = topology_id
    df['cohort_id'] = cohort_id
    df['device_id'] = device_id
    return df
def generate_multi_device(n_observations, n_metrics, n_devices, cohort_levels, topo_levels):
    """Generate random hourly metrics for several devices and stack them.

    Devices are numbered ``1 .. n_devices``. For each device a geospatial id,
    a cohort id and a topology id are drawn (in that order) from the
    standard-library RNG via ``random.randrange(1, upper)``. The upper bound
    is exclusive, so e.g. ``cohort_levels=3`` yields cohort ids 1 and 2 only.

    Parameters
    ----------
    n_observations : int
        Hourly rows generated per device.
    n_metrics : int
        Metric columns generated per device.
    n_devices : int
        Number of devices; also the exclusive upper bound of the geospatial id.
    cohort_levels : int
        Exclusive upper bound of the cohort id.
    topo_levels : int
        Exclusive upper bound of the topology id.

    Returns
    -------
    pandas.DataFrame
        The per-device frames concatenated row-wise, in device order.

    Raises
    ------
    ValueError
        From ``random.randrange`` when a needed upper bound is <= 1 (empty
        range), or from ``pd.concat`` when ``n_devices`` < 1 (nothing to
        concatenate).
    """
    results = []
    for device_id in range(1, n_devices + 1):
        # Keep this draw order: it determines which values a seeded RNG assigns.
        geo_id = random.randrange(1, n_devices)
        cohort = random.randrange(1, cohort_levels)
        topo = random.randrange(1, topo_levels)
        df_single_device = generate_df_for_device(
            n_observations, n_metrics, device_id, geo_id, topo, cohort
        )
        results.append(df_single_device)
    return pd.concat(results)
# Simulation settings: one week of hourly readings for each device.
n_devices = 20
n_metrics = 3
n_observations = 7 * 24
topo_levels = 5
cohort_levels = 3

# Generate all devices, order rows by timestamp, then expose the timestamp
# index as a regular `hour` column.
df = (
    generate_multi_device(n_observations, n_metrics, n_devices, cohort_levels, topo_levels)
    .sort_index()
    .reset_index()
    .rename(columns={'index': 'hour'})
)
# Calendar date of each reading.
df['dt'] = df['hour'].dt.date
df.head()
| | hour | metrik_0 | metrik_1 | metrik_2 | geospatial_id | topology_id | cohort_id | device_id | dt |
---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-01 | -0.848009 | 1.305906 | 0.924208 | 12 | 4 | 1 | 1 | 2020-01-01 |
1 | 2020-01-01 | -0.516120 | 0.617011 | 0.623065 | 8 | 3 | 1 | 9 | 2020-01-01 |
2 | 2020-01-01 | 0.762399 | -0.359898 | -0.905238 | 19 | 3 | 2 | 13 | 2020-01-01 |
3 | 2020-01-01 | 0.708512 | -1.502019 | -2.677056 | 8 | 4 | 2 | 8 | 2020-01-01 |
4 | 2020-01-01 | 0.249475 | 0.590983 | -0.677694 | 11 | 3 | 1 | 12 | 2020-01-01 |
# Annotation intervals per cohort: each marker has a type, a start and an
# optional end. All timestamps use one uniform format so they can be parsed
# with an explicit `format` and do not depend on pandas' per-element
# fallback parsing.
marker_labels = pd.DataFrame({
    'cohort_id': [1, 1, 1],
    'marker_type': ['a', 'b', 'a'],
    'start': ['2020-01-02 00:00', '2020-01-04 05:00', '2020-01-06 00:00'],
    'end': [np.nan, '2020-01-05 16:00', np.nan],
})
marker_labels['start'] = pd.to_datetime(marker_labels['start'], format='%Y-%m-%d %H:%M')
marker_labels['end'] = pd.to_datetime(marker_labels['end'], format='%Y-%m-%d %H:%M')
# Markers without an explicit end last until one second before the same time
# on the following day (i.e. 23:59:59 for a marker starting at midnight).
marker_labels['end'] = marker_labels['end'].fillna(
    marker_labels['start'] + pd.Timedelta(days=1) - pd.Timedelta(seconds=1)
)
marker_labels
| | cohort_id | marker_type | start | end |
---|---|---|---|---|
0 | 1 | a | 2020-01-02 00:00:00 | 2020-01-02 23:59:59 |
1 | 1 | b | 2020-01-04 05:00:00 | 2020-01-05 16:00:00 |
2 | 1 | a | 2020-01-06 00:00:00 | 2020-01-06 23:59:59 |
We want to only consider the labels we have and figure out if we see a valid match there. But what does this say about precision?
A BETWEEN-style (interval) join in pandas, following https://stackoverflow.com/questions/44106304/merging-two-pandas-dataframes-by-interval:
# Interval-join pattern: merge the two frames on the shared key, then keep only
# the rows whose `ts` lies between `ts1` and `ts2`.
# NOTE(review): `a`, `b` and the column names are not defined anywhere in the
# visible notebook - presumably placeholders from the linked answer; confirm.
joined = a.merge(b,on='id')
joined = joined[joined.ts.between(joined.ts1,joined.ts2)]
Other options, such as SQLite with the SQL BETWEEN operator, exist but are more complex to integrate (for now): https://stackoverflow.com/questions/30627968/merge-pandas-dataframes-where-one-value-is-between-two-others
# Interval join of readings against marker labels, done step by step.
# 1. Carry each reading's row label along as an `index` column.
merged_res = df.reset_index()
# 2. Pair every reading with every marker of the same cohort; readings whose
#    cohort has no markers keep missing start/end values.
merged_res = merged_res.merge(marker_labels, on='cohort_id', how='left')
# 3. Keep only the pairs where the reading falls inside the marker interval.
merged_res = merged_res.query('start <= hour <= end')
# 4. Restore the original row labels; readings without a matching marker
#    come back as empty rows.
merged_res = merged_res.set_index('index').reindex(df.index)
# 5. Fill those empty rows (and any other gaps) from the original readings.
merged_res = merged_res.combine_first(df)
print(df.shape[0])
merged_res.shape[0]
3360
3360
# One figure per cohort: the `metrik_0` line plot with that cohort's marker
# intervals shaded on top.
figsize = (25, 9)
for cohort_id in sorted(df.cohort_id.unique()):
    print(cohort_id)
    fig, ax = plt.subplots(figsize=figsize)
    ax = sns.lineplot(x='hour', y='metrik_0', data=df[df.cohort_id == cohort_id], ax=ax)
    # Give every axes its own locator/formatter pair: matplotlib advises
    # against sharing one locator instance between several axes.
    locator = mdates.AutoDateLocator(minticks=3, maxticks=7)
    ax.xaxis.set_major_locator(locator)
    ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter(locator))
    plt.title(f'cohort_id: {cohort_id}', fontsize=45)
    plt.xlabel('')
    plt.ylabel('metrik_0', fontsize=35)
    # Only shade the markers that belong to the cohort being plotted;
    # previously every marker was drawn on every cohort's figure.
    cohort_markers = marker_labels[marker_labels.cohort_id == cohort_id]
    for marker in cohort_markers.itertuples(index=False):
        if marker.marker_type == 'b':
            ax.axvspan(marker.start, marker.end, color='gray', alpha=0.2)
        else:
            ax.axvspan(marker.start, marker.end, color='orange', alpha=0.5)
    plt.show()
1
2