Segmentation¶
How to use timestep segmentation to reduce the number of timesteps per period.
Author: Maximilian Hoffmann
In [1]:
Copied!
%load_ext autoreload
%autoreload 2
import pandas as pd
import plotly.express as px
import plotly.io as pio
import tsam
from tsam import ClusterConfig, SegmentConfig
pio.renderers.default = "notebook_connected"
import warnings
# Added to every example notebook: silence the v3 column-order
# FutureWarning in the rendered docs (tsam v4 returns result columns in
# input order; see migration guide).
warnings.filterwarnings(
"ignore", category=FutureWarning, message=".*sorted alphabetically.*"
)
%load_ext autoreload
%autoreload 2
import pandas as pd
import plotly.express as px
import plotly.io as pio
import tsam
from tsam import ClusterConfig, SegmentConfig
pio.renderers.default = "notebook_connected"
import warnings
# Added to every example notebook: silence the v3 column-order
# FutureWarning in the rendered docs (tsam v4 returns result columns in
# input order; see migration guide).
warnings.filterwarnings(
"ignore", category=FutureWarning, message=".*sorted alphabetically.*"
)
Input data¶
In [2]:
Copied!
# Load the example time series; the first CSV column becomes the index.
raw = pd.read_csv("testdata.csv", index_col=0)
10 typical days at hourly resolution (no segmentation)¶
Baseline: hierarchical clustering with medoid representation and 24 hourly timesteps.
In [3]:
Copied!
# Baseline: aggregate `raw` into 10 clusters of 24-timestep periods using
# hierarchical clustering, without segmentation.
result = tsam.aggregate(
    raw,
    n_clusters=10,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical"),
)
# Last expression of the cell: display the accuracy metrics.
result.accuracy
Out[3]:
AccuracyMetrics( rmse=0.1044 (weighted), mae=0.0715 (weighted), rmse_duration=0.0337 (weighted) )
In [4]:
Copied!
# Plot the cluster members of the baseline (unsegmented) result.
result.plot.cluster_members()
In [5]:
Copied!
# Display the cluster representatives of the baseline result.
result.cluster_representatives
Out[5]:
| | timestep | GHI | Load | T | Wind |
|---|---|---|---|---|---|
| 0 | 0 | 0.000000 | 383.795921 | 4.421453 | 5.043609 |
| | 1 | 0.000000 | 371.689230 | 4.026704 | 4.933965 |
| | 2 | 0.000000 | 374.310544 | 3.829330 | 4.714678 |
| | 3 | 0.000000 | 369.720991 | 3.730643 | 4.605034 |
| | 4 | 0.000000 | 376.053584 | 3.335894 | 5.591827 |
| ... | ... | ... | ... | ... | ... |
| 9 | 19 | 2.199516 | 478.910919 | 10.145308 | 2.192873 |
| | 20 | 0.000000 | 457.575577 | 10.342682 | 2.192873 |
| | 21 | 0.000000 | 444.225788 | 10.540056 | 2.192873 |
| | 22 | 0.000000 | 430.132842 | 10.441369 | 2.192873 |
| | 23 | 0.000000 | 401.874887 | 10.441369 | 2.192873 |
240 rows × 4 columns
20 typical days with 12 irregular segments¶
Segmentation reduces the number of timesteps per period while preserving key transitions.
In [6]:
Copied!
# Segmented variant: 20 clusters of 24-timestep periods, each period reduced
# to 12 segments, again using hierarchical clustering.
result_segmented = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical"),
    segments=SegmentConfig(n_segments=12),
)
# Last expression of the cell: display the accuracy metrics.
result_segmented.accuracy
Out[6]:
AccuracyMetrics( rmse=0.0961 (weighted), mae=0.0658 (weighted), rmse_duration=0.0203 (weighted) )
In [7]:
Copied!
# Plot the cluster members of the segmented result.
result_segmented.plot.cluster_members()
In [8]:
Copied!
# Plot the segment durations of the segmented result.
result_segmented.plot.segment_durations()
In [9]:
Copied!
# Display the cluster representatives of the segmented result.
result_segmented.cluster_representatives
Out[9]:
| | Segment Step | Segment Duration | GHI | Load | T | Wind |
|---|---|---|---|---|---|---|
| 0 | 0 | 4 | 0.000000 | 403.310708 | 1.025025 | 2.226777 |
| | 1 | 2 | 0.000000 | 424.393351 | 1.122830 | 3.340165 |
| | 2 | 1 | 0.000000 | 495.118712 | 1.171733 | 3.340165 |
| | 3 | 2 | 11.182790 | 541.665766 | 1.562953 | 3.340165 |
| | 4 | 4 | 74.551932 | 547.697631 | 2.272040 | 2.226777 |
| ... | ... | ... | ... | ... | ... | ... |
| 19 | 7 | 3 | 76.681988 | 471.099097 | -2.447057 | 3.340165 |
| | 8 | 4 | 34.080883 | 469.022225 | -1.664616 | 2.226777 |
| | 9 | 1 | 0.000000 | 499.029025 | -2.055836 | 2.226777 |
| | 10 | 4 | 0.000000 | 498.349573 | -2.178093 | 1.113388 |
| | 11 | 2 | 0.000000 | 474.908494 | -2.495959 | 2.226777 |
240 rows × 4 columns
Comparison¶
In [10]:
Copied!
# Aggregation results to compare, keyed by the label shown in plot legends.
# `results` is reused by the time-slice comparison cell further down.
results = {
    "10 x 24h": result,
    "20 x 12seg": result_segmented,
}

# Duration curves: for each method, sort the Load series in descending order
# and plot it against its rank.
load_by_method = {
    "Original": raw["Load"],
    **{name: r.reconstructed["Load"] for name, r in results.items()},
}
duration_curves = []
for name, load in load_by_method.items():
    # Drop the original index so every curve shares the same 0..n-1 x-axis.
    sorted_load = load.sort_values(ascending=False).reset_index(drop=True)
    duration_curves.append(
        pd.DataFrame(
            {"Hour": range(len(sorted_load)), "Load": sorted_load, "Method": name}
        )
    )

px.line(
    pd.concat(duration_curves, ignore_index=True),
    x="Hour",
    y="Load",
    color="Method",
    title="Duration Curve Comparison - Load",
)
In [11]:
Copied!
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Heatmap comparison: one subplot per method for a single column of the data.
param = "GHI"

# Subplot title -> period-wise frame from tsam.unstack_to_periods.
period_frames = {
    "Original": tsam.unstack_to_periods(raw, period_duration=24),
    "10 x 24h": tsam.unstack_to_periods(result.reconstructed, period_duration=24),
    "20 x 12seg": tsam.unstack_to_periods(
        result_segmented.reconstructed, period_duration=24
    ),
}

fig = make_subplots(
    rows=len(period_frames),
    cols=1,
    subplot_titles=list(period_frames),
    vertical_spacing=0.05,
)
for row, frame in enumerate(period_frames.values(), 1):
    # All traces reference the same "coloraxis" so the heatmaps share one
    # colour scale.
    # NOTE(review): presumably rows of `frame[param]` are periods, so the
    # transpose puts periods on the x-axis; confirm against unstack_to_periods.
    fig.add_trace(
        go.Heatmap(z=frame[param].values.T, coloraxis="coloraxis"),
        row=row,
        col=1,
    )

fig.update_layout(
    height=750,
    coloraxis={"colorscale": "Viridis"},
    title_text=f"Heatmap Comparison - {param}",
)
fig.show()
In [12]:
Copied!
# Time slice comparison: Load for a fixed date window, original vs. each
# reconstructed series.
frames_by_method = {
    "Original": raw,
    **{name: r.reconstructed for name, r in results.items()},
}
# NOTE(review): `raw` is read without `parse_dates`; confirm that this
# date-string slice selects the intended rows for the index type in use.
load_slices = [
    frame.loc["20100210":"20100218", ["Load"]].assign(Method=name)
    for name, frame in frames_by_method.items()
]

px.line(
    # Turn the index into a regular "Time" column so it can serve as the x-axis.
    pd.concat(load_slices).reset_index(names="Time"),
    x="Time",
    y="Load",
    color="Method",
    title="Time Slice Comparison - Load (Feb 10-18)",
)
Validation¶
Column means should be preserved, up to a small numerical tolerance, for both approaches.
In [13]:
Copied!
# Validation: per-column mean of the original data next to the means of both
# reconstructed series, one column per method.
means = pd.DataFrame(
    {
        "Original": raw.mean(),
        "10 x 24h": result.reconstructed.mean(),
        "20 x 12seg": result_segmented.reconstructed.mean(),
    }
)
means
Out[13]:
| Original | 10 x 24h | 20 x 12seg | |
|---|---|---|---|
| GHI | 110.990183 | 110.990183 | 110.990179 |
| Load | 450.260335 | 450.260335 | 450.260335 |
| T | 7.790616 | 7.790616 | 7.790616 |
| Wind | 3.057306 | 3.057306 | 3.057306 |