Representation Methods¶

Comparison of different cluster representation methods: medoid, maxoid, mean, minmax mean, and distribution (duration curve).

Author: Maximilian Hoffmann

In [1]:

Copied!





%load_ext autoreload
%autoreload 2

import pandas as pd
import plotly.express as px
import plotly.io as pio

import tsam
from tsam import ClusterConfig

pio.renderers.default = "notebook_connected"
import warnings

# Added to every example notebook: silence the v3 column-order
# FutureWarning in the rendered docs (tsam v4 returns result columns in
# input order; see migration guide).
warnings.filterwarnings(
    "ignore", category=FutureWarning, message=".*sorted alphabetically.*"
)
%load_ext autoreload
%autoreload 2

import pandas as pd
import plotly.express as px
import plotly.io as pio

import tsam
from tsam import ClusterConfig

pio.renderers.default = "notebook_connected"
import warnings

# Added to every example notebook: silence the v3 column-order
# FutureWarning in the rendered docs (tsam v4 returns result columns in
# input order; see migration guide).
warnings.filterwarnings(
    "ignore", category=FutureWarning, message=".*sorted alphabetically.*"
)

Input data¶

In [2]:

Copied!

# Load the example time series; the first CSV column becomes the index.
# parse_dates=True converts that index to a DatetimeIndex so that date-range
# selections on `raw` are chronological instead of plain string comparisons.
raw = pd.read_csv("testdata.csv", index_col=0, parse_dates=True)
raw

Out[2]:

	GHI	T	Wind	Load
2009-12-31 23:30:00	0	-2.1	7.1	375.478394
2010-01-01 00:30:00	0	-2.8	8.6	364.541326
2010-01-01 01:30:00	0	-3.3	9.7	357.416844
2010-01-01 02:30:00	0	-3.2	9.8	350.191306
2010-01-01 03:30:00	0	-3.2	9.4	345.161449
...	...	...	...	...
2010-12-31 18:30:00	0	0.9	4.1	413.094373
2010-12-31 19:30:00	0	0.6	3.8	396.424646
2010-12-31 20:30:00	0	0.3	3.4	386.052577
2010-12-31 21:30:00	0	-0.3	4.1	383.066817
2010-12-31 22:30:00	0	-1.1	5.3	365.189007

8760 rows × 4 columns

Medoid representation¶

Picks the actual observed period closest to each cluster centroid.

In [3]:

Copied!





# Hierarchical clustering into 8 clusters (period_duration=24), each cluster
# represented by its medoid. The cell ends with the accuracy metrics so they
# are displayed as the cell output.
result_medoid = tsam.aggregate(
    raw,
    n_clusters=8,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical", representation="medoid"),
)
result_medoid.accuracy

Out[3]:

AccuracyMetrics(
  rmse=0.1097 (weighted),
  mae=0.0766 (weighted),
  rmse_duration=0.0374 (weighted)
)

In [4]:

Copied!

# Show the cluster-members plot for the medoid result.
result_medoid.plot.cluster_members()

Maxoid representation¶

Picks the period that maximizes the sum of all column values in each cluster.

In [5]:

Copied!





# Same clustering setup as the medoid run (8 clusters, period_duration=24),
# but with the maxoid representation and preserve_column_means switched off.
result_maxoid = tsam.aggregate(
    raw,
    n_clusters=8,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical", representation="maxoid"),
    preserve_column_means=False,
)
result_maxoid.accuracy

Out[5]:

AccuracyMetrics(
  rmse=0.1907 (weighted),
  mae=0.1329 (weighted),
  rmse_duration=0.1110 (weighted)
)

In [6]:

Copied!

# Show the cluster-members plot for the maxoid result.
result_maxoid.plot.cluster_members()

Mean representation¶

Averages all member periods of each cluster.

In [7]:

Copied!





# Hierarchical clustering into 20 clusters (period_duration=24), each cluster
# represented by the mean of its members.
result_mean = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical", representation="mean"),
)
result_mean.accuracy

Out[7]:

AccuracyMetrics(
  rmse=0.0803 (weighted),
  mae=0.0554 (weighted),
  rmse_duration=0.0294 (weighted)
)

In [8]:

Copied!

# Show the cluster-members plot for the mean result.
result_mean.plot.cluster_members()

MinMax Mean representation¶

Like mean, but preserves the minimum and maximum values per column.

In [9]:

Copied!





# 20 clusters (period_duration=24) with the "minmax_mean" representation;
# preserve_column_means is switched off for this run.
result_minmax = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical", representation="minmax_mean"),
    preserve_column_means=False,
)
result_minmax.accuracy

Out[9]:

AccuracyMetrics(
  rmse=0.0803 (weighted),
  mae=0.0554 (weighted),
  rmse_duration=0.0294 (weighted)
)

In [10]:

Copied!

# Show the cluster-members plot for the minmax-mean result.
result_minmax.plot.cluster_members()

Distribution representation¶

Preserves the full value distribution (duration curve) within each cluster.

In [11]:

Copied!





# 20 clusters (period_duration=24) with the "distribution" representation;
# preserve_column_means is switched off for this run. The variable keeps the
# name `result_duration` because later cells refer to it.
result_duration = tsam.aggregate(
    raw,
    n_clusters=20,
    period_duration=24,
    cluster=ClusterConfig(method="hierarchical", representation="distribution"),
    preserve_column_means=False,
)
result_duration.accuracy

Out[11]:

AccuracyMetrics(
  rmse=0.0957 (weighted),
  mae=0.0643 (weighted),
  rmse_duration=0.0055 (weighted)
)

In [12]:

Copied!

# Show the cluster-members plot for the distribution result.
result_duration.plot.cluster_members()

Comparison¶

Compare all representation methods via duration curves, heatmaps, and time slices.

In [13]:

Copied!





# Every aggregation result keyed by its display label; the number in
# parentheses is the n_clusters value used for that run.
results = {
    "Medoid (8)": result_medoid,
    "Maxoid (8)": result_maxoid,
    "Mean (20)": result_mean,
    "Minmax (20)": result_minmax,
    "Distribution (20)": result_duration,
}

# Duration curves: the Load column of the original data and of every
# reconstruction, each sorted in descending order and plotted against its rank.
load_by_method = {"Original": raw["Load"]} | {
    label: res.reconstructed["Load"] for label, res in results.items()
}

frames = []
for label, load in load_by_method.items():
    # Drop the time index so all curves share a 0..n-1 rank axis.
    ranked = load.sort_values(ascending=False).reset_index(drop=True)
    frames.append(
        pd.DataFrame({"Hour": range(len(ranked)), "Load": ranked, "Method": label})
    )

px.line(
    pd.concat(frames, ignore_index=True),
    x="Hour",
    y="Load",
    color="Method",
    title="Duration Curve Comparison - Load",
)

In [14]:

Copied!





import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Heatmap comparison: one stacked heatmap per data set (original first, then
# every reconstruction) for a single column, all sharing one colour axis.
param = "GHI"
unstacked_orig = tsam.unstack_to_periods(raw, period_duration=24)

labels = ["Original", *results]
data = [
    unstacked_orig,
    *(
        tsam.unstack_to_periods(res.reconstructed, period_duration=24)
        for res in results.values()
    ),
]

fig = make_subplots(
    rows=len(data), cols=1, subplot_titles=labels, vertical_spacing=0.03
)
for row, unstacked in enumerate(data, 1):
    # NOTE(review): the transpose presumably puts periods on the x-axis and
    # the step within a period on the y-axis; confirm against the layout
    # returned by tsam.unstack_to_periods.
    fig.add_trace(
        go.Heatmap(z=unstacked[param].values.T, coloraxis="coloraxis"),
        row=row,
        col=1,
    )
fig.update_layout(
    height=250 * len(data),
    coloraxis={"colorscale": "Viridis"},
    title_text=f"Heatmap Comparison - {param}",
)
fig.show()

In [15]:

Copied!





# Time slice comparison: the Load column of the original data and of every
# reconstruction, restricted to the same date window and tagged with its label.
# NOTE(review): the date-string slice below relies on the frames having a
# DatetimeIndex; on a plain string index the bounds are compared
# lexicographically. Confirm the index dtype of `raw` and `reconstructed`.
frames_by_method = {"Original": raw} | {
    label: res.reconstructed for label, res in results.items()
}
frames = [
    frame.loc["20100210":"20100218", ["Load"]].assign(Method=label)
    for label, frame in frames_by_method.items()
]

px.line(
    pd.concat(frames).reset_index(names="Time"),
    x="Time",
    y="Load",
    color="Method",
    title="Time Slice Comparison - Load (Feb 10-18)",
)

Validation¶

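Column means should be preserved. Maxoid is the exception: its means differ from the original in the table below. Note that the maxoid, minmax, and distribution runs all pass preserve_column_means=False; of these, only maxoid shows deviating means.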

In [16]:

Copied!





# Column means of the original data side by side with the column means of
# every reconstructed series (one table column per method).
means = pd.DataFrame(
    {"Original": raw.mean()}
    | {name: r.reconstructed.mean() for name, r in results.items()}
)
means

Out[16]:

	Original	Medoid (8)	Maxoid (8)	Mean (20)	Minmax (20)	Distribution (20)
GHI	110.990183	110.990183	119.566553	110.990183	110.990183	110.990183
Load	450.260335	450.260335	418.473753	450.260335	450.260335	450.260335
T	7.790616	7.790616	8.349030	7.790616	7.790616	7.790616
Wind	3.057306	3.057306	4.385788	3.057306	3.057306	3.057306