Clustering Transfer¶
This notebook demonstrates the "cluster once, apply many times" workflow: compute a clustering on one set of columns, then reuse it on other data.
In [1]:
Copied!
# Standard library
import warnings
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

import tsam
from tsam import ClusterConfig, ClusteringResult

# Added to every example notebook: silence the v3 column-order
# FutureWarning in the rendered docs (tsam v4 returns result columns in
# input order; see migration guide).
warnings.filterwarnings(
    action="ignore",
    message=".*sorted alphabetically.*",
    category=FutureWarning,
)

# Use the "notebook_connected" plotly renderer for every figure below.
pio.renderers.default = "notebook_connected"

# Ensure results directory exists
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)

# 4 columns: GHI, T, Wind, Load
raw = pd.read_csv("testdata.csv", index_col=0)
# Standard library
import warnings
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

import tsam
from tsam import ClusterConfig, ClusteringResult

# Added to every example notebook: silence the v3 column-order
# FutureWarning in the rendered docs (tsam v4 returns result columns in
# input order; see migration guide).
warnings.filterwarnings(
    action="ignore",
    message=".*sorted alphabetically.*",
    category=FutureWarning,
)

# Use the "notebook_connected" plotly renderer for every figure below.
pio.renderers.default = "notebook_connected"

# Ensure results directory exists
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)

# 4 columns: GHI, T, Wind, Load
raw = pd.read_csv("testdata.csv", index_col=0)
Two Paths to Aggregate¶
- Path A: Cluster on ALL variables
- Path B: Cluster on WIND only, then apply to all variables
In [2]:
Copied!
def _aggregate_hierarchical(data):
    """Aggregate ``data`` into 8 clusters with the "hierarchical" method.

    Both paths below use exactly these settings; only the input columns differ.
    A fresh ``ClusterConfig`` is built on every call.
    """
    return tsam.aggregate(
        data,
        n_clusters=8,
        cluster=ClusterConfig(method="hierarchical"),
    )


# Path A: Cluster on ALL variables
result_all = _aggregate_hierarchical(raw)

# Path B: Cluster on WIND only, then transfer to all
result_wind = _aggregate_hierarchical(raw[["Wind"]])
result_transferred = result_wind.clustering.apply(raw)
def _aggregate_hierarchical(data):
    """Aggregate ``data`` into 8 clusters with the "hierarchical" method.

    Both paths below use exactly these settings; only the input columns differ.
    A fresh ``ClusterConfig`` is built on every call.
    """
    return tsam.aggregate(
        data,
        n_clusters=8,
        cluster=ClusterConfig(method="hierarchical"),
    )


# Path A: Cluster on ALL variables
result_all = _aggregate_hierarchical(raw)

# Path B: Cluster on WIND only, then transfer to all
result_wind = _aggregate_hierarchical(raw[["Wind"]])
result_transferred = result_wind.clustering.apply(raw)
Comparing Cluster Assignments¶
- Row 1 (all variables) differs from Row 2 (wind only)
- Row 2 and Row 3 are identical - the transfer preserves the clustering!
In [3]:
Copied!
# One labelled entry per clustering run.
assignments_by_run = {
    "Clustered on ALL variables": result_all.cluster_assignments,
    "Clustered on WIND only": result_wind.cluster_assignments,
    "Transferred to all variables": result_transferred.cluster_assignments,
}

# Number the original periods 0..n-1 so the heatmap axis gets a readable name.
n_periods = len(result_all.cluster_assignments)
period_index = pd.RangeIndex(n_periods, name="Original Period")
assignment_table = pd.DataFrame(assignments_by_run, index=period_index)

# Transpose so that every clustering run becomes one heatmap row.
px.imshow(
    assignment_table.T,
    color_continuous_scale="viridis",
    title="Cluster Assignments: Row 2 and 3 are IDENTICAL!",
)
# One labelled entry per clustering run.
assignments_by_run = {
    "Clustered on ALL variables": result_all.cluster_assignments,
    "Clustered on WIND only": result_wind.cluster_assignments,
    "Transferred to all variables": result_transferred.cluster_assignments,
}

# Number the original periods 0..n-1 so the heatmap axis gets a readable name.
n_periods = len(result_all.cluster_assignments)
period_index = pd.RangeIndex(n_periods, name="Original Period")
assignment_table = pd.DataFrame(assignments_by_run, index=period_index)

# Transpose so that every clustering run becomes one heatmap row.
px.imshow(
    assignment_table.T,
    color_continuous_scale="viridis",
    title="Cluster Assignments: Row 2 and 3 are IDENTICAL!",
)
Verifying the Transfer¶
The wind typical periods are identical - but now we also have GHI, T, and Load:
In [4]:
Copied!
# Wind typical periods: wind-only vs transferred
wind_only = result_wind.cluster_representatives["Wind"]
wind_transferred = result_transferred.cluster_representatives["Wind"]

# Exact equality check, plus the largest absolute deviation as a second view.
profiles_match = wind_only.equals(wind_transferred)
max_abs_diff = (wind_only - wind_transferred).abs().max()

print("Wind typical periods IDENTICAL after transfer:", profiles_match)
print(f"Max difference: {max_abs_diff}")
# Wind typical periods: wind-only vs transferred
wind_only = result_wind.cluster_representatives["Wind"]
wind_transferred = result_transferred.cluster_representatives["Wind"]

# Exact equality check, plus the largest absolute deviation as a second view.
profiles_match = wind_only.equals(wind_transferred)
max_abs_diff = (wind_only - wind_transferred).abs().max()

print("Wind typical periods IDENTICAL after transfer:", profiles_match)
print(f"Max difference: {max_abs_diff}")
Wind typical periods IDENTICAL after transfer: True Max difference: 0.0
In [5]:
Copied!
# Verify: cluster assignments are equal
assignments_match = np.array_equal(
    result_wind.cluster_assignments,
    result_transferred.cluster_assignments,
)
print("Cluster assignments equal:", assignments_match)
# Verify: cluster assignments are equal
assignments_match = np.array_equal(
    result_wind.cluster_assignments,
    result_transferred.cluster_assignments,
)
print("Cluster assignments equal:", assignments_match)
Cluster assignments equal: True
In [6]:
Copied!
# But now we have ALL columns!
wind_only_columns = result_wind.cluster_representatives.columns.tolist()
transferred_columns = result_transferred.cluster_representatives.columns.tolist()

print("Wind-only result columns:", wind_only_columns)
print("Transferred result columns:", transferred_columns)
# But now we have ALL columns!
wind_only_columns = result_wind.cluster_representatives.columns.tolist()
transferred_columns = result_transferred.cluster_representatives.columns.tolist()

print("Wind-only result columns:", wind_only_columns)
print("Transferred result columns:", transferred_columns)
Wind-only result columns: ['Wind'] Transferred result columns: ['GHI', 'Load', 'T', 'Wind']
Use Case: Save and Reload Clustering¶
In [7]:
Copied!
# Single place for the file name so the save and the load cannot drift apart.
clustering_file = "clustering.json"

# Save clustering to file
result_wind.clustering.to_json(clustering_file)

# Later: load and apply to any data
clustering = ClusteringResult.from_json(clustering_file)
result_reloaded = clustering.apply(raw)

# The reloaded clustering should reproduce the in-memory transfer exactly.
reloaded_matches = result_transferred.cluster_representatives.equals(
    result_reloaded.cluster_representatives
)
print("Reloaded result identical:", reloaded_matches)
# Single place for the file name so the save and the load cannot drift apart.
clustering_file = "clustering.json"

# Save clustering to file
result_wind.clustering.to_json(clustering_file)

# Later: load and apply to any data
clustering = ClusteringResult.from_json(clustering_file)
result_reloaded = clustering.apply(raw)

# The reloaded clustering should reproduce the in-memory transfer exactly.
reloaded_matches = result_transferred.cluster_representatives.equals(
    result_reloaded.cluster_representatives
)
print("Reloaded result identical:", reloaded_matches)
Reloaded result identical: True
Summary¶
# Cluster on a subset of the columns (here: wind only)
result_wind = tsam.aggregate(raw[["Wind"]], n_clusters=8)
# Apply the wind-based clustering to all variables - wind stays identical!
result_transferred = result_wind.clustering.apply(raw)
# Save the clustering for later reuse
result_wind.clustering.to_json("clustering.json")