Clustering Transfer¶

This notebook demonstrates the "cluster once, apply many times" workflow: compute a clustering on one set of columns, then reuse that same clustering on other data.

In [1]:

Copied!





# Notebook setup: imports, warning filter, plot renderer, output directory,
# and the input data used by every later cell.
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio

import tsam
from tsam import ClusterConfig, ClusteringResult

# Added to every example notebook: silence the v3 column-order
# FutureWarning in the rendered docs (tsam v4 returns result columns in
# input order; see migration guide).
warnings.filterwarnings(
    "ignore", category=FutureWarning, message=".*sorted alphabetically.*"
)

pio.renderers.default = "notebook_connected"

# Ensure results directory exists
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)

raw = pd.read_csv("testdata.csv", index_col=0)  # 4 columns: GHI, T, Wind, Load

Two Paths to Aggregate¶

Path A: Cluster on ALL variables.

Path B: Cluster on WIND only, then apply that clustering to all variables.

In [2]:

Copied!





# Path A: Cluster on ALL variables
result_all = tsam.aggregate(
    raw, n_clusters=8, cluster=ClusterConfig(method="hierarchical")
)

# Path B: Cluster on WIND only (same method and cluster count as Path A),
# then transfer that clustering to the full dataset.
result_wind = tsam.aggregate(
    raw[["Wind"]], n_clusters=8, cluster=ClusterConfig(method="hierarchical")
)
result_transferred = result_wind.clustering.apply(raw)

Comparing Cluster Assignments¶

Row 1 (all variables) differs from Row 2 (wind only)
Row 2 and Row 3 are identical - the transfer preserves the clustering!

In [3]:

Copied!





# One column per clustering run, one row per original period.
assignment_comparison = pd.DataFrame(
    {
        "Clustered on ALL variables": result_all.cluster_assignments,
        "Clustered on WIND only": result_wind.cluster_assignments,
        "Transferred to all variables": result_transferred.cluster_assignments,
    },
    index=pd.RangeIndex(
        start=0, stop=len(result_all.cluster_assignments), name="Original Period"
    ),
)

# Transpose so each clustering run becomes one heatmap row. The figure is the
# cell's last expression, so the notebook displays it.
px.imshow(
    assignment_comparison.T,
    color_continuous_scale="viridis",
    title="Cluster Assignments: Row 2 and 3 are IDENTICAL!",
)

Verifying the Transfer¶

The wind typical periods are identical - but now we also have GHI, T, and Load:

In [4]:

Copied!





# Wind typical periods: wind-only vs transferred
wind_only = result_wind.cluster_representatives["Wind"]
wind_transferred = result_transferred.cluster_representatives["Wind"]

# Report both an exact equality check and the largest absolute deviation.
print(
    "Wind typical periods IDENTICAL after transfer:", wind_only.equals(wind_transferred)
)
print(f"Max difference: {(wind_only - wind_transferred).abs().max()}")

Wind typical periods IDENTICAL after transfer: True
Max difference: 0.0

In [5]:

Copied!





# Verify: the transferred result carries the same cluster assignments as the
# wind-only clustering it was derived from.
print(
    "Cluster assignments equal:",
    np.array_equal(
        result_wind.cluster_assignments, result_transferred.cluster_assignments
    ),
)

Cluster assignments equal: True

In [6]:

Copied!





# But now we have ALL columns: compare the representative columns of the
# wind-only result with those of the transferred result.
print("Wind-only result columns:", result_wind.cluster_representatives.columns.tolist())
print(
    "Transferred result columns:",
    result_transferred.cluster_representatives.columns.tolist(),
)

Wind-only result columns: ['Wind']
Transferred result columns: ['GHI', 'Load', 'T', 'Wind']

Use Case: Save and Reload Clustering¶

In [7]:

Copied!





# Save clustering to file
result_wind.clustering.to_json("clustering.json")

# Later: load and apply to any data
clustering = ClusteringResult.from_json("clustering.json")
result_reloaded = clustering.apply(raw)

# The reloaded clustering should reproduce the in-memory transfer.
print(
    "Reloaded result identical:",
    result_transferred.cluster_representatives.equals(
        result_reloaded.cluster_representatives
    ),
)

Reloaded result identical: True

Summary¶

# Cluster on subset
result_wind = tsam.aggregate(data[["Wind"]], n_clusters=8)

# Apply to all variables - wind stays identical!
# (Named result_transferred, not result_all: in this notebook result_all is
# the result of clustering on ALL variables.)
result_transferred = result_wind.clustering.apply(data)

# Save for later
result_wind.clustering.to_json("clustering.json")