def durationRepresentation(
candidates,
clusterOrder,
distributionPeriodWise,
timeStepsPerPeriod,
representMinMax=False,
):
"""
Represents the candidates of a given cluster group (clusterOrder)
such that for every attribute the number of time steps is best fit.
:param candidates: Dissimilarity matrix where each row represents a candidate
:type candidates: np.ndarray
:param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
:type clusterOrder: np.array
:param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
:type representMinMax: bool
"""
# make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
# the time steps inside the candidates.
columnTuples = []
num_attributes = int(candidates.shape[1] / timeStepsPerPeriod)
for i in range(num_attributes):
for j in range(timeStepsPerPeriod):
columnTuples.append((i, j))
candidates_df = pd.DataFrame(
candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
)
# There are two options for the duration representation. Either, the distribution of each cluster is preserved
# (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
# inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
if distributionPeriodWise:
n_attrs = num_attributes
# Reshape to 3D: (periods, attributes, timesteps)
candidates_3d = candidates.reshape(-1, n_attrs, timeStepsPerPeriod)
clusterCenters = []
for clusterNum in np.unique(clusterOrder):
indice = np.where(clusterOrder == clusterNum)[0]
n_cands = len(indice)
if n_cands == 0:
continue
# (n_cands, n_attrs, timesteps) -> (n_attrs, n_cands, timesteps)
cluster_data = candidates_3d[indice].transpose(1, 0, 2)
# Sort all values per attribute, then reshape to duration curve
flat = cluster_data.reshape(n_attrs, -1)
flat = np.sort(flat, axis=1, kind="stable")
repr_values = flat.reshape(n_attrs, timeStepsPerPeriod, n_cands).mean(
axis=2
)
if representMinMax:
repr_values[:, 0] = flat[:, 0]
repr_values[:, -1] = flat[:, -1]
# Reorder each attribute's repr_values by its mean profile.
# Round means before argsort to ensure identical tie-breaking
# across platforms and numpy versions.
means = np.round(cluster_data.mean(axis=1), 10)
order = means.argsort(axis=1, kind="stable")
rows = np.arange(n_attrs)[:, None]
final_repr = np.empty_like(repr_values)
final_repr[rows, order] = repr_values
clusterCenters.append(final_repr.ravel())
else:
clusterCentersList = []
for a in candidates_df.columns.levels[0]:
meanVals = []
clusterLengths = []
for clusterNum in np.unique(clusterOrder):
indice = np.where(clusterOrder == clusterNum)
noCandidates = len(indice[0])
# get all the values of a certain attribute and cluster
candidateValues = candidates_df.loc[indice[0], a]
# calculate centroid of each cluster and append to list
meanVals.append(np.round(candidateValues.mean(), 10))
# make a list of weights of each cluster for each time step within the period
clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
# concat centroid values and cluster weights for all clusters
meansAndWeights = pd.concat(
[
pd.DataFrame(np.array(meanVals)).stack(
future_stack=True,
),
pd.DataFrame(np.array(clusterLengths)).stack(
future_stack=True,
),
],
axis=1,
)
# sort all values of all clusters according to the centroid values
meansAndWeightsSorted = meansAndWeights.sort_values(0, kind="stable")
# save order of the sorted centroid values across all clusters
order = meansAndWeightsSorted.index
# sort all values of the original time series
sortedAttr = (
candidates_df.loc[:, a]
.stack(
future_stack=True,
)
.sort_values(kind="stable")
.values
)
# take mean of sections of the original duration curve according to the cluster and its weight the
# respective section is assigned to
representationValues = []
counter = 0
for i, j in enumerate(meansAndWeightsSorted[1]):
representationValues.append(sortedAttr[counter : counter + j].mean())
counter += j
# respect max and min of the attributes
if representMinMax:
representationValues = _representMinMax(
representationValues,
sortedAttr,
meansAndWeightsSorted,
keepSum=True,
)
# transform all representation values to a data frame and arrange it
# according to the order of the sorted
# centroid values
representationValues = pd.DataFrame(np.array(representationValues))
representationValues.index = order
representationValues.sort_index(inplace=True)
# append all cluster values attribute-wise to a list
clusterCentersList.append(representationValues.unstack())
# rearrange so that rows are the cluster centers and columns are time steps x attributes
clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))
return clusterCenters