Skip to content

tsam.utils.durationRepresentation

tsam.utils.durationRepresentation

Orders a set of representation values to fit several candidate value sets

durationRepresentation

durationRepresentation(
    candidates,
    clusterOrder,
    distributionPeriodWise,
    timeStepsPerPeriod,
    representMinMax=False,
)

Represents the candidates of a given cluster group (clusterOrder) such that for every attribute the number of time steps is best fit.

Parameters:

Name Type Description Default
candidates ndarray

Dissimilarity matrix where each row represents a candidate

required
clusterOrder array

Integer array in which the index refers to the candidate and the entry is the group (cluster) that candidate belongs to

required
representMinMax bool

Whether the minimum and the maximum of each attribute should be represented in every cluster

False
Source code in src/tsam/utils/durationRepresentation.py
def durationRepresentation(
    candidates,
    clusterOrder,
    distributionPeriodWise,
    timeStepsPerPeriod,
    representMinMax=False,
):
    """
    Represents the candidates of a given cluster group (clusterOrder)
    such that for every attribute the number of time steps is best fit.

    :param candidates: Dissimilarity matrix where each row represents a candidate
    :type candidates: np.ndarray

    :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
    :type clusterOrder: np.array

    :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
    :type representMinMax: bool
    """

    # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
    # the time steps inside the candidates.
    columnTuples = []
    num_attributes = int(candidates.shape[1] / timeStepsPerPeriod)
    for i in range(num_attributes):
        for j in range(timeStepsPerPeriod):
            columnTuples.append((i, j))
    candidates_df = pd.DataFrame(
        candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
    )

    # There are two options for the duration representation. Either, the distribution of each cluster is preserved
    # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
    # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
    if distributionPeriodWise:
        n_attrs = num_attributes

        # Reshape to 3D: (periods, attributes, timesteps)
        candidates_3d = candidates.reshape(-1, n_attrs, timeStepsPerPeriod)

        clusterCenters = []
        for clusterNum in np.unique(clusterOrder):
            indice = np.where(clusterOrder == clusterNum)[0]
            n_cands = len(indice)
            if n_cands == 0:
                continue

            # (n_cands, n_attrs, timesteps) -> (n_attrs, n_cands, timesteps)
            cluster_data = candidates_3d[indice].transpose(1, 0, 2)

            # Sort all values per attribute, then reshape to duration curve
            flat = cluster_data.reshape(n_attrs, -1)
            flat = np.sort(flat, axis=1, kind="stable")
            repr_values = flat.reshape(n_attrs, timeStepsPerPeriod, n_cands).mean(
                axis=2
            )

            if representMinMax:
                repr_values[:, 0] = flat[:, 0]
                repr_values[:, -1] = flat[:, -1]

            # Reorder each attribute's repr_values by its mean profile.
            # Round means before argsort to ensure identical tie-breaking
            # across platforms and numpy versions.
            means = np.round(cluster_data.mean(axis=1), 10)
            order = means.argsort(axis=1, kind="stable")
            rows = np.arange(n_attrs)[:, None]
            final_repr = np.empty_like(repr_values)
            final_repr[rows, order] = repr_values

            clusterCenters.append(final_repr.ravel())

    else:
        clusterCentersList = []
        for a in candidates_df.columns.levels[0]:
            meanVals = []
            clusterLengths = []
            for clusterNum in np.unique(clusterOrder):
                indice = np.where(clusterOrder == clusterNum)
                noCandidates = len(indice[0])
                # get all the values of a certain attribute and cluster
                candidateValues = candidates_df.loc[indice[0], a]
                # calculate centroid of each cluster and append to list
                meanVals.append(np.round(candidateValues.mean(), 10))
                # make a list of weights of each cluster for each time step within the period
                clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
            # concat centroid values and cluster weights for all clusters
            meansAndWeights = pd.concat(
                [
                    pd.DataFrame(np.array(meanVals)).stack(
                        future_stack=True,
                    ),
                    pd.DataFrame(np.array(clusterLengths)).stack(
                        future_stack=True,
                    ),
                ],
                axis=1,
            )
            # sort all values of all clusters according to the centroid values
            meansAndWeightsSorted = meansAndWeights.sort_values(0, kind="stable")
            # save order of the sorted centroid values across all clusters
            order = meansAndWeightsSorted.index
            # sort all values of the original time series
            sortedAttr = (
                candidates_df.loc[:, a]
                .stack(
                    future_stack=True,
                )
                .sort_values(kind="stable")
                .values
            )
            # take mean of sections of the original duration curve according to the cluster and its weight the
            # respective section is assigned to
            representationValues = []
            counter = 0
            for i, j in enumerate(meansAndWeightsSorted[1]):
                representationValues.append(sortedAttr[counter : counter + j].mean())
                counter += j
            # respect max and min of the attributes
            if representMinMax:
                representationValues = _representMinMax(
                    representationValues,
                    sortedAttr,
                    meansAndWeightsSorted,
                    keepSum=True,
                )

            # transform all representation values to a data frame and arrange it
            # according to the order of the sorted
            # centroid values
            representationValues = pd.DataFrame(np.array(representationValues))
            representationValues.index = order
            representationValues.sort_index(inplace=True)
            # append all cluster values attribute-wise to a list
            clusterCentersList.append(representationValues.unstack())
        # rearrange so that rows are the cluster centers and columns are time steps x attributes
        clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))

    return clusterCenters