tsam.utils.durationRepresentation¶

tsam.utils.durationRepresentation ¶

Orders a set of representation values to fit several candidate value sets

durationRepresentation ¶

durationRepresentation(
    candidates,
    clusterOrder,
    distributionPeriodWise,
    timeStepsPerPeriod,
    representMinMax=False,
)

Represents the candidates of a given cluster group (clusterOrder) such that for every attribute the number of time steps is best fit.

Parameters:

Name	Type	Description	Default
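`candidates`	`ndarray`	Dissimilarity matrix where each row represents a candidate	required
`clusterOrder`	`array`	Integer array where the index refers to the candidate and the integer entry to the group	required
`distributionPeriodWise`	`bool`	If true, the value distribution of each cluster is preserved; otherwise only the distribution of the total time series is preserved	required
`timeStepsPerPeriod`	`int`	Number of time steps each attribute occupies within one candidate row	required
`representMinMax`	`bool`	If in every cluster the minimum and the maximum of the attribute should be represented	`False`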

Source code in src/tsam/utils/durationRepresentation.py

def durationRepresentation(
    candidates,
    clusterOrder,
    distributionPeriodWise,
    timeStepsPerPeriod,
    representMinMax=False,
):
    """
    Represents the candidates of a given cluster group (clusterOrder)
    such that for every attribute the number of time steps is best fit.

    :param candidates: Dissimilarity matrix where each row represents a candidate
    :type candidates: np.ndarray

    :param clusterOrder: Integer array where the index refers to the candidate and the Integer entry to the group
    :type clusterOrder: np.array

    :param representMinMax: If in every cluster the minimum and the maximum of the attribute should be represented
    :type representMinMax: bool
    """

    # make pd.DataFrame each row represents a candidate, and the columns are defined by two levels: the attributes and
    # the time steps inside the candidates.
    columnTuples = []
    num_attributes = int(candidates.shape[1] / timeStepsPerPeriod)
    for i in range(num_attributes):
        for j in range(timeStepsPerPeriod):
            columnTuples.append((i, j))
    candidates_df = pd.DataFrame(
        candidates, columns=pd.MultiIndex.from_tuples(columnTuples)
    )

    # There are two options for the duration representation. Either, the distribution of each cluster is preserved
    # (periodWise = True) or the distribution of the total time series is preserved only. In the latter case, the
    # inner-cluster variance is smaller and the variance across the typical periods' mean values is higher
    if distributionPeriodWise:
        n_attrs = num_attributes

        # Reshape to 3D: (periods, attributes, timesteps)
        candidates_3d = candidates.reshape(-1, n_attrs, timeStepsPerPeriod)

        clusterCenters = []
        for clusterNum in np.unique(clusterOrder):
            indice = np.where(clusterOrder == clusterNum)[0]
            n_cands = len(indice)
            if n_cands == 0:
                continue

            # (n_cands, n_attrs, timesteps) -> (n_attrs, n_cands, timesteps)
            cluster_data = candidates_3d[indice].transpose(1, 0, 2)

            # Sort all values per attribute, then reshape to duration curve
            flat = cluster_data.reshape(n_attrs, -1)
            flat = np.sort(flat, axis=1, kind="stable")
            repr_values = flat.reshape(n_attrs, timeStepsPerPeriod, n_cands).mean(
                axis=2
            )

            if representMinMax:
                repr_values[:, 0] = flat[:, 0]
                repr_values[:, -1] = flat[:, -1]

            # Reorder each attribute's repr_values by its mean profile.
            # Round means before argsort to ensure identical tie-breaking
            # across platforms and numpy versions.
            means = np.round(cluster_data.mean(axis=1), 10)
            order = means.argsort(axis=1, kind="stable")
            rows = np.arange(n_attrs)[:, None]
            final_repr = np.empty_like(repr_values)
            final_repr[rows, order] = repr_values

            clusterCenters.append(final_repr.ravel())

    else:
        clusterCentersList = []
        for a in candidates_df.columns.levels[0]:
            meanVals = []
            clusterLengths = []
            for clusterNum in np.unique(clusterOrder):
                indice = np.where(clusterOrder == clusterNum)
                noCandidates = len(indice[0])
                # get all the values of a certain attribute and cluster
                candidateValues = candidates_df.loc[indice[0], a]
                # calculate centroid of each cluster and append to list
                meanVals.append(np.round(candidateValues.mean(), 10))
                # make a list of weights of each cluster for each time step within the period
                clusterLengths.append(np.repeat(noCandidates, timeStepsPerPeriod))
            # concat centroid values and cluster weights for all clusters
            meansAndWeights = pd.concat(
                [
                    pd.DataFrame(np.array(meanVals)).stack(
                        future_stack=True,
                    ),
                    pd.DataFrame(np.array(clusterLengths)).stack(
                        future_stack=True,
                    ),
                ],
                axis=1,
            )
            # sort all values of all clusters according to the centroid values
            meansAndWeightsSorted = meansAndWeights.sort_values(0, kind="stable")
            # save order of the sorted centroid values across all clusters
            order = meansAndWeightsSorted.index
            # sort all values of the original time series
            sortedAttr = (
                candidates_df.loc[:, a]
                .stack(
                    future_stack=True,
                )
                .sort_values(kind="stable")
                .values
            )
            # take mean of sections of the original duration curve according to the cluster and its weight the
            # respective section is assigned to
            representationValues = []
            counter = 0
            for i, j in enumerate(meansAndWeightsSorted[1]):
                representationValues.append(sortedAttr[counter : counter + j].mean())
                counter += j
            # respect max and min of the attributes
            if representMinMax:
                representationValues = _representMinMax(
                    representationValues,
                    sortedAttr,
                    meansAndWeightsSorted,
                    keepSum=True,
                )

            # transform all representation values to a data frame and arrange it
            # according to the order of the sorted
            # centroid values
            representationValues = pd.DataFrame(np.array(representationValues))
            representationValues.index = order
            representationValues.sort_index(inplace=True)
            # append all cluster values attribute-wise to a list
            clusterCentersList.append(representationValues.unstack())
        # rearrange so that rows are the cluster centers and columns are time steps x attributes
        clusterCenters = np.array(pd.concat(clusterCentersList, axis=1))

    return clusterCenters