Source code for k_maxoids
# -*- coding: utf-8 -*-
"""Exact K-maxoids clustering"""
import numpy as np
import numpy.random as rnd
from sklearn.base import BaseEstimator, ClusterMixin, TransformerMixin
from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
from sklearn.utils import check_array
[docs]class KMaxoids(BaseEstimator, ClusterMixin, TransformerMixin):
"""
k-maxoids class.
:param n_clusters: How many maxoids. Must be positive. optional, default: 8
:type n_clusters: integer
:param distance_metric: What distance metric to use. optional, default: 'euclidean'
:type distance_metric: string
"""
def __init__(
self,
n_clusters=8,
distance_metric="euclidean",
):
self.n_clusters = n_clusters
self.distance_metric = distance_metric
def _check_init_args(self):
# Check n_clusters
if (
self.n_clusters is None
or self.n_clusters <= 0
or not isinstance(self.n_clusters, int)
):
raise ValueError("n_clusters has to be nonnegative integer")
# Check distance_metric
if callable(self.distance_metric):
self.distance_func = self.distance_metric
elif self.distance_metric in PAIRWISE_DISTANCE_FUNCTIONS:
self.distance_func = PAIRWISE_DISTANCE_FUNCTIONS[self.distance_metric]
else:
raise ValueError(
"distance_metric needs to be "
+ "callable or one of the "
+ "following strings: "
+ "{}".format(PAIRWISE_DISTANCE_FUNCTIONS.keys())
+ ". Instead, '{}' ".format(self.distance_metric)
+ "was given."
)
[docs] def fit(self, X, y=None):
"""Fit K-Maxoids to the provided data.
:param X: shape=(n_samples, n_features)
:type X: array-like or sparse matrix
:returns: self
"""
self._check_init_args()
# check that the array is good and attempt to convert it to
# Numpy array if possible
X = self._check_array(X)
# apply distance metric to get the distance matrix
D = self.distance_func(X)
# run mk-maxoids clustering
self.cluster_centers_, self.labels_ = self.k_maxoids(X, self.n_clusters)
return self
def _check_array(self, X):
X = check_array(X)
# Check that the number of clusters is less than or equal to
# the number of samples
if self.n_clusters > X.shape[0]:
raise ValueError(
"The number of medoids "
+ "({}) ".format(self.n_clusters)
+ "must be larger than the number "
+ "of samples ({})".format(X.shape[0])
)
return X
def k_maxoids(self, X, k, numpasses=5, doLogarithmic=False, n_init=100):
X_old = X
n, m = X.shape
inertiaTempPrime = None
for i in range(n_init):
inds = rnd.permutation(np.arange(n))
X = X[inds]
M = np.copy(X[:k])
for t in range(numpasses):
for j in range(n):
x = X[j]
D = np.sum((M - x) ** 2, axis=1)
i = np.argmin(D)
d = np.sum((M - M[i]) ** 2, axis=1)
if doLogarithmic:
D[i] = 1.0
d[i] = 1.0
valx = np.prod(D)
valm = np.prod(d)
else:
D[i] = 0.0
d[i] = 0.0
valx = np.sum(D)
valm = np.sum(d)
if valx > valm:
M[i] = x
dTemp = self.distance_func(X_old, Y=list(M))
inertiaTemp = np.sum(np.min(dTemp, axis=1))
if inertiaTempPrime is None:
mFinal = M
inertiaTempPrime = inertiaTemp
else:
if inertiaTemp < inertiaTempPrime:
mFinal = M
inertiaTempPrime = inertiaTemp
D = self.distance_func(X_old, Y=list(mFinal))
I = np.argmin(D, axis=1)
return list(mFinal), I