Source code for hyperion.clustering.ahc

"""
 Copyright 2018 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
"""

import numpy as np
import h5py
from copy import copy

from scipy.cluster.hierarchy import linkage
from sklearn.metrics import homogeneity_score, completeness_score

from ..hyp_defs import float_cpu
from ..hyp_model import HypModel


class AHC(HypModel):
    """Agglomerative hierarchical clustering driven by a pairwise score matrix.

    The dendrogram is built with ``scipy.cluster.hierarchy.linkage`` and
    stored in ``self.Z``. Flat partitions are then read off the dendrogram
    either by a score/distance threshold or by a target number of clusters.

    Attributes:
      method: linkage method forwarded to ``linkage`` (e.g. "average").
      metric: "llr" and "prob" mean the input holds similarity scores that
        are converted to distances before linkage; any other value is
        forwarded to ``linkage`` as its ``metric`` argument.
      Z: linkage matrix produced by ``fit`` (None until ``fit`` is called).
        For "llr" and "prob" its third column is mapped back to the score
        domain.
      flat_clusters: optional (N, N) cache filled by
        ``compute_flat_clusters``; row p holds the labels after p merges.
    """

    def __init__(self, method="average", metric="llr", **kwargs):
        super().__init__(**kwargs)
        self.method = method
        self.metric = metric
        self.Z = None
        self.flat_clusters = None

    def fit(self, x, mask=None):
        """Builds the dendrogram from a square pairwise matrix.

        Only the strict upper triangle of ``x`` is used.

        Args:
          x: square matrix of pairwise scores (or distances when
            ``metric`` is neither "llr" nor "prob").
          mask: optional array with the same shape as ``x``; entries where
            the mask is false are overwritten with -1e10 in a copy of
            ``x`` before clustering.
        """
        if mask is not None:
            # Work on a copy so the caller's matrix is left untouched.
            x = copy(x)
            # NOTE(review): -1e10 is a "never merge" value for similarity
            # scores; for plain distance metrics it would act as a very
            # small distance -- confirm masks are only used with llr/prob.
            x[np.logical_not(mask)] = -1e10

        # Row-major strict upper triangle, i.e. the condensed layout that
        # linkage expects.
        idx = np.triu(np.ones_like(x, dtype=bool), k=1)
        scores = x[idx]

        if self.metric == "llr":
            # Turn scores into non-negative distances, cluster, and then
            # map the merge heights back to the score domain.
            max_score = np.max(scores)
            Z = linkage(max_score - scores, method=self.method)
            Z[:, 2] = max_score - Z[:, 2]
        elif self.metric == "prob":
            Z = linkage(1 - scores, method=self.method)
            Z[:, 2] = 1 - Z[:, 2]
        else:
            Z = linkage(scores, method=self.method, metric=self.metric)

        self.Z = Z
        # Any cached partitions belong to the previous dendrogram.
        self.flat_clusters = None

    def get_flat_clusters(self, t, criterion="threshold"):
        """Returns flat cluster labels for a threshold or a cluster count.

        Args:
          t: threshold when ``criterion == "threshold"``, otherwise the
            desired number of clusters.
          criterion: "threshold" selects thresholding; any other value
            selects a fixed number of clusters.

        Returns:
          Integer array with one cluster label per clustered item.
        """
        if criterion == "threshold":
            return self.get_flat_clusters_from_thr(t)

        return self.get_flat_clusters_from_num_clusters(t)

    def get_flat_clusters_from_num_clusters(self, num_clusters):
        """Returns the partition obtained after merging down to
        ``num_clusters`` clusters.

        Args:
          num_clusters: desired number of clusters; values outside
            [1, N] are clamped to that range.

        Returns:
          Integer array of length N with labels in 0..num_clusters-1.
        """
        N = self.Z.shape[0] + 1
        # Clamp on both sides: more than N clusters is impossible, and
        # fewer than 1 would index past the last merge.
        num_clusters = max(1, min(N, num_clusters))
        # Number of merges to apply.
        p_idx = N - num_clusters
        if self.flat_clusters is not None:
            return self.flat_clusters[p_idx]

        flat_clusters = np.arange(N, dtype=int)
        for i in range(p_idx):
            # The i-th merge joins clusters Z[i, 0] and Z[i, 1] into a new
            # cluster whose id is N + i.
            segm_idx = np.logical_or(
                flat_clusters == self.Z[i, 0], flat_clusters == self.Z[i, 1]
            )
            flat_clusters[segm_idx] = N + i

        # Re-map the surviving ids to consecutive labels starting at 0.
        _, flat_clusters = np.unique(flat_clusters, return_inverse=True)
        return flat_clusters

    def get_flat_clusters_from_thr(self, thr):
        """Returns the partition obtained by thresholding the merge heights.

        For "llr" and "prob" a merge is applied when its score is >= thr;
        for other metrics when its distance is <= thr.

        Args:
          thr: threshold on the third column of ``self.Z``.

        Returns:
          Integer array with one cluster label per clustered item.
        """
        if self.metric == "llr" or self.metric == "prob":
            idx = self.Z[:, 2] >= thr
        else:
            idx = self.Z[:, 2] <= thr

        num_clusters = self.Z.shape[0] + 1 - np.sum(idx)
        return self.get_flat_clusters_from_num_clusters(num_clusters)

    def compute_flat_clusters(self):
        """Precomputes the partition after every merge and caches it.

        Fills ``self.flat_clusters`` with an (N, N) integer table whose
        row p contains the labels (0..N-p-1) after applying the first p
        merges of ``self.Z``.
        """
        N = self.Z.shape[0] + 1
        flat_clusters = np.zeros((N, N), dtype=int)
        flat_clusters[0] = np.arange(N, dtype=int)
        for i in range(N - 1):
            # Start from the previous partition and apply the i-th merge.
            flat_clusters[i + 1] = flat_clusters[i]
            segm_idx = np.logical_or(
                flat_clusters[i] == self.Z[i, 0], flat_clusters[i] == self.Z[i, 1]
            )
            flat_clusters[i + 1][segm_idx] = N + i

        # Relabel each row independently to consecutive ids; row 0 is
        # already 0..N-1.
        for i in range(1, N):
            _, flat_clusters[i] = np.unique(flat_clusters[i], return_inverse=True)

        self.flat_clusters = flat_clusters

    def evaluate_impurity_det(self, labels_true):
        """Evaluates cluster and class impurity for every merge step.

        Homogeneity: each cluster contains only members of a single class
        (cluster purity). Completeness: all members of a given class are
        assigned to the same cluster (class purity).

        Args:
          labels_true: reference class label for each clustered item.

        Returns:
          Tuple ``(1 - homogeneity, 1 - completeness)``, each an array
          with one value per row of ``self.flat_clusters``.
        """
        if self.flat_clusters is None:
            self.compute_flat_clusters()

        N = self.flat_clusters.shape[0]
        h = np.zeros((N,), dtype=float_cpu())
        c = np.zeros((N,), dtype=float_cpu())
        for i, labels_pred in enumerate(self.flat_clusters):
            h[i] = homogeneity_score(labels_true, labels_pred)
            c[i] = completeness_score(labels_true, labels_pred)

        return 1 - h, 1 - c