Source code for hyperion.metrics.utils

"""
 Copyright 2018 Johns Hopkins University  (Author: Jesus Villalba)
 Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)

 Utility functions to evaluate performance
"""

import numpy as np

from ..hyp_defs import float_cpu


[docs]def effective_prior(p_tar, c_miss, c_fa): """This function adjusts a given prior probability of target p_targ, to incorporate the effects of a cost of miss, cmiss, and a cost of false-alarm, cfa. Args: p_tar: target prior c_miss: cost of miss c_fa: cost of false alarm Returns: Effective prior """ beta = p_tar * c_miss / (1 - p_tar) / c_fa p_eff = beta / (1 + beta) return p_eff
[docs]def pavx(y): """PAV: Pool Adjacent Violators algorithm. Non-paramtetric optimization subject to monotonicity. ghat = pav(y) fits a vector ghat with nondecreasing components to the data vector y such that sum((y - ghat).^2) is minimal. (Pool-adjacent-violators algorithm). Author: This code is and adaptation from Bosaris Toolkit and it is a simplified version of the 'IsoMeans.m' code made available by Lutz Duembgen at: http://www.imsv.unibe.ch/~duembgen/software Args: y: uncalibrated scores Returns: Calibrated scores Width of pav bins, from left to right (the number of bins is data dependent) Height: corresponding heights of bins (in increasing order) """ assert isinstance(y, np.ndarray) n = len(y) assert n > 0 index = np.zeros(y.shape, dtype=int) l = np.zeros(y.shape, dtype=int) # An interval of indices is represented by its left endpoint # ("index") and its length "len" ghat = np.zeros_like(y) ci = 0 index[ci] = 0 l[ci] = 1 ghat[ci] = y[0] # ci is the number of the interval considered currently. # ghat[ci] is the mean of y-values within this interval. for j in range(1, n): # a new index intervall, {j}, is created: ci = ci + 1 index[ci] = j l[ci] = 1 ghat[ci] = y[j] # while ci >= 1 and ghat[np.maximum(ci-1,0)] >= ghat[ci]: while ci >= 1 and ghat[ci - 1] >= ghat[ci]: # "pool adjacent violators": nw = l[ci - 1] + l[ci] ghat[ci - 1] = ghat[ci - 1] + (l[ci] / nw) * (ghat[ci] - ghat[ci - 1]) l[ci - 1] = nw ci = ci - 1 height = np.copy(ghat[: ci + 1]) width = l[: ci + 1] # Now define ghat for all indices: while n >= 1: for j in range(index[ci], n): ghat[j] = ghat[ci] n = index[ci] ci = ci - 1 return ghat, width, height
[docs]def opt_loglr(tar, non, method="laplace"): """Non-parametric optimization of score to log-likelihood-ratio mapping. Taken from Bosaris toolkit. Niko Brummer and Johan du Preez, Application-Independent Evaluation of Speaker Detection, Computer Speech and Language, 2005 Args: tar: target scores. non: non-target scores. method: laplace(default, avoids inf log-LR)/raw Returns: Calibrated tar and non-tar log-LR """ ntar = len(tar) nnon = len(non) n = ntar + nnon scores = np.concatenate((tar, non)) p_ideal = np.zeros((n,), dtype=float_cpu()) p_ideal[:ntar] = 1 sort_idx = np.argsort(scores, kind="mergesort") # print(scores) # print(sort_idx) p_ideal = p_ideal[sort_idx] if method == "laplace": # The extra targets and non-targets at scores of -inf and +inf effectively # implement Laplace's rule of succession to avoid log LRs of infinite magnitudes. p_ideal = np.concatenate(([1, 0], p_ideal, [1, 0])) p_opt, _, _ = pavx(p_ideal) if method == "laplace": p_opt = p_opt[2:-2] # Posterior to loglr # This LR is prior-independent in the sense that if we weight the data with a synthetic prior, # it makes no difference to the optimizing LR mapping. # (A synthetic prior DOES change Popt: The posterior log-odds changes by an additive term. But this # this cancels again when converting to log LR. ) # print(p_opt) post_log_odds = np.log(p_opt) - np.log(1 - p_opt) prior_log_odds = np.log(ntar / nnon) llr = post_log_odds - prior_log_odds llr += 1e-6 * np.arange(n) / n llr[sort_idx] = llr tar_llr = llr[:ntar] non_llr = llr[ntar:] return tar_llr, non_llr