"""
Copyright 2018 Johns Hopkins University (Author: Jesus Villalba)
Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
Utility functions to evaluate performance
"""
import numpy as np
from ..hyp_defs import float_cpu
def effective_prior(p_tar, c_miss, c_fa):
    """Fold the miss / false-alarm costs into the target prior.

    The effective prior is the prior that, used with unit costs, yields the
    same decisions as ``p_tar`` used with costs ``c_miss`` and ``c_fa``:

        p_eff = p_tar * c_miss / (p_tar * c_miss + (1 - p_tar) * c_fa)

    This is algebraically identical to ``beta / (1 + beta)`` with
    ``beta = p_tar * c_miss / ((1 - p_tar) * c_fa)``, but it avoids the
    intermediate odds, so ``p_tar == 1`` (or ``c_fa == 0``) returns 1 instead
    of dividing by zero.

    Args:
      p_tar: target prior, in [0, 1].
      c_miss: cost of a miss.
      c_fa: cost of a false alarm.

    Returns:
      Effective target prior.
    """
    weighted_tar = p_tar * c_miss
    weighted_non = (1 - p_tar) * c_fa
    return weighted_tar / (weighted_tar + weighted_non)
def pavx(y):
    """PAV: Pool Adjacent Violators algorithm.

    Non-parametric optimization subject to monotonicity: fits a vector
    ``ghat`` with nondecreasing components to the data vector ``y`` such that
    ``sum((y - ghat)**2)`` is minimal.

    This code is an adaptation from the Bosaris Toolkit and is a simplified
    version of the 'IsoMeans.m' code made available by Lutz Duembgen at:
    http://www.imsv.unibe.ch/~duembgen/software

    Args:
      y: 1-D numpy array with the uncalibrated scores (must be non-empty).
         Integer or boolean input is processed in float64 so that pooled
         means are not truncated.

    Returns:
      ghat: calibrated scores, same length as ``y``.
      width: widths of the PAV bins, from left to right
             (the number of bins is data dependent).
      height: corresponding heights of the bins (in increasing order).
    """
    assert isinstance(y, np.ndarray)
    n = len(y)
    assert n > 0

    # Bin means are generally fractional; never accumulate them in an
    # integer/bool array inherited from y.
    work_dtype = y.dtype if np.issubdtype(y.dtype, np.floating) else np.float64

    # Bins are stored as a stack: bin_len[k] is the number of samples in
    # bin k and bin_mean[k] is the mean of the y-values in that bin.
    bin_len = np.zeros(y.shape, dtype=int)
    bin_mean = np.zeros(y.shape, dtype=work_dtype)

    # ci is the position of the bin currently on top of the stack.
    ci = 0
    bin_len[ci] = 1
    bin_mean[ci] = y[0]

    for j in range(1, n):
        # Push a new singleton bin {j}.
        ci = ci + 1
        bin_len[ci] = 1
        bin_mean[ci] = y[j]
        # Pool adjacent violators: merge the top bin into its left
        # neighbour while the means are not strictly increasing.
        while ci >= 1 and bin_mean[ci - 1] >= bin_mean[ci]:
            merged_len = bin_len[ci - 1] + bin_len[ci]
            bin_mean[ci - 1] = bin_mean[ci - 1] + (bin_len[ci] / merged_len) * (
                bin_mean[ci] - bin_mean[ci - 1]
            )
            bin_len[ci - 1] = merged_len
            ci = ci - 1

    height = np.copy(bin_mean[: ci + 1])
    width = bin_len[: ci + 1]

    # Expand the bins back to one value per input sample.
    ghat = np.repeat(height, width)
    return ghat, width, height
def opt_loglr(tar, non, method="laplace"):
    """Non-parametric optimization of the score to log-likelihood-ratio mapping.

    Taken from the Bosaris toolkit.
    Niko Brummer and Johan du Preez, Application-Independent Evaluation of
    Speaker Detection, Computer Speech and Language, 2005.

    The pooled scores are sorted, the 0/1 target labels in sorted order are
    fitted with PAV (``pavx``) to obtain monotonic target posteriors, and the
    posteriors are converted to log-LRs by subtracting the empirical prior
    log-odds ``log(ntar / nnon)``.

    Args:
      tar: target scores.
      non: non-target scores.
      method: "laplace" (default, avoids infinite log-LRs) or "raw".
              Any value other than "laplace" skips the Laplace padding.

    Returns:
      Calibrated target log-LRs (same order as ``tar``).
      Calibrated non-target log-LRs (same order as ``non``).
    """
    ntar = len(tar)
    nnon = len(non)
    n = ntar + nnon

    scores = np.concatenate((tar, non))
    p_ideal = np.zeros((n,), dtype=float_cpu())
    p_ideal[:ntar] = 1

    # Stable sort, so tied scores keep their target-before-non-target order.
    sort_idx = np.argsort(scores, kind="mergesort")
    p_ideal = p_ideal[sort_idx]

    use_laplace = method == "laplace"
    if use_laplace:
        # The extra targets and non-targets at scores of -inf and +inf
        # effectively implement Laplace's rule of succession to avoid
        # log-LRs of infinite magnitude.
        p_ideal = np.concatenate(([1, 0], p_ideal, [1, 0]))

    p_opt, _, _ = pavx(p_ideal)

    if use_laplace:
        # Drop the four padding entries again.
        p_opt = p_opt[2:-2]

    # Posterior to log-LR.
    # This LR is prior-independent in the sense that if we weight the data
    # with a synthetic prior, it makes no difference to the optimizing LR
    # mapping. (A synthetic prior DOES change p_opt: the posterior log-odds
    # changes by an additive term, but this cancels again when converting
    # to log-LR.)
    post_log_odds = np.log(p_opt) - np.log(1 - p_opt)
    prior_log_odds = np.log(ntar / nnon)
    llr_sorted = post_log_odds - prior_log_odds

    # Small increasing ramp over the sorted positions; presumably added to
    # break the ties that PAV creates inside each bin.
    llr_sorted += 1e-6 * np.arange(n) / n

    # Undo the sort. Scatter into a separate buffer instead of assigning an
    # array onto a permutation of itself.
    llr = np.empty_like(llr_sorted)
    llr[sort_idx] = llr_sorted

    tar_llr = llr[:ntar]
    non_llr = llr[ntar:]
    return tar_llr, non_llr