Interactive Example - Upper Limit#
In this example we will generate a toy dataset with a small gaussian signal over an exponential background and use it to compute an expected and observed upper limit on the signal fraction. These operations are performed in the widgets in the last cell, which allows to tune the following parameters:
n_sig
: number of signal events in the toy datasetn_bkg
: number of background events in the toy datasetlikelihood
: type of likelihood to use, i.e. unbinned or binned
In what follows we summarize the procedure developed in generate_compute_plot
to compute upper limits for observed and expected data. This will be useful to understand the information displayed in the so-called “brazilian bands” plot.
First of all, the toy dataset that simulates our observation is produced.
Using the profile likelihood ratio
and the test statistics
we first define an array of test values pois_null
for our POI (the signal fraction) and compute \(CL_{s+b}\). Note that \(CL_{s+b}\) is obtained using \(p_\mu = 1-F(q_\mu|\mu') = 1 - \Phi(\sqrt{q_\mu})\).
Generate an Asimov dataset.
Compute \(CL_{b}\) and \(CL_s\) for each value of the POI we are testing.
Compute the median, \(1 \sigma\) and \(2 \sigma\) expected \(CL_s\) values for the array of POIs we are testing.
Interpolate in order to find the value of the POI for which each of the above mentioned functions gets a value of 0.05 (i.e. the 95% CL).
import argparse
import numpy as np
import matplotlib.pyplot as plt
from scipy.integrate import quad
from scipy.stats import expon
from scipy.optimize import minimize
from scipy.stats import norm
from scipy import interpolate
from ipywidgets import interactive
import ipywidgets as widgets
import json
gbounds = (0.1, 3.0)
gmu = 1.2
gsigma = 0.1
class Likelihood:
def __init__(self, function, data):
self.function = function
self.data = data
def __call__(self, params):
return np.prod(self.function(self.data, *params))
class NLL(Likelihood):
def __call__(self, params):
return -np.sum([np.log(self.function(self.data, *params))])
class BinnedLikelihood(Likelihood):
def __init__(self, function, hist, edges):
self.function = function
self.hist = hist
self.edges = edges
def __call__(self, params):
integration_limits = []
inf_limit = self.edges[0]
for sup_limit in self.edges[1:]:
integration_limits.append((inf_limit, sup_limit))
inf_limit = sup_limit
n_tot = self.hist.sum()
nu_is = np.array([n_tot * quad(lambda x: self.function(x, *params), inf_lim, sup_lim)[0] for inf_lim, sup_lim in integration_limits])
return - np.sum(self.hist * np.log(nu_is))
def model(x, f_sig, tau):
#return f_sig*(1/(gsigma * np.sqrt(2 * np.pi)) * np.exp(-(x - gmu)**2 / (2 * gsigma**2))) + (1-f_sig)*((1/tau) * np.exp(-(x / tau)))
return f_sig * norm(gmu, gsigma).pdf(x) + (1 - f_sig) * expon(scale=tau).pdf(x)
def q_mu(nll1, nll2, poi1, poi2):
q = 2 * (nll1 - nll2)
zeros = np.zeros(q.shape)
condition = (poi2 > poi1) | (q < 0)
return np.where(condition, zeros, q)
def p_mu(q_mu, nsigma=0):
return 1 - norm.cdf(np.sqrt(q_mu) - nsigma)
def p_alt(q_obs, q_alt):
sqrtqobs = np.sqrt(q_obs)
sqrtqalt = np.sqrt(q_alt)
return 1.0 - norm.cdf(sqrtqobs - sqrtqalt)
def generate_asimov_hist(model, params, bounds, nbins=100):
bin_edges = np.linspace(bounds[0], bounds[1], nbins + 1)
bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])
integration_limits = []
inf_limit = bin_edges[0]
for sup_limit in bin_edges[1:]:
integration_limits.append((inf_limit, sup_limit))
inf_limit = sup_limit
hist = np.array([quad(lambda x: model(x, *params), inf_lim, sup_lim)[0] for inf_lim, sup_lim in integration_limits])
return hist, bin_edges, bin_centers
def generate_pseudo_asimov_dataset(model, params, bounds, nevs=1000000, nbins=100):
hist, bin_edges, bin_centers = generate_asimov_hist(model, params, bounds, nbins)
edges_pairs = list(zip(bin_edges[:-1], bin_edges[1:]))
return np.concatenate([np.random.uniform(ep[0], ep[1], int(np.round(nevs*h))) for ep, h in zip(edges_pairs, hist)])
def plot_data_and_model(ax, data, model, res, x_range, nbins=50):
ax.hist(data, bins=nbins, density=True, label="data")
ax.plot(x_range, model(x_range, *res.x), 'r-', label="fit")
ax.set_xlabel("x")
ax.set_ylabel("f(x)")
ax.set_xlim(0.1, 3.0)
ax.legend()
return ax
def plot_all(ax, x, clsb, clb, cls, exp_cls, lk):
ax.fill_between(x, exp_cls[-2], exp_cls[2], color='y', label="$CL_s$ expected 68%")
ax.fill_between(x, exp_cls[-1], exp_cls[1], color='g', label="$CL_s$ expected 95%")
ax.plot(x, exp_cls[0], 'k--', label="$CL_s$ expected median")
ax.plot(x, clsb, 'k-')
ax.plot(x, clsb, 'bo', label="$CL_{s+b}$")
ax.plot(x, clb, 'k-')
ax.plot(x, clb, 'ko', label="$CL_b$")
ax.plot(x, cls, 'k-')
ax.plot(x, cls, 'ro', label="$CL_s$")
ax.hlines(y=0.05, xmin=x[0], xmax=x[-1], color='r', linestyle='-')
ax.set_ylabel("p-value")
ax.set_xlabel("POI")
ax.legend(loc="upper right")
return ax
def generate_compute_plot(n_sig, n_bkg, likelihood):
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 8))
axes[0].clear()
axes[1].clear()
bounds = (0.1, 3.0)
bnds = ((0.001, 1.0), (0.03, 1.0)) #to feed the minimizer
# Generate dataset (with very small signal)
bkg = np.random.exponential(0.5, n_bkg)
peak = np.random.normal(gmu, gsigma, n_sig)
data = np.concatenate((bkg, peak))
data = data[(data > bounds[0]) & (data < bounds[1])]
if likelihood == "binned":
h, e = np.histogram(data, bins=100, range=bounds)
lk = BinnedLikelihood(model, h, e)
else:
lk = NLL(model, data)
x0 = [n_sig / (n_bkg + n_bkg), 0.5]
global_res = minimize(fun=lk, x0=x0, method='Powell', bounds=bnds, tol=1e-6)
axes[0] = plot_data_and_model(axes[0], data, model, global_res, np.linspace(bounds[0], bounds[1], 1000), 40)
# compute observed quantities
pois_null = np.linspace(0.001, 0.2, 30)
pois_best = np.ones(pois_null.shape) * global_res.x[0]
nll_best = np.ones(pois_best.shape) * global_res.fun
nll_null = []
for pn in pois_null:
def to_minimize(params):
return lk([pn, *params])
x0 = global_res.x[1:]
res = minimize(fun=to_minimize, x0=x0, method='Powell', bounds=bnds, tol=1e-6)
nll_null.append(res.fun)
qobs = q_mu(nll_null, nll_best, pois_null, pois_best)
pobs = p_mu(qobs)
# Asimov
bkg_only_pars = [0.0] + list(global_res.x[1:])
if likelihood == "binned":
asimov_hist, asimov_edges, asimov_centers = generate_asimov_hist(model, bkg_only_pars, bounds, nbins=100)
asimov_hist *= (n_sig + n_bkg)
lk_asimov = BinnedLikelihood(model, asimov_hist, asimov_edges)
else:
asimov_dataset = generate_pseudo_asimov_dataset(model, bkg_only_pars, bounds, nevs=n_sig + n_bkg)
lk_asimov = NLL(model, asimov_dataset)
def to_minimize(params):
return lk_asimov([0.0, *params])
x0 = global_res.x[1:]
global_res_asimov = minimize(fun=to_minimize, x0=x0, method='Powell', bounds=bnds[1:], tol=1e-6)
pois_alt = np.zeros(pois_null.shape)
nll_best_asimov = np.ones(pois_best.shape) * global_res_asimov.fun
nll_null_asimov = []
for pn in pois_null:
def to_minimize_loc(params):
return lk_asimov([pn, *params])
x0_loc = global_res_asimov.x
res = minimize(fun=to_minimize_loc, x0=x0_loc, method='Powell', bounds=bnds[1:], tol=1e-6)
nll_null_asimov.append(res.fun)
q_asimov = q_mu(nll_null_asimov, nll_best_asimov, pois_null, pois_alt)
p_asimov = p_alt(qobs, q_asimov)
cls = pobs / p_asimov
# Expected
exp_clsb = {}
exp_clb = {}
exp_cls = {}
sigmas = [0, 1, 2, -1, -2]
for sigma in sigmas:
exp_clsb[sigma] = p_mu(q_asimov, sigma)
exp_clb[sigma] = np.ones(exp_clsb[sigma].shape) * norm.cdf(sigma)
exp_cls[sigma] = exp_clsb[sigma] / exp_clb[sigma]
axes[1] = plot_all(axes[1], pois_null, pobs, p_asimov, cls, exp_cls, likelihood)
# Find upper limit
interpolated_funcs = {}
upper_limits = {}
interpolated_funcs["obs"] = interpolate.interp1d(pois_null, cls, kind="cubic")
for sigma in sigmas:
interpolated_funcs["exp_{}".format(str(sigma).replace("-", "m"))] = interpolate.interp1d(pois_null, exp_cls[sigma], kind="cubic")
more_pois_null = np.linspace(0.001, 0.2, 10000)
line = np.ones(more_pois_null.shape) * 0.05
for name, func in interpolated_funcs.items():
interpolated_line = func(more_pois_null)
idx = np.argwhere(np.diff(np.sign(interpolated_line - line))).flatten()
upper_limits[name] = more_pois_null[idx]
print("upper_limits: ", json.dumps({k: list(l) for k, l in upper_limits.items()}, indent=4))
slider_plot = interactive(
generate_compute_plot,
n_sig=widgets.IntSlider(min=1, max=50, step=1, value=10),
n_bkg=widgets.IntSlider(min=250, max=500, step=1, value=300),
likelihood=widgets.SelectMultiple(options=["unbinned", "binned"], value=("unbinned", ))
)
display(slider_plot)