Motivation¶

import itertools
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

%matplotlib inline
plt.rcParams["figure.figsize"] = [5, 5]

Data preparation¶

def getMockData(x_range, y_range):
    """The additional exponentiation of the counts is to still have the nice
    shapes even though we need to take the logarithm for the actual data
    to display nicely.
    """
    gauss = lambda x, mu, sigma: np.exp(-(x-mu)**2/2/sigma**2)

    # In Z->nunu sample.
    # sig1 = lambda x: gauss(x, 0.75, .1)
    # bkg1 = lambda x: .04 - .02*x + .5*gauss(x, .4, .2)  + gauss(x, 0, .07) 
    sig1 = lambda x: 1 / ((1.05 - x )**2) #+ 100 * np.exp(gauss(x, 1.0, .1))
    tmp1 = lambda x: 1 / ((0.05 + x ) * (1.01 - x )) + np.exp(2*x)
    bkg1 = lambda x: tmp1(x) + 1e-4  #- tmp1(0)*x/(0.05 + x )

    # In Z->e+e- sample.
    # sig_y = lambda y: gauss(y, .75, .07)
    sig_y = lambda x: 1
    sig_2d = lambda x, y: sig1(x) * sig_y(y)

    gauss2d = lambda x, y, x0, y0, sx, sy, rho: np.exp(
        -.5*((x-x0)**2/sx**2 + 2*(x-x0)*(y-y0)/sx/sy*rho + (y-y0)**2/sy**2))
    bkg_2d = lambda x, y: (
        + .2/(x*y + .1)
        + np.maximum(0, 
                3*gauss2d(x, y, .6, .15, .5, .2, .7)
                + 2*gauss2d(x, y, .25, .75, .15, .2, -.3))
    )

    #s1d = np.exp(sig1(x_range))
    s1d = sig1(x_range)
    b1d = bkg1(x_range)
    s1d = s1d / s1d.sum() * 1e4
    b1d = b1d / b1d.sum() * 1.3e6

    # Build the 2D hist.
    X, Y = np.meshgrid(x_range, y_range)
    s2d = sig_2d(X, Y)
    b2d = bkg_2d(X, Y)
    s2d = s2d / s2d.sum() * 1e4
    b2d = b2d / b2d.sum() * 1.3e6

    return s1d, b1d, s2d, b2d

mock_data = False

# Some tuning parameters.
default_cut_x = .6

x_range = np.linspace(0, 1, 20, endpoint=False)
y_range = np.linspace(0, 1, 20, endpoint=False)
X, Y = np.meshgrid(x_range, y_range)

if mock_data:
    s1d, b1d, s2d, b2d = getMockData(x_range, y_range)
    y_label = "arbirtary event counts"
else:
    from higgsstrahlung_adapter import getData
    s1d, b1d, s2d, b2d = getData(x_range, y_range)
    y_label = "N$_\mathrm{ev}$ / 250 ifb"

Create the Z ranking...
           mZ &lt;= 165.81, eff=100.00 %, pur=  0.17 %, 1726.7908
      mRecoil &lt;= 143.49, eff= 90.00 %, pur=  7.55 %, 1554.1117
           mZ &lt;= 102.11, eff= 80.00 %, pur= 18.99 %, 1381.4326
           mZ &lt;=  96.13, eff= 70.00 %, pur= 31.81 %, 1208.7535
   abs(cosTZ) &lt;=   0.91, eff= 60.00 %, pur= 45.34 %, 1036.0745
      mRecoil &lt;= 130.27, eff= 50.00 %, pur= 55.91 %,  863.3954
   abs(cosTZ) &lt;=   0.81, eff= 40.01 %, pur= 68.37 %,  690.7163
      mRecoil &lt;= 127.09, eff= 30.00 %, pur= 78.21 %,  518.0372
   abs(cosTZ) &lt;=   0.63, eff= 20.00 %, pur= 83.52 %,  345.3582
           mZ &gt;=  90.83, eff= 10.00 %, pur= 87.22 %,  172.6791

Counting sample: $ZH \rightarrow \nu\bar{\nu} H$¶

Preselection¶

$M_\textrm{vis} \in [10, 180]~\textrm{GeV}$
$M_\textrm{miss} \in [50, 220]~\textrm{GeV}$
$\left| \textrm{cos}\theta_\textrm{miss}\right| < 0.99$
$M_\textrm{vis} + M_\textrm{miss} < 247~\textrm{GeV}$

Rather loose preselection. The signal purity after this preselection is O(1%).

Build a BDT¶

When searching for the $ZH \rightarrow \nu\bar{\nu} H$ signal the assumption is that everything that is visible in the event stems from the Higgs decay.

BDT variables¶

e.g. # charged Hadrons, $M_\textrm{vis} \widehat{=} M_H$, $p_T$ of leading lepton, ...

Example illustration¶

Note that the probability density functions are shown - Normalization and shape do not correspond to what is expected to be found in the data.

def draw1D(ax, x, label_and_y, cut_x=0, xy_flipped=False, lim=None):
    max_y = 0 
    for label, y in label_and_y:
        y
        max_y = max(max_y, max(y))

        if xy_flipped:
            ax.plot(y, x, label=label)
            ax.fill_betweenx(x, y, where=x>=cut_x, alpha=.5)
        else:
            ax.plot(x, y, label=label)
            ax.fill_between(x, y, where=x>=cut_x, alpha=.5)
    
    if xy_flipped:
        ax.hlines(cut_x, 0, max_y, color="black", ls="--", label="cut")
        ax.set_xlabel(y_label)
        ax.set_xscale("log")
        if lim:
            ax.set_xlim(lim)
    else:
        ax.vlines(cut_x, 0, max_y, color="black", ls="--", label="cut")
        ax.set_ylabel(y_label)
        ax.set_yscale("log")
        if lim:
            ax.set_ylim(lim)


def drawNu(ax, cut_x=0):
    x = x_range
    label_and_y = [("sig", s1d), ("bkg", b1d)]
    draw1D(ax, x, label_and_y, cut_x)

    ax.set_xlabel("BDT$_\mathrm{H}$")
    ax.set_title("$Z \\rightarrow \\nu\\bar{\\nu}$")
    plt.legend()

fig_nu, ax_nu = plt.subplots()
drawNu(ax_nu, cut_x=default_cut_x)
fig_nu.savefig("fig/BDT_H_nu.png", facecolor="white", dpi=300)

Reference sample $ZH \rightarrow e^+e^- H$¶

In the analyis, both $Z \rightarrow e^+e^-$ and $Z \rightarrow \mu^+\mu^-$ events are used in the reference sample. This sample is used to estimate the (signal) efficiency of the cut on the BDT output (see above).

After seperating the (presumed) Higgs and Z boson remnants (and a again a preselection), the distribution in the BDT output variable can be shown. Under the assumption that the seperation step worked well, the signal distribution has the same shape as in the $Z\rightarrow \nu\bar\nu$ Higgsstrahlung case. The relative normalization of the two signal curves is determined by the Z boson branching ratios.

fig_eH, ax_eH = plt.subplots()
label_and_y = [
    ("sig", np.sum(s2d, axis=1)),
    ("bkg", np.sum(b2d, axis=1)),
]
draw1D(ax_eH, x_range, label_and_y, cut_x=default_cut_x)
ax_eH.set_xlabel("BDT$_\mathrm{H}$")
ax_eH.set_title("$Z \\rightarrow e^+ e^-$")
plt.legend()
plt.savefig("fig/BDT_H_e1.png", facecolor="white", dpi=300)

Additional information from the Z decay¶

In contrast to the $Z \rightarrow \nu \bar\nu$ sample we now have additional information from the Z decay available.

So far this information is used to perform a strong preselection. It is hard to establish a notion of a good preselection in this way. In a counting experiment we would want to optimize the selection such that it yields a high efficiency $\times$ purity and thus the smallest possible poissonian uncertainty.

In this analysis we are interested in minimizing the uncertainty on the Higgstrahlung cross section $\sigma_\mathrm{ZH}$(LCWS proceedings). With our approach, the dependency of $\sigma_\mathrm{ZH}$ on the BDT$_\mathrm{H}$ cut and the Z-based selection is more complicated. Optimizing with respect to efficiency $\times$ purity will not yield ideal results.

Compressing the Z-dependent information into one dimension¶

Employing a BDT for the Higgs-part selection means that we build a powerful one-dimensional distribution (as a representation of the space of the training variables). The idea is that the BDT output has a high seperating power between signal and background.

In the following we want to explore what we can do if we have a one-dimensional representation of the Z$(\rightarrow ee)$ part of the event, $f(\mathrm{Z})$. This is not necessarily a complex construct (e.g. BDT). A simpler proposal is:

$f(\mathrm{Z}) = \mathrm{Norm} \cdot (M_\textrm{ee} - 91)^2 + (M_\textrm{recoil} - 125)^2$.

An illustration of the 2-dimensional distribution of the signal in the $f(\mathrm{Z})-\mathrm{BDT}_\mathrm{H}$ space is shown below. The gaussian form is only chosen for simplicity and not expected. As the decays of the Higgs boson and the Z boson in the Higgsstrahlung event are independent, the uncorrelatedness of the two dimensions is a feature that should stay true with the signal distribution from data.

fig, ax_e_2d_signal = plt.subplots()

ax_e_2d_signal.contourf(x_range, y_range, s2d, ls="-")
ax_e_2d_signal.set_xlabel("BDT$_\mathrm{H}$")
ax_e_2d_signal.set_ylabel("$f(\mathrm{Z})$")
fig.savefig("fig/H_vs_Z_signal.png", facecolor="white", dpi=300)

Adding the background¶

The following figure is an artist's view of the 2-dimensional distributions of signal and background in the $Z \rightarrow e^+e^-$ sample.

The background distribution is shown as colored contour areas. The signal distribution is superimposed as contour lines.

def drawHZ(ax, cut_x=0, cut_y=0):
    def logZ(z):
        log_z = np.log(z)
        log_z[log_z == -np.inf] = min(log_z[log_z != - np.inf])
        return log_z

    ax.set_xlabel("BDT$_\mathrm{H}$")
    ax.set_ylabel("$f(\mathrm{Z})$")
    ax.contourf(x_range, y_range, logZ(b2d))

    z = logZ(s2d)
    z[z < 1e-3*z.max()] = 0 # To avoid drawing artifacts from low statistics.
    ax.contour(X, Y, z)


    ax.fill_between(x_range, cut_y, 
        alpha=.75, facecolor="white", hatch="/", edgecolor="black")#"red")
    ax.vlines(cut_x, cut_y, max(y_range), color="black", ls="--")

fig, ax_e_2d = plt.subplots()
drawHZ(ax_e_2d, 0, 0)

fig.savefig("fig/H_vs_Z_all.png", facecolor="white", dpi=300)

def allRegionsPlot(cut_x=0, cut_y=0):
    fig = plt.figure(figsize=(7, 8))
    grid = plt.GridSpec(5, 4, hspace=0.3, wspace=0.3)
    ax_main = fig.add_subplot(grid[2:, :-1])
    ax_x_pdf1 = fig.add_subplot(grid[0, :-1], sharex=ax_main)
    ax_x_pdf2 = fig.add_subplot(grid[1, :-1], sharex=ax_main)
    ax_y_pdf  = fig.add_subplot(grid[2:, -1], sharey=ax_main)

    plt.setp(ax_x_pdf1.get_xticklabels(), visible=False)
    plt.setp(ax_x_pdf2.get_xticklabels(), visible=False)
    plt.setp(ax_y_pdf.get_yticklabels(), visible=False)
    
    drawHZ(ax_main, cut_x, cut_y)

    nu_label_and_y = [("sig", s1d), ("bkg", b1d)]

    idx_cut_x = np.argwhere(x_range >= cut_x)[0][0]
    idx_cut_y = np.argwhere(y_range >= cut_y)[0][0]
    
    eH_label_and_y = [
        ("sig", np.sum(s2d[idx_cut_y:, :], axis=0)),
        ("bkg", np.sum(b2d[idx_cut_y:, :], axis=0)),
    ]
    eZ_label_and_y = [
        ("sig", np.sum(s2d[:, idx_cut_x:], axis=1)),
        ("bkg", np.sum(b2d[:, idx_cut_x:], axis=1)),
    ]
    nu_lim = (.1, 1.5*b1d.max())
    eH_lim = (.1, 1.5*np.sum(b2d, axis=0).max())
    eZ_lim = (.1, 1.5*np.sum(b2d, axis=1).max())
    draw1D(ax_x_pdf1, x_range, nu_label_and_y, cut_x, lim=nu_lim)
    draw1D(ax_x_pdf2, x_range, eH_label_and_y, cut_x, lim=eH_lim)
    draw1D(ax_y_pdf,  y_range, eZ_label_and_y, cut_y, xy_flipped=True, lim=eZ_lim)

    ax_x_pdf1.set_title("$Z \\rightarrow \\nu\\bar{\\nu}$", fontsize=8)
    ax_x_pdf2.set_title("$Z \\rightarrow e^+e^-$", fontsize=8)
    ax_y_pdf.set_title("$Z \\rightarrow e^+e^-$", fontsize=8)
    ax_x_pdf1.legend(loc="upper left")
    return fig

#for cut_x, cut_y in itertools.product(np.arange(0, 1, .1), np.arange(0, 1, .1)):
for cut_x, cut_y in [(.5, .5)]:
    fig = allRegionsPlot(cut_x, cut_y)
    fig.savefig(f"all_regions/x{cut_x:.2f}_x{cut_y:.2f}.png", facecolor="white", dpi=500)

#!convert -delay 100 -loop 0 -limit disk 4GiB all_regions/*_x0.2.png all_regions.gif

Actually count the entries per region¶

all_counts = []
for cut_x, cut_y in itertools.product(np.arange(0, 1, .1), np.arange(0, 1, .1)):
    N = {}
    N["H"] = cut_x
    N["Z"] = cut_y
    idx_cut_x = np.argwhere(x_range >= cut_x)[0][0]
    idx_cut_y = np.argwhere(y_range >= cut_y)[0][0]

    N["s_nu"] = np.sum(s1d[idx_cut_x:])
    N["b_nu"] = np.sum(b1d[idx_cut_x:])
    N["s_e1"] = np.sum(s2d[idx_cut_y:, idx_cut_x:])
    N["b_e1"] = np.sum(b2d[idx_cut_y:, idx_cut_x:])
    N["s_e1_not"] = np.sum(s2d[idx_cut_y:, :idx_cut_x])
    N["b_e1_not"] = np.sum(b2d[idx_cut_y:, :idx_cut_x])
    all_counts.append(N)
N = pd.DataFrame(all_counts)

N["counting_unc"] = ((N.s_nu + N.b_nu) / N.s_nu ** 2)**.5
N["efficiency_unc"] = (
    (N.s_e1 + N.s_e1_not + N.b_e1 + N.b_e1_not) / (N.s_e1 + N.s_e1_not) ** 2  
    + (N.s_e1 + N.b_e1) / N.s_e1 ** 2 
    - 2*(N.s_e1 + N.b_e1) / (N.s_e1 * (N.s_e1 + N.s_e1_not))
)**.5
N["cs_unc"] = (N["counting_unc"]**2 + N["efficiency_unc"]**2)**.5

print(N.counting_unc.min())
N

0.09386146947610914

for unc in [
    "counting_unc", 
    #"efficiency_unc", 
    #"cs_unc"
]:
    (100*N[unc]).plot(label=unc)
plt.legend()
#plt.yscale("log")
plt.xticks(np.arange(0, len(N)+1, 10))
plt.grid()
#plt.ylim((0, 5))

unc = []
N_unc = []
eff_unc = []
for x, df in N.groupby("H"):
    idx = df["cs_unc"].idxmin()
    unc.append(N.iloc[idx]["cs_unc"])
    eff_unc.append(N.iloc[idx]["efficiency_unc"])
    N_unc.append(N.iloc[idx]["counting_unc"])
plt.plot(unc, label="unc")
plt.plot(N_unc, label="N_unc")
plt.plot(eff_unc, label="eff_unc")
plt.legend()

&lt;matplotlib.legend.Legend at 0x7f787821b280&gt;

	H	Z	s_nu	b_nu	s_e1	b_e1	s_e1_not	b_e1_not	counting_unc	efficiency_unc	cs_unc
0	0.0	0.0	10418.725561	1.331171e+06	1726.790764	1.764330e+06	0.000000	0.000000	0.111172	0.000000	0.111172
1	0.0	0.1	10418.725561	1.331171e+06	1640.199816	4.236735e+04	0.000000	0.000000	0.111172	0.000000	0.111172
2	0.0	0.2	10418.725561	1.331171e+06	1467.510454	1.138866e+04	0.000000	0.000000	0.111172	0.000000	0.111172
3	0.0	0.3	10418.725561	1.331171e+06	1294.917908	3.655941e+03	0.000000	0.000000	0.111172	0.000000	0.111172
4	0.0	0.4	10418.725561	1.331171e+06	1122.311384	1.801097e+03	0.000000	0.000000	0.111172	0.000000	0.111172
...	...	...	...	...	...	...	...	...	...	...	...
95	0.9	0.5	8926.536814	7.057489e+05	839.251026	6.811322e+02	110.444958	268.915294	0.094705	0.021209	0.097050
96	0.9	0.6	8926.536814	7.057489e+05	682.362217	3.366668e+02	94.678974	122.464928	0.094705	0.019802	0.096753
97	0.9	0.7	8926.536814	7.057489e+05	539.686725	1.936884e+02	64.716379	29.207132	0.094705	0.016911	0.096203
98	0.9	0.8	8926.536814	7.057489e+05	383.304116	8.123480e+01	48.438605	21.149280	0.094705	0.020325	0.096861
99	0.9	0.9	8926.536814	7.057489e+05	233.999562	3.493190e+01	25.025448	4.677283	0.094705	0.022103	0.097250