# Use anajob to study the run

With `anajob`, we can get a file-level, event-level and collection-level overview.

In [None]:
# # Data can be produced with:
# cd /data_ilc/flc/kunath/local_only/eehiq
# source /cvmfs/ilc.desy.de/sw/x86_64_gcc82_centos7/v02-02-03/init_ilcsoft.sh
# anajob rv02-02.sv02-02.mILD_l5_o1_v02.E250-SetA.I500002.P2f_z_eehiq.eL.pR.n000.d_dstm_15783_0.slcio > anajob.txt
#
# Or alternatively, if you have the pyLCIO kernel set up, uncomment the following line:
# !cd /data_ilc/flc/kunath/local_only/eehiq; if [ ! -f anajob.txt ]; then anajob rv02-02.sv02-02.mILD_l5_o1_v02.E250-SetA.I500002.P2f_z_eehiq.eL.pR.n000.d_dstm_15783_0.slcio > anajob.txt; else echo "File already exists!"; fi

In [None]:
import io
import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys
from IPython.display import HTML

# Folder that holds the anajob text dump; it is reused further down for the
# cached event table.
data_folder = "/data_ilc/flc/kunath/local_only/eehiq"
# Read the complete anajob output into one string; it is parsed from memory.
with open(f"{data_folder}/anajob.txt") as f:
    raw_txt = f.read()

In [None]:
def remove_prefix(full_str, prefix):
    """Return ``full_str`` with the leading ``prefix`` cut off.

    The prefix is mandatory: an ``AssertionError`` is raised when
    ``full_str`` does not start with ``prefix``.
    """
    assert full_str.startswith(prefix)
    n_skip = len(prefix)
    return full_str[n_skip:]


def remove_suffix(full_str, suffix):
    """Return ``full_str`` with the trailing ``suffix`` cut off.

    The suffix is mandatory: an ``AssertionError`` is raised when
    ``full_str`` does not end with ``suffix``. An empty suffix leaves the
    string unchanged.
    """
    assert full_str.endswith(suffix)
    # Slice up to an explicit end position: `full_str[:-len(suffix)]` would
    # turn into `full_str[:0]` (the empty string) for an empty suffix.
    return full_str[: len(full_str) - len(suffix)]


class Anajob:
    """Parse the text printed by ``anajob`` into pandas objects.

    After construction the instance exposes:

    * ``run_df`` -- DataFrame with one row per run-info entry of the header.
    * ``event_header`` -- DataFrame indexed by ``EVENT`` holding the ``RUN``
      and ``DETECTOR`` of every parsed event.
    * ``df`` -- the collection tables of the parsed events, concatenated,
      with an added ``EVENT`` column.
    * ``file_name``, ``n_runs``, ``n_events`` -- read from the file line of
      the header; ``n_events_used`` is the number of events actually parsed.

    The expected text layout is checked with ``assert`` statements, and only
    dumps that list exactly one input file are supported.
    """

    # Literal markers used to cut the anajob text into its sections.
    _header_tag = "anajob:  will open and read from files: \n\n"
    _header_end_tag = "\n\n will reopen and read from files: \n"
    _run_tag = "\n  Run : "
    _event_tag = "///////////////////////////////////\nEVENT: "

    def __init__(self, raw_string, max_events=-1):
        """Parse ``raw_string``, the complete text printed by ``anajob``.

        ``max_events`` limits how many events are parsed into ``df`` and
        ``event_header``; the default ``-1`` parses all ``n_events`` events
        announced in the header. The header (run information) is always
        parsed in full. A one-line summary is printed as a side effect.
        """
        # Everything in front of the first event marker is the header.
        str_events = raw_string.split(self._event_tag)

        str_header = str_events.pop(0)
        self.run_df = self._process_header(str_header)
        self.n_events_used = self.n_events if max_events == -1 else int(max_events)
        print(self.__repr__())

        # Filled list-wise by `_make_event`; converted to a DataFrame below.
        self.event_header = {"EVENT": [], "RUN": [], "DETECTOR": []}
        # The last event chunk still carries the closing summary of the dump.
        str_events[-1] = remove_suffix(
            str_events[-1],
            f"{self.n_events} events read from files: \n     {self.file_name}\n",
        )
        assert len(str_events) >= self.n_events_used
        # `map` is lazy: `_make_event` runs (and appends to `event_header`)
        # while `pd.concat` consumes it.
        self.df = pd.concat(map(self._make_event, str_events[: self.n_events_used]))
        self.event_header = pd.DataFrame(self.event_header).set_index("EVENT")

        # Consistency checks between the header counts and the parsed events.
        assert self.n_runs == len(self.event_header["RUN"].unique())
        assert self.n_events_used == len(self.event_header)
        assert self.n_events_used == len(self.df["EVENT"].unique())

    def __repr__(self):
        """Return a one-line summary: events used / events in file, file name."""
        return "\n".join(
            [
                f"{__class__.__name__} with {self.n_events_used}/{self.n_events} "
                f"events from file {self.file_name}"
            ]
        )

    def _make_event(self, str_event):
        """Turn the text of one event into a DataFrame of its collections.

        ``str_event`` is one piece of the dump after splitting at
        ``_event_tag``, so its first line is the event number. The event
        number, ``RUN`` and ``DETECTOR`` are appended to the lists in
        ``self.event_header``; the collection table is returned with an added
        ``EVENT`` column.
        """
        event_lines = str_event.split("\n")
        i_event = int(event_lines.pop(0))
        self.event_header["EVENT"].append(i_event)
        self.event_header["RUN"].append(int(remove_prefix(event_lines.pop(0), "RUN: ")))
        self.event_header["DETECTOR"].append(
            remove_prefix(event_lines.pop(0), "DETECTOR: ")
        )
        # Discard the decoration between the event header and the table; the
        # trailing comments show the content of each discarded line.
        event_lines.pop(0)  # COLLECTIONS: (see below)
        event_lines.pop(0)  # ///////////////////////////////////
        event_lines.pop(0)  #
        event_lines.pop(0)  # -----------------------------------
        # Index 0 is now the line that becomes the CSV header row, so the rule
        # underneath it sits at index 1.
        event_lines.pop(1)  # ===================================
        # Drop trailing blank lines, then the rule that closes the table.
        while event_lines[-1].strip(" ") == "":
            event_lines.pop()
        event_lines.pop()  # -----------------------------------
        # Re-delimit the table with tabs: cut at double spaces, drop the empty
        # pieces produced by longer runs of spaces, trim every cell, and
        # remove a tab that ends up directly in front of a line break.
        # NOTE(review): presumably the columns are padded with at least two
        # spaces and no cell contains a double space -- confirm against real
        # anajob output.
        table_str = "\n".join(event_lines).split("  ")
        table_str = filter(None, table_str)
        table_str = map(lambda x: x.strip(" "), table_str)
        table_str = "\t".join(table_str)
        table_str = table_str.replace("\t\n", "\n")
        col_df = pd.read_csv(io.StringIO(table_str), sep="\t")
        col_df["EVENT"] = i_event
        # Progress message for every event number divisible by 1000.
        # NOTE(review): print() still appends a newline after the "\r", so
        # every message lands on its own line instead of overwriting the last.
        if not i_event % 1000:
            print(f"{i_event:>6}/{self.n_events_used} events\r")
        return col_df

    def _process_header(self, str_header):
        """Parse the part of the dump that precedes the first event.

        Sets ``self.file_name``, ``self.n_runs`` and ``self.n_events`` from
        the file line and returns a DataFrame with one row per run-info
        entry. Exactly one listed file is supported (checked by assertion).
        """
        # The piece in front of the first run marker is the file listing.
        run_infos = str_header.split(self._run_tag)
        header = run_infos.pop(0)
        assert header.startswith(self._header_tag)
        files = header[len(self._header_tag) :].split("\n")
        assert files.pop() == ""
        assert len(files) == 1
        for str_file in files:
            # File line: "<file name> [ number of runs: N, number of events: M ]".
            self.file_name, str_file = str_file.strip().split(maxsplit=1)
            str_file = str_file.strip()
            assert str_file.startswith("[ ") and str_file.endswith(" ]")
            self.n_runs, self.n_events = str_file[2:-2].split(", ")
            self.n_runs = int(remove_prefix(self.n_runs, "number of runs: "))
            self.n_events = int(remove_prefix(self.n_events, "number of events: "))
        # The last run-info entry is followed by the "will reopen" trailer.
        run_infos[-1] = remove_suffix(
            run_infos[-1], self._header_end_tag + "     " + self.file_name + "\n"
        )
        run_series = {}
        for i, run_info in enumerate(run_infos):
            run_series[i] = self._make_run(run_info)
        # The concat puts each entry in a column; transpose to one row per entry.
        return pd.concat(run_series, axis=1).transpose()

    def _make_run(self, str_run):
        """Parse one run-info entry into a Series.

        The first line is read as ``"<run number> - <detector tag>"`` and
        every further line as ``" parameter <key> [string]: <value>"``. The
        last line must be empty or consist of dashes (optionally surrounded
        by whitespace).
        """
        run_dict = {}
        lines = str_run.split("\n")
        assert lines.pop().strip().strip("-") == ""
        i_run, detector_tag = lines.pop(0).split(" - ")
        run_dict["RUN"] = int(i_run)
        # Trailing colons and spaces are stripped from the detector tag.
        run_dict["DETECTOR"] = detector_tag.rstrip(": ")
        for line in lines:
            key, val = line.split(": ", maxsplit=1)
            key = remove_prefix(key, " parameter ")
            key = remove_suffix(key, " [string]")
            # Trailing commas and spaces are stripped from the value.
            run_dict[key] = val.rstrip(", ")
        return pd.Series(run_dict)


# The run table is built from the header alone, so three events are enough
# to keep this cell fast.
print("For the run overview, it is not necessary to read all the events.")
aj = Anajob(raw_txt, max_events=3)

## Run information

We can see that this run consists of 72 parts.

In [None]:
# One row per run-info entry parsed from the anajob header.
run_df = aj.run_df
run_df

### Fields with the same value in all runs

In [None]:
# Keep the fields that take a single value across all runs and read that
# value off the first run (column 0 of the transposed frame).
repeated_info = run_df.transpose()[run_df.nunique().values == 1][0]
# The steering file content is taken out of the table; it is kept in its own
# variable.
steering_file = repeated_info.pop("SteeringFileContent")
# Two-column table: field name next to its run-independent value.
_df = pd.DataFrame(
    np.array([repeated_info.index.values, repeated_info.values])
    .transpose()
    .reshape(-1, 2),
    columns=["field name", "unique value"],
)
HTML(_df.to_html(index=False))

#### Steering file

In [None]:
# The stored text contains literal backslash-n sequences; turn them into real
# line breaks for printing.
print(steering_file.replace("\\n", "\n"))

### Information that differs per run

In [None]:
# Show only the columns that take more than one value (or none) across runs.
run_df.loc[:, run_df.nunique().values != 1]

#### Example: All output file names

In [None]:
# `outputFile` is looked up among the run-dependent columns only.
for output_file in run_df.loc[:, run_df.nunique().values != 1].outputFile.values:
    print(output_file)

## Event header

To keep the code fast, only a small `max_events` is used here.
The values in the event header stay the same when the whole file is read.

In [None]:
# Number of distinct values per event-header column, then the table itself.
print(aj.event_header.nunique())
aj.event_header

## Event information

In [None]:
# The table of all events is cached as a CSV next to the anajob dump: it is
# read back when the file exists and parsed from `raw_txt` otherwise.
anajob_events_path = data_folder + "/anajob_events.csv"
try:
    ev_df = pd.read_csv(anajob_events_path, index_col=0)
except FileNotFoundError:
    ev_df = Anajob(raw_txt, max_events=-1).df
    try:
        ev_df.to_csv(anajob_events_path)
    except OSError:
        # Writing the cache is best effort (e.g. a read-only data folder):
        # warn and carry on with the in-memory table. Only I/O failures are
        # swallowed, so KeyboardInterrupt and SystemExit still propagate.
        print(
            f"WARNING: The dataframe could not be saved to {anajob_events_path}",
            file=sys.stderr,
        )

In [None]:
# Mean NUMBER OF ELEMENTS per collection, taken over the rows in which the
# collection appears.
mean_n = ev_df.groupby("COLLECTION NAME")["NUMBER OF ELEMENTS"].mean()
# Two-column table: collection name next to its mean.
_df = pd.DataFrame(
    np.array([mean_n.index.values, mean_n.values]).transpose().reshape(-1, 2),
    columns=["COLLECTION NAME", "mean NUMBER OF ELEMENTS"],
)
HTML(_df.to_html(index=False))

In [None]:
# Display the per-event collection table.
ev_df

In [None]:
def plot_entries_per_collection(df):
    """Histogram the NUMBER OF ELEMENTS of every collection.

    One step histogram per collection is drawn on three axes:

    * main axes: unit-width bins from -0.5 to 50.5, normalized ("pdf"),
    * upper inset: reverse cumulative histogram with unit-width bins from
      -0.5 to 15.5, only for collections whose entries all lie below the
      last bin edge,
    * lower inset: 20-wide bins covering the full value range, normalized,
      with a logarithmic y axis.

    Collections are drawn in order of decreasing total number of elements.
    Colours cycle through the Tableau palette and the line style changes
    after each full colour cycle.

    ``df`` must provide the columns ``"COLLECTION NAME"`` and
    ``"NUMBER OF ELEMENTS"``. The created matplotlib figure is returned.
    """
    per_collection = df.groupby("COLLECTION NAME")["NUMBER OF ELEMENTS"]
    bins1 = np.arange(-0.5, 51, 1)
    bins2 = np.arange(-0.5, 16)
    # Extend the coarse binning one full step past the largest value: with a
    # stop of `max + 0.5` the last edge always fell below the maximum, so the
    # largest entries were dropped (and a maximum below 20 left a single edge).
    bins3 = np.arange(-0.5, per_collection.max().max() + 20, 20)
    colors = list(mcolors.TABLEAU_COLORS)
    linestyles = ["-", ":", "--", "-."]
    fig, ax = plt.subplots(figsize=(12, 9))
    axins1 = ax.inset_axes([0.30, 0.60, 0.69, 0.39], transform=ax.transAxes)
    axins2 = ax.inset_axes([0.30, 0.17, 0.69, 0.39], transform=ax.transAxes)
    base_kw = dict(histtype="step", linewidth=2, density=True)
    ordered_collections = per_collection.sum().sort_values(ascending=False).index
    for i, collection in enumerate(ordered_collections):
        x = df[df["COLLECTION NAME"] == collection]["NUMBER OF ELEMENTS"]
        kw = dict(
            base_kw,
            color=colors[i % len(colors)],
            # Wrap around so that more collections than colour/line-style
            # combinations do not raise an IndexError.
            linestyle=linestyles[(i // len(colors)) % len(linestyles)],
        )
        x.hist(bins=bins1, ax=ax, label=f"{collection} ({x.mean():.1f})", **kw)
        # The zoomed inset only shows collections that fit into its range.
        if (x < bins2[-1]).all():
            x.hist(bins=bins2, ax=axins1, cumulative=-1, **kw)
        x.hist(bins=bins3, ax=axins2, **kw)
    # Semi-transparent inset backgrounds keep the main histogram visible.
    axins1.patch.set_alpha(0.8)
    axins2.patch.set_alpha(0.8)
    ax.set_ylabel("pdf")
    axins1.set_ylabel("cdf")
    axins2.set_ylabel("pdf")
    axins2.set_yscale("log")
    ax.legend(title="COLLECTION NAME", bbox_to_anchor=(1.0, 1.0))
    fig.tight_layout()
    return fig


# Draw the element-count distributions for the events in `ev_df`.
fig = plot_entries_per_collection(ev_df)