diff --git a/MC/bin/o2dpg_sim_workflow.py b/MC/bin/o2dpg_sim_workflow.py index dcd535a22..89827daea 100755 --- a/MC/bin/o2dpg_sim_workflow.py +++ b/MC/bin/o2dpg_sim_workflow.py @@ -1798,6 +1798,12 @@ def getDigiTaskName(det): if created_by_option != '': created_by_option += ' ' + aod_creator + aod_parent_option = option_if_available('o2-aod-producer-workflow', '--aod-parent', envfile=async_envfile) + if aod_parent_option != '' and len(args.aod_parent_file) > 0: + aod_parent_option += ' ' + args.aod_parent_file + else: + aod_parent_option = '' + aod_timeframe_id = f"${{ALIEN_PROC_ID}}{aod_df_id}" if not args.run_anchored else "" if len(args.aod_output_folder) > 0: aod_timeframe_id = args.aod_output_folder @@ -1818,7 +1824,7 @@ def getDigiTaskName(det): "--anchor-pass ${ALIEN_JDL_LPMANCHORPASSNAME:-unknown}", "--anchor-prod ${ALIEN_JDL_LPMANCHORPRODUCTION:-unknown}", "--reco-pass ${ALIEN_JDL_LPMPASSNAME:-unknown}", - f"--aod-parent {args.aod_parent_file}", + aod_parent_option, created_by_option, "--combine-source-devices" if not args.no_combine_dpl_devices else "", "--disable-mc" if args.no_mc_labels else "", diff --git a/MC/utils/AODBcRewriter.C b/MC/utils/AODBcRewriter.C index 4c0240fdb..16c7451ef 100644 --- a/MC/utils/AODBcRewriter.C +++ b/MC/utils/AODBcRewriter.C @@ -14,9 +14,8 @@ // // This tool fixes all three problems in one pass per DF_ directory: // -// Stage 0 — Deduplicate the BC table in place (order-preserving; the input -// must already be globalBC-sorted). Build BC permutation map: -// bcPerm[oldBCrow] = newBCrow (monotonic non-decreasing). +// Stage 0 — Sort & deduplicate the BC table. Build BC permutation map: +// bcPerm[oldBCrow] = newBCrow. // // Stage 1 — Process every table that carries fIndexBCs / fIndexBC. // Remap the index via bcPerm, sort rows by the new index, and @@ -502,31 +501,33 @@ static PermMap rewriteTable(TTree *src, TDirectory *dirOut, } // ============================================================================ -// SECTION 4 — Stage 0: BC table deduplication (order-preserving) +// SECTION 4 — Stage 0: BC table sort + deduplication // ============================================================================ // -// Reads fGlobalBC from the BC tree, drops exact-duplicate BC values IN PLACE -// (preserving input row order), and writes the compacted table. Returns -// bcPerm[oldRow] = newRow. +// Reads fGlobalBC from the BC tree, sorts rows, drops exact-duplicate BC +// values, and writes the compacted table. Returns bcPerm[oldRow] = newRow. // -// The dedup is deliberately order-preserving so that bcPerm is monotonic -// non-decreasing. This matters because every BC-indexed table (collisions, -// FT0/FV0/FDD/Zdc, ...) is sliced per BC and must stay sorted by its fIndexBCs, -// and collisions are in turn the grouping anchor for tracks (sorted by -// fIndexCollisions). A non-order-preserving BC remap would force a full reorder -// cascade BC -> collisions -> tracks to keep all those groupings valid; keeping -// bcPerm monotonic means none of those tables need to be reordered at all. +// IMPORTANT (history — do not "optimize" this back): +// A previous revision replaced this sort with an order-preserving in-place +// dedup that std::abort()ed unless the input BC table was already globalBC- +// sorted. That directly contradicts this tool's PURPOSE (a) at the top of the +// file — repairing *non-monotonic* fGlobalBC in MERGED AO2Ds — so it aborted on +// exactly the files it exists to fix ("doesn't run to completion"). We sort +// unconditionally instead. // -// This REQUIRES the input BC table to already be sorted by fGlobalBC (the -// standard AO2D invariant; also asserted on the output by validateDF check #1). -// We assert it loudly rather than silently emit a non-monotonic BC table. +// Sorting BCs does imply a reorder cascade: collisions are sorted by their +// remapped fIndexBCs in Stage 1, and the collision-grouped track tables must +// then be re-grouped to follow the new collision order in Stage 1b. That +// cascade is real and unavoidable for non-monotonic input; it is handled +// explicitly and completely by those stages. This is the correct fix — keep +// it. An "assert already sorted" shortcut here is a known dead end. struct BCStage0Result { - PermMap bcPerm; // bcPerm[oldRow] = newRow in the deduped BC table + PermMap bcPerm; // bcPerm[oldRow] = newRow in sorted/deduped BC table Long64_t nUnique = 0; }; -static BCStage0Result stage0_dedupBCs(TTree *treeBCs, TDirectory *dirOut) { +static BCStage0Result stage0_sortBCs(TTree *treeBCs, TDirectory *dirOut) { BCStage0Result res; Long64_t n = treeBCs->GetEntries(); if (n == 0) return res; @@ -539,28 +540,20 @@ static BCStage0Result stage0_dedupBCs(TTree *treeBCs, TDirectory *dirOut) { std::vector gbcs(n); for (Long64_t i = 0; i < n; ++i) { treeBCs->GetEntry(i); gbcs[i] = gbc; } - // The BC table must already be sorted by fGlobalBC: the dedup below is - // order-preserving (merges only adjacent equal-globalBC rows), which keeps - // bcPerm monotonic and avoids a reorder cascade through collisions/tracks. - // A non-monotonic input would silently break that guarantee, so abort loudly. - for (Long64_t i = 1; i < n; ++i) { - if (gbcs[i] < gbcs[i - 1]) { - std::cerr << "FATAL: O2bc_* table is not sorted by fGlobalBC (row " << i - << " globalBC=" << gbcs[i] << " < row " << (i - 1) - << " globalBC=" << gbcs[i - 1] << ").\n" - << " AODBcRewriter requires a globalBC-sorted BC table so that\n" - << " BC deduplication is order-preserving; aborting.\n"; - std::abort(); - } - } + // Sort row indices by fGlobalBC (stable, so equal-globalBC rows keep their + // input order before being collapsed below). + std::vector order(n); + std::iota(order.begin(), order.end(), 0); + std::stable_sort(order.begin(), order.end(), + [&](Long64_t a, Long64_t b){ return gbcs[a] < gbcs[b]; }); - // Build the deduplicated row list and the (monotonic) permutation in source - // row order: adjacent rows sharing a globalBC collapse onto one output row. + // Build the deduplicated row list and the permutation: rows sharing a + // globalBC collapse onto one output row. res.bcPerm.assign(n, -1); std::vector rowOrder; // source rows to keep, in output order ULong64_t prev = 0; Int_t newRow = -1; - for (Long64_t srcRow = 0; srcRow < n; ++srcRow) { + for (Long64_t srcRow : order) { if (newRow < 0 || gbcs[srcRow] != prev) { ++newRow; prev = gbcs[srcRow]; @@ -571,7 +564,7 @@ static BCStage0Result stage0_dedupBCs(TTree *treeBCs, TDirectory *dirOut) { } res.nUnique = rowOrder.size(); - std::cout << " BC stage: " << n << " rows -> " << res.nUnique << " unique (in-place dedup)\n"; + std::cout << " BC stage: " << n << " rows -> " << res.nUnique << " unique (sorted)\n"; // Write the BC table (no index remapping needed for the table itself) rewriteTable(treeBCs, dirOut, rowOrder, /*indexBranch=*/"", /*parentPerm=*/{}); @@ -1159,10 +1152,10 @@ static void processDF(TDirectory *dirIn, TDirectory *dirOut) { return; } - // ---- Stage 0: deduplicate BCs (order-preserving) ---- + // ---- Stage 0: sort & deduplicate BCs ---- std::cout << "-- Stage 0: BCs --\n"; dirOut->cd(); - BCStage0Result s0 = stage0_dedupBCs(treeBCs, dirOut); + BCStage0Result s0 = stage0_sortBCs(treeBCs, dirOut); if (treeFlags) stage0_copyBCFlags(treeFlags, dirOut, s0.bcPerm); // Track which tree names have been written so we don't double-write @@ -1319,6 +1312,37 @@ static Long64_t checkIndexRange(TTree *t, const char *branchName, return bad; } +// Verify the collision-grouping invariant that O2's ArrowTableSlicingCache +// (validateOrder) enforces on the consumer side: in a table grouped by +// fIndexCollisions, every distinct index value — including the -1 "ambiguous" +// group — must occupy a single contiguous run of rows. A split group is +// exactly the failure that crashed event-selection downstream +// ("Table ... index fIndexCollisions has a group with index -1 that is split +// by N"). This is the post-write counterpart to Stage 1b's regrouping. +// Returns the number of groups found split into >1 run (0 = OK). +static Long64_t checkCollisionGroupContiguity(TTree *t) { + TBranch *br = t->GetBranch("fIndexCollisions"); + if (!br) return 0; + TLeaf *leaf = (TLeaf*)br->GetListOfLeaves()->At(0); + if (!leaf || TString(leaf->GetTypeName()) != "Int_t") return 0; + if (leaf->GetLen() != 1 || leaf->GetLeafCount()) return 0; // scalar only + + Int_t idx = 0; + br->SetAddress(&idx); + std::unordered_set closed; // groups whose run has already ended + Int_t cur = 0; bool have = false; + Long64_t split = 0; + for (Long64_t i = 0; i < t->GetEntries(); ++i) { + br->GetEntry(i); + if (have && idx == cur) continue; // current run continues + if (have) closed.insert(cur); // previous run just ended + if (closed.count(idx)) ++split; // re-opening a group seen earlier + cur = idx; have = true; + } + br->ResetAddress(); + return split; +} + static bool validateDF(TDirectory *d) { bool ok = true; @@ -1438,6 +1462,29 @@ static bool validateDF(TDirectory *d) { } } + // ---- Collision-group contiguity (slicing invariant) ---- + // O2's slicing cache requires each fIndexCollisions group (incl. -1) to be a + // single contiguous run. This is the exact invariant whose violation crashed + // event-selection; it is what Stage 1b re-establishes. Check every + // collision-grouped track table (paste-join children follow their parent's + // row order, so checking the parent suffices). + TIter it3(d->GetListOfKeys()); + TKey *k3; + while ((k3 = (TKey*)it3())) { + TObject *obj = d->Get(k3->GetName()); + if (!obj || !obj->InheritsFrom(TTree::Class())) continue; + TTree *t = (TTree*)obj; + if (!isCollGroupedTrackTable(t->GetName())) continue; + if (isPasteJoinChild(t->GetName())) continue; + Long64_t split = checkCollisionGroupContiguity(t); + if (split > 0) { + std::cerr << " [FAIL] " << t->GetName() + << ": fIndexCollisions has " << split + << " group(s) split into non-contiguous runs (slicing will abort)\n"; + ok = false; + } + } + return ok; } diff --git a/MC/utils/CLAUDE.md b/MC/utils/CLAUDE.md index 0ff309146..56968ed16 100644 --- a/MC/utils/CLAUDE.md +++ b/MC/utils/CLAUDE.md @@ -1,5 +1,77 @@ # CLAUDE.md — AODBcRewriter Development Handoff +## Current work state (handoff — 2026-06-09) + +**Branch:** `fix/aodbcrewriter-track-regroup` · **Last commit:** `8bb9d30b3c` +(committed, not yet pushed/validated). + +### The bug being fixed: tracks' `-1` collision group split + +Downstream O2 analysis (`o2-analysis-event-selection`) was crashing with: + +``` +[FATAL] Table Tracks_IU index fIndexCollisions has a group with index -1 + that is split by 776 +``` + +O2's `ArrowTableSlicingCache::validateOrder` requires every `fIndexCollisions` +group in a track table — **including the `-1` "ambiguous" group** — to be one +contiguous run of rows. Two commits broke this: + +- **`b11cd3de`** added a value-wise remap of tracks' `fIndexCollisions` via + `collPerm` but left the track **rows in input order**. Since Stage 1 reorders + `O2collision` (sort by remapped `fIndexBCs`), the remapped values no longer + formed contiguous groups → the split. (b11cd3de made the *values* correct at + the cost of the *grouping*; the complete fix needs **both** — reorder rows + *and* remap values.) +- **`28b44ef`** (a colleague's attempt) then replaced the Stage 0 BC **sort** + with an order-preserving dedup that `std::abort()`s unless the input BC table + is already `globalBC`-sorted. That contradicts PURPOSE (a) — repairing + *non-monotonic* BCs in merged files — so it aborted on exactly the files the + tool exists to fix ("doesn't run to completion"). + +### What the current fix does + +1. **Reverted only the Stage 0 change** of `28b44ef`: restored `stage0_sortBCs` + (sort + dedup) and removed the abort. Non-monotonic merged BCs are repaired + again, as intended. +2. **Kept `28b44ef`'s Stage 1b** (`stage1b_reorderTrackTables`, Section 9b) and + the `fIndexTracks*` / `fIndexMFTTracks` / `fIndexFwdTracks` remaps in + `processPasteJoinTables`. This regroup-tracks-by-remapped-`fIndexCollisions` + mechanism is the *correct* fix for the split; it only ever failed because the + aborting Stage 0 stopped it from running. Rewriting it from scratch was + judged higher-risk than keeping the reviewed logic. +3. **Added validator check** `checkCollisionGroupContiguity` (Section 11): + mirrors O2's slicing invariant — flags any `fIndexCollisions` group split + into >1 run. Runs over every collision-grouped track table. + +**Design note / cascade:** sorting BCs is unavoidable for non-monotonic input +and forces a reorder cascade **BC (Stage 0) → collisions (Stage 1) → tracks +(Stage 1b)**, propagated to paste-join children and all track references. Do +**not** re-introduce an "assert already sorted / order-preserving" Stage 0 — it +is a known dead end (see the history note in the Section 4 code comment). + +### Not done yet / next steps for whoever picks this up + +- **UNVALIDATED on real data.** Parses cleanly in cling (`.L AODBcRewriter.C`), + but has *not* been run on a real merged AO2D, nor through the analysis task + that crashed. The macro is interpreter-only; `.L AODBcRewriter.C+` (ACLiC) + fails on missing std includes — pre-existing, not a regression. + Test sequence: `AODBcRewriter("AO2D.root","out.root")` → + `AODBcRewriterValidate("out.root")` (expect no `[FAIL]`) → then the **real** + `o2-analysis-event-selection` on `out.root` (ground truth). +- **Known fragility (whack-a-mole):** the `fIndexTracks*` reference remap is a + hardcoded enumeration in `processPasteJoinTables` and the validator's + `kIndexBranchToTable`. A missed reference into a reordered track table = silent + corruption. Longer term, *derive* index→referent relationships from the AO2D + column-name conventions instead of enumerating. +- **Biggest gap:** there is no executable analysis-level CI for this tool, so + regressions are only found in production with delay. Building a reproducer + (merged AO2D that triggers the split/abort) + a CI check that runs the real + task is the agreed top priority after this fix lands. + +--- + ## What this tool does `AODBcRewriter.C` is a ROOT macro that fixes structural integrity problems in @@ -87,7 +159,8 @@ order strictly follow its paste-join parent. | 5 | `stage0_copyBCFlags` | Copy BC flags table following BC row selection | | 6 | `MCCollKey`, `MCCollKeyHash`, `stage1_BCindexedTables` | Process all BC-indexed tables; deduplicate MCCollisions | | 7 | `stage2_MCCollIndexedTables` | Process all MCCollision-indexed tables; drop rows whose parent was deduped | -| 8 | `rowOrderFromPerm`, `findPermByPrefix`, `processPasteJoinTables` | Reorder paste-joined tables to follow their parent (1:1 row count guaranteed); remap any of their own index columns value-wise; copy unrelated tables verbatim | +| 9b | `isCollGroupedTrackTable`, `stage1b_reorderTrackTables` | **Stage 1b**: regroup collision-grouped track tables (`O2track_iu`, `O2mfttrack`, `O2fwdtrack`) by remapped `fIndexCollisions` (`-1` sinks to a contiguous tail); publish track perms so children/references follow. Restores the O2 slicing invariant after the BC→collision reorder cascade | +| 8 | `rowOrderFromPerm`, `findPermByPrefix`, `processPasteJoinTables` | Reorder paste-joined tables to follow their parent (1:1 row count guaranteed); remap any of their own index columns value-wise (incl. `fIndexTracks*` via the Stage 1b track perms); copy unrelated tables verbatim | | 9 | `copyNonTreeObjects` | Copy TMap metadata and other non-TTree objects | | 10 | `processDF` | Orchestrates all stages for one `DF_*` directory | | 11 | `AODBcRewriter` | Top-level entry: opens files, iterates `DF_*` dirs, preserves compression | @@ -227,6 +300,24 @@ The new validator catches the regression class as `[FAIL] paste-join size mismatch: O2mccollisionlabel* has N rows but parent O2collision* has M`. +### ~~9. Tracks' `-1` collision group split after BC/collision reorder~~ (RESOLVED — pending validation) + +See the **Current work state** section at the top for the full story. In short: +after Stage 1 reorders `O2collision`, the collision-grouped track tables must be +**reordered** (not just have their `fIndexCollisions` values remapped) so each +group — including the `-1` ambiguous group — stays one contiguous run, as O2's +`ArrowTableSlicingCache::validateOrder` requires. + +**Fix:** Stage 1b (`stage1b_reorderTrackTables`, Section 9b) stable-sorts each +track table by remapped `fIndexCollisions` (`-1` to a contiguous tail), publishes +the track perm, and `processPasteJoinTables` follows it for paste-join children +and remaps every `fIndexTracks*` reference. New validator check +`checkCollisionGroupContiguity` flags split groups as +`[FAIL] ... fIndexCollisions has N group(s) split into non-contiguous runs`. + +**Status:** committed on `fix/aodbcrewriter-track-regroup` (`8bb9d30b3c`), parses +in cling, **not yet run on a real merged AO2D or the failing analysis task.** + --- ## Testing checklist