diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp
index 012a132b56..0222ad6fe9 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.cpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.cpp
@@ -28,6 +28,7 @@
 #include <dual_simplex/user_problem.hpp>
 
 #include <raft/core/nvtx.hpp>
+#include <utilities/circular_deque.hpp>
 #include <utilities/hashing.hpp>
 #include <utilities/scope_guard.hpp>
 
@@ -37,9 +38,7 @@
 #include <cmath>
 #include <cstdio>
 #include <cstdlib>
-#include <deque>
 #include <limits>
-#include <optional>
 #include <string>
 #include <vector>
 
@@ -301,11 +300,16 @@ branch_and_bound_t<i_t, f_t>::branch_and_bound_t(
 template <typename i_t, typename f_t>
 f_t branch_and_bound_t<i_t, f_t>::get_lower_bound()
 {
-  f_t lower_bound        = lower_bound_ceiling_.load();
-  f_t heap_lower_bound   = node_queue_.get_lower_bound();
-  f_t worker_lower_bound = worker_pool_.get_lower_bound();
-  lower_bound            = std::min(heap_lower_bound, lower_bound);
-  lower_bound            = std::min(worker_lower_bound, lower_bound);
+  f_t lower_bound = lower_bound_numerical_.load();
+
+  if (bfs_worker_pool_.is_initialized()) {
+    for (i_t i = 0; i < bfs_worker_pool_.size(); ++i) {
+      if (bfs_worker_pool_[i]->is_active) {
+        lower_bound = std::min(lower_bound, bfs_worker_pool_[i]->lower_bound.load());
+        lower_bound = std::min(lower_bound, bfs_worker_pool_[i]->node_queue.get_lower_bound());
+      }
+    }
+  }
 
   if (std::isfinite(lower_bound)) {
     return lower_bound;
@@ -464,7 +468,7 @@ void branch_and_bound_t<i_t, f_t>::update_user_bound(f_t lower_bound)
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solution)
+void branch_and_bound_t<i_t, f_t>::set_solution_from_heuristics(const std::vector<f_t>& solution)
 {
   mutex_original_lp_.lock();
   if (solution.size() != original_problem_.num_cols) {
@@ -478,7 +482,8 @@ void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solu
   mutex_original_lp_.unlock();
   bool is_feasible    = false;
   bool attempt_repair = false;
-  if (!incumbent_.has_incumbent || obj < incumbent_.objective) {
+
+  if (improves_incumbent(obj)) {
     f_t primal_err;
     f_t bound_err;
     i_t num_fractional;
@@ -494,8 +499,10 @@ void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solu
     mutex_original_lp_.unlock();
     mutex_upper_.lock();
     if (is_feasible && improves_incumbent(obj)) {
-      upper_bound_ = std::min(upper_bound_.load(), obj);
+      f_t current_upper_bound = upper_bound_.load();
+      upper_bound_            = std::min(current_upper_bound, obj);
       incumbent_.set_incumbent_solution(obj, crushed_solution);
+      if (current_upper_bound > upper_bound_.load()) { report_heuristic(obj); }
     } else {
       attempt_repair         = true;
       constexpr bool verbose = false;
@@ -513,7 +520,6 @@ void branch_and_bound_t<i_t, f_t>::set_new_solution(const std::vector<f_t>& solu
     settings_.log.debug("Solution objective not better than current upper_bound_. Not accepted.\n");
   }
 
-  if (is_feasible) { report_heuristic(obj); }
   if (attempt_repair) {
     mutex_repair_.lock();
     repair_queue_.push_back(solution);
@@ -712,9 +718,11 @@ void branch_and_bound_t<i_t, f_t>::set_final_solution(mip_solution_t<i_t, f_t>&
   if (solver_status_ == mip_status_t::TIME_LIMIT) {
     settings_.log.printf("Time limit reached. Stopping the solver...\n");
   }
+
   if (solver_status_ == mip_status_t::WORK_LIMIT) {
     settings_.log.printf("Work limit reached. Stopping the solver...\n");
   }
+
   if (solver_status_ == mip_status_t::NODE_LIMIT) {
     settings_.log.printf("Node limit reached. Stopping the solver...\n");
   }
@@ -858,7 +866,7 @@ branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
   std::vector<f_t>& solution = worker->leaf_solution.x;
 
   switch (worker->search_strategy) {
-    case search_strategy_t::BEST_FIRST:
+    case BEST_FIRST:
 
       if (settings_.reliability_branching != 0) {
         branch_var = pc_.reliable_variable_selection(node_ptr,
@@ -867,7 +875,7 @@ branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
                                                      var_types_,
                                                      exploration_stats_,
                                                      upper_bound_,
-                                                     worker_pool_.num_idle_workers(),
+                                                     bfs_worker_pool_.num_idle(),
                                                      new_slacks_,
                                                      original_lp_);
       } else {
@@ -878,17 +886,18 @@ branch_variable_t<i_t> branch_and_bound_t<i_t, f_t>::variable_selection(
 
       return {branch_var, round_dir};
 
-    case search_strategy_t::COEFFICIENT_DIVING:
+    case COEFFICIENT_DIVING:
       return coefficient_diving(
         original_lp_, fractional, solution, var_up_locks_, var_down_locks_, log);
 
-    case search_strategy_t::LINE_SEARCH_DIVING:
+    case LINE_SEARCH_DIVING:
       return line_search_diving(fractional, solution, root_relax_soln_.x, log);
 
-    case search_strategy_t::PSEUDOCOST_DIVING:
+    case PSEUDOCOST_DIVING:
       return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log);
 
-    case search_strategy_t::GUIDED_DIVING:
+    case GUIDED_DIVING:
+      assert(incumbent_.has_incumbent);
       mutex_upper_.lock();
       current_incumbent = incumbent_.x;
       mutex_upper_.unlock();
@@ -975,10 +984,10 @@ struct nondeterministic_policy_t : tree_update_policy_t<i_t, f_t> {
   void on_numerical_issue(mip_node_t<i_t, f_t>* node) override
   {
     if (worker->search_strategy == search_strategy_t::BEST_FIRST) {
-      fetch_min(bnb.lower_bound_ceiling_, node->lower_bound);
+      fetch_min(bnb.lower_bound_numerical_, node->lower_bound);
       log.printf("LP returned numerical issue on node %d. Best bound set to %+10.6e.\n",
                  node->node_id,
-                 compute_user_objective(bnb.original_lp_, bnb.lower_bound_ceiling_.load()));
+                 compute_user_objective(bnb.original_lp_, bnb.lower_bound_numerical_.load()));
     }
   }
 
@@ -1106,12 +1115,12 @@ struct deterministic_diving_policy_t
   : deterministic_policy_base_t<i_t, f_t, deterministic_diving_worker_t<i_t, f_t>> {
   using base = deterministic_policy_base_t<i_t, f_t, deterministic_diving_worker_t<i_t, f_t>>;
 
-  std::deque<mip_node_t<i_t, f_t>*>& stack;
+  circular_deque_t<mip_node_t<i_t, f_t>*>& stack;
   i_t max_backtrack_depth;
 
   deterministic_diving_policy_t(branch_and_bound_t<i_t, f_t>& bnb,
                                 deterministic_diving_worker_t<i_t, f_t>& worker,
-                                std::deque<mip_node_t<i_t, f_t>*>& stack,
+                                circular_deque_t<mip_node_t<i_t, f_t>*>& stack,
                                 i_t max_backtrack_depth)
     : base(bnb, worker), stack(stack), max_backtrack_depth(max_backtrack_depth)
   {
@@ -1277,7 +1286,7 @@ std::pair<node_status_t, branch_direction_t> branch_and_bound_t<i_t, f_t>::updat
         policy.select_branch_variable(node_ptr, leaf_fractional, leaf_solution.x);
       round_dir = dir;
 
-      assert(node_ptr->vstatus.size() == leaf_problem.num_cols);
+      assert(worker->leaf_vstatus.size() == leaf_problem.num_cols);
       assert(branch_var >= 0);
       assert(dir != branch_direction_t::NONE);
 
@@ -1291,7 +1300,7 @@ std::pair<node_status_t, branch_direction_t> branch_and_bound_t<i_t, f_t>::updat
                          branch_var,
                          leaf_solution.x[branch_var],
                          num_frac,
-                         node_ptr->vstatus,
+                         worker->leaf_vstatus,
                          leaf_problem,
                          log);
       search_tree.update(node_ptr, node_status_t::HAS_CHILDREN);
@@ -1416,8 +1425,8 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
   }
 #endif
 
-  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-  assert(leaf_vstatus.size() == worker->leaf_problem.num_cols);
+  decompress_vstatus(node_ptr->packed_vstatus, worker->leaf_problem.num_cols, worker->leaf_vstatus);
+  assert(worker->leaf_vstatus.size() == worker->leaf_problem.num_cols);
 
   simplex_solver_settings_t lp_settings = settings_;
   lp_settings.concurrent_halt           = &node_concurrent_halt_;
@@ -1491,7 +1500,7 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
                                                   lp_start_time,
                                                   worker->leaf_problem,
                                                   lp_settings,
-                                                  leaf_vstatus,
+                                                  worker->leaf_vstatus,
                                                   worker->basis_factors,
                                                   worker->basic_list,
                                                   worker->nonbasic_list,
@@ -1509,7 +1518,7 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
                                                    worker->basis_factors,
                                                    worker->basic_list,
                                                    worker->nonbasic_list,
-                                                   leaf_vstatus,
+                                                   worker->leaf_vstatus,
                                                    worker->leaf_edge_norms);
 
         lp_status = convert_lp_status_to_dual_status(second_status);
@@ -1526,11 +1535,19 @@ dual::status_t branch_and_bound_t<i_t, f_t>::solve_node_lp(
 
   return lp_status;
 }
+
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_t>* worker)
+void branch_and_bound_t<i_t, f_t>::plunge_with(bfs_worker_t<i_t, f_t>* worker,
+                                               mip_node_t<i_t, f_t>* start_node)
 {
-  std::deque<mip_node_t<i_t, f_t>*> stack;
-  stack.push_front(worker->start_node);
+  assert(worker != nullptr && worker->is_active.load());
+  assert(start_node != nullptr);
+
+  // Stack holds at most 2 entries: the preferred child + its sibling.
+  // The sibling is evicted to the queue before a new pair of children is added.
+  circular_deque_t<mip_node_t<i_t, f_t>*> stack(2);
+  stack.push_front(start_node);
+
   worker->recompute_basis  = true;
   worker->recompute_bounds = true;
   worker->ensure_orbital_fixing();
@@ -1542,6 +1559,18 @@ void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_
 
   while (stack.size() > 0 && (solver_status_ == mip_status_t::UNSET && is_running_) &&
          rel_gap > settings_.relative_mip_gap_tol && abs_gap > settings_.absolute_mip_gap_tol) {
+    if (worker->worker_id == 0) { repair_heuristic_solutions(); }
+
+    if (worker->total_active_diving_workers < worker->total_max_diving_workers &&
+        worker->node_queue.diving_queue_size() > 0) {
+      launch_diving_worker(worker);
+    }
+
+    if (bfs_worker_pool_.num_idle() > 0 && worker->node_queue.best_first_queue_size() > 0) {
+      launch_bfs_worker(worker);
+    }
+
+    assert(stack.size() <= 2);
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
     stack.pop_front();
     ++exploration_stats_.nodes_being_solved;
@@ -1564,7 +1593,23 @@ void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_
       continue;
     }
 
-    if (toc(exploration_stats_.start_time) > settings_.time_limit) {
+    f_t now = toc(exploration_stats_.start_time);
+
+    if (worker->worker_id == 0) {
+      f_t time_since_last_log =
+        exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
+      i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
+
+      if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
+           time_since_last_log >= 1) ||
+          (time_since_last_log > 30) || now > settings_.time_limit) {
+        report(' ', upper_bound_, lower_bound, node_ptr->depth, node_ptr->integer_infeasible);
+        exploration_stats_.last_log             = tic();
+        exploration_stats_.nodes_since_last_log = 0;
+      }
+    }
+
+    if (now > settings_.time_limit) {
       solver_status_ = mip_status_t::TIME_LIMIT;
       stack.push_front(node_ptr);
       --exploration_stats_.nodes_being_solved;
@@ -1615,22 +1660,22 @@ void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_
       if (stack.size() > 0) {
         mip_node_t<i_t, f_t>* node = stack.back();
         stack.pop_back();
-        node_queue_.push(node);
+        worker->node_queue.push_atomic(node);
       }
 
       exploration_stats_.nodes_unexplored += 2;
 
       if (round_dir == branch_direction_t::UP) {
-        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
-          node_queue_.push(node_ptr->get_down_child());
+        if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) {
+          worker->node_queue.push_atomic(node_ptr->get_down_child());
         } else {
           stack.push_front(node_ptr->get_down_child());
         }
 
         stack.push_front(node_ptr->get_up_child());
       } else {
-        if (node_queue_.best_first_queue_size() < min_node_queue_size_) {
-          node_queue_.push(node_ptr->get_up_child());
+        if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) {
+          worker->node_queue.push_atomic(node_ptr->get_up_child());
         } else {
           stack.push_front(node_ptr->get_up_child());
         }
@@ -1645,28 +1690,144 @@ void branch_and_bound_t<i_t, f_t>::plunge_with(branch_and_bound_worker_t<i_t, f_
     abs_gap     = compute_user_abs_gap(original_lp_, upper_bound, lower_bound);
   }
 
-  lower_bound = get_lower_bound();
-  upper_bound = upper_bound_;
-  rel_gap     = user_relative_gap(original_lp_, upper_bound, lower_bound);
-  abs_gap     = compute_user_abs_gap(original_lp_, upper_bound, lower_bound);
-
   // If the solver exits early without consuming the local stack, or converged according to
   // the gap rules while nodes are still pending, put those nodes back into the global queue
   // before returning.
   while (!stack.empty()) {
     auto node = stack.front();
     stack.pop_front();
-    node_queue_.push(node);
+    worker->node_queue.push_atomic(node);
+  }
+
+  // The worker is no longer exploring the tree. Set its lower bound to infinity to avoid
+  // interfering with the global lower bound calculation.
+  worker->lower_bound = std::numeric_limits<f_t>::infinity();
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::launch_bfs_worker(bfs_worker_t<i_t, f_t>* worker)
+{
+  bfs_worker_t<i_t, f_t>* idle_worker = bfs_worker_pool_.pop_idle_worker();
+  if (!idle_worker) return;
+
+  assert(idle_worker->is_active.load() == false);
+  assert(idle_worker->node_queue.best_first_queue_size() == 0);
+
+  // Pre-emptively set the lower bound of the idle worker for the top of the heap
+  // so it is visible to all workers.
+  idle_worker->lower_bound = worker->node_queue.get_lower_bound();
+  idle_worker->set_active();
+
+  bool success = idle_worker->node_queue.steal_from(worker->node_queue, 1);
+
+  // Update to the actual lower bound of the stolen node (another worker may attempt to
+  // steal the same node at the same time)
+  idle_worker->lower_bound = idle_worker->node_queue.get_lower_bound();
+
+  // If the idle worker is set to active (i.e., its node queue has a valid node),
+  // launch a openmp task to run the best-first search for that worker
+  if (success) {
+#pragma omp task affinity(*idle_worker) priority(CUOPT_CRITICAL_TASK_PRIORITY) default(none) \
+  firstprivate(idle_worker)
+    best_first_search_with(idle_worker);
+  } else {
+    // The idle worker was not successfully initialized. This should occur
+    // rarely or even none at all. Keep here for safety.
+    bfs_worker_pool_.return_worker_to_pool(idle_worker);
+  }
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::work_stealing(bfs_worker_t<i_t, f_t>* worker)
+{
+  i_t nodes_to_steal = settings_.bnb_nodes_per_steal >= 0 ? settings_.bnb_nodes_per_steal
+                                                          : MIP_DEFAULT_NODES_PER_STEAL;
+  i_t max_attempts   = settings_.bnb_max_steal_attempts >= 0 ? settings_.bnb_max_steal_attempts
+                                                             : MIP_DEFAULT_MAX_STEAL_ATTEMPTS;
+  for (i_t i = 0; i < max_attempts; ++i) {
+    i_t victim_id                  = worker->rng.uniform(0, bfs_worker_pool_.size());
+    bfs_worker_t<i_t, f_t>* victim = bfs_worker_pool_[victim_id];
+    if (worker->steal_from(victim, nodes_to_steal)) { break; }
+  }
+}
+
+template <typename i_t, typename f_t>
+void branch_and_bound_t<i_t, f_t>::best_first_search_with(bfs_worker_t<i_t, f_t>* worker)
+{
+  f_t lower_bound = get_lower_bound();
+  f_t abs_gap     = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound);
+  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+  f_t steal_chance =
+    settings_.bnb_steal_chance >= 0 ? settings_.bnb_steal_chance : MIP_DEFAULT_STEAL_CHANCE;
+  node_queue_t<i_t, f_t>& node_queue = worker->node_queue;
+
+  worker->calculate_num_diving_workers(bfs_worker_pool_.size(),
+                                       diving_worker_pool_.size(),
+                                       has_solver_space_incumbent(),
+                                       settings_.diving_settings);
+
+  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
+         rel_gap > settings_.relative_mip_gap_tol && node_queue.best_first_queue_size() > 0) {
+    // If the guided diving was disabled previously due to the lack of an incumbent solution,
+    // re-enable as soon as a new incumbent is found.
+    if (diving_worker_pool_.size() > 0 && settings_.diving_settings.guided_diving != 0 &&
+        worker->max_diving_workers[GUIDED_DIVING] == 0) {
+      if (has_solver_space_incumbent()) {
+        worker->calculate_num_diving_workers(bfs_worker_pool_.size(),
+                                             diving_worker_pool_.size(),
+                                             has_solver_space_incumbent(),
+                                             settings_.diving_settings);
+      }
+    }
+
+    if (toc(exploration_stats_.start_time) > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    }
+
+    // Pre-emptively set the lower bound of the worker
+    worker->lower_bound              = node_queue.get_lower_bound();
+    mip_node_t<i_t, f_t>* start_node = node_queue.pop();
+    if (!start_node) continue;
+    worker->lower_bound = start_node->lower_bound;
+
+    if (upper_bound_.load() < start_node->lower_bound) {
+      // This node was put on the heap earlier but its lower bound is now greater than the
+      // current upper bound
+      search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound);
+      search_tree_.update(start_node, node_status_t::FATHOMED);
+      --exploration_stats_.nodes_unexplored;
+      continue;
+    }
+
+    plunge_with(worker, start_node);
+
+    lower_bound = get_lower_bound();
+    abs_gap     = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound);
+    rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+
+    if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) {
+      node_concurrent_halt_ = 1;
+      solver_status_        = mip_status_t::OPTIMAL;
+      break;
+    }
+
+    // Steal a node with some probability or when it is empty. The victim is determined at random.
+    if (node_queue.best_first_queue_size() == 0 || worker->rng.next_double() < steal_chance) {
+      work_stealing(worker);
+    }
   }
 
-  if (settings_.num_threads > 1) {
-    worker_pool_.return_worker_to_pool(worker);
-    active_workers_per_strategy_[BEST_FIRST]--;
+  // If the worker has still nodes in the queue (this can happen if it was stopped due to
+  // time limit, small gap or other reason), then do not add back to the pool to avoid
+  // constantly trying to start it again
+  if (worker->node_queue.best_first_queue_size() == 0) {
+    bfs_worker_pool_.return_worker_to_pool(worker);
   }
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>* worker)
+void branch_and_bound_t<i_t, f_t>::dive_with(diving_worker_t<i_t, f_t>* worker)
 {
   raft::common::nvtx::range scope("BB::diving_thread");
   if (worker->orbital_fixing) { worker->orbital_fixing->disable(); }
@@ -1680,8 +1841,12 @@ void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>
   worker->recompute_basis  = true;
   worker->recompute_bounds = true;
 
-  search_tree_t<i_t, f_t> dive_tree(std::move(*worker->start_node));
-  std::deque<mip_node_t<i_t, f_t>*> stack;
+  search_tree_t<i_t, f_t> dive_tree(std::move(worker->start_node));
+
+  // Since we are perform a DFS with a limit amount of backtracking, the
+  // stack can hold at most `diving_backtrack_limit` + 2 siblings nodes of the
+  // current level
+  circular_deque_t<mip_node_t<i_t, f_t>*> stack(diving_backtrack_limit + 4);
   stack.push_front(&dive_tree.root);
 
   branch_and_bound_stats_t<i_t, f_t> dive_stats;
@@ -1708,7 +1873,10 @@ void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>
       continue;
     }
 
-    if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; }
+    if (toc(exploration_stats_.start_time) > settings_.time_limit) {
+      solver_status_ = mip_status_t::TIME_LIMIT;
+      break;
+    }
     if (dive_stats.nodes_explored >= diving_node_limit) { break; }
 
     dual::status_t lp_status = solve_node_lp(node_ptr, worker, dive_stats, log);
@@ -1718,7 +1886,6 @@ void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>
       solver_status_ = mip_status_t::TIME_LIMIT;
       break;
     }
-
     if (lp_status == dual::status_t::CONCURRENT_LIMIT) { break; }
     if (lp_status == dual::status_t::ITERATION_LIMIT) { break; }
 
@@ -1737,9 +1904,10 @@ void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>
       }
     }
 
-    // Remove nodes that we no longer can backtrack to (i.e., from the current node, we can only
+    // Remove nodes that we can no longer backtrack to (i.e., from the current node, we can only
     // backtrack to a node that is has a depth of at most 5 levels lower than the current node).
-    if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > diving_backtrack_limit) {
+    while (stack.size() > 1 &&
+           stack.front()->depth - stack.back()->depth > diving_backtrack_limit) {
       stack.pop_back();
     }
 
@@ -1749,220 +1917,69 @@ void branch_and_bound_t<i_t, f_t>::dive_with(branch_and_bound_worker_t<i_t, f_t>
     abs_gap     = compute_user_abs_gap(original_lp_, upper_bound, lower_bound);
   }
 
-  worker_pool_.return_worker_to_pool(worker);
-  active_workers_per_strategy_[search_strategy]--;
+  diving_worker_pool_.return_worker_to_pool(worker);
 }
 
 template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::run_scheduler()
+bool branch_and_bound_t<i_t, f_t>::launch_diving_worker(bfs_worker_t<i_t, f_t>* bfs_worker)
 {
-  diving_heuristics_settings_t<i_t, f_t> diving_settings = settings_.diving_settings;
-  const i_t num_workers                                  = 2 * settings_.num_threads;
-
-  if (!has_solver_space_incumbent()) { diving_settings.guided_diving = false; }
-  std::vector<search_strategy_t> strategies = get_search_strategies(diving_settings);
-  std::array<i_t, num_search_strategies> max_num_workers_per_type =
-    get_max_workers(num_workers, strategies);
-
-  worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, symmetry_, settings_);
-  active_workers_per_strategy_.fill(0);
-
-#ifdef CUOPT_LOG_DEBUG
-  for (auto strategy : strategies) {
-    settings_.log.debug("%c%d: max num of workers = %d",
-                        feasible_solution_symbol(strategy),
-                        strategy,
-                        max_num_workers_per_type[strategy]);
+  // Get an idle worker.
+  diving_worker_t<i_t, f_t>* diving_worker = diving_worker_pool_.pop_idle_worker();
+  if (diving_worker == nullptr) { return false; }
+
+  bool success = bfs_worker->node_queue.diving_init(original_lp_,
+                                                    diving_worker->start_node,
+                                                    diving_worker->start_lower,
+                                                    diving_worker->start_upper,
+                                                    diving_worker->bounds_changed);
+  if (!success) {
+    diving_worker_pool_.return_worker_to_pool(diving_worker);
+    return false;
   }
-#endif
-
-  f_t lower_bound     = get_lower_bound();
-  f_t abs_gap         = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound);
-  f_t rel_gap         = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
-  i_t last_node_depth = 0;
-  i_t last_int_infeas = 0;
-
-  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
-         rel_gap > settings_.relative_mip_gap_tol &&
-         (active_workers_per_strategy_[0] > 0 || node_queue_.best_first_queue_size() > 0)) {
-    bool launched_any_task = false;
-
-    repair_heuristic_solutions();
-
-    // If the guided diving was disabled previously due to the lack of an incumbent solution,
-    // re-enable as soon as a new incumbent is found.
-    if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) {
-      if (has_solver_space_incumbent()) {
-        diving_settings.guided_diving = settings_.diving_settings.guided_diving;
-        strategies                    = get_search_strategies(diving_settings);
-        max_num_workers_per_type      = get_max_workers(num_workers, strategies);
-
-#ifdef CUOPT_LOG_DEBUG
-        for (auto type : strategies) {
-          settings_.log.debug("%c%d: max num of workers = %d",
-                              feasible_solution_symbol(type),
-                              type,
-                              max_num_workers_per_type[type]);
-        }
-#endif
-      }
-    }
-
-    f_t now = toc(exploration_stats_.start_time);
-    f_t time_since_last_log =
-      exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-    i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
-
-    if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-         time_since_last_log >= 1) ||
-        (time_since_last_log > 30) || now > settings_.time_limit) {
-      i_t queue_size = node_queue_.best_first_queue_size();
-      i_t depth      = queue_size > 0 ? node_queue_.bfs_top()->depth : last_node_depth;
-      i_t int_infeas = queue_size > 0 ? node_queue_.bfs_top()->integer_infeasible : last_int_infeas;
-      report(' ', upper_bound_, lower_bound, depth, int_infeas);
-      exploration_stats_.last_log             = tic();
-      exploration_stats_.nodes_since_last_log = 0;
-    }
-
-    if (now > settings_.time_limit) {
-      solver_status_ = mip_status_t::TIME_LIMIT;
-      break;
-    }
-
-    for (auto strategy : strategies) {
-      if (active_workers_per_strategy_[strategy] >= max_num_workers_per_type[strategy]) {
-        continue;
-      }
-
-      // Get an idle worker.
-      branch_and_bound_worker_t<i_t, f_t>* worker = worker_pool_.get_idle_worker();
-      if (worker == nullptr) { break; }
-
-      if (strategy == BEST_FIRST) {
-        // If there any node left in the heap, we pop the top node and explore it.
-        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
-
-        if (!start_node.has_value()) { continue; }
-        if (upper_bound_.load() < start_node.value()->lower_bound) {
-          // This node was put on the heap earlier but its lower bound is now greater than the
-          // current upper bound
-          search_tree_.graphviz_node(
-            settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
-          search_tree_.update(start_node.value(), node_status_t::FATHOMED);
-          continue;
-        }
-
-        // Remove the worker from the idle list.
-        worker_pool_.pop_idle_worker();
-        worker->init_best_first(start_node.value(), original_lp_);
-        last_node_depth = start_node.value()->depth;
-        last_int_infeas = start_node.value()->integer_infeasible;
-        active_workers_per_strategy_[strategy]++;
-        launched_any_task = true;
-
-#pragma omp task affinity(worker) default(none) firstprivate(worker)
-        plunge_with(worker);
-
-      } else {
-        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_diving();
-
-        if (!start_node.has_value()) { continue; }
-        if (upper_bound_.load() < start_node.value()->lower_bound ||
-            start_node.value()->depth < diving_settings.min_node_depth) {
-          continue;
-        }
 
-        bool is_feasible =
-          worker->init_diving(start_node.value(), strategy, original_lp_, settings_);
-        if (!is_feasible) { continue; }
-
-        // Remove the worker from the idle list.
-        worker_pool_.pop_idle_worker();
-        active_workers_per_strategy_[strategy]++;
-        launched_any_task = true;
-
-#pragma omp task affinity(worker) default(none) firstprivate(worker)
-        dive_with(worker);
-      }
-    }
-
-    lower_bound = get_lower_bound();
-    abs_gap     = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound);
-    rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
-
-    if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) {
-      node_concurrent_halt_ = 1;
-      solver_status_        = mip_status_t::OPTIMAL;
-      break;
-    }
-
-    // If no new task was launched in this iteration, suspend temporarily the
-    // execution of the scheduler. As of 8/Jan/2026, GCC does not
-    // implement taskyield, but LLVM does.
-    if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); }
+  if (upper_bound_.load() < diving_worker->start_node.lower_bound ||
+      diving_worker->start_node.depth < settings_.diving_settings.min_node_depth) {
+    diving_worker_pool_.return_worker_to_pool(diving_worker);
+    return false;
   }
-}
-
-template <typename i_t, typename f_t>
-void branch_and_bound_t<i_t, f_t>::single_threaded_solve()
-{
-  raft::common::nvtx::range scope("BB::single_threaded_solve");
-  worker_pool_.init(1, original_lp_, Arow_, var_types_, symmetry_, settings_);
-  branch_and_bound_worker_t<i_t, f_t>* worker = worker_pool_.get_idle_worker();
-
-  f_t lower_bound = get_lower_bound();
-  f_t abs_gap     = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound);
-  f_t rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
 
-  while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol &&
-         rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) {
-    repair_heuristic_solutions();
+  bool is_feasible = diving_worker->presolve_start_bounds(settings_);
+  if (!is_feasible) {
+    diving_worker_pool_.return_worker_to_pool(diving_worker);
+    return false;
+  }
 
-    f_t now = toc(exploration_stats_.start_time);
-    f_t time_since_last_log =
-      exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log);
-    i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log;
-
-    if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) &&
-         time_since_last_log >= 1) ||
-        (time_since_last_log > 30) || now > settings_.time_limit) {
-      i_t depth      = node_queue_.bfs_top()->depth;
-      i_t int_infeas = node_queue_.bfs_top()->integer_infeasible;
-      report(' ', upper_bound_, lower_bound, depth, int_infeas);
-      exploration_stats_.last_log             = tic();
-      exploration_stats_.nodes_since_last_log = 0;
-    }
+  if (toc(exploration_stats_.start_time) > settings_.time_limit ||
+      solver_status_ != mip_status_t::UNSET) {
+    diving_worker_pool_.return_worker_to_pool(diving_worker);
+    return false;
+  }
 
-    if (now > settings_.time_limit) {
-      solver_status_ = mip_status_t::TIME_LIMIT;
-      break;
-    }
+  for (int i = 1; i < num_search_strategies; ++i) {
+    auto strategy = search_strategies[i];
 
-    // If there any node left in the heap, we pop the top node and explore it.
-    std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
+    if (bfs_worker->active_diving_workers[strategy] < bfs_worker->max_diving_workers[strategy]) {
+      diving_worker->search_strategy = strategy;
+      diving_worker->bfs_worker      = bfs_worker;
+      diving_worker->set_active();
+      bfs_worker->active_diving_workers[strategy]++;
+      bfs_worker->total_active_diving_workers++;
 
-    if (!start_node.has_value()) { continue; }
-    if (upper_bound_.load() < start_node.value()->lower_bound) {
-      // This node was put on the heap earlier but its lower bound is now greater than the
-      // current upper bound
-      search_tree_.graphviz_node(
-        settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
-      search_tree_.update(start_node.value(), node_status_t::FATHOMED);
-      continue;
-    }
+      assert(bfs_worker->active_diving_workers[strategy].load() <=
+             bfs_worker->max_diving_workers[strategy]);
+      assert(bfs_worker->total_active_diving_workers.load() <=
+             bfs_worker->total_max_diving_workers);
 
-    worker->init_best_first(start_node.value(), original_lp_);
-    plunge_with(worker);
-
-    lower_bound = get_lower_bound();
-    abs_gap     = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound);
-    rel_gap     = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound);
+#pragma omp task affinity(*diving_worker) priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) \
+  firstprivate(diving_worker)
+      dive_with(diving_worker);
 
-    if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) {
-      solver_status_ = mip_status_t::OPTIMAL;
-      break;
+      return true;
     }
   }
+
+  diving_worker_pool_.return_worker_to_pool(diving_worker);
+  return false;
 }
 
 template <typename i_t, typename f_t>
@@ -1983,7 +2000,7 @@ lp_status_t branch_and_bound_t<i_t, f_t>::solve_root_relaxation(
   lp_status_t root_status;
 
 // Launch a task for solving the root LP relaxation via dual simplex.
-#pragma omp task default(shared) depend(out : root_status)
+#pragma omp task default(shared) depend(out : root_status) priority(CUOPT_CRITICAL_TASK_PRIORITY)
   {
     root_status = solve_linear_program_with_advanced_basis(original_lp_,
                                                            exploration_stats_.start_time,
@@ -2456,7 +2473,8 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     tolerances_for_clique.absolute_mip_gap            = settings_.absolute_mip_gap_tol;
     tolerances_for_clique.relative_mip_gap            = settings_.relative_mip_gap_tol;
 
-#pragma omp task depend(out : *clique_signal) firstprivate(tolerances_for_clique)
+#pragma omp task priority(CUOPT_DEFAULT_TASK_PRIORITY) depend(out : *clique_signal) \
+  firstprivate(tolerances_for_clique)
     {
       user_problem_t<i_t, f_t> problem_copy = original_problem_;
       timer_t timer(std::numeric_limits<double>::infinity());
@@ -2576,14 +2594,13 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     return mip_status_t::OPTIMAL;
   }
 
-  is_running_          = true;
-  lower_bound_ceiling_ = inf;
+  is_running_            = true;
+  lower_bound_numerical_ = inf;
 
   if (num_fractional != 0 && settings_.max_cut_passes > 0) {
     settings_.log.printf(
-      " | Explored | Unexplored |    Objective    |     Bound     | IntInf | Depth | Iter/Node |   "
-      "Gap    "
-      "|  Time  |\n");
+      " | Explored | Unexplored |    Objective    |     Bound     | IntInf | Depth | Iter/Node | "
+      "Gap    |  Time  |\n");
   }
 
   cut_pool_t<i_t, f_t> cut_pool(original_lp_.num_cols, settings_);
@@ -2621,7 +2638,7 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
       if (settings_.deterministic) {
         queue_external_solution_deterministic(user_assignment, work_units);
       } else {
-        set_new_solution(user_assignment);
+        set_solution_from_heuristics(user_assignment);
       }
     };
   auto stop_root_cut_cpufj = [&]() {
@@ -2643,7 +2660,8 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
 
     cut_pass_result_t cut_pass_result;
     if (root_cut_cpufj_task) {
-#pragma omp task shared(root_cut_cpufj_task) default(none) depend(out : *root_cut_cpufj_task)
+#pragma omp task shared(root_cut_cpufj_task) priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) \
+  depend(out : *root_cut_cpufj_task)
       detail::run_fj_cpu_task(*root_cut_cpufj_task,
                               std::numeric_limits<f_t>::infinity(),
                               std::numeric_limits<f_t>::infinity());
@@ -2821,8 +2839,6 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
                       root_vstatus_,
                       original_lp_,
                       log);
-  node_queue_.push(search_tree_.root.get_down_child());
-  node_queue_.push(search_tree_.root.get_up_child());
 
   if (symmetry_ != nullptr) {
     i_t removed =
@@ -2843,11 +2859,12 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   exploration_stats_.nodes_unexplored     = 2;
   exploration_stats_.nodes_since_last_log = 0;
   exploration_stats_.last_log             = tic();
-  min_node_queue_size_                    = 2 * settings_.num_threads;
+  min_node_queue_size_                    = 20;
 
   if (settings_.diving_settings.coefficient_diving != 0) {
     calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
   }
+
   if (settings_.deterministic) {
     settings_.log.printf(
       " | Explored | Unexplored |    Objective    |     Bound     | IntInf | Depth | Iter/Node "
@@ -2862,10 +2879,29 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
   {
     if (settings_.deterministic) {
       run_deterministic_coordinator(Arow_);
-    } else if (settings_.num_threads > 1) {
-      run_scheduler();
     } else {
-      single_threaded_solve();
+      const i_t num_workers        = settings_.num_threads;
+      const i_t num_bfs_workers    = std::max(settings_.num_threads / 2, 1);
+      const i_t num_diving_workers = num_workers - num_bfs_workers;
+      bfs_worker_pool_.init(num_bfs_workers, original_lp_, Arow_, var_types_, symmetry_, settings_);
+
+      if (num_diving_workers > 0) {
+        diving_worker_pool_.init(num_diving_workers,
+                                 original_lp_,
+                                 Arow_,
+                                 var_types_,
+                                 symmetry_,
+                                 settings_,
+                                 num_bfs_workers);
+      }
+
+      bfs_worker_t<i_t, f_t>* initial_worker = bfs_worker_pool_.pop_idle_worker();
+      node_queue_t<i_t, f_t>& node_queue     = initial_worker->node_queue;
+      node_queue.push_lockfree(search_tree_.root.get_down_child());
+      node_queue.push_lockfree(search_tree_.root.get_up_child());
+      initial_worker->lower_bound = initial_worker->node_queue.get_lower_bound();
+      initial_worker->set_active();
+      best_first_search_with(initial_worker);
     }
   }  // Implicit barrier for all tasks created within the group (RINS, B&B workers)
 
@@ -2877,31 +2913,28 @@ mip_status_t branch_and_bound_t<i_t, f_t>::solve(mip_solution_t<i_t, f_t>& solut
     lower_bound    = deterministic_compute_lower_bound();
     solver_status_ = deterministic_global_termination_status_;
   } else {
-    if (node_queue_.best_first_queue_size() > 0) {
+    lower_bound = lower_bound_numerical_;
+
+    for (int i = 0; i < bfs_worker_pool_.size(); ++i) {
+      bfs_worker_t<i_t, f_t>* worker = bfs_worker_pool_[i];
+
       // We need to clear the queue and use the info in the search tree for the lower bound
-      while (node_queue_.best_first_queue_size() > 0) {
-        std::optional<mip_node_t<i_t, f_t>*> start_node = node_queue_.pop_best_first();
-
-        if (!start_node.has_value()) { continue; }
-        if (upper_bound_.load() < start_node.value()->lower_bound) {
-          // This node was put on the heap earlier but its lower bound is now greater than the
-          // current upper bound
-          search_tree_.graphviz_node(
-            settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound);
-          search_tree_.update(start_node.value(), node_status_t::FATHOMED);
-          continue;
-        } else {
-          node_queue_.push(
-            start_node.value());  // Needed to ensure we don't lose the correct lower bound
-          break;
-        }
+      while (worker->node_queue.best_first_queue_size() > 0 &&
+             worker->node_queue.get_lower_bound() > upper_bound_.load()) {
+        mip_node_t<i_t, f_t>* start_node = worker->node_queue.pop();
+        // This node was put on the heap earlier but its lower bound is now greater than the
+        // current upper bound
+        search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound);
+        search_tree_.update(start_node, node_status_t::FATHOMED);
+        --exploration_stats_.nodes_unexplored;
       }
-      lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound()
-                                                            : search_tree_.root.lower_bound;
-    } else {
-      lower_bound = search_tree_.root.lower_bound;
+
+      lower_bound = std::min(lower_bound, worker->node_queue.get_lower_bound());
     }
+
+    if (!std::isfinite(lower_bound)) { lower_bound = search_tree_.root.lower_bound; }
   }
+
   set_final_solution(solution, lower_bound);
   return solver_status_;
 }
@@ -3030,17 +3063,9 @@ void branch_and_bound_t<i_t, f_t>::run_deterministic_coordinator(const csr_matri
   deterministic_horizon_step_ = 0.50;
 
   // Compute worker counts using the same formula as reliability-branching scheduler
-  const i_t num_workers = settings_.num_threads;
-  std::vector<search_strategy_t> search_strategies =
-    get_search_strategies(settings_.diving_settings);
-  std::array<i_t, num_search_strategies> max_num_workers =
-    get_max_workers(num_workers, search_strategies);
-
-  const int num_bfs_workers = max_num_workers[search_strategy_t::BEST_FIRST];
-  int num_diving_workers    = 0;
-  for (size_t i = 1; i < search_strategies.size(); ++i) {
-    num_diving_workers += max_num_workers[search_strategies[i]];
-  }
+  const i_t num_workers        = settings_.num_threads;
+  const i_t num_bfs_workers    = std::max(num_workers / 2, 1);
+  const i_t num_diving_workers = num_workers - num_bfs_workers;
 
   deterministic_mode_enabled_              = true;
   deterministic_current_horizon_           = deterministic_horizon_step_;
@@ -3052,8 +3077,8 @@ void branch_and_bound_t<i_t, f_t>::run_deterministic_coordinator(const csr_matri
 
   if (num_diving_workers > 0) {
     // Extract diving types from search_strategies (skip BEST_FIRST at index 0)
-    std::vector<search_strategy_t> diving_types(search_strategies.begin() + 1,
-                                                search_strategies.end());
+    std::vector<search_strategy_t> diving_types(search_strategies + 1,
+                                                search_strategies + num_search_strategies);
 
     if (settings_.diving_settings.coefficient_diving != 0) {
       calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_);
@@ -3443,10 +3468,10 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node_deterministic(
 
   // Solve LP relaxation
   worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols);
-  std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-  i_t node_iter                                = 0;
-  f_t lp_start_time                            = tic();
-  std::vector<f_t> leaf_edge_norms             = edge_norms_;
+  decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols, worker.leaf_vstatus);
+  i_t node_iter                    = 0;
+  f_t lp_start_time                = tic();
+  std::vector<f_t> leaf_edge_norms = edge_norms_;
 
   dual::status_t lp_status = dual_phase2_with_advanced_basis(2,
                                                              0,
@@ -3454,7 +3479,7 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node_deterministic(
                                                              lp_start_time,
                                                              worker.leaf_problem,
                                                              lp_settings,
-                                                             leaf_vstatus,
+                                                             worker.leaf_vstatus,
                                                              worker.basis_factors,
                                                              worker.basic_list,
                                                              worker.nonbasic_list,
@@ -3472,7 +3497,7 @@ node_status_t branch_and_bound_t<i_t, f_t>::solve_node_deterministic(
                                                                          worker.basis_factors,
                                                                          worker.basic_list,
                                                                          worker.nonbasic_list,
-                                                                         leaf_vstatus,
+                                                                         worker.leaf_vstatus,
                                                                          leaf_edge_norms,
                                                                          &worker.work_context);
     lp_status                 = convert_lp_status_to_dual_status(second_status);
@@ -3722,7 +3747,7 @@ void branch_and_bound_t<i_t, f_t>::deterministic_sort_replay_events(
   deterministic_merge_pseudo_cost_updates(*deterministic_workers_);
 
   for (const auto& worker : *deterministic_workers_) {
-    fetch_min(lower_bound_ceiling_, worker.local_lower_bound_ceiling);
+    fetch_min(lower_bound_numerical_, worker.local_lower_bound_ceiling);
   }
 }
 
@@ -3826,7 +3851,7 @@ template <typename i_t, typename f_t>
 f_t branch_and_bound_t<i_t, f_t>::deterministic_compute_lower_bound()
 {
   // Compute lower bound from BFS worker local structures only
-  f_t lower_bound = lower_bound_ceiling_.load();
+  f_t lower_bound = lower_bound_numerical_.load();
 
   // Check all BFS worker queues
   for (const auto& worker : *deterministic_workers_) {
@@ -3930,8 +3955,10 @@ void branch_and_bound_t<i_t, f_t>::deterministic_assign_diving_nodes()
       continue;             // this worker is full, try next one
     }
 
-    auto entry = diving_heap_.pop();
-    if (entry.has_value()) { worker.enqueue_dive_node(entry.value().node, original_lp_); }
+    if (!diving_heap_.empty()) {
+      auto entry = diving_heap_.pop();
+      worker.enqueue_dive_node(entry.node, original_lp_);
+    }
   }
 
   diving_heap_.clear();
@@ -3981,11 +4008,6 @@ void branch_and_bound_t<i_t, f_t>::deterministic_dive(
 {
   raft::common::nvtx::range scope("BB::deterministic_dive");
 
-  // Create local search tree for the dive
-  search_tree_t<i_t, f_t> dive_tree(std::move(entry.node));
-  std::deque<mip_node_t<i_t, f_t>*> stack;
-  stack.push_front(&dive_tree.root);
-
   worker.dive_lower = std::move(entry.resolved_lower);
   worker.dive_upper = std::move(entry.resolved_upper);
 
@@ -3995,6 +4017,11 @@ void branch_and_bound_t<i_t, f_t>::deterministic_dive(
   worker.lp_iters_this_dive         = 0;
   worker.recompute_bounds_and_basis = true;
 
+  // Create local search tree for the dive
+  search_tree_t<i_t, f_t> dive_tree(std::move(entry.node));
+  circular_deque_t<mip_node_t<i_t, f_t>*> stack(2 * max_backtrack_depth + 4);
+  stack.push_front(&dive_tree.root);
+
   while (!stack.empty() && deterministic_global_termination_status_ == mip_status_t::UNSET &&
          nodes_this_dive < max_nodes_per_dive) {
     mip_node_t<i_t, f_t>* node_ptr = stack.front();
@@ -4055,18 +4082,18 @@ void branch_and_bound_t<i_t, f_t>::deterministic_dive(
 
     // Solve LP relaxation
     worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols);
-    std::vector<variable_status_t>& leaf_vstatus = node_ptr->vstatus;
-    i_t node_iter                                = 0;
-    f_t lp_start_time                            = tic();
-    std::vector<f_t> leaf_edge_norms             = edge_norms_;
+    i_t node_iter                    = 0;
+    f_t lp_start_time                = tic();
+    std::vector<f_t> leaf_edge_norms = edge_norms_;
 
+    decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols, worker.leaf_vstatus);
     dual::status_t lp_status = dual_phase2_with_advanced_basis(2,
                                                                0,
                                                                worker.recompute_bounds_and_basis,
                                                                lp_start_time,
                                                                worker.leaf_problem,
                                                                lp_settings,
-                                                               leaf_vstatus,
+                                                               worker.leaf_vstatus,
                                                                worker.basis_factors,
                                                                worker.basic_list,
                                                                worker.nonbasic_list,
@@ -4083,7 +4110,7 @@ void branch_and_bound_t<i_t, f_t>::deterministic_dive(
                                                                            worker.basis_factors,
                                                                            worker.basic_list,
                                                                            worker.nonbasic_list,
-                                                                           leaf_vstatus,
+                                                                           worker.leaf_vstatus,
                                                                            leaf_edge_norms,
                                                                            &worker.work_context);
       lp_status                 = convert_lp_status_to_dual_status(second_status);
diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp
index 839335c8c9..0b32b2ece5 100644
--- a/cpp/src/branch_and_bound/branch_and_bound.hpp
+++ b/cpp/src/branch_and_bound/branch_and_bound.hpp
@@ -112,7 +112,7 @@ class branch_and_bound_t {
   }
 
   // Set a solution based on the user problem during the course of the solve
-  void set_new_solution(const std::vector<f_t>& solution);
+  void set_solution_from_heuristics(const std::vector<f_t>& solution);
 
   // This queues the solution to be processed at the correct work unit timestamp
   void queue_external_solution_deterministic(const std::vector<f_t>& solution, double work_unit_ts);
@@ -239,18 +239,14 @@ class branch_and_bound_t {
   // Pseudocosts
   pseudo_costs_t<i_t, f_t> pc_;
 
-  // Heap storing the nodes waiting to be explored.
-  node_queue_t<i_t, f_t> node_queue_;
-
   // Search tree
   search_tree_t<i_t, f_t> search_tree_;
 
-  // Count the number of workers per type that either are being executed or
-  // are waiting to be executed.
-  std::array<omp_atomic_t<i_t>, num_search_strategies> active_workers_per_strategy_;
+  // Worker pool dedicated to the best-first search
+  bfs_worker_pool_t<i_t, f_t> bfs_worker_pool_;
 
-  // Worker pool
-  branch_and_bound_worker_pool_t<i_t, f_t> worker_pool_;
+  // Worker pool dedicated to diving
+  diving_worker_pool_t<i_t, f_t> diving_worker_pool_;
 
   // Global status of the solver.
   omp_atomic_t<mip_status_t> solver_status_;
@@ -262,8 +258,9 @@ class branch_and_bound_t {
   i_t min_node_queue_size_;
 
   // In case, a best-first thread encounters a numerical issue when solving a node,
-  // its blocks the progression of the lower bound.
-  omp_atomic_t<f_t> lower_bound_ceiling_;
+  // its blocks the progression of the lower bound as it cannot explore the
+  // corresponding subtree.
+  omp_atomic_t<f_t> lower_bound_numerical_;
   std::function<void(f_t)> user_bound_callback_;
 
   void report_heuristic(f_t obj);
@@ -317,22 +314,26 @@ class branch_and_bound_t {
   // Repairs low-quality solutions from the heuristics, if it is applicable.
   void repair_heuristic_solutions();
 
+  // Launch a new diving worker from a given best-first worker.
+  bool launch_diving_worker(bfs_worker_t<i_t, f_t>* bfs_worker);
+
+  // Launch a new best-first worker from a given bfs worker.
+  void launch_bfs_worker(bfs_worker_t<i_t, f_t>* worker);
+
+  // Perform best-first search with a given bfs worker.
+  void best_first_search_with(bfs_worker_t<i_t, f_t>* worker);
+
   // We use best-first to pick the `start_node` and then perform a depth-first search
   // from this node (i.e., a plunge). It can only backtrack to a sibling node.
   // Unexplored nodes in the subtree are inserted back into the global heap.
-  void plunge_with(branch_and_bound_worker_t<i_t, f_t>* worker);
+  void plunge_with(bfs_worker_t<i_t, f_t>* worker, mip_node_t<i_t, f_t>* start_node);
+
+  // A worker attempts to steal nodes from another worker
+  void work_stealing(bfs_worker_t<i_t, f_t>* worker);
 
   // Perform a deep dive in the subtree determined by the `start_node` in order
   // to find integer feasible solutions.
-  void dive_with(branch_and_bound_worker_t<i_t, f_t>* worker);
-
-  // Run the scheduler whose will schedule and manage
-  // all the other workers.
-  void run_scheduler();
-
-  // Run the branch-and-bound algorithm in single threaded mode.
-  // This disable all diving heuristics.
-  void single_threaded_solve();
+  void dive_with(diving_worker_t<i_t, f_t>* worker);
 
   // Solve the LP relaxation of a leaf node
   dual::status_t solve_node_lp(mip_node_t<i_t, f_t>* node_ptr,
diff --git a/cpp/src/branch_and_bound/constants.hpp b/cpp/src/branch_and_bound/constants.hpp
index 39bfa0bf3a..fed422ef89 100644
--- a/cpp/src/branch_and_bound/constants.hpp
+++ b/cpp/src/branch_and_bound/constants.hpp
@@ -24,6 +24,9 @@ enum search_strategy_t : int {
   COEFFICIENT_DIVING = 4   // Coefficient diving (9.2.1)
 };
 
+constexpr search_strategy_t search_strategies[] = {
+  BEST_FIRST, PSEUDOCOST_DIVING, LINE_SEARCH_DIVING, GUIDED_DIVING, COEFFICIENT_DIVING};
+
 enum class branch_direction_t { NONE = -1, DOWN = 0, UP = 1 };
 
 enum class branch_and_bound_mode_t { PARALLEL = 0, DETERMINISTIC = 1 };
diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp
index 88c4f0e940..fb1c0f450f 100644
--- a/cpp/src/branch_and_bound/deterministic_workers.hpp
+++ b/cpp/src/branch_and_bound/deterministic_workers.hpp
@@ -195,8 +195,8 @@ class deterministic_bfs_worker_t
       plunge_stack.pop_front();
       return node;
     }
-    auto node_opt = backlog.pop();
-    return node_opt.has_value() ? node_opt.value() : nullptr;
+
+    return !backlog.empty() ? backlog.pop() : nullptr;
   }
 
   size_t queue_size() const
diff --git a/cpp/src/branch_and_bound/diving_heuristics.cpp b/cpp/src/branch_and_bound/diving_heuristics.cpp
index a0bb731c1e..4989e48c6a 100644
--- a/cpp/src/branch_and_bound/diving_heuristics.cpp
+++ b/cpp/src/branch_and_bound/diving_heuristics.cpp
@@ -92,7 +92,7 @@ branch_variable_t<i_t> pseudocost_diving(pseudo_costs_t<i_t, f_t>& pc,
     f_t score              = 0;
     branch_direction_t dir = branch_direction_t::DOWN;
 
-    f_t root_val = (j < static_cast<i_t>(root_solution.size())) ? root_solution[j] : solution[j];
+    f_t root_val = root_solution[j];
 
     if (solution[j] < root_val - f_t(0.4)) {
       score = score_down;
diff --git a/cpp/src/branch_and_bound/mip_node.hpp b/cpp/src/branch_and_bound/mip_node.hpp
index 39150eb4fe..2c0968ffce 100644
--- a/cpp/src/branch_and_bound/mip_node.hpp
+++ b/cpp/src/branch_and_bound/mip_node.hpp
@@ -91,7 +91,7 @@ class mip_node_t {
       branch_var_upper(std::numeric_limits<f_t>::infinity()),
       fractional_val(std::numeric_limits<f_t>::infinity()),
       objective_estimate(std::numeric_limits<f_t>::infinity()),
-      vstatus(0)
+      packed_vstatus(0)
   {
     children[0] = nullptr;
     children[1] = nullptr;
@@ -107,7 +107,7 @@ class mip_node_t {
       branch_dir(branch_direction_t::NONE),
       integer_infeasible(-1),
       objective_estimate(std::numeric_limits<f_t>::infinity()),
-      vstatus(basis)
+      packed_vstatus(compress_vstatus(basis))
   {
     children[0] = nullptr;
     children[1] = nullptr;
@@ -131,7 +131,7 @@ class mip_node_t {
       fractional_val(branch_var_value),
       integer_infeasible(integer_inf),
       objective_estimate(parent_node->objective_estimate),
-      vstatus(basis)
+      packed_vstatus(compress_vstatus(basis))
   {
     branch_var_lower = branch_direction == branch_direction_t::DOWN ? problem.lower[branch_var]
                                                                     : std::ceil(branch_var_value);
@@ -186,7 +186,7 @@ class mip_node_t {
     children[0] = std::move(down_child);
     children[1] = std::move(up_child);
     // When we add children we no longer need to store our basis
-    vstatus.clear();
+    packed_vstatus = {};
   }
 
   bool is_inactive() const
@@ -274,7 +274,7 @@ class mip_node_t {
   // This method creates a copy of the current node
   // with its parent set to `nullptr`
   // This detaches the node from the tree.
-  mip_node_t<i_t, f_t> detach_copy() const
+  mip_node_t detach_copy() const
   {
     mip_node_t<i_t, f_t> copy;
     copy.lower_bound        = lower_bound;
@@ -282,7 +282,7 @@ class mip_node_t {
     copy.depth              = depth;
     copy.node_id            = node_id;
     copy.integer_infeasible = integer_infeasible;
-    copy.vstatus            = vstatus;
+    copy.packed_vstatus     = packed_vstatus;
     copy.branch_var         = branch_var;
     copy.branch_dir         = branch_dir;
     copy.branch_var_lower   = branch_var_lower;
@@ -315,7 +315,11 @@ class mip_node_t {
   mip_node_t<i_t, f_t>* parent;
   std::unique_ptr<mip_node_t> children[2];
 
-  std::vector<variable_status_t> vstatus;
+  std::vector<uint8_t> packed_vstatus;
+
+  // Indicate if we can dive from this node or not. This is set to false when
+  // this node was already selected for diving once.
+  omp_atomic_t<bool> can_dive{true};
 
   // Cumulative orbital fixing bound changes from root to this node.
   // Stored so that when a child starts a new plunge, the parent's
diff --git a/cpp/src/branch_and_bound/node_queue.hpp b/cpp/src/branch_and_bound/node_queue.hpp
index 09d030c96e..b07b371b81 100644
--- a/cpp/src/branch_and_bound/node_queue.hpp
+++ b/cpp/src/branch_and_bound/node_queue.hpp
@@ -10,7 +10,6 @@
 #include <algorithm>
 #include <memory>
 #include <mutex>
-#include <optional>
 #include <utility>
 #include <vector>
 
@@ -29,12 +28,14 @@ class heap_t {
   {
     buffer.push_back(node);
     std::push_heap(buffer.begin(), buffer.end(), comp);
+    ++num_entries_;
   }
 
   void push(T&& node)
   {
     buffer.push_back(std::move(node));
     std::push_heap(buffer.begin(), buffer.end(), comp);
+    ++num_entries_;
   }
 
   template <typename... Args>
@@ -42,39 +43,125 @@ class heap_t {
   {
     buffer.emplace_back(std::forward<Args>(args)...);
     std::push_heap(buffer.begin(), buffer.end(), comp);
+    ++num_entries_;
+    assert(num_entries_.load() == buffer.size());
   }
 
-  std::optional<T> pop()
+  T pop()
   {
-    if (buffer.empty()) return std::nullopt;
-
+    --num_entries_;
     std::pop_heap(buffer.begin(), buffer.end(), comp);
     T node = std::move(buffer.back());
     buffer.pop_back();
+    assert(num_entries_.load() == buffer.size());
     return node;
   }
 
-  size_t size() const { return buffer.size(); }
+  size_t size() const { return num_entries_; }
   T& top() { return buffer.front(); }
-  void clear() { buffer.clear(); }
-  bool empty() const { return buffer.empty(); }
+
+  void clear()
+  {
+    buffer.clear();
+    num_entries_ = 0;
+  }
+
+  bool empty() const { return num_entries_ == 0; }
 
   // Read-only access to underlying buffer for iteration without modification
   const std::vector<T>& data() const { return buffer; }
 
  private:
   std::vector<T> buffer;
+  omp_atomic_t<size_t> num_entries_{0};
   Comp comp;
 };
 
-// A queue storing the nodes waiting to be explored/dived from.
+// A queue storing the nodes waiting to be explored.
 template <typename i_t, typename f_t>
 class node_queue_t {
+ public:
+  void push_atomic(mip_node_t<i_t, f_t>* new_node)
+  {
+    std::lock_guard lock(mutex_);
+    push_lockfree(new_node);
+  }
+
+  void push_lockfree(mip_node_t<i_t, f_t>* new_node)
+  {
+    assert(new_node != nullptr);
+    auto entry = std::make_shared<heap_entry_t>(new_node);
+    best_first_heap_.push(entry);
+    if (new_node->can_dive) diving_heap_.push(entry);
+    lower_bound_ = best_first_heap_.top()->lower_bound;
+  }
+
+  mip_node_t<i_t, f_t>* pop()
+  {
+    std::lock_guard lock(mutex_);
+    if (best_first_heap_.empty()) { return nullptr; }
+    auto entry                 = best_first_heap_.pop();
+    lower_bound_               = best_first_heap_.empty() ? std::numeric_limits<f_t>::infinity()
+                                                          : best_first_heap_.top()->lower_bound;
+    mip_node_t<i_t, f_t>* node = std::exchange(entry->node, nullptr);
+    assert(node != nullptr);
+    return node;
+  }
+
+  bool diving_init(const lp_problem_t<i_t, f_t>& lp,
+                   mip_node_t<i_t, f_t>& start_node,
+                   std::vector<f_t>& start_lower,
+                   std::vector<f_t>& start_upper,
+                   std::vector<bool>& bounds_changed)
+  {
+    std::lock_guard lock(mutex_);
+
+    auto node = pop_diving();
+    if (!node) return false;
+
+    start_node  = node->detach_copy();
+    start_lower = lp.lower;
+    start_upper = lp.upper;
+    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
+    node->get_variable_bounds(start_lower, start_upper, bounds_changed);
+    return true;
+  }
+
+  bool steal_from(node_queue_t& victim, i_t nodes_to_steal)
+  {
+    assert(this != &victim);
+
+    bool steal = false;
+    std::scoped_lock lock(mutex_, victim.mutex_);
+
+    for (i_t k = 0; k < nodes_to_steal; ++k) {
+      if (victim.best_first_heap_.size() < nodes_to_steal) break;
+      auto entry = victim.best_first_heap_.pop();
+
+      // Invalidate the node for diving
+      mip_node_t<i_t, f_t>* node = std::exchange(entry->node, nullptr);
+      push_lockfree(node);
+      victim.lower_bound_ = victim.best_first_heap_.empty()
+                              ? std::numeric_limits<f_t>::infinity()
+                              : victim.best_first_heap_.top()->lower_bound;
+      steal               = true;
+    }
+    return steal;
+  }
+
+  i_t diving_queue_size() { return diving_heap_.size(); }
+  i_t best_first_queue_size() { return best_first_heap_.size(); }
+
+  f_t get_lower_bound()
+  {
+    return best_first_heap_.empty() ? std::numeric_limits<f_t>::infinity() : lower_bound_.load();
+  }
+
  private:
   struct heap_entry_t {
     mip_node_t<i_t, f_t>* node = nullptr;
-    f_t lower_bound            = -inf;
-    f_t score                  = inf;
+    f_t lower_bound            = -std::numeric_limits<f_t>::infinity();
+    f_t score                  = std::numeric_limits<f_t>::infinity();
 
     heap_entry_t(mip_node_t<i_t, f_t>* new_node)
       : node(new_node), lower_bound(new_node->lower_bound), score(new_node->objective_estimate)
@@ -102,67 +189,24 @@ class node_queue_t {
     }
   };
 
-  heap_t<std::shared_ptr<heap_entry_t>, lower_bound_comp> best_first_heap;
-  heap_t<std::shared_ptr<heap_entry_t>, score_comp> diving_heap;
-  omp_mutex_t mutex;
-
- public:
-  void push(mip_node_t<i_t, f_t>* new_node)
-  {
-    std::lock_guard<omp_mutex_t> lock(mutex);
-    auto entry = std::make_shared<heap_entry_t>(new_node);
-    best_first_heap.push(entry);
-    diving_heap.push(entry);
-  }
-
-  std::optional<mip_node_t<i_t, f_t>*> pop_best_first()
-  {
-    std::lock_guard<omp_mutex_t> lock(mutex);
-    auto entry = best_first_heap.pop();
-
-    if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); }
-
-    return std::nullopt;
-  }
-
-  std::optional<mip_node_t<i_t, f_t>*> pop_diving()
+  mip_node_t<i_t, f_t>* pop_diving()
   {
-    std::lock_guard<omp_mutex_t> lock(mutex);
-
-    while (!diving_heap.empty()) {
-      auto entry = diving_heap.pop();
-
-      if (entry.has_value()) {
-        if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; }
+    while (!diving_heap_.empty()) {
+      auto entry = diving_heap_.pop();
+      if (entry->node != nullptr) {
+        entry->node->can_dive = false;
+        return entry->node;
       }
     }
 
-    return std::nullopt;
+    return nullptr;
   }
 
-  i_t diving_queue_size()
-  {
-    std::lock_guard<omp_mutex_t> lock(mutex);
-    return diving_heap.size();
-  }
-
-  i_t best_first_queue_size()
-  {
-    std::lock_guard<omp_mutex_t> lock(mutex);
-    return best_first_heap.size();
-  }
+  heap_t<std::shared_ptr<heap_entry_t>, lower_bound_comp> best_first_heap_;
+  heap_t<std::shared_ptr<heap_entry_t>, score_comp> diving_heap_;
+  omp_mutex_t mutex_;
 
-  f_t get_lower_bound()
-  {
-    std::lock_guard<omp_mutex_t> lock(mutex);
-    return best_first_heap.empty() ? inf : best_first_heap.top()->lower_bound;
-  }
-
-  mip_node_t<i_t, f_t>* bfs_top()
-  {
-    std::lock_guard<omp_mutex_t> lock(mutex);
-    return best_first_heap.empty() ? nullptr : best_first_heap.top()->node;
-  }
+  omp_atomic_t<f_t> lower_bound_{std::numeric_limits<f_t>::infinity()};
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp
index e71ec987d3..fc3ee3bb3f 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.cpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.cpp
@@ -1080,7 +1080,7 @@ void strong_branching_reduced(const lp_problem_t<i_t, f_t>& original_lp,
                                           strong_branch_up);
   } else {
     if (effective_batch_pdlp != 0) {
-#pragma omp task default(shared)
+#pragma omp task default(shared) priority(CUOPT_HIGH_TASK_PRIORITY)
       batch_pdlp_strong_branching_task(settings,
                                        effective_batch_pdlp,
                                        start_time,
@@ -1100,10 +1100,9 @@ void strong_branching_reduced(const lp_problem_t<i_t, f_t>& original_lp,
       i_t n = std::min<i_t>(4 * settings.num_threads, fractional.size());
 // Here we are creating more tasks than the number of threads
 // such that they can be scheduled dynamically to the threads.
-#pragma omp taskloop num_tasks(n) default(shared)
+#pragma omp taskloop num_tasks(n) default(shared) priority(CUOPT_DEFAULT_TASK_PRIORITY)
       for (i_t k = 0; k < n; ++k) {
-        i_t start = std::floor(k * fractional.size() / n);
-        i_t end   = std::floor((k + 1) * fractional.size() / n);
+        auto [start, end] = calculate_index_range(k, fractional.size(), n);
 
         constexpr bool verbose = false;
         if (verbose) {
@@ -1506,8 +1505,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
 
   // If `reliable_threshold == 0`, then we set the uninitialized pseudocosts to the average.
   // Otherwise, the best ones are initialized via strong branching, while the other are ignored.  //
-  // In the latter, we are not using the average pseudocost (which calculated in the `initialized`
-  // method).
+  // So we only need to initialize the average for the former.
   if (reliable_threshold == 0) {
     avg_down = compute_pseudocost_average_down();
     avg_up   = compute_pseudocost_average_up();
@@ -1613,14 +1611,12 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
       min_percent_solved_by_batch_pdlp_at_root_for_pdlp);
   }
 
-  const int num_tasks     = std::max(max_num_tasks, 1);
-  const int task_priority = reliability_branching_settings.task_priority;
+  const int num_tasks = std::max(max_num_tasks, 1);
   // If both batch PDLP and DS are used we double the max number of candidates
   const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates
                                           : reliability_branching_settings.max_num_candidates;
   const i_t num_candidates     = std::min<size_t>(unreliable_list.size(), max_num_candidates);
 
-  assert(task_priority > 0);
   assert(max_num_candidates > 0);
   assert(num_candidates > 0);
   assert(num_tasks > 0);
@@ -1662,7 +1658,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
             single_pivot_objective_change_estimate(worker->leaf_problem,
                                                    settings,
                                                    local_Arow,
-                                                   node_ptr->vstatus,
+                                                   worker->leaf_vstatus,
                                                    j,
                                                    basic_map[j],
                                                    leaf_solution,
@@ -1714,7 +1710,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   std::atomic<int> concurrent_halt{0};
 
   if (use_pdlp) {
-#pragma omp task default(shared)
+#pragma omp task default(shared) priority(CUOPT_HIGH_TASK_PRIORITY)
     batch_pdlp_reliability_branching_task(settings.log,
                                           rb_mode,
                                           num_candidates,
@@ -1749,7 +1745,8 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
   f_t dual_simplex_start_time = tic();
 
   if (rb_mode != 2) {
-#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) default(shared)
+#pragma omp taskloop if (num_tasks > 1) priority(CUOPT_HIGH_TASK_PRIORITY) \
+  num_tasks(num_tasks) default(shared)
     for (i_t i = 0; i < num_candidates; ++i) {
       auto [score, j] = unreliable_list[i];
 
@@ -1766,7 +1763,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
           const auto [obj, status] = trial_branching(worker->leaf_problem,
                                                      settings,
                                                      var_types,
-                                                     node_ptr->vstatus,
+                                                     worker->leaf_vstatus,
                                                      worker->leaf_edge_norms,
                                                      worker->basis_factors,
                                                      worker->basic_list,
@@ -1811,7 +1808,7 @@ i_t pseudo_costs_t<i_t, f_t>::reliable_variable_selection(
           const auto [obj, status] = trial_branching(worker->leaf_problem,
                                                      settings,
                                                      var_types,
-                                                     node_ptr->vstatus,
+                                                     worker->leaf_vstatus,
                                                      worker->leaf_edge_norms,
                                                      worker->basis_factors,
                                                      worker->basic_list,
diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp
index 5f80a7915b..a370a00de8 100644
--- a/cpp/src/branch_and_bound/pseudo_costs.hpp
+++ b/cpp/src/branch_and_bound/pseudo_costs.hpp
@@ -38,10 +38,6 @@ struct reliability_branching_settings_t {
   // Upper bound for the maximum number of LP iterations for a single trial branching
   i_t upper_max_lp_iter = 500;
 
-  // Priority of the tasks created when running the trial branching in parallel.
-  // Set to 1 to have the same priority as the other tasks.
-  i_t task_priority = 5;
-
   // The maximum number of candidates initialized by strong branching in a single
   // node
   i_t max_num_candidates = 100;
diff --git a/cpp/src/branch_and_bound/worker.hpp b/cpp/src/branch_and_bound/worker.hpp
index cf227e561f..29a3164dfc 100644
--- a/cpp/src/branch_and_bound/worker.hpp
+++ b/cpp/src/branch_and_bound/worker.hpp
@@ -9,6 +9,7 @@
 
 #include <branch_and_bound/constants.hpp>
 #include <branch_and_bound/mip_node.hpp>
+#include <branch_and_bound/node_queue.hpp>
 #include <branch_and_bound/symmetry.hpp>
 
 #include <dual_simplex/basis_updates.hpp>
@@ -16,7 +17,6 @@
 
 #include <utilities/pcgenerator.hpp>
 
-#include <deque>
 #include <vector>
 
 namespace cuopt::linear_programming::dual_simplex {
@@ -42,16 +42,36 @@ struct branch_and_bound_stats_t {
   omp_atomic_t<int64_t> lexical_reduction_pruned_nodes    = 0;
 };
 
+template <typename f_t, typename i_t>
+bool is_search_strategy_enabled(search_strategy_t strategy,
+                                bool has_incumbent,
+                                diving_heuristics_settings_t<i_t, f_t> settings)
+{
+  switch (strategy) {
+    case BEST_FIRST: return true;
+    case PSEUDOCOST_DIVING: return settings.pseudocost_diving != 0;
+    case LINE_SEARCH_DIVING: return settings.line_search_diving != 0;
+    case GUIDED_DIVING: return settings.guided_diving != 0 && has_incumbent;
+    case COEFFICIENT_DIVING: return settings.coefficient_diving != 0;
+  }
+
+  return false;
+}
+
 template <typename i_t, typename f_t>
 class branch_and_bound_worker_t {
  public:
-  const i_t worker_id;
+  using float_type = f_t;
+  using int_type   = i_t;
+
+  i_t worker_id;
   omp_atomic_t<search_strategy_t> search_strategy;
   omp_atomic_t<bool> is_active;
   omp_atomic_t<f_t> lower_bound;
 
   lp_problem_t<i_t, f_t> leaf_problem;
   lp_solution_t<i_t, f_t> leaf_solution;
+  std::vector<variable_status_t> leaf_vstatus;
   std::vector<f_t> leaf_edge_norms;
 
   basis_update_mpf_t<i_t, f_t> basis_factors;
@@ -63,7 +83,6 @@ class branch_and_bound_worker_t {
 
   std::vector<f_t> start_lower;
   std::vector<f_t> start_upper;
-  mip_node_t<i_t, f_t>* start_node;
 
   pcgenerator_t rng;
 
@@ -89,53 +108,23 @@ class branch_and_bound_worker_t {
                             const lp_problem_t<i_t, f_t>& original_lp,
                             const csr_matrix_t<i_t, f_t>& Arow,
                             const std::vector<variable_type_t>& var_type,
-                            const simplex_solver_settings_t<i_t, f_t>& settings)
+                            const simplex_solver_settings_t<i_t, f_t>& settings,
+                            uint64_t rng_offset = 0)
     : worker_id(worker_id),
       search_strategy(BEST_FIRST),
       is_active(false),
       lower_bound(-std::numeric_limits<f_t>::infinity()),
       leaf_problem(original_lp),
       leaf_solution(original_lp.num_rows, original_lp.num_cols),
+      leaf_vstatus(original_lp.num_cols),
       basis_factors(original_lp.num_rows, settings.refactor_frequency),
       basic_list(original_lp.num_rows),
       nonbasic_list(),
       node_presolver(leaf_problem, Arow, {}, var_type),
       bounds_changed(original_lp.num_cols, false),
-      rng(settings.random_seed + pcgenerator_t::default_seed + worker_id,
-          pcgenerator_t::default_stream ^ worker_id)
-  {
-  }
-
-  // Set the `start_node` for best-first search.
-  void init_best_first(mip_node_t<i_t, f_t>* node, const lp_problem_t<i_t, f_t>& original_lp)
+      rng(settings.random_seed + pcgenerator_t::default_seed + rng_offset + worker_id,
+          pcgenerator_t::default_stream ^ (worker_id + rng_offset))
   {
-    start_node      = node;
-    start_lower     = original_lp.lower;
-    start_upper     = original_lp.upper;
-    search_strategy = BEST_FIRST;
-    lower_bound     = node->lower_bound;
-    is_active       = true;
-  }
-
-  // Initialize the worker for diving, setting the `start_node`, `start_lower` and
-  // `start_upper`. Returns `true` if the starting node is feasible via
-  // bounds propagation.
-  bool init_diving(mip_node_t<i_t, f_t>* node,
-                   search_strategy_t type,
-                   const lp_problem_t<i_t, f_t>& original_lp,
-                   const simplex_solver_settings_t<i_t, f_t>& settings)
-  {
-    internal_node   = node->detach_copy();
-    start_node      = &internal_node;
-    start_lower     = original_lp.lower;
-    start_upper     = original_lp.upper;
-    search_strategy = type;
-    lower_bound     = node->lower_bound;
-    is_active       = true;
-
-    std::fill(bounds_changed.begin(), bounds_changed.end(), false);
-    node->get_variable_bounds(start_lower, start_upper, bounds_changed);
-    return node_presolver.bounds_strengthening(settings, bounds_changed, start_lower, start_upper);
   }
 
   // Set the variables bounds for the LP relaxation in the current node.
@@ -160,14 +149,127 @@ class branch_and_bound_worker_t {
       settings, bounds_changed, leaf_problem.lower, leaf_problem.upper);
   }
 
- private:
-  // For diving, we need to store the full node instead of
-  // of just a pointer, since it is not stored in the tree anymore.
-  // To keep the same interface across all worker types,
-  // this will be used as a temporary storage and
-  // will be pointed by `start_node`.
-  // For exploration, this will not be used.
-  mip_node_t<i_t, f_t> internal_node;
+  void set_active() { is_active = true; }
+};
+
+template <typename i_t, typename f_t>
+class bfs_worker_t : public branch_and_bound_worker_t<i_t, f_t> {
+ public:
+  using Base = branch_and_bound_worker_t<i_t, f_t>;
+  bfs_worker_t(i_t worker_id,
+               const lp_problem_t<i_t, f_t>& original_lp,
+               const csr_matrix_t<i_t, f_t>& Arow,
+               const std::vector<variable_type_t>& var_type,
+               const simplex_solver_settings_t<i_t, f_t>& settings,
+               uint64_t rng_offset = 0)
+    : Base(worker_id, original_lp, Arow, var_type, settings, rng_offset)
+  {
+    this->start_lower     = original_lp.lower;
+    this->start_upper     = original_lp.upper;
+    this->search_strategy = BEST_FIRST;
+
+    max_diving_workers.fill(0);
+    active_diving_workers.fill(0);
+    total_active_diving_workers = 0;
+  }
+
+  void set_inactive() { this->is_active = false; }
+
+  // Steal nodes from another worker
+  bool steal_from(bfs_worker_t* victim, i_t nodes_to_steal)
+  {
+    if (!victim || nodes_to_steal < 1) return false;
+    if (victim == this || !victim->is_active ||
+        victim->node_queue.best_first_queue_size() < 2 * nodes_to_steal) {
+      return false;
+    }
+
+    return node_queue.steal_from(victim->node_queue, nodes_to_steal);
+  }
+
+  // Calculate the number of diving workers that this worker can launch. Having a fixed number
+  // of workers allows the solver to be more deterministic.
+  void calculate_num_diving_workers(i_t num_bfs_workers,
+                                    i_t total_diving_workers,
+                                    bool has_incumbent,
+                                    const diving_heuristics_settings_t<i_t, f_t>& settings)
+  {
+    i_t num_active = 0;
+    for (i_t i = 1; i < num_search_strategies; ++i) {
+      num_active += is_search_strategy_enabled(search_strategies[i], has_incumbent, settings);
+    }
+
+    total_max_diving_workers = 0;
+    max_diving_workers.fill(0);
+    if (num_active == 0) { return; }
+
+    for (size_t i = 1, k = 0; i < num_search_strategies; ++i) {
+      if (is_search_strategy_enabled(search_strategies[i], has_incumbent, settings)) {
+        // Calculate the number of workers for a given diving heuristic
+        auto [type_start, type_end] = calculate_index_range(k, total_diving_workers, num_active);
+        i_t workers_per_type        = type_end - type_start;
+
+        // Calculate the number of diving workers allocated to this (best-first) worker
+        auto [start, end] =
+          calculate_index_range(this->worker_id, workers_per_type, num_bfs_workers);
+        max_diving_workers[i] = end - start;
+        total_max_diving_workers += max_diving_workers[i];
+        ++k;
+      }
+    }
+  }
+
+  // The worker-local node heap.
+  node_queue_t<i_t, f_t> node_queue;
+
+  // The number of diving workers of each type that this (best-first) worker can launch.
+  std::array<i_t, num_search_strategies> max_diving_workers;
+
+  // The number of active diving workers of each type associated with this (best-first) worker.
+  std::array<omp_atomic_t<i_t>, num_search_strategies> active_diving_workers;
+
+  // Keep track of the total number of active diving worker that are associated with this
+  // (best-first) worker
+  omp_atomic_t<i_t> total_active_diving_workers{0};
+
+  // The maximum number of diving worker that are associated with this
+  // (best-first) worker
+  i_t total_max_diving_workers{0};
+};
+
+template <typename i_t, typename f_t>
+class diving_worker_t : public branch_and_bound_worker_t<i_t, f_t> {
+ public:
+  using Base = branch_and_bound_worker_t<i_t, f_t>;
+  using Base::Base;
+
+  // Apply bound strengthening to the starting variable bounds
+  bool presolve_start_bounds(const simplex_solver_settings_t<i_t, f_t>& settings)
+  {
+    return this->node_presolver.bounds_strengthening(
+      settings, this->bounds_changed, this->start_lower, this->start_upper);
+  }
+
+  // Set this node inactive
+  void set_inactive()
+  {
+    if (!this->is_active.load()) { return; }
+    assert(bfs_worker != nullptr);
+    assert(bfs_worker->active_diving_workers[this->search_strategy].load() > 0);
+    assert(bfs_worker->total_active_diving_workers.load() > 0);
+
+    this->is_active = false;
+    --bfs_worker->active_diving_workers[this->search_strategy];
+    --bfs_worker->total_active_diving_workers;
+  }
+
+  f_t get_lower_bound() { return this->lower_bound; }
+
+  mip_node_t<i_t, f_t> start_node;
+
+  // The best-first worker that is associated with this diving worker. Used for controlling the
+  // number of active diving workers.
+  bfs_worker_t<i_t, f_t>* bfs_worker{nullptr};
 };
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/branch_and_bound/worker_pool.hpp b/cpp/src/branch_and_bound/worker_pool.hpp
index 01df7b0520..d75794bbf0 100644
--- a/cpp/src/branch_and_bound/worker_pool.hpp
+++ b/cpp/src/branch_and_bound/worker_pool.hpp
@@ -8,127 +8,102 @@
 #pragma once
 
 #include <branch_and_bound/worker.hpp>
+#include <utilities/circular_deque.hpp>
 
 namespace cuopt::linear_programming::dual_simplex {
 
-template <typename i_t, typename f_t>
-class branch_and_bound_worker_pool_t {
+template <typename WorkerType>
+class worker_pool_t {
  public:
+  using i_t = typename WorkerType::int_type;
+  using f_t = typename WorkerType::float_type;
+
   void init(i_t num_workers,
             const lp_problem_t<i_t, f_t>& original_lp,
             const csr_matrix_t<i_t, f_t>& Arow,
             const std::vector<variable_type_t>& var_type,
             mip_symmetry_t<i_t, f_t>* symmetry,
-            const simplex_solver_settings_t<i_t, f_t>& settings)
+            const simplex_solver_settings_t<i_t, f_t>& settings,
+            const uint64_t rng_offset = 0)
   {
+    assert(!is_initialized_);
+    assert(num_workers > 0);
+
     workers_.resize(num_workers);
     num_idle_workers_ = num_workers;
+    idle_workers_.clear_resize(num_workers);
     for (i_t i = 0; i < num_workers; ++i) {
-      workers_[i] = std::make_unique<branch_and_bound_worker_t<i_t, f_t>>(
-        i, original_lp, Arow, var_type, settings);
+      workers_[i] =
+        std::make_unique<WorkerType>(i, original_lp, Arow, var_type, settings, rng_offset);
+      idle_workers_.push_back(i);
       // Propagate the (possibly null) symmetry pointer; workers lazily build
       // their orbital_fixing/lexical_reduction state via ensure_orbital_fixing().
       workers_[i]->symmetry_ptr = symmetry;
-      idle_workers_.push_front(i);
     }
 
-    is_initialized = true;
+    is_initialized_ = true;
   }
 
-  // Here, we are assuming that the scheduler is the only
-  // thread that can retrieve/pop an idle worker.
-  branch_and_bound_worker_t<i_t, f_t>* get_idle_worker()
+  WorkerType* pop_idle_worker()
   {
-    std::lock_guard<omp_mutex_t> lock(mutex_);
+    std::lock_guard lock(mutex_);
     if (idle_workers_.empty()) {
       return nullptr;
     } else {
       i_t idx = idle_workers_.front();
-      return workers_[idx].get();
-    }
-  }
-
-  // Here, we are assuming that the scheduler is the only
-  // thread that can retrieve/pop an idle worker.
-  void pop_idle_worker()
-  {
-    std::lock_guard<omp_mutex_t> lock(mutex_);
-    if (!idle_workers_.empty()) {
       idle_workers_.pop_front();
       num_idle_workers_--;
+      assert(idle_workers_.size() == static_cast<size_t>(num_idle_workers_.load()));
+      assert(idx >= 0 && static_cast<size_t>(idx) < workers_.size());
+      return workers_[idx].get();
     }
   }
 
-  void return_worker_to_pool(branch_and_bound_worker_t<i_t, f_t>* worker)
+  void return_worker_to_pool(WorkerType* worker)
   {
-    worker->is_active = false;
-    std::lock_guard<omp_mutex_t> lock(mutex_);
+    std::lock_guard lock(mutex_);
+    assert(worker != nullptr);
+    assert(workers_[worker->worker_id].get() == worker);
+    assert(static_cast<size_t>(num_idle_workers_.load()) == idle_workers_.size());
+    assert(idle_workers_.size() <= workers_.size());
+
+    worker->set_inactive();
+    assert(!worker->is_active.load());
     idle_workers_.push_back(worker->worker_id);
     num_idle_workers_++;
   }
 
-  f_t get_lower_bound()
+  WorkerType* operator[](i_t id)
   {
-    f_t lower_bound = std::numeric_limits<f_t>::infinity();
-
-    if (is_initialized) {
-      for (i_t i = 0; i < workers_.size(); ++i) {
-        if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) {
-          lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound);
-        }
-      }
-    }
-
-    return lower_bound;
+    assert(id >= 0 && static_cast<size_t>(id) < workers_.size());
+    assert(workers_[id] != nullptr);
+    return workers_[id].get();
+  }
+  WorkerType* operator[](i_t id) const
+  {
+    assert(id >= 0 && static_cast<size_t>(id) < workers_.size());
+    assert(workers_[id] != nullptr);
+    return workers_[id].get();
   }
 
-  i_t num_idle_workers() { return num_idle_workers_; }
+  bool is_initialized() const { return is_initialized_; }
+
+  i_t num_idle() const { return num_idle_workers_; }
+  i_t size() const { return workers_.size(); }
 
  private:
-  // Worker pool
-  std::vector<std::unique_ptr<branch_and_bound_worker_t<i_t, f_t>>> workers_;
-  bool is_initialized = false;
+  std::vector<std::unique_ptr<WorkerType>> workers_;
+  bool is_initialized_ = false;
 
   omp_mutex_t mutex_;
-  std::deque<i_t> idle_workers_;
-  omp_atomic_t<i_t> num_idle_workers_;
+  circular_deque_t<i_t> idle_workers_;
+  omp_atomic_t<i_t> num_idle_workers_{0};
 };
 
-template <typename f_t, typename i_t>
-std::vector<search_strategy_t> get_search_strategies(
-  diving_heuristics_settings_t<i_t, f_t> settings)
-{
-  std::vector<search_strategy_t> types;
-  types.reserve(num_search_strategies);
-  types.push_back(BEST_FIRST);
-  if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); }
-  if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); }
-  if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); }
-  if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); }
-  return types;
-}
-
-template <typename i_t>
-std::array<i_t, num_search_strategies> get_max_workers(
-  i_t num_workers, const std::vector<search_strategy_t>& strategies)
-{
-  std::array<i_t, num_search_strategies> max_num_workers;
-  max_num_workers.fill(0);
-
-  i_t bfs_workers             = std::max(strategies.size() == 1 ? num_workers : num_workers / 4, 1);
-  max_num_workers[BEST_FIRST] = bfs_workers;
-
-  i_t diving_workers = (num_workers - bfs_workers);
-  i_t m              = strategies.size() - 1;
-
-  for (size_t i = 1, k = 0; i < strategies.size(); ++i) {
-    i_t start                      = (double)k * diving_workers / m;
-    i_t end                        = (double)(k + 1) * diving_workers / m;
-    max_num_workers[strategies[i]] = end - start;
-    ++k;
-  }
+template <typename i_t, typename f_t>
+using bfs_worker_pool_t = worker_pool_t<bfs_worker_t<i_t, f_t>>;
 
-  return max_num_workers;
-}
+template <typename i_t, typename f_t>
+using diving_worker_pool_t = worker_pool_t<diving_worker_t<i_t, f_t>>;
 
 }  // namespace cuopt::linear_programming::dual_simplex
diff --git a/cpp/src/dual_simplex/initial_basis.cpp b/cpp/src/dual_simplex/initial_basis.cpp
index d69bf8877e..e415f03672 100644
--- a/cpp/src/dual_simplex/initial_basis.cpp
+++ b/cpp/src/dual_simplex/initial_basis.cpp
@@ -18,6 +18,88 @@
 
 namespace cuopt::linear_programming::dual_simplex {
 
+uint8_t encode(variable_status_t vstatus)
+{
+  assert(vstatus != variable_status_t::SUPERBASIC &&
+         "packed_variable_status_t does not support superbasic variables");
+
+  uint8_t val = 0;
+  switch (vstatus) {
+    case variable_status_t::BASIC: val = 0b00; break;
+    case variable_status_t::NONBASIC_LOWER: val = 0b01; break;
+    case variable_status_t::NONBASIC_UPPER: val = 0b10; break;
+    case variable_status_t::NONBASIC_FIXED: val = 0b01; break;
+    default: val = 0b11;
+  }
+
+  return val;
+}
+
+variable_status_t decode(uint8_t val)
+{
+  val &= 0b11;
+  if (val == 0b00) return variable_status_t::BASIC;
+  if (val == 0b01) return variable_status_t::NONBASIC_LOWER;
+  if (val == 0b10) return variable_status_t::NONBASIC_UPPER;
+  return variable_status_t::NONBASIC_FREE;
+}
+
+std::vector<uint8_t> compress_vstatus(const std::vector<variable_status_t>& vstatus)
+{
+  size_t compressed_size = vstatus.size() / 4;
+  size_t has_tail        = (vstatus.size() % 4 > 0);
+
+  std::vector<uint8_t> packed_vstatus;
+  packed_vstatus.resize(compressed_size + has_tail);
+
+  for (size_t i = 0; i < compressed_size; ++i) {
+    size_t j          = i * 4;
+    packed_vstatus[i] = 0;
+    packed_vstatus[i] |= encode(vstatus[j]);
+    packed_vstatus[i] |= encode(vstatus[j + 1]) << 2;
+    packed_vstatus[i] |= encode(vstatus[j + 2]) << 4;
+    packed_vstatus[i] |= encode(vstatus[j + 3]) << 6;
+  }
+
+  if (has_tail) {
+    size_t j                        = compressed_size * 4;
+    packed_vstatus[compressed_size] = 0;
+    packed_vstatus[compressed_size] |= encode(vstatus[j]);
+    if (j + 1 < vstatus.size()) packed_vstatus[compressed_size] |= encode(vstatus[j + 1]) << 2;
+    if (j + 2 < vstatus.size()) packed_vstatus[compressed_size] |= encode(vstatus[j + 2]) << 4;
+    if (j + 3 < vstatus.size()) packed_vstatus[compressed_size] |= encode(vstatus[j + 3]) << 6;
+  }
+
+  return packed_vstatus;
+}
+
+void decompress_vstatus(const std::vector<uint8_t>& packed_vstatus,
+                        size_t vstatus_size,
+                        std::vector<variable_status_t>& vstatus)
+{
+  size_t compressed_size = vstatus_size / 4;
+  size_t has_tail        = (vstatus_size % 4 > 0);
+
+  vstatus.resize(vstatus_size);
+  assert(packed_vstatus.size() == compressed_size + has_tail);
+
+  for (size_t i = 0; i < compressed_size; ++i) {
+    size_t j       = i * 4;
+    vstatus[j]     = decode(packed_vstatus[i]);
+    vstatus[j + 1] = decode(packed_vstatus[i] >> 2);
+    vstatus[j + 2] = decode(packed_vstatus[i] >> 4);
+    vstatus[j + 3] = decode(packed_vstatus[i] >> 6);
+  }
+
+  if (has_tail) {
+    size_t j   = compressed_size * 4;
+    vstatus[j] = decode(packed_vstatus[compressed_size]);
+    if (j + 1 < vstatus.size()) vstatus[j + 1] = decode(packed_vstatus[compressed_size] >> 2);
+    if (j + 2 < vstatus.size()) vstatus[j + 2] = decode(packed_vstatus[compressed_size] >> 4);
+    if (j + 3 < vstatus.size()) vstatus[j + 3] = decode(packed_vstatus[compressed_size] >> 6);
+  }
+}
+
 template <typename i_t, typename f_t>
 i_t initial_basis_selection(const lp_problem_t<i_t, f_t>& problem,
                             const simplex_solver_settings_t<i_t, f_t>& settings,
diff --git a/cpp/src/dual_simplex/initial_basis.hpp b/cpp/src/dual_simplex/initial_basis.hpp
index 646785fbd2..f4cb19efc5 100644
--- a/cpp/src/dual_simplex/initial_basis.hpp
+++ b/cpp/src/dual_simplex/initial_basis.hpp
@@ -1,6 +1,6 @@
 /* clang-format off */
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  */
 /* clang-format on */
@@ -23,6 +23,11 @@ enum class variable_status_t : int8_t {
   SUPERBASIC     = 4
 };
 
+std::vector<uint8_t> compress_vstatus(const std::vector<variable_status_t>& vstatus);
+void decompress_vstatus(const std::vector<uint8_t>& packed_vstatus,
+                        size_t vstatus_size,
+                        std::vector<variable_status_t>& vstatus);
+
 template <typename i_t, typename f_t>
 i_t initial_basis_selection(const lp_problem_t<i_t, f_t>& problem,
                             const simplex_solver_settings_t<i_t, f_t>& settings,
diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp
index 4683fa1d9e..27eac7f985 100644
--- a/cpp/src/dual_simplex/simplex_solver_settings.hpp
+++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp
@@ -114,6 +114,9 @@ struct simplex_solver_settings_t {
       mip_batch_pdlp_reliability_branching(0),
       strong_branching_simplex_iteration_limit(-1),
       random_seed(0),
+      bnb_steal_chance(-1),
+      bnb_nodes_per_steal(-1),
+      bnb_max_steal_attempts(-1),
       reliability_branching(-1),
       inside_mip(0),
       sub_mip(0),
@@ -206,13 +209,21 @@ struct simplex_solver_settings_t {
                                              // PDLP only
   // Set the maximum number of simplex iterations allowed per trial branch when applying
   // strong branching to the root node.
-  // -1 - Automatic (iteration limit = 200)
-  // 0, 1 - Estimate the objective change using a single pivot of dual simplex
-  // >1 - Set as the iteration limit in dual simplex
+  // -1 - automatic (iteration limit = 200)
+  // 0, 1 - estimate the objective change using a single pivot of dual simplex
+  // >1 - set as the iteration limit in dual simplex
   i_t strong_branching_simplex_iteration_limit;
 
   diving_heuristics_settings_t<i_t, f_t> diving_settings;  // Settings for the diving heuristics
 
+  // In B&B, indicate the chance in which a worker can steal a node from another worker.
+  // -1 - automatic (0.05)
+  // 0 - disable
+  // >0 - set the stealing chance [0, 1]
+  f_t bnb_steal_chance;
+  i_t bnb_nodes_per_steal;
+  i_t bnb_max_steal_attempts;
+
   // Settings for the reliability branching.
   // - -1: automatic
   // - 0: disable (use pseudocost branching instead)
diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu
index 9396d7158a..e1318edf4d 100644
--- a/cpp/src/mip_heuristics/diversity/lns/rins.cu
+++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu
@@ -64,7 +64,7 @@ void rins_t<i_t, f_t>::node_callback(const std::vector<f_t>& solution, f_t objec
       lp_optimal_solution = solution;
 
       CUOPT_LOG_DEBUG("Launching RINS task");
-#pragma omp task default(none)
+#pragma omp task default(none) priority(CUOPT_MEDIUM_TASK_PRIORITY)
       run_rins();
     } else {
       launch_new_task = true;
@@ -225,7 +225,8 @@ void rins_t<i_t, f_t>::run_rins()
   fj_cpu->log_prefix = "[RINS] ";
 
   CUOPT_LOG_DEBUG("Launching CPUFJ (RINS) task");
-#pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none)
+#pragma omp task shared(fj_cpu) firstprivate(time_limit) \
+  priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none)
   cpufj_solve(fj_cpu.get(), time_limit);
 
   f_t lower_bound = context.branch_and_bound_ptr ? context.branch_and_bound_ptr->get_lower_bound()
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
index 12b6c04070..46ffde6089 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu
@@ -45,7 +45,8 @@ void early_cpufj_t<i_t, f_t>::start()
                                          double) { this->try_update_best(solver_obj, assignment); };
 
   CUOPT_LOG_DEBUG("Launching early CPUFJ task");
-#pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none)
+#pragma omp task shared(fj_cpu_) priority(CUOPT_DEFAULT_TASK_PRIORITY) \
+  depend(out : *fj_cpu_) default(none)
   cpufj_solve(fj_cpu_.get());
 }
 
diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
index 758c6272c1..eac901151b 100644
--- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
+++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu
@@ -60,7 +60,8 @@ void early_gpufj_t<i_t, f_t>::start()
 
   CUOPT_LOG_DEBUG("Launching early GPUFJ task");
 
-#pragma omp task default(none) shared(fj_ptr_) depend(out : *fj_ptr_)
+#pragma omp task default(none) shared(fj_ptr_) priority(CUOPT_DEFAULT_TASK_PRIORITY) \
+  depend(out : *fj_ptr_)
   {
     RAFT_CUDA_TRY(cudaSetDevice(this->device_id_));
     fj_ptr_->solve(*this->solution_ptr_);
diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu
index 4a13425437..8a288b2da8 100644
--- a/cpp/src/mip_heuristics/local_search/local_search.cu
+++ b/cpp/src/mip_heuristics/local_search/local_search.cu
@@ -96,7 +96,8 @@ void local_search_t<i_t, f_t>::start_cpufj_scratch_threads(population_t<i_t, f_t
 
   for (size_t i = 0; i < scratch_cpu_fj.size(); ++i) {
     auto ptr = scratch_cpu_fj[i].get();
-#pragma omp task firstprivate(ptr) depend(out : *ptr) default(none)
+#pragma omp task firstprivate(ptr) priority(CUOPT_DEFAULT_TASK_PRIORITY) \
+  depend(out : *ptr) default(none)
     cpufj_solve(ptr);
   }
 }
@@ -134,8 +135,8 @@ void local_search_t<i_t, f_t>::start_cpufj_lptopt_scratch_threads(
 
   CUOPT_LOG_DEBUG("Launching scratch CPUFJ (on LP optimal) task");
 
-#pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \
-  depend(out : *scratch_cpu_fj_on_lp_opt)
+#pragma omp task shared(scratch_cpu_fj_on_lp_opt) \
+  priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) depend(out : *scratch_cpu_fj_on_lp_opt)
   cpufj_solve(scratch_cpu_fj_on_lp_opt.get());
 }
 
@@ -199,7 +200,8 @@ void local_search_t<i_t, f_t>::start_cpufj_deterministic(
     };
 
   CUOPT_LOG_DEBUG("Launching deterministic CPUFJ task");
-#pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj)
+#pragma omp task shared(deterministic_cpu_fj) priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) \
+  depend(inout : *deterministic_cpu_fj)
   cpufj_solve(deterministic_cpu_fj.get());
 
   // Signal that registration is complete - B&B can now wait on producers
@@ -270,7 +272,8 @@ bool local_search_t<i_t, f_t>::do_fj_solve(solution_t<i_t, f_t>& solution,
       size_t n = std::min<size_t>(omp_get_num_threads() - 1, ls_cpu_fj.size());
       CUOPT_LOG_DEBUG("Launching %d CPUFJ tasks", n);
 
-#pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(n) nogroup
+#pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(n) \
+  nogroup priority(CUOPT_DEFAULT_TASK_PRIORITY)
       for (size_t i = 0; i < n; ++i) {
         cpufj_solve(ls_cpu_fj[i].get());
       }
diff --git a/cpp/src/mip_heuristics/mip_constants.hpp b/cpp/src/mip_heuristics/mip_constants.hpp
index f50c58194e..7d28048ba9 100644
--- a/cpp/src/mip_heuristics/mip_constants.hpp
+++ b/cpp/src/mip_heuristics/mip_constants.hpp
@@ -25,3 +25,18 @@
 // MIP-only gate: skip the concurrent barrier when fewer threads are available than this
 // (1 PDLP + 1 dual simplex + 1 barrier). Stand-alone LP always runs all three.
 #define CUOPT_CONCURRENT_LP_BARRIER_REQUIRED_THREAD_COUNT 3
+
+/* @brief Priority classes for the omp tasks. Highest value = higher priority.
+ * Note that this only gives a hint to the runtime, such that the high priority
+ * is not guarantee to be executed before a low priority one (i.e., do not rely on
+ * these values for correctness).
+ */
+#define CUOPT_CRITICAL_TASK_PRIORITY 1000
+#define CUOPT_HIGH_TASK_PRIORITY     100
+#define CUOPT_MEDIUM_TASK_PRIORITY   10
+#define CUOPT_DEFAULT_TASK_PRIORITY  1
+
+// Default values for work stealing in B&B
+#define MIP_DEFAULT_STEAL_CHANCE       0.05
+#define MIP_DEFAULT_NODES_PER_STEAL    10
+#define MIP_DEFAULT_MAX_STEAL_ATTEMPTS 3
diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
index 3d62b99f66..43c58ff1d7 100644
--- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
+++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu
@@ -249,7 +249,8 @@ void conditional_bound_strengthening_t<i_t, f_t>::select_constraint_pairs_host(
   i_t num_tasks = std::max(omp_get_num_threads() - 2, 1);
 
   CUOPT_LOG_INFO("Selecting constraint pairs with %d tasks", num_tasks);
-#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(shared)
+#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(shared) \
+  priority(CUOPT_DEFAULT_TASK_PRIORITY)
   for (i_t cnstr = 0; cnstr < problem.n_constraints; ++cnstr) {
     for (i_t jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) {
       int var = variables[jj];
diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu
index 36b96dceaf..3767e0f2d0 100644
--- a/cpp/src/mip_heuristics/presolve/probing_cache.cu
+++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu
@@ -896,17 +896,16 @@ bool compute_probing_cache(bound_presolve_t<i_t, f_t>& bound_presolve,
     if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; }
     size_t step_end = std::min(step_start + step_size, priority_indices.size());
 
-#pragma omp taskloop num_tasks(num_tasks) default(shared)
+#pragma omp taskloop num_tasks(num_tasks) default(shared) priority(CUOPT_DEFAULT_TASK_PRIORITY)
     for (size_t task_id = 0; task_id < num_tasks; ++task_id) {
-      size_t n     = step_end - step_start;
-      size_t begin = step_start + std::floor(static_cast<f_t>(n) * task_id / num_tasks);
-      size_t end   = step_start + std::floor(static_cast<f_t>(n) * (task_id + 1) / num_tasks);
+      size_t n                   = step_end - step_start;
+      auto [begin, end]          = calculate_index_range(task_id, n, num_tasks);
       auto& multi_probe_presolve = multi_probe_presolve_pool[task_id];
       auto& modification_vector  = modification_vector_pool[task_id];
       auto& substitution_vector  = substitution_vector_pool[task_id];
       if (timer.check_time_limit()) { continue; }
 
-      for (size_t i = begin; i < end; ++i) {
+      for (size_t i = step_start + begin; i < step_start + end; ++i) {
         auto var_idx = priority_indices[i];
         if (timer.check_time_limit()) { continue; }
 
diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu
index 1c7cfe6f06..2b64a7c681 100644
--- a/cpp/src/mip_heuristics/solve.cu
+++ b/cpp/src/mip_heuristics/solve.cu
@@ -812,10 +812,9 @@ mip_solution_t<i_t, f_t> solve_mip(optimization_problem_t<i_t, f_t>& op_problem,
 }
 
 template <typename i_t, typename f_t>
-mip_solution_t<i_t, f_t> solve_mip(
-  raft::handle_t const* handle_ptr,
-  const cuopt::linear_programming::io::mps_data_model_t<i_t, f_t>& mps_data_model,
-  mip_solver_settings_t<i_t, f_t> const& settings)
+mip_solution_t<i_t, f_t> solve_mip(raft::handle_t const* handle_ptr,
+                                   const io::mps_data_model_t<i_t, f_t>& mps_data_model,
+                                   mip_solver_settings_t<i_t, f_t> const& settings)
 {
   auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model);
   return solve_mip(op_problem, settings);
diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu
index 3bcbca6939..c25ade0c05 100644
--- a/cpp/src/mip_heuristics/solver.cu
+++ b/cpp/src/mip_heuristics/solver.cu
@@ -451,7 +451,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
       branch_and_bound->set_concurrent_lp_root_solve(true);
 
       context.problem_ptr->branch_and_bound_callback =
-        std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_new_solution,
+        std::bind(&dual_simplex::branch_and_bound_t<i_t, f_t>::set_solution_from_heuristics,
                   branch_and_bound.get(),
                   std::placeholders::_1);
     } else if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) {
@@ -488,7 +488,7 @@ solution_t<i_t, f_t> mip_solver_t<i_t, f_t>::run_solver()
 #pragma omp taskgroup
   {
     if (!context.settings.heuristics_only) {
-#pragma omp task default(shared)
+#pragma omp task default(shared) priority(CUOPT_CRITICAL_TASK_PRIORITY)
       {
         branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution);
       }
diff --git a/cpp/src/utilities/circular_deque.hpp b/cpp/src/utilities/circular_deque.hpp
new file mode 100644
index 0000000000..6e9d8f5b05
--- /dev/null
+++ b/cpp/src/utilities/circular_deque.hpp
@@ -0,0 +1,114 @@
+/* clang-format off */
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ */
+/* clang-format on */
+
+#pragma once
+
+#include <cassert>
+#include <utility>
+#include <vector>
+
+namespace cuopt {
+
+// A fixed-capacity double-ended queue backed by a circular buffer.
+// All operations are O(1) with no dynamic allocation after construction.
+//
+// Preconditions (asserted in debug builds):
+//   - push_front / push_back : size() < capacity()
+//   - pop_front  / pop_back  : !empty()
+//   - front / back           : !empty()
+template <typename T>
+class circular_deque_t {
+ public:
+  circular_deque_t() : buffer_(1), capacity_(1), head_(0), tail_(0) {}
+
+  // Allocates storage for exactly `capacity` elements up front.
+  explicit circular_deque_t(size_t capacity)
+    : buffer_(capacity + 1),  // +1 to distinguish full (next(tail)==head) from empty (head==tail)
+      capacity_(capacity + 1),
+      head_(0),
+      tail_(0)
+  {
+    assert(capacity > 0);
+  }
+
+  bool empty() const { return head_ == tail_; }
+  bool full() const { return next(tail_) == head_; }
+
+  size_t size() const { return (tail_ - head_ + capacity_) % capacity_; }
+  size_t capacity() const { return capacity_ - 1; }
+
+  void clear_resize(size_t new_capacity)
+  {
+    assert(new_capacity > 0);
+    head_     = 0;
+    tail_     = 0;
+    capacity_ = new_capacity + 1;
+    buffer_.resize(capacity_);
+  }
+
+  void push_back(T val)
+  {
+    assert(!full());
+    buffer_[tail_] = std::move(val);
+    tail_          = next(tail_);
+  }
+
+  void push_front(T val)
+  {
+    assert(!full());
+    head_          = prev(head_);
+    buffer_[head_] = std::move(val);
+  }
+
+  T pop_front()
+  {
+    assert(!empty());
+    T val = std::move(buffer_[head_]);
+    head_ = next(head_);
+    return val;
+  }
+
+  T pop_back()
+  {
+    assert(!empty());
+    tail_ = prev(tail_);
+    return std::move(buffer_[tail_]);
+  }
+
+  T& front()
+  {
+    assert(!empty());
+    return buffer_[head_];
+  }
+  const T& front() const
+  {
+    assert(!empty());
+    return buffer_[head_];
+  }
+
+  T& back()
+  {
+    assert(!empty());
+    return buffer_[prev(tail_)];
+  }
+  const T& back() const
+  {
+    assert(!empty());
+    return buffer_[prev(tail_)];
+  }
+
+ private:
+  size_t next(size_t idx) const { return (idx + 1) % capacity_; }
+  size_t prev(size_t idx) const { return (idx + capacity_ - 1) % capacity_; }
+
+  std::vector<T> buffer_;
+  size_t capacity_;
+  size_t head_;
+  size_t tail_;
+};
+
+}  // namespace cuopt
diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp
index a13b9ec887..59b754460d 100644
--- a/cpp/src/utilities/omp_helpers.hpp
+++ b/cpp/src/utilities/omp_helpers.hpp
@@ -7,11 +7,27 @@
 
 #pragma once
 
+#include <cmath>
+#include <utility>
+
+namespace cuopt {
+
+// Split `total` entries across `n` blocks and return the [start, end) range for `k`-th block.
+// Useful for splitting work across OpenMP tasks/threads/workers.
+template <typename i_t>
+std::pair<i_t, i_t> calculate_index_range(i_t k, double total, double n)
+{
+  i_t start = std::floor(k * total / n);
+  i_t end   = std::floor((k + 1) * total / n);
+  return {start, end};
+}
+
+}  // namespace cuopt
+
 #ifdef _OPENMP
 
 #include <omp.h>
 #include <memory>
-#include <utility>
 
 namespace cuopt {
 
@@ -20,11 +36,9 @@ namespace cuopt {
 class omp_mutex_t {
  public:
   omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); }
-
-  omp_mutex_t(const omp_mutex_t&) = delete;
-
   omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); }
 
+  omp_mutex_t(const omp_mutex_t&)            = delete;
   omp_mutex_t& operator=(const omp_mutex_t&) = delete;
 
   omp_mutex_t& operator=(omp_mutex_t&& other)
@@ -203,17 +217,10 @@ class omp_atomic_t {
  private:
   T val;
 
-#ifndef __NVCC__
   friend double fetch_min(omp_atomic_t<double>& atomic_var, double other);
   friend double fetch_max(omp_atomic_t<double>& atomic_var, double other);
-#endif
 };
 
-// Atomic CAS are only supported in OpenMP v5.1
-// (gcc 12+ or clang 14+), however, nvcc (or the host compiler) cannot
-// parse it correctly yet
-#ifndef __NVCC__
-
 // Free non-template functions are necessary because of a clang 20 bug
 // when omp atomic compare is used within a templated context.
 // see https://github.com/llvm/llvm-project/issues/127466
@@ -238,7 +245,6 @@ inline double fetch_max(omp_atomic_t<double>& atomic_var, double other)
   }
   return old;
 }
-#endif
 
 #endif
 
diff --git a/cpp/src/utilities/pcgenerator.hpp b/cpp/src/utilities/pcgenerator.hpp
index 29a865f02f..e83e5f36ad 100644
--- a/cpp/src/utilities/pcgenerator.hpp
+++ b/cpp/src/utilities/pcgenerator.hpp
@@ -21,12 +21,11 @@ class pcgenerator_t {
   static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL;
 
   /**
-   * @brief ctor. Initializes the PCG
-   * @param rng_state is the generator state used for initializing the generator
-   * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences
-   * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a
-   * unique set of random numbers. This can be achieved by initializing the generator with same
-   * rng_state for all the threads and diststreamt values for subsequence.
+   * @brief Initializes the PCG generator.
+   * @param seed        Generator state seed.
+   * @param subsequence Selects one of 2^64 independent subsequences. Use distinct values per
+   *                    thread to guarantee non-overlapping streams in parallel contexts.
+   * @param offset      Number of outputs to skip ahead before the first draw.
    */
   pcgenerator_t(const uint64_t seed        = default_seed,
                 const uint64_t subsequence = default_stream,
@@ -35,7 +34,12 @@ class pcgenerator_t {
     set_seed(seed, subsequence, offset);
   }
 
-  // Set the seed, subsequence and offset of the PCG
+  /**
+   * @brief Re-seeds the generator.
+   * @param seed        Generator state seed.
+   * @param subsequence Selects one of 2^64 independent subsequences.
+   * @param offset      Number of outputs to skip ahead before the first draw.
+   */
   void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0)
   {
     state  = uint64_t(0);
@@ -47,8 +51,12 @@ class pcgenerator_t {
     skipahead(offset);
   }
 
-  // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown
-  // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf
+  /**
+   * @brief Advances the generator state by @p offset steps in O(log offset) time.
+   *
+   * Uses the closed-form LCG jump described in "Random Number Generation with Arbitrary Strides"
+   * (F. B. Brown, https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf).
+   */
   void skipahead(uint64_t offset)
   {
     uint64_t G = 1;
@@ -68,9 +76,7 @@ class pcgenerator_t {
   }
 
   /**
-   * @defgroup NextRand Generate the next random number
-   * @brief This code is derived from PCG basic code
-   * @{
+   * @returns the next uniformly distributed 32-bit unsigned integer.
    */
   uint32_t next_u32()
   {
@@ -83,6 +89,9 @@ class pcgenerator_t {
     return ret;
   }
 
+  /**
+   * @returns the next uniformly distributed 64-bit unsigned integer.
+   */
   uint64_t next_u64()
   {
     uint64_t ret;
@@ -93,6 +102,10 @@ class pcgenerator_t {
     return ret;
   }
 
+  /**
+   * @returns the next uniformly distributed non-negative 32-bit signed integer in [0,
+   * INT32_MAX].
+   */
   int32_t next_i32()
   {
     int32_t ret;
@@ -102,6 +115,10 @@ class pcgenerator_t {
     return ret;
   }
 
+  /**
+   * @returns the next uniformly distributed non-negative 64-bit signed integer in [0,
+   * INT64_MAX].
+   */
   int64_t next_i64()
   {
     int64_t ret;
@@ -111,10 +128,19 @@ class pcgenerator_t {
     return ret;
   }
 
-  float next_float() { return static_cast<float>((next_u32() >> 8) * 0x1.0p-24); }
+  /**
+   * @returns a uniformly distributed float in [0, 1).
+   */
+  float next_float() { return (next_u32() >> 8) * 0x1.0p-24; }
 
-  double next_double() { return static_cast<double>((next_u64() >> 11) * 0x1.0p-53); }
+  /**
+   * @returns a uniformly distributed double in [0, 1).
+   */
+  double next_double() { return (next_u64() >> 11) * 0x1.0p-53; }
 
+  /**
+   * @returns the next random value of type @p T.
+   */
   template <typename T>
   T next()
   {
@@ -130,9 +156,11 @@ class pcgenerator_t {
   void next(float& ret) { ret = next_float(); }
   void next(double& ret) { ret = next_double(); }
 
-  /// Draws a sample from a uniform distribution. The samples are uniformly distributed over
-  /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward
-  /// some numbers in the range (scaling by floating-point).
+  /**
+   * @brief Draws a sample from a uniform distribution over `[low, high)`.
+   *
+   * May have a slight bias toward some values due to floating-point scaling.
+   */
   template <typename T>
   T uniform(T low, T high)
   {
@@ -141,7 +169,9 @@ class pcgenerator_t {
     return low + (val * range);
   }
 
-  // Shuffles the contents of a sequence using the Fisher–Yates algorithm.
+  /**
+   * @brief Shuffles @p seq in-place using the Fisher-Yates algorithm.
+   */
   template <typename T>
   void shuffle(std::vector<T>& seq)
   {