diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 012a132b56..0222ad6fe9 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -28,6 +28,7 @@ #include #include +#include #include #include @@ -37,9 +38,7 @@ #include #include #include -#include #include -#include #include #include @@ -301,11 +300,16 @@ branch_and_bound_t::branch_and_bound_t( template f_t branch_and_bound_t::get_lower_bound() { - f_t lower_bound = lower_bound_ceiling_.load(); - f_t heap_lower_bound = node_queue_.get_lower_bound(); - f_t worker_lower_bound = worker_pool_.get_lower_bound(); - lower_bound = std::min(heap_lower_bound, lower_bound); - lower_bound = std::min(worker_lower_bound, lower_bound); + f_t lower_bound = lower_bound_numerical_.load(); + + if (bfs_worker_pool_.is_initialized()) { + for (i_t i = 0; i < bfs_worker_pool_.size(); ++i) { + if (bfs_worker_pool_[i]->is_active) { + lower_bound = std::min(lower_bound, bfs_worker_pool_[i]->lower_bound.load()); + lower_bound = std::min(lower_bound, bfs_worker_pool_[i]->node_queue.get_lower_bound()); + } + } + } if (std::isfinite(lower_bound)) { return lower_bound; @@ -464,7 +468,7 @@ void branch_and_bound_t::update_user_bound(f_t lower_bound) } template -void branch_and_bound_t::set_new_solution(const std::vector& solution) +void branch_and_bound_t::set_solution_from_heuristics(const std::vector& solution) { mutex_original_lp_.lock(); if (solution.size() != original_problem_.num_cols) { @@ -478,7 +482,8 @@ void branch_and_bound_t::set_new_solution(const std::vector& solu mutex_original_lp_.unlock(); bool is_feasible = false; bool attempt_repair = false; - if (!incumbent_.has_incumbent || obj < incumbent_.objective) { + + if (improves_incumbent(obj)) { f_t primal_err; f_t bound_err; i_t num_fractional; @@ -494,8 +499,10 @@ void branch_and_bound_t::set_new_solution(const std::vector& solu mutex_original_lp_.unlock(); mutex_upper_.lock(); if (is_feasible && improves_incumbent(obj)) { - upper_bound_ = std::min(upper_bound_.load(), obj); + f_t current_upper_bound = upper_bound_.load(); + upper_bound_ = std::min(current_upper_bound, obj); incumbent_.set_incumbent_solution(obj, crushed_solution); + if (current_upper_bound > upper_bound_.load()) { report_heuristic(obj); } } else { attempt_repair = true; constexpr bool verbose = false; @@ -513,7 +520,6 @@ void branch_and_bound_t::set_new_solution(const std::vector& solu settings_.log.debug("Solution objective not better than current upper_bound_. Not accepted.\n"); } - if (is_feasible) { report_heuristic(obj); } if (attempt_repair) { mutex_repair_.lock(); repair_queue_.push_back(solution); @@ -712,9 +718,11 @@ void branch_and_bound_t::set_final_solution(mip_solution_t& if (solver_status_ == mip_status_t::TIME_LIMIT) { settings_.log.printf("Time limit reached. Stopping the solver...\n"); } + if (solver_status_ == mip_status_t::WORK_LIMIT) { settings_.log.printf("Work limit reached. Stopping the solver...\n"); } + if (solver_status_ == mip_status_t::NODE_LIMIT) { settings_.log.printf("Node limit reached. Stopping the solver...\n"); } @@ -858,7 +866,7 @@ branch_variable_t branch_and_bound_t::variable_selection( std::vector& solution = worker->leaf_solution.x; switch (worker->search_strategy) { - case search_strategy_t::BEST_FIRST: + case BEST_FIRST: if (settings_.reliability_branching != 0) { branch_var = pc_.reliable_variable_selection(node_ptr, @@ -867,7 +875,7 @@ branch_variable_t branch_and_bound_t::variable_selection( var_types_, exploration_stats_, upper_bound_, - worker_pool_.num_idle_workers(), + bfs_worker_pool_.num_idle(), new_slacks_, original_lp_); } else { @@ -878,17 +886,18 @@ branch_variable_t branch_and_bound_t::variable_selection( return {branch_var, round_dir}; - case search_strategy_t::COEFFICIENT_DIVING: + case COEFFICIENT_DIVING: return coefficient_diving( original_lp_, fractional, solution, var_up_locks_, var_down_locks_, log); - case search_strategy_t::LINE_SEARCH_DIVING: + case LINE_SEARCH_DIVING: return line_search_diving(fractional, solution, root_relax_soln_.x, log); - case search_strategy_t::PSEUDOCOST_DIVING: + case PSEUDOCOST_DIVING: return pseudocost_diving(pc_, fractional, solution, root_relax_soln_.x, log); - case search_strategy_t::GUIDED_DIVING: + case GUIDED_DIVING: + assert(incumbent_.has_incumbent); mutex_upper_.lock(); current_incumbent = incumbent_.x; mutex_upper_.unlock(); @@ -975,10 +984,10 @@ struct nondeterministic_policy_t : tree_update_policy_t { void on_numerical_issue(mip_node_t* node) override { if (worker->search_strategy == search_strategy_t::BEST_FIRST) { - fetch_min(bnb.lower_bound_ceiling_, node->lower_bound); + fetch_min(bnb.lower_bound_numerical_, node->lower_bound); log.printf("LP returned numerical issue on node %d. Best bound set to %+10.6e.\n", node->node_id, - compute_user_objective(bnb.original_lp_, bnb.lower_bound_ceiling_.load())); + compute_user_objective(bnb.original_lp_, bnb.lower_bound_numerical_.load())); } } @@ -1106,12 +1115,12 @@ struct deterministic_diving_policy_t : deterministic_policy_base_t> { using base = deterministic_policy_base_t>; - std::deque*>& stack; + circular_deque_t*>& stack; i_t max_backtrack_depth; deterministic_diving_policy_t(branch_and_bound_t& bnb, deterministic_diving_worker_t& worker, - std::deque*>& stack, + circular_deque_t*>& stack, i_t max_backtrack_depth) : base(bnb, worker), stack(stack), max_backtrack_depth(max_backtrack_depth) { @@ -1277,7 +1286,7 @@ std::pair branch_and_bound_t::updat policy.select_branch_variable(node_ptr, leaf_fractional, leaf_solution.x); round_dir = dir; - assert(node_ptr->vstatus.size() == leaf_problem.num_cols); + assert(worker->leaf_vstatus.size() == leaf_problem.num_cols); assert(branch_var >= 0); assert(dir != branch_direction_t::NONE); @@ -1291,7 +1300,7 @@ std::pair branch_and_bound_t::updat branch_var, leaf_solution.x[branch_var], num_frac, - node_ptr->vstatus, + worker->leaf_vstatus, leaf_problem, log); search_tree.update(node_ptr, node_status_t::HAS_CHILDREN); @@ -1416,8 +1425,8 @@ dual::status_t branch_and_bound_t::solve_node_lp( } #endif - std::vector& leaf_vstatus = node_ptr->vstatus; - assert(leaf_vstatus.size() == worker->leaf_problem.num_cols); + decompress_vstatus(node_ptr->packed_vstatus, worker->leaf_problem.num_cols, worker->leaf_vstatus); + assert(worker->leaf_vstatus.size() == worker->leaf_problem.num_cols); simplex_solver_settings_t lp_settings = settings_; lp_settings.concurrent_halt = &node_concurrent_halt_; @@ -1491,7 +1500,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( lp_start_time, worker->leaf_problem, lp_settings, - leaf_vstatus, + worker->leaf_vstatus, worker->basis_factors, worker->basic_list, worker->nonbasic_list, @@ -1509,7 +1518,7 @@ dual::status_t branch_and_bound_t::solve_node_lp( worker->basis_factors, worker->basic_list, worker->nonbasic_list, - leaf_vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms); lp_status = convert_lp_status_to_dual_status(second_status); @@ -1526,11 +1535,19 @@ dual::status_t branch_and_bound_t::solve_node_lp( return lp_status; } + template -void branch_and_bound_t::plunge_with(branch_and_bound_worker_t* worker) +void branch_and_bound_t::plunge_with(bfs_worker_t* worker, + mip_node_t* start_node) { - std::deque*> stack; - stack.push_front(worker->start_node); + assert(worker != nullptr && worker->is_active.load()); + assert(start_node != nullptr); + + // Stack holds at most 2 entries: the preferred child + its sibling. + // The sibling is evicted to the queue before a new pair of children is added. + circular_deque_t*> stack(2); + stack.push_front(start_node); + worker->recompute_basis = true; worker->recompute_bounds = true; worker->ensure_orbital_fixing(); @@ -1542,6 +1559,18 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t 0 && (solver_status_ == mip_status_t::UNSET && is_running_) && rel_gap > settings_.relative_mip_gap_tol && abs_gap > settings_.absolute_mip_gap_tol) { + if (worker->worker_id == 0) { repair_heuristic_solutions(); } + + if (worker->total_active_diving_workers < worker->total_max_diving_workers && + worker->node_queue.diving_queue_size() > 0) { + launch_diving_worker(worker); + } + + if (bfs_worker_pool_.num_idle() > 0 && worker->node_queue.best_first_queue_size() > 0) { + launch_bfs_worker(worker); + } + + assert(stack.size() <= 2); mip_node_t* node_ptr = stack.front(); stack.pop_front(); ++exploration_stats_.nodes_being_solved; @@ -1564,7 +1593,23 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t settings_.time_limit) { + f_t now = toc(exploration_stats_.start_time); + + if (worker->worker_id == 0) { + f_t time_since_last_log = + exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); + i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; + + if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && + time_since_last_log >= 1) || + (time_since_last_log > 30) || now > settings_.time_limit) { + report(' ', upper_bound_, lower_bound, node_ptr->depth, node_ptr->integer_infeasible); + exploration_stats_.last_log = tic(); + exploration_stats_.nodes_since_last_log = 0; + } + } + + if (now > settings_.time_limit) { solver_status_ = mip_status_t::TIME_LIMIT; stack.push_front(node_ptr); --exploration_stats_.nodes_being_solved; @@ -1615,22 +1660,22 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_t 0) { mip_node_t* node = stack.back(); stack.pop_back(); - node_queue_.push(node); + worker->node_queue.push_atomic(node); } exploration_stats_.nodes_unexplored += 2; if (round_dir == branch_direction_t::UP) { - if (node_queue_.best_first_queue_size() < min_node_queue_size_) { - node_queue_.push(node_ptr->get_down_child()); + if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) { + worker->node_queue.push_atomic(node_ptr->get_down_child()); } else { stack.push_front(node_ptr->get_down_child()); } stack.push_front(node_ptr->get_up_child()); } else { - if (node_queue_.best_first_queue_size() < min_node_queue_size_) { - node_queue_.push(node_ptr->get_up_child()); + if (worker->node_queue.best_first_queue_size() < min_node_queue_size_) { + worker->node_queue.push_atomic(node_ptr->get_up_child()); } else { stack.push_front(node_ptr->get_up_child()); } @@ -1645,28 +1690,144 @@ void branch_and_bound_t::plunge_with(branch_and_bound_worker_tnode_queue.push_atomic(node); + } + + // The worker is no longer exploring the tree. Set its lower bound to infinity to avoid + // interfering with the global lower bound calculation. + worker->lower_bound = std::numeric_limits::infinity(); +} + +template +void branch_and_bound_t::launch_bfs_worker(bfs_worker_t* worker) +{ + bfs_worker_t* idle_worker = bfs_worker_pool_.pop_idle_worker(); + if (!idle_worker) return; + + assert(idle_worker->is_active.load() == false); + assert(idle_worker->node_queue.best_first_queue_size() == 0); + + // Pre-emptively set the lower bound of the idle worker for the top of the heap + // so it is visible to all workers. + idle_worker->lower_bound = worker->node_queue.get_lower_bound(); + idle_worker->set_active(); + + bool success = idle_worker->node_queue.steal_from(worker->node_queue, 1); + + // Update to the actual lower bound of the stolen node (another worker may attempt to + // steal the same node at the same time) + idle_worker->lower_bound = idle_worker->node_queue.get_lower_bound(); + + // If the idle worker is set to active (i.e., its node queue has a valid node), + // launch a openmp task to run the best-first search for that worker + if (success) { +#pragma omp task affinity(*idle_worker) priority(CUOPT_CRITICAL_TASK_PRIORITY) default(none) \ + firstprivate(idle_worker) + best_first_search_with(idle_worker); + } else { + // The idle worker was not successfully initialized. This should occur + // rarely or even none at all. Keep here for safety. + bfs_worker_pool_.return_worker_to_pool(idle_worker); + } +} + +template +void branch_and_bound_t::work_stealing(bfs_worker_t* worker) +{ + i_t nodes_to_steal = settings_.bnb_nodes_per_steal >= 0 ? settings_.bnb_nodes_per_steal + : MIP_DEFAULT_NODES_PER_STEAL; + i_t max_attempts = settings_.bnb_max_steal_attempts >= 0 ? settings_.bnb_max_steal_attempts + : MIP_DEFAULT_MAX_STEAL_ATTEMPTS; + for (i_t i = 0; i < max_attempts; ++i) { + i_t victim_id = worker->rng.uniform(0, bfs_worker_pool_.size()); + bfs_worker_t* victim = bfs_worker_pool_[victim_id]; + if (worker->steal_from(victim, nodes_to_steal)) { break; } + } +} + +template +void branch_and_bound_t::best_first_search_with(bfs_worker_t* worker) +{ + f_t lower_bound = get_lower_bound(); + f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); + f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + f_t steal_chance = + settings_.bnb_steal_chance >= 0 ? settings_.bnb_steal_chance : MIP_DEFAULT_STEAL_CHANCE; + node_queue_t& node_queue = worker->node_queue; + + worker->calculate_num_diving_workers(bfs_worker_pool_.size(), + diving_worker_pool_.size(), + has_solver_space_incumbent(), + settings_.diving_settings); + + while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && + rel_gap > settings_.relative_mip_gap_tol && node_queue.best_first_queue_size() > 0) { + // If the guided diving was disabled previously due to the lack of an incumbent solution, + // re-enable as soon as a new incumbent is found. + if (diving_worker_pool_.size() > 0 && settings_.diving_settings.guided_diving != 0 && + worker->max_diving_workers[GUIDED_DIVING] == 0) { + if (has_solver_space_incumbent()) { + worker->calculate_num_diving_workers(bfs_worker_pool_.size(), + diving_worker_pool_.size(), + has_solver_space_incumbent(), + settings_.diving_settings); + } + } + + if (toc(exploration_stats_.start_time) > settings_.time_limit) { + solver_status_ = mip_status_t::TIME_LIMIT; + break; + } + + // Pre-emptively set the lower bound of the worker + worker->lower_bound = node_queue.get_lower_bound(); + mip_node_t* start_node = node_queue.pop(); + if (!start_node) continue; + worker->lower_bound = start_node->lower_bound; + + if (upper_bound_.load() < start_node->lower_bound) { + // This node was put on the heap earlier but its lower bound is now greater than the + // current upper bound + search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound); + search_tree_.update(start_node, node_status_t::FATHOMED); + --exploration_stats_.nodes_unexplored; + continue; + } + + plunge_with(worker, start_node); + + lower_bound = get_lower_bound(); + abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); + rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); + + if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { + node_concurrent_halt_ = 1; + solver_status_ = mip_status_t::OPTIMAL; + break; + } + + // Steal a node with some probability or when it is empty. The victim is determined at random. + if (node_queue.best_first_queue_size() == 0 || worker->rng.next_double() < steal_chance) { + work_stealing(worker); + } } - if (settings_.num_threads > 1) { - worker_pool_.return_worker_to_pool(worker); - active_workers_per_strategy_[BEST_FIRST]--; + // If the worker has still nodes in the queue (this can happen if it was stopped due to + // time limit, small gap or other reason), then do not add back to the pool to avoid + // constantly trying to start it again + if (worker->node_queue.best_first_queue_size() == 0) { + bfs_worker_pool_.return_worker_to_pool(worker); } } template -void branch_and_bound_t::dive_with(branch_and_bound_worker_t* worker) +void branch_and_bound_t::dive_with(diving_worker_t* worker) { raft::common::nvtx::range scope("BB::diving_thread"); if (worker->orbital_fixing) { worker->orbital_fixing->disable(); } @@ -1680,8 +1841,12 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t worker->recompute_basis = true; worker->recompute_bounds = true; - search_tree_t dive_tree(std::move(*worker->start_node)); - std::deque*> stack; + search_tree_t dive_tree(std::move(worker->start_node)); + + // Since we are perform a DFS with a limit amount of backtracking, the + // stack can hold at most `diving_backtrack_limit` + 2 siblings nodes of the + // current level + circular_deque_t*> stack(diving_backtrack_limit + 4); stack.push_front(&dive_tree.root); branch_and_bound_stats_t dive_stats; @@ -1708,7 +1873,10 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t continue; } - if (toc(exploration_stats_.start_time) > settings_.time_limit) { break; } + if (toc(exploration_stats_.start_time) > settings_.time_limit) { + solver_status_ = mip_status_t::TIME_LIMIT; + break; + } if (dive_stats.nodes_explored >= diving_node_limit) { break; } dual::status_t lp_status = solve_node_lp(node_ptr, worker, dive_stats, log); @@ -1718,7 +1886,6 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t solver_status_ = mip_status_t::TIME_LIMIT; break; } - if (lp_status == dual::status_t::CONCURRENT_LIMIT) { break; } if (lp_status == dual::status_t::ITERATION_LIMIT) { break; } @@ -1737,9 +1904,10 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t } } - // Remove nodes that we no longer can backtrack to (i.e., from the current node, we can only + // Remove nodes that we can no longer backtrack to (i.e., from the current node, we can only // backtrack to a node that is has a depth of at most 5 levels lower than the current node). - if (stack.size() > 1 && stack.front()->depth - stack.back()->depth > diving_backtrack_limit) { + while (stack.size() > 1 && + stack.front()->depth - stack.back()->depth > diving_backtrack_limit) { stack.pop_back(); } @@ -1749,220 +1917,69 @@ void branch_and_bound_t::dive_with(branch_and_bound_worker_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound, lower_bound); } - worker_pool_.return_worker_to_pool(worker); - active_workers_per_strategy_[search_strategy]--; + diving_worker_pool_.return_worker_to_pool(worker); } template -void branch_and_bound_t::run_scheduler() +bool branch_and_bound_t::launch_diving_worker(bfs_worker_t* bfs_worker) { - diving_heuristics_settings_t diving_settings = settings_.diving_settings; - const i_t num_workers = 2 * settings_.num_threads; - - if (!has_solver_space_incumbent()) { diving_settings.guided_diving = false; } - std::vector strategies = get_search_strategies(diving_settings); - std::array max_num_workers_per_type = - get_max_workers(num_workers, strategies); - - worker_pool_.init(num_workers, original_lp_, Arow_, var_types_, symmetry_, settings_); - active_workers_per_strategy_.fill(0); - -#ifdef CUOPT_LOG_DEBUG - for (auto strategy : strategies) { - settings_.log.debug("%c%d: max num of workers = %d", - feasible_solution_symbol(strategy), - strategy, - max_num_workers_per_type[strategy]); + // Get an idle worker. + diving_worker_t* diving_worker = diving_worker_pool_.pop_idle_worker(); + if (diving_worker == nullptr) { return false; } + + bool success = bfs_worker->node_queue.diving_init(original_lp_, + diving_worker->start_node, + diving_worker->start_lower, + diving_worker->start_upper, + diving_worker->bounds_changed); + if (!success) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } -#endif - - f_t lower_bound = get_lower_bound(); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - i_t last_node_depth = 0; - i_t last_int_infeas = 0; - - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && - (active_workers_per_strategy_[0] > 0 || node_queue_.best_first_queue_size() > 0)) { - bool launched_any_task = false; - - repair_heuristic_solutions(); - - // If the guided diving was disabled previously due to the lack of an incumbent solution, - // re-enable as soon as a new incumbent is found. - if (settings_.diving_settings.guided_diving != diving_settings.guided_diving) { - if (has_solver_space_incumbent()) { - diving_settings.guided_diving = settings_.diving_settings.guided_diving; - strategies = get_search_strategies(diving_settings); - max_num_workers_per_type = get_max_workers(num_workers, strategies); - -#ifdef CUOPT_LOG_DEBUG - for (auto type : strategies) { - settings_.log.debug("%c%d: max num of workers = %d", - feasible_solution_symbol(type), - type, - max_num_workers_per_type[type]); - } -#endif - } - } - - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; - - if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - i_t queue_size = node_queue_.best_first_queue_size(); - i_t depth = queue_size > 0 ? node_queue_.bfs_top()->depth : last_node_depth; - i_t int_infeas = queue_size > 0 ? node_queue_.bfs_top()->integer_infeasible : last_int_infeas; - report(' ', upper_bound_, lower_bound, depth, int_infeas); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } - - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - break; - } - - for (auto strategy : strategies) { - if (active_workers_per_strategy_[strategy] >= max_num_workers_per_type[strategy]) { - continue; - } - - // Get an idle worker. - branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); - if (worker == nullptr) { break; } - - if (strategy == BEST_FIRST) { - // If there any node left in the heap, we pop the top node and explore it. - std::optional*> start_node = node_queue_.pop_best_first(); - - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } - - // Remove the worker from the idle list. - worker_pool_.pop_idle_worker(); - worker->init_best_first(start_node.value(), original_lp_); - last_node_depth = start_node.value()->depth; - last_int_infeas = start_node.value()->integer_infeasible; - active_workers_per_strategy_[strategy]++; - launched_any_task = true; - -#pragma omp task affinity(worker) default(none) firstprivate(worker) - plunge_with(worker); - - } else { - std::optional*> start_node = node_queue_.pop_diving(); - - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound || - start_node.value()->depth < diving_settings.min_node_depth) { - continue; - } - bool is_feasible = - worker->init_diving(start_node.value(), strategy, original_lp_, settings_); - if (!is_feasible) { continue; } - - // Remove the worker from the idle list. - worker_pool_.pop_idle_worker(); - active_workers_per_strategy_[strategy]++; - launched_any_task = true; - -#pragma omp task affinity(worker) default(none) firstprivate(worker) - dive_with(worker); - } - } - - lower_bound = get_lower_bound(); - abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - - if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { - node_concurrent_halt_ = 1; - solver_status_ = mip_status_t::OPTIMAL; - break; - } - - // If no new task was launched in this iteration, suspend temporarily the - // execution of the scheduler. As of 8/Jan/2026, GCC does not - // implement taskyield, but LLVM does. - if (!launched_any_task) { std::this_thread::sleep_for(std::chrono::milliseconds(1)); } + if (upper_bound_.load() < diving_worker->start_node.lower_bound || + diving_worker->start_node.depth < settings_.diving_settings.min_node_depth) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } -} - -template -void branch_and_bound_t::single_threaded_solve() -{ - raft::common::nvtx::range scope("BB::single_threaded_solve"); - worker_pool_.init(1, original_lp_, Arow_, var_types_, symmetry_, settings_); - branch_and_bound_worker_t* worker = worker_pool_.get_idle_worker(); - - f_t lower_bound = get_lower_bound(); - f_t abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - f_t rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); - while (solver_status_ == mip_status_t::UNSET && abs_gap > settings_.absolute_mip_gap_tol && - rel_gap > settings_.relative_mip_gap_tol && node_queue_.best_first_queue_size() > 0) { - repair_heuristic_solutions(); + bool is_feasible = diving_worker->presolve_start_bounds(settings_); + if (!is_feasible) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; + } - f_t now = toc(exploration_stats_.start_time); - f_t time_since_last_log = - exploration_stats_.last_log == 0 ? 1.0 : toc(exploration_stats_.last_log); - i_t nodes_since_last_log = exploration_stats_.nodes_since_last_log; - - if (((nodes_since_last_log >= 1000 || abs_gap < 10 * settings_.absolute_mip_gap_tol) && - time_since_last_log >= 1) || - (time_since_last_log > 30) || now > settings_.time_limit) { - i_t depth = node_queue_.bfs_top()->depth; - i_t int_infeas = node_queue_.bfs_top()->integer_infeasible; - report(' ', upper_bound_, lower_bound, depth, int_infeas); - exploration_stats_.last_log = tic(); - exploration_stats_.nodes_since_last_log = 0; - } + if (toc(exploration_stats_.start_time) > settings_.time_limit || + solver_status_ != mip_status_t::UNSET) { + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; + } - if (now > settings_.time_limit) { - solver_status_ = mip_status_t::TIME_LIMIT; - break; - } + for (int i = 1; i < num_search_strategies; ++i) { + auto strategy = search_strategies[i]; - // If there any node left in the heap, we pop the top node and explore it. - std::optional*> start_node = node_queue_.pop_best_first(); + if (bfs_worker->active_diving_workers[strategy] < bfs_worker->max_diving_workers[strategy]) { + diving_worker->search_strategy = strategy; + diving_worker->bfs_worker = bfs_worker; + diving_worker->set_active(); + bfs_worker->active_diving_workers[strategy]++; + bfs_worker->total_active_diving_workers++; - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } + assert(bfs_worker->active_diving_workers[strategy].load() <= + bfs_worker->max_diving_workers[strategy]); + assert(bfs_worker->total_active_diving_workers.load() <= + bfs_worker->total_max_diving_workers); - worker->init_best_first(start_node.value(), original_lp_); - plunge_with(worker); - - lower_bound = get_lower_bound(); - abs_gap = compute_user_abs_gap(original_lp_, upper_bound_.load(), lower_bound); - rel_gap = user_relative_gap(original_lp_, upper_bound_.load(), lower_bound); +#pragma omp task affinity(*diving_worker) priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) \ + firstprivate(diving_worker) + dive_with(diving_worker); - if (abs_gap <= settings_.absolute_mip_gap_tol || rel_gap <= settings_.relative_mip_gap_tol) { - solver_status_ = mip_status_t::OPTIMAL; - break; + return true; } } + + diving_worker_pool_.return_worker_to_pool(diving_worker); + return false; } template @@ -1983,7 +2000,7 @@ lp_status_t branch_and_bound_t::solve_root_relaxation( lp_status_t root_status; // Launch a task for solving the root LP relaxation via dual simplex. -#pragma omp task default(shared) depend(out : root_status) +#pragma omp task default(shared) depend(out : root_status) priority(CUOPT_CRITICAL_TASK_PRIORITY) { root_status = solve_linear_program_with_advanced_basis(original_lp_, exploration_stats_.start_time, @@ -2456,7 +2473,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut tolerances_for_clique.absolute_mip_gap = settings_.absolute_mip_gap_tol; tolerances_for_clique.relative_mip_gap = settings_.relative_mip_gap_tol; -#pragma omp task depend(out : *clique_signal) firstprivate(tolerances_for_clique) +#pragma omp task priority(CUOPT_DEFAULT_TASK_PRIORITY) depend(out : *clique_signal) \ + firstprivate(tolerances_for_clique) { user_problem_t problem_copy = original_problem_; timer_t timer(std::numeric_limits::infinity()); @@ -2576,14 +2594,13 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut return mip_status_t::OPTIMAL; } - is_running_ = true; - lower_bound_ceiling_ = inf; + is_running_ = true; + lower_bound_numerical_ = inf; if (num_fractional != 0 && settings_.max_cut_passes > 0) { settings_.log.printf( - " | Explored | Unexplored | Objective | Bound | IntInf | Depth | Iter/Node | " - "Gap " - "| Time |\n"); + " | Explored | Unexplored | Objective | Bound | IntInf | Depth | Iter/Node | " + "Gap | Time |\n"); } cut_pool_t cut_pool(original_lp_.num_cols, settings_); @@ -2621,7 +2638,7 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut if (settings_.deterministic) { queue_external_solution_deterministic(user_assignment, work_units); } else { - set_new_solution(user_assignment); + set_solution_from_heuristics(user_assignment); } }; auto stop_root_cut_cpufj = [&]() { @@ -2643,7 +2660,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut cut_pass_result_t cut_pass_result; if (root_cut_cpufj_task) { -#pragma omp task shared(root_cut_cpufj_task) default(none) depend(out : *root_cut_cpufj_task) +#pragma omp task shared(root_cut_cpufj_task) priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) \ + depend(out : *root_cut_cpufj_task) detail::run_fj_cpu_task(*root_cut_cpufj_task, std::numeric_limits::infinity(), std::numeric_limits::infinity()); @@ -2821,8 +2839,6 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut root_vstatus_, original_lp_, log); - node_queue_.push(search_tree_.root.get_down_child()); - node_queue_.push(search_tree_.root.get_up_child()); if (symmetry_ != nullptr) { i_t removed = @@ -2843,11 +2859,12 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.nodes_unexplored = 2; exploration_stats_.nodes_since_last_log = 0; exploration_stats_.last_log = tic(); - min_node_queue_size_ = 2 * settings_.num_threads; + min_node_queue_size_ = 20; if (settings_.diving_settings.coefficient_diving != 0) { calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); } + if (settings_.deterministic) { settings_.log.printf( " | Explored | Unexplored | Objective | Bound | IntInf | Depth | Iter/Node " @@ -2862,10 +2879,29 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut { if (settings_.deterministic) { run_deterministic_coordinator(Arow_); - } else if (settings_.num_threads > 1) { - run_scheduler(); } else { - single_threaded_solve(); + const i_t num_workers = settings_.num_threads; + const i_t num_bfs_workers = std::max(settings_.num_threads / 2, 1); + const i_t num_diving_workers = num_workers - num_bfs_workers; + bfs_worker_pool_.init(num_bfs_workers, original_lp_, Arow_, var_types_, symmetry_, settings_); + + if (num_diving_workers > 0) { + diving_worker_pool_.init(num_diving_workers, + original_lp_, + Arow_, + var_types_, + symmetry_, + settings_, + num_bfs_workers); + } + + bfs_worker_t* initial_worker = bfs_worker_pool_.pop_idle_worker(); + node_queue_t& node_queue = initial_worker->node_queue; + node_queue.push_lockfree(search_tree_.root.get_down_child()); + node_queue.push_lockfree(search_tree_.root.get_up_child()); + initial_worker->lower_bound = initial_worker->node_queue.get_lower_bound(); + initial_worker->set_active(); + best_first_search_with(initial_worker); } } // Implicit barrier for all tasks created within the group (RINS, B&B workers) @@ -2877,31 +2913,28 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut lower_bound = deterministic_compute_lower_bound(); solver_status_ = deterministic_global_termination_status_; } else { - if (node_queue_.best_first_queue_size() > 0) { + lower_bound = lower_bound_numerical_; + + for (int i = 0; i < bfs_worker_pool_.size(); ++i) { + bfs_worker_t* worker = bfs_worker_pool_[i]; + // We need to clear the queue and use the info in the search tree for the lower bound - while (node_queue_.best_first_queue_size() > 0) { - std::optional*> start_node = node_queue_.pop_best_first(); - - if (!start_node.has_value()) { continue; } - if (upper_bound_.load() < start_node.value()->lower_bound) { - // This node was put on the heap earlier but its lower bound is now greater than the - // current upper bound - search_tree_.graphviz_node( - settings_.log, start_node.value(), "cutoff", start_node.value()->lower_bound); - search_tree_.update(start_node.value(), node_status_t::FATHOMED); - continue; - } else { - node_queue_.push( - start_node.value()); // Needed to ensure we don't lose the correct lower bound - break; - } + while (worker->node_queue.best_first_queue_size() > 0 && + worker->node_queue.get_lower_bound() > upper_bound_.load()) { + mip_node_t* start_node = worker->node_queue.pop(); + // This node was put on the heap earlier but its lower bound is now greater than the + // current upper bound + search_tree_.graphviz_node(settings_.log, start_node, "cutoff", start_node->lower_bound); + search_tree_.update(start_node, node_status_t::FATHOMED); + --exploration_stats_.nodes_unexplored; } - lower_bound = node_queue_.best_first_queue_size() > 0 ? node_queue_.get_lower_bound() - : search_tree_.root.lower_bound; - } else { - lower_bound = search_tree_.root.lower_bound; + + lower_bound = std::min(lower_bound, worker->node_queue.get_lower_bound()); } + + if (!std::isfinite(lower_bound)) { lower_bound = search_tree_.root.lower_bound; } } + set_final_solution(solution, lower_bound); return solver_status_; } @@ -3030,17 +3063,9 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri deterministic_horizon_step_ = 0.50; // Compute worker counts using the same formula as reliability-branching scheduler - const i_t num_workers = settings_.num_threads; - std::vector search_strategies = - get_search_strategies(settings_.diving_settings); - std::array max_num_workers = - get_max_workers(num_workers, search_strategies); - - const int num_bfs_workers = max_num_workers[search_strategy_t::BEST_FIRST]; - int num_diving_workers = 0; - for (size_t i = 1; i < search_strategies.size(); ++i) { - num_diving_workers += max_num_workers[search_strategies[i]]; - } + const i_t num_workers = settings_.num_threads; + const i_t num_bfs_workers = std::max(num_workers / 2, 1); + const i_t num_diving_workers = num_workers - num_bfs_workers; deterministic_mode_enabled_ = true; deterministic_current_horizon_ = deterministic_horizon_step_; @@ -3052,8 +3077,8 @@ void branch_and_bound_t::run_deterministic_coordinator(const csr_matri if (num_diving_workers > 0) { // Extract diving types from search_strategies (skip BEST_FIRST at index 0) - std::vector diving_types(search_strategies.begin() + 1, - search_strategies.end()); + std::vector diving_types(search_strategies + 1, + search_strategies + num_search_strategies); if (settings_.diving_settings.coefficient_diving != 0) { calculate_variable_locks(original_lp_, var_up_locks_, var_down_locks_); @@ -3443,10 +3468,10 @@ node_status_t branch_and_bound_t::solve_node_deterministic( // Solve LP relaxation worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); - std::vector& leaf_vstatus = node_ptr->vstatus; - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols, worker.leaf_vstatus); + i_t node_iter = 0; + f_t lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, @@ -3454,7 +3479,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( lp_start_time, worker.leaf_problem, lp_settings, - leaf_vstatus, + worker.leaf_vstatus, worker.basis_factors, worker.basic_list, worker.nonbasic_list, @@ -3472,7 +3497,7 @@ node_status_t branch_and_bound_t::solve_node_deterministic( worker.basis_factors, worker.basic_list, worker.nonbasic_list, - leaf_vstatus, + worker.leaf_vstatus, leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); @@ -3722,7 +3747,7 @@ void branch_and_bound_t::deterministic_sort_replay_events( deterministic_merge_pseudo_cost_updates(*deterministic_workers_); for (const auto& worker : *deterministic_workers_) { - fetch_min(lower_bound_ceiling_, worker.local_lower_bound_ceiling); + fetch_min(lower_bound_numerical_, worker.local_lower_bound_ceiling); } } @@ -3826,7 +3851,7 @@ template f_t branch_and_bound_t::deterministic_compute_lower_bound() { // Compute lower bound from BFS worker local structures only - f_t lower_bound = lower_bound_ceiling_.load(); + f_t lower_bound = lower_bound_numerical_.load(); // Check all BFS worker queues for (const auto& worker : *deterministic_workers_) { @@ -3930,8 +3955,10 @@ void branch_and_bound_t::deterministic_assign_diving_nodes() continue; // this worker is full, try next one } - auto entry = diving_heap_.pop(); - if (entry.has_value()) { worker.enqueue_dive_node(entry.value().node, original_lp_); } + if (!diving_heap_.empty()) { + auto entry = diving_heap_.pop(); + worker.enqueue_dive_node(entry.node, original_lp_); + } } diving_heap_.clear(); @@ -3981,11 +4008,6 @@ void branch_and_bound_t::deterministic_dive( { raft::common::nvtx::range scope("BB::deterministic_dive"); - // Create local search tree for the dive - search_tree_t dive_tree(std::move(entry.node)); - std::deque*> stack; - stack.push_front(&dive_tree.root); - worker.dive_lower = std::move(entry.resolved_lower); worker.dive_upper = std::move(entry.resolved_upper); @@ -3995,6 +4017,11 @@ void branch_and_bound_t::deterministic_dive( worker.lp_iters_this_dive = 0; worker.recompute_bounds_and_basis = true; + // Create local search tree for the dive + search_tree_t dive_tree(std::move(entry.node)); + circular_deque_t*> stack(2 * max_backtrack_depth + 4); + stack.push_front(&dive_tree.root); + while (!stack.empty() && deterministic_global_termination_status_ == mip_status_t::UNSET && nodes_this_dive < max_nodes_per_dive) { mip_node_t* node_ptr = stack.front(); @@ -4055,18 +4082,18 @@ void branch_and_bound_t::deterministic_dive( // Solve LP relaxation worker.leaf_solution.resize(worker.leaf_problem.num_rows, worker.leaf_problem.num_cols); - std::vector& leaf_vstatus = node_ptr->vstatus; - i_t node_iter = 0; - f_t lp_start_time = tic(); - std::vector leaf_edge_norms = edge_norms_; + i_t node_iter = 0; + f_t lp_start_time = tic(); + std::vector leaf_edge_norms = edge_norms_; + decompress_vstatus(node_ptr->packed_vstatus, worker.leaf_problem.num_cols, worker.leaf_vstatus); dual::status_t lp_status = dual_phase2_with_advanced_basis(2, 0, worker.recompute_bounds_and_basis, lp_start_time, worker.leaf_problem, lp_settings, - leaf_vstatus, + worker.leaf_vstatus, worker.basis_factors, worker.basic_list, worker.nonbasic_list, @@ -4083,7 +4110,7 @@ void branch_and_bound_t::deterministic_dive( worker.basis_factors, worker.basic_list, worker.nonbasic_list, - leaf_vstatus, + worker.leaf_vstatus, leaf_edge_norms, &worker.work_context); lp_status = convert_lp_status_to_dual_status(second_status); diff --git a/cpp/src/branch_and_bound/branch_and_bound.hpp b/cpp/src/branch_and_bound/branch_and_bound.hpp index 839335c8c9..0b32b2ece5 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.hpp +++ b/cpp/src/branch_and_bound/branch_and_bound.hpp @@ -112,7 +112,7 @@ class branch_and_bound_t { } // Set a solution based on the user problem during the course of the solve - void set_new_solution(const std::vector& solution); + void set_solution_from_heuristics(const std::vector& solution); // This queues the solution to be processed at the correct work unit timestamp void queue_external_solution_deterministic(const std::vector& solution, double work_unit_ts); @@ -239,18 +239,14 @@ class branch_and_bound_t { // Pseudocosts pseudo_costs_t pc_; - // Heap storing the nodes waiting to be explored. - node_queue_t node_queue_; - // Search tree search_tree_t search_tree_; - // Count the number of workers per type that either are being executed or - // are waiting to be executed. - std::array, num_search_strategies> active_workers_per_strategy_; + // Worker pool dedicated to the best-first search + bfs_worker_pool_t bfs_worker_pool_; - // Worker pool - branch_and_bound_worker_pool_t worker_pool_; + // Worker pool dedicated to diving + diving_worker_pool_t diving_worker_pool_; // Global status of the solver. omp_atomic_t solver_status_; @@ -262,8 +258,9 @@ class branch_and_bound_t { i_t min_node_queue_size_; // In case, a best-first thread encounters a numerical issue when solving a node, - // its blocks the progression of the lower bound. - omp_atomic_t lower_bound_ceiling_; + // its blocks the progression of the lower bound as it cannot explore the + // corresponding subtree. + omp_atomic_t lower_bound_numerical_; std::function user_bound_callback_; void report_heuristic(f_t obj); @@ -317,22 +314,26 @@ class branch_and_bound_t { // Repairs low-quality solutions from the heuristics, if it is applicable. void repair_heuristic_solutions(); + // Launch a new diving worker from a given best-first worker. + bool launch_diving_worker(bfs_worker_t* bfs_worker); + + // Launch a new best-first worker from a given bfs worker. + void launch_bfs_worker(bfs_worker_t* worker); + + // Perform best-first search with a given bfs worker. + void best_first_search_with(bfs_worker_t* worker); + // We use best-first to pick the `start_node` and then perform a depth-first search // from this node (i.e., a plunge). It can only backtrack to a sibling node. // Unexplored nodes in the subtree are inserted back into the global heap. - void plunge_with(branch_and_bound_worker_t* worker); + void plunge_with(bfs_worker_t* worker, mip_node_t* start_node); + + // A worker attempts to steal nodes from another worker + void work_stealing(bfs_worker_t* worker); // Perform a deep dive in the subtree determined by the `start_node` in order // to find integer feasible solutions. - void dive_with(branch_and_bound_worker_t* worker); - - // Run the scheduler whose will schedule and manage - // all the other workers. - void run_scheduler(); - - // Run the branch-and-bound algorithm in single threaded mode. - // This disable all diving heuristics. - void single_threaded_solve(); + void dive_with(diving_worker_t* worker); // Solve the LP relaxation of a leaf node dual::status_t solve_node_lp(mip_node_t* node_ptr, diff --git a/cpp/src/branch_and_bound/constants.hpp b/cpp/src/branch_and_bound/constants.hpp index 39bfa0bf3a..fed422ef89 100644 --- a/cpp/src/branch_and_bound/constants.hpp +++ b/cpp/src/branch_and_bound/constants.hpp @@ -24,6 +24,9 @@ enum search_strategy_t : int { COEFFICIENT_DIVING = 4 // Coefficient diving (9.2.1) }; +constexpr search_strategy_t search_strategies[] = { + BEST_FIRST, PSEUDOCOST_DIVING, LINE_SEARCH_DIVING, GUIDED_DIVING, COEFFICIENT_DIVING}; + enum class branch_direction_t { NONE = -1, DOWN = 0, UP = 1 }; enum class branch_and_bound_mode_t { PARALLEL = 0, DETERMINISTIC = 1 }; diff --git a/cpp/src/branch_and_bound/deterministic_workers.hpp b/cpp/src/branch_and_bound/deterministic_workers.hpp index 88c4f0e940..fb1c0f450f 100644 --- a/cpp/src/branch_and_bound/deterministic_workers.hpp +++ b/cpp/src/branch_and_bound/deterministic_workers.hpp @@ -195,8 +195,8 @@ class deterministic_bfs_worker_t plunge_stack.pop_front(); return node; } - auto node_opt = backlog.pop(); - return node_opt.has_value() ? node_opt.value() : nullptr; + + return !backlog.empty() ? backlog.pop() : nullptr; } size_t queue_size() const diff --git a/cpp/src/branch_and_bound/diving_heuristics.cpp b/cpp/src/branch_and_bound/diving_heuristics.cpp index a0bb731c1e..4989e48c6a 100644 --- a/cpp/src/branch_and_bound/diving_heuristics.cpp +++ b/cpp/src/branch_and_bound/diving_heuristics.cpp @@ -92,7 +92,7 @@ branch_variable_t pseudocost_diving(pseudo_costs_t& pc, f_t score = 0; branch_direction_t dir = branch_direction_t::DOWN; - f_t root_val = (j < static_cast(root_solution.size())) ? root_solution[j] : solution[j]; + f_t root_val = root_solution[j]; if (solution[j] < root_val - f_t(0.4)) { score = score_down; diff --git a/cpp/src/branch_and_bound/mip_node.hpp b/cpp/src/branch_and_bound/mip_node.hpp index 39150eb4fe..2c0968ffce 100644 --- a/cpp/src/branch_and_bound/mip_node.hpp +++ b/cpp/src/branch_and_bound/mip_node.hpp @@ -91,7 +91,7 @@ class mip_node_t { branch_var_upper(std::numeric_limits::infinity()), fractional_val(std::numeric_limits::infinity()), objective_estimate(std::numeric_limits::infinity()), - vstatus(0) + packed_vstatus(0) { children[0] = nullptr; children[1] = nullptr; @@ -107,7 +107,7 @@ class mip_node_t { branch_dir(branch_direction_t::NONE), integer_infeasible(-1), objective_estimate(std::numeric_limits::infinity()), - vstatus(basis) + packed_vstatus(compress_vstatus(basis)) { children[0] = nullptr; children[1] = nullptr; @@ -131,7 +131,7 @@ class mip_node_t { fractional_val(branch_var_value), integer_infeasible(integer_inf), objective_estimate(parent_node->objective_estimate), - vstatus(basis) + packed_vstatus(compress_vstatus(basis)) { branch_var_lower = branch_direction == branch_direction_t::DOWN ? problem.lower[branch_var] : std::ceil(branch_var_value); @@ -186,7 +186,7 @@ class mip_node_t { children[0] = std::move(down_child); children[1] = std::move(up_child); // When we add children we no longer need to store our basis - vstatus.clear(); + packed_vstatus = {}; } bool is_inactive() const @@ -274,7 +274,7 @@ class mip_node_t { // This method creates a copy of the current node // with its parent set to `nullptr` // This detaches the node from the tree. - mip_node_t detach_copy() const + mip_node_t detach_copy() const { mip_node_t copy; copy.lower_bound = lower_bound; @@ -282,7 +282,7 @@ class mip_node_t { copy.depth = depth; copy.node_id = node_id; copy.integer_infeasible = integer_infeasible; - copy.vstatus = vstatus; + copy.packed_vstatus = packed_vstatus; copy.branch_var = branch_var; copy.branch_dir = branch_dir; copy.branch_var_lower = branch_var_lower; @@ -315,7 +315,11 @@ class mip_node_t { mip_node_t* parent; std::unique_ptr children[2]; - std::vector vstatus; + std::vector packed_vstatus; + + // Indicate if we can dive from this node or not. This is set to false when + // this node was already selected for diving once. + omp_atomic_t can_dive{true}; // Cumulative orbital fixing bound changes from root to this node. // Stored so that when a child starts a new plunge, the parent's diff --git a/cpp/src/branch_and_bound/node_queue.hpp b/cpp/src/branch_and_bound/node_queue.hpp index 09d030c96e..b07b371b81 100644 --- a/cpp/src/branch_and_bound/node_queue.hpp +++ b/cpp/src/branch_and_bound/node_queue.hpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include @@ -29,12 +28,14 @@ class heap_t { { buffer.push_back(node); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; } void push(T&& node) { buffer.push_back(std::move(node)); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; } template @@ -42,39 +43,125 @@ class heap_t { { buffer.emplace_back(std::forward(args)...); std::push_heap(buffer.begin(), buffer.end(), comp); + ++num_entries_; + assert(num_entries_.load() == buffer.size()); } - std::optional pop() + T pop() { - if (buffer.empty()) return std::nullopt; - + --num_entries_; std::pop_heap(buffer.begin(), buffer.end(), comp); T node = std::move(buffer.back()); buffer.pop_back(); + assert(num_entries_.load() == buffer.size()); return node; } - size_t size() const { return buffer.size(); } + size_t size() const { return num_entries_; } T& top() { return buffer.front(); } - void clear() { buffer.clear(); } - bool empty() const { return buffer.empty(); } + + void clear() + { + buffer.clear(); + num_entries_ = 0; + } + + bool empty() const { return num_entries_ == 0; } // Read-only access to underlying buffer for iteration without modification const std::vector& data() const { return buffer; } private: std::vector buffer; + omp_atomic_t num_entries_{0}; Comp comp; }; -// A queue storing the nodes waiting to be explored/dived from. +// A queue storing the nodes waiting to be explored. template class node_queue_t { + public: + void push_atomic(mip_node_t* new_node) + { + std::lock_guard lock(mutex_); + push_lockfree(new_node); + } + + void push_lockfree(mip_node_t* new_node) + { + assert(new_node != nullptr); + auto entry = std::make_shared(new_node); + best_first_heap_.push(entry); + if (new_node->can_dive) diving_heap_.push(entry); + lower_bound_ = best_first_heap_.top()->lower_bound; + } + + mip_node_t* pop() + { + std::lock_guard lock(mutex_); + if (best_first_heap_.empty()) { return nullptr; } + auto entry = best_first_heap_.pop(); + lower_bound_ = best_first_heap_.empty() ? std::numeric_limits::infinity() + : best_first_heap_.top()->lower_bound; + mip_node_t* node = std::exchange(entry->node, nullptr); + assert(node != nullptr); + return node; + } + + bool diving_init(const lp_problem_t& lp, + mip_node_t& start_node, + std::vector& start_lower, + std::vector& start_upper, + std::vector& bounds_changed) + { + std::lock_guard lock(mutex_); + + auto node = pop_diving(); + if (!node) return false; + + start_node = node->detach_copy(); + start_lower = lp.lower; + start_upper = lp.upper; + std::fill(bounds_changed.begin(), bounds_changed.end(), false); + node->get_variable_bounds(start_lower, start_upper, bounds_changed); + return true; + } + + bool steal_from(node_queue_t& victim, i_t nodes_to_steal) + { + assert(this != &victim); + + bool steal = false; + std::scoped_lock lock(mutex_, victim.mutex_); + + for (i_t k = 0; k < nodes_to_steal; ++k) { + if (victim.best_first_heap_.size() < nodes_to_steal) break; + auto entry = victim.best_first_heap_.pop(); + + // Invalidate the node for diving + mip_node_t* node = std::exchange(entry->node, nullptr); + push_lockfree(node); + victim.lower_bound_ = victim.best_first_heap_.empty() + ? std::numeric_limits::infinity() + : victim.best_first_heap_.top()->lower_bound; + steal = true; + } + return steal; + } + + i_t diving_queue_size() { return diving_heap_.size(); } + i_t best_first_queue_size() { return best_first_heap_.size(); } + + f_t get_lower_bound() + { + return best_first_heap_.empty() ? std::numeric_limits::infinity() : lower_bound_.load(); + } + private: struct heap_entry_t { mip_node_t* node = nullptr; - f_t lower_bound = -inf; - f_t score = inf; + f_t lower_bound = -std::numeric_limits::infinity(); + f_t score = std::numeric_limits::infinity(); heap_entry_t(mip_node_t* new_node) : node(new_node), lower_bound(new_node->lower_bound), score(new_node->objective_estimate) @@ -102,67 +189,24 @@ class node_queue_t { } }; - heap_t, lower_bound_comp> best_first_heap; - heap_t, score_comp> diving_heap; - omp_mutex_t mutex; - - public: - void push(mip_node_t* new_node) - { - std::lock_guard lock(mutex); - auto entry = std::make_shared(new_node); - best_first_heap.push(entry); - diving_heap.push(entry); - } - - std::optional*> pop_best_first() - { - std::lock_guard lock(mutex); - auto entry = best_first_heap.pop(); - - if (entry.has_value()) { return std::exchange(entry.value()->node, nullptr); } - - return std::nullopt; - } - - std::optional*> pop_diving() + mip_node_t* pop_diving() { - std::lock_guard lock(mutex); - - while (!diving_heap.empty()) { - auto entry = diving_heap.pop(); - - if (entry.has_value()) { - if (auto node_ptr = entry.value()->node; node_ptr != nullptr) { return node_ptr; } + while (!diving_heap_.empty()) { + auto entry = diving_heap_.pop(); + if (entry->node != nullptr) { + entry->node->can_dive = false; + return entry->node; } } - return std::nullopt; + return nullptr; } - i_t diving_queue_size() - { - std::lock_guard lock(mutex); - return diving_heap.size(); - } - - i_t best_first_queue_size() - { - std::lock_guard lock(mutex); - return best_first_heap.size(); - } + heap_t, lower_bound_comp> best_first_heap_; + heap_t, score_comp> diving_heap_; + omp_mutex_t mutex_; - f_t get_lower_bound() - { - std::lock_guard lock(mutex); - return best_first_heap.empty() ? inf : best_first_heap.top()->lower_bound; - } - - mip_node_t* bfs_top() - { - std::lock_guard lock(mutex); - return best_first_heap.empty() ? nullptr : best_first_heap.top()->node; - } + omp_atomic_t lower_bound_{std::numeric_limits::infinity()}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index e71ec987d3..fc3ee3bb3f 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -1080,7 +1080,7 @@ void strong_branching_reduced(const lp_problem_t& original_lp, strong_branch_up); } else { if (effective_batch_pdlp != 0) { -#pragma omp task default(shared) +#pragma omp task default(shared) priority(CUOPT_HIGH_TASK_PRIORITY) batch_pdlp_strong_branching_task(settings, effective_batch_pdlp, start_time, @@ -1100,10 +1100,9 @@ void strong_branching_reduced(const lp_problem_t& original_lp, i_t n = std::min(4 * settings.num_threads, fractional.size()); // Here we are creating more tasks than the number of threads // such that they can be scheduled dynamically to the threads. -#pragma omp taskloop num_tasks(n) default(shared) +#pragma omp taskloop num_tasks(n) default(shared) priority(CUOPT_DEFAULT_TASK_PRIORITY) for (i_t k = 0; k < n; ++k) { - i_t start = std::floor(k * fractional.size() / n); - i_t end = std::floor((k + 1) * fractional.size() / n); + auto [start, end] = calculate_index_range(k, fractional.size(), n); constexpr bool verbose = false; if (verbose) { @@ -1506,8 +1505,7 @@ i_t pseudo_costs_t::reliable_variable_selection( // If `reliable_threshold == 0`, then we set the uninitialized pseudocosts to the average. // Otherwise, the best ones are initialized via strong branching, while the other are ignored. // - // In the latter, we are not using the average pseudocost (which calculated in the `initialized` - // method). + // So we only need to initialize the average for the former. if (reliable_threshold == 0) { avg_down = compute_pseudocost_average_down(); avg_up = compute_pseudocost_average_up(); @@ -1613,14 +1611,12 @@ i_t pseudo_costs_t::reliable_variable_selection( min_percent_solved_by_batch_pdlp_at_root_for_pdlp); } - const int num_tasks = std::max(max_num_tasks, 1); - const int task_priority = reliability_branching_settings.task_priority; + const int num_tasks = std::max(max_num_tasks, 1); // If both batch PDLP and DS are used we double the max number of candidates const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates : reliability_branching_settings.max_num_candidates; const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); - assert(task_priority > 0); assert(max_num_candidates > 0); assert(num_candidates > 0); assert(num_tasks > 0); @@ -1662,7 +1658,7 @@ i_t pseudo_costs_t::reliable_variable_selection( single_pivot_objective_change_estimate(worker->leaf_problem, settings, local_Arow, - node_ptr->vstatus, + worker->leaf_vstatus, j, basic_map[j], leaf_solution, @@ -1714,7 +1710,7 @@ i_t pseudo_costs_t::reliable_variable_selection( std::atomic concurrent_halt{0}; if (use_pdlp) { -#pragma omp task default(shared) +#pragma omp task default(shared) priority(CUOPT_HIGH_TASK_PRIORITY) batch_pdlp_reliability_branching_task(settings.log, rb_mode, num_candidates, @@ -1749,7 +1745,8 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t dual_simplex_start_time = tic(); if (rb_mode != 2) { -#pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) default(shared) +#pragma omp taskloop if (num_tasks > 1) priority(CUOPT_HIGH_TASK_PRIORITY) \ + num_tasks(num_tasks) default(shared) for (i_t i = 0; i < num_candidates; ++i) { auto [score, j] = unreliable_list[i]; @@ -1766,7 +1763,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, - node_ptr->vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms, worker->basis_factors, worker->basic_list, @@ -1811,7 +1808,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const auto [obj, status] = trial_branching(worker->leaf_problem, settings, var_types, - node_ptr->vstatus, + worker->leaf_vstatus, worker->leaf_edge_norms, worker->basis_factors, worker->basic_list, diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 5f80a7915b..a370a00de8 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -38,10 +38,6 @@ struct reliability_branching_settings_t { // Upper bound for the maximum number of LP iterations for a single trial branching i_t upper_max_lp_iter = 500; - // Priority of the tasks created when running the trial branching in parallel. - // Set to 1 to have the same priority as the other tasks. - i_t task_priority = 5; - // The maximum number of candidates initialized by strong branching in a single // node i_t max_num_candidates = 100; diff --git a/cpp/src/branch_and_bound/worker.hpp b/cpp/src/branch_and_bound/worker.hpp index cf227e561f..29a3164dfc 100644 --- a/cpp/src/branch_and_bound/worker.hpp +++ b/cpp/src/branch_and_bound/worker.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -16,7 +17,6 @@ #include -#include #include namespace cuopt::linear_programming::dual_simplex { @@ -42,16 +42,36 @@ struct branch_and_bound_stats_t { omp_atomic_t lexical_reduction_pruned_nodes = 0; }; +template +bool is_search_strategy_enabled(search_strategy_t strategy, + bool has_incumbent, + diving_heuristics_settings_t settings) +{ + switch (strategy) { + case BEST_FIRST: return true; + case PSEUDOCOST_DIVING: return settings.pseudocost_diving != 0; + case LINE_SEARCH_DIVING: return settings.line_search_diving != 0; + case GUIDED_DIVING: return settings.guided_diving != 0 && has_incumbent; + case COEFFICIENT_DIVING: return settings.coefficient_diving != 0; + } + + return false; +} + template class branch_and_bound_worker_t { public: - const i_t worker_id; + using float_type = f_t; + using int_type = i_t; + + i_t worker_id; omp_atomic_t search_strategy; omp_atomic_t is_active; omp_atomic_t lower_bound; lp_problem_t leaf_problem; lp_solution_t leaf_solution; + std::vector leaf_vstatus; std::vector leaf_edge_norms; basis_update_mpf_t basis_factors; @@ -63,7 +83,6 @@ class branch_and_bound_worker_t { std::vector start_lower; std::vector start_upper; - mip_node_t* start_node; pcgenerator_t rng; @@ -89,53 +108,23 @@ class branch_and_bound_worker_t { const lp_problem_t& original_lp, const csr_matrix_t& Arow, const std::vector& var_type, - const simplex_solver_settings_t& settings) + const simplex_solver_settings_t& settings, + uint64_t rng_offset = 0) : worker_id(worker_id), search_strategy(BEST_FIRST), is_active(false), lower_bound(-std::numeric_limits::infinity()), leaf_problem(original_lp), leaf_solution(original_lp.num_rows, original_lp.num_cols), + leaf_vstatus(original_lp.num_cols), basis_factors(original_lp.num_rows, settings.refactor_frequency), basic_list(original_lp.num_rows), nonbasic_list(), node_presolver(leaf_problem, Arow, {}, var_type), bounds_changed(original_lp.num_cols, false), - rng(settings.random_seed + pcgenerator_t::default_seed + worker_id, - pcgenerator_t::default_stream ^ worker_id) - { - } - - // Set the `start_node` for best-first search. - void init_best_first(mip_node_t* node, const lp_problem_t& original_lp) + rng(settings.random_seed + pcgenerator_t::default_seed + rng_offset + worker_id, + pcgenerator_t::default_stream ^ (worker_id + rng_offset)) { - start_node = node; - start_lower = original_lp.lower; - start_upper = original_lp.upper; - search_strategy = BEST_FIRST; - lower_bound = node->lower_bound; - is_active = true; - } - - // Initialize the worker for diving, setting the `start_node`, `start_lower` and - // `start_upper`. Returns `true` if the starting node is feasible via - // bounds propagation. - bool init_diving(mip_node_t* node, - search_strategy_t type, - const lp_problem_t& original_lp, - const simplex_solver_settings_t& settings) - { - internal_node = node->detach_copy(); - start_node = &internal_node; - start_lower = original_lp.lower; - start_upper = original_lp.upper; - search_strategy = type; - lower_bound = node->lower_bound; - is_active = true; - - std::fill(bounds_changed.begin(), bounds_changed.end(), false); - node->get_variable_bounds(start_lower, start_upper, bounds_changed); - return node_presolver.bounds_strengthening(settings, bounds_changed, start_lower, start_upper); } // Set the variables bounds for the LP relaxation in the current node. @@ -160,14 +149,127 @@ class branch_and_bound_worker_t { settings, bounds_changed, leaf_problem.lower, leaf_problem.upper); } - private: - // For diving, we need to store the full node instead of - // of just a pointer, since it is not stored in the tree anymore. - // To keep the same interface across all worker types, - // this will be used as a temporary storage and - // will be pointed by `start_node`. - // For exploration, this will not be used. - mip_node_t internal_node; + void set_active() { is_active = true; } +}; + +template +class bfs_worker_t : public branch_and_bound_worker_t { + public: + using Base = branch_and_bound_worker_t; + bfs_worker_t(i_t worker_id, + const lp_problem_t& original_lp, + const csr_matrix_t& Arow, + const std::vector& var_type, + const simplex_solver_settings_t& settings, + uint64_t rng_offset = 0) + : Base(worker_id, original_lp, Arow, var_type, settings, rng_offset) + { + this->start_lower = original_lp.lower; + this->start_upper = original_lp.upper; + this->search_strategy = BEST_FIRST; + + max_diving_workers.fill(0); + active_diving_workers.fill(0); + total_active_diving_workers = 0; + } + + void set_inactive() { this->is_active = false; } + + // Steal nodes from another worker + bool steal_from(bfs_worker_t* victim, i_t nodes_to_steal) + { + if (!victim || nodes_to_steal < 1) return false; + if (victim == this || !victim->is_active || + victim->node_queue.best_first_queue_size() < 2 * nodes_to_steal) { + return false; + } + + return node_queue.steal_from(victim->node_queue, nodes_to_steal); + } + + // Calculate the number of diving workers that this worker can launch. Having a fixed number + // of workers allows the solver to be more deterministic. + void calculate_num_diving_workers(i_t num_bfs_workers, + i_t total_diving_workers, + bool has_incumbent, + const diving_heuristics_settings_t& settings) + { + i_t num_active = 0; + for (i_t i = 1; i < num_search_strategies; ++i) { + num_active += is_search_strategy_enabled(search_strategies[i], has_incumbent, settings); + } + + total_max_diving_workers = 0; + max_diving_workers.fill(0); + if (num_active == 0) { return; } + + for (size_t i = 1, k = 0; i < num_search_strategies; ++i) { + if (is_search_strategy_enabled(search_strategies[i], has_incumbent, settings)) { + // Calculate the number of workers for a given diving heuristic + auto [type_start, type_end] = calculate_index_range(k, total_diving_workers, num_active); + i_t workers_per_type = type_end - type_start; + + // Calculate the number of diving workers allocated to this (best-first) worker + auto [start, end] = + calculate_index_range(this->worker_id, workers_per_type, num_bfs_workers); + max_diving_workers[i] = end - start; + total_max_diving_workers += max_diving_workers[i]; + ++k; + } + } + } + + // The worker-local node heap. + node_queue_t node_queue; + + // The number of diving workers of each type that this (best-first) worker can launch. + std::array max_diving_workers; + + // The number of active diving workers of each type associated with this (best-first) worker. + std::array, num_search_strategies> active_diving_workers; + + // Keep track of the total number of active diving worker that are associated with this + // (best-first) worker + omp_atomic_t total_active_diving_workers{0}; + + // The maximum number of diving worker that are associated with this + // (best-first) worker + i_t total_max_diving_workers{0}; +}; + +template +class diving_worker_t : public branch_and_bound_worker_t { + public: + using Base = branch_and_bound_worker_t; + using Base::Base; + + // Apply bound strengthening to the starting variable bounds + bool presolve_start_bounds(const simplex_solver_settings_t& settings) + { + return this->node_presolver.bounds_strengthening( + settings, this->bounds_changed, this->start_lower, this->start_upper); + } + + // Set this node inactive + void set_inactive() + { + if (!this->is_active.load()) { return; } + assert(bfs_worker != nullptr); + assert(bfs_worker->active_diving_workers[this->search_strategy].load() > 0); + assert(bfs_worker->total_active_diving_workers.load() > 0); + + this->is_active = false; + --bfs_worker->active_diving_workers[this->search_strategy]; + --bfs_worker->total_active_diving_workers; + } + + f_t get_lower_bound() { return this->lower_bound; } + + mip_node_t start_node; + + // The best-first worker that is associated with this diving worker. Used for controlling the + // number of active diving workers. + bfs_worker_t* bfs_worker{nullptr}; }; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/branch_and_bound/worker_pool.hpp b/cpp/src/branch_and_bound/worker_pool.hpp index 01df7b0520..d75794bbf0 100644 --- a/cpp/src/branch_and_bound/worker_pool.hpp +++ b/cpp/src/branch_and_bound/worker_pool.hpp @@ -8,127 +8,102 @@ #pragma once #include +#include namespace cuopt::linear_programming::dual_simplex { -template -class branch_and_bound_worker_pool_t { +template +class worker_pool_t { public: + using i_t = typename WorkerType::int_type; + using f_t = typename WorkerType::float_type; + void init(i_t num_workers, const lp_problem_t& original_lp, const csr_matrix_t& Arow, const std::vector& var_type, mip_symmetry_t* symmetry, - const simplex_solver_settings_t& settings) + const simplex_solver_settings_t& settings, + const uint64_t rng_offset = 0) { + assert(!is_initialized_); + assert(num_workers > 0); + workers_.resize(num_workers); num_idle_workers_ = num_workers; + idle_workers_.clear_resize(num_workers); for (i_t i = 0; i < num_workers; ++i) { - workers_[i] = std::make_unique>( - i, original_lp, Arow, var_type, settings); + workers_[i] = + std::make_unique(i, original_lp, Arow, var_type, settings, rng_offset); + idle_workers_.push_back(i); // Propagate the (possibly null) symmetry pointer; workers lazily build // their orbital_fixing/lexical_reduction state via ensure_orbital_fixing(). workers_[i]->symmetry_ptr = symmetry; - idle_workers_.push_front(i); } - is_initialized = true; + is_initialized_ = true; } - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. - branch_and_bound_worker_t* get_idle_worker() + WorkerType* pop_idle_worker() { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); if (idle_workers_.empty()) { return nullptr; } else { i_t idx = idle_workers_.front(); - return workers_[idx].get(); - } - } - - // Here, we are assuming that the scheduler is the only - // thread that can retrieve/pop an idle worker. - void pop_idle_worker() - { - std::lock_guard lock(mutex_); - if (!idle_workers_.empty()) { idle_workers_.pop_front(); num_idle_workers_--; + assert(idle_workers_.size() == static_cast(num_idle_workers_.load())); + assert(idx >= 0 && static_cast(idx) < workers_.size()); + return workers_[idx].get(); } } - void return_worker_to_pool(branch_and_bound_worker_t* worker) + void return_worker_to_pool(WorkerType* worker) { - worker->is_active = false; - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); + assert(worker != nullptr); + assert(workers_[worker->worker_id].get() == worker); + assert(static_cast(num_idle_workers_.load()) == idle_workers_.size()); + assert(idle_workers_.size() <= workers_.size()); + + worker->set_inactive(); + assert(!worker->is_active.load()); idle_workers_.push_back(worker->worker_id); num_idle_workers_++; } - f_t get_lower_bound() + WorkerType* operator[](i_t id) { - f_t lower_bound = std::numeric_limits::infinity(); - - if (is_initialized) { - for (i_t i = 0; i < workers_.size(); ++i) { - if (workers_[i]->search_strategy == BEST_FIRST && workers_[i]->is_active) { - lower_bound = std::min(workers_[i]->lower_bound.load(), lower_bound); - } - } - } - - return lower_bound; + assert(id >= 0 && static_cast(id) < workers_.size()); + assert(workers_[id] != nullptr); + return workers_[id].get(); + } + WorkerType* operator[](i_t id) const + { + assert(id >= 0 && static_cast(id) < workers_.size()); + assert(workers_[id] != nullptr); + return workers_[id].get(); } - i_t num_idle_workers() { return num_idle_workers_; } + bool is_initialized() const { return is_initialized_; } + + i_t num_idle() const { return num_idle_workers_; } + i_t size() const { return workers_.size(); } private: - // Worker pool - std::vector>> workers_; - bool is_initialized = false; + std::vector> workers_; + bool is_initialized_ = false; omp_mutex_t mutex_; - std::deque idle_workers_; - omp_atomic_t num_idle_workers_; + circular_deque_t idle_workers_; + omp_atomic_t num_idle_workers_{0}; }; -template -std::vector get_search_strategies( - diving_heuristics_settings_t settings) -{ - std::vector types; - types.reserve(num_search_strategies); - types.push_back(BEST_FIRST); - if (settings.pseudocost_diving != 0) { types.push_back(PSEUDOCOST_DIVING); } - if (settings.line_search_diving != 0) { types.push_back(LINE_SEARCH_DIVING); } - if (settings.guided_diving != 0) { types.push_back(GUIDED_DIVING); } - if (settings.coefficient_diving != 0) { types.push_back(COEFFICIENT_DIVING); } - return types; -} - -template -std::array get_max_workers( - i_t num_workers, const std::vector& strategies) -{ - std::array max_num_workers; - max_num_workers.fill(0); - - i_t bfs_workers = std::max(strategies.size() == 1 ? num_workers : num_workers / 4, 1); - max_num_workers[BEST_FIRST] = bfs_workers; - - i_t diving_workers = (num_workers - bfs_workers); - i_t m = strategies.size() - 1; - - for (size_t i = 1, k = 0; i < strategies.size(); ++i) { - i_t start = (double)k * diving_workers / m; - i_t end = (double)(k + 1) * diving_workers / m; - max_num_workers[strategies[i]] = end - start; - ++k; - } +template +using bfs_worker_pool_t = worker_pool_t>; - return max_num_workers; -} +template +using diving_worker_pool_t = worker_pool_t>; } // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/dual_simplex/initial_basis.cpp b/cpp/src/dual_simplex/initial_basis.cpp index d69bf8877e..e415f03672 100644 --- a/cpp/src/dual_simplex/initial_basis.cpp +++ b/cpp/src/dual_simplex/initial_basis.cpp @@ -18,6 +18,88 @@ namespace cuopt::linear_programming::dual_simplex { +uint8_t encode(variable_status_t vstatus) +{ + assert(vstatus != variable_status_t::SUPERBASIC && + "packed_variable_status_t does not support superbasic variables"); + + uint8_t val = 0; + switch (vstatus) { + case variable_status_t::BASIC: val = 0b00; break; + case variable_status_t::NONBASIC_LOWER: val = 0b01; break; + case variable_status_t::NONBASIC_UPPER: val = 0b10; break; + case variable_status_t::NONBASIC_FIXED: val = 0b01; break; + default: val = 0b11; + } + + return val; +} + +variable_status_t decode(uint8_t val) +{ + val &= 0b11; + if (val == 0b00) return variable_status_t::BASIC; + if (val == 0b01) return variable_status_t::NONBASIC_LOWER; + if (val == 0b10) return variable_status_t::NONBASIC_UPPER; + return variable_status_t::NONBASIC_FREE; +} + +std::vector compress_vstatus(const std::vector& vstatus) +{ + size_t compressed_size = vstatus.size() / 4; + size_t has_tail = (vstatus.size() % 4 > 0); + + std::vector packed_vstatus; + packed_vstatus.resize(compressed_size + has_tail); + + for (size_t i = 0; i < compressed_size; ++i) { + size_t j = i * 4; + packed_vstatus[i] = 0; + packed_vstatus[i] |= encode(vstatus[j]); + packed_vstatus[i] |= encode(vstatus[j + 1]) << 2; + packed_vstatus[i] |= encode(vstatus[j + 2]) << 4; + packed_vstatus[i] |= encode(vstatus[j + 3]) << 6; + } + + if (has_tail) { + size_t j = compressed_size * 4; + packed_vstatus[compressed_size] = 0; + packed_vstatus[compressed_size] |= encode(vstatus[j]); + if (j + 1 < vstatus.size()) packed_vstatus[compressed_size] |= encode(vstatus[j + 1]) << 2; + if (j + 2 < vstatus.size()) packed_vstatus[compressed_size] |= encode(vstatus[j + 2]) << 4; + if (j + 3 < vstatus.size()) packed_vstatus[compressed_size] |= encode(vstatus[j + 3]) << 6; + } + + return packed_vstatus; +} + +void decompress_vstatus(const std::vector& packed_vstatus, + size_t vstatus_size, + std::vector& vstatus) +{ + size_t compressed_size = vstatus_size / 4; + size_t has_tail = (vstatus_size % 4 > 0); + + vstatus.resize(vstatus_size); + assert(packed_vstatus.size() == compressed_size + has_tail); + + for (size_t i = 0; i < compressed_size; ++i) { + size_t j = i * 4; + vstatus[j] = decode(packed_vstatus[i]); + vstatus[j + 1] = decode(packed_vstatus[i] >> 2); + vstatus[j + 2] = decode(packed_vstatus[i] >> 4); + vstatus[j + 3] = decode(packed_vstatus[i] >> 6); + } + + if (has_tail) { + size_t j = compressed_size * 4; + vstatus[j] = decode(packed_vstatus[compressed_size]); + if (j + 1 < vstatus.size()) vstatus[j + 1] = decode(packed_vstatus[compressed_size] >> 2); + if (j + 2 < vstatus.size()) vstatus[j + 2] = decode(packed_vstatus[compressed_size] >> 4); + if (j + 3 < vstatus.size()) vstatus[j + 3] = decode(packed_vstatus[compressed_size] >> 6); + } +} + template i_t initial_basis_selection(const lp_problem_t& problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/initial_basis.hpp b/cpp/src/dual_simplex/initial_basis.hpp index 646785fbd2..f4cb19efc5 100644 --- a/cpp/src/dual_simplex/initial_basis.hpp +++ b/cpp/src/dual_simplex/initial_basis.hpp @@ -1,6 +1,6 @@ /* clang-format off */ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 */ /* clang-format on */ @@ -23,6 +23,11 @@ enum class variable_status_t : int8_t { SUPERBASIC = 4 }; +std::vector compress_vstatus(const std::vector& vstatus); +void decompress_vstatus(const std::vector& packed_vstatus, + size_t vstatus_size, + std::vector& vstatus); + template i_t initial_basis_selection(const lp_problem_t& problem, const simplex_solver_settings_t& settings, diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 4683fa1d9e..27eac7f985 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -114,6 +114,9 @@ struct simplex_solver_settings_t { mip_batch_pdlp_reliability_branching(0), strong_branching_simplex_iteration_limit(-1), random_seed(0), + bnb_steal_chance(-1), + bnb_nodes_per_steal(-1), + bnb_max_steal_attempts(-1), reliability_branching(-1), inside_mip(0), sub_mip(0), @@ -206,13 +209,21 @@ struct simplex_solver_settings_t { // PDLP only // Set the maximum number of simplex iterations allowed per trial branch when applying // strong branching to the root node. - // -1 - Automatic (iteration limit = 200) - // 0, 1 - Estimate the objective change using a single pivot of dual simplex - // >1 - Set as the iteration limit in dual simplex + // -1 - automatic (iteration limit = 200) + // 0, 1 - estimate the objective change using a single pivot of dual simplex + // >1 - set as the iteration limit in dual simplex i_t strong_branching_simplex_iteration_limit; diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics + // In B&B, indicate the chance in which a worker can steal a node from another worker. + // -1 - automatic (0.05) + // 0 - disable + // >0 - set the stealing chance [0, 1] + f_t bnb_steal_chance; + i_t bnb_nodes_per_steal; + i_t bnb_max_steal_attempts; + // Settings for the reliability branching. // - -1: automatic // - 0: disable (use pseudocost branching instead) diff --git a/cpp/src/mip_heuristics/diversity/lns/rins.cu b/cpp/src/mip_heuristics/diversity/lns/rins.cu index 9396d7158a..e1318edf4d 100644 --- a/cpp/src/mip_heuristics/diversity/lns/rins.cu +++ b/cpp/src/mip_heuristics/diversity/lns/rins.cu @@ -64,7 +64,7 @@ void rins_t::node_callback(const std::vector& solution, f_t objec lp_optimal_solution = solution; CUOPT_LOG_DEBUG("Launching RINS task"); -#pragma omp task default(none) +#pragma omp task default(none) priority(CUOPT_MEDIUM_TASK_PRIORITY) run_rins(); } else { launch_new_task = true; @@ -225,7 +225,8 @@ void rins_t::run_rins() fj_cpu->log_prefix = "[RINS] "; CUOPT_LOG_DEBUG("Launching CPUFJ (RINS) task"); -#pragma omp task shared(fj_cpu) firstprivate(time_limit) default(none) +#pragma omp task shared(fj_cpu) firstprivate(time_limit) \ + priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) cpufj_solve(fj_cpu.get(), time_limit); f_t lower_bound = context.branch_and_bound_ptr ? context.branch_and_bound_ptr->get_lower_bound() diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu index 12b6c04070..46ffde6089 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_cpufj.cu @@ -45,7 +45,8 @@ void early_cpufj_t::start() double) { this->try_update_best(solver_obj, assignment); }; CUOPT_LOG_DEBUG("Launching early CPUFJ task"); -#pragma omp task shared(fj_cpu_) depend(out : *fj_cpu_) default(none) +#pragma omp task shared(fj_cpu_) priority(CUOPT_DEFAULT_TASK_PRIORITY) \ + depend(out : *fj_cpu_) default(none) cpufj_solve(fj_cpu_.get()); } diff --git a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu index 758c6272c1..eac901151b 100644 --- a/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu +++ b/cpp/src/mip_heuristics/feasibility_jump/early_gpufj.cu @@ -60,7 +60,8 @@ void early_gpufj_t::start() CUOPT_LOG_DEBUG("Launching early GPUFJ task"); -#pragma omp task default(none) shared(fj_ptr_) depend(out : *fj_ptr_) +#pragma omp task default(none) shared(fj_ptr_) priority(CUOPT_DEFAULT_TASK_PRIORITY) \ + depend(out : *fj_ptr_) { RAFT_CUDA_TRY(cudaSetDevice(this->device_id_)); fj_ptr_->solve(*this->solution_ptr_); diff --git a/cpp/src/mip_heuristics/local_search/local_search.cu b/cpp/src/mip_heuristics/local_search/local_search.cu index 4a13425437..8a288b2da8 100644 --- a/cpp/src/mip_heuristics/local_search/local_search.cu +++ b/cpp/src/mip_heuristics/local_search/local_search.cu @@ -96,7 +96,8 @@ void local_search_t::start_cpufj_scratch_threads(population_t::start_cpufj_lptopt_scratch_threads( CUOPT_LOG_DEBUG("Launching scratch CPUFJ (on LP optimal) task"); -#pragma omp task shared(scratch_cpu_fj_on_lp_opt) default(none) \ - depend(out : *scratch_cpu_fj_on_lp_opt) +#pragma omp task shared(scratch_cpu_fj_on_lp_opt) \ + priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) depend(out : *scratch_cpu_fj_on_lp_opt) cpufj_solve(scratch_cpu_fj_on_lp_opt.get()); } @@ -199,7 +200,8 @@ void local_search_t::start_cpufj_deterministic( }; CUOPT_LOG_DEBUG("Launching deterministic CPUFJ task"); -#pragma omp task shared(deterministic_cpu_fj) default(none) depend(inout : *deterministic_cpu_fj) +#pragma omp task shared(deterministic_cpu_fj) priority(CUOPT_DEFAULT_TASK_PRIORITY) default(none) \ + depend(inout : *deterministic_cpu_fj) cpufj_solve(deterministic_cpu_fj.get()); // Signal that registration is complete - B&B can now wait on producers @@ -270,7 +272,8 @@ bool local_search_t::do_fj_solve(solution_t& solution, size_t n = std::min(omp_get_num_threads() - 1, ls_cpu_fj.size()); CUOPT_LOG_DEBUG("Launching %d CPUFJ tasks", n); -#pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(n) nogroup +#pragma omp taskloop shared(ls_cpu_fj) default(none) num_tasks(n) \ + nogroup priority(CUOPT_DEFAULT_TASK_PRIORITY) for (size_t i = 0; i < n; ++i) { cpufj_solve(ls_cpu_fj[i].get()); } diff --git a/cpp/src/mip_heuristics/mip_constants.hpp b/cpp/src/mip_heuristics/mip_constants.hpp index f50c58194e..7d28048ba9 100644 --- a/cpp/src/mip_heuristics/mip_constants.hpp +++ b/cpp/src/mip_heuristics/mip_constants.hpp @@ -25,3 +25,18 @@ // MIP-only gate: skip the concurrent barrier when fewer threads are available than this // (1 PDLP + 1 dual simplex + 1 barrier). Stand-alone LP always runs all three. #define CUOPT_CONCURRENT_LP_BARRIER_REQUIRED_THREAD_COUNT 3 + +/* @brief Priority classes for the omp tasks. Highest value = higher priority. + * Note that this only gives a hint to the runtime, such that the high priority + * is not guarantee to be executed before a low priority one (i.e., do not rely on + * these values for correctness). + */ +#define CUOPT_CRITICAL_TASK_PRIORITY 1000 +#define CUOPT_HIGH_TASK_PRIORITY 100 +#define CUOPT_MEDIUM_TASK_PRIORITY 10 +#define CUOPT_DEFAULT_TASK_PRIORITY 1 + +// Default values for work stealing in B&B +#define MIP_DEFAULT_STEAL_CHANCE 0.05 +#define MIP_DEFAULT_NODES_PER_STEAL 10 +#define MIP_DEFAULT_MAX_STEAL_ATTEMPTS 3 diff --git a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu index 3d62b99f66..43c58ff1d7 100644 --- a/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu +++ b/cpp/src/mip_heuristics/presolve/conditional_bound_strengthening.cu @@ -249,7 +249,8 @@ void conditional_bound_strengthening_t::select_constraint_pairs_host( i_t num_tasks = std::max(omp_get_num_threads() - 2, 1); CUOPT_LOG_INFO("Selecting constraint pairs with %d tasks", num_tasks); -#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(shared) +#pragma omp taskloop num_tasks(num_tasks) private(cnstr_pair) default(shared) \ + priority(CUOPT_DEFAULT_TASK_PRIORITY) for (i_t cnstr = 0; cnstr < problem.n_constraints; ++cnstr) { for (i_t jj = offsets[cnstr]; jj < offsets[cnstr + 1]; ++jj) { int var = variables[jj]; diff --git a/cpp/src/mip_heuristics/presolve/probing_cache.cu b/cpp/src/mip_heuristics/presolve/probing_cache.cu index 36b96dceaf..3767e0f2d0 100644 --- a/cpp/src/mip_heuristics/presolve/probing_cache.cu +++ b/cpp/src/mip_heuristics/presolve/probing_cache.cu @@ -896,17 +896,16 @@ bool compute_probing_cache(bound_presolve_t& bound_presolve, if (timer.check_time_limit() || early_exit || problem_is_infeasible.load()) { break; } size_t step_end = std::min(step_start + step_size, priority_indices.size()); -#pragma omp taskloop num_tasks(num_tasks) default(shared) +#pragma omp taskloop num_tasks(num_tasks) default(shared) priority(CUOPT_DEFAULT_TASK_PRIORITY) for (size_t task_id = 0; task_id < num_tasks; ++task_id) { - size_t n = step_end - step_start; - size_t begin = step_start + std::floor(static_cast(n) * task_id / num_tasks); - size_t end = step_start + std::floor(static_cast(n) * (task_id + 1) / num_tasks); + size_t n = step_end - step_start; + auto [begin, end] = calculate_index_range(task_id, n, num_tasks); auto& multi_probe_presolve = multi_probe_presolve_pool[task_id]; auto& modification_vector = modification_vector_pool[task_id]; auto& substitution_vector = substitution_vector_pool[task_id]; if (timer.check_time_limit()) { continue; } - for (size_t i = begin; i < end; ++i) { + for (size_t i = step_start + begin; i < step_start + end; ++i) { auto var_idx = priority_indices[i]; if (timer.check_time_limit()) { continue; } diff --git a/cpp/src/mip_heuristics/solve.cu b/cpp/src/mip_heuristics/solve.cu index 1c7cfe6f06..2b64a7c681 100644 --- a/cpp/src/mip_heuristics/solve.cu +++ b/cpp/src/mip_heuristics/solve.cu @@ -812,10 +812,9 @@ mip_solution_t solve_mip(optimization_problem_t& op_problem, } template -mip_solution_t solve_mip( - raft::handle_t const* handle_ptr, - const cuopt::linear_programming::io::mps_data_model_t& mps_data_model, - mip_solver_settings_t const& settings) +mip_solution_t solve_mip(raft::handle_t const* handle_ptr, + const io::mps_data_model_t& mps_data_model, + mip_solver_settings_t const& settings) { auto op_problem = mps_data_model_to_optimization_problem(handle_ptr, mps_data_model); return solve_mip(op_problem, settings); diff --git a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index 3bcbca6939..c25ade0c05 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -451,7 +451,7 @@ solution_t mip_solver_t::run_solver() branch_and_bound->set_concurrent_lp_root_solve(true); context.problem_ptr->branch_and_bound_callback = - std::bind(&dual_simplex::branch_and_bound_t::set_new_solution, + std::bind(&dual_simplex::branch_and_bound_t::set_solution_from_heuristics, branch_and_bound.get(), std::placeholders::_1); } else if (context.settings.determinism_mode == CUOPT_MODE_DETERMINISTIC) { @@ -488,7 +488,7 @@ solution_t mip_solver_t::run_solver() #pragma omp taskgroup { if (!context.settings.heuristics_only) { -#pragma omp task default(shared) +#pragma omp task default(shared) priority(CUOPT_CRITICAL_TASK_PRIORITY) { branch_and_bound_status = branch_and_bound->solve(branch_and_bound_solution); } diff --git a/cpp/src/utilities/circular_deque.hpp b/cpp/src/utilities/circular_deque.hpp new file mode 100644 index 0000000000..6e9d8f5b05 --- /dev/null +++ b/cpp/src/utilities/circular_deque.hpp @@ -0,0 +1,114 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include + +namespace cuopt { + +// A fixed-capacity double-ended queue backed by a circular buffer. +// All operations are O(1) with no dynamic allocation after construction. +// +// Preconditions (asserted in debug builds): +// - push_front / push_back : size() < capacity() +// - pop_front / pop_back : !empty() +// - front / back : !empty() +template +class circular_deque_t { + public: + circular_deque_t() : buffer_(1), capacity_(1), head_(0), tail_(0) {} + + // Allocates storage for exactly `capacity` elements up front. + explicit circular_deque_t(size_t capacity) + : buffer_(capacity + 1), // +1 to distinguish full (next(tail)==head) from empty (head==tail) + capacity_(capacity + 1), + head_(0), + tail_(0) + { + assert(capacity > 0); + } + + bool empty() const { return head_ == tail_; } + bool full() const { return next(tail_) == head_; } + + size_t size() const { return (tail_ - head_ + capacity_) % capacity_; } + size_t capacity() const { return capacity_ - 1; } + + void clear_resize(size_t new_capacity) + { + assert(new_capacity > 0); + head_ = 0; + tail_ = 0; + capacity_ = new_capacity + 1; + buffer_.resize(capacity_); + } + + void push_back(T val) + { + assert(!full()); + buffer_[tail_] = std::move(val); + tail_ = next(tail_); + } + + void push_front(T val) + { + assert(!full()); + head_ = prev(head_); + buffer_[head_] = std::move(val); + } + + T pop_front() + { + assert(!empty()); + T val = std::move(buffer_[head_]); + head_ = next(head_); + return val; + } + + T pop_back() + { + assert(!empty()); + tail_ = prev(tail_); + return std::move(buffer_[tail_]); + } + + T& front() + { + assert(!empty()); + return buffer_[head_]; + } + const T& front() const + { + assert(!empty()); + return buffer_[head_]; + } + + T& back() + { + assert(!empty()); + return buffer_[prev(tail_)]; + } + const T& back() const + { + assert(!empty()); + return buffer_[prev(tail_)]; + } + + private: + size_t next(size_t idx) const { return (idx + 1) % capacity_; } + size_t prev(size_t idx) const { return (idx + capacity_ - 1) % capacity_; } + + std::vector buffer_; + size_t capacity_; + size_t head_; + size_t tail_; +}; + +} // namespace cuopt diff --git a/cpp/src/utilities/omp_helpers.hpp b/cpp/src/utilities/omp_helpers.hpp index a13b9ec887..59b754460d 100644 --- a/cpp/src/utilities/omp_helpers.hpp +++ b/cpp/src/utilities/omp_helpers.hpp @@ -7,11 +7,27 @@ #pragma once +#include +#include + +namespace cuopt { + +// Split `total` entries across `n` blocks and return the [start, end) range for `k`-th block. +// Useful for splitting work across OpenMP tasks/threads/workers. +template +std::pair calculate_index_range(i_t k, double total, double n) +{ + i_t start = std::floor(k * total / n); + i_t end = std::floor((k + 1) * total / n); + return {start, end}; +} + +} // namespace cuopt + #ifdef _OPENMP #include #include -#include namespace cuopt { @@ -20,11 +36,9 @@ namespace cuopt { class omp_mutex_t { public: omp_mutex_t() : mutex(new omp_lock_t) { omp_init_lock(mutex.get()); } - - omp_mutex_t(const omp_mutex_t&) = delete; - omp_mutex_t(omp_mutex_t&& other) { *this = std::move(other); } + omp_mutex_t(const omp_mutex_t&) = delete; omp_mutex_t& operator=(const omp_mutex_t&) = delete; omp_mutex_t& operator=(omp_mutex_t&& other) @@ -203,17 +217,10 @@ class omp_atomic_t { private: T val; -#ifndef __NVCC__ friend double fetch_min(omp_atomic_t& atomic_var, double other); friend double fetch_max(omp_atomic_t& atomic_var, double other); -#endif }; -// Atomic CAS are only supported in OpenMP v5.1 -// (gcc 12+ or clang 14+), however, nvcc (or the host compiler) cannot -// parse it correctly yet -#ifndef __NVCC__ - // Free non-template functions are necessary because of a clang 20 bug // when omp atomic compare is used within a templated context. // see https://github.com/llvm/llvm-project/issues/127466 @@ -238,7 +245,6 @@ inline double fetch_max(omp_atomic_t& atomic_var, double other) } return old; } -#endif #endif diff --git a/cpp/src/utilities/pcgenerator.hpp b/cpp/src/utilities/pcgenerator.hpp index 29a865f02f..e83e5f36ad 100644 --- a/cpp/src/utilities/pcgenerator.hpp +++ b/cpp/src/utilities/pcgenerator.hpp @@ -21,12 +21,11 @@ class pcgenerator_t { static constexpr uint64_t default_stream = 0xda3e39cb94b95bdbULL; /** - * @brief ctor. Initializes the PCG - * @param rng_state is the generator state used for initializing the generator - * @param subsequence specifies the subsequence to be generated out of 2^64 possible subsequences - * In a parallel setting, like threads of a CUDA kernel, each thread is required to generate a - * unique set of random numbers. This can be achieved by initializing the generator with same - * rng_state for all the threads and diststreamt values for subsequence. + * @brief Initializes the PCG generator. + * @param seed Generator state seed. + * @param subsequence Selects one of 2^64 independent subsequences. Use distinct values per + * thread to guarantee non-overlapping streams in parallel contexts. + * @param offset Number of outputs to skip ahead before the first draw. */ pcgenerator_t(const uint64_t seed = default_seed, const uint64_t subsequence = default_stream, @@ -35,7 +34,12 @@ class pcgenerator_t { set_seed(seed, subsequence, offset); } - // Set the seed, subsequence and offset of the PCG + /** + * @brief Re-seeds the generator. + * @param seed Generator state seed. + * @param subsequence Selects one of 2^64 independent subsequences. + * @param offset Number of outputs to skip ahead before the first draw. + */ void set_seed(uint64_t seed, const uint64_t subsequence = default_stream, uint64_t offset = 0) { state = uint64_t(0); @@ -47,8 +51,12 @@ class pcgenerator_t { skipahead(offset); } - // Based on "Random Number Generation with Arbitrary Strides" F. B. Brown - // Link https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf + /** + * @brief Advances the generator state by @p offset steps in O(log offset) time. + * + * Uses the closed-form LCG jump described in "Random Number Generation with Arbitrary Strides" + * (F. B. Brown, https://mcnp.lanl.gov/pdf_files/anl-rn-arb-stride.pdf). + */ void skipahead(uint64_t offset) { uint64_t G = 1; @@ -68,9 +76,7 @@ class pcgenerator_t { } /** - * @defgroup NextRand Generate the next random number - * @brief This code is derived from PCG basic code - * @{ + * @returns the next uniformly distributed 32-bit unsigned integer. */ uint32_t next_u32() { @@ -83,6 +89,9 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed 64-bit unsigned integer. + */ uint64_t next_u64() { uint64_t ret; @@ -93,6 +102,10 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed non-negative 32-bit signed integer in [0, + * INT32_MAX]. + */ int32_t next_i32() { int32_t ret; @@ -102,6 +115,10 @@ class pcgenerator_t { return ret; } + /** + * @returns the next uniformly distributed non-negative 64-bit signed integer in [0, + * INT64_MAX]. + */ int64_t next_i64() { int64_t ret; @@ -111,10 +128,19 @@ class pcgenerator_t { return ret; } - float next_float() { return static_cast((next_u32() >> 8) * 0x1.0p-24); } + /** + * @returns a uniformly distributed float in [0, 1). + */ + float next_float() { return (next_u32() >> 8) * 0x1.0p-24; } - double next_double() { return static_cast((next_u64() >> 11) * 0x1.0p-53); } + /** + * @returns a uniformly distributed double in [0, 1). + */ + double next_double() { return (next_u64() >> 11) * 0x1.0p-53; } + /** + * @returns the next random value of type @p T. + */ template T next() { @@ -130,9 +156,11 @@ class pcgenerator_t { void next(float& ret) { ret = next_float(); } void next(double& ret) { ret = next_double(); } - /// Draws a sample from a uniform distribution. The samples are uniformly distributed over - /// the semi-closed interval `[low, high)`. This routine may have a **slight bias** toward - /// some numbers in the range (scaling by floating-point). + /** + * @brief Draws a sample from a uniform distribution over `[low, high)`. + * + * May have a slight bias toward some values due to floating-point scaling. + */ template T uniform(T low, T high) { @@ -141,7 +169,9 @@ class pcgenerator_t { return low + (val * range); } - // Shuffles the contents of a sequence using the Fisher–Yates algorithm. + /** + * @brief Shuffles @p seq in-place using the Fisher-Yates algorithm. + */ template void shuffle(std::vector& seq) {