scalometer/include/strategy.hpp

#ifndef STRATEGY_HPP
#define STRATEGY_HPP

#include <omp.h>
#include <stdexcept>
#include <string>

#ifdef ENABLE_EVENTIFY
#include <eventify/task_system.hxx>
#endif

// Parallelization strategies are defined here. Assumption for now: there is
// always an outer loop that can be parallelized. The strategies are templates
// instantiated when adding kernels to the kernel registry.
// Here, we only define the treatment of the outermost loop. The loop bodies
// are defined in kernels.cpp

namespace strategy {

// Constraint for the loop bodies defined in kernels.cpp: a type satisfies it
// when a forwarded instance can be called with a single int argument, i.e. it
// can act as one iteration of a parallel loop. The result of the call is
// neither inspected nor constrained.
template <typename Func>
concept invocable_with_int = requires(Func&& callee, int index) {
  std::forward<Func>(callee)(index);  // the call expression must be well-formed
};

// OpenMP strategy: the outermost loop is handed to an
// `omp parallel for` with a static schedule.
//
//   kernel_start_idx  first index passed to loop_body (inclusive)
//   kernel_end_idx    upper bound of the loop (exclusive)
//   n_threads         value passed to omp_set_num_threads() before the loop
//   loop_body         callable executed once per index
template <typename Func>
  requires invocable_with_int<Func>
void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads,
                     Func&& loop_body) {
  // n_threads is already an int, so it is passed through unchanged.
  omp_set_num_threads(n_threads);

#pragma omp parallel for schedule(static)
  for (int idx = kernel_start_idx; idx < kernel_end_idx; ++idx) {
    loop_body(idx);
  }
}

#ifdef ENABLE_EVENTIFY
// Eventify strategy: the outermost loop is split into n_tasks contiguous,
// non-overlapping chunks; each chunk becomes an independent task that is
// submitted to the tasking system.
//
// The index range is half-open, [kernel_start_idx, kernel_end_idx), which
// matches openmp_strategy so that both strategies execute exactly the same
// iterations for the same arguments.
//
//   kernel_start_idx  first index passed to loop_body (inclusive)
//   kernel_end_idx    upper bound of the loop (exclusive)
//   n_tasks           number of tasks to submit; must be positive
//   loop_body         callable executed once per index; it is copied into
//                     every task and invoked on that (const) copy
//
// Throws std::invalid_argument if n_tasks is not positive.
//
// NOTE(review): this function does not wait explicitly; it presumably relies
// on eventify::task_system finishing the submitted tasks before it is
// destroyed at the end of the function -- confirm against the eventify API.
template <typename Func>
  requires invocable_with_int<Func>
void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks,
                       Func&& loop_body) {
  // Guard against division by zero (and meaningless negative task counts).
  if (n_tasks <= 0) {
    throw std::invalid_argument("eventify_strategy: n_tasks must be positive");
  }

  auto task_system = eventify::task_system{};

  // An empty or inverted range yields zero iterations, like the OpenMP loop.
  const int tripcount =
      kernel_end_idx > kernel_start_idx ? kernel_end_idx - kernel_start_idx : 0;
  const int chunk_size = tripcount / n_tasks;
  const int remainder = tripcount % n_tasks;

  for (int tid = 0; tid < n_tasks; ++tid) {
    // The first `remainder` tasks each take one extra iteration, so the
    // chunks cover the whole range and differ in size by at most one.
    const int extra_before = tid < remainder ? tid : remainder;
    const int start_idx = kernel_start_idx + tid * chunk_size + extra_before;
    const int end_idx = start_idx + chunk_size + (tid < remainder ? 1 : 0);

    // Capture by value: the task may run after this loop iteration ends.
    auto task = [start_idx, end_idx, loop_body] {
      for (int i = start_idx; i < end_idx; ++i) {
        loop_body(i);
      }
    };
    task_system.submit(task);
  }
}
#endif //ENABLE_EVENTIFY

// Strategy selector: dispatches the outermost loop to the parallelization
// strategy identified by strategy_name.
//
//   "omp"       -> openmp_strategy
//   "eventify"  -> eventify_strategy (only when built with ENABLE_EVENTIFY)
//
// The index bounds, num_threads_or_tasks and loop_body are forwarded to the
// selected strategy. Any other name results in std::invalid_argument.
template <typename Func>
  requires invocable_with_int<Func>
void execute_strategy(const std::string& strategy_name, int kernel_start_idx,
                      int kernel_end_idx, int num_threads_or_tasks,
                      Func&& loop_body) {
  if (strategy_name == "omp") {
    openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks,
                    std::forward<Func>(loop_body));
    return;
  }

#ifdef ENABLE_EVENTIFY
  if (strategy_name == "eventify") {
    eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks,
                      std::forward<Func>(loop_body));
    return;
  }
#endif

  // No known strategy matched the requested name.
  throw std::invalid_argument("Unknown strategy: " + strategy_name);
}

}  // namespace strategy
#endif  // STRATEGY_HPP