scalometer/include/strategy.hpp

#ifndef STRATEGY_HPP
#define STRATEGY_HPP

#include <omp.h>
#include <stdexcept>
#include <string>
#include <eventify/task_system.hxx>

// Parallelization strategies are defined here. Assumption for now: there is always an outer loop that can be parallelized.
// The strategies are templates instantiated when adding kernels to the kernel registry.
// Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp.

namespace strategy {

  // Concept ensuring that a loop body (as defined in kernels.cpp) represents one invocable
  // iteration of a parallel loop: it must be callable with a single int, the loop index.
  //
  // The check is performed on an lvalue of the callable. The strategies invoke the same
  // callable object once per iteration (loop_body(i)) and never forward it, so validating a
  // forwarded (possibly rvalue) call would accept callables that are only usable as rvalues
  // and then fail inside the strategy body, while rejecting callables that are only usable
  // as lvalues even though they would work.
  template <typename Func>
  concept invocable_with_int = requires(Func& f, int i) {
      { f(i) };  // Checks that calling f(i) on an lvalue is valid; the result type is unconstrained
  };
  
  
  // OpenMP strategy: the outermost loop is handed to a single "parallel for" construct.
  //
  //   kernel_start_idx  first loop index (inclusive)
  //   kernel_end_idx    loop bound (exclusive), i.e. the range is [kernel_start_idx, kernel_end_idx)
  //   n_threads         thread count passed to omp_set_num_threads before the loop
  //   loop_body         callable invoked once per index as loop_body(i)
  //
  // Iterations are distributed with schedule(static). The thread count is set through
  // omp_set_num_threads rather than a num_threads clause, so it is not scoped to this call.
  template <typename Func>
  requires invocable_with_int<Func>
  void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) {
    omp_set_num_threads(n_threads);

    #pragma omp parallel for schedule(static)
    for (int iteration = kernel_start_idx; iteration < kernel_end_idx; ++iteration) {
      loop_body(iteration);
    }
  }
  
  // Eventify strategy: the outermost loop is split into n_tasks contiguous chunks, one
  // independent task is created per chunk, and every task is submitted to the tasking system.
  //
  //   kernel_start_idx  first loop index (inclusive)
  //   kernel_end_idx    loop bound (exclusive); the range is [kernel_start_idx, kernel_end_idx),
  //                     the same convention as the loop in openmp_strategy
  //   n_tasks           number of tasks to submit; must be positive
  //   loop_body         callable invoked once per index as loop_body(i); it is copied into
  //                     every task, so each task owns its own callable
  //
  // Throws std::invalid_argument if n_tasks is not positive (the chunk computation would
  // otherwise divide by zero).
  //
  // The trip count is divided evenly; when it is not a multiple of n_tasks, the first
  // (tripcount % n_tasks) tasks each take one extra iteration, so chunk sizes differ by at
  // most one. If n_tasks exceeds the trip count, the surplus tasks are still submitted but
  // iterate over an empty range.
  //
  // NOTE(review): the task system is a local object; whether all tasks have finished when this
  // function returns depends on eventify::task_system's destructor - presumably it waits, confirm.
  template <typename Func>
  requires invocable_with_int <Func>
  void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) {
    if (n_tasks <= 0) {
      throw std::invalid_argument("eventify_strategy: n_tasks must be positive");
    }

    auto task_system = eventify::task_system {};

    // An empty or inverted range yields zero iterations, like the OpenMP loop does.
    const int tripcount = kernel_end_idx > kernel_start_idx ? kernel_end_idx - kernel_start_idx : 0;
    const int chunk_size = tripcount / n_tasks;
    const int remainder = tripcount % n_tasks;

    for (int tid = 0; tid < n_tasks; ++tid) {
      // Tasks before this one that received an extra iteration shift this chunk's start.
      const int extra_before = tid < remainder ? tid : remainder;
      const int start_idx = kernel_start_idx + tid * chunk_size + extra_before;
      // Half-open chunk [start_idx, end_idx); the first `remainder` tasks are one iteration longer.
      const int end_idx = start_idx + chunk_size + (tid < remainder ? 1 : 0);

      auto task = [start_idx, end_idx, loop_body]{
        for (int i = start_idx; i < end_idx; ++i) {
          loop_body(i);
        }
      };
      task_system.submit(task);
    }
  }
  
  // Parallelization strategy selector.
  //
  // Dispatches on strategy_name:
  //   "omp"       -> openmp_strategy, with num_threads_or_tasks used as the thread count
  //   "eventify"  -> eventify_strategy, with num_threads_or_tasks used as the task count
  // Any other name raises std::invalid_argument carrying the offending name.
  //
  // The index range and the loop body are passed through unchanged; the loop body is
  // forwarded to whichever strategy is selected.
  template <typename Func>
  requires invocable_with_int<Func>
  void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) {
    if (strategy_name == "omp") {
      openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
      return;
    }

    if (strategy_name == "eventify") {
      eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
      return;
    }

    // No known strategy matched.
    throw std::invalid_argument("Unknown strategy: " + strategy_name);
  }

}
#endif //STRATEGY_HPP
Initial commit 2024-12-13 00:33:08 +01:00			`#ifndef STRATEGY_HPP`
			`#define STRATEGY_HPP`

			`#include <omp.h>`
			`#include <stdexcept>`
			`#include <string>`
			`#include <eventify/task_system.hxx>`

			`// Parallelization strategies are defined here. Assumption for now: there is always an outer loop than can be parallelized.`
			`// The strategies are templates instanciated when adding kernels to the kernel registry.`
			`// Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp`

			`namespace strategy {`

			`// define concept to ensure that the loop bodies defined in kernels.cpp represent one invocable iteration of a parallel loop`
			`template <typename Func>`
			`concept invocable_with_int = requires(Func&& f, int i) {`
			`{ std::forward<Func>(f)(i) }; // Checks if calling f(i) is valid`
			`};`


			`// for OpenMP, we just use the for pragma for the outermost loop`
			`template <typename Func>`
			`requires invocable_with_int <Func>`
			`void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) {`
			`omp_set_num_threads(static_cast<int>(n_threads));`

			`#pragma omp parallel for schedule(static)`
			`for (int i = kernel_start_idx; i < kernel_end_idx; ++i) {`
			`loop_body(i);`
			`}`
			`}`

			`// for eventify, we calculate indices for evenly divided chunks of the outermost loop,`
			`// create independent tasks and submit them to the tasking system`
			`template <typename Func>`
			`requires invocable_with_int <Func>`
			`void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) {`
			`auto task_system = eventify::task_system {};`
			`int tripcount = kernel_end_idx - kernel_start_idx + 1;`
			`int chunk_size = tripcount / n_tasks;`
			`int remainder = tripcount % n_tasks;`

			`for (int tid = 0; tid < n_tasks; ++tid) {`
			`auto task = [tid, tripcount, chunk_size, remainder, loop_body]{`
			`int start_idx = tid * chunk_size;`
			`int end_idx = start_idx + chunk_size - 1;`
			`if (tripcount - end_idx == remainder) end_idx += remainder;`

			`for (int i = start_idx; i < end_idx; ++i) {`
			`loop_body(i);`
			`}`
			`};`
			`task_system.submit(task);`
			`}`
			`}`

			`// parallelization strategy selector`
			`template <typename Func>`
			`requires invocable_with_int<Func>`
			`void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) {`
			`if (strategy_name == "omp") {`
			`openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));`
			`} else if (strategy_name == "eventify") {`
			`eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));`
			`} else {`
			`throw std::invalid_argument("Unknown strategy: " + strategy_name);`
			`}`
			`}`

			`}`
			`#endif //STRATEGY_HPP`