diff --git a/include/kernels.hpp b/include/kernels.hpp index f6c6be4..d44a24f 100644 --- a/include/kernels.hpp +++ b/include/kernels.hpp @@ -1,41 +1,41 @@ #ifndef KERNELS_HPP #define KERNELS_HPP -#include #include +#include #include - class Kernel { -public: + public: using StrategyFunction = std::function; using PreparationFunction = std::function; - Kernel(const std::string& name, StrategyFunction strategy_function, PreparationFunction preparation_function); + Kernel(const std::string& name, StrategyFunction strategy_function, + PreparationFunction preparation_function); void prepare() const; void execute(int n_threads_or_tasks, int kernel_tripcount) const; -private: + private: std::string name_; StrategyFunction strategy_function_; PreparationFunction preparation_function_; }; class KernelRegistry { -public: + public: using KernelBuilder = std::function; void register_kernel(const std::string& name, KernelBuilder factory); Kernel load_kernel(const std::string& name) const; std::vector list_available_kernels() const; -private: - // FIXME: no benchmarking of maps done. The registry is expected to stay small, though + private: + // FIXME: no benchmarking of maps done. The registry is expected to stay + // small, though std::unordered_map registry_; }; void initialize_registry(KernelRegistry* registry, std::string strategy_name); -#endif // KERNELS_HPP - +#endif // KERNELS_HPP diff --git a/include/strategy.hpp b/include/strategy.hpp index c298113..d4f93ed 100644 --- a/include/strategy.hpp +++ b/include/strategy.hpp @@ -2,71 +2,80 @@ #define STRATEGY_HPP #include + +#include #include #include -#include -// Parallelization strategies are defined here. Assumption for now: there is always an outer loop than can be parallelized. -// The strategies are templates instanciated when adding kernels to the kernel registry. -// Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp +// Parallelization strategies are defined here. Assumption for now: there is +// always an outer loop than can be parallelized. The strategies are templates +// instanciated when adding kernels to the kernel registry. +// Here, we only define the treatment of the outermost loop. The loop bodies +// are defined in kernels.cpp namespace strategy { - // define concept to ensure that the loop bodies defined in kernels.cpp represent one invocable iteration of a parallel loop - template - concept invocable_with_int = requires(Func&& f, int i) { - { std::forward(f)(i) }; // Checks if calling f(i) is valid - }; - - - // for OpenMP, we just use the for pragma for the outermost loop - template - requires invocable_with_int - void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) { - omp_set_num_threads(static_cast(n_threads)); - - #pragma omp parallel for schedule(static) - for (int i = kernel_start_idx; i < kernel_end_idx; ++i) { - loop_body(i); - } - } - - // for eventify, we calculate indices for evenly divided chunks of the outermost loop, - // create independent tasks and submit them to the tasking system - template - requires invocable_with_int - void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) { - auto task_system = eventify::task_system {}; - int tripcount = kernel_end_idx - kernel_start_idx + 1; - int chunk_size = tripcount / n_tasks; - int remainder = tripcount % n_tasks; - - for (int tid = 0; tid < n_tasks; ++tid) { - auto task = [tid, tripcount, chunk_size, remainder, loop_body]{ - int start_idx = tid * chunk_size; - int end_idx = start_idx + chunk_size - 1; - if (tripcount - end_idx == remainder) end_idx += remainder; - - for (int i = start_idx; i < end_idx; ++i) { - loop_body(i); - } - }; - task_system.submit(task); - } - } - - // parallelization strategy selector - template - requires invocable_with_int - void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) { - if (strategy_name == "omp") { - openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward(loop_body)); - } else if (strategy_name == "eventify") { - eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward(loop_body)); - } else { - throw std::invalid_argument("Unknown strategy: " + strategy_name); - } - } +// define concept to ensure that the loop bodies defined in kernels.cpp +// represent one invocable iteration of a parallel loop +template +concept invocable_with_int = requires(Func&& f, int i) { + { std::forward(f)(i) }; // Checks if calling f(i) is valid +}; +// for OpenMP, we just use the for pragma for the outermost loop +template + requires invocable_with_int +void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, + Func&& loop_body) { + omp_set_num_threads(static_cast(n_threads)); + +#pragma omp parallel for schedule(static) + for (int i = kernel_start_idx; i < kernel_end_idx; ++i) { + loop_body(i); + } } -#endif //STRATEGY_HPP + +// for eventify, we calculate indices for evenly divided chunks of the outermost +// loop, create independent tasks and submit them to the tasking system +template + requires invocable_with_int +void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, + Func&& loop_body) { + auto task_system = eventify::task_system{}; + int tripcount = kernel_end_idx - kernel_start_idx + 1; + int chunk_size = tripcount / n_tasks; + int remainder = tripcount % n_tasks; + + for (int tid = 0; tid < n_tasks; ++tid) { + auto task = [tid, tripcount, chunk_size, remainder, loop_body] { + int start_idx = tid * chunk_size; + int end_idx = start_idx + chunk_size - 1; + if (tripcount - end_idx == remainder) end_idx += remainder; + + for (int i = start_idx; i < end_idx; ++i) { + loop_body(i); + } + }; + task_system.submit(task); + } +} + +// parallelization strategy selector +template + requires invocable_with_int +void execute_strategy(const std::string& strategy_name, int kernel_start_idx, + int kernel_end_idx, int num_threads_or_tasks, + Func&& loop_body) { + if (strategy_name == "omp") { + openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, + std::forward(loop_body)); + } else if (strategy_name == "eventify") { + eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, + std::forward(loop_body)); + } else { + throw std::invalid_argument("Unknown strategy: " + strategy_name); + } +} + +} // namespace strategy +#endif // STRATEGY_HPP diff --git a/include/utils.hpp b/include/utils.hpp index c061eb8..ace675e 100644 --- a/include/utils.hpp +++ b/include/utils.hpp @@ -5,12 +5,12 @@ // Function to initialize a vector with random numbers void initialize_vector(std::vector& v) { - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(0.0f, 1.0f); - for (auto& elem : v) { - elem = dis(gen); - } + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_real_distribution dis(0.0f, 1.0f); + for (auto& elem : v) { + elem = dis(gen); + } } -#endif //UTILS_HPP +#endif // UTILS_HPP diff --git a/src/kernels.cpp b/src/kernels.cpp index e6cef3c..d9f951a 100644 --- a/src/kernels.cpp +++ b/src/kernels.cpp @@ -1,21 +1,26 @@ +#include "kernels.hpp" + #include #include -#include "kernels.hpp" + #include "strategy.hpp" #include "utils.hpp" -Kernel::Kernel(const std::string& name, Kernel::StrategyFunction strategy_function, Kernel::PreparationFunction preparation_function) - : name_(name), strategy_function_(std::move(strategy_function)), preparation_function_(std::move(preparation_function)) {} +Kernel::Kernel(const std::string& name, + Kernel::StrategyFunction strategy_function, + Kernel::PreparationFunction preparation_function) + : name_(name), + strategy_function_(std::move(strategy_function)), + preparation_function_(std::move(preparation_function)) {} -void Kernel::prepare() const { - preparation_function_(); -} +void Kernel::prepare() const { preparation_function_(); } void Kernel::execute(int num_threads_or_tasks, int kernel_tripcount) const { strategy_function_(0, kernel_tripcount, num_threads_or_tasks); } -void KernelRegistry::register_kernel(const std::string& name, KernelBuilder factory) { +void KernelRegistry::register_kernel(const std::string& name, + KernelBuilder factory) { registry_.emplace(name, std::move(factory)); } @@ -35,17 +40,17 @@ std::vector KernelRegistry::list_available_kernels() const { return kernel_names; } -// New kernels go here, each can have it's own set of arguments and initializations -// execute() contains the full kernel code minus an outer for loop (i=start, iregister_kernel("stream_triad", [&]() { auto a = std::make_shared>(); auto b = std::make_shared>(); auto c = std::make_shared>(); - + auto prepare = [=]() { a->resize(VECTOR_SIZE); b->resize(VECTOR_SIZE); @@ -54,10 +59,11 @@ void initialize_registry(KernelRegistry* registry, std::string strategy_name) { initialize_vector(*c); }; - auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) { - strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) { - (*a)[i] = (*b)[i] + 0.5f * (*c)[i]; - }); + auto execute = [=](int kernel_start_idx, int kernel_end_idx, + int num_threads_or_tasks) { + strategy::execute_strategy( + strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, + [&](int i) { (*a)[i] = (*b)[i] + 0.5f * (*c)[i]; }); }; return Kernel("stream_triad", execute, prepare); @@ -74,10 +80,11 @@ void initialize_registry(KernelRegistry* registry, std::string strategy_name) { initialize_vector(*b); }; - auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) { - strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) { - (*a)[i] += 0.5f * (*b)[i]; - }); + auto execute = [=](int kernel_start_idx, int kernel_end_idx, + int num_threads_or_tasks) { + strategy::execute_strategy(strategy_name, kernel_start_idx, + kernel_end_idx, num_threads_or_tasks, + [&](int i) { (*a)[i] += 0.5f * (*b)[i]; }); }; return Kernel("daxpy", execute, prepare); @@ -106,16 +113,19 @@ void initialize_registry(KernelRegistry* registry, std::string strategy_name) { initialize_vector(*ry); initialize_vector(*rz); }; - - auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) { - strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) { - (*potential)[i] = (*charge1)[i] * (*charge2)[i] / std::sqrt((*rx)[i] * (*rx)[i] + (*ry)[i] * (*ry)[i] + (*rz)[i] * (*rz)[i]); - }); + + auto execute = [=](int kernel_start_idx, int kernel_end_idx, + int num_threads_or_tasks) { + strategy::execute_strategy( + strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, + [&](int i) { + (*potential)[i] = + (*charge1)[i] * (*charge2)[i] / + std::sqrt((*rx)[i] * (*rx)[i] + (*ry)[i] * (*ry)[i] + + (*rz)[i] * (*rz)[i]); + }); }; - + return Kernel("coulomb", execute, prepare); }); - - } - diff --git a/src/main.cpp b/src/main.cpp index 408f61a..f507904 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,51 +1,54 @@ -#include #include +#include + #include "kernels.hpp" int main(int argc, char** argv) { + if (argc != 4) { + std::cerr << "Usage: " << argv[0] + << " \n"; + return 1; + } - if (argc != 4) { - std::cerr << "Usage: " << argv[0] << " \n"; - return 1; + std::string kernel_name = argv[1]; + std::string strategy_name = argv[2]; + int num_threads_or_tasks = std::stoul(argv[3]); + + // registry contains a map of kernels generated from kernel builders for the + // selected parallelization strategy + KernelRegistry registry; + initialize_registry(®istry, strategy_name); + + try { + // find kernel in unordered_map by it's name. prepare() allocates and + // initializes data structures needed for the selected kernel + Kernel kernel = registry.load_kernel(kernel_name); + kernel.prepare(); + + // Time the kernel execution + auto start_time = std::chrono::high_resolution_clock::now(); + + // VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM + kernel.execute(num_threads_or_tasks, VECTOR_SIZE); + + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end_time - start_time; + + std::cout << "Kernel: " << kernel_name << "\n"; + std::cout << "Parallelization strategy: " << strategy_name << "\n"; + std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n"; + std::cout << "Kernel execution time [ms]: " << duration.count() << "\n"; + + } catch (const std::invalid_argument& e) { + // If kernel name is invalid, list available kernels + std::cerr << e.what() << "\n"; + std::cerr << "Available kernels are:\n"; + for (const auto& kernel_name : registry.list_available_kernels()) { + std::cerr << " - " << kernel_name << "\n"; } - - std::string kernel_name = argv[1]; - std::string strategy_name = argv[2]; - int num_threads_or_tasks = std::stoul(argv[3]); - // registry contains a map of kernels generated from kernel builders for the selected parallelization strategy - KernelRegistry registry; - initialize_registry(®istry, strategy_name); - - try{ - // find kernel in unordered_map by it's name. prepare() allocates and initializes data structures needed for the selected kernel - Kernel kernel = registry.load_kernel(kernel_name); - kernel.prepare(); - - // Time the kernel execution - auto start_time = std::chrono::high_resolution_clock::now(); - - // VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM - kernel.execute(num_threads_or_tasks, VECTOR_SIZE); - - auto end_time = std::chrono::high_resolution_clock::now(); - std::chrono::duration duration = end_time - start_time; - - std::cout << "Kernel: " << kernel_name << "\n"; - std::cout << "Parallelization strategy: " << strategy_name << "\n"; - std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n"; - std::cout << "Kernel execution time [ms]: " << duration.count() << "\n"; - } catch (const std::invalid_argument& e) { - // If kernel name is invalid, list available kernels - std::cerr << e.what() << "\n"; - std::cerr << "Available kernels are:\n"; - - // List available kernels from registry - for (const auto& kernel_name : registry.list_available_kernels()) { - std::cerr << " - " << kernel_name << "\n"; - } - - return 1; - } - return 0; + return 1; + } + + return 0; }