Applied Google formatting
This commit is contained in:
parent
8dd3de290b
commit
3f9e253f23
|
@ -1,41 +1,41 @@
|
|||
#ifndef KERNELS_HPP
|
||||
#define KERNELS_HPP
|
||||
|
||||
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>
|
||||
|
||||
|
||||
// A named benchmark kernel. It bundles two callbacks: one that prepares the
// kernel's data and one that runs the kernel through a parallelization
// strategy.
class Kernel {
 public:
  // Callback run by execute(). It is invoked with three ints, in this order:
  // start index, end index, number of threads or tasks.
  using StrategyFunction = std::function<void(int, int, int)>;
  // Callback run by prepare(); takes no arguments and returns nothing.
  using PreparationFunction = std::function<void()>;

  // Stores a copy of `name` and takes ownership of both callbacks.
  Kernel(const std::string& name, StrategyFunction strategy_function,
         PreparationFunction preparation_function);

  // Invokes the preparation callback.
  void prepare() const;

  // Invokes the strategy callback with start index 0, end index
  // `kernel_tripcount` and the given number of threads or tasks.
  void execute(int n_threads_or_tasks, int kernel_tripcount) const;

 private:
  std::string name_;
  StrategyFunction strategy_function_;
  PreparationFunction preparation_function_;
};
|
||||
|
||||
class KernelRegistry {
|
||||
public:
|
||||
public:
|
||||
using KernelBuilder = std::function<Kernel()>;
|
||||
|
||||
void register_kernel(const std::string& name, KernelBuilder factory);
|
||||
Kernel load_kernel(const std::string& name) const;
|
||||
std::vector<std::string> list_available_kernels() const;
|
||||
|
||||
private:
|
||||
// FIXME: no benchmarking of maps done. The registry is expected to stay small, though
|
||||
private:
|
||||
// FIXME: no benchmarking of maps done. The registry is expected to stay
|
||||
// small, though
|
||||
std::unordered_map<std::string, KernelBuilder> registry_;
|
||||
};
|
||||
|
||||
void initialize_registry(KernelRegistry* registry, std::string strategy_name);
|
||||
|
||||
#endif // KERNELS_HPP
|
||||
|
||||
#endif // KERNELS_HPP
|
||||
|
|
|
@ -2,71 +2,80 @@
|
|||
#define STRATEGY_HPP
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
#include <eventify/task_system.hxx>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <eventify/task_system.hxx>
|
||||
|
||||
// Parallelization strategies are defined here. Assumption for now: there is always an outer loop that can be parallelized.
|
||||
// The strategies are templates instantiated when adding kernels to the kernel registry.
|
||||
// Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp
|
||||
// Parallelization strategies are defined here. Assumption for now: there is
|
||||
// always an outer loop that can be parallelized. The strategies are templates
|
||||
// instantiated when adding kernels to the kernel registry.
|
||||
// Here, we only define the treatment of the outermost loop. The loop bodies
|
||||
// are defined in kernels.cpp
|
||||
|
||||
namespace strategy {
|
||||
|
||||
// define concept to ensure that the loop bodies defined in kernels.cpp represent one invocable iteration of a parallel loop
|
||||
template <typename Func>
|
||||
concept invocable_with_int = requires(Func&& f, int i) {
|
||||
{ std::forward<Func>(f)(i) }; // Checks if calling f(i) is valid
|
||||
};
|
||||
|
||||
|
||||
// for OpenMP, we just use the for pragma for the outermost loop
|
||||
template <typename Func>
|
||||
requires invocable_with_int <Func>
|
||||
void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) {
|
||||
omp_set_num_threads(static_cast<int>(n_threads));
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = kernel_start_idx; i < kernel_end_idx; ++i) {
|
||||
loop_body(i);
|
||||
}
|
||||
}
|
||||
|
||||
// for eventify, we calculate indices for evenly divided chunks of the outermost loop,
|
||||
// create independent tasks and submit them to the tasking system
|
||||
template <typename Func>
|
||||
requires invocable_with_int <Func>
|
||||
void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) {
|
||||
auto task_system = eventify::task_system {};
|
||||
int tripcount = kernel_end_idx - kernel_start_idx + 1;
|
||||
int chunk_size = tripcount / n_tasks;
|
||||
int remainder = tripcount % n_tasks;
|
||||
|
||||
for (int tid = 0; tid < n_tasks; ++tid) {
|
||||
auto task = [tid, tripcount, chunk_size, remainder, loop_body]{
|
||||
int start_idx = tid * chunk_size;
|
||||
int end_idx = start_idx + chunk_size - 1;
|
||||
if (tripcount - end_idx == remainder) end_idx += remainder;
|
||||
|
||||
for (int i = start_idx; i < end_idx; ++i) {
|
||||
loop_body(i);
|
||||
}
|
||||
};
|
||||
task_system.submit(task);
|
||||
}
|
||||
}
|
||||
|
||||
// parallelization strategy selector
|
||||
template <typename Func>
|
||||
requires invocable_with_int<Func>
|
||||
void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) {
|
||||
if (strategy_name == "omp") {
|
||||
openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
|
||||
} else if (strategy_name == "eventify") {
|
||||
eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
|
||||
} else {
|
||||
throw std::invalid_argument("Unknown strategy: " + strategy_name);
|
||||
}
|
||||
}
|
||||
// define concept to ensure that the loop bodies defined in kernels.cpp
|
||||
// represent one invocable iteration of a parallel loop
|
||||
template <typename Func>
|
||||
concept invocable_with_int = requires(Func&& f, int i) {
|
||||
{ std::forward<Func>(f)(i) }; // Checks if calling f(i) is valid
|
||||
};
|
||||
|
||||
// for OpenMP, we just use the for pragma for the outermost loop
|
||||
template <typename Func>
|
||||
requires invocable_with_int<Func>
|
||||
void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads,
|
||||
Func&& loop_body) {
|
||||
omp_set_num_threads(static_cast<int>(n_threads));
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (int i = kernel_start_idx; i < kernel_end_idx; ++i) {
|
||||
loop_body(i);
|
||||
}
|
||||
}
|
||||
#endif //STRATEGY_HPP
|
||||
|
||||
// for eventify, we calculate indices for evenly divided chunks of the outermost
|
||||
// loop, create independent tasks and submit them to the tasking system
|
||||
template <typename Func>
|
||||
requires invocable_with_int<Func>
|
||||
void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks,
|
||||
Func&& loop_body) {
|
||||
auto task_system = eventify::task_system{};
|
||||
int tripcount = kernel_end_idx - kernel_start_idx + 1;
|
||||
int chunk_size = tripcount / n_tasks;
|
||||
int remainder = tripcount % n_tasks;
|
||||
|
||||
for (int tid = 0; tid < n_tasks; ++tid) {
|
||||
auto task = [tid, tripcount, chunk_size, remainder, loop_body] {
|
||||
int start_idx = tid * chunk_size;
|
||||
int end_idx = start_idx + chunk_size - 1;
|
||||
if (tripcount - end_idx == remainder) end_idx += remainder;
|
||||
|
||||
for (int i = start_idx; i < end_idx; ++i) {
|
||||
loop_body(i);
|
||||
}
|
||||
};
|
||||
task_system.submit(task);
|
||||
}
|
||||
}
|
||||
|
||||
// parallelization strategy selector
|
||||
template <typename Func>
|
||||
requires invocable_with_int<Func>
|
||||
void execute_strategy(const std::string& strategy_name, int kernel_start_idx,
|
||||
int kernel_end_idx, int num_threads_or_tasks,
|
||||
Func&& loop_body) {
|
||||
if (strategy_name == "omp") {
|
||||
openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks,
|
||||
std::forward<Func>(loop_body));
|
||||
} else if (strategy_name == "eventify") {
|
||||
eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks,
|
||||
std::forward<Func>(loop_body));
|
||||
} else {
|
||||
throw std::invalid_argument("Unknown strategy: " + strategy_name);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace strategy
|
||||
#endif // STRATEGY_HPP
|
||||
|
|
|
@ -5,12 +5,12 @@
|
|||
|
||||
// Function to initialize a vector with random numbers
|
||||
// Overwrites every element of `v` with a pseudo-random float drawn from a
// uniform distribution between 0.0f and 1.0f. The size of `v` is unchanged;
// an empty vector is left untouched.
//
// A fresh std::mt19937 engine, seeded from std::random_device, is created on
// every call, so results differ between calls and between runs.
//
// Declared `inline` because the definition lives in a header: without it,
// including this header from more than one translation unit would produce
// multiple definitions at link time.
inline void initialize_vector(std::vector<float>& v) {
  std::random_device rd;
  std::mt19937 gen(rd());
  std::uniform_real_distribution<float> dis(0.0f, 1.0f);
  for (auto& elem : v) {
    elem = dis(gen);
  }
}
|
||||
|
||||
#endif //UTILS_HPP
|
||||
#endif // UTILS_HPP
|
||||
|
|
|
@ -1,21 +1,26 @@
|
|||
#include "kernels.hpp"
|
||||
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include "kernels.hpp"
|
||||
|
||||
#include "strategy.hpp"
|
||||
#include "utils.hpp"
|
||||
|
||||
// Constructs a kernel: copies `name` and moves both callbacks into the
// object. The callbacks are taken by value so callers can pass either
// temporaries or copies.
Kernel::Kernel(const std::string& name,
               Kernel::StrategyFunction strategy_function,
               Kernel::PreparationFunction preparation_function)
    : name_(name),
      strategy_function_(std::move(strategy_function)),
      preparation_function_(std::move(preparation_function)) {}
|
||||
|
||||
// Runs the preparation callback supplied at construction.
void Kernel::prepare() const { preparation_function_(); }
|
||||
|
||||
// Runs the strategy callback over the kernel's index range, which always
// starts at 0 and ends at `kernel_tripcount`.
void Kernel::execute(int num_threads_or_tasks, int kernel_tripcount) const {
  constexpr int kFirstIndex = 0;
  strategy_function_(kFirstIndex, kernel_tripcount, num_threads_or_tasks);
}
|
||||
|
||||
void KernelRegistry::register_kernel(const std::string& name, KernelBuilder factory) {
|
||||
void KernelRegistry::register_kernel(const std::string& name,
|
||||
KernelBuilder factory) {
|
||||
registry_.emplace(name, std::move(factory));
|
||||
}
|
||||
|
||||
|
@ -35,17 +40,17 @@ std::vector<std::string> KernelRegistry::list_available_kernels() const {
|
|||
return kernel_names;
|
||||
}
|
||||
|
||||
// New kernels go here; each can have its own set of arguments and initializations.
|
||||
// execute() contains the full kernel code minus an outer for loop (i=start, i<end, ++i),
|
||||
// defined in the respective parallelization strategy
|
||||
// New kernels go here; each can have its own set of arguments and
|
||||
// initializations. execute() contains the full kernel code minus an outer for
|
||||
// loop (i=start, i<end, ++i), defined in the respective parallelization
|
||||
// strategy
|
||||
void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
|
||||
|
||||
// STREAM TRIAD
|
||||
registry->register_kernel("stream_triad", [&]() {
|
||||
auto a = std::make_shared<std::vector<float>>();
|
||||
auto b = std::make_shared<std::vector<float>>();
|
||||
auto c = std::make_shared<std::vector<float>>();
|
||||
|
||||
|
||||
auto prepare = [=]() {
|
||||
a->resize(VECTOR_SIZE);
|
||||
b->resize(VECTOR_SIZE);
|
||||
|
@ -54,10 +59,11 @@ void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
|
|||
initialize_vector(*c);
|
||||
};
|
||||
|
||||
auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
|
||||
strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
|
||||
(*a)[i] = (*b)[i] + 0.5f * (*c)[i];
|
||||
});
|
||||
auto execute = [=](int kernel_start_idx, int kernel_end_idx,
|
||||
int num_threads_or_tasks) {
|
||||
strategy::execute_strategy(
|
||||
strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks,
|
||||
[&](int i) { (*a)[i] = (*b)[i] + 0.5f * (*c)[i]; });
|
||||
};
|
||||
|
||||
return Kernel("stream_triad", execute, prepare);
|
||||
|
@ -74,10 +80,11 @@ void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
|
|||
initialize_vector(*b);
|
||||
};
|
||||
|
||||
auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
|
||||
strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
|
||||
(*a)[i] += 0.5f * (*b)[i];
|
||||
});
|
||||
auto execute = [=](int kernel_start_idx, int kernel_end_idx,
|
||||
int num_threads_or_tasks) {
|
||||
strategy::execute_strategy(strategy_name, kernel_start_idx,
|
||||
kernel_end_idx, num_threads_or_tasks,
|
||||
[&](int i) { (*a)[i] += 0.5f * (*b)[i]; });
|
||||
};
|
||||
|
||||
return Kernel("daxpy", execute, prepare);
|
||||
|
@ -106,16 +113,19 @@ void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
|
|||
initialize_vector(*ry);
|
||||
initialize_vector(*rz);
|
||||
};
|
||||
|
||||
auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
|
||||
strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
|
||||
(*potential)[i] = (*charge1)[i] * (*charge2)[i] / std::sqrt((*rx)[i] * (*rx)[i] + (*ry)[i] * (*ry)[i] + (*rz)[i] * (*rz)[i]);
|
||||
});
|
||||
|
||||
auto execute = [=](int kernel_start_idx, int kernel_end_idx,
|
||||
int num_threads_or_tasks) {
|
||||
strategy::execute_strategy(
|
||||
strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks,
|
||||
[&](int i) {
|
||||
(*potential)[i] =
|
||||
(*charge1)[i] * (*charge2)[i] /
|
||||
std::sqrt((*rx)[i] * (*rx)[i] + (*ry)[i] * (*ry)[i] +
|
||||
(*rz)[i] * (*rz)[i]);
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
return Kernel("coulomb", execute, prepare);
|
||||
});
|
||||
|
||||
|
||||
}
|
||||
|
||||
|
|
89
src/main.cpp
89
src/main.cpp
|
@ -1,51 +1,54 @@
|
|||
#include <iostream>
|
||||
#include <chrono>
|
||||
#include <iostream>
|
||||
|
||||
#include "kernels.hpp"
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
if (argc != 4) {
|
||||
std::cerr << "Usage: " << argv[0]
|
||||
<< " <kernel_name> <strategy> <num_threads_or_tasks>\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (argc != 4) {
|
||||
std::cerr << "Usage: " << argv[0] << " <kernel_name> <strategy> <num_threads_or_tasks>\n";
|
||||
return 1;
|
||||
std::string kernel_name = argv[1];
|
||||
std::string strategy_name = argv[2];
|
||||
int num_threads_or_tasks = std::stoul(argv[3]);
|
||||
|
||||
// registry contains a map of kernels generated from kernel builders for the
|
||||
// selected parallelization strategy
|
||||
KernelRegistry registry;
|
||||
initialize_registry(®istry, strategy_name);
|
||||
|
||||
try {
|
||||
// find kernel in unordered_map by it's name. prepare() allocates and
|
||||
// initializes data structures needed for the selected kernel
|
||||
Kernel kernel = registry.load_kernel(kernel_name);
|
||||
kernel.prepare();
|
||||
|
||||
// Time the kernel execution
|
||||
auto start_time = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM
|
||||
kernel.execute(num_threads_or_tasks, VECTOR_SIZE);
|
||||
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
std::chrono::duration<double, std::milli> duration = end_time - start_time;
|
||||
|
||||
std::cout << "Kernel: " << kernel_name << "\n";
|
||||
std::cout << "Parallelization strategy: " << strategy_name << "\n";
|
||||
std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n";
|
||||
std::cout << "Kernel execution time [ms]: " << duration.count() << "\n";
|
||||
|
||||
} catch (const std::invalid_argument& e) {
|
||||
// If kernel name is invalid, list available kernels
|
||||
std::cerr << e.what() << "\n";
|
||||
std::cerr << "Available kernels are:\n";
|
||||
for (const auto& kernel_name : registry.list_available_kernels()) {
|
||||
std::cerr << " - " << kernel_name << "\n";
|
||||
}
|
||||
|
||||
std::string kernel_name = argv[1];
|
||||
std::string strategy_name = argv[2];
|
||||
int num_threads_or_tasks = std::stoul(argv[3]);
|
||||
|
||||
// registry contains a map of kernels generated from kernel builders for the selected parallelization strategy
|
||||
KernelRegistry registry;
|
||||
initialize_registry(®istry, strategy_name);
|
||||
|
||||
try{
|
||||
// find kernel in unordered_map by it's name. prepare() allocates and initializes data structures needed for the selected kernel
|
||||
Kernel kernel = registry.load_kernel(kernel_name);
|
||||
kernel.prepare();
|
||||
|
||||
// Time the kernel execution
|
||||
auto start_time = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM
|
||||
kernel.execute(num_threads_or_tasks, VECTOR_SIZE);
|
||||
|
||||
auto end_time = std::chrono::high_resolution_clock::now();
|
||||
std::chrono::duration<double, std::milli> duration = end_time - start_time;
|
||||
|
||||
std::cout << "Kernel: " << kernel_name << "\n";
|
||||
std::cout << "Parallelization strategy: " << strategy_name << "\n";
|
||||
std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n";
|
||||
std::cout << "Kernel execution time [ms]: " << duration.count() << "\n";
|
||||
} catch (const std::invalid_argument& e) {
|
||||
// If kernel name is invalid, list available kernels
|
||||
std::cerr << e.what() << "\n";
|
||||
std::cerr << "Available kernels are:\n";
|
||||
|
||||
// List available kernels from registry
|
||||
for (const auto& kernel_name : registry.list_available_kernels()) {
|
||||
std::cerr << " - " << kernel_name << "\n";
|
||||
}
|
||||
|
||||
return 1;
|
||||
}
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue