diff --git a/.gitignore b/.gitignore
index e257658..ad09842 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,5 @@
 *.out
 *.app
 
+bin/*
+obj/*
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..bcb35f8
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,39 @@
+CXX ?= g++
+OPTFLAGS ?= -O3 -march=native
+
+VECTOR_SIZE ?= 268435456
+CPPFLAGS = -DVECTOR_SIZE=$(VECTOR_SIZE)
+INCLUDES = -I./include -I$(EVENTIFY_ROOT)/include
+
+CXXFLAGS = $(CPPFLAGS) -std=c++20 -fopenmp $(INCLUDES) $(OPTFLAGS)
+LDFLAGS = -fopenmp -L$(EVENTIFY_ROOT)/lib
+# Libraries are kept separate from LDFLAGS so they can follow the object files
+# on the link line; linkers that resolve symbols left to right need that order.
+LDLIBS = -leventify
+
+SRC_DIR = src
+INCLUDE_DIR = include
+OBJ_DIR = obj
+BIN_DIR = bin
+
+SRCS = $(wildcard $(SRC_DIR)/*.cpp)
+
+OBJS = $(SRCS:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.o)
+
+TARGET = $(BIN_DIR)/benchmark
+
+# Default rule to build the program
+all: $(TARGET)
+
+$(TARGET): $(OBJS)
+	@mkdir -p $(BIN_DIR)
+	$(CXX) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@
+
+$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp
+	@mkdir -p $(OBJ_DIR)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(BIN_DIR) $(OBJ_DIR)
+
+.PHONY: all clean
diff --git a/README.md b/README.md
index 0624f2f..7405056 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,105 @@
-# pkbf
+pkbf - Parallel Kernel Benchmarking Framework
+This project provides a benchmarking framework for parallel computing kernels, where the execution of the kernels can be parallelized using OpenMP or Eventify to compare both for the FlexFMM collaborative project.
+The application is designed to make adding kernels and parallelization strategies as easy as possible.
+
+## Features
+
+- **Kernel Registry**: A registry that allows the user to register and execute different computational kernels easily.
+- **Parallelization Strategies**: Two strategies for parallelizing the execution of kernel loops:
+  - **OpenMP**: Uses OpenMP directives to parallelize the outermost loop.
+  - **Eventify**: Uses the Eventify tasking system for parallelism.
+- **Kernel Execution**: Kernels such as **STREAM TRIAD** and **DAXPY** are implemented, and their execution can be timed and compared across different parallelization strategies.
+
+## Project Structure
+
+.
+├── bin/                # Compiled executable
+├── include/            # Header files
+│   ├── kernels.hpp     # Kernel and KernelRegistry declarations
+│   ├── strategy.hpp    # Parallelization strategies (OpenMP, Eventify)
+│   └── utils.hpp       # Utility functions for initialization
+├── src/                # Source files
+│   ├── kernels.cpp     # Kernel and KernelRegistry implementations
+│   ├── strategy.cpp    # Parallelization strategies (OpenMP, Eventify)
+│   ├── main.cpp        # Main entry point for benchmarking
+├── Makefile            # Makefile to build the project
+└── README.md           # Project documentation
+
+## Requirements
+
+- C++20 or higher
+- OpenMP support (for OpenMP parallelization strategy)
+- Eventify library (for Eventify parallelization strategy)
+
+### Dependencies:
+
+- **Eventify**: Ensure that the Eventify library is properly installed and the environment variable `EVENTIFY_ROOT` points to the root directory of the Eventify installation.
+
+## Building the Project
+
+To build the project, run:
+
+```
+make
+```
+
+This will compile the source files and generate an executable called `benchmark` in the `bin/` directory.
+
+### Clean Up
+
+To remove all compiled files and the executable, run:
+
+```
+make clean
+```
+
+## Usage
+
+### Running the Benchmark
+
+To run a kernel benchmark, use the following command:
+
+```
+./bin/benchmark <kernel_name> <strategy_name> <num_threads_or_tasks>
+```
+
+- `<kernel_name>`: The name of the kernel to run. Example: `stream_triad`
+- `<strategy_name>`: The parallelization strategy to use. Available options: `omp` (for OpenMP) and `eventify` (for Eventify).
+- `<num_threads_or_tasks>`: The number of threads or tasks to use for parallel execution. This depends on the parallelization strategy (e.g., number of threads for OpenMP, number of tasks for Eventify).
+
+### Example:
+
+To run the `stream_triad` kernel with the OpenMP strategy using 4 threads:
+
+```
+./bin/benchmark stream_triad omp 4
+```
+
+To run the `daxpy` kernel with the Eventify strategy using 8 tasks:
+
+```
+./bin/benchmark daxpy eventify 8
+```
+
+### Error Handling
+
+- If an invalid kernel name is provided, the program will print an error message and list available kernels.
+
+Example of an invalid kernel name:
+
+```
+$ ./bin/benchmark invalid_kernel omp 4
+Kernel not found: invalid_kernel
+Available kernels are:
+ - stream_triad
+ - daxpy
+```
+
+## Contributing
+
+Feel free to submit issues or pull requests to improve the project.
+
+## License
+
+This project is licensed under the MIT License.
diff --git a/include/kernels.hpp b/include/kernels.hpp
new file mode 100644
index 0000000..f6c6be4
--- /dev/null
+++ b/include/kernels.hpp
@@ -0,0 +1,46 @@
+#ifndef KERNELS_HPP
+#define KERNELS_HPP
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+
+// A named benchmark kernel: a preparation step plus a strategy-specific execution step.
+class Kernel {
+public:
+    // Called as (kernel_start_idx, kernel_end_idx, num_threads_or_tasks); runs the parallel loop of the kernel.
+    using StrategyFunction = std::function<void(int, int, int)>;
+    // Called without arguments; allocates and initializes the data the kernel works on.
+    using PreparationFunction = std::function<void()>;
+
+    Kernel(const std::string& name, StrategyFunction strategy_function, PreparationFunction preparation_function);
+
+    void prepare() const;
+    void execute(int n_threads_or_tasks, int kernel_tripcount) const;
+
+private:
+    std::string name_;
+    StrategyFunction strategy_function_;
+    PreparationFunction preparation_function_;
+};
+
+// Maps kernel names to builders that construct the corresponding Kernel on demand.
+class KernelRegistry {
+public:
+    using KernelBuilder = std::function<Kernel()>;
+
+    void register_kernel(const std::string& name, KernelBuilder factory);
+    Kernel load_kernel(const std::string& name) const;
+    std::vector<std::string> list_available_kernels() const;
+
+private:
+    // FIXME: no benchmarking of maps done.
The registry is expected to stay small, though
+    std::unordered_map<std::string, KernelBuilder> registry_;
+};
+
+void initialize_registry(KernelRegistry* registry, std::string strategy_name);
+
+#endif // KERNELS_HPP
+
diff --git a/include/strategy.hpp b/include/strategy.hpp
new file mode 100644
index 0000000..c298113
--- /dev/null
+++ b/include/strategy.hpp
@@ -0,0 +1,83 @@
+#ifndef STRATEGY_HPP
+#define STRATEGY_HPP
+
+#include <omp.h>
+#include <stdexcept>
+#include <string>
+#include <utility>
+// NOTE(review): the include targets were lost in this copy of the patch. The standard headers above are
+// inferred from usage; the Eventify header path below is a guess and must be confirmed.
+#include <eventify/task_system.hpp>
+
+// Parallelization strategies are defined here. Assumption for now: there is always an outer loop that can be parallelized.
+// The strategies are templates instantiated when adding kernels to the kernel registry.
+// Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp
+
+namespace strategy {
+
+    // define concept to ensure that the loop bodies defined in kernels.cpp represent one invocable iteration of a parallel loop
+    template <typename Func>
+    concept invocable_with_int = requires(Func&& f, int i) {
+        { std::forward<Func>(f)(i) }; // Checks if calling f(i) is valid
+    };
+
+
+    // for OpenMP, we just use the for pragma for the outermost loop
+    template <typename Func>
+    requires invocable_with_int<Func>
+    void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) {
+        omp_set_num_threads(static_cast<int>(n_threads));
+
+        #pragma omp parallel for schedule(static)
+        for (int i = kernel_start_idx; i < kernel_end_idx; ++i) {
+            loop_body(i);
+        }
+    }
+
+    // for eventify, the half-open range [kernel_start_idx, kernel_end_idx) is split into n_tasks contiguous chunks;
+    // the first (tripcount % n_tasks) chunks get one extra iteration so that every index is visited exactly once.
+    // Each chunk becomes an independent task that is submitted to the tasking system.
+    // Throws std::invalid_argument if n_tasks is not positive.
+    template <typename Func>
+    requires invocable_with_int<Func>
+    void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) {
+        if (n_tasks <= 0) {
+            throw std::invalid_argument("Number of tasks must be positive");
+        }
+
+        // NOTE(review): presumably the task_system destructor blocks until all submitted tasks have finished - confirm.
+        auto task_system = eventify::task_system {};
+        const int tripcount = kernel_end_idx - kernel_start_idx;
+        const int chunk_size = tripcount / n_tasks;
+        const int remainder = tripcount % n_tasks;
+
+        for (int tid = 0; tid < n_tasks; ++tid) {
+            // the tasks before this one have absorbed min(tid, remainder) of the leftover iterations
+            const int extra_before = tid < remainder ? tid : remainder;
+            const int start_idx = kernel_start_idx + tid * chunk_size + extra_before;
+            const int end_idx = start_idx + chunk_size + (tid < remainder ? 1 : 0);
+
+            auto task = [start_idx, end_idx, loop_body]{
+                for (int i = start_idx; i < end_idx; ++i) {
+                    loop_body(i);
+                }
+            };
+            task_system.submit(task);
+        }
+    }
+
+    // parallelization strategy selector
+    template <typename Func>
+    requires invocable_with_int<Func>
+    void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) {
+        if (strategy_name == "omp") {
+            openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
+        } else if (strategy_name == "eventify") {
+            eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
+        } else {
+            throw std::invalid_argument("Unknown strategy: " + strategy_name);
+        }
+    }
+
+}
+#endif //STRATEGY_HPP
diff --git a/include/utils.hpp b/include/utils.hpp
new file mode 100644
index 0000000..c061eb8
--- /dev/null
+++ b/include/utils.hpp
@@ -0,0 +1,19 @@
+#ifndef UTILS_HPP
+#define UTILS_HPP
+
+#include <random>
+#include <vector>
+
+// Function to initialize a vector with random numbers drawn from a uniform distribution constructed with (0.0f, 1.0f).
+// Declared inline because it is defined in a header; otherwise including utils.hpp from more than one
+// translation unit would produce duplicate definitions at link time.
+inline void initialize_vector(std::vector<float>& v) {
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(0.0f, 1.0f);
+    for (auto& elem : v) {
+        elem = dis(gen);
+    }
+}
+
+#endif //UTILS_HPP
diff --git a/src/kernels.cpp b/src/kernels.cpp
new file mode 100644
index 0000000..d1f14b3
--- /dev/null
+++ b/src/kernels.cpp
@@ -0,0 +1,87 @@
+#include <memory>
+#include <stdexcept>
+#include "kernels.hpp"
+#include "strategy.hpp"
+#include "utils.hpp"
+
+Kernel::Kernel(const std::string& name, Kernel::StrategyFunction strategy_function, Kernel::PreparationFunction preparation_function)
+    : name_(name), strategy_function_(std::move(strategy_function)), preparation_function_(std::move(preparation_function)) {}
+
+void Kernel::prepare() const {
+    preparation_function_();
+}
+
+void Kernel::execute(int num_threads_or_tasks, int
kernel_tripcount) const {
+    strategy_function_(0, kernel_tripcount, num_threads_or_tasks);
+}
+
+void KernelRegistry::register_kernel(const std::string& name, KernelBuilder factory) {
+    registry_.emplace(name, std::move(factory));
+}
+
+Kernel KernelRegistry::load_kernel(const std::string& name) const {
+    auto it = registry_.find(name);
+    if (it == registry_.end()) {
+        throw std::invalid_argument("Kernel not found: " + name);
+    }
+    return it->second();
+}
+
+std::vector<std::string> KernelRegistry::list_available_kernels() const {
+    std::vector<std::string> kernel_names;
+    for (const auto& entry : registry_) {
+        kernel_names.push_back(entry.first);
+    }
+    return kernel_names;
+}
+
+// New kernels go here, each can have its own set of arguments and initializations
+// execute() contains the full kernel code minus an outer for loop (i=start, i<end, ++i), which the selected strategy supplies.
+// Builders capture strategy_name by value: they are invoked later by load_kernel(), after this function (and its parameter) is gone.
+void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
+
+    // STREAM TRIAD
+    registry->register_kernel("stream_triad", [strategy_name]() {
+        auto a = std::make_shared<std::vector<float>>();
+        auto b = std::make_shared<std::vector<float>>();
+        auto c = std::make_shared<std::vector<float>>();
+
+        auto prepare = [=]() {
+            a->resize(VECTOR_SIZE);
+            b->resize(VECTOR_SIZE);
+            c->resize(VECTOR_SIZE);
+            initialize_vector(*b);
+            initialize_vector(*c);
+        };
+
+        auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
+            strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
+                (*a)[i] = (*b)[i] + 0.5f * (*c)[i];
+            });
+        };
+
+        return Kernel("stream_triad", execute, prepare);
+    });
+
+    // DAXPY
+    registry->register_kernel("daxpy", [strategy_name]() {
+        auto a = std::make_shared<std::vector<float>>();
+        auto b = std::make_shared<std::vector<float>>();
+
+        auto prepare = [=]() {
+            a->resize(VECTOR_SIZE);
+            b->resize(VECTOR_SIZE);
+            initialize_vector(*b);
+        };
+
+        auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
+            strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
+                (*a)[i] += 0.5f * (*b)[i];
+            });
+        };
+
+        return Kernel("daxpy", execute, prepare);
+    });
+
+}
+
diff --git a/src/main.cpp
b/src/main.cpp
new file mode 100644
index 0000000..408f61a
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,71 @@
+#include <chrono>
+#include <iostream>
+#include <stdexcept>
+#include <string>
+#include "kernels.hpp"
+
+// Benchmark driver: runs one kernel with one parallelization strategy and prints the measured execution time.
+// Returns 0 on success and 1 on a usage error, an unknown kernel or an unknown strategy.
+int main(int argc, char** argv) {
+
+    if (argc != 4) {
+        std::cerr << "Usage: " << argv[0] << " <kernel_name> <strategy_name> <num_threads_or_tasks>\n";
+        return 1;
+    }
+
+    std::string kernel_name = argv[1];
+    std::string strategy_name = argv[2];
+
+    // std::stoi throws on non-numeric or out-of-range input; treat that like any other invalid count
+    int num_threads_or_tasks = 0;
+    try {
+        num_threads_or_tasks = std::stoi(argv[3]);
+    } catch (const std::exception&) {
+        num_threads_or_tasks = 0;
+    }
+    if (num_threads_or_tasks <= 0) {
+        std::cerr << "Number of threads / tasks must be a positive integer, got: " << argv[3] << "\n";
+        return 1;
+    }
+
+    // registry contains a map of kernels generated from kernel builders for the selected parallelization strategy
+    KernelRegistry registry;
+    initialize_registry(&registry, strategy_name);
+
+    // distinguishes "kernel not found" from errors raised while running the kernel (e.g. an unknown strategy)
+    bool kernel_loaded = false;
+
+    try{
+        // find kernel in unordered_map by its name. prepare() allocates and initializes data structures needed for the selected kernel
+        Kernel kernel = registry.load_kernel(kernel_name);
+        kernel_loaded = true;
+        kernel.prepare();
+
+        // Time the kernel execution
+        auto start_time = std::chrono::high_resolution_clock::now();
+
+        // VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM
+        kernel.execute(num_threads_or_tasks, VECTOR_SIZE);
+
+        auto end_time = std::chrono::high_resolution_clock::now();
+        std::chrono::duration<double, std::milli> duration = end_time - start_time;
+
+        std::cout << "Kernel: " << kernel_name << "\n";
+        std::cout << "Parallelization strategy: " << strategy_name << "\n";
+        std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n";
+        std::cout << "Kernel execution time [ms]: " << duration.count() << "\n";
+    } catch (const std::invalid_argument& e) {
+        std::cerr << e.what() << "\n";
+
+        // Only a failed kernel lookup warrants listing the available kernels
+        if (!kernel_loaded) {
+            std::cerr << "Available kernels are:\n";
+            for (const auto& name : registry.list_available_kernels()) {
+                std::cerr << " - " << name << "\n";
+            }
+        }
+
+        return 1;
+    }
+    return 0;
+}