Initial commit

This commit is contained in:
Patrick Lipka 2024-12-13 00:33:08 +01:00
parent c1b0fee200
commit d53f0d883c
8 changed files with 409 additions and 1 deletions

2
.gitignore vendored
View File

@ -32,3 +32,5 @@
*.out *.out
*.app *.app
bin/*
obj/*

36
Makefile Normal file
View File

@ -0,0 +1,36 @@
# Build configuration for the pkbf benchmark.
# Overridable from the environment / command line: CXX, OPTFLAGS, VECTOR_SIZE.
# EVENTIFY_ROOT must point at the Eventify installation (include/ and lib/ below it).
CXX ?= g++
OPTFLAGS ?= -O3 -march=native
VECTOR_SIZE ?= 268435456

SRC_DIR = src
INCLUDE_DIR = include
OBJ_DIR = obj
BIN_DIR = bin

CPPFLAGS = -DVECTOR_SIZE=$(VECTOR_SIZE)
INCLUDES = -I./$(INCLUDE_DIR) -I$(EVENTIFY_ROOT)/include
# -MMD -MP writes a .d file next to every object so that editing a header
# (where most of the templates live) triggers a rebuild of its users.
CXXFLAGS = $(CPPFLAGS) -std=c++20 -fopenmp $(INCLUDES) $(OPTFLAGS) -MMD -MP
# Search paths and linker options go in LDFLAGS, libraries in LDLIBS:
# libraries have to follow the objects that reference them on the link line.
LDFLAGS = -fopenmp -L$(EVENTIFY_ROOT)/lib
LDLIBS = -leventify

SRCS = $(wildcard $(SRC_DIR)/*.cpp)
OBJS = $(SRCS:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.o)
DEPS = $(OBJS:.o=.d)
TARGET = $(BIN_DIR)/benchmark

# Default rule to build the program
all: $(TARGET)

$(TARGET): $(OBJS)
	@mkdir -p $(BIN_DIR)
	$(CXX) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@

$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp
	@mkdir -p $(OBJ_DIR)
	$(CXX) $(CXXFLAGS) -c $< -o $@

clean:
	rm -rf $(BIN_DIR) $(OBJ_DIR)

# Pull in the generated header dependencies; silently skipped on a fresh tree.
-include $(DEPS)

.PHONY: all clean

105
README.md
View File

@ -1,2 +1,105 @@
# pkbf pkbf - Parallel Kernel Benchmarking Framework
This project provides a benchmarking framework for parallel computing kernels, where the execution of the kernels can be parallelized using OpenMP or Eventify to compare both for the FlexFMM collaborative project.
The application is designed to make adding kernels and parallelization strategies as easy as possible.
## Features
- **Kernel Registry**: A registry that allows the user to register and execute different computational kernels easily.
- **Parallelization Strategies**: Two strategies for parallelizing the execution of kernel loops:
- **OpenMP**: Uses OpenMP directives to parallelize the outermost loop.
- **Eventify**: Uses the Eventify tasking system for parallelism.
- **Kernel Execution**: Kernels such as **STREAM TRIAD** and **DAXPY** are implemented, and their execution can be timed and compared across different parallelization strategies.
## Project Structure
.
├── bin/ # Compiled executable
├── include/ # Header files
│ ├── kernels.hpp # Kernel and KernelRegistry declarations
│ ├── strategy.hpp # Parallelization strategies (OpenMP, Eventify)
│ └── utils.hpp # Utility functions for initialization
├── src/ # Source files
│ ├── kernels.cpp # Kernel and KernelRegistry implementations
│ ├── strategy.cpp # Parallelization strategies (OpenMP, Eventify)
│ └── main.cpp # Main entry point for benchmarking
├── Makefile # Makefile to build the project
└── README.md # Project documentation
## Requirements
- C++20 or higher
- OpenMP support (for OpenMP parallelization strategy)
- Eventify library (for Eventify parallelization strategy)
### Dependencies:
- **Eventify**: Ensure that the Eventify library is properly installed and the environment variable `EVENTIFY_ROOT` points to the root directory of the Eventify installation.
## Building the Project
To build the project, run:
```
make
```
This will compile the source files and generate an executable called `benchmark` in the `bin/` directory.
### Clean Up
To remove all compiled files and the executable, run:
```
make clean
```
## Usage
### Running the Benchmark
To run a kernel benchmark, use the following command:
```
./bin/benchmark <kernel_name> <strategy> <num_threads_or_tasks>
```
- `<kernel_name>`: The name of the kernel to run. Example: `stream_triad`
- `<strategy>`: The parallelization strategy to use. Available options: `omp` (for OpenMP) and `eventify` (for Eventify).
- `<num_threads_or_tasks>`: The number of threads or tasks to use for parallel execution. This depends on the parallelization strategy (e.g., number of threads for OpenMP, number of tasks for Eventify).
### Example:
To run the `stream_triad` kernel with the OpenMP strategy using 4 threads:
```
./bin/benchmark stream_triad omp 4
```
To run the `daxpy` kernel with the Eventify strategy using 8 tasks:
```
./bin/benchmark daxpy eventify 8
```
### Error Handling
- If an invalid kernel name is provided, the program will print an error message and list available kernels.
Example of an invalid kernel name:
```
$ ./bin/benchmark invalid_kernel omp 4
Kernel not found: invalid_kernel
Available kernels are:
- stream_triad
- daxpy
```
## Contributing
Feel free to submit issues or pull requests to improve the project.
## License
This project is licensed under the MIT License.

41
include/kernels.hpp Normal file
View File

@ -0,0 +1,41 @@
#ifndef KERNELS_HPP
#define KERNELS_HPP
#include <functional>
#include <string>
#include <unordered_map>
#include <vector>
// A benchmark kernel: a name plus two callbacks, one that prepares the kernel's
// data and one that runs the kernel loop through a parallelization strategy.
class Kernel {
public:
// Callback that runs the kernel. execute() invokes it as
// (start index, end index, number of threads or tasks).
using StrategyFunction = std::function<void(int, int, int)>;
// Callback that sets up the data the kernel works on; takes no arguments.
using PreparationFunction = std::function<void()>;
// Stores the name and takes ownership of both callbacks.
Kernel(const std::string& name, StrategyFunction strategy_function, PreparationFunction preparation_function);
// Invokes the preparation callback.
void prepare() const;
// Invokes the strategy callback with start index 0, kernel_tripcount as the end index
// and n_threads_or_tasks as the degree of parallelism.
void execute(int n_threads_or_tasks, int kernel_tripcount) const;
private:
// Kernel name as passed to the constructor; no code visible here reads it back.
std::string name_;
StrategyFunction strategy_function_;
PreparationFunction preparation_function_;
};
// Maps kernel names to builder functions that create the corresponding Kernel on demand.
class KernelRegistry {
public:
// Factory that produces a fresh Kernel each time it is called.
using KernelBuilder = std::function<Kernel()>;
// Stores the builder under the given name. The implementation uses emplace,
// so a name that is already registered keeps its existing builder.
void register_kernel(const std::string& name, KernelBuilder factory);
// Runs the builder registered under name and returns the resulting Kernel.
// Throws std::invalid_argument ("Kernel not found: <name>") for an unknown name.
Kernel load_kernel(const std::string& name) const;
// Returns the names of all registered kernels in the map's iteration order,
// which is unspecified for an unordered_map.
std::vector<std::string> list_available_kernels() const;
private:
// FIXME: no benchmarking of maps done. The registry is expected to stay small, though
std::unordered_map<std::string, KernelBuilder> registry_;
};
// Registers the builders of the built-in kernels ("stream_triad" and "daxpy") in *registry.
// strategy_name selects the parallelization strategy the built kernels pass to
// strategy::execute_strategy when they are executed.
void initialize_registry(KernelRegistry* registry, std::string strategy_name);
#endif // KERNELS_HPP

72
include/strategy.hpp Normal file
View File

@ -0,0 +1,72 @@
#ifndef STRATEGY_HPP
#define STRATEGY_HPP
#include <omp.h>
#include <stdexcept>
#include <string>
#include <eventify/task_system.hxx>
// Parallelization strategies are defined here. Assumption for now: there is always an outer loop that can be parallelized.
// The strategies are templates instantiated when adding kernels to the kernel registry.
// Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp
namespace strategy {
// define concept to ensure that the loop bodies defined in kernels.cpp represent one invocable iteration of a parallel loop
// Satisfied when an object of type Func, forwarded with its value category, can be
// called with a single int argument. The result type of the call is not constrained.
template <typename Func>
concept invocable_with_int = requires(Func&& f, int i) {
{ std::forward<Func>(f)(i) }; // Checks if calling f(i) is valid
};
// OpenMP strategy: the outermost loop over [kernel_start_idx, kernel_end_idx) is handed
// to an "omp parallel for" with a static schedule.
//
// kernel_start_idx  first iteration index (inclusive)
// kernel_end_idx    one past the last iteration index (exclusive)
// n_threads         thread count requested through omp_set_num_threads, which also
//                   affects parallel regions started after this call
// loop_body         callable invoked as loop_body(i) for every index of the range
template <typename Func>
requires invocable_with_int<Func>
void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) {
    omp_set_num_threads(n_threads);

    #pragma omp parallel for schedule(static)
    for (int iteration = kernel_start_idx; iteration < kernel_end_idx; ++iteration) {
        loop_body(iteration);
    }
}
// Eventify strategy: the half-open range [kernel_start_idx, kernel_end_idx) is split
// into n_tasks contiguous chunks whose sizes differ by at most one; every non-empty
// chunk becomes an independent task that is submitted to an eventify::task_system.
//
// kernel_start_idx  first iteration index (inclusive)
// kernel_end_idx    one past the last iteration index (exclusive, the same convention
//                   openmp_strategy uses)
// n_tasks           number of chunks to split the range into; must be positive
// loop_body         callable invoked as loop_body(i) for each index; copied into every task
//
// Throws std::invalid_argument if n_tasks is not positive.
// NOTE(review): this relies on the task_system completing all submitted tasks before
// this function returns (presumably in its destructor) — confirm against Eventify.
template <typename Func>
requires invocable_with_int <Func>
void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) {
    if (n_tasks <= 0) {
        // guards the division below
        throw std::invalid_argument("Number of tasks must be positive");
    }

    const int tripcount = kernel_end_idx - kernel_start_idx;
    if (tripcount <= 0) {
        return; // empty range: nothing to schedule
    }

    auto task_system = eventify::task_system {};

    const int chunk_size = tripcount / n_tasks;
    const int remainder = tripcount % n_tasks;

    for (int tid = 0; tid < n_tasks; ++tid) {
        // The first `remainder` tasks take one extra iteration each, so the chunks
        // cover the whole range exactly once.
        const int extra_before = (tid < remainder) ? tid : remainder;
        const int start_idx = kernel_start_idx + tid * chunk_size + extra_before;
        const int end_idx = start_idx + chunk_size + ((tid < remainder) ? 1 : 0);
        if (start_idx == end_idx) {
            continue; // more tasks than iterations: skip empty chunks
        }

        auto task = [start_idx, end_idx, loop_body]{
            for (int i = start_idx; i < end_idx; ++i) {
                loop_body(i);
            }
        };
        task_system.submit(task);
    }
}
// Strategy selector: forwards the loop description to the strategy named by
// strategy_name ("omp" -> openmp_strategy, "eventify" -> eventify_strategy).
//
// Throws std::invalid_argument ("Unknown strategy: <name>") for any other name.
template <typename Func>
requires invocable_with_int<Func>
void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) {
    if (strategy_name == "omp") {
        openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
        return;
    }

    if (strategy_name == "eventify") {
        eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
        return;
    }

    throw std::invalid_argument("Unknown strategy: " + strategy_name);
}
}
#endif //STRATEGY_HPP

16
include/utils.hpp Normal file
View File

@ -0,0 +1,16 @@
#ifndef UTILS_HPP
#define UTILS_HPP
#include <random>
// Fills every element of v with a pseudo-random float produced by a
// std::uniform_real_distribution<float>(0.0f, 1.0f). The std::mt19937 generator is
// seeded from std::random_device on each call. An empty vector is left untouched.
//
// Declared inline because the definition lives in a header: a non-inline definition
// causes multiple-definition link errors as soon as two translation units include it.
inline void initialize_vector(std::vector<float>& v) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(0.0f, 1.0f);
    for (auto& elem : v) {
        elem = dis(gen);
    }
}
#endif //UTILS_HPP

87
src/kernels.cpp Normal file
View File

@ -0,0 +1,87 @@
#include <memory>
#include <stdexcept>
#include "kernels.hpp"
#include "strategy.hpp"
#include "utils.hpp"
Kernel::Kernel(const std::string& name, Kernel::StrategyFunction strategy_function, Kernel::PreparationFunction preparation_function)
: name_(name), strategy_function_(std::move(strategy_function)), preparation_function_(std::move(preparation_function)) {}
void Kernel::prepare() const {
preparation_function_();
}
// Runs the strategy callback over the index range [0, kernel_tripcount) with the
// requested number of threads or tasks.
void Kernel::execute(int num_threads_or_tasks, int kernel_tripcount) const {
    constexpr int first_index = 0;
    strategy_function_(first_index, kernel_tripcount, num_threads_or_tasks);
}
// Adds a builder under the given name. A name that is already present keeps its
// existing builder; the new one is dropped.
void KernelRegistry::register_kernel(const std::string& name, KernelBuilder factory) {
    registry_.try_emplace(name, std::move(factory));
}
// Looks up the builder registered under name, runs it and returns the new Kernel.
// Throws std::invalid_argument ("Kernel not found: <name>") when no builder is registered.
Kernel KernelRegistry::load_kernel(const std::string& name) const {
    if (const auto entry = registry_.find(name); entry != registry_.end()) {
        return entry->second();
    }
    throw std::invalid_argument("Kernel not found: " + name);
}
// Collects the names of all registered kernels, in the iteration order of the
// underlying unordered_map (unspecified).
std::vector<std::string> KernelRegistry::list_available_kernels() const {
    std::vector<std::string> names;
    names.reserve(registry_.size());
    for (const auto& [kernel_name, builder] : registry_) {
        names.emplace_back(kernel_name);
    }
    return names;
}
// New kernels go here, each can have its own set of arguments and initializations
// execute() contains the full kernel code minus an outer for loop (i=start, i<end, ++i),
// defined in the respective parallelization strategy
// Registers the builders of the built-in kernels ("stream_triad" and "daxpy") in *registry.
//
// registry       registry that receives the builders; dereferenced without a null check
// strategy_name  parallelization strategy name that every built kernel passes to
//                strategy::execute_strategy when it is executed
//
// Each builder allocates the kernel's vectors behind shared_ptrs so that the prepare
// and execute callbacks of one Kernel share the same data. prepare sizes the vectors
// to VECTOR_SIZE and fills the inputs with random values.
//
// The builders are stored in the registry and run after this function has returned,
// so they capture strategy_name by value: a by-reference capture would refer to the
// already destroyed parameter.
void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
    // STREAM TRIAD
    registry->register_kernel("stream_triad", [strategy_name]() {
        auto a = std::make_shared<std::vector<float>>();
        auto b = std::make_shared<std::vector<float>>();
        auto c = std::make_shared<std::vector<float>>();

        auto prepare = [=]() {
            a->resize(VECTOR_SIZE);
            b->resize(VECTOR_SIZE);
            c->resize(VECTOR_SIZE);
            initialize_vector(*b);
            initialize_vector(*c);
        };

        auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
            // The loop body refers to the shared_ptrs held by this closure, which
            // lives inside the Kernel for as long as the Kernel exists.
            strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
                (*a)[i] = (*b)[i] + 0.5f * (*c)[i];
            });
        };

        return Kernel("stream_triad", execute, prepare);
    });

    // DAXPY
    registry->register_kernel("daxpy", [strategy_name]() {
        auto a = std::make_shared<std::vector<float>>();
        auto b = std::make_shared<std::vector<float>>();

        auto prepare = [=]() {
            // a is only resized, so it starts out value-initialized (all zeros)
            a->resize(VECTOR_SIZE);
            b->resize(VECTOR_SIZE);
            initialize_vector(*b);
        };

        auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
            strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
                (*a)[i] += 0.5f * (*b)[i];
            });
        };

        return Kernel("daxpy", execute, prepare);
    });
}

51
src/main.cpp Normal file
View File

@ -0,0 +1,51 @@
#include <iostream>
#include <chrono>
#include "kernels.hpp"
// Entry point: benchmark <kernel_name> <strategy> <num_threads_or_tasks>
//
// Builds the kernel registry for the requested strategy, prepares the requested
// kernel, runs it once and prints the elapsed wall time in milliseconds.
// Returns 0 on success and 1 on a usage error, an invalid thread/task count, or a
// std::invalid_argument raised while loading or running the kernel.
int main(int argc, char** argv) {
    if (argc != 4) {
        std::cerr << "Usage: " << argv[0] << " <kernel_name> <strategy> <num_threads_or_tasks>\n";
        return 1;
    }

    const std::string kernel_name = argv[1];
    const std::string strategy_name = argv[2];
    const std::string count_arg = argv[3];

    // Parse the count strictly: the whole argument must be a positive number that
    // fits into an int. Anything else leaves the value at 0 and is rejected below.
    int num_threads_or_tasks = 0;
    try {
        std::string::size_type consumed = 0;
        const int parsed = std::stoi(count_arg, &consumed);
        if (consumed == count_arg.size()) {
            num_threads_or_tasks = parsed;
        }
    } catch (const std::logic_error&) {
        // std::invalid_argument (not a number) or std::out_of_range (too large for int)
    }
    if (num_threads_or_tasks <= 0) {
        std::cerr << "Invalid number of threads / tasks: " << count_arg << " (expected a positive integer)\n";
        return 1;
    }

    // registry contains a map of kernels generated from kernel builders for the selected parallelization strategy
    KernelRegistry registry;
    initialize_registry(&registry, strategy_name);

    try {
        // find kernel in unordered_map by its name. prepare() allocates and initializes data structures needed for the selected kernel
        Kernel kernel = registry.load_kernel(kernel_name);
        kernel.prepare();

        // Time the kernel execution. steady_clock is monotonic, so the measured
        // interval cannot be distorted by adjustments of the system clock.
        const auto start_time = std::chrono::steady_clock::now();
        // VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM
        kernel.execute(num_threads_or_tasks, VECTOR_SIZE);
        const auto end_time = std::chrono::steady_clock::now();

        const std::chrono::duration<double, std::milli> duration = end_time - start_time;
        std::cout << "Kernel: " << kernel_name << "\n";
        std::cout << "Parallelization strategy: " << strategy_name << "\n";
        std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n";
        std::cout << "Kernel execution time [ms]: " << duration.count() << "\n";
    } catch (const std::invalid_argument& e) {
        // Raised for an unknown kernel name (load_kernel) or an unknown strategy
        // name (execute); print the reason followed by the registered kernels.
        std::cerr << e.what() << "\n";
        std::cerr << "Available kernels are:\n";
        // List available kernels from registry
        for (const auto& available : registry.list_available_kernels()) {
            std::cerr << " - " << available << "\n";
        }
        return 1;
    }

    return 0;
}