Initial commit
This commit is contained in:
parent
c1b0fee200
commit
d53f0d883c
|
@ -32,3 +32,5 @@
|
||||||
*.out
|
*.out
|
||||||
*.app
|
*.app
|
||||||
|
|
||||||
|
bin/*
|
||||||
|
obj/*
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
# Build configuration for the pkbf benchmark.
# Every `?=` variable can be overridden from the environment or the command
# line, e.g. `make CXX=clang++ VECTOR_SIZE=1048576`.
CXX ?= g++
OPTFLAGS ?= -O3 -march=native

# Number of elements per benchmark vector, compiled in via -DVECTOR_SIZE.
VECTOR_SIZE ?= 268435456
CPPFLAGS = -DVECTOR_SIZE=$(VECTOR_SIZE)
INCLUDES = -I./include -I$(EVENTIFY_ROOT)/include

CXXFLAGS = $(CPPFLAGS) -std=c++20 -fopenmp $(INCLUDES) $(OPTFLAGS)
# Emit a .d file next to every object so that header edits trigger a rebuild.
DEPFLAGS = -MMD -MP
# Linker options and search paths go in LDFLAGS, libraries in LDLIBS:
# libraries have to follow the objects that reference them on the link line.
LDFLAGS = -fopenmp -L$(EVENTIFY_ROOT)/lib
LDLIBS = -leventify

SRC_DIR = src
INCLUDE_DIR = include
OBJ_DIR = obj
BIN_DIR = bin

SRCS = $(wildcard $(SRC_DIR)/*.cpp)

OBJS = $(SRCS:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.o)
DEPS = $(OBJS:.o=.d)

TARGET = $(BIN_DIR)/benchmark

# Default rule to build the program
all: $(TARGET)

$(TARGET): $(OBJS)
	@mkdir -p $(BIN_DIR)
	$(CXX) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@

$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp
	@mkdir -p $(OBJ_DIR)
	$(CXX) $(CXXFLAGS) $(DEPFLAGS) -c $< -o $@

clean:
	rm -rf $(BIN_DIR) $(OBJ_DIR)

# Pull in the generated header dependencies; silently skipped on a fresh tree.
-include $(DEPS)

.PHONY: all clean
|
105
README.md
105
README.md
|
@ -1,2 +1,105 @@
|
||||||
# pkbf
|
pkbf - Parallel Kernel Benchmarking Framework
|
||||||
|
|
||||||
|
This project provides a benchmarking framework for parallel computing kernels. Kernel execution can be parallelized with either OpenMP or Eventify, so that the two approaches can be compared for the FlexFMM collaborative project.
|
||||||
|
The application is designed to make adding kernels and parallelization strategies as easy as possible.
|
||||||
|
|
||||||
|
## Features
|
||||||
|
|
||||||
|
- **Kernel Registry**: A registry that allows the user to register and execute different computational kernels easily.
|
||||||
|
- **Parallelization Strategies**: Two strategies for parallelizing the execution of kernel loops:
|
||||||
|
- **OpenMP**: Uses OpenMP directives to parallelize the outermost loop.
|
||||||
|
- **Eventify**: Uses the Eventify tasking system for parallelism.
|
||||||
|
- **Kernel Execution**: Kernels such as **STREAM TRIAD** and **DAXPY** are implemented, and their execution can be timed and compared across different parallelization strategies.
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
.
|
||||||
|
├── bin/ # Compiled executable
|
||||||
|
├── include/ # Header files
|
||||||
|
│ ├── kernels.hpp # Kernel and KernelRegistry declarations
|
||||||
|
│ ├── strategy.hpp # Parallelization strategies (OpenMP, Eventify)
|
||||||
|
│ └── utils.hpp # Utility functions for initialization
|
||||||
|
├── src/ # Source files
|
||||||
|
│ ├── kernels.cpp # Kernel and KernelRegistry implementations
|
||||||
|
│ ├── strategy.cpp # Parallelization strategies (OpenMP, Eventify)
|
||||||
|
│ └── main.cpp # Main entry point for benchmarking
|
||||||
|
├── Makefile # Makefile to build the project
|
||||||
|
└── README.md # Project documentation
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- C++20 or higher
|
||||||
|
- OpenMP support (for OpenMP parallelization strategy)
|
||||||
|
- Eventify library (for Eventify parallelization strategy)
|
||||||
|
|
||||||
|
### Dependencies:
|
||||||
|
|
||||||
|
- **Eventify**: Ensure that the Eventify library is properly installed and the environment variable `EVENTIFY_ROOT` points to the root directory of the Eventify installation.
|
||||||
|
|
||||||
|
## Building the Project
|
||||||
|
|
||||||
|
To build the project, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
make
|
||||||
|
```
|
||||||
|
|
||||||
|
This will compile the source files and generate an executable called `benchmark` in the `bin/` directory.
|
||||||
|
|
||||||
|
### Clean Up
|
||||||
|
|
||||||
|
To remove all compiled files and the executable, run:
|
||||||
|
|
||||||
|
```
|
||||||
|
make clean
|
||||||
|
```
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
### Running the Benchmark
|
||||||
|
|
||||||
|
To run a kernel benchmark, use the following command:
|
||||||
|
|
||||||
|
```
|
||||||
|
./bin/benchmark <kernel_name> <strategy> <num_threads_or_tasks>
|
||||||
|
```
|
||||||
|
|
||||||
|
- `<kernel_name>`: The name of the kernel to run. Example: `stream_triad`
|
||||||
|
- `<strategy>`: The parallelization strategy to use. Available options: `omp` (for OpenMP) and `eventify` (for Eventify).
|
||||||
|
- `<num_threads_or_tasks>`: The number of threads or tasks to use for parallel execution. This depends on the parallelization strategy (e.g., number of threads for OpenMP, number of tasks for Eventify).
|
||||||
|
|
||||||
|
### Example:
|
||||||
|
|
||||||
|
To run the `stream_triad` kernel with the OpenMP strategy using 4 threads:
|
||||||
|
|
||||||
|
```
|
||||||
|
./bin/benchmark stream_triad omp 4
|
||||||
|
```
|
||||||
|
|
||||||
|
To run the `daxpy` kernel with the Eventify strategy using 8 tasks:
|
||||||
|
|
||||||
|
```
|
||||||
|
./bin/benchmark daxpy eventify 8
|
||||||
|
```
|
||||||
|
|
||||||
|
### Error Handling
|
||||||
|
|
||||||
|
- If an invalid kernel name is provided, the program will print an error message and list available kernels.
|
||||||
|
|
||||||
|
Example of an invalid kernel name:
|
||||||
|
|
||||||
|
```
|
||||||
|
$ ./bin/benchmark invalid_kernel omp 4
|
||||||
|
Kernel not found: invalid_kernel
|
||||||
|
Available kernels are:
|
||||||
|
- stream_triad
|
||||||
|
- daxpy
|
||||||
|
```
|
||||||
|
|
||||||
|
## Contributing
|
||||||
|
|
||||||
|
Feel free to submit issues or pull requests to improve the project.
|
||||||
|
|
||||||
|
## License
|
||||||
|
|
||||||
|
This project is licensed under the MIT License.
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
#ifndef KERNELS_HPP
|
||||||
|
#define KERNELS_HPP
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <functional>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
|
||||||
|
/// A named benchmark kernel consisting of a preparation step and an execution step.
class Kernel {
public:
    /// Callable run by execute(); it receives (start index, end index, number of threads or tasks).
    using StrategyFunction = std::function<void(int, int, int)>;
    /// Callable run by prepare(); it takes no arguments.
    using PreparationFunction = std::function<void()>;

    /// Stores the kernel name together with its execution and preparation callables.
    Kernel(const std::string& name,
           StrategyFunction strategy_function,
           PreparationFunction preparation_function);

    /// Runs the stored preparation function.
    void prepare() const;

    /// Runs the stored strategy function with the given thread/task count and trip count.
    void execute(int n_threads_or_tasks, int kernel_tripcount) const;

private:
    std::string name_;
    StrategyFunction strategy_function_;
    PreparationFunction preparation_function_;
};
|
||||||
|
|
||||||
|
class KernelRegistry {
|
||||||
|
public:
|
||||||
|
using KernelBuilder = std::function<Kernel()>;
|
||||||
|
|
||||||
|
void register_kernel(const std::string& name, KernelBuilder factory);
|
||||||
|
Kernel load_kernel(const std::string& name) const;
|
||||||
|
std::vector<std::string> list_available_kernels() const;
|
||||||
|
|
||||||
|
private:
|
||||||
|
// FIXME: no benchmarking of maps done. The registry is expected to stay small, though
|
||||||
|
std::unordered_map<std::string, KernelBuilder> registry_;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Registers the built-in kernels with `registry`; every kernel built from it
/// runs with the parallelization strategy named by `strategy_name`.
void initialize_registry(KernelRegistry* registry, std::string strategy_name);
|
||||||
|
|
||||||
|
#endif // KERNELS_HPP
|
||||||
|
|
|
@ -0,0 +1,72 @@
|
||||||
|
#ifndef STRATEGY_HPP
|
||||||
|
#define STRATEGY_HPP
|
||||||
|
|
||||||
|
#include <omp.h>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
#include <eventify/task_system.hxx>
|
||||||
|
|
||||||
|
// Parallelization strategies are defined here. Assumption for now: there is always an outer loop that can be parallelized.
|
||||||
|
// The strategies are templates instantiated when adding kernels to the kernel registry.
|
||||||
|
// Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp
|
||||||
|
|
||||||
|
namespace strategy {
|
||||||
|
|
||||||
|
// define concept to ensure that the loop bodies defined in kernels.cpp represent one invocable iteration of a parallel loop
|
||||||
|
template <typename Func>
|
||||||
|
concept invocable_with_int = requires(Func&& f, int i) {
|
||||||
|
{ std::forward<Func>(f)(i) }; // Checks if calling f(i) is valid
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
// for OpenMP, we just use the for pragma for the outermost loop
|
||||||
|
template <typename Func>
|
||||||
|
requires invocable_with_int <Func>
|
||||||
|
void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) {
|
||||||
|
omp_set_num_threads(static_cast<int>(n_threads));
|
||||||
|
|
||||||
|
#pragma omp parallel for schedule(static)
|
||||||
|
for (int i = kernel_start_idx; i < kernel_end_idx; ++i) {
|
||||||
|
loop_body(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// for eventify, we calculate indices for evenly divided chunks of the outermost loop,
|
||||||
|
// create independent tasks and submit them to the tasking system
|
||||||
|
template <typename Func>
|
||||||
|
requires invocable_with_int <Func>
|
||||||
|
void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) {
|
||||||
|
auto task_system = eventify::task_system {};
|
||||||
|
int tripcount = kernel_end_idx - kernel_start_idx + 1;
|
||||||
|
int chunk_size = tripcount / n_tasks;
|
||||||
|
int remainder = tripcount % n_tasks;
|
||||||
|
|
||||||
|
for (int tid = 0; tid < n_tasks; ++tid) {
|
||||||
|
auto task = [tid, tripcount, chunk_size, remainder, loop_body]{
|
||||||
|
int start_idx = tid * chunk_size;
|
||||||
|
int end_idx = start_idx + chunk_size - 1;
|
||||||
|
if (tripcount - end_idx == remainder) end_idx += remainder;
|
||||||
|
|
||||||
|
for (int i = start_idx; i < end_idx; ++i) {
|
||||||
|
loop_body(i);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
task_system.submit(task);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// parallelization strategy selector
|
||||||
|
template <typename Func>
|
||||||
|
requires invocable_with_int<Func>
|
||||||
|
void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) {
|
||||||
|
if (strategy_name == "omp") {
|
||||||
|
openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
|
||||||
|
} else if (strategy_name == "eventify") {
|
||||||
|
eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
|
||||||
|
} else {
|
||||||
|
throw std::invalid_argument("Unknown strategy: " + strategy_name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
#endif //STRATEGY_HPP
|
|
@ -0,0 +1,16 @@
|
||||||
|
#ifndef UTILS_HPP
|
||||||
|
#define UTILS_HPP
|
||||||
|
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
// Fills every element of `v` with a random float drawn from a uniform distribution
// over [0, 1). The size of `v` is not changed; an empty vector is left untouched.
// The generator is seeded from std::random_device, so results differ between runs.
//
// Declared `inline` because the definition lives in a header: without it, including
// this header from more than one translation unit produces duplicate definitions.
// NOTE(review): this header uses std::vector but only includes <random>.
inline void initialize_vector(std::vector<float>& v) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(0.0f, 1.0f);
    for (auto& elem : v) {
        elem = dis(gen);
    }
}
|
||||||
|
|
||||||
|
#endif //UTILS_HPP
|
|
@ -0,0 +1,87 @@
|
||||||
|
#include <memory>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include "kernels.hpp"
|
||||||
|
#include "strategy.hpp"
|
||||||
|
#include "utils.hpp"
|
||||||
|
|
||||||
|
Kernel::Kernel(const std::string& name, Kernel::StrategyFunction strategy_function, Kernel::PreparationFunction preparation_function)
|
||||||
|
: name_(name), strategy_function_(std::move(strategy_function)), preparation_function_(std::move(preparation_function)) {}
|
||||||
|
|
||||||
|
void Kernel::prepare() const {
|
||||||
|
preparation_function_();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Runs the strategy callable with start index 0, end index kernel_tripcount
// and the requested number of threads or tasks.
void Kernel::execute(int num_threads_or_tasks, int kernel_tripcount) const {
    constexpr int kernel_start_idx = 0;
    strategy_function_(kernel_start_idx, kernel_tripcount, num_threads_or_tasks);
}
|
||||||
|
|
||||||
|
void KernelRegistry::register_kernel(const std::string& name, KernelBuilder factory) {
|
||||||
|
registry_.emplace(name, std::move(factory));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Looks up the builder registered under `name` and returns the kernel it creates.
// Throws std::invalid_argument when no such kernel has been registered.
Kernel KernelRegistry::load_kernel(const std::string& name) const {
    const auto entry = registry_.find(name);
    if (entry != registry_.end()) {
        return entry->second();
    }
    throw std::invalid_argument("Kernel not found: " + name);
}
|
||||||
|
|
||||||
|
std::vector<std::string> KernelRegistry::list_available_kernels() const {
|
||||||
|
std::vector<std::string> kernel_names;
|
||||||
|
for (const auto& entry : registry_) {
|
||||||
|
kernel_names.push_back(entry.first);
|
||||||
|
}
|
||||||
|
return kernel_names;
|
||||||
|
}
|
||||||
|
|
||||||
|
// New kernels go here, each can have it's own set of arguments and initializations
|
||||||
|
// execute() contains the full kernel code minus an outer for loop (i=start, i<end, ++i),
|
||||||
|
// defined in the respective parallelization strategy
|
||||||
|
void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
|
||||||
|
|
||||||
|
// STREAM TRIAD
|
||||||
|
registry->register_kernel("stream_triad", [&]() {
|
||||||
|
auto a = std::make_shared<std::vector<float>>();
|
||||||
|
auto b = std::make_shared<std::vector<float>>();
|
||||||
|
auto c = std::make_shared<std::vector<float>>();
|
||||||
|
|
||||||
|
auto prepare = [=]() {
|
||||||
|
a->resize(VECTOR_SIZE);
|
||||||
|
b->resize(VECTOR_SIZE);
|
||||||
|
c->resize(VECTOR_SIZE);
|
||||||
|
initialize_vector(*b);
|
||||||
|
initialize_vector(*c);
|
||||||
|
};
|
||||||
|
|
||||||
|
auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
|
||||||
|
strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
|
||||||
|
(*a)[i] = (*b)[i] + 0.5f * (*c)[i];
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
return Kernel("stream_triad", execute, prepare);
|
||||||
|
});
|
||||||
|
|
||||||
|
// DAXPY
|
||||||
|
registry->register_kernel("daxpy", [&]() {
|
||||||
|
auto a = std::make_shared<std::vector<float>>();
|
||||||
|
auto b = std::make_shared<std::vector<float>>();
|
||||||
|
|
||||||
|
auto prepare = [=]() {
|
||||||
|
a->resize(VECTOR_SIZE);
|
||||||
|
b->resize(VECTOR_SIZE);
|
||||||
|
initialize_vector(*b);
|
||||||
|
};
|
||||||
|
|
||||||
|
auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
|
||||||
|
strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
|
||||||
|
(*a)[i] += 0.5f * (*b)[i];
|
||||||
|
});
|
||||||
|
};
|
||||||
|
|
||||||
|
return Kernel("daxpy", execute, prepare);
|
||||||
|
});
|
||||||
|
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
#include <chrono>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <string>

#include "kernels.hpp"
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
|
||||||
|
if (argc != 4) {
|
||||||
|
std::cerr << "Usage: " << argv[0] << " <kernel_name> <strategy> <num_threads_or_tasks>\n";
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string kernel_name = argv[1];
|
||||||
|
std::string strategy_name = argv[2];
|
||||||
|
int num_threads_or_tasks = std::stoul(argv[3]);
|
||||||
|
|
||||||
|
// registry contains a map of kernels generated from kernel builders for the selected parallelization strategy
|
||||||
|
KernelRegistry registry;
|
||||||
|
initialize_registry(®istry, strategy_name);
|
||||||
|
|
||||||
|
try{
|
||||||
|
// find kernel in unordered_map by it's name. prepare() allocates and initializes data structures needed for the selected kernel
|
||||||
|
Kernel kernel = registry.load_kernel(kernel_name);
|
||||||
|
kernel.prepare();
|
||||||
|
|
||||||
|
// Time the kernel execution
|
||||||
|
auto start_time = std::chrono::high_resolution_clock::now();
|
||||||
|
|
||||||
|
// VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM
|
||||||
|
kernel.execute(num_threads_or_tasks, VECTOR_SIZE);
|
||||||
|
|
||||||
|
auto end_time = std::chrono::high_resolution_clock::now();
|
||||||
|
std::chrono::duration<double, std::milli> duration = end_time - start_time;
|
||||||
|
|
||||||
|
std::cout << "Kernel: " << kernel_name << "\n";
|
||||||
|
std::cout << "Parallelization strategy: " << strategy_name << "\n";
|
||||||
|
std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n";
|
||||||
|
std::cout << "Kernel execution time [ms]: " << duration.count() << "\n";
|
||||||
|
} catch (const std::invalid_argument& e) {
|
||||||
|
// If kernel name is invalid, list available kernels
|
||||||
|
std::cerr << e.what() << "\n";
|
||||||
|
std::cerr << "Available kernels are:\n";
|
||||||
|
|
||||||
|
// List available kernels from registry
|
||||||
|
for (const auto& kernel_name : registry.list_available_kernels()) {
|
||||||
|
std::cerr << " - " << kernel_name << "\n";
|
||||||
|
}
|
||||||
|
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
Loading…
Reference in New Issue