Initial commit

2024-12-13 00:33:08 +01:00 · 2024-12-13 00:33:08 +01:00 · d53f0d883c
parent c1b0fee200
commit d53f0d883c
8 changed files with 409 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -32,3 +32,5 @@
 *.out
 *.app

+bin/*
+obj/*
--- a/36
+++ b/36
@ -0,0 +1,36 @@
+# Toolchain and optimization level; both can be overridden from the environment or command line.
+CXX ?= g++
+OPTFLAGS ?= -O3 -march=native
+
+# Number of vector elements; handed to the sources as the VECTOR_SIZE preprocessor macro.
+VECTOR_SIZE ?= 268435456
+CPPFLAGS = -DVECTOR_SIZE=$(VECTOR_SIZE)
+INCLUDES = -I./include -I$(EVENTIFY_ROOT)/include
+
+# -MMD -MP write a .d file next to every object so that header edits trigger rebuilds.
+DEPFLAGS = -MMD -MP
+
+CXXFLAGS = $(CPPFLAGS) -std=c++20 -fopenmp $(INCLUDES) $(OPTFLAGS)
+# Search paths live in LDFLAGS, libraries in LDLIBS: libraries have to follow the
+# objects on the link line, otherwise order-sensitive linkers cannot resolve their symbols.
+LDFLAGS = -fopenmp -L$(EVENTIFY_ROOT)/lib
+LDLIBS = -leventify
+
+SRC_DIR = src
+INCLUDE_DIR = include
+OBJ_DIR = obj
+BIN_DIR = bin
+
+SRCS = $(wildcard $(SRC_DIR)/*.cpp)
+
+OBJS = $(SRCS:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.o)
+DEPS = $(OBJS:.o=.d)
+
+TARGET = $(BIN_DIR)/benchmark
+
+# Default rule to build the program
+all: $(TARGET)
+
+$(TARGET): $(OBJS)
+	@mkdir -p $(BIN_DIR)
+	$(CXX) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@
+
+$(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp
+	@mkdir -p $(OBJ_DIR)
+	$(CXX) $(CXXFLAGS) $(DEPFLAGS) -c $< -o $@
+
+clean:
+	rm -rf $(BIN_DIR) $(OBJ_DIR)
+
+# Pull in the generated header dependencies; silently skipped before the first build.
+-include $(DEPS)
+
+.PHONY: all clean
--- a/README.md
+++ b/README.md
@ -1,2 +1,105 @@
-# pkbf
+# pkbf - Parallel Kernel Benchmarking Framework

+This project provides a benchmarking framework for parallel computing kernels. Each kernel can be parallelized with either OpenMP or Eventify, so that the two approaches can be compared for the FlexFMM collaborative project.
+The application is designed to make adding kernels and parallelization strategies as easy as possible.
+
+## Features
+
+- **Kernel Registry**: A registry that allows the user to register and execute different computational kernels easily.
+- **Parallelization Strategies**: Two strategies for parallelizing the execution of kernel loops:
+  - **OpenMP**: Uses OpenMP directives to parallelize the outermost loop.
+  - **Eventify**: Uses the Eventify tasking system for parallelism.
+- **Kernel Execution**: Kernels such as **STREAM TRIAD** and **DAXPY** are implemented, and their execution can be timed and compared across different parallelization strategies.
+
+## Project Structure
+
+```
+.
+├── bin/              # Compiled executable
+├── include/          # Header files
+│   ├── kernels.hpp   # Kernel and KernelRegistry declarations
+│   ├── strategy.hpp  # Parallelization strategies (OpenMP, Eventify)
+│   └── utils.hpp     # Utility functions for initialization
+├── src/              # Source files
+│   ├── kernels.cpp   # Kernel and KernelRegistry implementations
+│   ├── strategy.cpp  # Parallelization strategies (OpenMP, Eventify)
+│   └── main.cpp      # Main entry point for benchmarking
+├── Makefile          # Makefile to build the project
+└── README.md         # Project documentation
+```
+
+## Requirements
+
+- C++20 or higher
+- OpenMP support (for OpenMP parallelization strategy)
+- Eventify library (for Eventify parallelization strategy)
+
+### Dependencies:
+
+- **Eventify**: Ensure that the Eventify library is properly installed and the environment variable `EVENTIFY_ROOT` points to the root directory of the Eventify installation.
+
+## Building the Project
+
+To build the project, run:
+
+```
+make
+```
+
+This will compile the source files and generate an executable called `benchmark` in the `bin/` directory.
+
+### Clean Up
+
+To remove all compiled files and the executable, run:
+
+```
+make clean
+```
+
+## Usage
+
+### Running the Benchmark
+
+To run a kernel benchmark, use the following command:
+
+```
+./bin/benchmark <kernel_name> <strategy> <num_threads_or_tasks>
+```
+
+- `<kernel_name>`: The name of the kernel to run. Example: `stream_triad`
+- `<strategy>`: The parallelization strategy to use. Available options: `omp` (for OpenMP) and `eventify` (for Eventify).
+- `<num_threads_or_tasks>`: The number of threads or tasks to use for parallel execution. This depends on the parallelization strategy (e.g., number of threads for OpenMP, number of tasks for Eventify).
+
+### Example:
+
+To run the `stream_triad` kernel with the OpenMP strategy using 4 threads:
+
+```
+./bin/benchmark stream_triad omp 4
+```
+
+To run the `daxpy` kernel with the Eventify strategy using 8 tasks:
+
+```
+./bin/benchmark daxpy eventify 8
+```
+
+### Error Handling
+
+- If an invalid kernel name is provided, the program will print an error message and list available kernels.
+
+Example of an invalid kernel name:
+
+```
+$ ./bin/benchmark invalid_kernel omp 4
+Kernel not found: invalid_kernel
+Available kernels are:
+  - stream_triad
+  - daxpy
+```
+
+## Contributing
+
+Feel free to submit issues or pull requests to improve the project.
+
+## License
+
+This project is licensed under the MIT License.
--- a/include/kernels.hpp
+++ b/include/kernels.hpp
@ -0,0 +1,41 @@
+#ifndef KERNELS_HPP
+#define KERNELS_HPP
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+
+// A named benchmark kernel. It pairs a strategy function, which runs the kernel's loop,
+// with a preparation function, which is called beforehand via prepare().
+class Kernel {
+public:
+  // Called by execute() as (start index, end index, number of threads or tasks).
+  using StrategyFunction = std::function<void(int, int, int)>;
+  // Called by prepare(); takes no arguments and returns nothing.
+  using PreparationFunction = std::function<void()>;
+
+  // Stores a copy of the name and takes over both callables.
+  Kernel(const std::string& name, StrategyFunction strategy_function, PreparationFunction preparation_function);
+
+  // Invokes the preparation function.
+  void prepare() const;
+  // Invokes the strategy function with start index 0, end index kernel_tripcount
+  // and the given number of threads or tasks.
+  void execute(int n_threads_or_tasks, int kernel_tripcount) const;
+
+private:
+  std::string name_;
+  StrategyFunction strategy_function_;
+  PreparationFunction preparation_function_;
+};
+
+// Maps kernel names to builders that create the corresponding Kernel on demand.
+class KernelRegistry {
+public:
+  // A builder takes no arguments and returns a ready-to-use Kernel.
+  using KernelBuilder = std::function<Kernel()>;
+
+  // Stores the builder under the given name.
+  void register_kernel(const std::string& name, KernelBuilder factory);
+  // Invokes the builder registered under name and returns its Kernel;
+  // throws std::invalid_argument if no such name is registered.
+  Kernel load_kernel(const std::string& name) const;
+  // Returns the names of all registered kernels, in the map's iteration order.
+  std::vector<std::string> list_available_kernels() const;
+
+private:
+  // FIXME: no benchmarking of maps done. The registry is expected to stay small, though
+  std::unordered_map<std::string, KernelBuilder> registry_;
+};
+
+// Registers the built-in kernels (defined in kernels.cpp) in the given registry;
+// strategy_name selects the parallelization strategy the registered kernels will run with.
+void initialize_registry(KernelRegistry* registry, std::string strategy_name);
+
+#endif // KERNELS_HPP
+
--- a/include/strategy.hpp
+++ b/include/strategy.hpp
@ -0,0 +1,72 @@
+#ifndef STRATEGY_HPP
+#define STRATEGY_HPP
+
+#include <omp.h>
+#include <stdexcept>
+#include <string>
+#include <eventify/task_system.hxx>
+
+// Parallelization strategies are defined here. Assumption for now: there is always an outer loop that can be parallelized.
+// The strategies are templates instantiated when adding kernels to the kernel registry.
+// Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp
+
+namespace strategy {
+
+  // A callable models invocable_with_int when it can be called with a single int, i.e. when it
+  // can serve as one invocable iteration of a parallel loop (the loop bodies in kernels.cpp).
+  template <typename Func>
+  concept invocable_with_int = requires(Func&& callable, int index) {
+      std::forward<Func>(callable)(index);  // the call expression itself must be well-formed
+  };
+  
+  
+  // OpenMP strategy: set the requested number of threads, then hand the outermost loop
+  // from kernel_start_idx (inclusive) to kernel_end_idx (exclusive) to a statically
+  // scheduled `parallel for`.
+  template <typename Func>
+  requires invocable_with_int<Func>
+  void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) {
+      // n_threads already is an int, so it is passed through as-is
+      omp_set_num_threads(n_threads);
+
+      #pragma omp parallel for schedule(static)
+      for (int idx = kernel_start_idx; idx < kernel_end_idx; ++idx) {
+          loop_body(idx);
+      }
+  }
+  
+  // Eventify strategy: split the half-open iteration range [kernel_start_idx, kernel_end_idx)
+  // into n_tasks contiguous chunks and submit one independent task per non-empty chunk.
+  // Chunk sizes differ by at most one: the first (tripcount % n_tasks) chunks take one extra
+  // iteration, so every index of the range is executed exactly once.
+  //
+  // Throws std::invalid_argument if n_tasks is not positive.
+  // NOTE(review): as before, this relies on eventify::task_system completing all submitted
+  // tasks before it is destroyed at the end of this function - confirm against Eventify.
+  template <typename Func>
+  requires invocable_with_int <Func>
+  void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) {
+    if (n_tasks <= 0) {
+      throw std::invalid_argument("Number of tasks must be positive, got " + std::to_string(n_tasks));
+    }
+
+    // the end index is exclusive, matching the `i < kernel_end_idx` loop of the OpenMP strategy
+    const int tripcount = kernel_end_idx - kernel_start_idx;
+    if (tripcount <= 0) return;
+
+    auto task_system = eventify::task_system {};
+    const int chunk_size = tripcount / n_tasks;
+    const int remainder = tripcount % n_tasks;
+
+    for (int tid = 0; tid < n_tasks; ++tid) {
+      // tasks [0, remainder) carry one extra iteration, which shifts all later chunks
+      const int start_idx = kernel_start_idx + tid * chunk_size + (tid < remainder ? tid : remainder);
+      const int end_idx = start_idx + chunk_size + (tid < remainder ? 1 : 0);
+      if (start_idx == end_idx) continue;  // more tasks than iterations: nothing left to do
+
+      // bounds and loop body are captured by value, so the task does not refer to this stack frame
+      auto task = [start_idx, end_idx, loop_body]{
+        for (int i = start_idx; i < end_idx; ++i) {
+          loop_body(i);
+        }
+      };
+      task_system.submit(task);
+    }
+  }
+  
+  // Strategy selector: dispatches to the strategy named by strategy_name ("omp" or "eventify"),
+  // forwarding the index range, the thread/task count and the loop body unchanged.
+  // Any other name raises std::invalid_argument.
+  template <typename Func>
+  requires invocable_with_int<Func>
+  void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) {
+      if (strategy_name == "omp") {
+          openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
+          return;
+      }
+      if (strategy_name == "eventify") {
+          eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
+          return;
+      }
+      throw std::invalid_argument("Unknown strategy: " + strategy_name);
+  }
+
+}
+#endif //STRATEGY_HPP
--- a/include/utils.hpp
+++ b/include/utils.hpp
@ -0,0 +1,16 @@
+#ifndef UTILS_HPP
+#define UTILS_HPP
+
+#include <random>
+#include <vector>
+
+// Overwrites every element of v with a random float drawn from a uniform distribution
+// over [0, 1). The generator is seeded from std::random_device on each call.
+// Declared inline because it is defined in a header: without it, including this header
+// from more than one translation unit produces multiple definitions at link time.
+inline void initialize_vector(std::vector<float>& v) {
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<float> dis(0.0f, 1.0f);
+    for (auto& elem : v) {
+        elem = dis(gen);
+    }
+}
+
+#endif //UTILS_HPP
--- a/src/kernels.cpp
+++ b/src/kernels.cpp
@ -0,0 +1,87 @@
+#include <memory>
+#include <stdexcept>
+#include "kernels.hpp"
+#include "strategy.hpp"
+#include "utils.hpp"
+
+// Builds a kernel from its name, the function that runs it and the function that prepares it.
+// The name is copied; both callables are moved into the kernel.
+Kernel::Kernel(const std::string& name,
+               Kernel::StrategyFunction strategy_function,
+               Kernel::PreparationFunction preparation_function)
+  : name_{name},
+    strategy_function_{std::move(strategy_function)},
+    preparation_function_{std::move(preparation_function)} {}
+
+// Runs the preparation function this kernel was constructed with.
+void Kernel::prepare() const {
+  const auto& setup = preparation_function_;
+  setup();
+}
+
+// Runs the strategy function with start index 0, end index kernel_tripcount and the
+// requested number of threads or tasks.
+void Kernel::execute(int num_threads_or_tasks, int kernel_tripcount) const {
+  constexpr int first_index = 0;
+  const int end_index = kernel_tripcount;
+  strategy_function_(first_index, end_index, num_threads_or_tasks);
+}
+
+// Stores the builder under the given name. Insertion uses emplace, so a name that is
+// already registered keeps its existing builder.
+void KernelRegistry::register_kernel(const std::string& name, KernelBuilder factory) {
+  registry_.emplace(
+    name,
+    std::move(factory));
+}
+
+// Looks up the builder registered under name and returns the kernel it creates.
+// Throws std::invalid_argument when no kernel of that name is registered.
+Kernel KernelRegistry::load_kernel(const std::string& name) const {
+  if (const auto entry = registry_.find(name); entry != registry_.end()) {
+    return entry->second();
+  }
+  throw std::invalid_argument("Kernel not found: " + name);
+}
+
+// Collects the names of all registered kernels, in the iteration order of the underlying map.
+std::vector<std::string> KernelRegistry::list_available_kernels() const {
+  std::vector<std::string> names;
+  for (const auto& [registered_name, builder] : registry_) {
+    names.push_back(registered_name);
+  }
+  return names;
+}
+
+// New kernels go here, each can have its own set of arguments and initializations.
+// execute() contains the full kernel code minus an outer for loop (i=start, i<end, ++i),
+// defined in the respective parallelization strategy.
+//
+// The builders are stored in the registry and only invoked after this function has returned,
+// so they capture strategy_name by value; a by-reference capture of the parameter would dangle.
+void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
+
+  // STREAM TRIAD: a[i] = b[i] + 0.5 * c[i]
+  registry->register_kernel("stream_triad", [strategy_name]() {
+    // shared ownership keeps the buffers alive for as long as prepare or execute exist
+    auto a = std::make_shared<std::vector<float>>();
+    auto b = std::make_shared<std::vector<float>>();
+    auto c = std::make_shared<std::vector<float>>();
+
+    auto prepare = [=]() {
+      a->resize(VECTOR_SIZE);
+      b->resize(VECTOR_SIZE);
+      c->resize(VECTOR_SIZE);
+      initialize_vector(*b);
+      initialize_vector(*c);
+    };
+
+    auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
+      // resolve the buffers once per run; the loop body copies these raw pointers, so it holds
+      // no references into this closure and skips the shared_ptr indirection per iteration
+      float* const a_data = a->data();
+      const float* const b_data = b->data();
+      const float* const c_data = c->data();
+      strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [=](int i) {
+        a_data[i] = b_data[i] + 0.5f * c_data[i];
+      });
+    };
+
+    return Kernel("stream_triad", std::move(execute), std::move(prepare));
+  });
+
+  // DAXPY (on floats): a[i] += 0.5 * b[i]
+  registry->register_kernel("daxpy", [strategy_name]() {
+    auto a = std::make_shared<std::vector<float>>();
+    auto b = std::make_shared<std::vector<float>>();
+
+    auto prepare = [=]() {
+      a->resize(VECTOR_SIZE);  // new elements are value-initialized, i.e. start at 0
+      b->resize(VECTOR_SIZE);
+      initialize_vector(*b);
+    };
+
+    auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
+      // same scheme as above: the loop body owns copies of the raw buffer pointers
+      float* const a_data = a->data();
+      const float* const b_data = b->data();
+      strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [=](int i) {
+        a_data[i] += 0.5f * b_data[i];
+      });
+    };
+
+    return Kernel("daxpy", std::move(execute), std::move(prepare));
+  });
+
+}
+
--- a/src/main.cpp
+++ b/src/main.cpp
@ -0,0 +1,51 @@
+#include <iostream>
+#include <chrono>
+#include "kernels.hpp"
+
+namespace {
+
+// Prepares the kernel, times a single execution and prints the result.
+// Returns the process exit code: 0 on success, 1 if the execution rejects its
+// arguments (e.g. an unknown parallelization strategy).
+int run_benchmark(const Kernel& kernel, const std::string& kernel_name,
+                  const std::string& strategy_name, int num_threads_or_tasks) {
+    // prepare() allocates and initializes data structures needed for the selected kernel
+    kernel.prepare();
+
+    try {
+        // steady_clock is monotonic, so the measured interval is immune to wall-clock adjustments
+        const auto start_time = std::chrono::steady_clock::now();
+
+        // VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM
+        kernel.execute(num_threads_or_tasks, VECTOR_SIZE);
+
+        const auto end_time = std::chrono::steady_clock::now();
+        const std::chrono::duration<double, std::milli> duration = end_time - start_time;
+
+        std::cout << "Kernel: " << kernel_name << "\n";
+        std::cout << "Parallelization strategy: " << strategy_name << "\n";
+        std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n";
+        std::cout << "Kernel execution time [ms]: " << duration.count() << "\n";
+    } catch (const std::invalid_argument& e) {
+        // raised by the strategy selection; the kernel itself was found, so do not list kernels
+        std::cerr << e.what() << "\n";
+        return 1;
+    }
+    return 0;
+}
+
+} // namespace
+
+// Entry point: benchmark <kernel_name> <strategy> <num_threads_or_tasks>.
+// Exits with 1 on a usage error, an invalid thread/task count, an unknown kernel
+// or an unknown strategy; exits with 0 after a successful run.
+int main(int argc, char** argv) {
+
+    if (argc != 4) {
+        std::cerr << "Usage: " << argv[0] << " <kernel_name> <strategy> <num_threads_or_tasks>\n";
+        return 1;
+    }
+
+    const std::string kernel_name = argv[1];
+    const std::string strategy_name = argv[2];
+    const std::string count_arg = argv[3];
+
+    // Parse the count as a signed int and insist on a fully consumed, positive value:
+    // zero or negative counts are meaningless and would break the chunk computation.
+    int num_threads_or_tasks = 0;
+    try {
+        std::size_t parsed_chars = 0;
+        num_threads_or_tasks = std::stoi(count_arg, &parsed_chars);
+        if (parsed_chars != count_arg.size() || num_threads_or_tasks <= 0) {
+            throw std::invalid_argument(count_arg);
+        }
+    } catch (const std::logic_error&) {
+        // std::stoi reports malformed input as invalid_argument and overflow as out_of_range
+        std::cerr << "Invalid number of threads / tasks: " << count_arg << " (expected a positive integer)\n";
+        return 1;
+    }
+
+    // registry contains a map of kernels generated from kernel builders for the selected parallelization strategy
+    KernelRegistry registry;
+    initialize_registry(&registry, strategy_name);
+
+    try {
+        // find kernel in unordered_map by its name
+        const Kernel kernel = registry.load_kernel(kernel_name);
+        return run_benchmark(kernel, kernel_name, strategy_name, num_threads_or_tasks);
+    } catch (const std::invalid_argument& e) {
+        // If kernel name is invalid, list available kernels
+        std::cerr << e.what() << "\n";
+        std::cerr << "Available kernels are:\n";
+
+        // List available kernels from registry
+        for (const auto& available_kernel : registry.list_available_kernels()) {
+            std::cerr << "  - " << available_kernel << "\n";
+        }
+
+        return 1;
+    }
+}