Initial commit

2024-12-13 00:33:08 +01:00 · 2024-12-13 00:33:08 +01:00 · d53f0d883c
parent c1b0fee200
commit d53f0d883c
8 changed files with 409 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -32,3 +32,5 @@
 *.out
 *.app
 bin/*
 obj/*
--- a/36
+++ b/36
@ -0,0 +1,36 @@
 # Toolchain settings; every ?= variable can be overridden from the environment or the make command line.
 CXX ?= g++
 OPTFLAGS ?= -O3 -march=native
 # Number of vector elements, handed to the sources as the VECTOR_SIZE preprocessor macro.
 VECTOR_SIZE ?= 268435456
 CPPFLAGS = -DVECTOR_SIZE=$(VECTOR_SIZE)
 SRC_DIR = src
 INCLUDE_DIR = include
 OBJ_DIR = obj
 BIN_DIR = bin
 INCLUDES = -I./$(INCLUDE_DIR) -I$(EVENTIFY_ROOT)/include
 # -MMD -MP writes a .d dependency file next to each object so that header edits trigger a rebuild.
 CXXFLAGS = $(CPPFLAGS) -std=c++20 -fopenmp $(INCLUDES) $(OPTFLAGS) -MMD -MP
 # Linker options and search paths live in LDFLAGS; the libraries themselves live in LDLIBS so that
 # they can be placed after the object files, which single-pass linkers need to resolve symbols.
 LDFLAGS = -fopenmp -L$(EVENTIFY_ROOT)/lib
 LDLIBS = -leventify
 SRCS = $(wildcard $(SRC_DIR)/*.cpp)
 OBJS = $(SRCS:$(SRC_DIR)/%.cpp=$(OBJ_DIR)/%.o)
 DEPS = $(OBJS:.o=.d)
 TARGET = $(BIN_DIR)/benchmark
 # Default rule to build the program
 all: $(TARGET)
 # Link step: objects first, libraries last.
 $(TARGET): $(OBJS)
 	@mkdir -p $(BIN_DIR)
 	$(CXX) $(LDFLAGS) $(OBJS) $(LDLIBS) -o $@
 # Compile step: one object (plus its .d file) per source file.
 $(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp
 	@mkdir -p $(OBJ_DIR)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 clean:
 	rm -rf $(BIN_DIR) $(OBJ_DIR)
 # Pull in the generated header dependencies; the leading '-' ignores files that do not exist yet.
 -include $(DEPS)
 .PHONY: all clean
--- a/README.md
+++ b/README.md
@ -1,2 +1,105 @@
-# pkbf
+# pkbf - Parallel Kernel Benchmarking Framework
 This project provides a benchmarking framework for parallel computing kernels. Kernel execution can be parallelized with either OpenMP or Eventify, so that the two can be compared for the FlexFMM collaborative project.
 The application is designed to make adding kernels and parallelization strategies as easy as possible.
 ## Features
 - **Kernel Registry**: A registry that allows the user to register and execute different computational kernels easily.
 - **Parallelization Strategies**: Two strategies for parallelizing the execution of kernel loops:
  - **OpenMP**: Uses OpenMP directives to parallelize the outermost loop.
  - **Eventify**: Uses the Eventify tasking system for parallelism.
 - **Kernel Execution**: Kernels such as **STREAM TRIAD** and **DAXPY** are implemented, and their execution can be timed and compared across different parallelization strategies.
 ## Project Structure
 ```
 .
 ├── bin/              # Compiled executable
 ├── include/          # Header files
 │   ├── kernels.hpp   # Kernel and KernelRegistry declarations
 │   ├── strategy.hpp  # Parallelization strategies (OpenMP, Eventify)
 │   └── utils.hpp     # Utility functions for initialization
 ├── src/              # Source files
 │   ├── kernels.cpp   # Kernel and KernelRegistry implementations
 │   ├── strategy.cpp  # Parallelization strategies (OpenMP, Eventify)
 │   └── main.cpp      # Main entry point for benchmarking
 ├── Makefile          # Makefile to build the project
 └── README.md         # Project documentation
 ```
 ## Requirements
 - C++20 or higher
 - OpenMP support (for OpenMP parallelization strategy)
 - Eventify library (for Eventify parallelization strategy)
 ### Dependencies:
 - **Eventify**: Ensure that the Eventify library is properly installed and the environment variable `EVENTIFY_ROOT` points to the root directory of the Eventify installation.
 ## Building the Project
 To build the project, run:
 ```
 make
 ```
 This will compile the source files and generate an executable called `benchmark` in the `bin/` directory.
 ### Clean Up
 To remove all compiled files and the executable, run:
 ```
 make clean
 ```
 ## Usage
 ### Running the Benchmark
 To run a kernel benchmark, use the following command:
 ```
 ./bin/benchmark <kernel_name> <strategy> <num_threads_or_tasks>
 ```
 - `<kernel_name>`: The name of the kernel to run. Example: `stream_triad`
 - `<strategy>`: The parallelization strategy to use. Available options: `omp` (for OpenMP) and `eventify` (for Eventify).
 - `<num_threads_or_tasks>`: The number of threads or tasks to use for parallel execution. This depends on the parallelization strategy (e.g., number of threads for OpenMP, number of tasks for Eventify).
 ### Example:
 To run the `stream_triad` kernel with the OpenMP strategy using 4 threads:
 ```
 ./bin/benchmark stream_triad omp 4
 ```
 To run the `daxpy` kernel with the Eventify strategy using 8 tasks:
 ```
 ./bin/benchmark daxpy eventify 8
 ```
 ### Error Handling
 - If an invalid kernel name is provided, the program will print an error message and list available kernels.
 Example of an invalid kernel name:
 ```
 $ ./bin/benchmark invalid_kernel omp 4
 Kernel not found: invalid_kernel
 Available kernels are:
  - stream_triad
  - daxpy
 ```
 ## Contributing
 Feel free to submit issues or pull requests to improve the project.
 ## License
 This project is licensed under the MIT License.
--- a/include/kernels.hpp
+++ b/include/kernels.hpp
@ -0,0 +1,41 @@
 #ifndef KERNELS_HPP
 #define KERNELS_HPP
 #include <functional>
 #include <string>
 #include <unordered_map>
 #include <vector>
 /// A named benchmark kernel: a data preparation step plus a strategy-wrapped kernel loop.
 class Kernel {
 public:
  /// Runs the kernel loop. Invoked by execute() as (start index, end index, number of threads or tasks).
  using StrategyFunction = std::function<void(int, int, int)>;
  /// Sets up whatever data the kernel needs; invoked by prepare().
  using PreparationFunction = std::function<void()>;
  /// @param name                  Name of the kernel.
  /// @param strategy_function     Callable that runs the kernel loop.
  /// @param preparation_function  Callable that prepares the kernel's data.
  Kernel(const std::string& name, StrategyFunction strategy_function, PreparationFunction preparation_function);
  /// Invokes the preparation function.
  void prepare() const;
  /// Invokes the strategy function over the index range starting at 0 and ending at kernel_tripcount.
  /// @param num_threads_or_tasks  Degree of parallelism handed to the strategy function.
  /// @param kernel_tripcount      End index handed to the strategy function.
  void execute(int num_threads_or_tasks, int kernel_tripcount) const;
 private:
  std::string name_;
  StrategyFunction strategy_function_;
  PreparationFunction preparation_function_;
 };
 /// Maps kernel names to builders that create the corresponding Kernel on demand.
 class KernelRegistry {
 public:
  /// Factory that produces a fresh Kernel each time it is called.
  using KernelBuilder = std::function<Kernel()>;
  /// Stores `factory` under `name`.
  void register_kernel(const std::string& name, KernelBuilder factory);
  /// Builds and returns the kernel registered under `name`.
  /// @throws std::invalid_argument if no kernel with that name is registered.
  [[nodiscard]] Kernel load_kernel(const std::string& name) const;
  /// Returns the names of all registered kernels; the map is unordered, so the order is unspecified.
  [[nodiscard]] std::vector<std::string> list_available_kernels() const;
 private:
  // FIXME: no benchmarking of maps done. The registry is expected to stay small, though
  std::unordered_map<std::string, KernelBuilder> registry_;
 };
 /// Registers the built-in kernels with `registry`.
 /// @param registry       Registry that receives the kernel builders.
 /// @param strategy_name  Name of the parallelization strategy the registered kernels execute with.
 void initialize_registry(KernelRegistry* registry, std::string strategy_name);
 #endif // KERNELS_HPP
--- a/include/strategy.hpp
+++ b/include/strategy.hpp
@ -0,0 +1,72 @@
 #ifndef STRATEGY_HPP
 #define STRATEGY_HPP
 #include <omp.h>
 #include <stdexcept>
 #include <string>
 #include <eventify/task_system.hxx>
 // Parallelization strategies are defined here. Assumption for now: there is always an outer loop that can be parallelized.
 // The strategies are templates instantiated when adding kernels to the kernel registry.
 // Here, we only define the treatment of the outermost loop. The loop bodies are defined in kernels.cpp
 namespace strategy {
  // Concept ensuring that a loop body defined in kernels.cpp represents one invocable
  // iteration of a parallel loop, i.e. that calling f(i) with an int index is valid.
  template <typename Func>
  concept invocable_with_int = requires(Func&& f, int i) {
      { std::forward<Func>(f)(i) };  // Checks if calling f(i) is valid
  };
  // OpenMP strategy: runs loop_body(i) for every i in [kernel_start_idx, kernel_end_idx)
  // using an OpenMP parallel for with static scheduling and n_threads threads.
  template <typename Func>
  requires invocable_with_int <Func>
  void openmp_strategy(int kernel_start_idx, int kernel_end_idx, int n_threads, Func&& loop_body) {
      omp_set_num_threads(n_threads);
      #pragma omp parallel for schedule(static)
      for (int i = kernel_start_idx; i < kernel_end_idx; ++i) {
          loop_body(i);
      }
  }
  // Eventify strategy: splits the half-open range [kernel_start_idx, kernel_end_idx) into
  // n_tasks contiguous chunks, wraps each chunk in an independent task and submits it to
  // the tasking system. The chunks cover every index exactly once; when the trip count is
  // not divisible by n_tasks, the first (tripcount % n_tasks) chunks get one extra iteration.
  // Throws std::invalid_argument if n_tasks is not positive.
  // NOTE(review): this function only submits the tasks. Presumably the eventify::task_system
  // destructor at the end of this scope waits for them to finish - confirm against the
  // Eventify documentation, since callers time this call.
  template <typename Func>
  requires invocable_with_int <Func>
  void eventify_strategy(int kernel_start_idx, int kernel_end_idx, int n_tasks, Func&& loop_body) {
    if (n_tasks <= 0) {
      // Guards the division below and avoids silently running no iterations at all.
      throw std::invalid_argument("Number of tasks must be positive");
    }
    const int tripcount = kernel_end_idx - kernel_start_idx;
    if (tripcount <= 0) {
      return;  // empty range: nothing to run
    }
    auto task_system = eventify::task_system {};
    const int chunk_size = tripcount / n_tasks;
    const int remainder = tripcount % n_tasks;
    for (int tid = 0; tid < n_tasks; ++tid) {
      // Chunks before `tid` contribute chunk_size iterations each, plus one extra for
      // every chunk that received a share of the remainder.
      const int start_idx = kernel_start_idx + tid * chunk_size + (tid < remainder ? tid : remainder);
      const int end_idx = start_idx + chunk_size + (tid < remainder ? 1 : 0);
      if (start_idx == end_idx) {
        continue;  // more tasks than iterations: no work left for this task
      }
      // The task owns a copy of the loop body and its chunk bounds.
      auto task = [start_idx, end_idx, loop_body]{
        for (int i = start_idx; i < end_idx; ++i) {
          loop_body(i);
        }
      };
      task_system.submit(task);
    }
  }
  // Parallelization strategy selector: dispatches to the strategy named by strategy_name
  // ("omp" or "eventify") and throws std::invalid_argument for any other name.
  template <typename Func>
  requires invocable_with_int<Func>
  void execute_strategy(const std::string& strategy_name, int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks, Func&& loop_body) {
      if (strategy_name == "omp") {
          openmp_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
      } else if (strategy_name == "eventify") {
          eventify_strategy(kernel_start_idx, kernel_end_idx, num_threads_or_tasks, std::forward<Func>(loop_body));
      } else {
          throw std::invalid_argument("Unknown strategy: " + strategy_name);
      }
  }
 }
 #endif //STRATEGY_HPP
--- a/include/utils.hpp
+++ b/include/utils.hpp
@ -0,0 +1,16 @@
 #ifndef UTILS_HPP
 #define UTILS_HPP
 #include <random>
 #include <vector>
 // Overwrites every element of `v` with a pseudo-random float drawn from a uniform
 // distribution over 0.0 to 1.0. The generator is a std::mt19937 seeded from
 // std::random_device, so the values differ between calls. The size of `v` is not changed.
 // Declared inline because the definition lives in a header: without it, including this
 // header from a second translation unit produces duplicate-symbol link errors.
 inline void initialize_vector(std::vector<float>& v) {
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(0.0f, 1.0f);
    for (auto& elem : v) {
        elem = dis(gen);
    }
 }
 #endif //UTILS_HPP
--- a/src/kernels.cpp
+++ b/src/kernels.cpp
@ -0,0 +1,87 @@
 #include <memory>
 #include <stdexcept>
 #include "kernels.hpp"
 #include "strategy.hpp"
 #include "utils.hpp"
 Kernel::Kernel(const std::string& name, Kernel::StrategyFunction strategy_function, Kernel::PreparationFunction preparation_function)
  : name_(name), strategy_function_(std::move(strategy_function)), preparation_function_(std::move(preparation_function)) {}
 void Kernel::prepare() const {
  preparation_function_();
 }
 // Runs the strategy callable over the index range that starts at 0 and ends at
 // kernel_tripcount, with the requested number of threads or tasks.
 void Kernel::execute(int num_threads_or_tasks, int kernel_tripcount) const {
  constexpr int first_idx = 0;
  const int end_idx = kernel_tripcount;
  strategy_function_(first_idx, end_idx, num_threads_or_tasks);
 }
 void KernelRegistry::register_kernel(const std::string& name, KernelBuilder factory) {
  registry_.emplace(name, std::move(factory));
 }
 // Looks up the builder registered under `name` and returns the kernel it produces.
 // Throws std::invalid_argument when the name is not registered.
 Kernel KernelRegistry::load_kernel(const std::string& name) const {
  if (const auto entry = registry_.find(name); entry != registry_.end()) {
    const KernelBuilder& build_kernel = entry->second;
    return build_kernel();
  }
  throw std::invalid_argument("Kernel not found: " + name);
 }
 // Returns the names of all registered kernels. The registry is an unordered map, so the
 // order of the returned names is unspecified.
 std::vector<std::string> KernelRegistry::list_available_kernels() const {
  std::vector<std::string> kernel_names;
  // The final size is known up front, so allocate once instead of growing repeatedly.
  kernel_names.reserve(registry_.size());
  for (const auto& entry : registry_) {
    kernel_names.push_back(entry.first);
  }
  return kernel_names;
 }
 // New kernels go here; each can have its own set of arguments and initializations.
 // execute() contains the full kernel code minus an outer for loop (i=start, i<end, ++i),
 // which is defined in the respective parallelization strategy.
 //
 // Registers the "stream_triad" and "daxpy" kernels with `registry`. Every kernel built
 // from these entries runs its loop through the strategy named by `strategy_name`.
 //
 // The builders are stored in the registry and invoked after this function has returned,
 // so each builder captures its own copy of strategy_name. Capturing the parameter by
 // reference would leave the builder reading a destroyed string.
 void initialize_registry(KernelRegistry* registry, std::string strategy_name) {
  // STREAM TRIAD: a[i] = b[i] + 0.5 * c[i]
  registry->register_kernel("stream_triad", [strategy_name]() {
    // The vectors are shared between prepare and execute, which both hold a copy of the pointers.
    auto a = std::make_shared<std::vector<float>>();
    auto b = std::make_shared<std::vector<float>>();
    auto c = std::make_shared<std::vector<float>>();
    // Allocates all three vectors and fills the two inputs with random values.
    auto prepare = [=]() {
      a->resize(VECTOR_SIZE);
      b->resize(VECTOR_SIZE);
      c->resize(VECTOR_SIZE);
      initialize_vector(*b);
      initialize_vector(*c);
    };
    auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
      strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
        (*a)[i] = (*b)[i] + 0.5f * (*c)[i];
      });
    };
    return Kernel("stream_triad", execute, prepare);
  });
  // DAXPY: a[i] += 0.5 * b[i]
  registry->register_kernel("daxpy", [strategy_name]() {
    auto a = std::make_shared<std::vector<float>>();
    auto b = std::make_shared<std::vector<float>>();
    // Allocates both vectors; only the input b is filled with random values.
    auto prepare = [=]() {
      a->resize(VECTOR_SIZE);
      b->resize(VECTOR_SIZE);
      initialize_vector(*b);
    };
    auto execute = [=](int kernel_start_idx, int kernel_end_idx, int num_threads_or_tasks) {
      strategy::execute_strategy(strategy_name, kernel_start_idx, kernel_end_idx, num_threads_or_tasks, [&](int i) {
        (*a)[i] += 0.5f * (*b)[i];
      });
    };
    return Kernel("daxpy", execute, prepare);
  });
 }
--- a/src/main.cpp
+++ b/src/main.cpp
@ -0,0 +1,51 @@
 #include <iostream>
 #include <chrono>
 #include "kernels.hpp"
 // Entry point: benchmark <kernel_name> <strategy> <num_threads_or_tasks>.
 // Loads the requested kernel, prepares its data, times one execution and prints the result.
 // Returns 0 on success and 1 on a usage error, an invalid count or an std::invalid_argument
 // raised while loading or running the kernel.
 int main(int argc, char** argv) {
    if (argc != 4) {
        std::cerr << "Usage: " << argv[0] << " <kernel_name> <strategy> <num_threads_or_tasks>\n";
        return 1;
    }
    const std::string kernel_name = argv[1];
    const std::string strategy_name = argv[2];
    const std::string count_arg = argv[3];
    // Parse the count as a signed int and reject trailing garbage, zero and negative values.
    // std::stoi reports bad input through std::invalid_argument or std::out_of_range, which
    // both derive from std::logic_error.
    int num_threads_or_tasks = 0;
    try {
        std::size_t parsed_chars = 0;
        num_threads_or_tasks = std::stoi(count_arg, &parsed_chars);
        if (parsed_chars != count_arg.size() || num_threads_or_tasks <= 0) {
            throw std::invalid_argument(count_arg);
        }
    } catch (const std::logic_error&) {
        std::cerr << "Invalid <num_threads_or_tasks>: '" << count_arg << "' (expected a positive integer)\n";
        return 1;
    }
    // registry contains a map of kernels generated from kernel builders for the selected parallelization strategy
    KernelRegistry registry;
    initialize_registry(&registry, strategy_name);
    try {
        // find kernel in unordered_map by its name. prepare() allocates and initializes data structures needed for the selected kernel
        Kernel kernel = registry.load_kernel(kernel_name);
        kernel.prepare();
        // Time the kernel execution with the monotonic clock, which cannot jump during the measurement
        const auto start_time = std::chrono::steady_clock::now();
        // VECTOR_SIZE is a preprocessor variable to mimic the setup of STREAM
        kernel.execute(num_threads_or_tasks, VECTOR_SIZE);
        const auto end_time = std::chrono::steady_clock::now();
        const std::chrono::duration<double, std::milli> duration = end_time - start_time;
        std::cout << "Kernel: " << kernel_name << "\n";
        std::cout << "Parallelization strategy: " << strategy_name << "\n";
        std::cout << "Number of threads / tasks: " << num_threads_or_tasks << "\n";
        std::cout << "Kernel execution time [ms]: " << duration.count() << "\n";
    } catch (const std::invalid_argument& e) {
        // Reached for an unknown kernel name, and also for an unknown strategy name,
        // which is reported through the same exception type when the kernel executes.
        std::cerr << e.what() << "\n";
        std::cerr << "Available kernels are:\n";
        // List available kernels from registry
        for (const auto& available_kernel : registry.list_available_kernels()) {
            std::cerr << "  - " << available_kernel << "\n";
        }
        return 1;
    }
    return 0;
 }