Parallel computing

OpenMP

compile

  • Compiler flag (e.g. g++ -fopenmp foo.cpp):
-fopenmp
  • or with CMake:
find_package(OpenMP)
if(OPENMP_FOUND)
  message(STATUS "OPENMP FOUND")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
  set(CMAKE_EXE_LINKER_FLAGS
    "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
  add_definitions(-DHAVE_OPENMP=1)
else()
  message(STATUS "OPENMP NOT FOUND")
  add_definitions(-DHAVE_OPENMP=0)
endif()
if("${USE_OPENMP}" STREQUAL "")
  if(OPENMP_FOUND)
    add_definitions(-DUSE_OPENMP=1)
    message(STATUS "use OPENMP")
  else()
    add_definitions(-DUSE_OPENMP=0)
    message(STATUS "not use OPENMP")
  endif()
else()
  if(USE_OPENMP)
    if(NOT OPENMP_FOUND)
      message(FATAL_ERROR "USE_OPENMP true but OPENMP NOT FOUND")
    endif()
    add_definitions(-DUSE_OPENMP=1)
    message(STATUS "use OPENMP")
  else()
    add_definitions(-DUSE_OPENMP=0)
    message(STATUS "not use OPENMP")
  endif()
endif()
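
With CMake 3.9 or newer, the imported target OpenMP::OpenMP_CXX carries the
needed compile and link flags, so the global-flag editing above can be
replaced by a per-target link; a minimal sketch (mytarget is a placeholder
name):

find_package(OpenMP)
if(OpenMP_CXX_FOUND)
  target_link_libraries(mytarget PRIVATE OpenMP::OpenMP_CXX)
endif()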

notice

If you want the iterations of a for loop divided among the threads (i.e. the
for executed in parallel), rather than the whole statement block executed
once by every thread, use parallel for, not a bare parallel (without for);
compare the parallelFor and multiFor examples below.

examples

#include <atomic>
#include <cstdio>
#include <gtest/gtest.h>

#if HAVE_OPENMP
#include <omp.h>
/**
 * The pragma omp parallel is used to fork additional threads to carry out
 * the work enclosed in the construct in parallel.
 * The original thread will be denoted as master thread with thread ID 0.
 * Example: print parallel[%d] from multiple threads.
 * - Compile with -fopenmp when using GCC.
 * - Uses shared together with std::atomic.
 * @sa
 * - http://www.openmp.org/wp-content/uploads/openmp-examples-4.5.0.pdf
 * - https://en.wikipedia.org/wiki/OpenMP#Thread_creation
 * - openmp-4.5.pdf
 */
TEST(openmp, shared)
{
    std::atomic<int> count(0);
#   pragma omp parallel default(shared) shared(count) \
        num_threads(::omp_get_max_threads())
    ::printf("parallel[%d]\n", ++count);
    // parallel done
    int const num = ::omp_get_max_threads();
    EXPECT_EQ(num, count);
}
/// Using shared and std::atomic, variant 2: fix the thread count first.
TEST(openmp, shared2)
{
    std::atomic<int> count(0);
    int const num = ::omp_get_max_threads();
    ::omp_set_num_threads(num);
#   pragma omp parallel default(shared) shared(count) num_threads(num)
    ::printf("parallel[%d]\n", ++count);
    // parallel done
    EXPECT_EQ(num, count);
}
/// The whole for loop runs in parallel (iterations split across threads).
TEST(openmp, parallelFor)
{
    std::atomic<int> count(0);
    int const num = ::omp_get_max_threads();
#   pragma omp parallel for default(shared) shared(count) num_threads(num)
    for (int i = 0; i < num; ++i) { // iterations are divided among the threads
        ::printf("parallel[%d] i %d\n", ++count, i);
    }
    // parallel done
    EXPECT_EQ(num, count);
}
/// A block without a for; same effect here as parallelFor above.
TEST(openmp, parallelForOmitFor)
{
    std::atomic<int> count(0);
#   pragma omp parallel default(shared) shared(count) \
        num_threads(::omp_get_max_threads())
    {
        ::printf("parallel[%d]\n", ++count);
    }
    // parallel done
    int const num = ::omp_get_max_threads();
    EXPECT_EQ(num, count);
}
/// The whole for loop is executed once per thread, i.e. multiple times.
TEST(openmp, multiFor)
{
    std::atomic<int> count(0);
    int i, c;
    int const num = ::omp_get_max_threads();
    ::omp_set_num_threads(num);
#   pragma omp parallel default(shared) shared(count) private(i, c) \
        num_threads(num)
    for (i = 0, c = ++count; i < c; ++i) { // every thread runs the whole loop
        ::printf("parallel[%d ~ %d]\n", i, c);
    }
    // parallel done
    EXPECT_EQ(num, count);
}
/// Using private: every thread gets its own default-constructed copy.
TEST(openmp, private)
{
    std::atomic<int> count(999);
#   pragma omp parallel default(shared) private(count) \
        num_threads(::omp_get_max_threads())
    {
        EXPECT_NE(999, count); // the private copy is not a copy of the 999
        ::printf("parallel[%d]\n", count = 20);
        // parallel done
    }
    EXPECT_EQ(999, count);
}
/// Using private with a plain int, variant 2.
TEST(openmp, private2)
{
    int count = 999;
#   pragma omp parallel default(shared) private(count) \
        num_threads(::omp_get_max_threads())
    {
        EXPECT_NE(999, count);
        ::printf("parallel[%d]\n", count = 20);
        // parallel done
    }
    EXPECT_EQ(999, count);
}
#endif
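
For comparison, a sketch of the same per-thread counting using OpenMP's
reduction clause instead of a shared std::atomic (the test name is mine):

#if HAVE_OPENMP
/// Counting with reduction instead of a shared std::atomic.
TEST(openmp, reduction)
{
    int count = 0;
    int const num = ::omp_get_max_threads();
#   pragma omp parallel num_threads(num) reduction(+ : count)
    {
        ++count; // each thread increments its private copy; copies are summed at the end
    }
    EXPECT_EQ(num, count);
}
#endif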

use std::future or std::shared_future

example

cv::Mat_<int32_t> rightDisparity(
    this->leftRect.rows, this->leftRect.cols, 0);
cv::Mat_<int32_t> leftDisparity(
    this->leftRect.rows, this->leftRect.cols, 0);
std::future<void> leftDone(std::async(
    std::launch::async,
    boost::bind(&ParallelSADStereoMatch::computeDisparity, this, _1, _2),
    true, std::ref(leftDisparity)));
std::future<void> rightDone(std::async(
    std::launch::async,
    boost::bind(&ParallelSADStereoMatch::computeDisparity, this, _1, _2),
    false, std::ref(rightDisparity)));
rightDone.get();
leftDone.get();
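
The snippet above is an excerpt from a stereo-matching class method. A
minimal self-contained sketch of the same fork-join pattern, with
hypothetical names, plus a std::shared_future for the case where several
threads wait on one result:

#include <functional>
#include <future>
#include <vector>

// Hypothetical stand-in for computeDisparity: fill a buffer with one value.
static void fillBuffer(std::vector<int>& buf, int value)
{
    for (int& x : buf) { x = value; }
}

int main()
{
    std::vector<int> left(1000), right(1000);
    std::future<void> leftDone(std::async(
        std::launch::async, fillBuffer, std::ref(left), 1));
    std::future<void> rightDone(std::async(
        std::launch::async, fillBuffer, std::ref(right), 2));
    rightDone.get(); // get() blocks and rethrows any exception from the task
    leftDone.get();
    // std::shared_future lets several threads wait on the same result:
    std::shared_future<void> shared(std::async(
        std::launch::async, fillBuffer, std::ref(left), 3));
    shared.wait();
    return 0;
}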

SIMD

SSE2

TODO
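
While this subsection is still a TODO, a minimal SSE2 sketch of the idea
(four 32-bit integer additions performed by a single instruction):

#include <emmintrin.h> // SSE2 intrinsics
#include <cstdio>

int main()
{
    alignas(16) int a[4] = {1, 2, 3, 4};
    alignas(16) int b[4] = {10, 20, 30, 40};
    alignas(16) int c[4];
    __m128i const va = _mm_load_si128(reinterpret_cast<__m128i const*>(a));
    __m128i const vb = _mm_load_si128(reinterpret_cast<__m128i const*>(b));
    __m128i const vc = _mm_add_epi32(va, vb); // c[i] = a[i] + b[i], i = 0..3
    _mm_store_si128(reinterpret_cast<__m128i*>(c), vc);
    std::printf("%d %d %d %d\n", c[0], c[1], c[2], c[3]);
    return 0;
}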


thread parallel for

example implementation

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <thread>
#include <vector>

template<typename I, typename F>
void ParallelFor(
    I const& first,
    I const& last,
    F&& f,
    int const threadsNum = 1,
    int const threshold = 1000)
{
    // Chunk size: at least max(1, |threshold|) elements, and large enough
    // that at most |threadsNum| chunks cover [first, last).
    ptrdiff_t const group = std::max(
        std::max(ptrdiff_t(1), ptrdiff_t(std::abs(threshold))),
        (last - first) / ptrdiff_t(std::max(1, std::abs(threadsNum))));
    std::vector<std::thread> threads;
    I it = first;
    while (it < last) {
        // Clamp the chunk so the iterator is never advanced past last.
        I const end = (last - it > group) ? it + group : last;
        threads.emplace_back([=, &f]() { std::for_each(it, end, f); });
        it = end;
    }
    for (std::thread& t : threads) {
        t.join();
    }
}
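
A usage sketch for the ParallelFor above, doubling every element of a
vector on up to four threads:

#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> v(10000, 1);
    // Chunks of at least 1000 elements, at most 4 worker threads.
    ParallelFor(v.begin(), v.end(), [](int& x) { x *= 2; }, 4, 1000);
    std::printf("v.front() = %d\n", v.front()); // prints 2
    return 0;
}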