Parallel computing
#
OpenMP
- see also
compile
- 编译选项
-fopenmp
- or cmake
# Detect OpenMP; when found, append its compiler and linker flags to the
# global flag variables and expose the result to C/C++ code as HAVE_OPENMP.
find_package(OpenMP)
if(OPENMP_FOUND)
message(STATUS "OPENMP FOUND")
# OpenMP_*_FLAGS typically expands to e.g. -fopenmp for GCC/Clang.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS
"${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
add_definitions(-DHAVE_OPENMP=1)
else()
message(STATUS "OPENMP NOT FOUND")
# Define the macro to 0 (rather than leaving it undefined) so source code
# can test it with plain `#if HAVE_OPENMP`.
add_definitions(-DHAVE_OPENMP=0)
endif()
# Decide whether to USE OpenMP:
# - If the user did not set USE_OPENMP, follow whatever find_package found.
# - If the user explicitly enabled it, fail the configure step when OpenMP
#   is unavailable instead of silently building a serial binary.
if("${USE_OPENMP}" STREQUAL "")
if(OPENMP_FOUND)
add_definitions(-DUSE_OPENMP=1)
message(STATUS "use OPENMP")
else()
add_definitions(-DUSE_OPENMP=0)
message(STATUS "not use OPENMP")
endif()
else()
if(USE_OPENMP)
if(NOT OPENMP_FOUND)
# Hard error: the user demanded OpenMP but the toolchain lacks it.
message(FATAL_ERROR "USE_OPENMP true but OPENMP NOT FOUND")
endif()
add_definitions(-DUSE_OPENMP=1)
message(STATUS "use OPENMP")
else()
add_definitions(-DUSE_OPENMP=0)
message(STATUS "not use OPENMP")
endif()
endif()
notice
如果需要并行执行一个 for 循环(即把迭代分配给多个线程), 而不是让每个线程把语句块各执行一次:
要使用 parallel for, 而不是仅仅 parallel (没有 for);
如果是 for, 也可以省略 for (..;..;..) 的写法, 改用 parallel 块 (见下面的 parallelForOmitFor 示例).
examples
#if HAVE_OPENMP
/**
 * The pragma omp parallel is used to fork additional threads to carry out
 * the work enclosed in the construct in parallel.
 * The original thread will be denoted as master thread with thread ID 0.
 * Example (C program): Display parallel[%d] using multiple threads.
 * - Use flag -fopenmp to compile using GCC
 * - Demonstrates the shared() clause together with a std::atomic counter.
 * @sa
 * - http://www.openmp.org/wp-content/uploads/openmp-examples-4.5.0.pdf
 * - https://en.wikipedia.org/wiki/OpenMP#Thread_creation
 * - openmp-4.5.pdf
 */
TEST(openmp, shared)
{
std::atomic<int> count(0);
// Fork omp_get_max_threads() threads; the pragma applies to the single
// printf statement below, so each thread executes it exactly once.
// ++count on the atomic needs no critical section.
# pragma omp parallel default(shared) shared(count) \
num_threads(::omp_get_max_threads())
::printf("parallel[%d]\n", ++count);
// parallel done
// Every thread incremented count once, so it must equal the team size.
int const num = ::omp_get_max_threads();
EXPECT_EQ(num, count);
}
/// Shared std::atomic counter, variant 2: the team size is fixed up front
/// with omp_set_num_threads() instead of being queried inside the pragma.
TEST(openmp, shared2)
{
std::atomic<int> count(0);
int const num = ::omp_get_max_threads();
::omp_set_num_threads(num);
// Each of the num threads runs the single printf statement once.
# pragma omp parallel default(shared) shared(count) num_threads(num)
::printf("parallel[%d]\n", ++count);
// parallel done
// One increment per thread => count equals the team size.
EXPECT_EQ(num, count);
}
/// The whole for loop is executed once, with its iterations distributed
/// across the thread team ("parallel for") — contrast with multiFor below,
/// where a plain "parallel" makes every thread run the entire loop.
TEST(openmp, parallelFor)
{
  std::atomic<int> count(0);
  int const num = ::omp_get_max_threads();
  // "parallel for": the num iterations are split among num threads, so the
  // loop body runs exactly num times in total.
# pragma omp parallel for default(shared) shared(count) num_threads(num)
  for (int i = 0; i < num; ++i) { // the whole loop runs once, in parallel
    ::printf("parallel[%d] i %d\n", ++count, i);
  }
  // parallel done
  // Fixed: was EXPECT_EQ(8, count), which hard-coded the core count of one
  // particular machine. The loop executes num iterations, each incrementing
  // count once, so compare against num.
  EXPECT_EQ(num, count);
}
/// Omits the for loop: a plain parallel region whose block each thread
/// executes once — same net result as the parallelFor test above.
TEST(openmp, parallelForOmitFor)
{
std::atomic<int> count(0);
// The pragma applies to the brace-enclosed block below; every thread in
// the team runs the block body exactly once.
# pragma omp parallel default(shared) shared(count) \
num_threads(::omp_get_max_threads())
{
::printf("parallel[%d]\n", ++count);
}
// parallel done
int const num = ::omp_get_max_threads();
// One increment per thread => count equals the team size.
EXPECT_EQ(num, count);
}
/// The whole for loop is executed multiple times: a plain "parallel"
/// (without the "for" clause) in front of a loop makes EVERY thread run
/// the entire loop, rather than splitting its iterations.
TEST(openmp, multiFor)
{
std::atomic<int> count(0);
int i, c; // per-thread copies via private(i, c) below
int const num = ::omp_get_max_threads();
::omp_set_num_threads(num);
# pragma omp parallel default(shared) private(i, c) shared(count) \
num_threads(num)
// Each thread executes its own full loop: c = ++count snapshots the
// shared counter once per thread, then that thread iterates i = 0..c-1.
for (i = 0, c = ++count; i < c; ++i) { // the whole loop runs per thread
::printf("parallel[%d ~ %d]\n", i, c);
}
// parallel done
// count was incremented exactly once per thread.
EXPECT_EQ(num, count);
}
/// private(count): each thread gets its own copy of count; writes inside
/// the region never reach the outer variable.
TEST(openmp, private)
{
std::atomic<int> count(999);
# pragma omp parallel default(shared) private(count) \
num_threads(::omp_get_max_threads())
{
// NOTE(review): the per-thread private copy is default-initialized, so
// its value is indeterminate here — reading it for EXPECT_NE is dubious
// and could in principle fail by coincidence; confirm intent.
EXPECT_NE(999, count);
::printf("parallel[%d]\n", count = 20);
// parallel done
}
// Only the private copies were assigned 20; the outer count is untouched.
EXPECT_EQ(999, count);
}
/// private(count) with a plain int: same behavior as the atomic variant —
/// per-thread copies start indeterminate and writes never propagate out.
TEST(openmp, private2)
{
int count = 999;
# pragma omp parallel default(shared) private(count) \
num_threads(::omp_get_max_threads())
{
// NOTE(review): reading the indeterminate private copy of a plain int is
// technically undefined behavior — this EXPECT_NE is best-effort only.
EXPECT_NE(999, count);
::printf("parallel[%d]\n", count = 20);
// parallel done
}
// Assignments inside the region touched only the private copies.
EXPECT_EQ(999, count);
}
#endif
use future or shared future
example
// Compute the left and right disparity maps concurrently: one std::async
// task per map, each writing into its own output matrix.
cv::Mat_<int32_t> rightDisparity(
this->leftRect.rows, this->leftRect.cols, 0);
cv::Mat_<int32_t> leftDisparity(
this->leftRect.rows, this->leftRect.cols, 0);
// std::launch::async forces execution on a new thread (not deferred).
// boost::bind leaves _1/_2 open; std::async supplies them: the bool flag
// (presumably left-vs-right — confirm against computeDisparity) and the
// output matrix by reference via std::ref.
std::future<void> leftDone(std::async(
std::launch::async,
boost::bind(&ParallelSADStereoMatch::computeDisparity, this, _1, _2),
true, std::ref(leftDisparity)));
std::future<void> rightDone(std::async(
std::launch::async,
boost::bind(&ParallelSADStereoMatch::computeDisparity, this, _1, _2),
false, std::ref(rightDisparity)));
// get() blocks until each task finishes and rethrows any exception it
// raised, so both maps are complete past this point.
rightDone.get();
leftDone.get();
SIMD
SSE2
TODO
thread parallel for
example implementation
/**
 * Apply f to every element of [first, last) using a team of worker
 * threads, each processing one contiguous chunk of at least `threshold`
 * elements.
 *
 * @param first      random-access iterator to the first element
 * @param last       past-the-end iterator
 * @param f          callable invoked once per element; shared by all
 *                   workers, so it must be safe to invoke concurrently
 * @param threadsNum desired thread count; 0 or negative is clamped to 1
 *                   (the original divided by zero for threadsNum == 0)
 * @param threshold  minimum chunk size, so tiny ranges do not fan out
 *                   into many short-lived threads
 */
template<typename I, typename F>
void ParallelFor(
I const& first,
I const& last,
F&& f,
int const threadsNum = 1,
int const threshold = 1000)
{
  if (!(first < last)) {
    return; // empty range: nothing to do, spawn no threads
  }
  // Clamp to >= 1 worker so the division below is well defined even when
  // the caller passes threadsNum == 0 (std::abs(0) == 0 previously caused
  // a division by zero).
  ptrdiff_t const workers =
      std::max(ptrdiff_t(1), ptrdiff_t(std::abs(threadsNum)));
  // Chunk size: the range split `workers` ways, but never below the
  // (clamped) threshold. Kept as ptrdiff_t — the original narrowed this
  // to unsigned, risking overflow on very large ranges.
  ptrdiff_t const group = std::max(
      std::max(ptrdiff_t(1), ptrdiff_t(std::abs(threshold))),
      (last - first) / workers);
  std::vector<std::thread> threads;
  // One thread per chunk: ceil((last - first) / group).
  threads.reserve(static_cast<size_t>(((last - first) + group - 1) / group));
  for (I it = first; it < last; it += group) {
    // Capture `it` (this chunk's start) by value and f by reference;
    // every worker is joined below, so f outlives all of them.
    threads.push_back(std::thread([=, &f]() {
        std::for_each(it, std::min(it + group, last), f); }));
  }
  // Join all workers before returning so every application of f has
  // completed and no std::thread is destroyed while still joinable.
  for (std::thread& t : threads) {
    t.join();
  }
}