贡献者: addis
OpenACC 通过 #pragma
预编译指令给已有的程序进行 GPU 加速,支持在 Linux 上运行的 C++ 和 Fortran 程序。本文以 C++ 为例。由于易用性,它常用于高性能科学计算中。
如果程序输出 Not executing on GPU.
,就说明没有在 GPU 上运行,应该需要安装 OpenACC 版本的 GCC,或者直接用 NVIDIA SDK。
首先确保 nvidia-smi
命令能用(即 NVIDIA 驱动正常),然后安装好 NVIDIA SDK 以后,要在 .bashrc
最后添加以下几行设置环境变量(见 NVIDIA HPC SDK 的安装文档)
NVARCH=`uname -s`_`uname -m`; export NVARCH
NVCOMPILERS=/opt/nvidia/hpc_sdk; export NVCOMPILERS
MANPATH=$MANPATH:$NVCOMPILERS/$NVARCH/23.11/compilers/man; export MANPATH
PATH=$NVCOMPILERS/$NVARCH/23.11/compilers/bin:$PATH; export PATH
source ~/.bashrc
以后,就可以试试 nvc --version
,nvcc --version
了。
nvc/nvc++
编译器支持 OpenACC 和 OpenMP,而 nvcc
则用于 CUDA 编程。
用 GCC 编译时,加上 -fopenacc
选项即可! 例如 g++ -fopenacc test1.cpp -o test1.x
。在编译时可以加上 -fopt-info
选项输出更多信息。
nvc -acc -gpu=managed -Minfo=accel test1.cpp -o test1.x
#include <cstdio>    // printf — used inside detect_gpu's OpenACC device code
#include <iostream>
#include <vector>
#include <openacc.h>
// Function to initialize the vectors with values
// Fill the first n elements of the two vectors: a[i] = i and b[i] = 2*i.
// Both vectors must already have size >= n; only elements [0, n) are written.
void initialize(std::vector<double>& a, std::vector<double>& b, int n) {
    for (int idx = 0; idx != n; ++idx) {
        const double val = static_cast<double>(idx);
        a[idx] = val;
        b[idx] = val + val;  // same value as static_cast<double>(2 * idx)
    }
}
// detect if GPU is actually running
// detect if GPU is actually running
// Launches a trivially parallel OpenACC loop and, from inside a single
// iteration (i == 10), calls acc_on_device() to report whether the loop
// body is executing on an accelerator or has fallen back to the host.
// The a[i] += b[i] statement only exists to give the loop real work so
// the compiler does not optimize it away.
// NOTE(review): printf needs <cstdio>, which this file does not include
// directly — it currently compiles only via transitive includes.
void detect_gpu()
{
// Small scratch arrays; intentionally left uninitialized — their values
// are never inspected, the loop merely needs something to compute.
double a[100], b[100];
#pragma acc parallel loop
for (int i = 0; i < 100; ++i) {
if (i == 10) {
// acc_on_device(acc_device_not_host) is nonzero when running on a device.
if (acc_on_device(acc_device_not_host))
printf("Executing on GPU.\n");
else
printf("Not executing on GPU.\n");
}
a[i] += b[i];
}
}
// Vector-addition demo: compute c = a + b with OpenACC, explicitly
// staging the input/output arrays to and from the accelerator, then
// print the first few results as a sanity check.
int main() {
    const int n = 1000000; // number of elements per vector

    std::vector<double> a(n), b(n), c(n);
    // OpenACC data clauses operate on contiguous arrays, so hand the
    // directives raw pointers into the vectors' storage.
    double *pa = a.data();
    double *pb = b.data();
    double *pc = c.data();

    initialize(a, b, n); // fill a and b on the host
    detect_gpu();        // report whether offloading actually happens

    // copyin: transfer a and b to the device; copyout: bring c back.
    #pragma acc data copyin(pa[0:n], pb[0:n]) copyout(pc[0:n])
    {
        #pragma acc parallel loop
        for (int idx = 0; idx < n; ++idx)
            pc[idx] = pa[idx] + pb[idx];
    }

    // Show the first 10 entries of the result.
    for (int idx = 0; idx < 10; ++idx)
        std::cout << "c[" << idx << "] = " << c[idx] << std::endl;
}
-ta=tesla
: Compiler option to target NVIDIA Tesla GPUs(注:较新的 NVHPC 编译器已弃用 -ta,改用 -acc -gpu=... 形式)。
-Minfo=accel
: Provides feedback about the code generated by the compiler.
#pragma acc parallel
: GPU 并行运算
#pragma acc kernels
: Identifies a code block for parallelization, allowing the compiler to automatically manage parallelism.
#pragma acc loop
: Used within parallel or kernels regions to indicate loops that should be parallelized.
#pragma acc routine
: 让一个函数可以在 GPU 代码中被调用(也可以在 CPU 代码调用)。
#pragma acc declare
: Used for declaring variables or creating a data region.
#pragma acc data
: Manages data movement to and from the GPU.
#pragma acc enter data
: Specifies data that should be moved to the GPU.
#pragma acc exit data
: Specifies data to be moved back from the GPU.
#pragma acc update
: Synchronizes data between the host and the GPU.
copy, copyin, copyout
, create, present: Clauses for data construct to define how data is handled (e.g., whether it's copied to/from the GPU or just created there).
gang, worker, vector
: Used with loop directive to control how loop iterations are distributed over parallel execution units.
collapse(n)
: Collapses nested loops to enhance parallelism.
reduction(operator:list)
: Performs a reduction operation (like sum, max) across parallel elements.