CUDA представляет из себя модификацию языка C, поэтому для исходных файлов принято использовать специальное расширение - 'cu' вместо 'c'.
Основные термины:
__global__
Выполнение ядра с распараллеливанием на N потоков описывается следующим образом:
MyKernel<<<1, N>>>(параметры);
export PATH=$PATH:/opt/shared/nvidia/cuda/bin
export LD_LIBRARY_PATH=/opt/shared/nvidia/cuda/lib64:$LD_LIBRARY_PATH
То же самое можно сделать с помощью утилиты module:
module load nvidia/cuda-toolkit
#include <stdio.h>
#include <stdlib.h>   // atoi (was missing: implicit declaration under nvcc/C++)
#include <string.h>   // strlen, strcmp (was missing)
#include "cuda.h"

#define N 128

int assigned_device;  // device number requested on the command line
int used_device;      // device number actually selected by the runtime

// Data on the host system
int HostA[N];
int HostB[N];
int HostC[N];

// Pointers to data on the device
int *DeviceA;
int *DeviceB;
int *DeviceC;

// Check a CUDA runtime call; on failure print the reason and leave main()
// with exit code 130.  Only usable inside main().
#define CUDA_CHECK(call)                                         \
    do {                                                         \
        cudaError_t err_ = (call);                               \
        if (err_ != cudaSuccess) {                               \
            printf("Error: CUDA call failed: %s\n",              \
                   cudaGetErrorString(err_));                    \
            return 130;                                          \
        }                                                        \
    } while (0)

//----------------------------------------------------------
// Element-wise vector sum: c[i] = a[i] + b[i].
// Expected launch configuration: <<<1, N>>>, so threadIdx.x
// enumerates exactly [0, N).
__global__ void AddVectors(int* a, int* b, int* c)
{
    int i = threadIdx.x;
    if (i < N)               // guard keeps the kernel safe if the
        c[i] = a[i] + b[i];  // launch configuration ever changes
}

//----------------------------------------------------------
int main(int argc, char** argv)
{
    // The device number must be passed as the single command-line argument.
    if (argc < 2) {
        printf ("Error: device number is absent\n");
        return 100;
    }
    assigned_device = atoi(argv[1]);
    // Accept a single digit only; atoi() returns 0 both for "0" and for
    // garbage, so a zero result is valid only when the argument is "0".
    if (strlen(argv[1]) > 1
        || (assigned_device == 0 && strcmp(argv[1], "0") != 0)) {
        printf ("Error: device number is incorrect\n");
        return 110;
    }

    // Select the device and verify that the selection took effect:
    if (cudaSetDevice(assigned_device) != cudaSuccess
        || cudaGetDevice(&used_device) != cudaSuccess
        || used_device != assigned_device) {
        printf ("Error: unable to set device %d\n", assigned_device);
        return 120;
    }
    printf ("Used device: %d\n", used_device);

    // Initialize summands:
    for (int i = 0; i < N; i++) {
        HostA[i] = i * 2;
        HostB[i] = i * 3;
    }

    // Allocate memory on the device:
    CUDA_CHECK(cudaMalloc((void**)&DeviceA, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&DeviceB, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&DeviceC, N * sizeof(int)));

    // Copy summands from host to device:
    CUDA_CHECK(cudaMemcpy(DeviceA, HostA, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(DeviceB, HostB, N * sizeof(int), cudaMemcpyHostToDevice));

    // Execute kernel (one block of N threads):
    AddVectors<<<1, N>>>(DeviceA, DeviceB, DeviceC);
    CUDA_CHECK(cudaGetLastError());   // catch launch-configuration errors

    // Copy result from device to host (a blocking copy, so it also
    // waits for the kernel to finish):
    CUDA_CHECK(cudaMemcpy(HostC, DeviceC, N * sizeof(int), cudaMemcpyDeviceToHost));

    // Show result:
    for (int i = 0; i < N; i++) {
        printf ("%d + %d = %d\n", HostA[i], HostB[i], HostC[i]);
    }

    cudaFree(DeviceA);
    cudaFree(DeviceB);
    cudaFree(DeviceC);
    return 0;
}
module load nvidia/cuda-toolkit
nvcc -arch=compute_20 addvectors.cu -o addvectors
Либо можно использовать 'makefile' следующего содержания:
# Build the 'addvectors' binary from the CUDA source with nvcc.
# '$@' expands to the target name (addvectors).
addvectors : addvectors.cu
	nvcc -arch=compute_20 addvectors.cu -o $@
#!/bin/sh
# PBS job script: request one vnode with a single GPU and run
# ./addvectors on the GPU number that PBS assigned to this job.
#PBS -q teslaq
#PBS -l walltime=0:01:00
#PBS -l select=1:ngpus=1:ncpus=4:mem=32gb

cd $PBS_O_WORKDIR

# Extract the exec_vnode line of this job; it encodes the assigned GPU
# in square brackets, e.g. (sl002[0]:ngpus=1...).
vnodes=$(qstat -f $PBS_JOBID|grep exec_vnode|sed -e 's/ *//')

# A '+' in exec_vnode would mean several vnodes were granted; this
# single-GPU script cannot handle that case.
if [ $( echo $vnodes|grep -c '+') != 0 ] ; then
echo "Error: several vnodes are provided."
exit 100
fi

# Strip everything outside the square brackets to get the GPU number.
gpu_nbr=$(echo $vnodes|sed 's/.*\[//'|sed 's/\].*//')
echo "GPU number from vnodes = $gpu_nbr"
echo

# Make the CUDA toolkit visible to the program.
export PATH=$PATH:/opt/shared/nvidia/cuda/bin
export LD_LIBRARY_PATH=/opt/shared/nvidia/cuda/lib64:$LD_LIBRARY_PATH

./addvectors $gpu_nbr
qsub submit.sh
GPU number from vnodes = 0 Used device: 0 0 + 0 = 0 2 + 3 = 5 4 + 6 = 10 6 + 9 = 15 8 + 12 = 20 ... 246 + 369 = 615 248 + 372 = 620 250 + 375 = 625 252 + 378 = 630 254 + 381 = 635
mpi-selector --set openmpi_gcc-1.4.4
module load nvidia/cuda-toolkit
#include <stdio.h>
#include "cuda.h"

#define N 4

int used_device;  // device number actually selected by the runtime

// Data on the host system
int HostA[N];
int HostB[N];
int HostC[N];

// Pointers to data on the device
int *DeviceA;
int *DeviceB;
int *DeviceC;

// Check a CUDA runtime call; on failure print the reason and abandon
// the computation.  Only usable inside a void function.
#define CUDA_CHECK(call)                                         \
    do {                                                         \
        cudaError_t err_ = (call);                               \
        if (err_ != cudaSuccess) {                               \
            printf("Error: CUDA call failed: %s\n",              \
                   cudaGetErrorString(err_));                    \
            return;                                              \
        }                                                        \
    } while (0)

//----------------------------------------------------------
// Element-wise vector sum: c[i] = a[i] + b[i].
// Expected launch configuration: <<<1, N>>>.
__global__ void AddVectors(int* a, int* b, int* c)
{
    int i = threadIdx.x;
    if (i < N)               // guard keeps the kernel safe if the
        c[i] = a[i] + b[i];  // launch configuration ever changes
}

//----------------------------------------------------------
// Run the vector-addition demo on GPU 'assigned_device'.
// 'mpi_rank' is folded into the test data so each MPI process
// produces a distinguishable result.  extern "C" so the plain-C
// MPI part can link against it.
extern "C" void exec_cuda(int mpi_rank, int assigned_device)
{
    // Select the device and verify that the selection took effect:
    if (cudaSetDevice(assigned_device) != cudaSuccess
        || cudaGetDevice(&used_device) != cudaSuccess
        || used_device != assigned_device) {
        printf ("Error: unable to set device %d\n", assigned_device);
        return;
    }

    // Initialize summands (offset by rank so outputs differ per process):
    for (int i = 0; i < N; i++) {
        HostA[i] = (i + mpi_rank * 100) * 2;
        HostB[i] = (i + mpi_rank * 100) * 3;
    }

    // Allocate memory on the device:
    CUDA_CHECK(cudaMalloc((void**)&DeviceA, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&DeviceB, N * sizeof(int)));
    CUDA_CHECK(cudaMalloc((void**)&DeviceC, N * sizeof(int)));

    // Copy summands from host to device:
    CUDA_CHECK(cudaMemcpy(DeviceA, HostA, N * sizeof(int), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(DeviceB, HostB, N * sizeof(int), cudaMemcpyHostToDevice));

    // Execute kernel (one block of N threads):
    AddVectors<<<1, N>>>(DeviceA, DeviceB, DeviceC);
    CUDA_CHECK(cudaGetLastError());   // catch launch-configuration errors

    // Copy result from device to host (blocking, so it waits for the kernel):
    CUDA_CHECK(cudaMemcpy(HostC, DeviceC, N * sizeof(int), cudaMemcpyDeviceToHost));

    // Show result:
    for (int i = 0; i < N; i++) {
        printf ("%d + %d = %d\n", HostA[i], HostB[i], HostC[i]);
    }

    cudaFree(DeviceA);
    cudaFree(DeviceB);
    cudaFree(DeviceC);
}
#include <mpi.h>
#include <stdio.h>
#include <string.h>   // strcmp (was missing: implicit declaration)
#include <unistd.h>
#include <stdlib.h>

extern void exec_cuda(int, int);

int *gpu_by_rank;     // rank -> assigned GPU number (bookkeeping on rank 0)
int *vnode_is_used;   // 1 when the i-th (host, gpu) pair is already taken

//------------------------------------------------------------
// Assign a still-free GPU located on 'host' to MPI process 'rank'.
// The (host, gpu) pairs come from the command line; returns the GPU
// number, or -1 when no free GPU is left on that host.
// Must only be called on rank 0, after the bookkeeping arrays exist.
int define_gpu (int rank, char *host, int argc, char** argv)
{
    int i, gpu = -1;
    // 'argv' looks like the following:
    // ./mpi_cuda sl002 0 sl002 1 sl002 2 sl003 0 sl003 1 sl003 2
    for (i = 1; i <= (argc - 1) / 2; i++) {
        if (strcmp(host, argv[i*2 - 1]) != 0) continue;  // different host
        if (vnode_is_used[i-1] != 0) continue;           // GPU already taken
        gpu = atoi(argv[i*2]);
        gpu_by_rank[rank] = gpu;
        vnode_is_used[i-1] = 1;
        break;
    }
    return gpu;
}

//------------------------------------------------------------
// Rank 0 collects every process's hostname, hands out one GPU per
// process via define_gpu(), and sends the assignment back; then every
// process runs the CUDA part on its own GPU.
int main(int argc, char** argv)
{
    int size, rank, i, gpu;
    char host[32];
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // Validate arguments: exactly one (host, gpu) pair per MPI process.
    if (size != (argc - 1) / 2) {
        MPI_Finalize();
        printf ("Error: amount of GPUs do not match the MPI size!\n");
        return 100;
    }

    // Define correspondence between ranks and vnodes gpu:
    if (rank == 0) {
        gpu_by_rank   = (int *)malloc(size * sizeof(int));
        vnode_is_used = (int *)malloc(size * sizeof(int));
        for (i = 0; i < size; i++) {
            gpu_by_rank[i]   = -1;
            vnode_is_used[i] = 0;
        }
        // Receive each worker's hostname, pick a GPU, send it back.
        for (i = 1; i < size; i++) {
            MPI_Recv(host, 32, MPI_CHAR, i, 0, MPI_COMM_WORLD, &status);
            gpu = define_gpu(i, host, argc, argv);
            MPI_Send(&gpu, 1, MPI_INT, i, 0, MPI_COMM_WORLD);
        }
        // Rank 0 takes one of the remaining GPUs for itself.
        gethostname(host, 32);
        host[31] = '\0';   // gethostname need not NUL-terminate on truncation
        gpu = define_gpu(0, host, argc, argv);
        free(gpu_by_rank);
        free(vnode_is_used);
    }
    else {
        gethostname(host, 32);
        host[31] = '\0';   // gethostname need not NUL-terminate on truncation
        MPI_Send(host, 32, MPI_CHAR, 0, 0, MPI_COMM_WORLD);
        MPI_Recv(&gpu, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
    }

    printf("I'm number %d from %d and I run on host %s, gpu %d.\n", rank, size, host, gpu);
    exec_cuda(rank, gpu);

    MPI_Finalize();
    return 0;
}
nvcc -arch=compute_20 -c cuda_part.cu
mpicc -c mpi_part.c
и затем собрать в исполняемый 'mpi_cuda':
mpicc mpi_part.o cuda_part.o -lm -lcudart -L/opt/shared/nvidia/cuda/lib64 -I/opt/shared/nvidia/cuda/include -o mpi_cuda
# Link the MPI part and the CUDA part into the 'mpi_cuda' binary.
# The CUDA object is compiled by nvcc, the C object by mpicc, and the
# final link goes through mpicc with the CUDA runtime library added.
mpi_cuda : mpi_part.o cuda_part.o
	mpicc mpi_part.o cuda_part.o -lm -lcudart -L/opt/shared/nvidia/cuda/lib64 -I/opt/shared/nvidia/cuda/include -o mpi_cuda

cuda_part.o : cuda_part.cu
	nvcc -arch=compute_20 -c cuda_part.cu

mpi_part.o : mpi_part.c
	mpicc -c mpi_part.c
#!/bin/sh
# PBS job script: request 2 nodes x 3 GPUs, decode which (node, GPU)
# pairs PBS granted, and pass them to ./mpi_cuda so each MPI process
# can pick its own GPU.
#PBS -q teslaq
#PBS -l walltime=0:01:00
#PBS -l select=2:ngpus=3:ncpus=3:mpiprocs=3:mem=32gb,place=scatter

cd $PBS_O_WORKDIR

# Number of MPI processes = number of lines in the PBS node file.
MPI_NP=$(wc -l $PBS_NODEFILE | awk '{ print $1 }')

# Extract the exec_vnode list, e.g. "sl002[0] sl002[1] ... sl003[2]":
# flatten the qstat output, cut out the exec_vnode value, drop the
# parentheses, split the '+'-separated entries, strip resource suffixes.
vnodes=$(qstat -f $PBS_JOBID|tr -d '\n'' ''\t'|sed 's/Hold_Types.*//'|sed 's/.*exec_vnode=//'|tr -d \(\)|tr + '\n'|sed 's/:.*//'|sort)

echo "My vnodes:"
for vnode in $vnodes ; do
node=$(echo $vnode|sed 's/\[.*//')
gpu=$(echo $vnode|sed 's/.*\[//'|sed 's/\]//')
echo " $vnode = Node $node, GPU $gpu"
done
echo

## Replace all '[' and ']' by spaces before passing to program:
vnodes=$(echo $vnodes|sed 's/\[/ /g'|sed 's/\]/ /g')

# Propagate the CUDA library path to every MPI process (-x).
export LD_LIBRARY_PATH=/opt/shared/nvidia/cuda/lib64:$LD_LIBRARY_PATH
mpirun -x LD_LIBRARY_PATH -hostfile $PBS_NODEFILE -np $MPI_NP ./mpi_cuda $vnodes
qsub submit.sh
My vnodes: sl002[0] = Node sl002, GPU 0 sl002[1] = Node sl002, GPU 1 sl002[2] = Node sl002, GPU 2 sl003[0] = Node sl003, GPU 0 sl003[1] = Node sl003, GPU 1 sl003[2] = Node sl003, GPU 2 I'm number 1 from 6 and I run on host sl002, gpu 0. I'm number 2 from 6 and I run on host sl002, gpu 1. I'm number 3 from 6 and I run on host sl003, gpu 0. I'm number 0 from 6 and I run on host sl002, gpu 2. I'm number 4 from 6 and I run on host sl003, gpu 1. I'm number 5 from 6 and I run on host sl003, gpu 2. 0 + 0 = 0 2 + 3 = 5 4 + 6 = 10 6 + 9 = 15 400 + 600 = 1000 402 + 603 = 1005 404 + 606 = 1010 406 + 609 = 1015 600 + 900 = 1500 602 + 903 = 1505 604 + 906 = 1510 606 + 909 = 1515 200 + 300 = 500 202 + 303 = 505 204 + 306 = 510 206 + 309 = 515 800 + 1200 = 2000 802 + 1203 = 2005 804 + 1206 = 2010 806 + 1209 = 2015 1000 + 1500 = 2500 1002 + 1503 = 2505 1004 + 1506 = 2510 1006 + 1509 = 2515