The multi-block approach to parallel reduction in CUDA poses an additional challenge compared to the single-block approach, because blocks have only limited means of communicating with each other. The idea is to let each block compute a part of the input array, and then have one final block merge all the partial results. To do that, one can launch two kernels: the boundary between the two launches acts as an implicit, grid-wide synchronization point.
static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24; //this number is hardware-dependent; usually #SM*2 is a good number.

__global__ void sumCommMultiBlock(const int *gArr, int arraySize, int *gOut) {
    int thIdx = threadIdx.x;
    int gthIdx = thIdx + blockIdx.x*blockSize;
    const int gridStride = blockSize*gridDim.x; //total number of threads in the grid
    int sum = 0;
    for (int i = gthIdx; i < arraySize; i += gridStride) //grid-stride loop: each thread accumulates a private partial sum
        sum += gArr[i];
    __shared__ int shArr[blockSize];
    shArr[thIdx] = sum;
    __syncthreads();
    for (int size = blockSize/2; size>0; size/=2) { //uniform
        if (thIdx<size)
            shArr[thIdx] += shArr[thIdx+size];
        __syncthreads();
    }
    if (thIdx == 0)
        gOut[blockIdx.x] = shArr[0]; //one partial result per block
}
__host__ int sumArray(int* arr) {
    int* dev_arr;
    cudaMalloc((void**)&dev_arr, wholeArraySize * sizeof(int));
    cudaMemcpy(dev_arr, arr, wholeArraySize * sizeof(int), cudaMemcpyHostToDevice);

    int out;
    int* dev_out;
    cudaMalloc((void**)&dev_out, sizeof(int) * gridSize);

    sumCommMultiBlock<<<gridSize, blockSize>>>(dev_arr, wholeArraySize, dev_out);
    //dev_out now holds one partial result per block
    sumCommMultiBlock<<<1, blockSize>>>(dev_out, gridSize, dev_out);
    //dev_out[0] now holds the final result
    cudaDeviceSynchronize();

    cudaMemcpy(&out, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_arr);
    cudaFree(dev_out);
    return out;
}
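For reference, a minimal host-side usage sketch; the main function and the all-ones input below are illustrative assumptions, not part of the example above:

#include <cstdio>
#include <vector>

int main() {
    std::vector<int> arr(wholeArraySize, 1); //hypothetical input: all ones, so the expected sum equals wholeArraySize
    int sum = sumArray(arr.data());
    printf("sum = %d\n", sum); //expected to print 100000000
    return 0;
}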
Ideally, one wants to launch enough blocks to saturate all the multiprocessors on the GPU at full occupancy. Exceeding this number, in particular launching as many threads as there are elements in the array, is counter-productive: it no longer increases the raw compute power, but it prevents the use of the very efficient first (grid-stride) loop.
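As a rough sketch of how this hardware-dependent grid size could be determined at run time instead of being hard-coded (the variable names below are illustrative; cudaDeviceGetAttribute and cudaOccupancyMaxActiveBlocksPerMultiprocessor are standard CUDA runtime calls):

int numSMs = 0, blocksPerSM = 0;
cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0); //device 0 assumed
cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, sumCommMultiBlock, blockSize, 0); //no dynamic shared memory
int runtimeGridSize = numSMs * blocksPerSM; //enough blocks for full occupancy; launching more adds no throughput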
It is also possible to obtain the same result using a single kernel, with the help of a last-block guard:
static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24;

__device__ bool lastBlock(int* counter) {
    __threadfence(); //ensure that the block's partial result is visible to all blocks
    int last = 0;
    if (threadIdx.x == 0)
        last = atomicAdd(counter, 1); //each block "checks in" exactly once
    return __syncthreads_or(last == gridDim.x-1); //true in every thread of the block that checked in last
}
__global__ void sumCommMultiBlock(const int *gArr, int arraySize, int *gOut, int* lastBlockCounter) {
    int thIdx = threadIdx.x;
    int gthIdx = thIdx + blockIdx.x*blockSize;
    const int gridStride = blockSize*gridDim.x; //total number of threads in the grid
    int sum = 0;
    for (int i = gthIdx; i < arraySize; i += gridStride)
        sum += gArr[i];
    __shared__ int shArr[blockSize];
    shArr[thIdx] = sum;
    __syncthreads();
    for (int size = blockSize/2; size>0; size/=2) { //uniform
        if (thIdx<size)
            shArr[thIdx] += shArr[thIdx+size];
        __syncthreads();
    }
    if (thIdx == 0)
        gOut[blockIdx.x] = shArr[0];
    if (lastBlock(lastBlockCounter)) { //only the last block to finish performs the final reduction
        shArr[thIdx] = thIdx < gridDim.x ? gOut[thIdx] : 0; //one partial result per block; there are gridDim.x of them
        __syncthreads();
        for (int size = blockSize/2; size>0; size/=2) { //uniform
            if (thIdx<size)
                shArr[thIdx] += shArr[thIdx+size];
            __syncthreads();
        }
        if (thIdx == 0)
            gOut[0] = shArr[0];
    }
}
__host__ int sumArray(int* arr) {
    int* dev_arr;
    cudaMalloc((void**)&dev_arr, wholeArraySize * sizeof(int));
    cudaMemcpy(dev_arr, arr, wholeArraySize * sizeof(int), cudaMemcpyHostToDevice);

    int out;
    int* dev_out;
    cudaMalloc((void**)&dev_out, sizeof(int) * gridSize);

    int* dev_lastBlockCounter;
    cudaMalloc((void**)&dev_lastBlockCounter, sizeof(int));
    cudaMemset(dev_lastBlockCounter, 0, sizeof(int)); //the counter must start at zero for the last-block guard to work

    sumCommMultiBlock<<<gridSize, blockSize>>>(dev_arr, wholeArraySize, dev_out, dev_lastBlockCounter);
    cudaDeviceSynchronize();

    cudaMemcpy(&out, dev_out, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(dev_arr);
    cudaFree(dev_out);
    cudaFree(dev_lastBlockCounter);
    return out;
}
Note that the kernel can be made faster, for example by using a warp-level parallel reduction.
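As an illustration only, and not part of the examples above, a minimal sketch of such a warp-level step using shuffle intrinsics (available on compute capability 3.0+; the __shfl_down_sync variant requires CUDA 9 or later) could look like this:

//Reduces a value across the 32 threads of a full warp without shared memory or __syncthreads().
__device__ int warpReduceSum(int val) {
    for (int offset = warpSize/2; offset > 0; offset /= 2)
        val += __shfl_down_sync(0xffffffff, val, offset); //add the value held by the lane 'offset' positions higher
    return val; //lane 0 ends up with the sum of the whole warp
}

A common pattern is to let each warp reduce its own values this way, write the per-warp results to shared memory, and then have the first warp reduce those.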