Multi-block approach to parallel reduction is very similar to the single-block approach. The global input array must be split into sections, each reduced by a single block. When a partial result from each block is obtained, one final block reduces these to obtain the final result.
sumNoncommSingleBlock
is explained more in detail in the Single-block reduction example.lastBlock
accepts only the last block reaching it. If you want to avoid this, you can split the kernel into two separate calls.static const int wholeArraySize = 100000000;
static const int blockSize = 1024;
static const int gridSize = 24; //this number is hardware-dependent; usually #SM*2 is a good number.
__device__ bool lastBlock(int* counter) {
__threadfence(); //ensure that partial result is visible by all blocks
int last = 0;
if (threadIdx.x == 0)
last = atomicAdd(counter, 1);
return __syncthreads_or(last == gridDim.x-1);
}
__device__ void sumNoncommSingleBlock(const int* gArr, int arraySize, int* out) {
int thIdx = threadIdx.x;
__shared__ int shArr[blockSize*2];
__shared__ int offset;
shArr[thIdx] = thIdx<arraySize ? gArr[thIdx] : 0;
if (thIdx == 0)
offset = blockSize;
__syncthreads();
while (offset < arraySize) { //uniform
shArr[thIdx + blockSize] = thIdx+offset<arraySize ? gArr[thIdx+offset] : 0;
__syncthreads();
if (thIdx == 0)
offset += blockSize;
int sum = shArr[2*thIdx] + shArr[2*thIdx+1];
__syncthreads();
shArr[thIdx] = sum;
}
__syncthreads();
for (int stride = 1; stride<blockSize; stride*=2) { //uniform
int arrIdx = thIdx*stride*2;
if (arrIdx+stride<blockSize)
shArr[arrIdx] += shArr[arrIdx+stride];
__syncthreads();
}
if (thIdx == 0)
*out = shArr[0];
}
__global__ void sumNoncommMultiBlock(const int* gArr, int* out, int* lastBlockCounter) {
int arraySizePerBlock = wholeArraySize/gridSize;
const int* gArrForBlock = gArr+blockIdx.x*arraySizePerBlock;
int arraySize = arraySizePerBlock;
if (blockIdx.x == gridSize-1)
arraySize = wholeArraySize - blockIdx.x*arraySizePerBlock;
sumNoncommSingleBlock(gArrForBlock, arraySize, &out[blockIdx.x]);
if (lastBlock(lastBlockCounter))
sumNoncommSingleBlock(out, gridSize, out);
}
One ideally wants to launch enough blocks to saturate all multiprocessors on the GPU at full occupancy. Exceeding this number - in particular, launching as many threads as there are elements in the array - is counter-productive. Doing so it does not increase the raw compute power anymore, but prevents using the very efficient first loop.