In this post I’m going to solve a common problem in CUDA, a reduction, using the algorithms presented by Mark Harris in “Optimizing Parallel Reduction in CUDA”.
We know that performing a reduction on a GPU is not very efficient, and writing the most optimal code for it can be difficult and unintuitive.
The post by Mark Harris only gives the algorithms but is not very clear about how to actually use them, so that is what I’m going to cover here.
Let us solve the following integral
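The integral in question (inferred from the kernel below) is

\int_0^1 \frac{4}{1 + x^2}\,dx = \pi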
This is not a calculus class, but back in the day this was solved like this:
The sum of the areas of all the rectangles gives an approximation to the integral, which is none other than π. And yes, you are indeed right: this is a reduction problem.
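Written as a midpoint-rule sum, which is exactly what the code below computes, the approximation is

\pi \approx \frac{1}{n} \sum_{i=0}^{n-1} \frac{4}{1 + x_i^2}, \qquad x_i = \frac{i + 0.5}{n}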
So… how are we going to tackle this one?
First we need to compute the height of each rectangle (the common width 1/n is factored out and applied at the very end). Since no rectangle depends on any other rectangle, this is a perfect job for our GPU.
__global__ void calcRectangles(float *rectangles, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    float x;
    if (i >= 0 && i < n) {
        x = (i + 0.5) / n;                  // midpoint of the i-th rectangle
        rectangles[i] = 4.0 / (1.0 + x*x);  // height of the i-th rectangle
    }
}
where, in the input,
float * rectangles is the array that stores all the values, and
int n is the precision: the bigger n is, the more accurate the solution will be.
This kernel gives us an array of float values, and this result will be our input to the reduce kernel.
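The original code for launching this kernel is not shown here; a minimal host-side sketch (the names N, rect_d, and rectangles_h are mine, not from the original) could look like this:

#include <cstdlib>
#include <cuda_runtime.h>

int N = 1 << 21;                                      // number of rectangles (precision)
float *rectangles_h = (float*)malloc(N * sizeof(float));
float *rect_d;
cudaMalloc((void**)&rect_d, N * sizeof(float));

int threads = 32;                                     // matches the 32-thread limit used below
int blocks  = (N + threads - 1) / threads;
calcRectangles<<<blocks, threads>>>(rect_d, N);
cudaDeviceSynchronize();

// bring the values back to the host, since the reduction function below expects a host array
cudaMemcpy(rectangles_h, rect_d, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(rect_d);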
Now let us prepare for the real deal
For each reduction I can only use 32 threads (a limitation of my graphics card), so in order to perform a large reduction I needed to use recursive kernel calls.
For example, to reduce an array of 2^21 elements

In the image we can see how an array of 2^21 elements is reduced to an array of 2^16 elements (each chunk of 32 elements collapses to a single value).
We then repeat the same process with the new array, producing a 2^11-element array, and so on until 31 elements or fewer remain.
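Concretely, each step divides the length by 32, so for 2^21 elements the sizes go 2^21 → 2^16 → 2^11 → 2^6 → 2, and the last 2 values are added on the host (the "less than 32" case in the code below).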
float calcReductionGeneral(int n, float *rectangles, int total, int type)
{
    clock_t start;   // vars to measure time
    clock_t end;
    float secondsTotal = 0;

    if (n < 32) {
        // If the length of the array is less than 32, we finish the reduction on the host
        start = clock();
        float pi = 0.0f;                 // must start at zero (it was uninitialized in the original)
        for (int i = 0; i < n; i++)
            pi = pi + rectangles[i];
        pi = pi / total;
        end = clock();
        secondsTotal = ((float)(end - start) / CLOCKS_PER_SEC);
        printf("pi = %f ", pi);
        return secondsTotal;
    }
    else {
        // reduction on GPU
        float *resParcial = (float*)malloc(sizeof(float) * n / 32);       // partial results, one per 32-element chunk
        for (int i = 0; i < n / 32; i++) {
            /* i.e. if we have 64 rectangles we make 2 kernel calls:
               the first call with elements 0..31, the second with elements 32..63 */
            float *inputParcial = (float*)malloc(sizeof(float) * 32);     // partial input to the kernel
            float *input_d;
            gpuErrchk(cudaMalloc((void**)&input_d, 32 * sizeof(float)));  // GPU allocation
            copia(inputParcial, rectangles, i);   // copy the i-th chunk of 32 rectangles into the partial input
            cudaMemcpy(input_d, inputParcial, sizeof(float) * 32, cudaMemcpyHostToDevice); // move the partial input to the GPU

            float *output;
            gpuErrchk(cudaMalloc((void**)&output, 32 * sizeof(float)));   // GPU memory for the result
            float *resParcial132 = (float*)malloc(sizeof(float) * 32);    // host memory for the partial result

            start = clock();  // measure time
            switch (type) {   // call the kernel selected by the type parameter
                // note: the third launch parameter is the shared-memory size in bytes,
                // so it must be 32*sizeof(float), not 32
                case 0: reduce0<<<1, 32, 32 * sizeof(float)>>>(input_d, output); break;
                case 1: reduce1<<<1, 32, 32 * sizeof(float)>>>(input_d, output); break;
                case 2: reduce2<<<1, 32, 32 * sizeof(float)>>>(input_d, output); break;
                default: break;
            }
            end = clock();
            secondsTotal = secondsTotal + ((float)(end - start) / CLOCKS_PER_SEC);

            gpuErrchk(cudaThreadSynchronize());
            gpuErrchk(cudaMemcpy(resParcial132, output, sizeof(float) * 32, cudaMemcpyDeviceToHost)); // move the result to the host
            resParcial[i] = resParcial132[0];

            // clean up
            free(inputParcial);
            free(resParcial132);
            cudaFree(input_d);
            cudaFree(output);
        }
        float rest = calcReductionGeneral(n / 32, resParcial, total, type);  // recursive call
        free(resParcial);
        return rest + secondsTotal;
    }
}
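Two things used above are not shown in the snippet: the gpuErrchk error-checking macro and the copia helper that copies the i-th chunk of 32 elements. A plausible sketch of both (my reconstruction, not necessarily the original code) is:

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// standard CUDA error-checking macro: abort with a message if a CUDA call fails
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
        exit(code);
    }
}

// copy the i-th chunk of 32 elements of src into dst
void copia(float *dst, float *src, int i)
{
    for (int j = 0; j < 32; j++)
        dst[j] = src[i * 32 + j];
}

With everything in place, and reusing the names from the earlier sketch, the whole computation is just calcRectangles followed by, for example, calcReductionGeneral(N, rectangles_h, N, 0) for reduce0.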
__global__ void reduce0(float *g_idata, float *g_odata)
{
    extern __shared__ float sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // do reduction in shared mem
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        if (tid % (2 * s) == 0) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void reduce1(float *g_idata, float *g_odata)
{
    extern __shared__ float sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s = 1; s < blockDim.x; s *= 2) {
        int index = 2 * s * tid;
        if (index < blockDim.x) {
            sdata[index] += sdata[index + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
__global__ void reduce2(float *g_idata, float *g_odata)
{
    extern __shared__ float sdata[];
    // each thread loads one element from global to shared mem
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // write result for this block to global mem
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
where reduce0, reduce1, and reduce2 are exactly the same algorithms Mark Harris uses (interleaved addressing with divergent branching, interleaved addressing without divergence, and sequential addressing, respectively).
Now, something to point out
pi = 3.141593 Time reduce0: 0.456775 seconds
pi = 3.141593 Time reduce1: 0.459540 seconds
pi = 3.141593 Time reduce2: 0.458745 seconds
The times of each algorithm for 4M elements are pretty much the same, whereas in the article “Optimizing Parallel Reduction in CUDA” the speedup from reduce0 to reduce1 is 2.33x.
So… yeah, I think they are selling a little bit of hot air here. BUT, my implementation is very different from Nvidia’s: I try to use all the threads all the time, while they don’t, sticking to 128 threads per block because that is where they achieve peak performance, and so on.
The main structure of the program is the same, a tree-based reduction that calls the kernel recursively, and yet the results are very different.
You can clone my entire code:
git clone https://adri1988@bitbucket.org/adri1988/picudareduction.git