CUDA kernel for-loop performance

Hello,

I have a CUDA kernel that executes a for-loop which gives the correct results when comparing to HOST code version. However, the performance is spotty at best. Can anyone tell me if it is possible to define the for-loop in the CUDA kernel such that it is better parallelized? I would like to completely remove the for-loop from the kernel if possible.

ANY help/hints for how to increase performance would be appreciated.

The code is posted below:

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

// DEVICE code:

__global__ void myKern(const unsigned int N, const float ep,
                                       cuComplex *__restrict__ d_a, cuComplex *__restrict__ d_b){
  // global thread ID
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = 2 * i;

  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y = b*cosf(theta); k++;
  }
  
  if(i >= L){
    // if odd
    if(r) d_a[k++].x = -a;
  }

  d_b[0].x = r ? 1.0f : 1.0f / sqrtf(1.0f + ep*ep);

  // ****************************************************************
  // PROBLEM: Performance is horrible here - COSTLY FOR-LOOP
  // ****************************************************************
  // Can this for-loop be removed or defined differently to generate
  // better performance for the CUDA kernel ?
  for(; i < N; i++){
    d_b[0].x *= d_a[i].x;
    d_b[0].y *= d_a[i].y;
  }
  // ****************************************************************
}

// HOST code:

int main() {

  const float ep = 0.01f;
  const unsigned int N = 8;

  cuComplex *d_a, *d_b;
  cuComplex *h_a, h_b;
  cuComplex chk_b;

  h_b.x = h_b.y = 0.0f;

  h_a = (cuComplex*)malloc(N*sizeof(cuComplex));
  cudaMalloc((void**)&d_a, N*sizeof(cuComplex));
  cudaMalloc((void**)&d_b, sizeof(cuComplex));

  // Randomly generate array values
  h_a = genRandom(N);

  // copy to DEVICE
  cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

  // call CUDA kernel
  unsigned int tpb = 1024;
  unsigned int nblocks = (N + tpb - 1) / tpb;
  myKern<<<nblocks,tpb>>>(N, ep, d_a, d_b);

  // copy results to HOST
  cudaMemcpy(&chk_b, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);

  // Execute HOST-version of DEVICE code
  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;

  unsigned int i = 0;
  unsigned int k = 0;
  
  for(; i < L; i++){
    float theta = (float)(2*i + 1) + N - 1);
    h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
    h_a[k].x = a*cosf(theta); h_a[k].y = b*cosf(theta); k++;
  }
  
  // if odd
  if(r) h_a[k++].x = -a;

  h_b.x = r ? 1.0f : 1.0f / sqrtf(1.0f + ep*ep);
  for(i = 0; i < N; ++i){
    h_b.x *= h_a[i].x;
    h_b.y *= h_a[i].y;
  }

  // HOST and DEVICE values match
  printf("\nHOST: %0.6f, DEVICE: %0.6f\n", h_b.x, chk_b.x);

  free(h_a);
  cudaFree(d_a);
  cudaFree(d_b);

  return 0;
}

I apologize for the long post of code.

The mentioned loop in your kernel contains a race condition. And it is does not perform the same calculations as the host code.

What you need is a reduction operation.

Thank you for the reply striker159.

I will move the for-loop in the CUDA kernel to a reduction. Do you have any recommendations for library(s) to use for reduction ? I was thinking of CUB.

Thanks again.

Maybe the following code is better to examine. I manage to get to get matching HOST and DEVICE values but the performance from the for-loop in the CUDA kernel is bad. Besides the race condition, is there any performance optimization(s) I could do?

Thanks again for any help.

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

__global__ void myKern(const unsigned int N, cuComplex *d_a, cuComplex *d_b){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = 2*i;
  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y = b*cosf(theta); k++;
  }

  // **********************************************************************
  // Performance: Can this for-loop be removed for better performance?
  for(; i < N; ++i){
    d_b[0].x *= d_a[i].x;
    d_b[0].y *= d_a[i].y;
  }
  // **********************************************************************

}


int main(){

float tol = 0.01f;
const unsigned int N = 5;
cuComplex *h_a, *d_a, chk;
cuComplex h_b, *d_b;

h_b.x = h_b.y = 0.0f;

h_a = (cuComplex*)malloc(N*sizeof(cuComplex));

h_a = genRandomArray(N);  // generates random array of cuComplex

cudaMalloc((void**)&d_a, N*sizeof(cuComplex));
cudaMalloc((void**)&d_b, sizeof(cuComplex));

// copy to DEVICE
cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &h_b, sizeof(cuComplex), cudaMemcpyHostToDevice);

// call CUDA kernel
unsigned int tpb = 1024;
unsigned int nblocks = (N + tpb - 1) / tpb;
myKern<<<nblocks,tpb>>>(N, d_a, d_b);

// copy from DEVICE
cudaMemcpy(&chk, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);

// execute HOST
float a = 0.5*(1.5 + 0.75);
float b = 0.5*(1.5 - 0.75);
unsigned int r = N % 2;
unsigned int L = (N - r) / 2;
unsigned int i = 0;
unsigned int k = 0;
for(; i < L; ++i){
  float theta = (float)(2*(i + 1) + N - 1);
  h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
  h_a[k].x = a*cosf(theta); h_a[k].y = b*cosf(theta); k++;
}

for(i = 0; i < N; ++i){
  h_b.x *= h_a[i].x;
  h_b.y *= h_a[i].y;
}

// compare HOST and DEVICE result
printf("HOST: %0.6f, DEVICE: %0.6f\n", h_b.x, chk.x);

free(h_a); 
cudaFree(d_a);
cudaFree(d_b);

}

You can maybe take a look at semaphore with atomicCAS.
Or you create an other kernel for the loop [0, N]

Thank you thomas.pegot for the reply. I really appreciate all the help/hints I am getting from this forum, you guys really are critical for learning CUDA performance.

The semaphore with atomicCAS could be a good thing to look into. It’s weird though that I get matching values for HOST and DEVICE - maybe that is just a happy coincidence? Do you know of a good example using atomicCAS for something along the lines of what I am using in the CUDA kernel lines 24-27?

I thought about trying something along the lines of the following in the CUDA kernel to replace lines 24-27:

...
if(i < N){
  d_b[0].x *= d_a[i].x;
  d_b[0].y *= d_a[i].y;
}
__syncthreads();
...

However, I believe it could still suffer from potential race condition.

Except if you create another kernel function that you allocate and send data to the device. So basically you send 2 times data to the device which can cost time. Ofcourse you need to cudaFree between both call.

Hi - only just joined this forum and saw this interesting post.
You notice how in the host code you run two loops - one to compute h_a and the second to perform the product reduction of h_a (starting with factor h_b - so why on line 40 have you initialised h_b to zero?)
Anyway, you need to do the same with the kernels. One kernel for computing h_a, and only once this is done can you perform the reduction. I am not sure why you have genRandomArray() when h_a is overwritten.

There are a few ways you can do the reduction. The below is an example that works well if N is a power of 2. Each kernel launch of myKern2 multiplies an index in the lower half of the array by an N/2 offset index in the upper half of the array. The kernel is call log2(N) times until on the final launch it is called with an M of 1. Then you copy out the first index and multiply by h_b.

This process will be vastly more efficient than CPU for N of say 2^20, but for small N CPU will be faster. Interestingly, when I run this code with N>64 the product reduction becomes close to zero.

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

__global__ void myKern1(const unsigned int N, cuComplex *d_a){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = 2*i;
  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y =  b*cosf(theta); k++;
  }
}

__global__ void myKern2(const unsigned int N, cuComplex *d_a){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i < N)
  {
      d_a[i].x *= d_a[N + i].x;
      d_a[i].y *= d_a[N + i].y;
  }
}

int main(){

//float tol = 0.01f;
const unsigned int N = 64;
cuComplex *h_a, *d_a, chk;
cuComplex h_b;

h_b.x = h_b.y = 1.0f;

h_a = (cuComplex*)malloc(N*sizeof(cuComplex));

//h_a = genRandomArray(N);  // generates random array of cuComplex

cudaMalloc((void**)&d_a, N*sizeof(cuComplex));

// copy to DEVICE
cudaMemcpy(d_a,  h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

// call CUDA kernel
unsigned int tpb = 1024;
unsigned int nblocks = (N + tpb - 1) / tpb;
myKern1<<<nblocks,tpb>>>(N, d_a);

unsigned int M = N;
while(M > 1)
{
    M /= 2;
    nblocks = (M + tpb - 1) / tpb;
    myKern2<<<nblocks,tpb>>>(M, d_a);
}

// copy from DEVICE
cudaMemcpy(&chk, d_a, sizeof(cuComplex), cudaMemcpyDeviceToHost);
chk.x *= h_b.x;
chk.y *= h_b.y;

// execute HOST
float a = 0.5*(1.5 + 0.75);
float b = 0.5*(1.5 - 0.75);
unsigned int r = N % 2;
unsigned int L = (N - r) / 2;
unsigned int i = 0;
unsigned int k = 0;
for(; i < L; ++i){
  float theta = (float)(2*(i + 1) + N - 1);
  h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
  h_a[k].x = a*cosf(theta); h_a[k].y =  b*cosf(theta); k++;
}

for(i = 0; i < N; ++i){
  h_b.x *= h_a[i].x;
  h_b.y *= h_a[i].y;
  printf("h_b = %g + %gi\n", h_b.x, h_b.y);
}

// compare HOST and DEVICE result
printf("HOST: %g, DEVICE: %g\n", h_b.x, chk.x);

free(h_a); 
cudaFree(d_a);

}

good job!

I think you need to use cuDoubleComplex else there is no point really to run it on GPU:

N:32
timeGPU(ms):0.010144
timeCPU (ms): 0.012357
HOST: 1.358799e-11, DEVICE: 1.358800e-11

N:1024
timeGPU(ms):0.010112
timeCPU (ms): 0.073378
HOST: 1.757215e-250, DEVICE: 1.757208e-250

Thanks greg.collecut for the information.

It never occurred to me to call the kernel multiple times from the CPU (lines 57-63), but it looks like that would do it for at least powers of 2.

Thanks again.

I believe you can replace:

M /= 2;

with

M = std::ceil(M / 2.0f);

For the case N not power of 2

N:63
timeGPU(ms):0.010176
timeCPU (ms): 0.012964
HOSTx: 0.000000e+00, DEVICE: 0.000000e+00
HOSTy: -0.000000e+00, DEVICE: -0.000000e+00

Thanks for all the help from everyone. You guys are great.

How about the following HOST C code? I just want to port the for-loop to DEVICE CUDA code such that the h_a array is not modified in the kernel just the h_b variable:

#include <cuComplex.h>
#include <stdio.h>
#include <cuda.h>

int main(){
  cuComplex *h_a, h_b;
  unsigned int N = 7;
  h_a = (cuComplex*)malloc(N*sizeof(cuComplex));
  h_b.x = h_b.y = 1.0f;

  // initialize h_a
  for(unsigned int i = 0; i < N; ++i){
    h_a[i].x = (float)(i + 1);
    h_a[i].y = 1.0f;
  }

  // Convert following to CUDA kernel ?
  for(unsigned int i = 0; i < N; ++i){
    h_b.x *= h_a[i].x;
    h_b.y *= h_a[i].y;
  }

  return 0;
}

I know this is probably easy for CUDA experts but it is not so for someone trying to learn. I appreciate all the patient responses.

I would say something like this but i am not an expert neither.

__global__ void myKern1(const unsigned int N, cuDoubleComplex d_b, const cuDoubleComplex* __restrict__ d_a){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < N)
  {
      d_b.x *= d_a[i].x;
      d_b.y *= d_a[i].y;
  }
}

const restrict hint the compiler that data is read-only so it’s allocated in L1 cache if I understood.

Hi Peter,

The reduction step you want to do is much the same as in the first example. Here is working code that uses your latest initialisation and performs the multiplication reduction on it. This time I’ve done it two different ways. The first is much the same as before - repeated kernel calls reducing half the array each time. The second way performs the reduction within one kernel launch, but doing the loop within the kernel and synchronizing threads after each half array reduction. However this method will only work for an array size that is no more than twice the number of threads within the block, and there must be only one block.

I am curious to know what sort of problem you are working that you want to perform a product reduction of an array!

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

// works on any size array, but must be called mutilple times to reduce the array
__global__ void kernelReduceProduct1(const unsigned int N, const unsigned int O, cuComplex *d_a)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int p = O + i;
    if(p < N)
    {
        d_a[i].x *= d_a[p].x;
        d_a[i].y *= d_a[p].y;
    }
}

// will only work for an array of size N which is <= 2x number threads in a block;
__global__ void kernelReduceProduct2(const unsigned int N, cuComplex *d_a)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    
    unsigned int M = N;
    while(M > 1)
    {
        unsigned int p = (M + 1)/2 + i;
        if(p < M)
        {
            d_a[i].x *= d_a[p].x;
            d_a[i].y *= d_a[p].y;
        }
        M = (M + 1)/2;
        __syncthreads();
    }
}

int main(){
    cuComplex *d_a, *h_a, h_b;
    unsigned int N = 7;

    h_a = (cuComplex*) malloc(N*sizeof(cuComplex));
    cudaMalloc((void**)&d_a, N*sizeof(cuComplex));

    h_b.x = h_b.y = 1.0f;
  
    // initialize h_a
    for(unsigned int i = 0; i < N; ++i){
        h_a[i].x = (float)(i + 1);
        h_a[i].y = 1.0f;
    }
    // copy to device
    cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

    // now reduce d_a
// ***** Use Method 1 *****
    while(N > 1)
    {
        unsigned int O = (N + 1)/2;
        kernelReduceProduct1<<< 1, 32>>>(N, O, d_a);
        N = (N + 1)/2;
    }
// ***** Or Method 2 *****
//    kernelReduceProduct2<<< 1, 32>>>(N, d_a);
//******************
    
    // copy first element of d_a back to h_a
    cudaMemcpy(h_a, d_a, sizeof(cuComplex), cudaMemcpyDeviceToHost);
    h_b.x *= h_a[0].x;
    h_b.y *= h_a[0].y;
    
    printf("h_b = %g + %gi\n", h_b.x, h_b.y);

    free(h_a); 
    cudaFree(d_a);
}

Thanks thomas.pegot, I will give your recommendation a try.

Thank you greg.collecut, the help is much appreciated.

Quick question: is there a way to perform the repeated kernel calls (method 1) within a kernel rather than in the HOST side? It seems that the overhead of launching a CUDA kernel multiple times would drag performance down.

As to the question about what problem I am working on that requires the product reduction of an array - I am attempting to convert Chebyshev filter for DSP to CUDA code. Not easy but interesting.

Thanks again.

Thank you to everyone who helped me with this problem. I ran across CUDA CUB which appears to have many of the features I am looking for - i.e. parallel reduction. A link to similar problem with solution follows:

https://devtalk.nvidia.com/default/topic/1062732/gpu-accelerated-libraries/cub-reduction-with-complex-number-and-multiplication/

Just in case anyone is looking for similar solution.

Thanks again.