CUDA kernel for-loop performance

peterSteele123 · September 4, 2019, 5:49pm

Hello,

I have a CUDA kernel that executes a for-loop which gives the correct results when comparing to HOST code version. However, the performance is spotty at best. Can anyone tell me if it is possible to define the for-loop in the CUDA kernel such that it is better parallelized? I would like to completely remove the for-loop from the kernel if possible.

ANY help/hints for how to increase performance would be appreciated.

The code is posted below:

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

// DEVICE code:

__global__ void myKern(const unsigned int N, const float ep,
                                       cuComplex *__restrict__ d_a, cuComplex *__restrict__ d_b){
  // global thread ID
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = 2 * i;

  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y = b*cosf(theta); k++;
  }
  
  if(i >= L){
    // if odd
    if(r) d_a[k++].x = -a;
  }

  d_b[0].x = r ? 1.0f : 1.0f / sqrtf(1.0f + ep*ep);

  // ****************************************************************
  // PROBLEM: Performance is horrible here - COSTLY FOR-LOOP
  // ****************************************************************
  // Can this for-loop be removed or defined differently to generate
  // better performance for the CUDA kernel ?
  for(; i < N; i++){
    d_b[0].x *= d_a[i].x;
    d_b[0].y *= d_a[i].y;
  }
  // ****************************************************************
}

// HOST code:

int main() {

  const float ep = 0.01f;
  const unsigned int N = 8;

  cuComplex *d_a, *d_b;
  cuComplex *h_a, h_b;
  cuComplex chk_b;

  h_b.x = h_b.y = 0.0f;

  h_a = (cuComplex*)malloc(N*sizeof(cuComplex));
  cudaMalloc((void**)&d_a, N*sizeof(cuComplex));
  cudaMalloc((void**)&d_b, sizeof(cuComplex));

  // Randomly generate array values
  h_a = genRandom(N);

  // copy to DEVICE
  cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

  // call CUDA kernel
  unsigned int tpb = 1024;
  unsigned int nblocks = (N + tpb - 1) / tpb;
  myKern<<<nblocks,tpb>>>(N, ep, d_a, d_b);

  // copy results to HOST
  cudaMemcpy(&chk_b, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);

  // Execute HOST-version of DEVICE code
  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;

  unsigned int i = 0;
  unsigned int k = 0;
  
  for(; i < L; i++){
    float theta = (float)(2*i + 1) + N - 1);
    h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
    h_a[k].x = a*cosf(theta); h_a[k].y = b*cosf(theta); k++;
  }
  
  // if odd
  if(r) h_a[k++].x = -a;

  h_b.x = r ? 1.0f : 1.0f / sqrtf(1.0f + ep*ep);
  for(i = 0; i < N; ++i){
    h_b.x *= h_a[i].x;
    h_b.y *= h_a[i].y;
  }

  // HOST and DEVICE values match
  printf("\nHOST: %0.6f, DEVICE: %0.6f\n", h_b.x, chk_b.x);

  free(h_a);
  cudaFree(d_a);
  cudaFree(d_b);

  return 0;
}

I apologize for the long post of code.

striker159 · September 4, 2019, 6:01pm

The mentioned loop in your kernel contains a race condition. And it is does not perform the same calculations as the host code.

What you need is a reduction operation.

peterSteele123 · September 4, 2019, 6:11pm

Thank you for the reply striker159.

I will move the for-loop in the CUDA kernel to a reduction. Do you have any recommendations for library(s) to use for reduction ? I was thinking of CUB.

Thanks again.

peterSteele123 · September 4, 2019, 9:16pm

Maybe the following code is better to examine. I manage to get to get matching HOST and DEVICE values but the performance from the for-loop in the CUDA kernel is bad. Besides the race condition, is there any performance optimization(s) I could do?

Thanks again for any help.

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

__global__ void myKern(const unsigned int N, cuComplex *d_a, cuComplex *d_b){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = 2*i;
  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y = b*cosf(theta); k++;
  }

  // **********************************************************************
  // Performance: Can this for-loop be removed for better performance?
  for(; i < N; ++i){
    d_b[0].x *= d_a[i].x;
    d_b[0].y *= d_a[i].y;
  }
  // **********************************************************************

}


int main(){

float tol = 0.01f;
const unsigned int N = 5;
cuComplex *h_a, *d_a, chk;
cuComplex h_b, *d_b;

h_b.x = h_b.y = 0.0f;

h_a = (cuComplex*)malloc(N*sizeof(cuComplex));

h_a = genRandomArray(N);  // generates random array of cuComplex

cudaMalloc((void**)&d_a, N*sizeof(cuComplex));
cudaMalloc((void**)&d_b, sizeof(cuComplex));

// copy to DEVICE
cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, &h_b, sizeof(cuComplex), cudaMemcpyHostToDevice);

// call CUDA kernel
unsigned int tpb = 1024;
unsigned int nblocks = (N + tpb - 1) / tpb;
myKern<<<nblocks,tpb>>>(N, d_a, d_b);

// copy from DEVICE
cudaMemcpy(&chk, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);

// execute HOST
float a = 0.5*(1.5 + 0.75);
float b = 0.5*(1.5 - 0.75);
unsigned int r = N % 2;
unsigned int L = (N - r) / 2;
unsigned int i = 0;
unsigned int k = 0;
for(; i < L; ++i){
  float theta = (float)(2*(i + 1) + N - 1);
  h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
  h_a[k].x = a*cosf(theta); h_a[k].y = b*cosf(theta); k++;
}

for(i = 0; i < N; ++i){
  h_b.x *= h_a[i].x;
  h_b.y *= h_a[i].y;
}

// compare HOST and DEVICE result
printf("HOST: %0.6f, DEVICE: %0.6f\n", h_b.x, chk.x);

free(h_a); 
cudaFree(d_a);
cudaFree(d_b);

}

thomas.pegot · September 4, 2019, 10:12pm

You can maybe take a look at semaphore with atomicCAS.
Or you create an other kernel for the loop [0, N]

peterSteele123 · September 4, 2019, 10:33pm

Thank you thomas.pegot for the reply. I really appreciate all the help/hints I am getting from this forum, you guys really are critical for learning CUDA performance.

The semaphore with atomicCAS could be a good thing to look into. It’s weird though that I get matching values for HOST and DEVICE - maybe that is just a happy coincidence? Do you know of a good example using atomicCAS for something along the lines of what I am using in the CUDA kernel lines 24-27?

I thought about trying something along the lines of the following in the CUDA kernel to replace lines 24-27:

...
if(i < N){
  d_b[0].x *= d_a[i].x;
  d_b[0].y *= d_a[i].y;
}
__syncthreads();
...

However, I believe it could still suffer from potential race condition.

thomas.pegot · September 5, 2019, 12:32am

I thought about trying something along the lines of the following in the CUDA kernel to replace lines 24-27:
...
if(i < N){
  d_b[0].x *= d_a[i].x;
  d_b[0].y *= d_a[i].y;
}
__syncthreads();
...
However, I believe it could still suffer from potential race condition.

Except if you create another kernel function that you allocate and send data to the device. So basically you send 2 times data to the device which can cost time. Ofcourse you need to cudaFree between both call.

greg.collecutt · September 5, 2019, 6:10am

Hi - only just joined this forum and saw this interesting post.
You notice how in the host code you run two loops - one to compute h_a and the second to perform the product reduction of h_a (starting with factor h_b - so why on line 40 have you initialised h_b to zero?)
Anyway, you need to do the same with the kernels. One kernel for computing h_a, and only once this is done can you perform the reduction. I am not sure why you have genRandomArray() when h_a is overwritten.

There are a few ways you can do the reduction. The below is an example that works well if N is a power of 2. Each kernel launch of myKern2 multiplies an index in the lower half of the array by an N/2 offset index in the upper half of the array. The kernel is call log2(N) times until on the final launch it is called with an M of 1. Then you copy out the first index and multiply by h_b.

This process will be vastly more efficient than CPU for N of say 2^20, but for small N CPU will be faster. Interestingly, when I run this code with N>64 the product reduction becomes close to zero.

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

__global__ void myKern1(const unsigned int N, cuComplex *d_a){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i >= N) return;

  float a = 0.5*(1.5 + 0.75);
  float b = 0.5*(1.5 - 0.75);
  unsigned int r = N % 2;
  unsigned int L = (N - r) / 2;
  unsigned int k = 2*i;
  if(i < L){
    float theta = (float)(2*(i + 1) + N - 1);
    d_a[k].x = a*cosf(theta); d_a[k].y = -b*cosf(theta); k++;
    d_a[k].x = a*cosf(theta); d_a[k].y =  b*cosf(theta); k++;
  }
}

__global__ void myKern2(const unsigned int N, cuComplex *d_a){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  // check range
  if(i < N)
  {
      d_a[i].x *= d_a[N + i].x;
      d_a[i].y *= d_a[N + i].y;
  }
}

int main(){

//float tol = 0.01f;
const unsigned int N = 64;
cuComplex *h_a, *d_a, chk;
cuComplex h_b;

h_b.x = h_b.y = 1.0f;

h_a = (cuComplex*)malloc(N*sizeof(cuComplex));

//h_a = genRandomArray(N);  // generates random array of cuComplex

cudaMalloc((void**)&d_a, N*sizeof(cuComplex));

// copy to DEVICE
cudaMemcpy(d_a,  h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

// call CUDA kernel
unsigned int tpb = 1024;
unsigned int nblocks = (N + tpb - 1) / tpb;
myKern1<<<nblocks,tpb>>>(N, d_a);

unsigned int M = N;
while(M > 1)
{
    M /= 2;
    nblocks = (M + tpb - 1) / tpb;
    myKern2<<<nblocks,tpb>>>(M, d_a);
}

// copy from DEVICE
cudaMemcpy(&chk, d_a, sizeof(cuComplex), cudaMemcpyDeviceToHost);
chk.x *= h_b.x;
chk.y *= h_b.y;

// execute HOST
float a = 0.5*(1.5 + 0.75);
float b = 0.5*(1.5 - 0.75);
unsigned int r = N % 2;
unsigned int L = (N - r) / 2;
unsigned int i = 0;
unsigned int k = 0;
for(; i < L; ++i){
  float theta = (float)(2*(i + 1) + N - 1);
  h_a[k].x = a*cosf(theta); h_a[k].y = -b*cosf(theta); k++;
  h_a[k].x = a*cosf(theta); h_a[k].y =  b*cosf(theta); k++;
}

for(i = 0; i < N; ++i){
  h_b.x *= h_a[i].x;
  h_b.y *= h_a[i].y;
  printf("h_b = %g + %gi\n", h_b.x, h_b.y);
}

// compare HOST and DEVICE result
printf("HOST: %g, DEVICE: %g\n", h_b.x, chk.x);

free(h_a); 
cudaFree(d_a);

}

thomas.pegot · September 5, 2019, 11:15am

good job!

I think you need to use cuDoubleComplex else there is no point really to run it on GPU:

N:32
timeGPU(ms):0.010144
timeCPU (ms): 0.012357
HOST: 1.358799e-11, DEVICE: 1.358800e-11

N:1024
timeGPU(ms):0.010112
timeCPU (ms): 0.073378
HOST: 1.757215e-250, DEVICE: 1.757208e-250

peterSteele123 · September 5, 2019, 11:54am

Thanks greg.collecut for the information.

It never occurred to me to call the kernel multiple times from the CPU (lines 57-63), but it looks like that would do it for at least powers of 2.

Thanks again.

thomas.pegot · September 5, 2019, 12:27pm

I believe you can replace:

M /= 2;

with

M = std::ceil(M / 2.0f);

For the case N not power of 2

N:63
timeGPU(ms):0.010176
timeCPU (ms): 0.012964
HOSTx: 0.000000e+00, DEVICE: 0.000000e+00
HOSTy: -0.000000e+00, DEVICE: -0.000000e+00

peterSteele123 · September 5, 2019, 1:16pm

Thanks for all the help from everyone. You guys are great.

How about the following HOST C code? I just want to port the for-loop to DEVICE CUDA code such that the h_a array is not modified in the kernel just the h_b variable:

#include <cuComplex.h>
#include <stdio.h>
#include <cuda.h>

int main(){
  cuComplex *h_a, h_b;
  unsigned int N = 7;
  h_a = (cuComplex*)malloc(N*sizeof(cuComplex));
  h_b.x = h_b.y = 1.0f;

  // initialize h_a
  for(unsigned int i = 0; i < N; ++i){
    h_a[i].x = (float)(i + 1);
    h_a[i].y = 1.0f;
  }

  // Convert following to CUDA kernel ?
  for(unsigned int i = 0; i < N; ++i){
    h_b.x *= h_a[i].x;
    h_b.y *= h_a[i].y;
  }

  return 0;
}

I know this is probably easy for CUDA experts but it is not so for someone trying to learn. I appreciate all the patient responses.

thomas.pegot · September 6, 2019, 11:50am

I would say something like this but i am not an expert neither.

__global__ void myKern1(const unsigned int N, cuDoubleComplex d_b, const cuDoubleComplex* __restrict__ d_a){
  unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < N)
  {
      d_b.x *= d_a[i].x;
      d_b.y *= d_a[i].y;
  }
}

const restrict hint the compiler that data is read-only so it’s allocated in L1 cache if I understood.

greg.collecutt · September 6, 2019, 12:08pm

Hi Peter,

The reduction step you want to do is much the same as in the first example. Here is working code that uses your latest initialisation and performs the multiplication reduction on it. This time I’ve done it two different ways. The first is much the same as before - repeated kernel calls reducing half the array each time. The second way performs the reduction within one kernel launch, but doing the loop within the kernel and synchronizing threads after each half array reduction. However this method will only work for an array size that is no more than twice the number of threads within the block, and there must be only one block.

I am curious to know what sort of problem you are working that you want to perform a product reduction of an array!

#include <stdio.h>
#include <iostream>
#include <cuComplex.h>

// works on any size array, but must be called mutilple times to reduce the array
__global__ void kernelReduceProduct1(const unsigned int N, const unsigned int O, cuComplex *d_a)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int p = O + i;
    if(p < N)
    {
        d_a[i].x *= d_a[p].x;
        d_a[i].y *= d_a[p].y;
    }
}

// will only work for an array of size N which is <= 2x number threads in a block;
__global__ void kernelReduceProduct2(const unsigned int N, cuComplex *d_a)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    
    unsigned int M = N;
    while(M > 1)
    {
        unsigned int p = (M + 1)/2 + i;
        if(p < M)
        {
            d_a[i].x *= d_a[p].x;
            d_a[i].y *= d_a[p].y;
        }
        M = (M + 1)/2;
        __syncthreads();
    }
}

int main(){
    cuComplex *d_a, *h_a, h_b;
    unsigned int N = 7;

    h_a = (cuComplex*) malloc(N*sizeof(cuComplex));
    cudaMalloc((void**)&d_a, N*sizeof(cuComplex));

    h_b.x = h_b.y = 1.0f;
  
    // initialize h_a
    for(unsigned int i = 0; i < N; ++i){
        h_a[i].x = (float)(i + 1);
        h_a[i].y = 1.0f;
    }
    // copy to device
    cudaMemcpy(d_a, h_a, N*sizeof(cuComplex), cudaMemcpyHostToDevice);

    // now reduce d_a
// ***** Use Method 1 *****
    while(N > 1)
    {
        unsigned int O = (N + 1)/2;
        kernelReduceProduct1<<< 1, 32>>>(N, O, d_a);
        N = (N + 1)/2;
    }
// ***** Or Method 2 *****
//    kernelReduceProduct2<<< 1, 32>>>(N, d_a);
//******************
    
    // copy first element of d_a back to h_a
    cudaMemcpy(h_a, d_a, sizeof(cuComplex), cudaMemcpyDeviceToHost);
    h_b.x *= h_a[0].x;
    h_b.y *= h_a[0].y;
    
    printf("h_b = %g + %gi\n", h_b.x, h_b.y);

    free(h_a); 
    cudaFree(d_a);
}

peterSteele123 · September 6, 2019, 12:09pm

Thanks thomas.pegot, I will give your recommendation a try.

peterSteele123 · September 6, 2019, 12:24pm

Thank you greg.collecut, the help is much appreciated.

Quick question: is there a way to perform the repeated kernel calls (method 1) within a kernel rather than in the HOST side? It seems that the overhead of launching a CUDA kernel multiple times would drag performance down.

As to the question about what problem I am working on that requires the product reduction of an array - I am attempting to convert Chebyshev filter for DSP to CUDA code. Not easy but interesting.

Thanks again.

peterSteele123 · September 7, 2019, 1:51pm

Thank you to everyone who helped me with this problem. I ran across CUDA CUB which appears to have many of the features I am looking for - i.e. parallel reduction. A link to similar problem with solution follows:

[url]https://devtalk.nvidia.com/default/topic/1062732/gpu-accelerated-libraries/cub-reduction-with-complex-number-and-multiplication/[/url]

Just in case anyone is looking for similar solution.

Thanks again.

Topic		Replies	Views
'for' loop performance hacks? CUDA Programming and Performance	17	10435	February 28, 2009
CUDA very slow performance CUDA Programming and Performance	21	16462	March 6, 2020
Issue with Nested for-loops in kernel CUDA Programming and Performance	13	2271	April 4, 2019
Effective Parallelisation of CUDA C code CUDA Programming and Performance	38	1918	December 27, 2021
How to choose how many threads/blocks to have? CUDA Programming and Performance	43	51200	June 7, 2022
Cuda Latency problems Slow Cuda CUDA Programming and Performance	15	13928	September 5, 2008
how to implement double for loops in CUDA CUDA Programming and Performance	23	15748	January 30, 2012
Cuda program taking more time. CUDA Programming and Performance	15	7057	November 21, 2010
Using unified memory causes system crash CUDA Programming and Performance	28	5793	February 4, 2019
GPU/CPU precision comparison and Kernel instructions question CUDA Programming and Performance	5	672	April 4, 2017

CUDA kernel for-loop performance

Related topics