Will calling a kernel from a kernel help on performance?

doscar.sole · June 12, 2022, 9:54am

Hello:

I have a doubt regarding performance on a CUDA code I’m working on.
So, I have a function calc that triggers an external kernel function kernel.

This kernel function expects three data array, and then it makes calculation on each paired item.
After calling that function, I want to perform some checks on the results, so for now the only way I have is to copy the memory from device to host, and then perform the checks on CPU.

I was wondering if it’s worth to somehow wrap all with a kernel function and then perform the checks there, as the kernel function is part of an external source and I don’t have direct access to freely modify it.

As an example, I created a very simillar situation with a minimal code to reproduce my case:

#include "stdio.h"
#include "stdlib.h"
#include "conio.h"
#include "time.h"

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

//memory setting
void h2d(void** device, void* host, int quantity, int type);
void d2h(void* host, void* device, int quantity);

//calculation
void operate(int* a, int* b, char* o, int x, int y, int count);

//CUDA calc
__global__ void kernel(int* a, int* b, char* o, int count);

int main(){
  //dimension
  int x = 16;
  int y = 16;
  int k = 4;
  int i = 0;
  int t = 0;
  
  //host
  int* a = (int*)malloc(sizeof(int) * k * k);
  int* b = (int*)malloc(sizeof(int) * k * k);
  char* o = (char*)malloc(sizeof(char) * k * k);

  //device
  int* da = NULL;
  int* db = NULL;
  char* dop = NULL;

  //random init
  srand(time(NULL));

  printf("Loading ints...\n");

  //memory check
  if (a == NULL || b == NULL || o == NULL) {
    printf("Memory error!\n");
    
    return -1;
  }

  for (i = 0; i < k * k; i++) {
    //a and b are [2, 1000]
    a[i] = rand() % (1000 - 2 + 1) + 2;
    b[i] = rand() % (1000 - 2 + 1) + 2;

    //t is [0, 3]
    t = rand() % 4;

    if (t == 0)
      o[i] = '+';
    else if (t == 1)
      o[i] = '-';
    else if (t == 2)
      o[i] = '*';
    else
      o[i] = '/';
  }

  printf("#op\ta\tb\to\n");

  for (i = 0; i < k * k; i++)
    printf("%i\t%i\t%i\t%c\n", i, a[i], b[i], o[i]);


  //move to device
  h2d((void**)&da, a, k * k, 0);
  h2d((void**)&db, b, k * k, 0);
  h2d((void**)&dop, o, k * k, 1);

  //call to CUDA
  operate(da, db, dop, x, y, k * k);

  //back to host
  d2h(a, da, k * k);


  //perform some additional operations
  /*
  for (i = 0; i < k * k; i++){
    //some code
  }

  //calculate again
  h2d(...)
  operate(...)
  d2h(...)

  for (i = 0; i < k * k; i++){
    //some code
  }

  //calculate again
  h2d(...)
  operate(...)
  d2h(...)
  ...
*/


  printf("#r\tr\n");

  for (i = 0; i < k * k; i++)
    printf("%i\t%i\n", i, a[i]);

  return 0;
}

//copies host memory to device
void h2d(void** device, void* host, int quantity, int type) {
  size_t sz;

  if (quantity <= 0) 
    return;

  if (type == 0)
    sz = sizeof(int);
  else if (type == 1)
    sz = sizeof(char);
  else
    return;

  cudaMalloc(device, sz * quantity);
  cudaMemcpy(*device, host, sz * quantity, cudaMemcpyHostToDevice);
}

//copies device to host memory
void d2h(void* host, void* device, int quantity) {
  if (quantity <= 0) 
    return;
    
  cudaMemcpy(host, device, sizeof(int) * quantity, cudaMemcpyDeviceToHost);
}

//operates
void operate(int* a, int* b, char* o, int x, int y, int count) {
  if (x <= 0 || y <= 0)
    return;

  if (count <= 0)
    return;

  kernel<<<x, y>>>(a, b, o, count);

  //wait to sync
  cudaDeviceSynchronize();
}

__global__ void kernel(int* a, int* b, char* o, int count) {
  int i = 0;
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int inc = blockDim.x * gridDim.x;

  //https://developer.nvidia.com/blog/even-easier-introduction-cuda/
  for (i = idx;i < count;i += inc){
    if (o[i] == '+')
      a[i] += b[i];
    else if (o[i] == '-')
      a[i] -= b[i];
    else if (o[i] == '*')
      a[i] *= b[i];
    else if (o[i] == '/')
      a[i] /= b[i];
    else
      continue;
  }
}

On my situation, I want to check and ammend a values after cudaDeviceSynchronize() then call again the kernel (this can repeat several times), so what I was wondering is if something like this would improve the performance for larger data quantities:

//operates
void operate(int* a, int* b, char* o, int x, int y, int count) {
  if (x <= 0 || y <= 0)
    return;

  if (count <= 0)
    return;

  customKernel<<<1, 1>>>(a, b, o, x, y, count);

  //wait to sync
  cudaDeviceSynchronize();
}

__global__ customKernel(int* a, int* b, char* o, int x, int y, int count){
  if(blockIdx.x == 0 && threadIdx.x == 0){
    //only do it once
    kernel<<<x, y>>>(a, b, o);

    //wait to sync
    cudaDeviceSynchronize();

    //additional operations
    /*
    for (i = 0; i < count; i++){
      //some code
    }

    kernel<<<x, y>>>(a, b, o);

    //wait to sync
    cudaDeviceSynchronize();

    ...
    */
  }
}

So, as it does not need to syncronize memory back and forth and all the job is done on GPU, does it will improve the performance? or as it will be doing some kernel operations it won’t?

Thanks.

Robert_Crovella · June 13, 2022, 3:52am

I don’t know what “external” means.

Why can’t you launch another kernel to do the data checking (isn’t that what you’re suggesting, anyway)? I don’t see any reason that this data checking kernel must be launched from a kernel. You can launch it from host code.

doscar.sole:

__global__ customKernel(int* a, int* b, char* o, int x, int y, int count){
  if(blockIdx.x == 0 && threadIdx.x == 0){
    //only do it once
    kernel<<<x, y>>>(a, b, o);

    //wait to sync
    cudaDeviceSynchronize();

This methodology is deprecated in newer CUDA versions (~~11.7 and beyond, at least~~). It seems it was deprecated in CUDA 11.6. Also see here

doscar.sole · June 13, 2022, 6:52am

Hi Robert:

for “external” I mean that is not my code, so it’s not likely to be changed at my will.

As far I knew, calling kernels needs some time on the CPU side, so calling 5-6 kernels each time will drain performance (or at least that happened to me on the past).

I just tested calling calc function 5 times in a row and looks like the most “expensive” function is the memory copy, so I guess I can just keep calling some kernels :D

As reference, the last time I did several kernel calls I had the opposite situation (more than 50% of the execution time was spent on launching kernels), so I was starting with that information and that’s why I was avoiding calling more than one kernel per iteration.

I will check it, as I’m working with 11.7 (I think is the latest version on Windows as I installed it last week aside with visual studio 2022.

Thanks for your time Robert!

Robert_Crovella · June 13, 2022, 1:46pm

To a first order approximation, calling kernels from device code has a similar overhead to calling kernels from host code.

doscar.sole · June 13, 2022, 3:48pm

Thanks again Robert.
With this clarification my main doubt is solved.

jerezjulio0 · June 14, 2022, 4:45pm

Why can’t you launch another kernel to do the data checking (isn’t that what you’re suggesting, anyway)? I don’t see any reason that this data checking kernel must be launched from a kernel. You can launch it from host code.

there are very good reasons why. imagine an algorithm like quick sort.
you can have a kernel that figure out the split point, them in with that split value you can launch two kerner each with the correct a number of blocks,

not having that ability, you will have to assume the block is the same are the max and the child kerene have to do the check endt up doing a very large amount of unnecessary work, or alternatively you will have to Sun on the cpu size, making the algorithm worthless. Either way, it does cost both in performance and time consuming refactoring code…

a few years ago, Nvidia made a big deal of the functionality, but now by crippling the ability of syncing child kernels, it makes it only suitable for a very reduced set of algorithms. The expectation was not to eliminate cudaDeviceSynchronize but instead add one the sync on child streams.

These changes are a very serious set back for people who had placed a lot of time and resources on these functionalities just to find out that it will one take a user to download the next cuda SDK and either your app stops working or some other app stops working.

But I guess it is what it is.

Robert_Crovella · June 14, 2022, 5:01pm

I think this is a different case, and you seem to be conflating different things. The deprecation doesn’t preclude the possibility of a nested kernel launch, even a nested kernel launch that depends on arguments computed in the parent kernel. I think it is possible to write a CDP quick sort under the deprecation (it’s going to look different). However, I didn’t mean to communicate that the deprecation was of no importance, but for the stated use case provided by OP it didn’t seem to be (much of) a limitation.

Yes, I agree that the deprecation implies that there are certain things that either can’t be done or must be done differently.

One of the reasons that I included multiple links is that I am hopeful something will pan out based on this:

The cudaDeviceSynchronize() function used for on-device fork/join parallelism is deprecated in preparation for a replacement programming model with higher performance.

(emphasis added)

So I am hopeful that all is not lost. Apart from the discussion of syntax, if you’ve ever made serious use of CDP, there is a good chance you’ve been disappointed with performance. There are numerous forum postings to this effect. And I know you didn’t imply this, but I think its worth stating in this context that a CDP quick sort is a dismal implementation. So maybe the CUDA developers are paying attention to this, realizing that CDP didn’t fulfill everyone’s desires, and are trying to do better. That is my hopeful read of the situation.

Yes, deprecations can be painful.

jerezjulio0 · June 14, 2022, 5:50pm

no, I do not think I am conflating anything. It is quite simple,
you have code base that compile and run correctly in one SDK, one a user is forced to run with a newer sdk because he is using nvidia software and
report that he gets tons of messages of deprecated functions. Who do you think even a single problem will be screwed to?

The warning states that the function is deprecated and will be removed, That is the kind of message that is enough for a user to lose all confidence in your product.

not to mention that it sends you back to month and month of refactoring time.

The idea that there will be a better programming model, is not what you said before all over the forum in answer to similar question and even now,
Instead you are telling people to go back to a more arcane model which is to just launch kernels from host when nVidia, for years, has made a
crusade that secret of high performance is to eliminate as many host synchronization as possible.

system · June 28, 2022, 5:50pm

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.

Robert_Crovella · December 23, 2022, 3:51pm

CUDA 12 has introduced new CDP functionality. This may also be of interest.

Topic		Replies	Views
Silent kernel failure CUDA Programming and Performance	25	8279	May 18, 2020
Cuda code performance CUDA Programming and Performance	14	3129	December 16, 2014
What can't you do in CUDA that you'd like? Requests for the future CUDA Programming and Performance	407	134565	May 26, 2010
cudaDeviceSynchronize from device code is deprecated CUDA Programming and Performance	15	6736	March 18, 2024
Passing dynamically allocated memory in kernel to sub kernel via dynamic parallelism CUDA Programming and Performance	6	690	May 3, 2019
cant call any kernel function CUDA Programming and Performance	8	4831	June 6, 2011
CUDA Memory Consistency CUDA Programming and Performance	23	55503	March 8, 2007
need a help from employees or guys who know compiler well CUDA Programming and Performance	22	8616	December 18, 2008
CUDA very slow performance CUDA Programming and Performance	21	16622	March 6, 2020
The Cuda 5 Second execution-time limit Finding a the way to work around the GDI timeout CUDA Programming and Performance	24	12712	July 26, 2010

Will calling a kernel from a kernel help on performance?

Related topics