CUDA Dynamic Parallelism: the code doesn't run as expected

Hello everyone, I’ve recently been learning about CUDA Dynamic Parallelism.
The result of running this code is expected to be 0x20000000, but the actual result is 0x1000000. Can someone tell me why? Thanks.

The file cuda.cu

#include <iostream>
#include <memory>
#include <string>

uint8_t* Local_x = (uint8_t*)malloc(sizeof(uint8_t)* 0x800000);
uint8_t* Local_y = (uint8_t*)malloc(sizeof(uint8_t)* 0x200000);
struct xyIndex{
	uint32_t xLen; uint32_t xOffset; uint32_t yLen; uint32_t yOffset;
};
struct xyIndex* Local_z = (struct xyIndex*)malloc(sizeof(struct xyIndex)* 0x10000);

__device__ uint8_t SubTestAdd(uint8_t p0, uint8_t p1)
{
	return p0 + p1;
}

__global__ void TestAdd(uint32_t* Result, uint8_t* t_x, uint8_t* t_y)
{
	uint32_t i = threadIdx.x;
	uint32_t j = blockIdx.x;
	if (SubTestAdd(t_x[j], t_y[i]) == 2)
	{
		atomicAdd(Result, 2);
	}
	__threadfence();
}

__global__ void TestCalc(uint32_t* Result, uint8_t* t_x, uint8_t* t_y, xyIndex* t_z)
{
	uint32_t Ti = threadIdx.x + blockDim.x * (gridDim.x * blockIdx.y + blockIdx.x);
	cudaStream_t stream;
	cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
	TestAdd<<<t_z[Ti].xLen, t_z[Ti].yLen, 0, stream>>>(Result, &t_x[t_z[Ti].xOffset], &t_y[t_z[Ti].yOffset]);
	cudaStreamDestroy(stream);
	__threadfence();
}

int main()
{
	memset(Local_x, 1, 0x800000 * sizeof(uint8_t));
	memset(Local_y, 1, 0x200000 * sizeof(uint8_t));

	Local_z[0].xLen = 0x80;
	Local_z[0].xOffset = 0;
	Local_z[0].yLen = 0x20;
	Local_z[0].yOffset = 0;
	uint8_t initx = Local_x[0];
	uint8_t inity = Local_y[0];
	for (uint32_t i = 1; i < 0x10000; i++)
	{
		Local_z[i].xLen = 0x80;
		Local_z[i].xOffset = Local_z[i-1].xOffset + Local_z[i-1].xLen;
		Local_z[i].yLen = 0x20;
		Local_z[i].yOffset = Local_z[i-1].yOffset + Local_z[i-1].yLen;
	}

	uint8_t* dev_x;
	uint8_t* dev_y;
	xyIndex* dev_z;
	uint32_t* dev_ret;
	cudaMalloc((uint8_t**)&dev_x, sizeof(uint8_t) * 0x800000);
	cudaMalloc((uint8_t**)&dev_y, sizeof(uint8_t) * 0x200000);
	cudaMalloc((xyIndex**)&dev_z, sizeof(xyIndex) * 0x10000);
	cudaMemcpy(dev_x, Local_x, sizeof(uint8_t) * 0x800000, cudaMemcpyHostToDevice);
	cudaMemcpy(dev_y, Local_y, sizeof(uint8_t) * 0x200000, cudaMemcpyHostToDevice);
	cudaMemcpy(dev_z, Local_z, sizeof(xyIndex) * 0x10000, cudaMemcpyHostToDevice);

	cudaMalloc((uint32_t**)&dev_ret, sizeof(uint32_t));
	dim3 Gi(0x800, 0x20);
	TestCalc<<<Gi, 1>>>(dev_ret, dev_x, dev_y, dev_z);
	cudaDeviceSynchronize();

	uint32_t ret = 0;
	cudaMemcpy(&ret, dev_ret, sizeof(uint32_t), cudaMemcpyDeviceToHost);

	cudaFree(dev_ret);
	cudaFree(dev_x);
	cudaFree(dev_y);
	cudaFree(dev_z);
	free(Local_x);
	free(Local_y);
	free(Local_z);
}

When asking about CDP, it's now fairly important to give the compile command, as well as the CUDA version in use. This is because CDP has gone through some changes since its initial incarnation in the CUDA 6 timeframe.

I would also strongly encourage the use of proper CUDA error checking any time you are having trouble with a CUDA code. In addition, it's good practice to run your codes with compute-sanitizer. Regarding CUDA error checking in the CDP case, use of the device runtime API as well as device kernel launches can be checked in the same fashion that you would use in host code, and this is good practice as well. I will say these are to some degree “boiler-plate remarks” that I make in general. In this particular case, although the runtime error checking as well as compute-sanitizer will indicate reasons for concern, it may not be as instructive as one might like.
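As a minimal sketch of the kind of thing I mean (the CHECK macro name is just illustrative, not a CUDA API, and my_app stands in for your executable name):

#define CHECK(call) do { \
    cudaError_t e_ = (call); \
    if (e_ != cudaSuccess) \
        printf("CUDA error %s at %s:%d\n", cudaGetErrorString(e_), __FILE__, __LINE__); \
} while (0)

// host usage:
CHECK(cudaMemcpy(dev_x, Local_x, sizeof(uint8_t) * 0x800000, cudaMemcpyHostToDevice));
CHECK(cudaDeviceSynchronize());

and running under the sanitizer is just:

# compute-sanitizer ./my_app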

I will assume a recent CUDA version and the use of CDP2.

  1. This construction:

    TestAdd<<<t_z[Ti].xLen, t_z[Ti].yLen, 0, stream>>>(Result, &t_x[t_z[Ti].xOffset], &t_y[t_z[Ti].yOffset]);
    cudaStreamDestroy(stream);
    

    is basically not allowed. The reason is that in CDP2, having the parent kernel synchronize on child kernel completion is not allowed. Since the created stream is in use by the child kernel, the cudaStreamDestroy in the parent kernel must wait until completion of the child kernel. This violates the rule. Yes, this creates a lot of angst, and folks look for ways around it. If that rule is not enough, note this in the CDP section:

As it is not possible to explicitly synchronize child work from a parent thread, there is no way to guarantee that changes occurring in child grids are visible to threads within the parent grid.

  2. You may be thinking that you can launch an arbitrary amount of work from CDP. That is not the case. There is a launch pending limit, which by default is around 2048 outstanding kernel launches that CDP can track. Your code exceeds that quite a bit.

With a modification to your code to remove what I claim is the improper construct in item 1, and adding the device-side error-checking I mentioned, on CUDA 12.2, L4 GPU:

# cat t222.cu
#include <iostream>
#include <memory>
#include <string>

uint8_t* Local_x = (uint8_t*)malloc(sizeof(uint8_t)* 0x800000);
uint8_t* Local_y = (uint8_t*)malloc(sizeof(uint8_t)* 0x200000);
struct xyIndex{
        uint32_t xLen; uint32_t xOffset; uint32_t yLen; uint32_t yOffset;
};
struct xyIndex* Local_z = (struct xyIndex*)malloc(sizeof(struct xyIndex)* 0x10000);

__device__ uint8_t SubTestAdd(uint8_t p0, uint8_t p1)
{
        return p0 + p1;
}

__global__ void TestAdd(uint32_t* Result, uint8_t* t_x, uint8_t* t_y)
{
        uint32_t i = threadIdx.x;
        uint32_t j = blockIdx.x;
        if (SubTestAdd(t_x[j], t_y[i]) == 2)
        {
                atomicAdd(Result, 2);
        }
        __threadfence();
}

__global__ void TestCalc(uint32_t* Result, uint8_t* t_x, uint8_t* t_y, xyIndex* t_z)
{
        uint32_t Ti = threadIdx.x + blockDim.x * (gridDim.x * blockIdx.y + blockIdx.x);
        //cudaStream_t stream;
        //cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
        TestAdd<<<t_z[Ti].xLen, t_z[Ti].yLen, 0, cudaStreamFireAndForget>>>(Result, &t_x[t_z[Ti].xOffset], &t_y[t_z[Ti].yOffset]);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("device: %s\n", cudaGetErrorString(err));
        //cudaStreamDestroy(stream);
        //__threadfence();
}

int main()
{
        memset(Local_x, 1, 0x800000 * sizeof(uint8_t));
        memset(Local_y, 1, 0x200000 * sizeof(uint8_t));

        Local_z[0].xLen = 0x80;
        Local_z[0].xOffset = 0;
        Local_z[0].yLen = 0x20;
        Local_z[0].yOffset = 0;
        //uint8_t initx = Local_x[0];
        //uint8_t inity = Local_y[0];
        for (uint32_t i = 1; i < 0x10000; i++)
        {
                Local_z[i].xLen = 0x80;
                Local_z[i].xOffset = Local_z[i-1].xOffset + Local_z[i-1].xLen;
                Local_z[i].yLen = 0x20;
                Local_z[i].yOffset = Local_z[i-1].yOffset + Local_z[i-1].yLen;
        }

        uint8_t* dev_x;
        uint8_t* dev_y;
        xyIndex* dev_z;
        uint32_t* dev_ret;
        cudaMalloc((uint8_t**)&dev_x, sizeof(uint8_t) * 0x800000);
        cudaMalloc((uint8_t**)&dev_y, sizeof(uint8_t) * 0x200000);
        cudaMalloc((xyIndex**)&dev_z, sizeof(xyIndex) * 0x10000);
        cudaMemcpy(dev_x, Local_x, sizeof(uint8_t) * 0x800000, cudaMemcpyHostToDevice);
        cudaMemcpy(dev_y, Local_y, sizeof(uint8_t) * 0x200000, cudaMemcpyHostToDevice);
        cudaMemcpy(dev_z, Local_z, sizeof(xyIndex) * 0x10000, cudaMemcpyHostToDevice);

        cudaMalloc((uint32_t**)&dev_ret, sizeof(uint32_t));
        dim3 Gi(0x800, 0x20);
        //dim3 Gi(2070, 1);
        TestCalc<<<Gi, 1>>>(dev_ret, dev_x, dev_y, dev_z);
        cudaError_t err = cudaDeviceSynchronize();
        if (err != cudaSuccess) printf("host: %s\n", cudaGetErrorString(err));

        uint32_t ret = 0;
        cudaMemcpy(&ret, dev_ret, sizeof(uint32_t), cudaMemcpyDeviceToHost);

        cudaFree(dev_ret);
        cudaFree(dev_x);
        cudaFree(dev_y);
        cudaFree(dev_z);
        free(Local_x);
        free(Local_y);
        free(Local_z);
}
# nvcc -o t222 t222.cu -arch=sm_89 -rdc=true -lcudadevrt -lineinfo
# ./t222

I get a large amount of print-out like this:

device: launch failed because launch would exceed cudaLimitDevRuntimePendingLaunchCount

It’s entirely possible there are other issues with your code as well. That’s as far as I got, because addressing those issues would likely require significant refactoring of your code.


Thank you very much for pointing out the bug in my code!
I'm programming with CUDA 12.4 on a 4060 Ti 16GB GPU.
I designed the code that way because of some special data handling, like this:

uint8_t data_x[][0xa] = {
   0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23,
   0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67,
   0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab,

   0xcd, 0xef, 0xcd, 0xef, 0xcd, 0xef, 0xcd, 0xef, 0xcd, 0xef,
   0xfe, 0xdc, 0xfe, 0xdc, 0xfe, 0xdc, 0xfe, 0xdc, 0xfe, 0xdc,
   0xba, 0x98, 0xba, 0x98, 0xba, 0x98, 0xba, 0x98, 0xba, 0x98,
   0x76, 0x54, 0x76, 0x54, 0x76, 0x54, 0x76, 0x54, 0x76, 0x54,
   0x32, 0x10, 0x32, 0x10, 0x32, 0x10, 0x32, 0x10, 0x32, 0x10,

   0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23,
   0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67,

   0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab,
   0xcd, 0xef, 0xcd, 0xef, 0xcd, 0xef, 0xcd, 0xef, 0xcd, 0xef,
   0xfe, 0xdc, 0xfe, 0xdc, 0xfe, 0xdc, 0xfe, 0xdc, 0xfe, 0xdc,
   0xba, 0x98, 0xba, 0x98, 0xba, 0x98, 0xba, 0x98, 0xba, 0x98,

   0x76, 0x54, 0x76, 0x54, 0x76, 0x54, 0x76, 0x54, 0x76, 0x54,
   0x32, 0x10, 0x32, 0x10, 0x32, 0x10, 0x32, 0x10, 0x32, 0x10,
   ......
};

uint8_t data_y[][0x6] = {
    0x01, 0x02, 0x03, 0x01, 0x02, 0x03,
    0x11, 0x12, 0x13, 0x11, 0x12, 0x13,

    0x21, 0x22, 0x23, 0x21, 0x22, 0x23,
    0x31, 0x32, 0x33, 0x31, 0x32, 0x33,
    0x41, 0x42, 0x43, 0x41, 0x42, 0x43,

    0x51, 0x52, 0x53, 0x51, 0x52, 0x53,
    0x61, 0x62, 0x63, 0x61, 0x62, 0x63,
    0x71, 0x72, 0x73, 0x71, 0x72, 0x73,

    0x81, 0x82, 0x83, 0x81, 0x82, 0x83,
    0x91, 0x92, 0x93, 0x91, 0x92, 0x93,

    0xa1, 0xa2, 0xa3, 0xa1, 0xa2, 0xa3,
    0xb1, 0xb2, 0xb3, 0xb1, 0xb2, 0xb3,
	......
};

uint8_t data_index[0x10000][0x10] = {
	/*      xLen     */  /*    xOffset    */  /*      yLen     */  /*    yOffset    */
    0x03,0x00,0x00,0x00, 0x00,0x00,0x00,0x00, 0x02,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
    0x05,0x00,0x00,0x00, 0x03,0x00,0x00,0x00, 0x03,0x00,0x00,0x00, 0x02,0x00,0x00,0x00,
    0x02,0x00,0x00,0x00, 0x08,0x00,0x00,0x00, 0x03,0x00,0x00,0x00, 0x05,0x00,0x00,0x00,
    0x04,0x00,0x00,0x00, 0x0a,0x00,0x00,0x00, 0x02,0x00,0x00,0x00, 0x08,0x00,0x00,0x00,
	......
};

I need to take xLen, xOffset, yLen, and yOffset from data_index, and then, based on those values, take the corresponding number of rows from data_x and data_y at the corresponding offsets.
For example, from the first data_index entry, we get 3 data_x rows and 2 data_y rows:

   0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23,           //data_x[0]
   0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67,           //data_x[1]
   0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab,           //data_x[2]

   0x01, 0x02, 0x03, 0x01, 0x02, 0x03,                                   //data_y[0]
   0x11, 0x12, 0x13, 0x11, 0x12, 0x13,                                   //data_y[1]

Combining the data_x and data_y rows here in pairs gives six new 16-byte arrays.

uint8_t data_new[6][0x10] = {
   0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03,
   0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03,
   0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x01, 0x02, 0x03, 0x01, 0x02, 0x03,
   0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x01, 0x23, 0x11, 0x12, 0x13, 0x11, 0x12, 0x13,
   0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x45, 0x67, 0x11, 0x12, 0x13, 0x11, 0x12, 0x13,
   0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x89, 0xab, 0x11, 0x12, 0x13, 0x11, 0x12, 0x13,
};

Do other calculations on the new array…

Now I have a huge amount of similar data to process, and I've been trying for a long time: how do I design this to take advantage of GPU parallel computing?
Thanks again.

I don’t see any indication in your description that CDP would be needed. I suggest starting with a non-CDP implementation. You haven’t indicated any actual calculations you intend to do, and there is nothing in your data access that would prevent you from writing an ordinary kernel to access that data.

Hello. In his example, why does it still return an error even if I change it this way:

TestCalc<<<0x800, 0x20>>>(dev_ret, dev_x, dev_y, dev_z); // error " launch failed because launch would exceed cudaLimitDevRuntimePendingLaunchCount " 

Each active thread of the parent kernel launches a child kernel.

In the original formulation:

dim3 Gi(0x800, 0x20);
TestCalc<<<Gi, 1>>>(dev_ret, dev_x, dev_y, dev_z);

the parent kernel launch consists of a grid of 0x800 by 0x20 blocks, each of one thread. That is a total of 65,536 active parent threads, each of which wants to launch a child kernel. That far exceeds 2048.

In your revised formulation:

You have 0x800 blocks in the grid, each of which has 0x20 active threads. That is a total of 65,536 active parent threads, each of which wants to launch a child kernel. That far exceeds 2048.

Why should there be a difference between the two cases, as far as that error report goes, which indicates exceeding the pending launch limit of ~2048 launches?

Thanks, I got it. So 2048 is a limit on the number of pending kernel launches; is this restriction for the entire graphics card, or for a single stream?
I found this parameter in the CDP documentation, but it seems to have limitations. I also ran into errors when using it this way:

__global__ void TestCalc(uint32_t* Result, uint8_t* t_x, uint8_t* t_y, xyIndex* t_z)
{
        uint32_t Ti = threadIdx.x + blockDim.x * (gridDim.x * blockIdx.y + blockIdx.x);
        TestAdd<<<t_z[Ti].xLen, t_z[Ti].yLen, 0, cudaStreamFireAndForget>>>(Result, &t_x[t_z[Ti].xOffset], &t_y[t_z[Ti].yOffset]);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("device: 0 %s\n", cudaGetErrorString(err));//ok

        TestAdd<<<t_z[Ti+0x800].xLen, t_z[Ti+0x800].yLen, 0, cudaStreamTailLaunch>>>(Result, &t_x[t_z[Ti+0x800].xOffset], &t_y[t_z[Ti+0x800].yOffset]);
        err = cudaGetLastError();
        if (err != cudaSuccess) printf("device: 1 %s\n", cudaGetErrorString(err));//ok

        TestAdd<<<t_z[Ti+0x1000].xLen, t_z[Ti+0x1000].yLen, 0, cudaStreamTailLaunch>>>(Result, &t_x[t_z[Ti+0x1000].xOffset], &t_y[t_z[Ti+0x1000].yOffset]);
        err = cudaGetLastError();
        if (err != cudaSuccess) printf("device: 2 %s\n", cudaGetErrorString(err));//error

        TestAdd<<<t_z[Ti+0x1800].xLen, t_z[Ti+0x1800].yLen, 0, cudaStreamTailLaunch>>>(Result, &t_x[t_z[Ti+0x1800].xOffset], &t_y[t_z[Ti+0x1800].yOffset]);
        err = cudaGetLastError();
        if (err != cudaSuccess) printf("device: 3 %s\n", cudaGetErrorString(err));//error
}

Did I use it correctly?

The item in question is the pending launch limit, and it is documented in the CDP section of the programming guide. If you're going to be delving into this, my suggestion would be that you read the entire CDP section. Regarding the pending launch limit, my read of it does not say anything about streams; my read of it seems to imply that it pertains to the device as a whole. (By the way, I already linked this exact section in a previous reply in this thread.)

Regarding the number 2048, that is based on my memory. It may be wrong, or it may vary from one device type to another. Like any other device limit of this type, if you want to know what the pending launch limit is, you should use cudaDeviceGetLimit or similar.
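For example, a minimal host-side query sketch:

size_t pending_limit = 0;
cudaDeviceGetLimit(&pending_limit, cudaLimitDevRuntimePendingLaunchCount);
printf("pending launch limit: %zu\n", pending_limit);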

Let me preface my remarks:

  1. I have no idea what the intent of this code is
  2. A good way to start to answer that question is to do what you have demonstrated: A. do rigorous error checking and determine if any runtime errors are generated; B. run your code with compute-sanitizer and see if any errors are reported; C. determine if your code is producing correct results.
  3. Since you seem to have at least partially done step 2, and found that errors are generated, I think it is clear that something is “incorrect”.

More detail:

I like to think of the pending launch limit for device-launched kernels like this:

  1. Your code launches kernels. Those kernels do not start immediately; they go into a queue or “pile”. The more kernels you launch, the higher the pile will get.
  2. The GPU is attempting to perform work. As it completes work (i.e. as it completes kernels) then it will go to the pile and select more work to do. So as time goes on, the GPU activity tends to reduce the size of the pile.
  3. If at any point in time, the pile gets higher than the pending launch limit, then the error is generated.

That would seem to be what is happening in your case. The first two kernel launches that do not generate errors are putting kernel launches into the pile, but the height of the pile has not exceeded the pending launch limit. By the time you get to the 3rd kernel launch line, the effect of that is to cause the pile of unprocessed work to grow to the size where it exceeds the pending launch limit, and an error is generated.

If you kept the GPU (parent kernel) busy for a while before getting to your 3rd kernel launch point, you might wait long enough for the pile to get low enough that the 3rd kernel launch point would succeed.

If you increase the size of the pile limit, you might be able to get things to succeed without any other changes.

Let me just reiterate for future readers that this discussion applies to CDP, i.e. device-launched kernels. Kernels launched from host code follow a slightly different process, and the pending launch limit has no meaning there. Host-launched kernels similarly go into a “queue”, and when the queue depth is exceeded, rather than generating an error, subsequent kernel launches become synchronous rather than asynchronous, allowing the queue size to reduce and “open up space” for the next kernel to be deposited in the queue. The host thread effectively waits until space is available in the queue for the requested kernel launch.

Actually the number 2048 is currently documented as the default value for the pending launch limit. Given that this might change in the future, querying it might still be a good general practice.

So it could be increased with cudaDeviceSetLimit and cudaLimitDevRuntimePendingLaunchCount, within a certain range.

Yes, I imagine so, and that is what I referred to here:

and furthermore it's a documented suggestion, at the link I previously (twice, now) provided.

To wit:

The size of the fixed-size launch pool is configurable by calling cudaDeviceSetLimit() from the host and specifying cudaLimitDevRuntimePendingLaunchCount.
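As a minimal sketch of that (host code, issued before launching the parent kernel; the value 0x10000 is just an illustration):

cudaError_t limit_err = cudaDeviceSetLimit(cudaLimitDevRuntimePendingLaunchCount, 0x10000);
if (limit_err != cudaSuccess) printf("set limit: %s\n", cudaGetErrorString(limit_err));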


I should probably amend that. One relatively “obvious” area where it might be presumed that CDP is needed is if the grid dimensions are in device memory (only). A host-launched kernel would require these to be in host memory.

In that case, one possible approach would be to launch the work from the host, in a grid-stride loop kernel design, where some reasonable compromise is made for grid dimensions, and let the grid stride limit be picked up from device memory. In fact, if you are launching lots of these kernels, you might do well (from several perspectives) to find a way to “combine” like kernel launches, which have like work, but differing only in data extents, into a single host-launched kernel.
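As a rough sketch of what I mean (the kernel and variable names here are just placeholders, and the real mapping from loop index to your data would go where the comment is):

__global__ void CombinedKernel(uint32_t* Result, const uint32_t* total_work)
{
    // grid-stride loop: the loop bound *total_work is read from device memory,
    // so the host never has to copy it back before launching
    for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < *total_work; i += gridDim.x * blockDim.x)
    {
        // map i to the sub-problem it belongs to and do the real element-wise work here;
        // as a trivial stand-in, just count the elements processed
        atomicAdd(Result, 1);
    }
}

// host side: a fixed grid sized to fill the GPU; the data itself decides how much work gets done
// CombinedKernel<<<1024, 256>>>(dev_ret, dev_total);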

Or you could stay with the CDP design paradigm. But CDP clearly doesn’t have the generality and flexibility of the launch-from-host mechanism, where you can queue up arbitrary amounts of work, with very little attention paid to the organization or subdivision. So it will be necessary for some kind of “more active” design or management of the work issuance process in the CDP case, so as to avoid running into things like the launch pending limit, etc.

As you mentioned, the grid-stride loop kernel would be an alternative. As some of the blocks are run sequentially for huge grids anyway, you can just start enough blocks to fill the GPU.

Another way to do it would be to copy back the grid dimensions to the host from device memory before starting the kernel. But that could cost you some latency.
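For example, something along these lines, reusing names from the code earlier in the thread (the loop index i is just illustrative):

// one small device-to-host copy per launch, which adds latency
xyIndex h_z;
cudaMemcpy(&h_z, &dev_z[i], sizeof(xyIndex), cudaMemcpyDeviceToHost);
TestAdd<<<h_z.xLen, h_z.yLen>>>(dev_ret, &dev_x[h_z.xOffset], &dev_y[h_z.yOffset]);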

Is there any document about the grid-stride loop kernel? There is no introduction in the CUDA C++ Programming Guide.

When I did a Google search on “cuda grid stride loop”, this was the first hit.

But each matrix in my data is different. For example, I need to calculate 0x10000 matrices at the same time, each matrix has a different size, and the <<<grid, block>>> parameters of each kernel are different. Moreover, the extended gridDim.x * blockDim.x far exceeds 0x10000. Therefore, I used a grid-stride loop in the child kernel, but I have to add an if to limit each thread and avoid exceeding the bounds, and this doesn't seem to be very effective. Since everything is sized according to the largest matrix, every time I calculate there will be many threads sitting idle. How should I handle this situation?

__device__ uint8_t SubChild(uint8_t p0, uint8_t p1)
{
        //some op
}

__global__ void Child(uint8_t* t_x, uint8_t* t_y, xyIndex t_z)
{
        for (uint32_t Ti = threadIdx.x + blockDim.x * blockIdx.x; Ti < t_z.xLen * t_z.yLen; Ti += blockDim.x * gridDim.x)
        {
                if ((threadIdx.x < t_z.xLen) && (blockIdx.x < t_z.yLen))
                {
                        SubChild(t_x[threadIdx.x], t_y[blockIdx.x]);
                }
        }
}

__global__ void Parent(uint8_t* t_x, uint8_t* t_y, xyIndex* t_z)
{
        uint32_t Ti = threadIdx.x + blockDim.x * blockIdx.x;
        Child<<<t_z[Ti].xLen, t_z[Ti].yLen, 0, cudaStreamFireAndForget>>>(&t_x[t_z[Ti].xOffset], &t_y[t_z[Ti].yOffset], t_z[Ti]);
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) printf("device: %s\n", cudaGetErrorString(err));
}

In the main function:
Parent<<<0x100,0x100>>>(dev_t_x, dev_t_y, dev_t_z);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess) printf("host: %s\n", cudaGetErrorString(err));

Each block and each thread can have a different stride, or the same stride and a different maximum. As long as you put outer for loops in your kernel, you have flexibility in how many iterations they do.
So threads can always do more work.
Effectively one thread or one block could do the work of 1000 threads or 1000 blocks on the fly.
So you do not have to adapt to the maximum matrix setting, but to a low or a lower average matrix setting. If the matrix is larger, you just do more loop iterations.

On the other hand, you can let threads just run to the end; it is probably not too expensive. Or the threads can do more than one matrix per kernel invocation and just wait/block until the next matrix should be done (or let them start with the next one while other threads still do the old one).

But I have used CDP in my code, and it seems a grid-stride loop doesn't need CDP, so what should I do with my program, without CDP?

Robert_Crovella
In that case, one possible approach would be to launch the work from the host, in a grid-stride loop kernel design, where some reasonable compromise is made for grid dimensions, and let the grid stride limit be picked up from device memory. In fact, if you are launching lots of these kernels, you might do well (from several perspectives) to find a way to “combine” like kernel launches, which have like work, but differing only in data extents, into a single host-launched kernel.

It seems a grid-stride loop doesn't need CDP, but I have used CDP in my code, so what should I do with my program, without CDP?

In the most general case this is fairly involved. But you haven’t mapped out the most general case. You have a set of 0x10000 “sub-” matrices, scattered in memory (in an “overall” matrix), which have varying sizes. On each sub-matrix, you want to run the elementwise operation SubChild. You will need to do some prep-work. For each sub-matrix, we will need the following information:

  1. The x dimension
  2. The y dimension
  3. The starting position (x,y)
  4. The total number of processed matrix elements prior to this matrix

You have data defining items 1 and 2 (and presumably 3) already. Item 4 will be created with a prefix-sum.

Once we have that data set for each sub-matrix, then we can launch a single kernel that processes all matrices, with “no” wasted threads. Each thread in the kernel will find its “work location” in the overall matrix by first doing a binary search (on the prefix-sum data) to find which sub-matrix it belongs to, then determining its x,y location within that sub-matrix, then converting that position to an (x,y) location in the overall matrix.

This can be done from host or device.

Here is a host example, since you have asked “without CDP”:

#include <cub/cub.cuh>

using it = unsigned;
using dt = unsigned char;

struct mmeta{ // sub-matrix meta-data

  it xdim;  // width of sub-matrix
  it ydim;  // height of sub-matrix
  it xs,ys; // x,y coordinate of location of sub-matrix in overall matrix
};

__device__ void SubChild(it p0, it p1, dt *d, it width){ // do some element-wise work

    d[p1*width+p0]++;
}

__device__ it bsearch(it *pm, it i, it nm){ // binary search
  it lower = 0;
  it upper = nm;
  it midpt;
  while (lower < upper-1){
     midpt = (lower + upper)>>1;
     if (pm[midpt] > i) upper = midpt;
     else lower = midpt;
     }
  return lower;
}

__device__ void get_my_xy(it i, mmeta *m, it *pm, it &x, it &y, it nm){ // convert grid-stride loop index to x,y location for work to be performed
  it midx = bsearch(pm, i, nm);
  it ptot = pm[midx];
  i -= ptot;
  it w = m[midx].xdim;
  y = i / w;
  x = (i - y*w)+m[midx].xs;
  y += m[midx].ys;
}

__global__ void k(mmeta *m, it *pm, dt *d, it *N, it width, it nm){ // main kernel

  for (it i = blockIdx.x*blockDim.x+threadIdx.x; i < *N; i+=gridDim.x*blockDim.x){
    it x,y;
    get_my_xy(i, m, pm, x, y, nm);
    SubChild(x, y, d, width);
  }
}

__global__ void mk(mmeta *m, it *pm, it nm){  // multiply sub-matrix dimensions in preparation for prefix sum
  for (it i = blockIdx.x*blockDim.x+threadIdx.x; i < nm; i+=gridDim.x*blockDim.x)
    pm[i] = m[i].xdim * m[i].ydim;
}

const it nmx = 0x100;
const it nmy = 0x100;
const it nm = nmx*nmy;  // number of sub-matrices to generate
const it mwidth = 262144; // overall matrix width
const it mheight = 2048;  // overall matrix height

int main(){
  // data setup
  mmeta *m = new mmeta[nm];
  for (int i = 0; i < nm; i++){
    m[i].xdim = i%3 + 3;
    m[i].ydim = (i+1)%4 + 2;
    m[i].xs = i%nmx * 8;
    m[i].ys = i/nmx * 8;}
  mmeta *d_m;
  dt *d_d;
  cudaMalloc(&d_d, sizeof(d_d[0])*mwidth*mheight);
  cudaMemset(d_d, 0, sizeof(d_d[0])*mwidth*mheight);
  cudaMalloc(&d_m, sizeof(d_m[0])*nm);
  cudaMemcpy(d_m, m, sizeof(d_m[0])*nm, cudaMemcpyHostToDevice);
  it *d_pm;
  cudaMalloc(&d_pm, sizeof(d_pm[0])*(nm+1));
  cudaMemset(d_pm+nm, 0, sizeof(d_pm[0]));
  const int nTPB = 512;
  const int nBLK = 58*3;
  // compute sub-matrix sizes
  mk<<<nBLK, nTPB>>>(d_m, d_pm, nm);
  // compute prefix sum
// Determine temporary device storage requirements
  void     *d_temp_storage = nullptr;
  size_t   temp_storage_bytes = 0;
  cub::DeviceScan::ExclusiveSum(
    d_temp_storage, temp_storage_bytes,
    d_pm, d_pm, nm+1);

// Allocate temporary storage
  cudaMalloc(&d_temp_storage, temp_storage_bytes);

// Run exclusive prefix sum
  cub::DeviceScan::ExclusiveSum(
    d_temp_storage, temp_storage_bytes,
    d_pm, d_pm, nm+1);
  // do work
  k<<<nBLK, nTPB>>>(d_m, d_pm, d_d, d_pm+nm, mwidth, nm);
  cudaDeviceSynchronize();
}