Hello everyone, I’ve recently been learning about CUDA Dynamic Parallelism.
The result of this code run is expected to be 0x20 000 000, but the actual result is 0x1 000 000, can someone tell me why,thanks.
The file cuda.cu
#include <iostream>
#include <memory>
#include <string>
uint8_t* Local_x = (uint8_t*)malloc(sizeof(uint8_t)* 0x800000);
uint8_t* Local_y = (uint8_t*)malloc(sizeof(uint8_t)* 0x200000);
struct xyIndex{
uint32_t xLen; uint32_t xOffset; uint32_t yLen; uint32_t yOffset;
};
struct xyIndex* Local_z = (struct xyIndex*)malloc(sizeof(struct xyIndex)* 0x10000);
__device__ uint8_t SubTestAdd(uint8_t p0, uint8_t p1)
{
return p0 + p1;
}
__global__ void TestAdd(uint32_t* Result, uint8_t* t_x, uint8_t* t_y)
{
uint32_t i = threadIdx.x;
uint32_t j = blockIdx.x;
if (SubTestAdd(t_x[j], t_y[i]) == 2)
{
atomicAdd(Result, 2);
}
__threadfence();
}
__global__ void TestCalc(uint32_t* Result, uint8_t* t_x, uint8_t* t_y, xyIndex* t_z)
{
uint32_t Ti = threadIdx.x + blockDim.x * (gridDim.x * blockIdx.y + blockIdx.x);
cudaStream_t stream;
cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking);
TestAdd<<<t_z[Ti].xLen, t_z[Ti].yLen, 0, stream>>>(Result, &t_x[t_z[Ti].xOffset], &t_y[t_z[Ti].yOffset]);
cudaStreamDestroy(stream);
__threadfence();
}
int main()
{
memset(Local_x, 1, 0x800000 * sizeof(uint8_t));
memset(Local_y, 1, 0x200000 * sizeof(uint8_t));
Local_z[0].xLen = 0x80;
Local_z[0].xOffset = 0;
Local_z[0].yLen = 0x20;
Local_z[0].yOffset = 0;
uint8_t initx = Local_x[0];
uint8_t inity = Local_y[0];
for (uint32_t i = 1; i < 0x10000; i++)
{
Local_z[i].xLen = 0x80;
Local_z[i].xOffset = Local_z[i-1].xOffset + Local_z[i-1].xLen;
Local_z[i].yLen = 0x20;
Local_z[i].yOffset = Local_z[i-1].yOffset + Local_z[i-1].yLen;
}
uint8_t* dev_x;
uint8_t* dev_y;
xyIndex* dev_z;
uint32_t* dev_ret;
cudaMalloc((uint8_t**)&dev_x, sizeof(uint8_t) * 0x800000);
cudaMalloc((uint8_t**)&dev_y, sizeof(uint8_t) * 0x200000);
cudaMalloc((xyIndex**)&dev_z, sizeof(xyIndex) * 0x10000);
cudaMemcpy(dev_x, Local_x, sizeof(uint8_t) * 0x800000, cudaMemcpyHostToDevice);
cudaMemcpy(dev_y, Local_y, sizeof(uint8_t) * 0x200000, cudaMemcpyHostToDevice);
cudaMemcpy(dev_z, Local_z, sizeof(xyIndex) * 0x10000, cudaMemcpyHostToDevice);
cudaMalloc((uint32_t**)&dev_ret, sizeof(uint32_t));
dim3 Gi(0x800, 0x20);
TestCalc<<<Gi, 1>>>(dev_ret, dev_x, dev_y, dev_z);
cudaDeviceSynchronize();
uint32_t ret = 0;
cudaMemcpy(&ret, dev_ret, sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaFree(dev_ret);
cudaFree(dev_x);
cudaFree(dev_y);
cudaFree(dev_z);
free(Local_x);
free(Local_y);
free(Local_z);
}