Need Help. CUDA kernel fails randomly

Hi all,

I wrote a simple CUDA kernel function which seems to fail randomly.

Any help would be greatly appreciated. Thanks.

////////////////////

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>


__global__ void test_kernel(float2 *dev_data, long width, long height, long height_2)
{
        long i = threadIdx.x + blockIdx.x * blockDim.x;
        long j = threadIdx.y + blockIdx.y * blockDim.y;

        if(j < width && i < height_2)
        {
                dev_data[(i)*width + j].x = 0.0f;
                dev_data[(i)*width + j].y = 0.0f;
                dev_data[(i+height_2)*width + j].x = 0.0f;
                dev_data[(i+height_2)*width + j].y = 0.0f;
        }

        if(j < width && i < height_2)
        {
                dev_data[(i)*width + j].x = 1.0f;
                dev_data[(i)*width + j].y = 0.0f;

                dev_data[(height-i-1)*width + j].x = 2.0f;
                dev_data[(height-i-1)*width + j].y = 2.0f;
        }
}

void test_cpu(float2 *host_data,  long width, long height, long height_2)
{
	long i;
	long j;

	for(i=0; i < height_2; i++)
	{
		for(j = 0; j < width; j++)
		{
			host_data[(i)*width + j].x = 0.0f;
			host_data[(i)*width + j].y = 0.0f;
			host_data[(i+height_2)*width + j].x = 0.0f;
			host_data[(i+height_2)*width + j].y = 0.0f;
		}
	}

	for(i=0; i < height_2; i++)
	{
		for(j = 0; j < width; j++)
		{
			host_data[(i)*width + j].x = 1.0f;
			host_data[(i)*width + j].y = 0.0f;
		
        
			host_data[(height-i-1)*width + j].x = 2.0f;
			host_data[(height-i-1)*width + j].y = 2.0f;
		}
	}
	
	
}



int main()
{
	long width = 1024;
	long height = 2048;
	float2 *dev_data;
	(cudaMalloc((void**)&dev_data, height*width*sizeof(float2)));
    float2 *host_data = (float2*)malloc(sizeof(float2)*width*height);
	dim3 threads(16, 16);
	dim3 grid;
	grid.x = (height + 15) / 16;
	grid.y = (width + 15) / 16;
	test_kernel<<<grid, threads>>>(dev_data,  width, height, height/2);
	(cudaMemcpy(host_data, dev_data, height*width*sizeof(float2), cudaMemcpyDeviceToHost));
	
	
	float2 *host_data_cpu = (float2*)malloc(sizeof(float2)*width*height);
	test_cpu(host_data_cpu,  width, height, height/2);
	
	long i,j;
	long errc = 0;
	for(i=0; i < height; i++)
	{
		for(j = 0; j < width; j++)
		{
			if( host_data[(i)*width + j].x != host_data_cpu[(i)*width + j].x)
			{
				errc++;
			}
			if( host_data[(i)*width + j].y != host_data_cpu[(i)*width + j].y)
			{
				errc++;
			}
		}
	}
	printf("errc: %ld\n", errc);
	if(errc != 0)
	{
		printf("cuda kernel error\n");
	}
	cudaFree(dev_data);
	free(host_data);
	free(host_data_cpu);
	return 0;
}

//////////////////////////
Running results:
root@usr:/home# ./test
errc: 616132
cuda kernel error
root@usr:/home# ./test
errc: 610144
cuda kernel error
root@usr:/home# ./test
errc: 591468
cuda kernel error
root@usr:/home# ./test
errc: 607076
cuda kernel error

/////////////////////////////////
OS:
Ubuntu 18.04.1
GPU & CUDA:
Quadro RTX 8000, CUDA 10.1

In the future, please format your code correctly (I have fixed it here). When entering your code, select all of it, then press the </> button at the top of the edit window.

Regarding your kernel, you may be confused about how kernel execution happens. Your CPU function is ordered in such a way that none of the second part will happen until all of the first part is complete. But GPU kernels don’t necessarily execute this way. Since your code depends on this ordering for “correctness”, one possible way to “fix” it is to separate your kernel into two parts, and execute all of the first part, followed by all of the second part, the same way your CPU code executes:

$ cat t2071.cu
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cuda_runtime.h>
#include <cufft.h>
#include <cufftXt.h>


__global__ void test_kernel(float2 *dev_data,  long width, long height, long height_2, int part)
{
        long i = threadIdx.x + blockIdx.x * blockDim.x;
        long j = threadIdx.y + blockIdx.y * blockDim.y;
if (part == 0)
        if(j < width && i < height_2)
        {
        dev_data[(i)*width + j].x = 0.0f;
        dev_data[(i)*width + j].y = 0.0f;
        dev_data[(i+height_2)*width + j].x = 0.0f;
        dev_data[(i+height_2)*width + j].y = 0.0f;
        }

if (part == 1)
        if(j < width && i < height_2)
        {

        dev_data[(i)*width + j].x = 1.0f;
        dev_data[(i)*width + j].y = 0.0f;


        dev_data[(height-i-1)*width + j].x = 2.0f;
        dev_data[(height-i-1)*width + j].y = 2.0f;

        }

}

void test_cpu(float2 *host_data,  long width, long height, long height_2)
{
        long i;
        long j;

        for(i=0; i < height_2; i++)
        {
                for(j = 0; j < width; j++)
                {
                        host_data[(i)*width + j].x = 0.0f;
                        host_data[(i)*width + j].y = 0.0f;
                        host_data[(i+height_2)*width + j].x = 0.0f;
                        host_data[(i+height_2)*width + j].y = 0.0f;
                }
        }

        for(i=0; i < height_2; i++)
        {
                for(j = 0; j < width; j++)
                {
                        host_data[(i)*width + j].x = 1.0f;
                        host_data[(i)*width + j].y = 0.0f;


                        host_data[(height-i-1)*width + j].x = 2.0f;
                        host_data[(height-i-1)*width + j].y = 2.0f;
                }
        }


}



int main()
{
        long width = 1024;
        long height = 2048;
        float2 *dev_data;
        (cudaMalloc((void**)&dev_data, height*width*sizeof(float2)));
    float2 *host_data = (float2*)malloc(sizeof(float2)*width*height);
        dim3 threads(16, 16);
        dim3 grid;
        grid.x = (height + 15) / 16;
        grid.y = (width + 15) / 16;
        test_kernel<<<grid, threads>>>(dev_data,  width, height, height/2, 0);
        test_kernel<<<grid, threads>>>(dev_data,  width, height, height/2, 1);
        (cudaMemcpy(host_data, dev_data, height*width*sizeof(float2), cudaMemcpyDeviceToHost));


        float2 *host_data_cpu = (float2*)malloc(sizeof(float2)*width*height);
        test_cpu(host_data_cpu,  width, height, height/2);

        long i,j;
        long errc = 0;
        for(i=0; i < height; i++)
        {
                for(j = 0; j < width; j++)
                {
                        if( host_data[(i)*width + j].x != host_data_cpu[(i)*width + j].x)
                        {
                                errc++;
                        }
                        if( host_data[(i)*width + j].y != host_data_cpu[(i)*width + j].y)
                        {
                                errc++;
                        }
                }
        }
        printf("errc: %ld\n", errc);
        if(errc != 0)
        {
                printf("cuda kernel error\n");
        }
        cudaFree(dev_data);
        free(host_data);
        free(host_data_cpu);
        return 0;
}
$ nvcc -o t2071 t2071.cu
$ compute-sanitizer ./t2071
========= COMPUTE-SANITIZER
errc: 0
========= ERROR SUMMARY: 0 errors
$

This comes about because the second part of your kernel writes into some of the same locations that the first part writes into, and those conflicting writes come from different threads. Since CUDA specifies no particular order of thread execution, your results will often be scrambled. Your code depends on a particular ordering for correctness, so you must enforce that ordering to get correct behavior.
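
To make that concrete with the numbers in this code (height = 2048, height_2 = 1024): in the first if block, the thread with i = 1023 writes zeros into row i + height_2 = 2047, while in the second if block a different thread, the one with i = 0, writes 2.0f into row height - i - 1 = 2047. Nothing orders those two writes, so row 2047 can end up holding 0.0f or 2.0f depending on which write happens to land last. Every row in the bottom half of the array has a collision like this, which is where your several-hundred-thousand mismatches come from; the top half is unaffected because there both writes to a given element come from the same thread, in program order.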

Aside: Why you would write code that behaves that way is beyond me. What purpose could it possibly serve for your CPU code to initialize the data this way:

for(i=0; i < height_2; i++)
{
	for(j = 0; j < width; j++)
	{
		host_data[(i)*width + j].x = 0.0f;
    ...
for(i=0; i < height_2; i++)
{
	for(j = 0; j < width; j++)
	{
		host_data[(i)*width + j].x = 1.0f;

First you write 0 into that location, then you write 1 into the same location? You can do that if you want. I can’t imagine what purpose it serves.
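
If the final contents produced by your CPU code (top half = (1, 0), bottom half = (2, 2)) are really all you need, a single launch with no overlapping writes would do the same job with no ordering requirement at all. The following is only a rough sketch of that idea, not your actual code, and the kernel name init_kernel is made up:

// hypothetical merged kernel: each (i, j) element is written by exactly one
// thread, so no ordering between threads is needed
__global__ void init_kernel(float2 *dev_data, long width, long height_2)
{
        long i = threadIdx.x + blockIdx.x * blockDim.x;   // row within the top half
        long j = threadIdx.y + blockIdx.y * blockDim.y;   // column

        if(j < width && i < height_2)
        {
                dev_data[i*width + j].x = 1.0f;               // top half ends up (1, 0)
                dev_data[i*width + j].y = 0.0f;
                dev_data[(i + height_2)*width + j].x = 2.0f;  // bottom half ends up (2, 2)
                dev_data[(i + height_2)*width + j].y = 2.0f;
        }
}

// launched once, with the same grid/block dimensions as before:
// init_kernel<<<grid, threads>>>(dev_data, width, height/2);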


@Robert_Crovella Sorry for the poor formatting. Thank you so much for your explanation; I see now that I made a basic mistake when writing the CUDA kernel function. In fact, this function is a simplified demo of my real function, which does not set all of the values in the second part. Thanks.
