How many nested loops can I use in a CUDA kernel?

Hello,

after many hours of debugging I have ended up here. I cannot figure out what is wrong with my CUDA kernel: I notice that everything crashes with a black screen if I use more than 2 nested loops in the kernel.

My question is: what is the maximum number of nested loops supported in a CUDA kernel? Only 2? When I comment out the while loop (see below), everything works as expected (the output matrix has all elements set to zero). Why?

This is my sample (I have two kernels that compute a separable convolution with a DoG filter kernel; these kernels are not optimized… at the moment my problem is to fix the black screen. Note that the arrays are column-major because I use these CUDA kernels with MATLAB…):

#include <cuda.h>

#include <cuda_runtime.h>

#include <math.h>

#include <math_constants.h>

#include <math_functions.h>

#include <texture_fetch_functions.h>

#include <cuda_texture_types.h>

#include <texture_types.h>

#include <device_functions.h>

#include <device_launch_parameters.h>

#include <device_types.h>

#include "cudaLibrary.h"

#include "cudaKernelDOG.cuh"

static const dim3 gridSize(6, 8, 1);

static const dim3 blockSize(16, 16);

texture <float, 2> textureImage;

texture <float, 2> textureDogTemp;

texture<float, 1> textureKernel;

__global__ void cudaKernelDOGRow(int rowImageSize, int colImageSize, int imageSize, int kernelSize, int indexKernelCenter, float *dDog)
{
	// sets conv
	float conv = 0;

	// sets start row
	const int initRow = blockDim.y * blockIdx.y + threadIdx.y;

	// sets start col
	const int initCol = blockDim.x * blockIdx.x + threadIdx.x;

	// sets increment row
	const int incrementRow = blockDim.y * gridDim.y;

	// sets increment col
	const int incrementCol = blockDim.x * gridDim.x;

	int indexKernel = 0;

	int indexConv = 0;

	// for each pixel
	for(int row = initRow; row < rowImageSize; row += incrementRow)
	{
		for(int col = initCol; col < colImageSize; col += incrementCol)
		{
			// sets conv
			conv = 1;

			// sets indexConv
			indexConv = col - indexKernelCenter;

			while(indexConv < (indexConv + kernelSize))
			{
			//	if(indexConv >= 0 && indexConv < colImageSize)
			//	{
			//		conv +=
			//			(tex2D(textureImage, row, indexConv) * tex1Dfetch(textureKernel, indexKernel));
			//	}
				indexConv++;
				//	indexKernel++;
			}

			// saves value of convolution for current pixel
			dDog[col * rowImageSize + row] = conv;
		}
	}
}

__global__ void cudaKernelDOGCol(int rowImageSize, int colImageSize, int imageSize, int kernelSize, int indexKernelCenter, float *dDog)
{
	// sets conv
	float conv = 0;

	// sets start row
	const int initRow = blockDim.y * blockIdx.y + threadIdx.y;

	// sets start col
	const int initCol = blockDim.x * blockIdx.x + threadIdx.x;

	// sets increment row
	const int incrementRow = blockDim.y * gridDim.y;

	// sets increment col
	const int incrementCol = blockDim.x * gridDim.x;

	int indexKernel = 0;

	int indexConv = 0;

	// for each pixel
	for(int row = initRow; row < rowImageSize; row += incrementRow)
	{
		for(int col = initCol; col < colImageSize; col += incrementCol)
		{
			// sets conv
			conv = 1;

			// sets indexConv
			indexConv = row - indexKernelCenter;

			while(indexConv < (indexConv + kernelSize))
			{
			//	if(indexConv >= 0 && indexConv < rowImageSize)
			//	{
			//		conv +=
			//			(tex2D(textureImage, indexConv, col) * tex1Dfetch(textureKernel, indexKernel));
			//	}
				indexConv++;
				//	indexKernel++;
			}

			// saves value of convolution for current pixel
			dDog[col * rowImageSize + row] = conv;
		}
	}
}

/*

	Computes DOG

*/

void cudaLibrary::cudaDOG(int rowImageSize, int colImageSize, int imageSize, int kernelSize, int indexKernelCenter, float *hImage, float *hKernel, float *hDog)
{
	// computes sizes in bytes
	const size_t imageSize_t = imageSize * sizeof(float);
	const size_t kernelSize_t = kernelSize * sizeof(float);
	const size_t rowImageSize_t = rowImageSize * sizeof(float);
	const size_t colImageSize_t = colImageSize * sizeof(float);

	// device variables
	static float *dImage;
	static float *dKernel;
	static float *dDogTempTexture;
	float *dDog;
	size_t pitchImage;
	size_t pitchTemp;

	// mallocs device variables
	cudaMallocPitch<float>(&dImage, &pitchImage, rowImageSize_t, colImageSize);
	CHECK_ERROR;
	cudaMallocPitch<float>(&dDogTempTexture, &pitchTemp, rowImageSize_t, colImageSize);
	CHECK_ERROR;
	cudaMalloc<float>(&dKernel, kernelSize_t);
	CHECK_ERROR;
	cudaMalloc<float>(&dDog, imageSize_t);
	CHECK_ERROR;

	// copies memory
	cudaMemcpy2D(dImage, pitchImage, hImage, rowImageSize_t, rowImageSize_t, colImageSize, cudaMemcpyHostToDevice);
	CHECK_ERROR;
	cudaMemcpy(dKernel, hKernel, kernelSize_t, cudaMemcpyHostToDevice);
	CHECK_ERROR;

	// creates channels
	cudaChannelFormatDesc channelImage = cudaCreateChannelDesc<float>();
	cudaChannelFormatDesc channelKernel = cudaCreateChannelDesc<float>();

	// binds textures
	cudaBindTexture2D(NULL, &textureImage, dImage, &channelImage, rowImageSize, colImageSize, pitchImage);
	CHECK_ERROR;
	cudaBindTexture(NULL, &textureKernel, dKernel, &channelKernel, kernelSize_t);
	CHECK_ERROR;

	// computes 1D DOG forward row
	cudaKernelDOGRow<<<gridSize, blockSize>>>(rowImageSize, colImageSize, imageSize, kernelSize, indexKernelCenter, dDog);
	CHECK_ERROR;

	// copies memory
	cudaMemcpy2DAsync(dDogTempTexture, pitchTemp, dDog, rowImageSize_t, rowImageSize_t, colImageSize, cudaMemcpyDeviceToDevice);
	CHECK_ERROR;

	// creates channel
	cudaChannelFormatDesc channelDogTempTexture = cudaCreateChannelDesc<float>();

	// binds texture
	cudaBindTexture2D(NULL, &textureDogTemp, dDogTempTexture, &channelDogTempTexture, rowImageSize, colImageSize, pitchTemp);
	CHECK_ERROR;

	// computes 1D DOG forward column
	cudaKernelDOGCol<<<gridSize, blockSize>>>(rowImageSize, colImageSize, imageSize, kernelSize, indexKernelCenter, dDog);
	CHECK_ERROR;

	// copies memory *** HERE IS BLACK SCREEN, before the CHECK_ERROR macro ***
	cudaMemcpy(hDog, dDog, imageSize_t, cudaMemcpyDeviceToHost);
	CHECK_ERROR;

	// frees memory
	cudaUnbindTexture(textureImage);
	CHECK_ERROR;
	cudaUnbindTexture(textureDogTemp);
	CHECK_ERROR;
	cudaUnbindTexture(textureKernel);
	CHECK_ERROR;
	cudaFree(dImage);
	CHECK_ERROR;
	cudaFree(dDogTempTexture);
	CHECK_ERROR;
	cudaFree(dDog);
	CHECK_ERROR;
	cudaFree(dKernel);
	CHECK_ERROR;

	dImage = nullptr;
	dKernel = nullptr;
	dDog = nullptr;
	dDogTempTexture = nullptr;
}
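CHECK_ERROR is defined in cudaLibrary.h and is not shown here. Purely as an illustration of the kind of check it stands for (the actual macro may differ, and the helper name checkCudaSketch below is made up), a synchronous check after a kernel launch looks roughly like this:

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical helper, for illustration only; not the real CHECK_ERROR macro.
static void checkCudaSketch(const char *where)
{
	// Forces any pending kernels to finish so execution errors are reported here.
	cudaError_t err = cudaDeviceSynchronize();

	// Also picks up launch-configuration errors that do not surface on synchronize.
	if(err == cudaSuccess)
		err = cudaGetLastError();

	if(err != cudaSuccess)
		printf("CUDA error at %s: %s\n", where, cudaGetErrorString(err));
}

Because kernel launches are asynchronous, an execution failure is only reported at the next synchronizing call, which is why the failure shows up at the cudaMemcpy above rather than at the launch itself.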

How long does the kernel run before aborting with the third nested loop? If it is more than a few seconds, you are probably hitting the watchdog timer. You need to execute long-running kernels on a non-display GPU, or break them up into smaller calls.
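A quick way to confirm whether the watchdog applies is to query the device's kernel execution timeout flag; here is a minimal sketch using the standard runtime API (not taken from the code above):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
	int device = 0;
	cudaGetDevice(&device);

	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, device);

	// kernelExecTimeoutEnabled is non-zero when the OS/driver watchdog
	// can kill long-running kernels on this (display) GPU.
	printf("Device %d (%s): kernel execution timeout %s\n",
	       device, prop.name,
	       prop.kernelExecTimeoutEnabled ? "enabled (watchdog active)" : "disabled");

	return 0;
}

If the flag is set, the GPU is driving a display and kernels that run longer than the timeout (a few seconds) will be aborted by the operating system.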

Thx for replying :D, it is about 1 - 2 seconds… OK I will try your advice.

Paolo
