Hello,
after many hours of debugging now I am here. I cannot figure out what is wrong with my CUDA kernel, I notice that everything crash with black screen if I use in CUDA kernel more than 2 nested loops.
My question is: what is the maximum number of nested loops in CUDA kernel that is supported??Only 2?? When I comment while loop (see below) everything works as expected (output matrix has all elements to zero), why??
This is my sample (I have two kernels to compute separable convolution of DOG filter kernel, these kernel are not optimized… at the moment my problem is to fix black screen. Note that arrays are column sorted because I use these CUDA kernels with MATLAB…):
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#include <math_constants.h>
#include <math_functions.h>
#include <texture_fetch_functions.h>
#include <cuda_texture_types.h>
#include <texture_types.h>
#include <device_functions.h>
#include <device_launch_parameters.h>
#include <device_types.h>
#include "cudaLibrary.h"
#include "cudaKernelDOG.cuh"
static const dim3 gridSize(6, 8, 1);
static const dim3 blockSize(16, 16);
texture <float, 2> textureImage;
texture <float, 2> textureDogTemp;
texture<float, 1> textureKernel;
__global__ void cudaKernelDOGRow(int rowImageSize, int colImageSize, int imageSize, int kernelSize, int indexKernelCenter, float *dDog)
{
// sets conv
float conv =
0;
// sets start row
const int initRow =
blockDim.y * blockIdx.y + threadIdx.y;
// sets start col
const int initCol =
blockDim.x * blockIdx.x + threadIdx.x;
// sets increment row
const int incrementRow =
blockDim.y * gridDim.y;
// sets increment col
const int incrementCol =
blockDim.x * gridDim.x;
int indexKernel =
0;
int indexConv =
0;
// for each pixel
for(int row = initRow; row < rowImageSize; row += incrementRow)
{
for(int col = initCol; col < colImageSize; col += incrementCol)
{
// sets conv
conv =
1;
// sets indexConv
indexConv =
col - indexKernelCenter;
while(indexConv < (indexConv + kernelSize))
{
// if(indexConv >= 0 && indexConv < colImageSize)
// {
// conv +=
// (tex2D(textureImage, row, indexConv) * tex1Dfetch(textureKernel, indexKernel));
// }
indexConv++;
// indexKernel++;
}
// saves value of convolution for current pixel
dDog[col * rowImageSize + row] =
conv;
}
}
}
__global__ void cudaKernelDOGCol(int rowImageSize, int colImageSize, int imageSize, int kernelSize, int indexKernelCenter, float *dDog)
{
// sets conv
float conv =
0;
// sets start row
const int initRow =
blockDim.y * blockIdx.y + threadIdx.y;
// sets start col
const int initCol =
blockDim.x * blockIdx.x + threadIdx.x;
// sets increment row
const int incrementRow =
blockDim.y * gridDim.y;
// sets increment col
const int incrementCol =
blockDim.x * gridDim.x;
int indexKernel =
0;
int indexConv =
0;
// for each pixel
for(int row = initRow; row < rowImageSize; row += incrementRow)
{
for(int col = initCol; col < colImageSize; col += incrementCol)
{
// sets conv
conv =
1;
// sets indexConv
indexConv =
row - indexKernelCenter;
while(indexConv < (indexConv + kernelSize))
{
// if(indexConv >= 0 && indexConv < rowImageSize)
// {
// conv +=
// (tex2D(textureImage, indexConv, col) * tex1Dfetch(textureKernel, indexKernel));
// }
indexConv++;
// indexKernel++;
}
// saves value of convolution for current pixel
dDog[col * rowImageSize + row] =
conv;
}
}
}
/*
Computes DOG
*/
void cudaLibrary::cudaDOG(int rowImageSize, int colImageSize, int imageSize, int kernelSize, int indexKernelCenter, float *hImage, float *hKernel, float *hDog)
{
// computes size
const size_t imageSize_t =
imageSize * sizeof(float);
const size_t kernelSize_t =
kernelSize * sizeof(float);
const size_t rowImageSize_t =
rowImageSize * sizeof(float);
const size_t colImageSize_t =
colImageSize * sizeof(float);
// device variables
static float *dImage;
static float *dKernel;
static float *dDogTempTexture;
float *dDog;
size_t pitchImage;
size_t pitchTemp;
// mallocs device variables
cudaMallocPitch<float>(&dImage, &pitchImage, rowImageSize_t, colImageSize);
CHECK_ERROR;
cudaMallocPitch<float>(&dDogTempTexture, &pitchTemp, rowImageSize_t, colImageSize);
CHECK_ERROR;
cudaMalloc<float>(&dKernel, kernelSize_t);
CHECK_ERROR;
cudaMalloc<float>(&dDog, imageSize_t);
CHECK_ERROR;
// copies memory
cudaMemcpy2D(dImage, pitchImage, hImage, rowImageSize_t, rowImageSize_t, colImageSize, cudaMemcpyHostToDevice);
CHECK_ERROR;
cudaMemcpy(dKernel, hKernel, kernelSize_t, cudaMemcpyHostToDevice);
CHECK_ERROR;
// creates channels
cudaChannelFormatDesc channelImage =
cudaCreateChannelDesc<float>();
cudaChannelFormatDesc channelKernel =
cudaCreateChannelDesc<float>();
// binds textures
cudaBindTexture2D(NULL, &textureImage, dImage, &channelImage, rowImageSize, colImageSize, pitchImage);
CHECK_ERROR;
cudaBindTexture(NULL, &textureKernel, dKernel, &channelKernel, kernelSize_t);
CHECK_ERROR;
// computes 1D DOG forward row
cudaKernelDOGRow<<<gridSize, blockSize>>>(rowImageSize, colImageSize, imageSize, kernelSize, indexKernelCenter, dDog);
CHECK_ERROR;
// copies memory
cudaMemcpy2DAsync(dDogTempTexture, pitchTemp, dDog, rowImageSize_t, rowImageSize_t, colImageSize, cudaMemcpyDeviceToDevice);
CHECK_ERROR;
// creates channel
cudaChannelFormatDesc channelDogTempTexture =
cudaCreateChannelDesc<float>();
// binds texture
cudaBindTexture2D(NULL, &textureDogTemp, dDogTempTexture, &channelDogTempTexture, rowImageSize, colImageSize, pitchTemp);
CHECK_ERROR;
// computes 1D DOG forward column
cudaKernelDOGCol<<<gridSize, blockSize>>>(rowImageSize, colImageSize, imageSize, kernelSize, indexKernelCenter, dDog);
CHECK_ERROR;
// copies memory HERE IS BLACK SCREEN before CHECK_ERROR macro*********************************************************************************
cudaMemcpy(hDog, dDog, imageSize_t, cudaMemcpyDeviceToHost);
CHECK_ERROR;
// frees memory
cudaUnbindTexture(textureImage);
CHECK_ERROR;
cudaUnbindTexture(textureDogTemp);
CHECK_ERROR;
cudaUnbindTexture(textureKernel);
CHECK_ERROR;
cudaFree(dImage);
CHECK_ERROR;
cudaFree(dDogTempTexture);
CHECK_ERROR;
cudaFree(dDog);
CHECK_ERROR;
cudaFree(dKernel);
CHECK_ERROR;
dImage =
nullptr;
dKernel =
nullptr;
dDog =
nullptr;
dDogTempTexture =
nullptr;
}