Inconsistent Cuda Errors

Hello,

I have a cuda program that runs fine a little more than 50% of the time, and other times I receive cuda errors that I have set in my program. cuda-memcheck gave 0 errors for the three time I ran it. I am fairly new to cuda so I am not familiar with good programming practices when it comes to host/device synchronization, streams and so forth. If you can offer any insight to what is going wrong that would be great!

On another note if you know why the simple max element finder at the end is giving different results than the std::max_element() function that would be of help as well!
A normal output is typically:

Time: 0.0431291
Time: 0.0482404
Time: 0.046779
Time: 0.047245
Time: 0.0485886
Time: 0.0470794
Time: 0.0468703
Time: 0.0486437
Time: 0.0468626
Time: 0.0473519
The largest element is 3.51592e+10
Number of peaks: 123
Max: 2147483647
Max index: 1541

And an error output more or less looks like:

Time: 0.0810037
Time: 0.0528883
Time: 0.0473969
Time: 0.0533998
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 0.000322271
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 0.000103488
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 8.4544e-05
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 8.5439e-05
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 8.3328e-05
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 8.5568e-05
The largest element is 2.17361e+38
Number of peaks: 302
Max: 2147483647
Max index: 2442

#include <cufft.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_device_runtime_api.h>

#include <iostream>
#include <stdlib.h>
#include <stdio.h>

#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>



#define height 2048
#define width 2448
#define size 5013504



__global__ void datatransfer(cufftComplex *f2, float *f)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;

if (x>=0 && x<width && y>=0 && y< height) {
f2[width*y+x].x =  f[width*y+x];
f2[width*y+x].y = 0;


}

}



__global__ void magnitude_kernel(cufftComplex *out, cufftComplex *in2)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;

if (x>=0 && x<width && y>=0 && y< height) {

in2[width*y+x].x = sqrt( out[width*y+x].x * out[width*y+x].x + out[width*y+x].y * out[width*y+x].y);
in2[width*y+x].y = 0;
}
}



__global__ void swap_quadrants(cufftComplex *old_img,cufftComplex *new_img)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;



if (x>=0 && x<width/2 && y>=0 && y< height/2)   {

int q1 = width*y+x;
int q2 = width/2+width*y+x;
int q3 = width*height/2+width/2+width*y+x;
int q4 = width*height/2+width*y+x;

new_img[q1] = old_img[q3];
new_img[q2] = old_img[q4];
new_img[q3] = old_img[q1];
new_img[q4] = old_img[q2];
}
}

__global__ void smootharray(float *a1,float *b1,float *c1,float *result,cufftComplex *image)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;


if (x>=0 && x<width)   {
a1[x] = image[width*height/2-width+x].x +image[width*height/2+x].x;
b1[x] = image[width*height/2-width+x-1].x +image[width*height/2+x-1].x;
c1[x] = image[width*height/2-width+x+1].x +image[width*height/2+x+1].x;
result[x] = (a1[x]+b1[x]+c1[x])/3;
}

if (x>0 && x<width-1) {

a1[x] = result[x-1];
b1[x] = result[x+1];
c1[x] = result[x];
result[x] = (a1[x]+b1[x]+c1[x])/3;
}
}

int main() 
{

//Read image
cv::Mat image =cv::imread("im2_48.jpg",CV_LOAD_IMAGE_GRAYSCALE);
image.convertTo(image,CV_32FC1);
	
//Create Array
float *array = new float;
for (int i=0;i<height;i++)
{
	for (int j=0;j<width;j++)
	{
	array[width*i+j] = image.at<float>(i,j);
	}
}
	


//Test reconstruct
cv::Mat reconstruct(height,width,CV_32FC1);
for (int i=0;i<height;i++)
{
	for (int j=0;j<width;j++)
	{
	reconstruct.at<float>(i,j) = array[width*i+j];
	}
}
cv::normalize(reconstruct, reconstruct, 0, 1, CV_MINMAX);
cv::namedWindow("reconstruct",CV_WINDOW_NORMAL);	
cv::imshow("reconstruct",reconstruct);


//Cuda variables
cufftHandle plan;

float *dev_array;

cufftComplex *h_array2,*array2;
cufftComplex *h_array3,*array3;

float *h_1darray,*dev_1darray,*h_shift1,*d_shift1,*h_shift2,*d_shift2,*h_smooth,*d_smooth;


//Allocate memory
h_array2 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_array3 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_1darray = (float*)malloc(width*sizeof(float));
h_shift1 = (float*)malloc(width*sizeof(float));
h_shift2 = (float*)malloc(width*sizeof(float));
h_smooth = (float*)malloc(width*sizeof(float));

cudaMalloc((void **) &dev_array,sizeof(float)*size);
cudaMalloc((void **) &array2, sizeof(cufftComplex)*size);
cudaMalloc((void **) &array3, sizeof(cufftComplex)*size);
cudaMalloc((void **) &dev_1darray, sizeof(float)*width);
cudaMalloc((void **) &d_shift1, sizeof(float)*width);
cudaMalloc((void **) &d_shift2, sizeof(float)*width);
cudaMalloc((void **) &d_smooth, sizeof(float)*width);

if (cudaGetLastError() != cudaSuccess) 
	{
	fprintf(stderr, "Cuda Error: Failed to allocate something\n");	
	}


if (cudaMemcpy(array2,h_array2,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to array2"<<std::endl;
	}

if (cudaMemcpy(array3,h_array3,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to array3"<<std::endl;
	}
if (cudaMemcpy(dev_1darray,h_1darray,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to dev_1darray"<<std::endl;
	}

if (cudaMemcpy(d_shift1,h_shift1,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_shift1"<<std::endl;
	}

if (cudaMemcpy(d_shift2,h_shift2,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_shift2"<<std::endl;
	}

if (cudaMemcpy(d_smooth,h_smooth,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_smooth"<<std::endl;
	}

//Plan 2D FFT
if (cufftPlan2d(&plan,height,width,CUFFT_C2C) != CUFFT_SUCCESS) 
	{
	std::cout<<"Failed to make 1st FFT plan"<<std::endl;
	}

//Copy Image Array to GPU
cudaMemcpy(dev_array,array,sizeof(float)*size,cudaMemcpyHostToDevice);
if (cudaGetLastError() != cudaSuccess) 
	{
	fprintf(stderr, "Cuda Error: Failed to copy to dev_array\n");	
	}

//Loop through Algorithm 10 times
for (int i=0;i<10;i++) {
	double t1 = (double)cv::getTickCount();


	//Conversion from float to float2
	dim3 threadsPerBlock(32,32);
	dim3 numBlocks(77,64);

	datatransfer<<<numBlocks,threadsPerBlock>>>(array2,dev_array);

	if (cudaGetLastError() != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute kernel\n");	
		}


	//First FFT
	if (cufftExecC2C(plan,(cufftComplex *)array2,(cufftComplex *)array2, CUFFT_FORWARD) != CUFFT_SUCCESS)
		{
		std::cout<<"FFT Failed"<<std::endl;
		}

	//Perform absolute value

	magnitude_kernel<<<numBlocks,threadsPerBlock>>>(array2,array3);
	if (cudaGetLastError() != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute absolute value kernel\n");	
		}


	//Swap quadrants 

	swap_quadrants<<<numBlocks,threadsPerBlock>>>(array3,array2);


	if (cudaGetLastError() != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute absolute value kernel\n");	
		}

	//Perform 2nd FFT

	if (cufftExecC2C(plan,(cufftComplex *)array2,(cufftComplex *)array2, CUFFT_FORWARD) != CUFFT_SUCCESS)
		{
		std::cout<<"FFT Failed"<<std::endl;
		}


	//Perform 2nd absolute value
	magnitude_kernel<<<numBlocks,threadsPerBlock>>>(array2,array3);
	if (cudaGetLastError() != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute absolute value kernel\n");	
		}


	//Swap quadrants 

	swap_quadrants<<<numBlocks,threadsPerBlock>>>(array3,array2);

	if (cudaGetLastError() != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute absolute value kernel\n");	
		}


	//1D data

	smootharray<<<3,1024>>>(dev_1darray, d_shift1,d_shift2,d_smooth,array2);
	if (cudaGetLastError() != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute 1D array kernel\n");	
		}


	cudaDeviceSynchronize();
	t1 = ((double)cv::getTickCount() - t1)/cv::getTickFrequency();
	std::cout<<"Time: "<<t1<<std::endl;
}

//Copy array back to host
cudaMemcpy(h_smooth,d_smooth,sizeof(float)*width,cudaMemcpyDeviceToHost);


//determine peaks
int h_peaks = 0;
int max_num = 0;
int max_ind;
for (int i=2;i<width-2;i++) {
	if (h_smooth[i] > h_smooth[i-1] && h_smooth[i] >h_smooth[i+1] && h_smooth[i] >h_smooth[i+2] && h_smooth[i] >h_smooth[i-2])
	{
	h_peaks = h_peaks +1;
	}

	if (h_smooth[i] > max_num) {	
		max_num = h_smooth[i];
		max_ind = i;
		}
}

std::cout << "The largest element is "  << *std::max_element(h_smooth,h_smooth+width) <<std::endl;

std::cout<<"Number of peaks: "<<h_peaks<<std::endl;

std::cout<<"Max: "<<max_num<<std::endl;
std::cout<<"Max index: "<<max_ind<<std::endl;



cudaFree(array2);
cudaFree(array3);
cudaFree(dev_1darray);
cudaFree(dev_array);
cudaFree(d_shift1);
cudaFree(d_shift2);
cudaFree(d_smooth);

//return h_1darray;
//return img;
return 0;
}

nvcc C2C_fft.cu -I/usr/local/cuda-8.0/include -I/usr/include -lopencv_core -lopencv_gpu -lopencv_highgui -lopencv_imgproc -L/usr/local/cuda-8.0/lib64 -lcufft -lcudart -lcufftw  -o fft

Best practice:

When you get an API-level error, instead of just doing this:

if (cudaGetLastError() != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute 1D array kernel\n");	
		}

Do something like this:

cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute 1D array kernel\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}

Best practice #2: When asking for help from others, make it as easy as possible for them to help you. This is, after all, a CUDA programming forum, not an OpenCV programming forum, and OpenCV is not a NVIDIA product. Furthermore, from what I can see, it’s not likely to be relevant to your CUDA execution errors. Therefore make a test version of your code that doesn’t depend on OpenCV, so that others don’t need to have a OpenCV stack to witness what is going on. You may learn something in the process of doing this.

It is much harder for others to debug your code when they can’t run it, or can’t run it without a lot of effort.

One problem with your own max-finding routine is that max_num variable is defined as int but h_smooth are float quantities, so this is going to give you strange behavior:

int max_num = 0;
int max_ind;
for (int i=2;i<width-2;i++) {
...
	if (h_smooth[i] > max_num) {	// comparing an int to a float
		max_num = h_smooth[i];  // jamming a float into a int
		max_ind = i;
		}
}

Thank you Bob I appreciate the input. I will have a revised code posted soon.

REVISED NON OPENCV CODE:

This version runs perfectly.

#include <cufft.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_device_runtime_api.h>

#include <iostream>
#include <stdlib.h>
#include <stdio.h>




#define height 2048
#define width 2448
#define size 5013504



__global__ void datatransfer(cufftComplex *f2, float *f)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;

if (x>=0 && x<width && y>=0 && y< height) {
f2[width*y+x].x =  f[width*y+x];
f2[width*y+x].y = 0;


}

}



__global__ void magnitude_kernel(cufftComplex *out, cufftComplex *in2)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;

if (x>=0 && x<width && y>=0 && y< height) {

in2[width*y+x].x = sqrt( out[width*y+x].x * out[width*y+x].x + out[width*y+x].y * out[width*y+x].y);
in2[width*y+x].y = 0;
}
}



__global__ void swap_quadrants(cufftComplex *old_img,cufftComplex *new_img)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;



if (x>=0 && x<width/2 && y>=0 && y< height/2)   {

int q1 = width*y+x;
int q2 = width/2+width*y+x;
int q3 = width*height/2+width/2+width*y+x;
int q4 = width*height/2+width*y+x;

new_img[q1] = old_img[q3];
new_img[q2] = old_img[q4];
new_img[q3] = old_img[q1];
new_img[q4] = old_img[q2];
}
}

__global__ void smootharray(float *a1,float *b1,float *c1,float *result,cufftComplex *image)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;


if (x>=0 && x<width)   {
a1[x] = image[width*height/2-width+x].x +image[width*height/2+x].x;
b1[x] = image[width*height/2-width+x-1].x +image[width*height/2+x-1].x;
c1[x] = image[width*height/2-width+x+1].x +image[width*height/2+x+1].x;
result[x] = (a1[x]+b1[x]+c1[x])/3;
}

if (x>0 && x<width-1) {

a1[x] = result[x-1];
b1[x] = result[x+1];
c1[x] = result[x];
result[x] = (a1[x]+b1[x]+c1[x])/3;
}
}



int main() 
{


	
//Create Array
float *array = new float;
for (int i=0;i<height;i++)
{
	for (int j=0;j<width;j++)
	{
	array[width*i+j] = width*i+j;
	}
}
	


//Cuda variables
cufftHandle plan;

float *dev_array;

cufftComplex *h_array2,*array2;
cufftComplex *h_array3,*array3;

float *h_1darray,*dev_1darray,*h_shift1,*d_shift1,*h_shift2,*d_shift2,*h_smooth,*d_smooth;


//Allocate memory
h_array2 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_array3 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_1darray = (float*)malloc(width*sizeof(float));
h_shift1 = (float*)malloc(width*sizeof(float));
h_shift2 = (float*)malloc(width*sizeof(float));
h_smooth = (float*)malloc(width*sizeof(float));

cudaMalloc((void **) &dev_array,sizeof(float)*size);
cudaMalloc((void **) &array2, sizeof(cufftComplex)*size);
cudaMalloc((void **) &array3, sizeof(cufftComplex)*size);
cudaMalloc((void **) &dev_1darray, sizeof(float)*width);
cudaMalloc((void **) &d_shift1, sizeof(float)*width);
cudaMalloc((void **) &d_shift2, sizeof(float)*width);
cudaMalloc((void **) &d_smooth, sizeof(float)*width);

cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to allocate\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


if (cudaMemcpy(array2,h_array2,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to array2"<<std::endl;
	}

if (cudaMemcpy(array3,h_array3,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to array3"<<std::endl;
	}
if (cudaMemcpy(dev_1darray,h_1darray,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to dev_1darray"<<std::endl;
	}

if (cudaMemcpy(d_shift1,h_shift1,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_shift1"<<std::endl;
	}

if (cudaMemcpy(d_shift2,h_shift2,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_shift2"<<std::endl;
	}

if (cudaMemcpy(d_smooth,h_smooth,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_smooth"<<std::endl;
	}

//Plan 2D FFT
if (cufftPlan2d(&plan,height,width,CUFFT_C2C) != CUFFT_SUCCESS) 
	{
	std::cout<<"Failed to make 1st FFT plan"<<std::endl;
	}

//Copy Image Array to GPU
cudaMemcpy(dev_array,array,sizeof(float)*size,cudaMemcpyHostToDevice);
 err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to copy to dev_array\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}

//Loop through Algorithm 10 times
for (int i=0;i<10;i++) {

	//Conversion from float to float2
	dim3 threadsPerBlock(32,32);
	dim3 numBlocks(77,64);

	datatransfer<<<numBlocks,threadsPerBlock>>>(array2,dev_array);

	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute data transfer\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


	//First FFT
	if (cufftExecC2C(plan,(cufftComplex *)array2,(cufftComplex *)array2, CUFFT_FORWARD) != CUFFT_SUCCESS)
		{
		std::cout<<"FFT 1 Failed"<<std::endl;
		}

	//Perform absolute value

	magnitude_kernel<<<numBlocks,threadsPerBlock>>>(array2,array3);
	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute absolute value 1\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


	//Swap quadrants 

	swap_quadrants<<<numBlocks,threadsPerBlock>>>(array3,array2);

	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute swap quadrants 1\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}

	//Perform 2nd FFT

	if (cufftExecC2C(plan,(cufftComplex *)array2,(cufftComplex *)array2, CUFFT_FORWARD) != CUFFT_SUCCESS)
		{
		std::cout<<"FFT Failed"<<std::endl;
		}


	//Perform 2nd absolute value
	magnitude_kernel<<<numBlocks,threadsPerBlock>>>(array2,array3);
	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute absolute value 2\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


	//Swap quadrants 

	swap_quadrants<<<numBlocks,threadsPerBlock>>>(array3,array2);

	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute swap_quadrants 2\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


	//1D data

	smootharray<<<3,1024>>>(dev_1darray, d_shift1,d_shift2,d_smooth,array2);
	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute smooth array\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


	cudaDeviceSynchronize();
	
}

//Copy array back to host
cudaMemcpy(h_smooth,d_smooth,sizeof(float)*width,cudaMemcpyDeviceToHost);


//determine peaks
int h_peaks = 0;
float max_num = 0;
int max_ind;
for (int i=2;i<width-2;i++) {
	if (h_smooth[i] > h_smooth[i-1] && h_smooth[i] >h_smooth[i+1] && h_smooth[i] >h_smooth[i+2] && h_smooth[i] >h_smooth[i-2])
	{
	h_peaks = h_peaks +1;
	}

	if (h_smooth[i] > max_num) {	
		max_num = h_smooth[i];
		max_ind = i;
		}
}


std::cout<<"Number of peaks: "<<h_peaks<<std::endl;

std::cout<<"Max: "<<max_num<<std::endl;
std::cout<<"Max index: "<<max_ind<<std::endl;


cudaFree(array2);
cudaFree(array3);
cudaFree(dev_1darray);
cudaFree(dev_array);
cudaFree(d_shift1);
cudaFree(d_shift2);
cudaFree(d_smooth);

//return h_1darray;
//return img;
return 0;
}

REVISED OPENCV CODE:

Still experiencing problems. Even when not using any of the OpenCV image data. Here you can see I store the values width*i+j in each array element, however I still recieve errors a little less than half the time:

Time: 0.0435446
Time: 0.0466272
Time: 0.0459261
Time: 0.0460598
Time: 0.0982809
Cuda Error: Failed to execute data transfer
Cuda Error String: unspecified launch failure
FFT 1 Failed
Cuda Error: Failed to execute absolute value 1
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 1
Cuda Error String: unspecified launch failure
FFT 2 Failed
Cuda Error: Failed to execute absolute value 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute smooth array
Cuda Error String: unspecified launch failure
Time: 0.00030188
Cuda Error: Failed to execute data transfer
Cuda Error String: unspecified launch failure
FFT 1 Failed
Cuda Error: Failed to execute absolute value 1
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 1
Cuda Error String: unspecified launch failure
FFT 2 Failed
Cuda Error: Failed to execute absolute value 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute smooth array
Cuda Error String: unspecified launch failure
Time: 0.000102269
Cuda Error: Failed to execute data transfer
Cuda Error String: unspecified launch failure
FFT 1 Failed
Cuda Error: Failed to execute absolute value 1
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 1
Cuda Error String: unspecified launch failure
FFT 2 Failed
Cuda Error: Failed to execute absolute value 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute smooth array
Cuda Error String: unspecified launch failure
Time: 8.1374e-05
Cuda Error: Failed to execute data transfer
Cuda Error String: unspecified launch failure
FFT 1 Failed
Cuda Error: Failed to execute absolute value 1
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 1
Cuda Error String: unspecified launch failure
FFT 2 Failed
Cuda Error: Failed to execute absolute value 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute smooth array
Cuda Error String: unspecified launch failure
Time: 8.099e-05
Cuda Error: Failed to execute data transfer
Cuda Error String: unspecified launch failure
FFT 1 Failed
Cuda Error: Failed to execute absolute value 1
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 1
Cuda Error String: unspecified launch failure
FFT 2 Failed
Cuda Error: Failed to execute absolute value 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute swap quadrants 2
Cuda Error String: unspecified launch failure
Cuda Error: Failed to execute smooth array
Cuda Error String: unspecified launch failure
Time: 7.9454e-05
The largest element is 2.84114e-29
Number of peaks: 0
Max: 2.84114e-29
Max index: 8

This leads me to believe this isn’t an OpenCV error, but rather something happening with cuda such as accessing data not fully processed yet. Is this a wrong assumption?

#include <cufft.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_device_runtime_api.h>

#include <iostream>
#include <stdlib.h>
#include <stdio.h>

#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>



#define height 2048
#define width 2448
#define size 5013504



__global__ void datatransfer(cufftComplex *f2, float *f)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;

if (x>=0 && x<width && y>=0 && y< height) {
f2[width*y+x].x =  f[width*y+x];
f2[width*y+x].y = 0;


}

}



__global__ void magnitude_kernel(cufftComplex *out, cufftComplex *in2)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;

if (x>=0 && x<width && y>=0 && y< height) {

in2[width*y+x].x = sqrt( out[width*y+x].x * out[width*y+x].x + out[width*y+x].y * out[width*y+x].y);
in2[width*y+x].y = 0;
}
}



__global__ void swap_quadrants(cufftComplex *old_img,cufftComplex *new_img)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;



if (x>=0 && x<width/2 && y>=0 && y< height/2)   {

int q1 = width*y+x;
int q2 = width/2+width*y+x;
int q3 = width*height/2+width/2+width*y+x;
int q4 = width*height/2+width*y+x;

new_img[q1] = old_img[q3];
new_img[q2] = old_img[q4];
new_img[q3] = old_img[q1];
new_img[q4] = old_img[q2];
}
}

__global__ void smootharray(float *a1,float *b1,float *c1,float *result,cufftComplex *image)
{

int x = (blockIdx.x * blockDim.x) + threadIdx.x;


if (x>=0 && x<width)   {
a1[x] = image[width*height/2-width+x].x +image[width*height/2+x].x;
b1[x] = image[width*height/2-width+x-1].x +image[width*height/2+x-1].x;
c1[x] = image[width*height/2-width+x+1].x +image[width*height/2+x+1].x;
result[x] = (a1[x]+b1[x]+c1[x])/3;
}

if (x>0 && x<width-1) {

a1[x] = result[x-1];
b1[x] = result[x+1];
c1[x] = result[x];
result[x] = (a1[x]+b1[x]+c1[x])/3;
}
}

__global__ void findlocalpeaks(float *smooth_array,int *num_peaks)
{
num_peaks = 0;
for (int i=10;i<width-10;i++)
{	if (smooth_array[i]>smooth_array[i-1] &&smooth_array[i]>smooth_array[i+1])
	{
	num_peaks = num_peaks+ 1;
	}
}
}


int main() 
{

//Read image
cv::Mat image =cv::imread("im2_48.jpg",CV_LOAD_IMAGE_GRAYSCALE);
image.convertTo(image,CV_32FC1);
	
//Create Array
float *array = new float;
for (int i=0;i<height;i++)
{
	for (int j=0;j<width;j++)
	{
	//array[width*i+j] = image.at<float>(i,j);
	array[width*i+j] = width*i+j;
	}
}
	


//Test reconstruct
cv::Mat reconstruct(height,width,CV_32FC1);
for (int i=0;i<height;i++)
{
	for (int j=0;j<width;j++)
	{
	reconstruct.at<float>(i,j) = array[width*i+j];
	}
}
cv::normalize(reconstruct, reconstruct, 0, 1, CV_MINMAX);
cv::namedWindow("reconstruct",CV_WINDOW_NORMAL);	
cv::imshow("reconstruct",reconstruct);
cv::waitKey();

//Cuda variables
cufftHandle plan;

float *dev_array;

cufftComplex *h_array2,*array2;
cufftComplex *h_array3,*array3;

float *h_1darray,*dev_1darray,*h_shift1,*d_shift1,*h_shift2,*d_shift2,*h_smooth,*d_smooth;


//Allocate memory
h_array2 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_array3 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_1darray = (float*)malloc(width*sizeof(float));
h_shift1 = (float*)malloc(width*sizeof(float));
h_shift2 = (float*)malloc(width*sizeof(float));
h_smooth = (float*)malloc(width*sizeof(float));

cudaMalloc((void **) &dev_array,sizeof(float)*size);
cudaMalloc((void **) &array2, sizeof(cufftComplex)*size);
cudaMalloc((void **) &array3, sizeof(cufftComplex)*size);
cudaMalloc((void **) &dev_1darray, sizeof(float)*width);
cudaMalloc((void **) &d_shift1, sizeof(float)*width);
cudaMalloc((void **) &d_shift2, sizeof(float)*width);
cudaMalloc((void **) &d_smooth, sizeof(float)*width);

cudaError_t err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to allocate\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


if (cudaMemcpy(array2,h_array2,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to array2"<<std::endl;
	}

if (cudaMemcpy(array3,h_array3,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to array3"<<std::endl;
	}
if (cudaMemcpy(dev_1darray,h_1darray,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to dev_1darray"<<std::endl;
	}

if (cudaMemcpy(d_shift1,h_shift1,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_shift1"<<std::endl;
	}

if (cudaMemcpy(d_shift2,h_shift2,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_shift2"<<std::endl;
	}

if (cudaMemcpy(d_smooth,h_smooth,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
	{
	std::cout<<"Failed to copy to d_smooth"<<std::endl;
	}

//Plan 2D FFT
if (cufftPlan2d(&plan,height,width,CUFFT_C2C) != CUFFT_SUCCESS) 
	{
	std::cout<<"Failed to make 1st FFT plan"<<std::endl;
	}

//Copy Image Array to GPU
cudaMemcpy(dev_array,array,sizeof(float)*size,cudaMemcpyHostToDevice);
err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to copy to dev_array\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


//Loop through Algorithm 10 times
for (int i=0;i<10;i++) {
	double t1 = (double)cv::getTickCount();


	//Conversion from float to float2
	dim3 threadsPerBlock(32,32);
	dim3 numBlocks(77,64);

	datatransfer<<<numBlocks,threadsPerBlock>>>(array2,dev_array);

	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute data transfer\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}



	//First FFT
	if (cufftExecC2C(plan,(cufftComplex *)array2,(cufftComplex *)array2, CUFFT_FORWARD) != CUFFT_SUCCESS)
		{
		std::cout<<"FFT 1 Failed"<<std::endl;
		}

	//Perform absolute value

	magnitude_kernel<<<numBlocks,threadsPerBlock>>>(array2,array3);
	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute absolute value 1\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


	//Swap quadrants 

	swap_quadrants<<<numBlocks,threadsPerBlock>>>(array3,array2);


	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute swap quadrants 1\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}

	//Perform 2nd FFT

	if (cufftExecC2C(plan,(cufftComplex *)array2,(cufftComplex *)array2, CUFFT_FORWARD) != CUFFT_SUCCESS)
		{
		std::cout<<"FFT 2 Failed"<<std::endl;
		}


	//Perform 2nd absolute value
	magnitude_kernel<<<numBlocks,threadsPerBlock>>>(array2,array3);
	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute absolute value 2\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}

	//Swap quadrants 

	swap_quadrants<<<numBlocks,threadsPerBlock>>>(array3,array2);

	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute swap quadrants 2\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


	//1D data

	smootharray<<<3,1024>>>(dev_1darray, d_shift1,d_shift2,d_smooth,array2);
	err = cudaGetLastError();
	if (err != cudaSuccess) 
		{
		fprintf(stderr, "Cuda Error: Failed to execute smooth array\n");
		fprintf(stderr, "Cuda Error String: %s\n", cudaGetErrorString(err));	
		}


	cudaDeviceSynchronize();
	t1 = ((double)cv::getTickCount() - t1)/cv::getTickFrequency();
	std::cout<<"Time: "<<t1<<std::endl;
}

//Copy array back to host
cudaMemcpy(h_smooth,d_smooth,sizeof(float)*width,cudaMemcpyDeviceToHost);


//determine peaks
int h_peaks = 0;
float max_num = 0;
int max_ind;
for (int i=2;i<width-2;i++) {
	if (h_smooth[i] > h_smooth[i-1] && h_smooth[i] >h_smooth[i+1] && h_smooth[i] >h_smooth[i+2] && h_smooth[i] >h_smooth[i-2])
	{
	h_peaks = h_peaks +1;
	}

	if (h_smooth[i] > max_num) {	
		max_num = h_smooth[i];
		max_ind = i;
		}
}

std::cout << "The largest element is "  << *std::max_element(h_smooth,h_smooth+width) <<std::endl;

std::cout<<"Number of peaks: "<<h_peaks<<std::endl;

std::cout<<"Max: "<<max_num<<std::endl;
std::cout<<"Max index: "<<max_ind<<std::endl;

//if (cudaMemcpy(h_array2,array2,sizeof(cufftComplex)*size,cudaMemcpyDeviceToHost) != cudaSuccess)
//	{
//	std::cout<<"Failed to copy to h_array2"<<std::endl;
//	}

	
//cv:: Mat img(height,width,CV_32FC1);
//for (int i=0;i<height;i++)
//{
//	for (int j=0;j<width;j++)
//	{
//	img.at<float>(i,j) = h_array2[width*i+j].x;
//	}
//




cudaFree(array2);
cudaFree(array3);
cudaFree(dev_1darray);
cudaFree(dev_array);
cudaFree(d_shift1);
cudaFree(d_shift2);
cudaFree(d_smooth);

//return h_1darray;
//return img;
return 0;
}

unspecified launch failure often means:

  1. you are making an illegal, out-of-bounds access in kernel code. Normally cuda-memcheck can immediately confirm this
  2. you have tripped over a WDDM TDR timeout on windows. This would normally happen if you have a kernel that runs too long (say, longer than 2 seconds). Since your non-OpenCV code does not have this issue, this seems unlikely to me.

since the CUDA code works but the OpenCV code doesn’t, I’d consider whether the handoff/assumptions between OpenCV and CUDA are correct. If you got an image size wrong, for example, that could cause problems.

If there was an out-of-bounds access wouldn’t there be an error every time the kernel was called or the program ran?

Yes, this is a strange case. I took your opencv code and ripped the openCV stuff out of it, and it runs fine.

Are you on a jetson? Is the jetson hosting a display?

Yes the non-openCV code runs fine for me as well. And yes I am on a jetson tx2. What do you mean by hosting a display?

Does your jetson have X running?

I think you are possibly hitting a linux display timeout, which would corrupt the CUDA context, and so everything after that fails. You’ve instrumented some, but not all of your CUDA activity with error checking, so it’s a little bit uncertain. But that is my best guess based on the data you have provided.

[url]USING CUDA AND X | NVIDIA

Ahh yes this is most likely the issue, thank you. You have cracked the case once again Bob.