Hello,
I have a cuda program that runs fine a little more than 50% of the time, and other times I receive cuda errors that I have set in my program. cuda-memcheck gave 0 errors for the three time I ran it. I am fairly new to cuda so I am not familiar with good programming practices when it comes to host/device synchronization, streams and so forth. If you can offer any insight to what is going wrong that would be great!
On another note if you know why the simple max element finder at the end is giving different results than the std::max_element() function that would be of help as well!
A normal output is typically:
Time: 0.0431291
Time: 0.0482404
Time: 0.046779
Time: 0.047245
Time: 0.0485886
Time: 0.0470794
Time: 0.0468703
Time: 0.0486437
Time: 0.0468626
Time: 0.0473519
The largest element is 3.51592e+10
Number of peaks: 123
Max: 2147483647
Max index: 1541
And an error output more or less looks like:
Time: 0.0810037
Time: 0.0528883
Time: 0.0473969
Time: 0.0533998
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 0.000322271
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 0.000103488
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 8.4544e-05
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 8.5439e-05
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 8.3328e-05
Cuda Error: Failed to execute kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
FFT Failed
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute absolute value kernel
Cuda Error: Failed to execute 1D array kernel
Time: 8.5568e-05
The largest element is 2.17361e+38
Number of peaks: 302
Max: 2147483647
Max index: 2442
#include <cufft.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <cuda_device_runtime_api.h>
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <opencv2/core/core.hpp>
#include <opencv2/opencv.hpp>
#define height 2048
#define width 2448
#define size 5013504
__global__ void datatransfer(cufftComplex *f2, float *f)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x>=0 && x<width && y>=0 && y< height) {
f2[width*y+x].x = f[width*y+x];
f2[width*y+x].y = 0;
}
}
__global__ void magnitude_kernel(cufftComplex *out, cufftComplex *in2)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x>=0 && x<width && y>=0 && y< height) {
in2[width*y+x].x = sqrt( out[width*y+x].x * out[width*y+x].x + out[width*y+x].y * out[width*y+x].y);
in2[width*y+x].y = 0;
}
}
__global__ void swap_quadrants(cufftComplex *old_img,cufftComplex *new_img)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
int y = (blockIdx.y * blockDim.y) + threadIdx.y;
if (x>=0 && x<width/2 && y>=0 && y< height/2) {
int q1 = width*y+x;
int q2 = width/2+width*y+x;
int q3 = width*height/2+width/2+width*y+x;
int q4 = width*height/2+width*y+x;
new_img[q1] = old_img[q3];
new_img[q2] = old_img[q4];
new_img[q3] = old_img[q1];
new_img[q4] = old_img[q2];
}
}
__global__ void smootharray(float *a1,float *b1,float *c1,float *result,cufftComplex *image)
{
int x = (blockIdx.x * blockDim.x) + threadIdx.x;
if (x>=0 && x<width) {
a1[x] = image[width*height/2-width+x].x +image[width*height/2+x].x;
b1[x] = image[width*height/2-width+x-1].x +image[width*height/2+x-1].x;
c1[x] = image[width*height/2-width+x+1].x +image[width*height/2+x+1].x;
result[x] = (a1[x]+b1[x]+c1[x])/3;
}
if (x>0 && x<width-1) {
a1[x] = result[x-1];
b1[x] = result[x+1];
c1[x] = result[x];
result[x] = (a1[x]+b1[x]+c1[x])/3;
}
}
int main()
{
//Read image
cv::Mat image =cv::imread("im2_48.jpg",CV_LOAD_IMAGE_GRAYSCALE);
image.convertTo(image,CV_32FC1);
//Create Array
float *array = new float;
for (int i=0;i<height;i++)
{
for (int j=0;j<width;j++)
{
array[width*i+j] = image.at<float>(i,j);
}
}
//Test reconstruct
cv::Mat reconstruct(height,width,CV_32FC1);
for (int i=0;i<height;i++)
{
for (int j=0;j<width;j++)
{
reconstruct.at<float>(i,j) = array[width*i+j];
}
}
cv::normalize(reconstruct, reconstruct, 0, 1, CV_MINMAX);
cv::namedWindow("reconstruct",CV_WINDOW_NORMAL);
cv::imshow("reconstruct",reconstruct);
//Cuda variables
cufftHandle plan;
float *dev_array;
cufftComplex *h_array2,*array2;
cufftComplex *h_array3,*array3;
float *h_1darray,*dev_1darray,*h_shift1,*d_shift1,*h_shift2,*d_shift2,*h_smooth,*d_smooth;
//Allocate memory
h_array2 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_array3 = (cufftComplex*)malloc(size*sizeof(cufftComplex));
h_1darray = (float*)malloc(width*sizeof(float));
h_shift1 = (float*)malloc(width*sizeof(float));
h_shift2 = (float*)malloc(width*sizeof(float));
h_smooth = (float*)malloc(width*sizeof(float));
cudaMalloc((void **) &dev_array,sizeof(float)*size);
cudaMalloc((void **) &array2, sizeof(cufftComplex)*size);
cudaMalloc((void **) &array3, sizeof(cufftComplex)*size);
cudaMalloc((void **) &dev_1darray, sizeof(float)*width);
cudaMalloc((void **) &d_shift1, sizeof(float)*width);
cudaMalloc((void **) &d_shift2, sizeof(float)*width);
cudaMalloc((void **) &d_smooth, sizeof(float)*width);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to allocate something\n");
}
if (cudaMemcpy(array2,h_array2,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to array2"<<std::endl;
}
if (cudaMemcpy(array3,h_array3,sizeof(cufftComplex)*size,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to array3"<<std::endl;
}
if (cudaMemcpy(dev_1darray,h_1darray,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to dev_1darray"<<std::endl;
}
if (cudaMemcpy(d_shift1,h_shift1,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to d_shift1"<<std::endl;
}
if (cudaMemcpy(d_shift2,h_shift2,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to d_shift2"<<std::endl;
}
if (cudaMemcpy(d_smooth,h_smooth,sizeof(float)*width,cudaMemcpyHostToDevice) != cudaSuccess)
{
std::cout<<"Failed to copy to d_smooth"<<std::endl;
}
//Plan 2D FFT
if (cufftPlan2d(&plan,height,width,CUFFT_C2C) != CUFFT_SUCCESS)
{
std::cout<<"Failed to make 1st FFT plan"<<std::endl;
}
//Copy Image Array to GPU
cudaMemcpy(dev_array,array,sizeof(float)*size,cudaMemcpyHostToDevice);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to copy to dev_array\n");
}
//Loop through Algorithm 10 times
for (int i=0;i<10;i++) {
double t1 = (double)cv::getTickCount();
//Conversion from float to float2
dim3 threadsPerBlock(32,32);
dim3 numBlocks(77,64);
datatransfer<<<numBlocks,threadsPerBlock>>>(array2,dev_array);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to execute kernel\n");
}
//First FFT
if (cufftExecC2C(plan,(cufftComplex *)array2,(cufftComplex *)array2, CUFFT_FORWARD) != CUFFT_SUCCESS)
{
std::cout<<"FFT Failed"<<std::endl;
}
//Perform absolute value
magnitude_kernel<<<numBlocks,threadsPerBlock>>>(array2,array3);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to execute absolute value kernel\n");
}
//Swap quadrants
swap_quadrants<<<numBlocks,threadsPerBlock>>>(array3,array2);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to execute absolute value kernel\n");
}
//Perform 2nd FFT
if (cufftExecC2C(plan,(cufftComplex *)array2,(cufftComplex *)array2, CUFFT_FORWARD) != CUFFT_SUCCESS)
{
std::cout<<"FFT Failed"<<std::endl;
}
//Perform 2nd absolute value
magnitude_kernel<<<numBlocks,threadsPerBlock>>>(array2,array3);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to execute absolute value kernel\n");
}
//Swap quadrants
swap_quadrants<<<numBlocks,threadsPerBlock>>>(array3,array2);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to execute absolute value kernel\n");
}
//1D data
smootharray<<<3,1024>>>(dev_1darray, d_shift1,d_shift2,d_smooth,array2);
if (cudaGetLastError() != cudaSuccess)
{
fprintf(stderr, "Cuda Error: Failed to execute 1D array kernel\n");
}
cudaDeviceSynchronize();
t1 = ((double)cv::getTickCount() - t1)/cv::getTickFrequency();
std::cout<<"Time: "<<t1<<std::endl;
}
//Copy array back to host
cudaMemcpy(h_smooth,d_smooth,sizeof(float)*width,cudaMemcpyDeviceToHost);
//determine peaks
int h_peaks = 0;
int max_num = 0;
int max_ind;
for (int i=2;i<width-2;i++) {
if (h_smooth[i] > h_smooth[i-1] && h_smooth[i] >h_smooth[i+1] && h_smooth[i] >h_smooth[i+2] && h_smooth[i] >h_smooth[i-2])
{
h_peaks = h_peaks +1;
}
if (h_smooth[i] > max_num) {
max_num = h_smooth[i];
max_ind = i;
}
}
std::cout << "The largest element is " << *std::max_element(h_smooth,h_smooth+width) <<std::endl;
std::cout<<"Number of peaks: "<<h_peaks<<std::endl;
std::cout<<"Max: "<<max_num<<std::endl;
std::cout<<"Max index: "<<max_ind<<std::endl;
cudaFree(array2);
cudaFree(array3);
cudaFree(dev_1darray);
cudaFree(dev_array);
cudaFree(d_shift1);
cudaFree(d_shift2);
cudaFree(d_smooth);
//return h_1darray;
//return img;
return 0;
}
nvcc C2C_fft.cu -I/usr/local/cuda-8.0/include -I/usr/include -lopencv_core -lopencv_gpu -lopencv_highgui -lopencv_imgproc -L/usr/local/cuda-8.0/lib64 -lcufft -lcudart -lcufftw -o fft