Hi everyone,
I have realy bad problem about CUDA_SAFE_CALL.
My program doesn’t work perfectly, so I added cuda_safe_call, but unfortunately I got in cmd.exe error: “cuda error in file ob.h in line 134: unknown error.” (No of line is this line where I wrote cuda_safe_call).
When I’d deleted makro in this line, the problem appeared in other line where is cuda_safe_call…
can anyone help me?
ob.h:
// convolution on gpu
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <windows.h>
#include <C:\Program Files\Microsoft SDKs\Windows\v6.0A\Include\gl\gl.h>
#include <C:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\shared\inc\GL\glut.h>
#include <c:\cuda\include\cufft.h>
#include <c:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\common\inc\cutil_inline.h>
#include <cuda_runtime_api.h>
#include <c:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\common\inc\cutil.h>
#include "kernel.cu"
using namespace std;
long double obl(int argc, char** argv,int n){
cout<<"-------------------------------"<<endl;
cout<<"lenght of tab:"<<n<<endl;
cufftComplex *gpu_funkcja,
*gpu_filtr,
*cpu_funkcja,
*cpu_filtr,
*mnozenie,
*gpu_mnozenie,
*splot,
*cpu_splot;
cudaError_t status;
cufftResult result;
cpu_funkcja = new cufftComplex [n];
cpu_filtr = new cufftComplex [n];
mnozenie = new cufftComplex [n];
cpu_splot = new cufftComplex [n];
CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&gpu_funkcja, sizeof(cufftComplex)*n));
CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&gpu_filtr, sizeof(cufftComplex)*n));
// creating funkcja and filtru
generacja_sygnalu<<<255,255>>>(n,gpu_funkcja);
cudaThreadSynchronize();
cudaThreadExit();
filtr<<<255,255>>>(n,gpu_filtr);
cudaThreadSynchronize();
cudaThreadExit();
// copy funkcja and filtr form gpu to cpu
status=cudaMemcpy(cpu_funkcja,gpu_funkcja,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost);
if(status !=cudaSuccess){cout<<"copy funkcja to cpu unsuccess"<<endl;}
status=cudaMemcpy(cpu_filtr,gpu_filtr,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost);
if(status !=cudaSuccess){cout<<"copy filtr to cpu unsuccess"<<endl;}
// free gpu
CUDA_SAFE_CALL_NO_SYNC(cudaFree(gpu_funkcja));
CUDA_SAFE_CALL_NO_SYNC(cudaFree(gpu_filtr));
// time
long double ms;
unsigned __int64 freq, counterStart, counterStop;
QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*> (&freq));
QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*> (&counterStart)); // start time
// fft
cufftComplex *fft_filtr,
*fft_funkcja,
*fft_funkcja_cpu,
*fft_filtr_cpu;
cufftHandle plan;
fft_funkcja_cpu = new cufftComplex [n];
fft_filtr_cpu = new cufftComplex [n];
CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&fft_filtr, sizeof(cufftComplex)*n)); // fft_filtr on gpu
CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&fft_funkcja, sizeof(cufftComplex)*n)); // fft_funkcja on gpu
cufftPlan1d(&plan, n, CUFFT_C2C, 2); // creating planu 1d fft
result=cufftExecC2C(plan, cpu_funkcja, fft_funkcja, 1); // fft save in fft_funkcja
if(result == CUFFT_SUCCESS){cout<<"fft_funkcja powiodlo sie 1/5"<<endl;};
status=cudaMemcpy(fft_funkcja_cpu,fft_funkcja,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost);
if(status==cudaSuccess){cout<<"fft_funkcja_cpu ok 2/5"<<endl;}
result=cufftExecC2C(plan, cpu_filtr, fft_filtr, 1); // fft save in fft_funkcja
if(result == CUFFT_SUCCESS){cout<<"fft_filtr powiodlo sie 3/5"<<endl;};
status=cudaMemcpy(fft_filtr_cpu,fft_filtr,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost);
if(status==cudaSuccess){cout<<"fft_filtr_cpu ok 4/5"<<endl;}
CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&gpu_mnozenie, sizeof(cufftComplex)*n));
CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&splot, sizeof(cufftComplex)*n));
//multiply
m<<<255,255>>>(fft_funkcja, fft_filtr, gpu_mnozenie, n);
cudaThreadSynchronize();
cudaThreadExit();
// przeslanie na cpu wyniku mnozenie funkcji i filtru w dziedzieni czestotliwosci
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(mnozenie,gpu_mnozenie,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost));
// ifft
int d=0;
cufftPlan1d(&plan, n, CUFFT_C2C, 1);
result = cufftExecC2C(plan, mnozenie, splot, -1); // ifft save in splot
if(result == CUFFT_SUCCESS){cout<<"splot ok 5/5"<<endl;d=1;}
else d=0;
// copy splot to cpu
CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(cpu_splot, splot, sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost));
QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*> (&counterStop)); // end of time
ms = (static_cast<long double> (counterStop) - counterStart) / freq * 1000;
cout<<"time of convolution:"<<ms<<"ms"<<endl;
cufftDestroy(plan);
CUDA_SAFE_CALL_NO_SYNC(cudaFree(fft_filtr));
CUDA_SAFE_CALL_NO_SYNC(cudaFree(fft_funkcja));
CUDA_SAFE_CALL_NO_SYNC(cudaFree(gpu_mnozenie));
CUDA_SAFE_CALL_NO_SYNC(cudaFree(splot));
delete []cpu_funkcja;
delete []cpu_filtr;
delete []mnozenie;
delete []fft_funkcja_cpu;
delete []fft_filtr_cpu;
return 0;
};