CUDA_SAFE_CALL

user_user · November 8, 2010, 7:48pm

Hi everyone,

I have realy bad problem about CUDA_SAFE_CALL.

My program doesn’t work perfectly, so I added cuda_safe_call, but unfortunately I got in cmd.exe error: “cuda error in file ob.h in line 134: unknown error.” (No of line is this line where I wrote cuda_safe_call).

When I’d deleted makro in this line, the problem appeared in other line where is cuda_safe_call…

can anyone help me?

ob.h:

// convolution on gpu 

#include <iostream>

#include <stdlib.h>

#include <stdio.h>

#include <string.h>

#include <math.h>

#include <time.h>

#include <windows.h>

#include <C:\Program Files\Microsoft SDKs\Windows\v6.0A\Include\gl\gl.h>

#include <C:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\shared\inc\GL\glut.h>

#include <c:\cuda\include\cufft.h>

#include <c:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\common\inc\cutil_inline.h>

#include <cuda_runtime_api.h>

#include <c:\ProgramData\NVIDIA Corporation\NVIDIA GPU Computing SDK\C\common\inc\cutil.h>

#include "kernel.cu"

using namespace std;

long double obl(int argc, char** argv,int n){

		cout<<"-------------------------------"<<endl;

		cout<<"lenght of tab:"<<n<<endl;	

	

		cufftComplex		*gpu_funkcja, 

							*gpu_filtr, 

							*cpu_funkcja, 

							*cpu_filtr, 

							*mnozenie, 

							*gpu_mnozenie,

							*splot,

							*cpu_splot;

		cudaError_t			status;

		cufftResult	        result;

		

		cpu_funkcja = new cufftComplex [n];

		cpu_filtr   = new cufftComplex [n];

		mnozenie    = new cufftComplex [n];

		cpu_splot   = new cufftComplex [n];

		

		CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&gpu_funkcja, sizeof(cufftComplex)*n));

		CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&gpu_filtr,   sizeof(cufftComplex)*n));

		

		// creating funkcja and filtru

		generacja_sygnalu<<<255,255>>>(n,gpu_funkcja);

		cudaThreadSynchronize();

		cudaThreadExit();

		filtr<<<255,255>>>(n,gpu_filtr);

		cudaThreadSynchronize();

		cudaThreadExit();

		// copy funkcja and filtr form gpu to cpu

		status=cudaMemcpy(cpu_funkcja,gpu_funkcja,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost);

				if(status !=cudaSuccess){cout<<"copy funkcja to cpu unsuccess"<<endl;}

		status=cudaMemcpy(cpu_filtr,gpu_filtr,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost);

				if(status !=cudaSuccess){cout<<"copy filtr to cpu unsuccess"<<endl;}

		// free gpu 

		CUDA_SAFE_CALL_NO_SYNC(cudaFree(gpu_funkcja));

		CUDA_SAFE_CALL_NO_SYNC(cudaFree(gpu_filtr));

				

		// time	

		long double ms;

		unsigned __int64 freq, counterStart, counterStop;

		QueryPerformanceFrequency(reinterpret_cast<LARGE_INTEGER*> (&freq));

		QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*> (&counterStart));  // start time

		

		// fft

		cufftComplex      *fft_filtr, 

						  *fft_funkcja, 

						  *fft_funkcja_cpu,

						  *fft_filtr_cpu;

		cufftHandle		   plan;

		fft_funkcja_cpu = new cufftComplex [n];

		fft_filtr_cpu   = new cufftComplex [n];

		CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&fft_filtr,   sizeof(cufftComplex)*n));		//  fft_filtr on gpu

		CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&fft_funkcja, sizeof(cufftComplex)*n));		//  fft_funkcja on gpu

		

		cufftPlan1d(&plan, n, CUFFT_C2C, 2);		                                                // creating planu 1d fft

		

	       result=cufftExecC2C(plan, cpu_funkcja, fft_funkcja, 1);		// fft save in fft_funkcja

			if(result == CUFFT_SUCCESS){cout<<"fft_funkcja powiodlo sie 1/5"<<endl;};

		

		status=cudaMemcpy(fft_funkcja_cpu,fft_funkcja,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost);

			if(status==cudaSuccess){cout<<"fft_funkcja_cpu ok 2/5"<<endl;}		

		result=cufftExecC2C(plan, cpu_filtr, fft_filtr, 1);			// fft save in fft_funkcja

			if(result == CUFFT_SUCCESS){cout<<"fft_filtr powiodlo sie 3/5"<<endl;};

		

		status=cudaMemcpy(fft_filtr_cpu,fft_filtr,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost);	

				if(status==cudaSuccess){cout<<"fft_filtr_cpu ok 4/5"<<endl;}

		CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&gpu_mnozenie, sizeof(cufftComplex)*n));

		CUDA_SAFE_CALL_NO_SYNC(cudaMalloc((void**)&splot,        sizeof(cufftComplex)*n));

		

		//multiply

		m<<<255,255>>>(fft_funkcja, fft_filtr, gpu_mnozenie, n);

		cudaThreadSynchronize();

		cudaThreadExit();

		// przeslanie na cpu wyniku mnozenie funkcji i filtru w dziedzieni czestotliwosci

		CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(mnozenie,gpu_mnozenie,sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost));			

		// ifft	

		int d=0;

		cufftPlan1d(&plan, n, CUFFT_C2C, 1);

		result = cufftExecC2C(plan, mnozenie, splot, -1); // ifft save in splot

				if(result == CUFFT_SUCCESS){cout<<"splot ok 5/5"<<endl;d=1;}

				else d=0;

		// copy splot to cpu

		CUDA_SAFE_CALL_NO_SYNC(cudaMemcpy(cpu_splot, splot, sizeof(cufftComplex)*n,cudaMemcpyDeviceToHost));

	

		QueryPerformanceCounter(reinterpret_cast<LARGE_INTEGER*> (&counterStop));  // end of time 

    	

                ms = (static_cast<long double> (counterStop) - counterStart) / freq * 1000; 

	       cout<<"time of convolution:"<<ms<<"ms"<<endl;

		cufftDestroy(plan);

		CUDA_SAFE_CALL_NO_SYNC(cudaFree(fft_filtr));

		CUDA_SAFE_CALL_NO_SYNC(cudaFree(fft_funkcja));

		CUDA_SAFE_CALL_NO_SYNC(cudaFree(gpu_mnozenie));

		CUDA_SAFE_CALL_NO_SYNC(cudaFree(splot));

		delete []cpu_funkcja;

		delete []cpu_filtr;

		delete []mnozenie;

		delete []fft_funkcja_cpu;

		delete []fft_filtr_cpu;

		return 0;

};

user_user · November 8, 2010, 7:48pm