/* Host code for the interlacing / splitting operations on multiple arrays.
 *
 * Benchmarks the `deinterlace<n>` kernel (pulled in from deinterlace_kernel.cu)
 * against a plain device-to-device cudaMemcpy of the same byte count and
 * prints the bandwidth figure computed for each.
 */

// NOTE(review): the system header names were missing from the text this file
// was recovered from. The list below is reconstructed from the symbols used
// here (printf/malloc, ceil/sqrt, the CUDA runtime, the cutil timer and check
// helpers) -- confirm against the project's build.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>

// Presumably provides the deinterlace<n> kernel template together with
// tile_x, tile_y and thd, none of which are declared in this file.
#include "deinterlace_kernel.cu"

//-------------------------------------------------------------------------------
// Launches deinterlace<n> for the values of n that are instantiated here.
// Returns false (and launches nothing) when n is not 2 or 3.
static bool launch_deinterlace (int n, dim3 grid, dim3 threads,
                                float* d_odata, float* d_idata,
                                unsigned long int N)
{
    switch (n)
    {
        case 2:
            deinterlace <2> <<< grid, threads >>> (d_odata, d_idata, N);
            return true;
        case 3:
            deinterlace <3> <<< grid, threads >>> (d_odata, d_idata, N);
            return true;
        default:
            return false;
    }
}

//-------------------------------------------------------------------------------
// Times `it` launches of the deinterlace<n> kernel.
//
//   h_odata   host buffer of mem_size bytes; receives the device output
//   h_idata   host buffer of mem_size bytes; uploaded as the kernel input
//   n         template argument selecting the kernel (2 or 3)
//   N         element count forwarded to the kernel; also sizes the grid
//   mem_size  size in bytes of each of the two device buffers
//   it        number of timed launches
//
// Returns the accumulated value of the cutil timer over the timed loop
// (main() treats it as milliseconds), or 0 if n is not supported.
float deinterlace_call (float* h_odata, float* h_idata, int n,
                        unsigned long int N, unsigned long int mem_size, int it)
{
    // Only these two instantiations are dispatched; refuse anything else
    // instead of timing a loop that launches nothing.
    if (n != 2 && n != 3)
    {
        fprintf (stderr, "deinterlace_call: unsupported n = %d (expected 2 or 3)\n", n);
        return 0.0f;
    }

    float *d_idata = NULL, *d_odata = NULL;
    cutilSafeCall (cudaMalloc ((void**) &d_idata, mem_size));
    cutilSafeCall (cudaMalloc ((void**) &d_odata, mem_size));

    // The kernel must read real data: upload the host input before launching.
    cutilSafeCall (cudaMemcpy (d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    // 2-D grid whose sides are derived from sqrt(N) and the tile dimensions.
    const double side = sqrt ((double) N);
    dim3 grid ((int) ceil (side / tile_x), (int) ceil (side / tile_y));
    dim3 threads (thd);

    unsigned int timer;
    float time;
    cutilCheckError (cutCreateTimer (&timer));

    // Warm-up launches, excluded from the measurement.
    for (int i = 0; i < 10; i++)
    {
        launch_deinterlace (n, grid, threads, d_odata, d_idata, N);
        cutilSafeCall (cudaThreadSynchronize ());
    }
    cutilCheckMsg ("De-interlace Kernel execution failed");

    // Clear the output so the result copied back comes from the timed launches.
    cutilSafeCall (cudaMemset (d_odata, 0, mem_size));
    cutilSafeCall (cudaThreadSynchronize ());

    cutilCheckError (cutResetTimer (timer));
    cutilCheckError (cutStartTimer (timer));
    for (int i = 0; i < it; i++)
    {
        launch_deinterlace (n, grid, threads, d_odata, d_idata, N);
        cutilSafeCall (cudaThreadSynchronize ());
    }
    cutilCheckError (cutStopTimer (timer));
    time = cutGetTimerValue (timer);
    cutilCheckError (cutDeleteTimer (timer));

    // Report launch failures before touching the results.
    cutilCheckMsg ("De-interlace Kernel execution failed");

    cutilSafeCall (cudaMemcpy (h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));

    cutilSafeCall (cudaFree (d_idata));
    cutilSafeCall (cudaFree (d_odata));

    return time;
}

//-------------------------------------------------------------------------------
// Times `it` device-to-device copies of mem_size bytes as a reference figure.
//
// h_odata, n and N are accepted so the signature matches deinterlace_call but
// are not used. h_idata is uploaded so the copy reads initialised memory.
//
// Returns the accumulated value of the cutil timer over the timed loop
// (main() treats it as milliseconds).
float memcpy_call (float* h_odata, float* h_idata, int n,
                   unsigned long int N, unsigned long int mem_size, int it)
{
    (void) h_odata;
    (void) n;
    (void) N;

    float *d_idata = NULL, *d_odata = NULL;
    cutilSafeCall (cudaMalloc ((void**) &d_idata, mem_size));
    cutilSafeCall (cudaMalloc ((void**) &d_odata, mem_size));

    cutilSafeCall (cudaMemcpy (d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

    unsigned int timer;
    float time;
    cutilCheckError (cutCreateTimer (&timer));

    cutilSafeCall (cudaMemset (d_odata, 0, mem_size));
    cutilSafeCall (cudaThreadSynchronize ());

    cutilCheckError (cutResetTimer (timer));
    cutilCheckError (cutStartTimer (timer));
    for (int i = 0; i < it; i++)
    {
        cutilSafeCall (cudaMemcpy (d_odata, d_idata, mem_size, cudaMemcpyDeviceToDevice));
        cutilSafeCall (cudaThreadSynchronize ());
    }
    cutilCheckError (cutStopTimer (timer));
    time = cutGetTimerValue (timer);
    cutilCheckError (cutDeleteTimer (timer));

    cutilCheckMsg ("Memcpy Kernel execution failed");

    cutilSafeCall (cudaFree (d_idata));
    cutilSafeCall (cudaFree (d_odata));

    return time;
}

//-------------------------------------------------------------------------------
// Builds the host input, runs both benchmarks and prints their bandwidth.
int main (int argc, char** argv)
{
    (void) argc;
    (void) argv;

    cutilSafeCall (cudaSetDevice (0));

    const int n  = 3;            // Number of arrays to be interlaced or split into
    const int Ni = 4096*4096;    // Number of elements per array

    // Widen before multiplying so larger sizes cannot overflow int arithmetic.
    unsigned long int N_o = (unsigned long int) Ni * n;   // Number of elements in interlaced array
    unsigned long int N   = N_o / n;                      // Number of elements in input array
    unsigned long int mem_size = N_o * sizeof(float);

    // Bytes counted once for the read and once for the write, in units of 1e9.
    float gb = (((float)N_o + (float)N_o) * sizeof(float) / 1e9);

    float *h_idata = (float*) malloc (mem_size);
    float *h_odata = (float*) malloc (mem_size);
    if (h_idata == NULL || h_odata == NULL)
    {
        fprintf (stderr, "Host allocation of %lu bytes failed\n", mem_size);
        free (h_idata);
        free (h_odata);
        return 1;
    }

    // Array k occupies [k*N, (k+1)*N); element i of every array holds the value i.
    // NOTE(review): this is a plane-by-plane layout rather than an
    // element-interleaved one -- confirm it is the input the kernel expects.
    for (unsigned long int i = 0; i < N; i++)
    {
        for (int k = 0; k < n; k++)
            h_idata[(k*N)+i] = (float)i;
    }

    int it = 100;   // Timed iterations per benchmark

    //-------------------------------------------------------------------------
    float time0, time;
    time  = deinterlace_call (h_odata, h_idata, n, N, mem_size, it);
    time0 = memcpy_call      (h_odata, h_idata, n, N, mem_size, it);
    //-------------------------------------------------------------------------

    // (time / it) is the per-iteration timer value; * 1e-3 converts it to seconds.
    printf ("De-intrlacing into");
    printf ("%2d arrays(%5.4fGb) Kernel-%8.3f(Gb/s) Memcpy-%8.3f(Gb/s) \n",
            n, gb, gb/((time / it) * 1e-3), gb/((time0 / it) * 1e-3));
    //-------------------------------------------------------------------------

    free (h_odata);
    free (h_idata);

    cudaThreadExit ();
    return 0;
}