/* Host code for the interlacing / splitting operations of multiple arrays */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

#include <cutil_inline.h>

#include "deinterlace_kernel.cu"

//-------------------------------------------------------------------------------

/*
 * Launch the deinterlace<n> kernel once for the two instantiations dispatched
 * by this file (n == 2 and n == 3). Returns false, without launching anything,
 * for any other n.
 */
static bool deinterlace_launch (int n, dim3 grid, dim3 threads, float* d_odata, float* d_idata, unsigned long int N)
{
	// Dynamic shared memory request: n floats for each of the thd threads in a block.
	const size_t smem_bytes = n*thd*sizeof(float);

	switch (n) {
	case 2:
		deinterlace <2> <<<grid, threads, smem_bytes>>> (d_odata, d_idata, N);
		return true;
	case 3:
		deinterlace <3> <<<grid, threads, smem_bytes>>> (d_odata, d_idata, N);
		return true;
	default:
		return false;
	}
}

/*
 * Benchmark the deinterlace<n> kernel (presumably defined in the included
 * deinterlace_kernel.cu).
 *
 * h_odata  : host buffer of mem_size bytes; receives the device output of the
 *            timed runs.
 * h_idata  : host buffer of mem_size bytes; uploaded to the device as kernel input.
 * n        : template argument selector; only 2 and 3 are dispatched. For any
 *            other value a warning is printed and no kernel is launched.
 * N        : element count forwarded to the kernel; also used to size the grid.
 * mem_size : size in bytes of each of the host/device buffers.
 * it       : number of timed kernel launches.
 *
 * Returns the total elapsed time of the `it` timed launches as reported by
 * cutGetTimerValue (main treats this value as milliseconds).
 * Any CUDA/cutil failure is reported through the cutil checking macros.
 */
float deinterlace_call (float* h_odata, float* h_idata, int n, unsigned long int N, unsigned long int mem_size, int it)
{
	if (n != 2 && n != 3)
		fprintf (stderr, "deinterlace_call: unsupported n = %d (only 2 and 3 are dispatched); no kernel will be launched\n", n);

	float *d_idata = NULL, *d_odata = NULL;
	cutilSafeCall (cudaMalloc((void**) &d_idata, mem_size));
	cutilSafeCall (cudaMalloc((void**) &d_odata, mem_size));

	// The kernel reads d_idata, so it must hold the caller's input; without this
	// upload the kernel would operate on uninitialized device memory.
	cutilSafeCall (cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice));

	// 2D grid sized from sqrt(N) in each dimension; the cast makes the
	// floating-point sqrt overload explicit for the integer N.
	const double side = sqrt((double)N);
	dim3 grid ((int)ceil(side/tile_x), (int)ceil(side/tile_y));
	dim3 threads (thd);

	unsigned int timer = 0;
	float time;
	cutilCheckError (cutCreateTimer (&timer));

	//Warm-up (not timed)
	for (int i = 0; i < 10; i++) {
		deinterlace_launch (n, grid, threads, d_odata, d_idata, N);
		cutilSafeCall (cudaThreadSynchronize());
	}
	// Surface launch-configuration errors before spending time on the timed runs.
	cutilCheckMsg("De-interlace Kernel execution failed");

	// Clear the output so the data copied back comes from the timed runs only.
	cutilSafeCall (cudaMemset(d_odata, 0, mem_size));
	cutilSafeCall (cudaThreadSynchronize());

	cutilCheckError (cutResetTimer (timer));
	cutilCheckError (cutStartTimer (timer));
	for (int i = 0; i < it; i++) {
		deinterlace_launch (n, grid, threads, d_odata, d_idata, N);
		// Launches are asynchronous; wait so the host timer covers the execution.
		cutilSafeCall (cudaThreadSynchronize());
	}
	cutilCheckError (cutStopTimer (timer));
	time = cutGetTimerValue (timer);

	// Check for kernel errors before the results are copied back and the buffers freed.
	cutilCheckMsg("De-interlace Kernel execution failed");

	cutilSafeCall (cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost));

	cutilCheckError (cutDeleteTimer (timer));
	cutilSafeCall (cudaFree(d_idata));
	cutilSafeCall (cudaFree(d_odata));

	return time;
}

//-------------------------------------------------------------------------------

/*
 * Benchmark a plain device-to-device cudaMemcpy of mem_size bytes, used by main
 * as the reference figure printed next to the de-interlace kernel.
 *
 * h_odata, h_idata, n, N : accepted so the signature parallels deinterlace_call;
 *                          they are not used by this function.
 * mem_size               : number of bytes copied per iteration.
 * it                     : number of timed copies.
 *
 * Returns the total elapsed time of the `it` copies as reported by
 * cutGetTimerValue (main treats this value as milliseconds).
 * Any CUDA/cutil failure is reported through the cutil checking macros.
 */
float memcpy_call (float* h_odata, float* h_idata, int n, unsigned long int N, unsigned long int mem_size, int it)
{
	// Unused; kept only for signature parity with deinterlace_call.
	(void) h_odata;
	(void) h_idata;
	(void) n;
	(void) N;

	float *d_idata = NULL, *d_odata = NULL;
	cutilSafeCall (cudaMalloc((void**) &d_idata, mem_size));
	cutilSafeCall (cudaMalloc((void**) &d_odata, mem_size));

	unsigned int timer = 0;
	float time;

	cutilCheckError (cutCreateTimer (&timer));

	// Give both buffers defined contents so the copy does not read
	// uninitialized device memory.
	cutilSafeCall (cudaMemset(d_idata, 0, mem_size));
	cutilSafeCall (cudaMemset(d_odata, 0, mem_size));
	cutilSafeCall (cudaThreadSynchronize());

	cutilCheckError (cutResetTimer (timer));
	cutilCheckError (cutStartTimer (timer));
	for (int i = 0; i < it; i++) {
		cutilSafeCall (cudaMemcpy(d_odata, d_idata, mem_size, cudaMemcpyDeviceToDevice));

		// Wait for the copy to finish so the host timer covers it.
		cutilSafeCall (cudaThreadSynchronize());
	}
	cutilCheckError (cutStopTimer (timer));
	time = cutGetTimerValue (timer);

	// Check for errors before the buffers are released.
	cutilCheckMsg("Memcpy Kernel execution failed");

	cutilCheckError (cutDeleteTimer (timer));
	cutilSafeCall (cudaFree(d_idata));
	cutilSafeCall (cudaFree(d_odata));

	return time;
}

//-------------------------------------------------------------------------------

/*
 * Benchmark driver: builds a host input of n arrays with Ni elements each,
 * times the de-interlace kernel and a device-to-device memcpy over the same
 * number of bytes, and prints both as throughput figures.
 *
 * Returns 0 on success, EXIT_FAILURE if the device cannot be selected or the
 * host buffers cannot be allocated.
 */
int main (int argc, char** argv)
{
	if (cudaSetDevice ( 0 ) != cudaSuccess) {
		fprintf (stderr, "Failed to select CUDA device 0\n");
		return EXIT_FAILURE;
	}

	int n = 3, Ni = 4096*4096; // Number of arrays to be interlaced or split into, and elements per array

	// Widen before multiplying so the product is not computed in int.
	unsigned long int N_o = (unsigned long int)Ni * (unsigned long int)n; // Number of elements in interlaced array
	unsigned long int N = N_o/n; // Number of elements in each individual array

	unsigned long int mem_size = N_o * sizeof(float);

	// Data volume per run: N_o floats read plus N_o floats written.
	float gb = (((float)N_o + (float)N_o) * sizeof(float)/ 1e9);

	float *h_idata   = (float*)malloc(mem_size);
	float *h_odata   = (float*)malloc(mem_size);
	if (h_idata == NULL || h_odata == NULL) {
		fprintf (stderr, "Failed to allocate %lu bytes of host memory\n", mem_size);
		free(h_odata);
		free(h_idata);
		return EXIT_FAILURE;
	}

	// Fill n consecutive segments of N elements, each holding the values 0 .. N-1.
	for (unsigned long int i = 0; i < N; i++) {
		for (int k = 0; k < n; k++)
			h_idata[(k*N)+i] = (float)i;
	}

	int it = 100; // Timed iterations per benchmark

	//-------------------------------------------------------------------------
	float time0, time;

	time  = deinterlace_call (h_odata, h_idata, n, N, mem_size, it);

	time0  = memcpy_call (h_odata, h_idata, n, N, mem_size, it);
	//-------------------------------------------------------------------------

	// (time / it) is the per-iteration time; the 1e-3 factor converts it to seconds.
	printf ("De-intrlacing into");
	printf("%2d arrays(%5.4fGb)      Kernel-%8.3f(Gb/s)      Memcpy-%8.3f(Gb/s)       \n", n, gb, gb/((time / it) * 1e-3), gb/((time0 / it) * 1e-3));

	//-------------------------------------------------------------------------

	free(h_odata);
	free(h_idata);

	cudaThreadExit();

	return 0;
}