#include <stdio.h>
#include <cuda.h>
#include <math.h>

// includes, project
#include <cutil.h>

// includes, kernels
#include <tmp_kernel.cu>

// Copies numBodies * 4 floats from a device buffer into a host buffer.
//
//   host      - destination buffer in host memory
//   device    - source buffer in device memory
//   pbo       - unused by this implementation; kept so existing callers
//               continue to compile
//   numBodies - number of 4-float records to copy
void copyArrayFromDevice(float* host, 
                         const float* device, 
                         unsigned int pbo, 
                         int numBodies)
{       
    (void)pbo;  // intentionally unused

    // Widen to size_t before multiplying: the original computed
    // numBodies*4 in int, which overflows for large body counts.
    const size_t numBytes = (size_t)numBodies * 4 * sizeof(float);

    CUDA_SAFE_CALL(cudaMemcpy(host, device, numBytes,
                              cudaMemcpyDeviceToHost));    
}

// Copies numBodies * 4 floats from a host buffer into a device buffer.
//
//   device    - destination buffer in device memory
//   host      - source buffer in host memory
//   numBodies - number of 4-float records to copy
void copyArrayToDevice(float* device, const float* host, int numBodies)
{
    // Widen to size_t before multiplying: the original computed
    // numBodies*4 in int, which overflows for large body counts.
    const size_t numBytes = (size_t)numBodies * 4 * sizeof(float);

    CUDA_SAFE_CALL(cudaMemcpy(device, host, numBytes,
                              cudaMemcpyHostToDevice));
}

// Blocks the host until previously issued device work has completed.
// The status is routed through CUDA_SAFE_CALL, like every other runtime call
// in this file, instead of being silently discarded.
// NOTE(review): cudaThreadSynchronize is deprecated in newer CUDA toolkits in
// favour of cudaDeviceSynchronize; kept here to match the cutil-era toolchain
// this file builds against.
void threadSync() { CUDA_SAFE_CALL(cudaThreadSynchronize()); }

// Fills the first `size` entries of `data` with the constant 0.001.
// Despite the name, no random numbers are generated and nothing is
// allocated; a non-positive `size` leaves `data` untouched.
void randomInit(double* data, int size)
{
    const double fillValue = 0.001;
    double* cursor = data;

    for (int remaining = size; remaining > 0; --remaining)
    {
        *cursor = fillValue;
        ++cursor;
    }
}



//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
// Host driver: prepares the best_fit vector and the observation data on the
// device, launches compute_fit once, and copies best_fit back to the host.
// Returns 0 after the run, or 1 if the host allocation fails.
int
main( int argc, char** argv) 
{
    CUT_DEVICE_INIT(argc, argv);

    unsigned int size_best_fit = 18;
    unsigned int mem_size_best_fit = sizeof(double) * size_best_fit;

    // allocate host memory for vector best_fit
    double* best_fit = (double *)malloc(mem_size_best_fit);

    // Check the allocation BEFORE the buffer is written or copied; the
    // original only tested it after randomInit and cudaMemcpy had used it.
    if(best_fit == NULL)
    {
        printf("ERROR");
        return 1;   // report failure to the caller instead of success
    }

    // initialize host memory
    randomInit(best_fit, size_best_fit);

    // allocate device memory
    double* d_best_fit;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_best_fit, mem_size_best_fit));

    // copy host memory to device
    CUDA_SAFE_CALL(cudaMemcpy(d_best_fit, best_fit, mem_size_best_fit,
                              cudaMemcpyHostToDevice) );

    // Setup fit parameters. Each group is a count plus two doubles that are
    // forwarded unchanged to compute_fit (their meaning is defined by the
    // kernel in tmp_kernel.cu).
    int n_Ri = 10;
    double a_Ri = 1e-4;
    double b_Ri = 1e-6;

    int n_Ra = 10;
    double a_Ra = 1e-6;
    double b_Ra = 1e-8;

    int n_u = 10;
    double a_u = 0.1;
    double b_u = 0.3;

    int n_f = 10;
    double a_f = 0.01;
    double b_f = 1;

    int n_F = 10;
    double a_F = 0.01;
    double b_F = 1;

    double amax = 24;
    double OBS_actual[18] = {
        0, 0, 0,
        0.3774698797909421, 0.9295964152018034, 2.2888179669638697,
        4.9572477946528934, 10.060737092639414, 19.212700771497765,
        32.64962221028801, 50.602760882723594, 72.15450919845851,
        91.01944190848369, 111.51973488772018, 127.85040140587097,
        141.27914970726457, 144.9363176965305, 114.84572391087971
    };

    // OBS_actual lives on the host stack; a kernel cannot dereference a host
    // pointer, so stage the data in device memory and pass that instead.
    double* d_OBS_actual;
    CUDA_SAFE_CALL(cudaMalloc((void**) &d_OBS_actual, sizeof(OBS_actual)));
    CUDA_SAFE_CALL(cudaMemcpy(d_OBS_actual, OBS_actual, sizeof(OBS_actual),
                              cudaMemcpyHostToDevice) );

    // Compute execution configuration.
    // One block holds n_f x n_F threads (100 here); this product must stay
    // within the device's per-block thread limit if the counts are changed.
    dim3 dimBlock(n_f,n_F);

    int nBlocks_x = n_Ri*n_u;
    int nBlocks_y = n_Ra;
    dim3 dimGrid( nBlocks_x, nBlocks_y);

    // execute the kernel
    compute_fit<<< dimGrid,dimBlock >>>(n_Ri, a_Ri, b_Ri, n_Ra, a_Ra, b_Ra, n_u, a_u, b_u, n_f, a_f, b_f, n_F, a_F, b_F, d_OBS_actual, amax, d_best_fit);

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // copy result from device to host
    CUDA_SAFE_CALL(cudaMemcpy(best_fit, d_best_fit, mem_size_best_fit,
                              cudaMemcpyDeviceToHost) );

    // clean up memory
    free(best_fit);
    CUDA_SAFE_CALL(cudaFree(d_OBS_actual));
    CUDA_SAFE_CALL(cudaFree(d_best_fit));

    return 0;
}