// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>


// includes, kernels
#include <cuda_kernel.cu>


////////////////////////////////////////////////////////////////////////////////
// prototypes
void allocate_host(void ** data, int size);
void allocate_dev(void ** data, int size);
void error_check(cudaError_t result);
void load_input_data_to_GPU_tex_mem(unsigned char * in_array, void * dev_ptr, cudaChannelFormatDesc * cd, texture<uchar4, 1, cudaReadModeElementType> * tex, unsigned int size);
void setup_exec_config_and_run_kernel(int num_threads, int num_blocks, uint4 * output, unsigned int index_range, unsigned int tex_id);
void gen_char_data(unsigned char * data, unsigned int size);


////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv) 
{

	//initialise CUDA device	
	CUT_DEVICE_INIT(argc, argv);	
	    
    
    /******************* initialise input data parameters ****************************/
        
    const unsigned int N = 1*1024*1024; 
    const unsigned int num_threads = 512;  
    const unsigned int index_range = 256;
    const unsigned int divisor = num_threads * index_range;
    const unsigned int num_blocks = N / divisor + (N%divisor == 0 ? 0:1);    
       
    
    /***************** allocate memory on host for input *******************************/
           
    unsigned char * host_input;   
    allocate_host((void **) &host_input,N);
       
       
    /***************** allocate memory on device for input *****************************/
       
    unsigned char * dev_input;
    allocate_dev((void **) &dev_input, N);
    
         
    /***************** allocate memory on host for output ******************************/
    
    const unsigned int output_size = N*sizeof(uint4);
    void * host_output;
    allocate_host((void **) &host_output,output_size);
        
       
    /***************** allocate memory on device for output ****************************/
    
    uint4 * dev_output;
    allocate_dev((void **) &dev_output,output_size);   
   
 	
    /***************** setup texture memory and read format *********************/
          
	input_texture1.addressMode[0] = cudaAddressModeClamp;
	input_texture1.addressMode[1] = cudaAddressModeClamp;
	input_texture1.filterMode = cudaFilterModePoint;
	input_texture1.normalized = 0;
	
	    
	//channel format description parameters x, y, z and w specify the number of bits for each component of the resulting texture fetch
    //this allows ability to split fetch (4 bytes) into 1, 2, 3 or 4 individual components which can be independantly addressed
    cudaChannelFormatDesc input_cd = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
        
    
    /***************** setup test procedure *******************************************/
    
    gen_char_data(host_input,N);

	load_input_data_to_GPU_tex_mem(host_input,dev_input,&input_cd,&input_texture1,N);
	
	//setup execution parameters and run kernel	
	setup_exec_config_and_run_kernel(num_threads,num_blocks,dev_output,index_range,1);	    				


	cudaError_t result = CUDA_SAFE_CALL(cudaMemcpy(host_output,dev_output,output_size,cudaMemcpyDeviceToHost));
	error_check(result);

 	
  	/******************** Test procedure complete ***********************/
  	
  	
  	/******************** Output results ********************************/ 	

	uint4 * res = (uint4 *) host_output;
	
    for (int i=0; i<5; i++)
    {
    	printf("----------------------\n");
    	printf("index %d:\n",i);
    	printf("x: %u\n", res[i].x);
    	printf("y: %u\n",res[i].y);
    	printf("z: %u\n",res[i].z);
    	printf("w: %u\n",res[i].w);
    	printf("----------------------\n");
    }
  	  	
  	  	
  	/******************** cleanup *****************************************/
  	
    CUDA_SAFE_CALL(cudaFreeHost(host_output));
    CUDA_SAFE_CALL(cudaFree(dev_output));

    
    CUT_EXIT(argc, argv);
}

/*
 * error_check
 * 
 * Test 'result' and report either success or failure.
 * In the event of failure, CUDA-specific error information is also given
 */

void error_check(cudaError_t result)
{
	if (result == cudaSuccess)
	{
//		printf("success\n");		
	}
	else if (result == cudaErrorInvalidValue)
	{
		printf("Invalid value\n");
		exit(1);
	}
	else if (result == cudaErrorInvalidDevicePointer)
	{
		printf("Invalid device pointer\n");
		exit(1);
	}
	else if (result == cudaErrorInvalidTexture)
	{
		printf("invalid texture\n");
		exit(1);
	}
	else if (result == cudaErrorInvalidMemcpyDirection)
	{
		printf("Invalid copy direction\n");
		exit(1);
	}
	else
	{		
		printf("%s\n",cudaGetErrorString(result));
		exit(1);
	}	
}

/*
 * allocate_host
 * 
 * allocates memory required by 'size' bytes to start address 'data'
 * initialises all locations to 0 
 */

void allocate_host(void ** data, int size)
{
	cudaError_t result = CUDA_SAFE_CALL(cudaMallocHost(data,size));	
	error_check(result);
	memset(*data,0,size);
}

/*
 * allocate_dev
 * 
 * allocates memory required by 'size' bytes to start address 'data'
 * reports success or failure with any relevant CUDA-specific information 
 */

void allocate_dev(void ** data, int size)
{	
	cudaError_t result = CUDA_SAFE_CALL(cudaMalloc(data, size));	
	error_check(result);
	cudaMemset(*data,0,size);
}


void setup_exec_config_and_run_kernel(int num_threads, int num_blocks, uint4 * output, unsigned int index_range, unsigned int tex_id)
{
	dim3 dimBlock(num_threads,1,1);
    dim3 dimGrid(num_blocks,1,1);
    
    my_kernel<<<dimGrid,dimBlock>>>(tex_id, index_range, output);
	CUT_CHECK_ERROR("Kernel execution Failed");    
}


void load_input_data_to_GPU_tex_mem(unsigned char * in_array, void * dev_ptr, cudaChannelFormatDesc* cd, texture<uchar4, 1, cudaReadModeElementType> *tex, unsigned int size)
{
	if (in_array == NULL)
	{
		printf ("Input data array is NULL\n");
		exit(1);
	}
	if (dev_ptr == NULL)
	{
		printf("Device pointer points to NULL\n");
		exit(1);
	}
	if (tex == NULL)
	{
		printf("Texture reference points to NULL\n");
		exit(1);
	}
	
	//copy memory to device memory
	cudaError_t result = CUDA_SAFE_CALL(cudaMemcpy(dev_ptr,(void*) in_array,size,cudaMemcpyHostToDevice));
	error_check(result);
	
	//bind memory to texture reference
	result = CUDA_SAFE_CALL(cudaBindTexture(NULL,tex,(void*) dev_ptr, cd, size));
	error_check(result);	

}

void
gen_char_data(unsigned char * data, unsigned int size)
{
	
	for (unsigned int i=0; i<size; i++)
	{
		data[i] = 'z';	
	}
	
	for (unsigned int i=0; i<size; i+=500)
	{
		data[i] = 'A';
		data[i+1] = 'A';
		data[i+2] = 'B';
		data[i+3] = 'B';
		data[i+4] = 'C';
	}	
}