#ifndef _CUDA_KERNEL_H_
#define _CUDA_KERNEL_H_
#include <cutil.h>
#include <stdio.h>
#include <math.h>


// Texture reference for a 1D texture whose elements are uchar4 (four unsigned
// chars per fetch). cudaReadModeElementType means fetches return the element
// type itself rather than a converted value.
texture<uchar4, 1, cudaReadModeElementType> input_texture1;


// Wraps an access to `sdata` at `index` in CUT_BANK_CHECKER.
// NOTE(review): neither a declaration of `sdata` nor any use of SDATA appears in
// the visible code; CUT_BANK_CHECKER is presumably supplied by cutil.h -- confirm.
#define SDATA( index)      CUT_BANK_CHECKER(sdata, index)


// Conditionally stores *result into results_array[result_index].
// A zero flag makes the call a no-op; any non-zero flag performs the store.
__device__ void write_result(unsigned int flag, uint4 * result, unsigned int result_index, uint4 * results_array)
{
	// Nothing to publish when the flag is clear.
	if (flag == 0)
	{
		return;
	}

	uint4 * const destination = results_array + result_index;
	*destination = *result;
}


/*
 * Scans a per-thread range of input_texture1 and, for every fetched texel whose
 * first byte (ch.x) is 'A', writes a uint4 record to output_data[0].
 *
 * Parameters:
 *   tex_id      - texture selector; the texture is only fetched when tex_id == 1.
 *                 For any other value no texel is read and nothing is written.
 *   index_range - per-thread range; the loop runs while bytes_processed
 *                 (advanced by 4 per texel) is below this value.
 *   output_data - destination array; only element 0 is written by this kernel.
 *
 * Record layout written on a match:
 *   x = blockDim.x, y = 3 (constant), z = threadIdx.x,
 *   w = start_index + bytes_processed at the time of the match.
 *
 * Fix relative to the previous version: `flag` (and `ch` when tex_id != 1) used
 * to be read before ever being assigned. Reading an indeterminate value is
 * undefined, so the output depended on how the compiler happened to optimise the
 * surrounding code, which likely explains why adding or removing constants or
 * the tex_id test changed the results. All locals are now initialised before use.
 *
 * NOTE(review): every matching thread writes to output_data[0], so with several
 * matches the surviving record is whichever store lands last -- confirm that a
 * single shared slot is intended.
 * NOTE(review): input_index advances by one texel per iteration while
 * bytes_processed advances by 4, yet start_index is scaled by index_range in
 * texel units. Threads therefore start index_range texels apart but each visits
 * only about index_range / 4 texels -- TODO confirm the intended units.
 */
__global__ void my_kernel(unsigned int tex_id, unsigned short index_range, uint4 * output_data) 
{
	// First texel index handled by this thread (same value as
	// blockIdx.x * blockDim.x * index_range + threadIdx.x * index_range).
	const unsigned int start_index = (blockIdx.x * blockDim.x + threadIdx.x) * index_range;

	// Record to publish on a match; zeroed so it never holds indeterminate data.
	uint4 result;
	result.x = 0;
	result.y = 0;
	result.z = 0;
	result.w = 0;

	// Most recently fetched texel; zeroed so that an unselected texture
	// (tex_id != 1) can never compare equal to 'A'.
	uchar4 ch;
	ch.x = 0;
	ch.y = 0;
	ch.z = 0;
	ch.w = 0;

	// Current texel index into the texture.
	unsigned int input_index = start_index;

	// Walk this thread's range, one uchar4 texel (4 bytes) per iteration.
	for (unsigned int bytes_processed = 0; bytes_processed < index_range; bytes_processed += 4)
	{
		// Set only when this iteration's texel starts with 'A'. Initialising it
		// here (instead of resetting it after the write) guarantees it is defined
		// on the very first iteration as well.
		unsigned int flag = 0;

		// Fetch the next texel from the selected texture.
		if (tex_id == 1)
		{
			ch = tex1Dfetch(input_texture1,input_index);
		}

		// Expected input stream: AABBCzzzz...AABBCzzzz... ; an 'A' in the first
		// byte of the texel triggers a result record.
		if (ch.x == 'A')
		{
			result.x = blockDim.x;
			result.y = 3;	// constant kept from the original; blockIdx.x was the author's commented-out alternative
			result.z = threadIdx.x;
			result.w = start_index + bytes_processed;
			flag = 1;
		}

		// Stores the record to output_data[0] only when flag is set.
		write_result(flag,&result,0,output_data);

		// Advance to the next texel.
		input_index++;
	}
}

#endif