#ifndef _CUDA_KERNEL_H_
#define _CUDA_KERNEL_H_

// NOTE(review): this copy of the file carried three #include directives whose
// header names were lost. Restore them here if code that includes this header
// relies on them. The definitions below only use CUDA built-ins (uint4,
// uchar4, texture<>, tex1Dfetch, blockIdx/blockDim/threadIdx), and the SDATA
// macro is never expanded in this file.

// Texture reference for the 1D input stream; the kernel fetches it one uchar4
// (4 bytes) at a time.
// NOTE(review): the template arguments were missing in this copy. They are
// reconstructed from the kernel, which stores tex1Dfetch() results in a
// uchar4 -- confirm against the host-side texture binding.
texture<uchar4, 1, cudaReadModeElementType> input_texture1;

#define SDATA( index) CUT_BANK_CHECKER(sdata, index)

/**
 * Conditionally store one result record.
 *
 * @param flag          non-zero to store the record, zero to do nothing
 * @param result        record to copy; only dereferenced when flag is non-zero
 * @param result_index  slot of results_array to overwrite
 * @param results_array destination array
 */
__device__ void write_result(unsigned int flag, uint4 * result,
                             unsigned int result_index, uint4 * results_array)
{
    if (flag) {
        results_array[result_index] = *result;
    }
}

/**
 * Scan this thread's slice of the input texture for the character 'A'.
 *
 * Each thread performs one uchar4 fetch per 4 bytes of index_range and tests
 * only the first byte (.x) of every fetched element. On a match it writes
 *   { blockDim.x, 3, threadIdx.x, start_index + bytes_processed }
 * to output_data[0].
 *
 * History: the earlier version declared `flag` without an initializer and
 * only assigned it inside the 'A' branch or after the first write_result()
 * call. Whenever the first fetched element did not start with 'A', an
 * indeterminate value was passed to write_result(). The symptoms recorded in
 * the original comments (output changing with the presence of the
 * `tex_id == 1` test, or with whether a constant such as 3 versus blockIdx.x
 * was stored in the result) are consistent with that undefined behavior
 * rather than with a texture-fetch problem. All locals are now initialized.
 *
 * @param tex_id       selects the texture to read; only the value 1
 *                     (input_texture1) is handled. For any other value no
 *                     fetch happens and nothing is written.
 * @param index_range  amount of input per thread, compared against a counter
 *                     that advances by 4 per fetch
 * @param output_data  result array; only element 0 is ever written
 *
 * NOTE(review): every thread that finds a match writes to output_data[0], so
 * concurrent matches overwrite one another -- confirm this is intended.
 */
__global__ void my_kernel(unsigned int tex_id, unsigned short index_range,
                          uint4 * output_data)
{
    // First position of this thread's slice.
    const unsigned int start_index =
        blockIdx.x * blockDim.x * index_range + threadIdx.x * index_range;

    // Defined defaults, so no path can read an indeterminate value.
    uint4 result = make_uint4(0u, 0u, 0u, 0u);
    uchar4 ch = make_uchar4(0, 0, 0, 0);

    // NOTE(review): start_index and index_range appear to count bytes (the
    // loop counter advances by 4 per fetch), whereas input_index advances by
    // one uchar4 element per fetch. Seeding input_index with start_index
    // mixes the two units -- check against the host-side layout whether
    // start_index / 4 was intended. Behavior is left unchanged here.
    unsigned int input_index = start_index;

    for (unsigned int bytes_processed = 0; bytes_processed < index_range;
         bytes_processed += 4, ++input_index) {
        // Fresh per-iteration flag: 0 unless this element starts with 'A'.
        unsigned int flag = 0;

        if (tex_id == 1) {
            ch = tex1Dfetch(input_texture1, input_index);
        }

        if (ch.x == 'A') {
            result.x = blockDim.x;
            result.y = 3;  // the original experiment also tried blockIdx.x here
            result.z = threadIdx.x;
            result.w = start_index + bytes_processed;
            flag = 1;
        }

        write_result(flag, &result, 0, output_data);
    }
}

#endif