// includes, system #include #include #include #include #include // includes, kernels #include //////////////////////////////////////////////////////////////////////////////// // prototypes void allocate_host(void ** data, int size); void allocate_dev(void ** data, int size); void error_check(cudaError_t result); void load_input_data_to_GPU_tex_mem(unsigned char * in_array, void * dev_ptr, cudaChannelFormatDesc * cd, texture * tex, unsigned int size); void setup_exec_config_and_run_kernel(int num_threads, int num_blocks, uint4 * output, unsigned int index_range, unsigned int tex_id); void gen_char_data(unsigned char * data, unsigned int size); //////////////////////////////////////////////////////////////////////////////// // Program main //////////////////////////////////////////////////////////////////////////////// int main( int argc, char** argv) { //initialise CUDA device CUT_DEVICE_INIT(argc, argv); /******************* initialise input data parameters ****************************/ const unsigned int N = 1*1024*1024; const unsigned int num_threads = 512; const unsigned int index_range = 256; const unsigned int divisor = num_threads * index_range; const unsigned int num_blocks = N / divisor + (N%divisor == 0 ? 0:1); /***************** allocate memory on host for input *******************************/ unsigned char * host_input; allocate_host((void **) &host_input,N); /***************** allocate memory on device for input *****************************/ unsigned char * dev_input; allocate_dev((void **) &dev_input, N); /***************** allocate memory on host for output ******************************/ const unsigned int output_size = N*sizeof(uint4); void * host_output; allocate_host((void **) &host_output,output_size); /***************** allocate memory on device for output ****************************/ uint4 * dev_output; allocate_dev((void **) &dev_output,output_size); /***************** setup texture memory and read format *********************/ input_texture1.addressMode[0] = cudaAddressModeClamp; input_texture1.addressMode[1] = cudaAddressModeClamp; input_texture1.filterMode = cudaFilterModePoint; input_texture1.normalized = 0; //channel format description parameters x, y, z and w specify the number of bits for each component of the resulting texture fetch //this allows ability to split fetch (4 bytes) into 1, 2, 3 or 4 individual components which can be independantly addressed cudaChannelFormatDesc input_cd = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned); /***************** setup test procedure *******************************************/ gen_char_data(host_input,N); load_input_data_to_GPU_tex_mem(host_input,dev_input,&input_cd,&input_texture1,N); //setup execution parameters and run kernel setup_exec_config_and_run_kernel(num_threads,num_blocks,dev_output,index_range,1); cudaError_t result = CUDA_SAFE_CALL(cudaMemcpy(host_output,dev_output,output_size,cudaMemcpyDeviceToHost)); error_check(result); /******************** Test procedure complete ***********************/ /******************** Output results ********************************/ uint4 * res = (uint4 *) host_output; for (int i=0; i<5; i++) { printf("----------------------\n"); printf("index %d:\n",i); printf("x: %u\n", res[i].x); printf("y: %u\n",res[i].y); printf("z: %u\n",res[i].z); printf("w: %u\n",res[i].w); printf("----------------------\n"); } /******************** cleanup *****************************************/ CUDA_SAFE_CALL(cudaFreeHost(host_output)); CUDA_SAFE_CALL(cudaFree(dev_output)); CUT_EXIT(argc, argv); } /* * error_check * * Test 'result' and report either success or failure. * In the event of failure, CUDA-specific error information is also given */ void error_check(cudaError_t result) { if (result == cudaSuccess) { // printf("success\n"); } else if (result == cudaErrorInvalidValue) { printf("Invalid value\n"); exit(1); } else if (result == cudaErrorInvalidDevicePointer) { printf("Invalid device pointer\n"); exit(1); } else if (result == cudaErrorInvalidTexture) { printf("invalid texture\n"); exit(1); } else if (result == cudaErrorInvalidMemcpyDirection) { printf("Invalid copy direction\n"); exit(1); } else { printf("%s\n",cudaGetErrorString(result)); exit(1); } } /* * allocate_host * * allocates memory required by 'size' bytes to start address 'data' * initialises all locations to 0 */ void allocate_host(void ** data, int size) { cudaError_t result = CUDA_SAFE_CALL(cudaMallocHost(data,size)); error_check(result); memset(*data,0,size); } /* * allocate_dev * * allocates memory required by 'size' bytes to start address 'data' * reports success or failure with any relevant CUDA-specific information */ void allocate_dev(void ** data, int size) { cudaError_t result = CUDA_SAFE_CALL(cudaMalloc(data, size)); error_check(result); cudaMemset(*data,0,size); } void setup_exec_config_and_run_kernel(int num_threads, int num_blocks, uint4 * output, unsigned int index_range, unsigned int tex_id) { dim3 dimBlock(num_threads,1,1); dim3 dimGrid(num_blocks,1,1); my_kernel<<>>(tex_id, index_range, output); CUT_CHECK_ERROR("Kernel execution Failed"); } void load_input_data_to_GPU_tex_mem(unsigned char * in_array, void * dev_ptr, cudaChannelFormatDesc* cd, texture *tex, unsigned int size) { if (in_array == NULL) { printf ("Input data array is NULL\n"); exit(1); } if (dev_ptr == NULL) { printf("Device pointer points to NULL\n"); exit(1); } if (tex == NULL) { printf("Texture reference points to NULL\n"); exit(1); } //copy memory to device memory cudaError_t result = CUDA_SAFE_CALL(cudaMemcpy(dev_ptr,(void*) in_array,size,cudaMemcpyHostToDevice)); error_check(result); //bind memory to texture reference result = CUDA_SAFE_CALL(cudaBindTexture(NULL,tex,(void*) dev_ptr, cd, size)); error_check(result); } void gen_char_data(unsigned char * data, unsigned int size) { for (unsigned int i=0; i