Is this a bug? magic = ok, variable = error

Hi there,
I have isolated what appears to be a compiler optimization error whereby, if variables are used to build a result of type uint4, the output the host receives is zero.
However, if ANY of these variables is changed to a magic number, all results the host receives are correct.
I have also subsequently discovered that this ONLY occurs when performing a conditional test on a kernel input parameter (even if that parameter is constant).

The theory that led to this is using a texture ID to determine which texture reference to take input from (so that stream i can work on data i bound to texture reference i); the references were statically declared, since it appears you cannot have an array of texture references.
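
To illustrate the approach (a rough sketch only; fetch_from and input_texture2 are placeholder names and are not part of the attached code), the idea was along these lines:

// Sketch: one statically declared texture reference per stream, with the
// kernel switching on tex_id because texture references cannot be put in an array.
texture<uchar4, 1, cudaReadModeElementType> input_texture1;
texture<uchar4, 1, cudaReadModeElementType> input_texture2;

__device__ uchar4 fetch_from(unsigned int tex_id, unsigned int idx)
{
	uchar4 v = make_uchar4(0, 0, 0, 0);	// fallback if tex_id is unrecognised
	switch (tex_id)
	{
		case 1: v = tex1Dfetch(input_texture1, idx); break;
		case 2: v = tex1Dfetch(input_texture2, idx); break;
	}
	return v;
}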

Any help on this, including possible explanations or solutions, is very welcome.
Being limited to one texture (albeit 128 MB in linear memory) at a time significantly undermines the capability of the hardware, especially when asynchronous copies are wanted.

Thanks in advance,
Michael.

P.S. There are comments in the kernel to help replicate the error.

Source code for the host and kernel is pasted below:

SOURCE CODE FOR HOST

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>

// includes, kernels
#include "cuda_kernel.cu"

////////////////////////////////////////////////////////////////////////////////
// prototypes
void allocate_host(void ** data, int size);
void allocate_dev(void ** data, int size);
void error_check(cudaError_t result);
void load_input_data_to_GPU_tex_mem(unsigned char * in_array, void * dev_ptr, cudaChannelFormatDesc * cd, texture<uchar4, 1, cudaReadModeElementType> * tex, unsigned int size);
void setup_exec_config_and_run_kernel(int num_threads, int num_blocks, uint4 * output, unsigned int index_range, unsigned int tex_id);
void gen_char_data(unsigned char * data, unsigned int size);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main( int argc, char** argv)
{

//initialise CUDA device	
CUT_DEVICE_INIT(argc, argv);	
    

/******************* initialise input data parameters ****************************/
    
const unsigned int N = 1*1024*1024; 
const unsigned int num_threads = 512;  
const unsigned int index_range = 256;
const unsigned int divisor = num_threads * index_range;
const unsigned int num_blocks = N / divisor + (N%divisor == 0 ? 0:1);    
   

/***************** allocate memory on host for input *******************************/
       
unsigned char * host_input;   
allocate_host((void **) &host_input,N);
   
   
/***************** allocate memory on device for input *****************************/
   
unsigned char * dev_input;
allocate_dev((void **) &dev_input, N);

     
/***************** allocate memory on host for output ******************************/

const unsigned int output_size = N*sizeof(uint4);
void * host_output;
allocate_host((void **) &host_output,output_size);
    
   
/***************** allocate memory on device for output ****************************/

uint4 * dev_output;
allocate_dev((void **) &dev_output,output_size);   


/***************** setup texture memory and read format *********************/
      
input_texture1.addressMode[0] = cudaAddressModeClamp;
input_texture1.addressMode[1] = cudaAddressModeClamp;
input_texture1.filterMode = cudaFilterModePoint;
input_texture1.normalized = 0;

    
//channel format description parameters x, y, z and w specify the number of bits for each component of the resulting texture fetch
//this allows the 4-byte fetch to be split into 1, 2, 3 or 4 individual components which can be independently addressed
cudaChannelFormatDesc input_cd = cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
    

/***************** setup test procedure *******************************************/

gen_char_data(host_input,N);

load_input_data_to_GPU_tex_mem(host_input,dev_input,&input_cd,&input_texture1,N);

//setup execution parameters and run kernel	
setup_exec_config_and_run_kernel(num_threads,num_blocks,dev_output,index_range,1);	    				


cudaError_t result = CUDA_SAFE_CALL(cudaMemcpy(host_output,dev_output,output_size,cudaMemcpyDeviceToHost));
error_check(result);


/******************** Test procedure complete ***********************/



/******************** Output results ********************************/ 	

uint4 * res = (uint4 *) host_output;

for (int i=0; i<5; i++)
{
	printf("----------------------\n");
	printf("index %d:\n",i);
	printf("x: %u\n", res[i].x);
	printf("y: %u\n",res[i].y);
	printf("z: %u\n",res[i].z);
	printf("w: %u\n",res[i].w);
	printf("----------------------\n");
}
  	
  	
/******************** cleanup *****************************************/

CUDA_SAFE_CALL(cudaFreeHost(host_output));
CUDA_SAFE_CALL(cudaFree(dev_output));


CUT_EXIT(argc, argv);

}

/*
 * error_check
 * Test 'result' and report either success or failure.
 * In the event of failure, CUDA-specific error information is also given.
 */

void error_check(cudaError_t result)
{
if (result == cudaSuccess)
{
// printf("success\n");
}
else if (result == cudaErrorInvalidValue)
{
printf("Invalid value\n");
exit(1);
}
else if (result == cudaErrorInvalidDevicePointer)
{
printf("Invalid device pointer\n");
exit(1);
}
else if (result == cudaErrorInvalidTexture)
{
printf("invalid texture\n");
exit(1);
}
else if (result == cudaErrorInvalidMemcpyDirection)
{
printf("Invalid copy direction\n");
exit(1);
}
else
{
printf("%s\n",cudaGetErrorString(result));
exit(1);
}
}

/*
 * allocate_host
 * Allocates 'size' bytes of page-locked host memory at the address pointed to by 'data'.
 * Initialises all locations to 0.
 */

void allocate_host(void ** data, int size)
{
cudaError_t result = CUDA_SAFE_CALL(cudaMallocHost(data,size));
error_check(result);
memset(*data,0,size);
}

/*
 * allocate_dev
 * Allocates 'size' bytes of device memory at the address pointed to by 'data'.
 * Reports success or failure with any relevant CUDA-specific information.
 */

void allocate_dev(void ** data, int size)
{
cudaError_t result = CUDA_SAFE_CALL(cudaMalloc(data, size));
error_check(result);
cudaMemset(*data,0,size);
}

void setup_exec_config_and_run_kernel(int num_threads, int num_blocks, uint4 * output, unsigned int index_range, unsigned int tex_id)
{
dim3 dimBlock(num_threads,1,1);
dim3 dimGrid(num_blocks,1,1);

my_kernel<<<dimGrid,dimBlock>>>(tex_id, index_range, output);
CUT_CHECK_ERROR("Kernel execution Failed");    

}

void load_input_data_to_GPU_tex_mem(unsigned char * in_array, void * dev_ptr, cudaChannelFormatDesc* cd, texture<uchar4, 1, cudaReadModeElementType> *tex, unsigned int size)
{
if (in_array == NULL)
{
printf("Input data array is NULL\n");
exit(1);
}
if (dev_ptr == NULL)
{
printf("Device pointer points to NULL\n");
exit(1);
}
if (tex == NULL)
{
printf("Texture reference points to NULL\n");
exit(1);
}

//copy memory to device memory
cudaError_t result = CUDA_SAFE_CALL(cudaMemcpy(dev_ptr,(void*) in_array,size,cudaMemcpyHostToDevice));
error_check(result);

//bind memory to texture reference
result = CUDA_SAFE_CALL(cudaBindTexture(NULL,tex,(void*) dev_ptr, cd, size));
error_check(result);	

}

void
gen_char_data(unsigned char * data, unsigned int size)
{

for (unsigned int i=0; i<size; i++)
{
	data[i] = 'z';	
}

for (unsigned int i=0; i<size; i+=500)
{
	data[i] = 'A';
	data[i+1] = 'A';
	data[i+2] = 'B';
	data[i+3] = 'B';
	data[i+4] = 'C';
}	

}


SOURCE CODE FOR KERNEL

#ifndef CUDA_KERNEL_H
#define CUDA_KERNEL_H
#include <cutil.h>
#include <stdio.h>
#include <math.h>

// declare texture reference for 1D unsigned char texture
texture<uchar4, 1, cudaReadModeElementType> input_texture1;

#define SDATA( index) CUT_BANK_CHECKER(sdata, index)

__device__ void write_result(unsigned int flag, uint4 * result, unsigned int result_index, uint4 * results_array)
{
if (flag)
{
results_array[result_index] = *result;
}
}

__global__ void my_kernel(unsigned int tex_id, unsigned short index_range, uint4 * output_data)
{

//setup index location for each thread
unsigned int start_index = blockIdx.x * blockDim.x * index_range + threadIdx.x * index_range;	
			
uint4 result;

//flag to indicate whether an 'A' has been detected
unsigned int flag = 0;
	
//current index location in texture
unsigned int input_index = start_index;

//4 bytes of data
uchar4 ch;

//track number of bytes processed
unsigned int bytes_processed = 0;

//while thread has not completed its range of input bytes to test...
while (bytes_processed < index_range)
{		
	//retrieve next input character set (uchar4)

/*
 * This test will always evaluate to true given the constant input from the host.
 * However, if this test is removed (leaving just "ch = tex1Dfetch(input_texture1, input_index);"), the output result
 * below is correct.
 */
if (tex_id == 1)
{
ch = tex1Dfetch(input_texture1,input_index);
}

/*
 * Input stream = AABBCzzzzzzzzzz...AABBCzzz... etc.
 * When an 'A' is detected, result should be initialised with the given values.
 */

	if (ch.x == 'A')
	{
		/*
		 * This set of values will always work, regardless of the test (tex_id == 1) condition above.
		 * Adding ANY constant term to ANY of the 4 fields (as in the '3' below) results in correct operation.
		 * HOWEVER, if ALL fields are based on ANY variable available to the kernel, operation fails (see the second comment).
		 */
		result.x = blockDim.x;
		result.y = 3;
		result.z = threadIdx.x;
		result.w = start_index + bytes_processed;
		flag = 1;
		
		/*
		 * This set will ONLY work when the test (tex_id == 1) is disabled.
		 * Compare with the test enabled to see the difference.
		 */

// result.x = blockDim.x;
// result.y = blockIdx.x;
// result.z = threadIdx.x;
// result.w = start_index + bytes_processed;
// flag = 1;

		/*
		 * General comment:
		 * Is this an issue with texture memory reading or compiler optimization (when no constants are used)?
		 * If you declare (statically) n textures and perform a switch test on tex_id, the results are only output correctly
		 * 		when a constant term is used in any field of 'result'.
		 * I hope this is enough information for a solution to be found (or at least a reason as to why this happens).
		 * Thanks.
		 */
	}
	
	write_result(flag,&result,0,output_data);	
	flag = 0;
	
	//update input index
	input_index++;
			
	//update progress through buffer
	bytes_processed+=4;
}	

}

#endif

It appears I can’t add the files as attachments…
Sorry for the inconvenience

Michael.

Change the extension to .txt and you can attach them.

If you really think it is a compiler problem I'd suggest comparing the generated PTX code, though to me it looks very suspicious that all threads write to the same location, and from what I can tell all-zeros is one of the values being written, though that alone does not really explain your results.
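
For illustration, here is a minimal sketch (a hypothetical kernel, not the attached one) of what per-thread output indexing looks like; if the symptom changes when each thread gets its own slot, the concurrent writes to index 0 are worth a closer look:

// Sketch only: every thread writes to its own element of output_data,
// so no two threads ever store to the same address.
__global__ void write_per_thread(uint4 * output_data)
{
	unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

	uint4 result;
	result.x = blockDim.x;
	result.y = blockIdx.x;
	result.z = threadIdx.x;
	result.w = tid;

	output_data[tid] = result;	// one slot per thread, no write race
}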

In addition to Reimar's suggestion, please confirm that this reproduces with the most recent CUDA toolkit, and document which OS(s) and GPU(s) you've seen this behavior under.

Also, you didn’t explain what the expected & actual output from your test app is under these conditions.

Thanks for your comments.
Sorry I didn't include the hardware information.
It is CUDA version 2 on Fedora 8 with dual Xeon CPUs.

As for the expected output, blockDim.x == 512, and I forget what gridDim is (but it is certainly not zero, which is the error value).
blockIdx and threadIdx are self-explanatory:
0 <= tid < blockDim.x
0 <= bid < gridDim.x

Thanks for the information about attachments, should be on this post now.
cuda.cu.txt (6.62 KB)
cuda_kernel.cu.txt (2.95 KB)

As for the PTX code, it shows the operations being performed correctly: the right registers copy the appropriate values into the appropriate structure.

The address of the results structure in host code is the same before and after computation (as it should be)

(full ptx for both variable and constant versions are attached)

Here are the relevant sections of the PTX code:

CONSTANT VERSION

---- setting values ----

// 81 result.x = blockDim.x;
mov.s32 	%r31, %r1;           	//
.loc	14	82	0
// 82 result.y = 3;
mov.u32 	%r32, 3;             	//
mov.s32 	%r33, %r32;          	//
.loc	14	83	0
// 83 result.z = threadIdx.x;
mov.s32 	%r34, %r4;           	//
.loc	14	84	0
// 84 result.w = start_index + bytes_processed;
add.u32 	%r35, %r7, %r18;     	//
mov.s32 	%r36, %r35;          	//

---- storing in global memory (write_result function) ----

mov.s32 	%r37, %r31;          	//
mov.s32 	%r38, %r33;          	//
mov.s32 	%r39, %r34;          	//
mov.s32 	%r40, %r36;          	//
st.global.v4.u32 	[%rd1+0], {%r37,%r38,%r39,%r40};	//

VARIABLE VERSION

---- setting values ----

// 92 result.x = blockDim.x;
mov.s32 	%r31, %r1;           	//
.loc	14	93	0
// 93 result.y = blockIdx.x;
mov.s32 	%r32, %r2;           	//
.loc	14	94	0
// 94 result.z = threadIdx.x;
mov.s32 	%r33, %r4;           	//
.loc	14	95	0
// 95 result.w = start_index + bytes_processed;
add.u32 	%r34, %r7, %r18;     	//
mov.s32 	%r35, %r34;          	//

---- storing in global memory ----

mov.s32 	%r36, %r31;          	//
mov.s32 	%r37, %r32;          	//
mov.s32 	%r38, %r33;          	//
mov.s32 	%r39, %r35;          	//
st.global.v4.u32 	[%rd1+0], {%r36,%r37,%r38,%r39};	//
cuda_kernel.ptx.variable.txt (7.25 KB)
cuda_kernel.ptx.constant.txt (7.46 KB)

EDIT: GPU is 8800GT

In addition,
The line below is taken from the kernel source code; the zero is the index at which the result is written.

write_result(flag,&result,0,output_data);

If this index is instead based on a global counter which undergoes an atomic increment on each pass, then NO situation from the initial example produces correct output. (The .w field of 'result' should always increase, in steps of 500 given the host code, since it is a count of the bytes processed.)

[i.e. atomicInc((unsigned int *)results_index, max_results_count), where results_index is a kernel parameter of type unsigned int * and max_results_count is simply output_size / sizeof(uint4)]
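
For clarity, the variant of write_result I mean is roughly the following (a sketch; write_result_atomic is a placeholder name, and global atomics need compute capability 1.1, i.e. -arch=sm_11, which the 8800GT supports):

// Sketch: atomicInc returns the previous counter value, which becomes this
// thread's slot; once the counter reaches max_results_count it wraps back to 0.
__device__ void write_result_atomic(unsigned int flag, uint4 * result,
                                    unsigned int * results_index,
                                    unsigned int max_results_count,
                                    uint4 * results_array)
{
	if (flag)
	{
		unsigned int slot = atomicInc(results_index, max_results_count);
		results_array[slot] = *result;
	}
}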

The only situation I can explain this with is if blocks are not executed in a linear fashion.
Is this the case?

There’s no guaranteed scheduling for blocks or threads.

Well, they are guaranteed to be scheduled, but the order in which they are scheduled is not guaranteed.
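
A small sketch (hypothetical code, needs -arch=sm_11 for the atomic) makes this visible: each block records the order in which it arrived, and that order generally does not match blockIdx.x.

// Sketch: thread 0 of each block takes a "ticket" from a global counter.
// Inspecting 'order' on the host shows blocks do not start in index order.
__global__ void record_block_order(unsigned int * counter, unsigned int * order)
{
	if (threadIdx.x == 0)
	{
		unsigned int ticket = atomicAdd(counter, 1);	// global arrival order
		order[blockIdx.x] = ticket;
	}
}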