possible bug with nvcc - char values in registers

I think I found a bug with nvcc but I wanted to make sure that I’m not missing something. I attached the whole code since I’m not sure yet if it’s the whole thing or just the last part. In the attached code I marked the problematic part. It’s the part where is thread transposes a 4x4 pixel block (in char format). With the marked syncthreads call the program puts int data[4] in local memory and everything works fine (calls to it are done using ld.local.s8). The same happens if I mark data as volatile. Without the syncthreads the output is mixed up (can’t tell exactly whats wrong but it seems that there is some permutation in the pixels). The code is accessed using mov.s32 instead of s8.

The code

global void TransposeImageKernelShared(char *in, size_t inStride,
char *out, size_t outStride, int width, int height)
{
extern shared char s_data;

int dataStride = blockDim.x*4;

// -------------------
// Read Data Row Major
// -------------------

int x0 = blockIdx.x*blockDim.x;
int y0 = blockIdx.y*blockDim.y;

int dx = threadIdx.x;
int dy = threadIdx.y;

// Read 4 rows of data into the shared memory
for (int i = 0 ; i < 4 ; i++)
*((int *)(s_data + (dy*4 + i)*dataStride) + dx) = *((int *)(in + ((y0 + dy)*4 + i)*inStride) + x0 + dx);

// ----
// Sync
// ----
__syncthreads();

// -----------------------
// Write Data Column Major
// -----------------------

dx = threadIdx.y;
dy = threadIdx.x;

==============================================
start of problematic part

int data[4];

for (int i = 0 ; i < 4 ; i++)
data[i] = *((int *)(s_data + (dy*4 + i)*dataStride) + dx);

for (int i = 0 ; i < 4 ; i++)
{
char4 val;

val.x = *((char *)&data[0] + i);
val.y = *((char *)&data[1] + i);
val.z = *((char *)&data[2] + i);
val.w = *((char *)&data[3] + i);

the call that changes the result

__syncthreads();

*((char4 *)(out + ((x0 + dx)*4 + i)*outStride) + y0 + dy) = val;
}

}


if it matters, The kernel is called by

// launch kernel
dim3 dimGrid(width/(16*4), height/(16*4));
// 4 ints wide by 16 lines long
dim3 dimBlock(16, 16);
// Size in bytes 16x16
int sharedMemSize = dimBlock.x*4*dimBlock.y*4;

TransposeImageKernelShared <<< dimGrid, dimBlock, sharedMemSize >>>
(in, inStride, out, outStride, width, height);

The relevant part of the ptx file WITHOUT the __syncthreads call

.loc	15	104	0
mov.s32 	%r38, 0;             	// 
mov.s32 	%r39, %r24;          	// 
and.b32 	%r40, %r39, 255;     	// 
mov.s32 	%r40, %r40;          	// 
or.b32 	%r41, %r40, 0;        	// 
.loc	15	105	0
mov.s32 	%r42, %r29;          	// 
and.b32 	%r43, %r41, -65281;  	// 
and.b32 	%r44, %r42, 255;     	// 
shl.b32 	%r44, %r44, 8;       	// 
or.b32 	%r41, %r43, %r44;     	// 
.loc	15	106	0
mov.s32 	%r45, %r33;          	// 
and.b32 	%r46, %r41, -16711681;	// 
and.b32 	%r47, %r45, 255;     	// 
shl.b32 	%r47, %r47, 16;      	// 
or.b32 	%r41, %r46, %r47;     	// 
.loc	15	107	0
mov.s32 	%r48, %r37;          	// 
and.b32 	%r49, %r41, 16777215;	// 
and.b32 	%r50, %r48, 255;     	// 
shl.b32 	%r50, %r50, 24;      	// 
or.b32 	%r41, %r49, %r50;     	// 

The relevant part of the ptx file WITH the __syncthreads call

.loc	15	104	0
mov.s32 	%r38, 0;             	// 
ld.local.s8 	%r39, [%rd54+0]; 	// id:99 __cuda___cuda_data_040+0x0
and.b32 	%r40, %r39, 255;     	// 
mov.s32 	%r40, %r40;          	// 
or.b32 	%r41, %r40, 0;        	// 
.loc	15	105	0
ld.local.s8 	%r42, [%rd54+4]; 	// id:100 __cuda___cuda_data_040+0x0
and.b32 	%r43, %r41, -65281;  	// 
and.b32 	%r44, %r42, 255;     	// 
shl.b32 	%r44, %r44, 8;       	// 
or.b32 	%r41, %r43, %r44;     	// 
.loc	15	106	0
ld.local.s8 	%r45, [%rd54+8]; 	// id:101 __cuda___cuda_data_040+0x0
and.b32 	%r46, %r41, -16711681;	// 
and.b32 	%r47, %r45, 255;     	// 
shl.b32 	%r47, %r47, 16;      	// 
or.b32 	%r41, %r46, %r47;     	// 
.loc	15	107	0
ld.local.s8 	%r48, [%rd54+12];	// id:102 __cuda___cuda_data_040+0x0
and.b32 	%r49, %r41, 16777215;	// 
and.b32 	%r50, %r48, 255;     	// 
shl.b32 	%r50, %r50, 24;      	// 
or.b32 	%r41, %r49, %r50;     	//