The following kernel does nothing but reading out an [font=“Courier”]int2[/font]value from global memory, copy it into four register, add to each x-component a constant number (here it is 1,2,3 and 4) and write the results out to another location in global memory.
#include <GL/glew.h>
#include <cutil.h>
#define TYPE int2
__device__ TYPE* d_in;
__device__ TYPE* d_out;
__global__ void zefix(TYPE* g_in, TYPE* g_out) {
TYPE data0;
TYPE data1;
TYPE data2;
TYPE data3;
TYPE data = g_in[0];
// __syncthreads();
data0 = data;
data1 = data;
data2 = data;
data3 = data;
data0.x = data.x + 1;
data1.x = data.x + 2;
data2.x = data.x + 3;
data3.x = data.x + 4;
g_out[0] = data0;
g_out[1] = data1;
g_out[2] = data2;
g_out[3] = data3;
__syncthreads();
}
int main(int argc, char** argv) {
CUDA_SAFE_CALL(cudaMalloc((void**)&d_in, sizeof(TYPE)*4));
CUDA_SAFE_CALL(cudaMemset((void*)d_in, 0, sizeof(TYPE)*4));
CUDA_SAFE_CALL(cudaMalloc((void**)&d_out, sizeof(TYPE)*4));
CUDA_SAFE_CALL(cudaMemset((void*)d_out, 0, sizeof(TYPE)*4));
CUT_DEVICE_INIT();
cudaThreadSynchronize();
zefix <<< 1, 1 >>> (d_in, d_out);
cudaThreadSynchronize();
TYPE* host = (TYPE*)malloc(sizeof(TYPE) * 4);
CUDA_SAFE_CALL(cudaMemcpy(host, d_out, sizeof(
TYPE) * (4), cudaMemcpyDeviceToHost));
cudaThreadSynchronize();
for (int i = 0; i < 4; i++) {
printf("%d \n", host[i].x);
}
}
However the output is [font=“Courier”][1,5,3,7][/font] instead of [font=“Courier”][1,2,3,4][/font]. Of course, this does not occur in emulation mode. I tested it on two different 8800 GTS and one GTX and in all three cases the output is not as expected.
Inserting a [font=“Courier”]__syncthread()[/font] after the read, produces a correct output, however from my understanding this should not be necessary.
Then I found out that this behaviour does not occurs for scalar types, such as plain [font=“Courier”]int[/font] and [font=“Courier”]float![/font] Note that I have used [font=“Courier”]int2[/font] here and it is not working either with [font=“Courier”]float2[/font]. However it works with [font=“Courier”]int3[/font], [font=“Courier”]float3[/font], [font=“Courier”]int4[/font], and [font=“Courier”]float4[/font].
So I have checked the ptx code and in deed there is something weird when using 2-component types: those writes producing wrong results are performed using two separate write commands (one for each component), whereas the correct stores use just one write command.
So what am I doing wrong here? How can I make the compiler do it in one write instead of two? Did anyone encounter a similar problem?