Hi,
i have a weird bug when writing to gpu memory, for some reason the data isn’t written to the address
i specify. I can’t find any reason why shouldn’t it, so i would like to ask if there are some logical
issues in the code.
The kernel i am running is this:
__constant__ gpu_mem_t d_gpu_mem;
extern "C" __global__ void cppn_init_activation() { // run: one thread per X Y pair
net_pack_t *cp;
cp=(net_pack_t*) d_gpu_mem.cppn_base;
cp->new_links=(uint) cp->nvalues;
cp->nvalues[2]=2.0f;
cp->nvalues[3]=3.0f;
cp->nvalues[4]=4.0f;
}
very simple. The d_gpu_mem constant has the address to net_pack_t struct and it is declared like this:
typedef struct net_pack_t {
uint nodes_total;
uint links_total;
ushort num_inputs;
ushort num_outputs;
link_weight_t min_weight;
uint links_max;
uint nodes_max;
byte net_creation_result;
uint new_links;
uint deepest_level;
char *data_base_addr;
ulong data_len;
link_weight_t *lweights;
node_num_t *srcs;
net_id_t *net_ids;
node_num_t *dsts;
node_value_t *nvalues;
node_value_t *nmirror;
node_func_t *nfuncs;
net_table_t *nets_table;
inlinks_table_t *inlinks;
unsigned char *depth_status;
node_value_t *depth_levels;
node_value_t *inputs;
node_value_t *outputs;
} net_pack_t;
The buggy code i am executing is this:
device_ptr=(CUdeviceptr) gi->d_gpu_mem.cppn_base;
cucall(cuMemcpyDtoH(&np_tmp,device_ptr,sizeof(net_pack_t)));
printf("nvalues on gpu=%d\n",np_tmp.nvalues);
device_ptr=(CUdeviceptr) np_tmp.nvalues;
printf("(before kernel call) checking values on gpu at device_ptr=%d\n",device_ptr);
cucall(cuMemcpyDtoH(&float_arr,device_ptr,sizeof(float)*7));
for(i=0;i<7;i++) printf("f[%d]=%f ",i,float_arr[i]);
printf("\n");
cucall(cuParamSetSize(cuf_cppn_init_activation,0));
cucall(cuFuncSetBlockShape(cuf_cppn_init_activation,1,1,1));
cucall(cuLaunchGrid(cuf_cppn_init_activation,1,1));
printf("after kernel call\n");
device_ptr=(CUdeviceptr) gi->d_gpu_mem.cppn_base;
cucall(cuMemcpyDtoH(&np_tmp,device_ptr,sizeof(net_pack_t)));
printf("nvalues on gpu=%d\n",np_tmp.nvalues);
printf("new_links=%d\n",np_tmp.new_links);
device_ptr=(CUdeviceptr) np_tmp.nvalues;
printf("copying data from gpu at device_ptr=%d\n",device_ptr);
cucall(cuMemcpyDtoH(&float_arr,device_ptr,sizeof(float)*7));
for(i=0;i<7;i++) printf("f[%d]=%f ",i,float_arr[i]);
printf("\n");
This is the output when the program is run:
nvalues on gpu=2174866
(before kernel call) checking values on gpu at device_ptr=2174866
f[0]=0.000000 f[1]=0.000000 f[2]=0.000000 f[3]=0.000000 f[4]=0.000000 f[5]=0.000000 f[6]=0.000000
after kernel call
nvalues on gpu=2174866
new_links=2174866
copying data from gpu at device_ptr=2174866
f[0]=0.000000 f[1]=0.000000 f[2]=0.000000 f[3]=0.000000 f[4]=0.000000 f[5]=0.000000 f[6]=0.000000
Now for some reason, the writes that the kernel does do not happen. Why could it be?
I am verifying the destination addres and it mantains its value at 2174866 all the time,
so what other reasons could there be for the GPU not writing values where it should ?
The GPU is GTX280, cuda 3.0beta, OS Gentoo lastest
Thanks in advance
p.s.
this is the ptx of the kernel
.version 1.4
.target sm_13
// compiled with /usr/local/cuda/open64/lib//be
// nvopencc 3.0 built on 2009-10-26
//-----------------------------------------------------------
// Compiling /tmp/tmpxft_00000e73_00000000-7_gpu.cpp3.i (/tmp/ccBI#.uIafJQ)
//-----------------------------------------------------------
//-----------------------------------------------------------
// Options:
//-----------------------------------------------------------
// Target:ptx, ISA:sm_13, Endian:little, Pointer Size:64
// -O3 (Optimization level)
// -g0 (Debug level)
// -m2 (Report advisories)
//-----------------------------------------------------------
.const .align 8 .b8 d_gpu_mem[96];
.entry cppn_init_activation
{
.reg .u32 %r<3>;
.reg .u64 %rd<7>;
.reg .f32 %f<5>;
.loc 17 1226 0
$LBB1_cppn_init_activation:
.loc 18 10 0
ld.const.u64 %rd1, [d_gpu_mem+48];
ld.global.u64 %rd2, [%rd1+88];
cvt.u32.u64 %r1, %rd2;
st.global.u32 [%rd1+32], %r1;
.loc 18 22 0
mov.f32 %f1, 0f40000000; // 2
ld.global.u64 %rd3, [%rd1+88];
st.global.f32 [%rd3+8], %f1;
.loc 18 23 0
mov.f32 %f2, 0f40400000; // 3
ld.global.u64 %rd4, [%rd1+88];
st.global.f32 [%rd4+12], %f2;
.loc 18 24 0
mov.f32 %f3, 0f40800000; // 4
ld.global.u64 %rd5, [%rd1+88];
st.global.f32 [%rd5+16], %f3;
.loc 18 25 0
exit;
$LDWend_cppn_init_activation:
} // cppn_init_activation