Help with a weird bug: the code doesn't write to GPU memory

Hi,

I have a weird bug when writing to GPU memory: for some reason the data isn't written to the address

I specify. I can't find any reason why it shouldn't be, so I would like to ask if there are any logical

issues in the code.

The kernel i am running is this:

__constant__ gpu_mem_t d_gpu_mem;

extern "C" __global__ void cppn_init_activation() { // run: one thread per X Y pair

	/* Debug kernel: publishes the device-side address of the nvalues array
	 * and plants three known marker values so the host can verify that
	 * device->host transfers read back what the kernel wrote. */
	net_pack_t *cp;

	cp=(net_pack_t*) d_gpu_mem.cppn_base;

	/* The target is 64-bit (the PTX reports Pointer Size:64), so converting
	 * a pointer straight to uint truncates the upper 32 bits.  Go through
	 * size_t first so the narrowing is explicit and warning-free; the host
	 * side must treat new_links as only the low 32 bits of the address.  If
	 * the full address is ever needed, new_links must become a 64-bit field. */
	cp->new_links=(uint)(size_t) cp->nvalues;

	/* Marker writes at indexes 2..4 -- the host dumps nvalues[0..6] before
	 * and after the launch and expects to see 2.0, 3.0, 4.0 appear here. */
	cp->nvalues[2]=2.0f;

	cp->nvalues[3]=3.0f;

	cp->nvalues[4]=4.0f;

}

very simple. The d_gpu_mem constant has the address to net_pack_t struct and it is declared like this:

typedef struct net_pack_t {
		/* NOTE(review): this struct is the host<->device ABI -- a device copy
		   lives behind d_gpu_mem.cppn_base and the host reads it back with
		   cuMemcpyDtoH, so the host compiler and nvcc MUST agree on every
		   field offset.  The mixed-size members below (ushort pair, a lone
		   byte before a uint, then pointers) make the layout very
		   padding-sensitive.  The gdb dump in this thread shows the copied
		   floats byte-shifted by one (2.0f = 0x40000000 read back as
		   0x00400000), which is consistent with a one-byte host/device
		   disagreement about the offset of nvalues -- TODO confirm with an
		   offsetof()/sizeof() cross-check (host vs. device) at startup. */

		uint nodes_total; 

		uint links_total;

		ushort num_inputs;

		ushort num_outputs;

		link_weight_t min_weight;	/* type declared elsewhere; its size shifts every later offset */

		uint links_max;

		uint nodes_max;

		byte net_creation_result;	/* lone byte: the padding inserted after it is ABI-dependent */

		uint new_links;			/* device PTX stores here at +32; also reused by cppn_init_activation to hold the low 32 bits of the nvalues pointer */

		uint deepest_level;

		char *data_base_addr;		/* pointers are 8 bytes on this target (PTX: Pointer Size:64) */

		ulong data_len;

		link_weight_t *lweights;

		node_num_t *srcs;

		net_id_t *net_ids;

		node_num_t *dsts;

		node_value_t *nvalues;		/* node activation values; device PTX loads this field at +88 */

		node_value_t *nmirror;

		node_func_t *nfuncs;

		net_table_t *nets_table;

		inlinks_table_t *inlinks;

		unsigned char *depth_status; 

		node_value_t *depth_levels; 

		node_value_t *inputs;

		node_value_t *outputs;

} net_pack_t;

The buggy code i am executing is this:

/* NOTE(review): debug sequence inside add_net() -- copies net_pack_t back
   from the device and dumps nvalues[0..6] before and after launching the
   cppn_init_activation kernel.  It relies on host and device agreeing on the
   net_pack_t layout; a padding mismatch silently shifts every field read. */
device_ptr=(CUdeviceptr) gi->d_gpu_mem.cppn_base;

			cucall(cuMemcpyDtoH(&np_tmp,device_ptr,sizeof(net_pack_t)));

			/* BUG(review): %d with a pointer argument is undefined behavior and
			   on a 64-bit host truncates to 32 bits -- use %p (cast to void*). */
			printf("nvalues on gpu=%d\n",np_tmp.nvalues);

			device_ptr=(CUdeviceptr) np_tmp.nvalues;

			/* NOTE(review): CUdeviceptr width is toolkit-dependent (32-bit
			   before CUDA 3.2) -- verify %d actually matches its size here. */
			printf("(before kernel call) checking values on gpu at device_ptr=%d\n",device_ptr);

			/* &float_arr and float_arr are the same address for an array, so
			   this works, though plain float_arr is the conventional spelling. */
			cucall(cuMemcpyDtoH(&float_arr,device_ptr,sizeof(float)*7));

			for(i=0;i<7;i++) printf("f[%d]=%f ",i,float_arr[i]);

			printf("\n");

			/* Kernel takes no parameters; launched as a single thread. */
			cucall(cuParamSetSize(cuf_cppn_init_activation,0));

			cucall(cuFuncSetBlockShape(cuf_cppn_init_activation,1,1,1));

			/* Launch is asynchronous, but the following cuMemcpyDtoH in the
			   same (default) stream serializes behind it, so no explicit
			   context synchronize should be required -- TODO confirm. */
			cucall(cuLaunchGrid(cuf_cppn_init_activation,1,1));

			printf("after kernel call\n");

			device_ptr=(CUdeviceptr) gi->d_gpu_mem.cppn_base;

			cucall(cuMemcpyDtoH(&np_tmp,device_ptr,sizeof(net_pack_t)));

			/* Same %d-on-pointer issue as above for both prints below. */
			printf("nvalues on gpu=%d\n",np_tmp.nvalues);

			printf("new_links=%d\n",np_tmp.new_links);

			device_ptr=(CUdeviceptr) np_tmp.nvalues;

			printf("copying data from gpu at device_ptr=%d\n",device_ptr);

			cucall(cuMemcpyDtoH(&float_arr,device_ptr,sizeof(float)*7));

			for(i=0;i<7;i++) printf("f[%d]=%f ",i,float_arr[i]);

			printf("\n");

This is the output when the program is run:

nvalues on gpu=2174866

(before kernel call) checking values on gpu at device_ptr=2174866

f[0]=0.000000 f[1]=0.000000 f[2]=0.000000 f[3]=0.000000 f[4]=0.000000 f[5]=0.000000 f[6]=0.000000

after kernel call

nvalues on gpu=2174866

new_links=2174866

copying data from gpu at device_ptr=2174866

f[0]=0.000000 f[1]=0.000000 f[2]=0.000000 f[3]=0.000000 f[4]=0.000000 f[5]=0.000000 f[6]=0.000000

Now for some reason, the writes that the kernel performs do not happen. Why could that be?

I am verifying the destination address and it maintains its value of 2174866 the whole time,

so what other reason could there be for the GPU not writing the values where it should?

The GPU is a GTX 280, CUDA 3.0 beta, OS: latest Gentoo

Thanks in advance

p.s.

this is the ptx of the kernel

.version 1.4

		.target sm_13

		// compiled with /usr/local/cuda/open64/lib//be

		// nvopencc 3.0 built on 2009-10-26

		//-----------------------------------------------------------

		// Compiling /tmp/tmpxft_00000e73_00000000-7_gpu.cpp3.i (/tmp/ccBI#.uIafJQ)

		//-----------------------------------------------------------

		//-----------------------------------------------------------

		// Options:

		//-----------------------------------------------------------

		//  Target:ptx, ISA:sm_13, Endian:little, Pointer Size:64

		//  -O3 (Optimization level)

		//  -g0 (Debug level)

		//  -m2 (Report advisories)

		//-----------------------------------------------------------

		.const .align 8 .b8 d_gpu_mem[96];

		.entry cppn_init_activation

		{

		.reg .u32 %r<3>;

		.reg .u64 %rd<7>;

		.reg .f32 %f<5>;

		.loc	17	  1226	0

$LBB1_cppn_init_activation:

		.loc	18	  10	  0

		ld.const.u64	%rd1, [d_gpu_mem+48];

		ld.global.u64   %rd2, [%rd1+88];

		cvt.u32.u64	 %r1, %rd2;

		st.global.u32   [%rd1+32], %r1;

		.loc	18	  22	  0

		mov.f32		 %f1, 0f40000000;		// 2

		ld.global.u64   %rd3, [%rd1+88];

		st.global.f32   [%rd3+8], %f1;

		.loc	18	  23	  0

		mov.f32		 %f2, 0f40400000;		// 3

		ld.global.u64   %rd4, [%rd1+88];

		st.global.f32   [%rd4+12], %f2;

		.loc	18	  24	  0

		mov.f32		 %f3, 0f40800000;		// 4

		ld.global.u64   %rd5, [%rd1+88];

		st.global.f32   [%rd5+16], %f3;

		.loc	18	  25	  0

		exit;

$LDWend_cppn_init_activation:

		} // cppn_init_activation

I have made a little progress. I replaced CUDA 3.0 beta with 2.3 and got the same problem. Then I wrote a new kernel that calculated the sum of the 3 values right after the first kernel call, and it returned 9 as the result — meaning 2+3+4 had indeed been written into GPU memory. So the problem is in the transfer between GPU and CPU. I then stepped through a gdb session and this is what I got (when transferring from GPU to CPU):

314				 cucall(cuMemcpyDtoH(&float_arr,device_ptr,sizeof(float)*7));

(gdb) print float_arr

$4 = {0, 0, 0, 0, 0, 0, 0, 4.58490845e-41, -1.86802556e+12, 4.58490845e-41, -1.47304782e+12, 4.58490845e-41, 

  -1.86802556e+12, 4.58490845e-41, 3.22298647e-44, 0, 3.22298647e-44, 0, -1.46771935e+12, 4.58490845e-41}

(gdb) step

__cucall (cu_err=CUDA_SUCCESS, file=0x405982 "gpu_api.c", line=314) at gpu_api.c:39

39		  if( CUDA_SUCCESS != cu_err) {

(gdb) step

44	  }

(gdb) step

add_net (gi=0x7fffdda88e20, cp=0x17f2ff0, gnme_idx={index = 8589934593, f = {idx = 1, slot = 2}}, level=2, 

	active_inlinks=0x7fffdda88dc0, num_active_inlinks=1, net_tables_out=0x6072e0, net_size=0x7fffdda88ddc)

	at gpu_api.c:315

315				 for(i=0;i<7;i++) printf("f[%d]=%f ",i,float_arr[i]);

(gdb) step

316				 printf("\n");

(gdb) print float_arr

$5 = {0, 0,5.87747175e-39, 5.90043063e-39, 5.9233895e-39, 0, 0, 4.58490845e-41, -1.86802556e+12, 4.58490845e-41, 

  -1.47304782e+12, 4.58490845e-41, -1.86802556e+12, 4.58490845e-41, 3.22298647e-44, 0, 3.22298647e-44, 0, 

  -1.46771935e+12, 4.58490845e-41}

(gdb) step

f[0]=0.000000 f[1]=0.000000 f[2]=0.000000 f[3]=0.000000 f[4]=0.000000 f[5]=0.000000 f[6]=0.000000 

318				 cucall(cuParamSetSize(cuf_cppn_debug,0));

(gdb)

Now, before the transfer, float_arr has its first 7 values set to 0 (the transfer size is 7 floats).

After the call we see the values at indexes 2, 3 and 4 set to some strange numbers. I suppose those were my 2, 3 and 4.

before :

(gdb) print float_arr

$4 = {0, 0, 0, 0, 0, 0, 0, 4.58490845e-41, -1.86802556e+12, 4.58490845e-41, -1.47304782e+12, 4.58490845e-41,

-1.86802556e+12, 4.58490845e-41, 3.22298647e-44, 0, 3.22298647e-44, 0, -1.46771935e+12, 4.58490845e-41}

after:

(gdb) print float_arr

$5 = {0, 0, 5.87747175e-39, 5.90043063e-39, 5.9233895e-39, 0, 0, 4.58490845e-41, -1.86802556e+12, 4.58490845e-41,

-1.47304782e+12, 4.58490845e-41, -1.86802556e+12, 4.58490845e-41, 3.22298647e-44, 0, 3.22298647e-44, 0,

-1.46771935e+12, 4.58490845e-41}

The temporary variables I use are defined locally with fixed sizes:

net_pack_t np_tmp;

float float_arr[20];

The other values inside float_arr (size 20) remain the same; only 3 values change, and those indexes are exactly the ones I change in the kernel on the GPU. It means "something" happens during the transfer. Maybe the floating-point numbers are converted to another format?

I have a problem with my code and it sounds similar to yours. I don’t really know what happens, but when I execute in EMULATION mode everything is fine which means I give it some starting values, and it returns some calculations.

When I execute in DEBUG mode (using the GPU) the result and start values are the same.

Does this sound similar to your problem to you?

If yes, have you made any progress?

My environment is Windows7, Cuda 3.0, GeForce 8800GTS.