gld counter - visual profiler question

Hello again,

Could anyone tell me why, in an example where I do not read anything from global memory (I only write to it), the gld counter is still greater than zero?

Shouldn't this counter also be zero? If not, at what point does the kernel read from global memory in this case?

By the way, I am using a GeForce 8 series (GF8) GPU.

gld_uncoalesced = 32

gld_coalesced = 0

gst_uncoalesced = 0

gst_coalesced = 128

local_load = 0

local_store = 0

kernel file:

//test

// Device-global pointer to the working buffer. cudaInit stores the buffer
// address here, and cudaRun reads it back instead of receiving the buffer as a
// kernel argument.
__device__ float *devInData;

/**
 * Stores the given buffer address in the device-global pointer devInData so
 * that later kernels can reach the buffer without taking it as an argument.
 *
 * @param cdevInData  buffer address to publish in devInData
 */
__global__ void cudaInit(float* cdevInData)
{
	devInData = cdevInData;
}

/**
 * Zeroes `length` elements per thread in the buffer that devInData points to.
 *
 * Block b starts at element b * blockDim.x * length. In every pass the threads
 * of a block write consecutive elements (offset threadIdx.x), and each pass
 * moves blockDim.x elements further on. Nothing is read from the buffer
 * itself (the accumulating read that used to sit in the loop is disabled);
 * only the pointer value stored in devInData is fetched.
 *
 * @param length  number of elements each thread writes
 */
__global__ void cudaRun(int length) {
	// Distance, in elements, between two consecutive writes of the same thread.
	const unsigned int passStride = blockDim.x;

	// First element owned by this thread: block base plus the lane offset.
	unsigned int slot = threadIdx.x + blockDim.x * length * blockIdx.x;

	#ifdef __DEVICE_EMULATION__
	printf("thread:%d \t",blockIdx.x*blockDim.x+threadIdx.x);
	#endif

	for (int pass = 0; pass < length; ++pass, slot += passStride) {
		#ifdef __DEVICE_EMULATION__
		printf("%d \t",slot);
		#endif

		devInData[slot] = 0;
	}

	#ifdef __DEVICE_EMULATION__
	printf("\n");
	#endif
}

main file:

// includes, system

#include <stdlib.h>

#include <stdio.h>

// includes, project

#include <cutil_inline.h>

// includes, kernels

#include <test_kernel.cu>

/**
 * Host driver: fills a host array with 0, 1, 2, ... (printing it row by row),
 * copies it to a device buffer, publishes that buffer to the kernels through
 * cudaInit, and then runs cudaRun over it.
 *
 * @param argc  unused
 * @param argv  unused
 * @return 0 on success, EXIT_FAILURE if the host allocation fails
 */
int main( int argc, char** argv) {
	const int blocks = 2;
	const int threadsPerBlock = 16;
	const int threads = blocks * threadsPerBlock;	// total threads in the grid
	const int length = 16;	// each thread has 16 items to process
	const size_t bytes = sizeof(float) * length * threads;

	float *hData = (float*)malloc(bytes);
	if (hData == NULL) {
		fprintf(stderr, "failed to allocate %lu bytes of host memory\n", (unsigned long)bytes);
		return EXIT_FAILURE;
	}

	// Fill the host array and print it, one row of `length` values per thread.
	for(int i=0; i<length*threads; i++) {
		if(i%length == 0)
			printf("\n\t\t");
		hData[i] = (float)i;
		printf("%.1f \t",hData[i]);
	}
	printf("\n\n");

	// Hold the allocation in an ordinary host variable. The __device__ symbol
	// devInData must not be assigned or have its address taken in host code;
	// it is set on the device by the cudaInit kernel below.
	float *dData = NULL;
	cutilSafeCall( cudaMalloc((void**) &dData, bytes) );
	cutilSafeCall( cudaMemcpy( dData, hData, bytes, cudaMemcpyHostToDevice) );

	// Publish the buffer address to the device-side global pointer.
	cudaInit<<< 1, 1, 0 >>>(dData);
	cutilSafeCall( cudaGetLastError() );		// launch-configuration errors
	cutilSafeCall( cudaThreadSynchronize() );	// errors raised while running

	cudaRun<<< blocks, threadsPerBlock, 0 >>>(length);
	cutilSafeCall( cudaGetLastError() );
	cutilSafeCall( cudaThreadSynchronize() );

	// Release device and host memory before exiting.
	cutilSafeCall( cudaFree(dData) );
	free(hData);

	return 0;
}

Thanks

I just compiled with ptx option to nvcc and got this:

$LBB1__Z7cudaRunv:

	.loc	15	21	0

	cvt.u32.u16 	%r1, %ntid.x;		// 

	cvt.u32.u16 	%r2, %ctaid.x;   	// 

	mul.lo.u32 	%r3, %r2, %r1;		// 

	mul.lo.u32 	%r4, %r3, 16;	 	// 

	cvt.u32.u16 	%r5, %tid.x;	 	// 

	ld.global.u64 	%rd1, [devInData];	// id:15 devInData+0x0

	mov.f32 	%f1, 0f00000000;	 	// 0

	add.u32 	%r6, %r4, %r5;	   	// 

	cvt.u64.u32 	%rd2, %r6;	   	// 

	mul.lo.u64 	%rd3, %rd2, 4;		// 

	add.u64 	%rd4, %rd1, %rd3;		// 

	st.global.f32 	[%rd4+0], %f1; 	// id:18

	mov.f32 	%f2, 0f00000000;	 	// 0

I assume this is the global load: it loads the array or something similar (it is the only ld.global in the PTX file).

So, as I understand it, each thread loads this devInData array or pointer, which adds 1 to gld_uncoalesced (32 threads = 32). But why is it counted as uncoalesced, and how can I fix it? :)