Hello again,
Could anyone tell me why, in the example below, where I do not read anything from global memory (I only write to it), the gld counters are greater than zero?
Shouldn't this counter also be zero? If not, when does the kernel read from global memory in this case?
Btw: I am using GF8.
gld_uncoalesced = 32
gld_coalesced = 0
gst_uncoalesced = 0
gst_coalesced = 128
local_load = 0
local_store = 0
kernel file:
//test
// Device-side handle to the working buffer. cudaInit() stores the pointer
// here and cudaRun() reads it back, so cudaRun does not take the buffer as a
// kernel argument.
__device__ float *devInData;

// Records the buffer address handed over by the host in devInData.
// Every launched thread performs the same store; the host launches this
// kernel with a single thread (<<<1, 1>>>).
__global__ void cudaInit(float* cdevInData)
{
    devInData = cdevInData;
}
// Zero-fills the buffer that devInData points to.
//
// Layout implied by the index expression: each block owns
// blockDim.x * length consecutive floats, and on iteration i thread t of a
// block writes element
//     blockIdx.x * blockDim.x * length + i * blockDim.x + t
// so the threads of a block write neighbouring elements on every iteration.
//
// length: number of elements written by each thread. The buffer must hold at
//         least gridDim.x * blockDim.x * length floats; nothing is
//         bounds-checked here.
//
// Profiling note: the kernel never reads the buffer contents, but devInData
// is itself a __device__ variable, so obtaining the pointer value is a load
// from device memory. The pointer is copied into a local once, up front, so
// that this load is issued once per thread rather than (potentially) on every
// loop iteration.
__global__ void cudaRun(int length)
{
    float *const out = devInData;

    // Loop-invariant parts of the element index, hoisted out of the loop.
    const unsigned int stride = blockDim.x;
    const unsigned int base = threadIdx.x + blockDim.x * length * blockIdx.x;

#ifdef __DEVICE_EMULATION__
    // The built-in index variables are unsigned; cast to match "%d".
    printf("thread:%d \t", (int)(blockIdx.x * blockDim.x + threadIdx.x));
#endif

    for (int i = 0; i < length; i++) {
        const unsigned int idx = base + stride * i;
#ifdef __DEVICE_EMULATION__
        printf("%d \t", (int)idx);
#endif
        out[idx] = 0.0f;
    }

#ifdef __DEVICE_EMULATION__
    printf("\n");
#endif
}
main file:
// includes, system
#include <stdlib.h>
#include <stdio.h>
// includes, project
#include <cutil_inline.h>
// includes, kernels
#include <test_kernel.cu>
// Host driver: fills a host array with 0, 1, 2, ..., prints it, uploads it to
// the device, publishes the device pointer through cudaInit, and then runs
// cudaRun over the whole buffer.
int main( int argc, char** argv) {
    const int blocks = 2;
    const int threadsPerBlock = 16;
    const int threads = blocks * threadsPerBlock;  // total threads in the grid
    const int length = 16;                         // each thread has 16 items to process
    const int count = length * threads;
    const size_t bytes = sizeof(float) * count;

    float *hData = (float*)malloc(bytes);
    if (hData == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < count; i++) {
        if (i % length == 0)
            printf("\n\t\t");
        hData[i] = (float)i;
        printf("%.1f \t", hData[i]);
    }
    printf("\n\n");

    // Allocate through an ordinary host-side pointer. The __device__ symbol
    // devInData belongs to device code and is not touched directly from the
    // host; cudaInit stores the address into it on the device instead.
    float *dData = NULL;
    cutilSafeCall( cudaMalloc((void**) &dData, bytes) );
    cutilSafeCall( cudaMemcpy( dData, hData, bytes, cudaMemcpyHostToDevice) );

    cudaInit<<< 1, 1, 0 >>>(dData);
    cutilSafeCall( cudaGetLastError() );        // launch-configuration errors
    cutilSafeCall( cudaThreadSynchronize() );   // errors raised while the kernel ran

    cudaRun<<< blocks, threadsPerBlock, 0 >>>(length);
    cutilSafeCall( cudaGetLastError() );
    cutilSafeCall( cudaThreadSynchronize() );

    cutilSafeCall( cudaFree(dData) );
    free(hData);
    return 0;
}
Thanks