Hello all,
I did a basic test: 1 thread block with 128 threads, and a shared memory buffer of 128 integer elements. The kernel is as follows:
[i]#include <stdio.h>
#include “cuda_runtime_api.h”
global void smem_offset_test(int * out)
{
shared float var_arr[128];
float sum = 0;
if(threadIdx.x == 0)
{
for(int i = 0; i < 128; i++)
{
var_arr[i] = i;
}
}
__syncthreads();
for(int i = 0; i < 128; i++)
{
sum += var_arr[i];
}
out[threadIdx.x] = sum;
return;
}[/i]
NVVP shows that the shared-memory load generated by the loop "sum += var_arr[i];" is 2-way bank-conflicted. I don't understand the reason.
My understanding is: 128 threads make up 4 warps, and within each warp all threads read the same element on each iteration, which should be a conflict-free broadcast. I checked the PTX file, but I didn't find any clue that I could understand.
The whole source code is as follows. Any feedback is appreciated.
Susan
///////////////////////////////////////////////////
///test.cu
#include <stdio.h>
#include "cuda_runtime_api.h"
// Kernel: thread 0 fills a 128-element shared array with 0..127; after the
// barrier, every thread sums all 128 elements and writes the (identical)
// result 0+1+...+127 = 8128 to out[threadIdx.x].
// Expects a launch configuration of exactly 1 block x 128 threads.
__global__ void smem_offset_test(int *out)
{
    __shared__ int var_arr[128];
    int sum = 0;

    // Single-writer initialization; the other 127 threads skip straight
    // to the barrier below.
    if (threadIdx.x == 0)
    {
        for (int i = 0; i < 128; i++)
        {
            var_arr[i] = i;
        }
    }
    // Barrier sits outside the divergent branch, so every thread reaches it.
    __syncthreads();

    // Each iteration, all threads in a warp read the SAME element — a
    // broadcast, which by itself should not be a bank conflict.
    for (int i = 0; i < 128; i++)
    {
        sum += var_arr[i];
    }
    out[threadIdx.x] = sum;
}
// Host driver: launches smem_offset_test on 1 block of 128 threads and copies
// the per-thread sums back to the host. On success every element of h_data
// equals 8128. Returns 0 on success, 1 on any allocation or CUDA error.
int main()
{
    const int n = 128;
    int *d_data = NULL;
    int *h_data = (int *)malloc(sizeof(int) * n);
    if (h_data == NULL)
    {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }

    cudaError_t err = cudaMalloc((void **)&d_data, sizeof(int) * n);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc: %s\n", cudaGetErrorString(err));
        free(h_data);
        return 1;
    }

    smem_offset_test<<<1, n>>>(d_data);
    // Kernel launches do not return errors directly: cudaGetLastError catches
    // bad launch configuration; the synchronize surfaces async execution faults.
    err = cudaGetLastError();
    if (err == cudaSuccess)
    {
        // cudaThreadSynchronize is deprecated; cudaDeviceSynchronize is the
        // current API with the same semantics.
        err = cudaDeviceSynchronize();
    }
    if (err == cudaSuccess)
    {
        err = cudaMemcpy(h_data, d_data, sizeof(int) * n,
                         cudaMemcpyDeviceToHost);
    }
    if (err != cudaSuccess)
    {
        fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    }

    cudaFree(d_data);
    free(h_data);
    return (err == cudaSuccess) ? 0 : 1;
}