Hi,
I am trying to understand the Cuda coalesced memory acces restrictions. I wrote a very simple kernel that i thought would be a good starting point for non-coalesced reads and writes. But the profiler shows many. I’m reading and writing 32-bit floats in linear order…
Any suggestions?
Kind regards,
Daniel Dekkers
[codebox]#include <stdio.h>
#include <cutil.h>
global void CudaSqrtKernel
(const float *p_CudaArrayIn,
float *p_CudaArrayOut,
const int p_Size)
{
int idx = (blockIdx.x * blockDim.x + threadIdx.x);
if (idx<p_Size)
{
float l_Temp = p_CudaArrayIn[idx];
l_Temp = sqrt(l_Temp)+ 3.0f;
p_CudaArrayOut[idx] = l_Temp;
}
}
// ============================================================
================
extern “C” void
CudaSqrt
(const float *p_ArrayIn,
float *p_ArrayOut,
const int p_Size)
{
float *l_CudaArrayIn;
float *l_CudaArrayOut;
cudaMalloc((void**) &l_CudaArrayIn, p_Size*sizeof(float));
cudaMalloc((void**) &l_CudaArrayOut, p_Size*sizeof(float));
cudaMemcpy(l_CudaArrayIn, p_ArrayIn, p_Size*sizeof(float), cudaMemcpyHostToDevice);
// Do calculation on device...
int block_size = 384;
int n_blocks = p_Size/block_size + (p_Size%block_size == 0 ? 0:1);
CudaSqrtKernel <<< n_blocks, block_size >>> (l_CudaArrayIn, l_CudaArrayOut, p_Size);
cudaMemcpy(p_ArrayOut, l_CudaArrayOut, p_Size*sizeof(float), cudaMemcpyDeviceToHost);
cudaFree(l_CudaArrayIn);
cudaFree(l_CudaArrayOut);
}
[/codebox]