this code:
__global void test (float *gpu)
{
for(int i=0;i<1000;i++)
{
gpu[i]=i; //!!! this cost a lot of time
}
main.cu
float * gpu;
gpu=cudaMalloc((void **)&gpu,4*1000);
dim grid grids(1)
dim thead threads(100)
test<<<girds,threads>>>gpu;
this code cost a lot of time more than 100ms
what’s wrong ?
Is write global memory slower than CPU memory
I use this code fo Genetic Algorithm,so code must be wirtten like this.