I have seen those SDK examples, but I am still a little confused about one thing: where exactly do I have to initialize the timers in the program below? I want to measure how much time the kernel takes to process an A[1024][1024] array on the GPU. Please help.
// Kernel: one thread per element of a 1024x1024 row-major float array.
// Expected launch configuration: grid dim3(1024/256, 1024, 1), block dim3(256, 1, 1),
// so x spans a full row (0..1023, coalesced across the warp) and blockIdx.y selects the row.
// Note: the original post had "global static" — the qualifier must be __global__.
static __global__ void cudaFunction(float *A)
{
    int x = threadIdx.x + blockIdx.x * blockDim.x; // column index within the row
    int y = blockIdx.y;                            // row index
    // Guard against launch configs that overshoot the array bounds.
    if (x < 1024 && y < 1024)
        A[x + y * 1024] = 2.0f * A[x + y * 1024];  // example op — replace with your real computation
}
// Host driver: allocates a 1024x1024 float array on host and device, runs the
// kernel once, and measures the kernel's GPU execution time with CUDA events.
// (CUDA events are the right tool here: they time work on the GPU itself,
// unlike host timers, which would need a cudaDeviceSynchronize to be meaningful.)
int main(int argc, char **argv)
{
    const size_t N = 1024;                     // array is N x N
    const size_t bytes = N * N * sizeof(float);

    // Allocate the array in host memory; access elements as A_host[x + y*N].
    float *A_host = (float *)malloc(bytes);
    if (A_host == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }
    // ...do preprocessing or whatever with A_host here...

    // Allocate the array in device (GPU) memory.
    float *A_device = NULL;
    cudaError_t err = cudaMalloc((void **)&A_device, bytes);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed: %s\n", cudaGetErrorString(err));
        free(A_host);
        return 1;
    }

    // Copy the host array to the device so the kernel can work on it.
    cudaMemcpy(A_device, A_host, bytes, cudaMemcpyHostToDevice);

    // --- Timing setup: create events BEFORE the kernel launch. ---
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record "start", launch the kernel, record "stop" — all on stream 0,
    // so elapsed time covers exactly the kernel execution.
    cudaEventRecord(start, 0);
    cudaFunction<<<dim3(N / 256, N, 1), dim3(256, 1, 1)>>>(A_device);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);  // wait until the kernel (and stop event) have completed

    float elapsed_ms = 0.0f;
    cudaEventElapsedTime(&elapsed_ms, start, stop);
    printf("kernel time for %zux%zu array: %f ms\n", N, N, elapsed_ms);

    // Catch launch-configuration or execution errors from the kernel.
    err = cudaGetLastError();
    if (err != cudaSuccess)
        fprintf(stderr, "kernel error: %s\n", cudaGetErrorString(err));

    // Copy the results back from the GPU to host memory.
    cudaMemcpy(A_host, A_device, bytes, cudaMemcpyDeviceToHost);
    // ...do whatever else you want to do with A_host...

    // Cleanup: events, device memory, host memory.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cudaFree(A_device);
    free(A_host);
    return 0;
}