I put all the variable initialization below. One question I have is the following: is it OK to hold pointers to device memory in the C (host) portion of the code? For instance, I want to keep all the mesh data in global memory and not have to reallocate it every time the function is called.
[codebox]
// Host-side driver for kernelCol: uploads two 3-float vectors (t1, t2), two
// mesh arrays (m1, m2) and two int result arrays (h_r1, h_r2), launches the
// kernel, then reads the result arrays back and prints them.
//
// NOTE: device pointers such as d_m1 are plain host-side values; an allocation
// stays valid until cudaFree is called on it (or the context is destroyed), so
// the pointers may be kept (e.g. in static/global variables) and reused across
// calls instead of being re-allocated and freed on every invocation as here.

// Test configuration for a simple cube: a single block of 12 x 12 threads
// (144 threads in total, not 12).
// NOTE(review): presumably one thread per (m1 element, m2 element) pair --
// confirm against kernelCol's indexing.
int gx = 1;//(int)(cd.sizeM1/12);
int gy = 1;//(int)(cd.sizeM2/12);
// setup execution parameters
dim3 grid( gx, gy, 1);
dim3 threads( 12, 12, 1);

// t1 / t2: 3 floats each.
float *d_t1,*d_t2;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_t1, sizeof(float)*3 ));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_t2, sizeof(float)*3 ));
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_t1, t1, sizeof(float)*3 ,cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_t2, t2, sizeof(float)*3 ,cudaMemcpyHostToDevice) );

// m1 / m2: 9 floats per element, s1 / s2 elements.
// The allocation size must match the size copied below (sizeof(float)*9*sN).
// NOTE(review): 9 floats presumably = 3 vertices x 3 coordinates per triangle -- confirm.
float *d_m1,*d_m2;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_m1, sizeof(float)*9*s1 ));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_m2, sizeof(float)*9*s2 ));
// copy host memory to device
CUDA_SAFE_CALL(cudaMemcpy(d_m1, m1, sizeof(float)*9*s1 ,cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_m2, m2, sizeof(float)*9*s2 ,cudaMemcpyHostToDevice) );

// r1 / r2: one int per element; uploaded before the launch and read back after it.
int *d_r1,*d_r2;
CUDA_SAFE_CALL(cudaMalloc((void**) &d_r1, sizeof(int)*s1));
CUDA_SAFE_CALL(cudaMalloc((void**) &d_r2, sizeof(int)*s2 ));
CUDA_SAFE_CALL(cudaMemcpy(d_r1, h_r1, sizeof(int)*s1 ,cudaMemcpyHostToDevice) );
CUDA_SAFE_CALL(cudaMemcpy(d_r2, h_r2, sizeof(int)*s2 ,cudaMemcpyHostToDevice) );

kernelCol<<< grid, threads>>>(d_t1, d_t2, d_m1, d_m2, d_r1, d_r2);
// check if kernel execution generated and error
CUT_CHECK_ERROR("Kernel execution failed");
// Kernel launches are asynchronous: wait for the kernel to finish so that the
// timer measures execution rather than just the launch, and so that errors
// raised during execution surface here. (cudaDeviceSynchronize is the
// replacement for this call on newer toolkits.)
CUDA_SAFE_CALL(cudaThreadSynchronize());
CUT_SAFE_CALL( cutStopTimer( timer));
printf( "time: %f (ms)\n", cutGetTimerValue( timer));
CUT_SAFE_CALL( cutDeleteTimer( timer));

// Read the result arrays back; checked like every other runtime call.
CUDA_SAFE_CALL(cudaMemcpy( h_r1 , d_r1 , sizeof(int)*s1 , cudaMemcpyDeviceToHost) );
CUDA_SAFE_CALL(cudaMemcpy( h_r2 , d_r2 , sizeof(int)*s2 , cudaMemcpyDeviceToHost) );
printf("m1: ");
for(int i = 0;i<s1;i++)
{
printf("%d ",h_r1[i]);
}
printf("m2: ");
for(int i = 0;i<s2;i++)
{
printf("%d ",h_r2[i]);
}

// Release all device buffers allocated above.
CUDA_SAFE_CALL(cudaFree(d_t1));
CUDA_SAFE_CALL(cudaFree(d_t2));
CUDA_SAFE_CALL(cudaFree(d_r1));
CUDA_SAFE_CALL(cudaFree(d_r2));
CUDA_SAFE_CALL(cudaFree(d_m1));
CUDA_SAFE_CALL(cudaFree(d_m2));[/codebox]