// NOTE(review): the header names after each #include were lost when this file
// was extracted (only bare "#include" tokens remained). The names below are
// reconstructed from the identifiers this file uses and must be confirmed
// against the original project before building.
#include <stdio.h>   // printf
#include <stdlib.h>  // malloc, free
#include <string.h>  // presumably the third system header; nothing here needs it -- TODO confirm

// includes, project
#include <cutil.h>   // presumably provides CUT_DEVICE_INIT, CUDA_SAFE_CALL, CUT_CHECK_ERROR -- TODO confirm

// includes, kernels
#include <compute_fit_kernel.cu>  // TODO confirm file name; must define the compute_fit kernel

/**
 * Copies numBodies * 4 floats from a device buffer into a host buffer.
 *
 * @param host       destination buffer on the host
 * @param device     source buffer on the device
 * @param pbo        unused; retained so existing callers keep compiling
 * @param numBodies  number of 4-float records to copy
 */
void copyArrayFromDevice(float* host, const float* device, unsigned int pbo, int numBodies)
{
    (void)pbo;
    // Widen before multiplying so the byte count is not computed in int.
    const size_t numBytes = (size_t)numBodies * 4 * sizeof(float);
    CUDA_SAFE_CALL(cudaMemcpy(host, device, numBytes, cudaMemcpyDeviceToHost));
}

/**
 * Copies numBodies * 4 floats from a host buffer into a device buffer.
 *
 * @param device     destination buffer on the device
 * @param host       source buffer on the host
 * @param numBodies  number of 4-float records to copy
 */
void copyArrayToDevice(float* device, const float* host, int numBodies)
{
    // Widen before multiplying so the byte count is not computed in int.
    const size_t numBytes = (size_t)numBodies * 4 * sizeof(float);
    CUDA_SAFE_CALL(cudaMemcpy(device, host, numBytes, cudaMemcpyHostToDevice));
}

/**
 * Blocks the host until previously issued device work has finished.
 * The call is routed through CUDA_SAFE_CALL, like every other runtime call in
 * this file, so its return code is no longer silently discarded.
 */
void threadSync()
{
    // NOTE(review): cudaThreadSynchronize is deprecated in newer toolkits in
    // favour of cudaDeviceSynchronize; kept here to match the toolkit
    // generation this file appears to target.
    CUDA_SAFE_CALL(cudaThreadSynchronize());
}

/**
 * Fills data[0 .. size) with the constant 0.001.
 * Despite its name, this function neither allocates memory nor produces
 * random values.
 *
 * @param data  array of at least `size` doubles to overwrite
 * @param size  number of elements to write
 */
void randomInit(double* data, int size)
{
    for (int i = 0; i < size; ++i)
        data[i] = 0.001;
}

//////////////////////////////////////////////////////////////////////////////
// Program main
//////////////////////////////////////////////////////////////////////////////
/**
 * Sets up the host and device buffers, launches compute_fit over the
 * parameter ranges defined below and copies the 18-element best_fit vector
 * back to the host.
 *
 * @return 0 on success, 1 if the host allocation fails.
 */
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    const unsigned int size_best_fit = 18;
    const unsigned int mem_size_best_fit = sizeof(double) * size_best_fit;

    // allocate host memory for vector best_fit and validate it BEFORE it is
    // written to or used as a copy source
    double* best_fit = (double*)malloc(mem_size_best_fit);
    if (best_fit == NULL)
    {
        printf("ERROR");
        return 1;
    }

    // initialize host memory
    randomInit(best_fit, size_best_fit);

    // allocate device memory
    double* d_best_fit;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_best_fit, mem_size_best_fit));

    // copy host memory to device
    CUDA_SAFE_CALL(cudaMemcpy(d_best_fit, best_fit, mem_size_best_fit,
                              cudaMemcpyHostToDevice));

    // Setup fit parameters: for each quantity, a count (n_*) and two bounds
    // (a_*, b_*) that are forwarded unchanged to the kernel.
    int n_Ri = 10;
    double a_Ri = 1e-4;
    double b_Ri = 1e-6;

    int n_Ra = 10;
    double a_Ra = 1e-6;
    double b_Ra = 1e-8;

    int n_u = 10;
    double a_u = 0.1;
    double b_u = 0.3;

    int n_f = 10;
    double a_f = 0.01;
    double b_f = 1;

    int n_F = 10;
    double a_F = 0.01;
    double b_F = 1;

    double amax = 24;

    double OBS_actual[18] = {
        0, 0, 0,
        0.3774698797909421, 0.9295964152018034, 2.2888179669638697,
        4.9572477946528934, 10.060737092639414, 19.212700771497765,
        32.64962221028801, 50.602760882723594, 72.15450919845851,
        91.01944190848369, 111.51973488772018, 127.85040140587097,
        141.27914970726457, 144.9363176965305, 114.84572391087971};

    // OBS_actual lives on the host stack; a kernel must not be handed that
    // address. Stage the data in a device buffer and pass that instead.
    double* d_OBS_actual;
    CUDA_SAFE_CALL(cudaMalloc((void**)&d_OBS_actual, sizeof(OBS_actual)));
    CUDA_SAFE_CALL(cudaMemcpy(d_OBS_actual, OBS_actual, sizeof(OBS_actual),
                              cudaMemcpyHostToDevice));

    // Compute execution configuration: one n_f x n_F block per
    // (n_Ri * n_u, n_Ra) grid cell.
    // NOTE(review): n_f * n_F is not validated against the device's
    // threads-per-block limit; a disabled check against 512 used to sit here.
    dim3 dimBlock(n_f, n_F);

    int nBlocks_x = n_Ri * n_u;
    int nBlocks_y = n_Ra;
    dim3 dimGrid(nBlocks_x, nBlocks_y);

    // execute the kernel
    compute_fit<<<dimGrid, dimBlock>>>(n_Ri, a_Ri, b_Ri,
                                       n_Ra, a_Ra, b_Ra,
                                       n_u, a_u, b_u,
                                       n_f, a_f, b_f,
                                       n_F, a_F, b_F,
                                       d_OBS_actual, amax, d_best_fit);

    // check if kernel execution generated an error
    CUT_CHECK_ERROR("Kernel execution failed");

    // copy result from device to host
    // NOTE(review): the copied-back values are not printed or otherwise used
    // before best_fit is freed -- confirm whether output is expected here.
    CUDA_SAFE_CALL(cudaMemcpy(best_fit, d_best_fit, mem_size_best_fit,
                              cudaMemcpyDeviceToHost));

    // clean up memory
    free(best_fit);
    CUDA_SAFE_CALL(cudaFree(d_best_fit));
    CUDA_SAFE_CALL(cudaFree(d_OBS_actual));

    return 0;
}