/*
 * Project which finds a working configuration of threads on the device.
 */

// includes, system
// NOTE(review): the header names were missing from the garbled source text;
// the four system headers and the project header below follow the CUDA SDK
// template layout this file appears to derive from -- confirm before merging.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

// includes, project
#include <cutil.h>

#define __64K 65536
#define __32K 32768
#define __512 512

////////////////////////////////////////////////////////////////////////////////
//! Probe kernel used to check that a launch configuration really executed.
//!
//! Launch layout: any 2D grid of 2D blocks (only the x/y components of
//! threadIdx, blockIdx, blockDim and gridDim are inspected). No shared memory
//! is used.
//!
//! @param data  device pointer to at least two ints.
//!              data[0] is set to 30 by every thread whose threadIdx.x is 0.
//!              data[1] is set to 27 by the last thread of the last block.
//!
//! The host zeroes both slots before the launch and treats the configuration
//! as working only when both marker values come back.
////////////////////////////////////////////////////////////////////////////////
__global__ static void
testKernel(int* data)
{
    const bool firstInRow = (threadIdx.x == 0);

    const bool lastThreadOfBlock = (threadIdx.x == blockDim.x - 1) &&
                                   (threadIdx.y == blockDim.y - 1);
    const bool lastBlockOfGrid   = (blockIdx.x == gridDim.x - 1) &&
                                   (blockIdx.y == gridDim.y - 1);

    if (firstInRow) {
        data[0] = 30;   // just a sample number
    }

    // Deliberately NOT an "else if": when blockDim.x == 1 the last thread of
    // the block is also the thread with threadIdx.x == 0, and it must still
    // write the second marker, otherwise every Tx=1 launch looks invalid.
    if (lastThreadOfBlock && lastBlockOfGrid) {
        data[1] = 27;
    }
}

////////////////////////////////////////////////////////////////////////////////
// declaration, forward
void runTest(int argc, char** argv);

////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int
main(int argc, char** argv)
{
    runTest(argc, argv);

    CUT_EXIT(argc, argv);
}
////////////////////////////////////////////////////////////////////////////////
//! Launch testKernel once with the given configuration and tell whether it
//! actually ran.
//!
//! @param grid     grid dimensions for the launch
//! @param threads  block dimensions for the launch
//! @param d_idata  device buffer holding (at least) two ints; it is zeroed
//!                 before the launch and read back afterwards
//! @return true when the launch was accepted and both marker values written
//!         by testKernel (30 and 27) were read back, false otherwise
////////////////////////////////////////////////////////////////////////////////
static bool
launchConfigWorks(const dim3& grid, const dim3& threads, int* d_idata)
{
    int h_idata[2] = {0, 0};

    CUDA_SAFE_CALL( cudaMemset( d_idata, 0, 2*sizeof(int)) );

    testKernel<<< grid, threads >>>(d_idata);

    // A rejected launch configuration is only reported through
    // cudaGetLastError(), which also clears the pending error so that it
    // cannot be mistaken for a failure of a later call.
    if (cudaGetLastError() != cudaSuccess) {
        return false;   // kernel never ran; nothing to copy back
    }

    // cudaMemcpy blocks until the kernel has finished.
    CUDA_SAFE_CALL( cudaMemcpy( h_idata, d_idata, sizeof(int) * 2,
                                cudaMemcpyDeviceToHost) );

    return h_idata[0] == 30 && h_idata[1] == 27;
}

////////////////////////////////////////////////////////////////////////////////
//! Run a simple test for CUDA
//!
//! Sweeps power-of-two grid and block dimensions (1..512 in each of grid.y,
//! grid.x, block.y, block.x), launches testKernel for every combination and
//! writes one "Valid" / "Not Valid" line per configuration to
//! "cuda_configuration.txt". Afterwards a custom series (512x1 threads,
//! grid.y = 176, grid.x = 128..8192) is tested and appended to the same file.
//!
//! @param argc  unused
//! @param argv  unused
////////////////////////////////////////////////////////////////////////////////
void
runTest(int argc, char** argv)
{
    CUT_DEVICE_INIT();

    // Open the report first so that the early exit below does not leave a
    // device allocation behind.
    FILE* outFile = fopen ( "cuda_configuration.txt" , "w" );
    if (outFile==NULL) {fputs ("File error",stderr); exit (1);}

    // allocate device memory
    int* d_idata;
    CUDA_SAFE_CALL( cudaMalloc( (void**) &d_idata, 2*sizeof(int)));
    CUDA_SAFE_CALL( cudaMemset( d_idata, 0, 2*sizeof(int)) );

    // Upper bounds of the sweep; signed so they compare cleanly with the
    // int loop counters below.
    const int MAX_BLOCKS_X  = 512;
    const int MAX_BLOCKS_Y  = 512;
    const int MAX_THREADS_X = 512;
    const int MAX_THREADS_Y = 512;

    // setup execution parameters
    // grid:    grid.x * grid.y is the number of blocks being launched
    // threads: threads.x * threads.y * threads.z is the number of threads
    //          per block (threads.z is left at its default)
    dim3 grid;
    dim3 threads;

    for (int BY = 1; BY <= MAX_BLOCKS_Y; BY *= 2 ) {
        grid.y = BY;
        for (int BX = 1; BX <= MAX_BLOCKS_X; BX *= 2 ) {
            grid.x = BX;
            for (int TY = 1; TY <= MAX_THREADS_Y; TY *= 2 ) {
                threads.y = TY;
                for (int TX = 1; TX <= MAX_THREADS_X; TX *= 2 ) {
                    threads.x = TX;

                    double totThreads = TX * TY;
                    double totBlocks  = BX * BY;

                    if ( launchConfigWorks(grid, threads, d_idata) ) {
                        fprintf (outFile, "Config: By=%d, Bx=%d, Ty=%d, Tx=%d : \tBlocks=%.0f Threads=%.0f \tValid\n", BY, BX, TY, TX, totBlocks, totThreads);
                    } else {
                        fprintf (outFile, "Config: By=%d, Bx=%d, Ty=%d, Tx=%d : \tNot Valid\n", BY, BX, TY, TX);
                    }
                }
            }
        }
    }

    // custom configuration test for our algo
    fprintf (outFile, "------------------Custom configuration test------------------\n");
    threads.y = 1;
    threads.x = 512;
    grid.y    = 176;   // our packet sample size normally

    for (int BX = 128; BX <= 8192; BX *= 2 ) {
        grid.x = BX;
        double totBlocks = BX * grid.y;

        // grid.y, threads.y and threads.x are unsigned int members of dim3,
        // hence %u rather than %d.
        if ( launchConfigWorks(grid, threads, d_idata) ) {
            fprintf (outFile, "Config: By=%u, Bx=%d, Ty=%u, Tx=%u : \tBlocks=%.0f Threads=512 \tValid\n", grid.y, BX, threads.y, threads.x, totBlocks);
        } else {
            fprintf (outFile, "Config: By=%u, Bx=%d, Ty=%u, Tx=%u : \tNot Valid\n", grid.y, BX, threads.y, threads.x);
        }
    }

    fclose (outFile);

    // check if kernel execution generated an error; launch rejections were
    // already consumed per configuration, so anything left here is unexpected
    CUT_CHECK_ERROR("Kernel execution failed");

    // cleanup memory
    CUDA_SAFE_CALL(cudaFree(d_idata));
}