// Hang/progress test harness: launches a deliberately infinite kernel on one
// stream while the host polls a device-side counter over a second stream,
// printing the counter value every 2 seconds. Useful for observing whether
// async memcpys make progress while a kernel never terminates.
//
// Usage: prog <device> <grid_x> <grid_y> <threads_per_block> <kernel(1|2)>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

// Abort with a diagnostic on any CUDA runtime error.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(1);                                                          \
        }                                                                     \
    } while (0)

// Kernel 1: every thread bumps the shared counter once, then spins forever.
// The __syncthreads() in the infinite loop is uniform across the block (all
// threads loop forever), so it is not a divergent-barrier bug — the hang is
// the point of this kernel.
void __global__ kernel1(int* counter, int* tmp) {
    atomicAdd(counter, 1);
    while (true) {
        __syncthreads();
    }
}

// Kernel 2: every thread bumps the counter once at start, then loops forever
// incrementing a private value and storing it to its own tmp[] slot (keeps
// the memory system busy). Each thread bumps the counter exactly one more
// time when its private value passes 100000, so the host-observed counter
// should eventually reach 2x the total thread count.
void __global__ kernel2(int* counter, int* tmp) {
    // Flat global thread id over a (grid_x, grid_y) x threads_per_block launch.
    int global_thread_id = threadIdx.x + blockIdx.x * blockDim.x
                         + blockIdx.y * gridDim.x * blockDim.x;
    int max = gridDim.x * blockDim.x * gridDim.y;  // total thread count
    atomicAdd(counter, 1);
    int data = 0;
    int id = global_thread_id % max;  // identity here; kept for safety
    while (true) {
        data += 1;
        tmp[id] = data;
        __syncthreads();  // uniform: all threads loop forever
        if (data == 100000) {
            atomicAdd(counter, 1);  // fires exactly once per thread
        }
    }
}

int main(int argc, char** argv) {
    if (argc < 6) {
        printf("usage: %s <device> <grid_x> <grid_y> <threads_per_block> <kernel(1|2)>\n",
               argv[0]);
        return 0;
    }
    int device = atoi(argv[1]);
    int grid_x = atoi(argv[2]);
    int grid_y = atoi(argv[3]);
    int threads_per_block = atoi(argv[4]);
    int kernel = atoi(argv[5]);

    CUDA_CHECK(cudaSetDevice(device));

    int* device_tmp;
    int* host_tmp;
    int* device_counter;
    int* new_counter;  // pinned host memory so the async D2H copy is truly async
    cudaStream_t kStream, mStream;  // kernel stream and monitor stream

    size_t tmp_count = (size_t)threads_per_block * grid_x * grid_y;
    host_tmp = (int*)malloc(sizeof(int) * tmp_count);
    if (host_tmp == NULL) {
        fprintf(stderr, "host malloc failed\n");
        return 1;
    }
    for (size_t i = 0; i < tmp_count; i++) {
        host_tmp[i] = 0;
    }

    CUDA_CHECK(cudaMalloc((void**)&device_tmp, sizeof(int) * tmp_count));
    CUDA_CHECK(cudaMalloc((void**)&device_counter, sizeof(int)));
    CUDA_CHECK(cudaMallocHost((void**)&new_counter, sizeof(int)));
    *new_counter = 0;
    CUDA_CHECK(cudaMemcpy(device_counter, new_counter, sizeof(int),
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(device_tmp, host_tmp, sizeof(int) * tmp_count,
                          cudaMemcpyHostToDevice));

    dim3 dimGrid(grid_x, grid_y);
    CUDA_CHECK(cudaStreamCreate(&kStream));
    CUDA_CHECK(cudaStreamCreate(&mStream));

    // Launch the never-terminating kernel on its own stream so the monitor
    // stream below can still make progress.
    switch (kernel) {
        case 1:
            kernel1<<<dimGrid, threads_per_block, 0, kStream>>>(device_counter,
                                                                device_tmp);
            break;
        case 2:
            kernel2<<<dimGrid, threads_per_block, 0, kStream>>>(device_counter,
                                                                device_tmp);
            break;
        default:
            fprintf(stderr, "unknown kernel %d (expected 1 or 2)\n", kernel);
            return 1;
    }
    CUDA_CHECK(cudaGetLastError());  // catch bad launch configuration

    // Monitor loop: poll the device counter every 2 s via the second stream.
    // Runs forever by design (the kernels never finish).
    while (true) {
        sleep(2);
        CUDA_CHECK(cudaMemcpyAsync(new_counter, device_counter, sizeof(int),
                                   cudaMemcpyDeviceToHost, mStream));
        CUDA_CHECK(cudaStreamSynchronize(mStream));
        printf("%d\n", *new_counter);
        printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    }
}