// -*- c -*-
//
// Microbenchmark comparing lookup-table read latency through four GPU memory
// paths: __constant__ memory, the texture cache, __shared__ memory, and plain
// global memory.  Each kernel chases indices through a 256-entry table so
// that every load depends on the previous one (latency-bound by design).
//
// main() runs the whole battery under three table layouts:
//   RANDOM         - random permutation: each lane reads a different slot
//   WARP-BROADCAST - all 32 lanes of a warp read the same slot
//   LINEAR         - identity table: cur stops changing after the first read

// NOTE(review): the original include targets were mangled; stdio.h (printf)
// and stdlib.h (atoi, rand) are what the code below actually uses.
#include <stdio.h>
#include <stdlib.h>
#include "cuda.h"

// Abort with file/line and the CUDA error string if a runtime call fails.
// Kernel-launch errors are surfaced via CUDA_CHECK(cudaGetLastError()).
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(err_));                              \
            exit(1);                                                        \
        }                                                                   \
    } while (0)

// 256-entry lookup table in constant memory (broadcast-optimized cache).
__constant__ unsigned int c_vals[256];

// Legacy texture reference bound to the same 256-entry table.
// NOTE(review): texture references are deprecated (removed in CUDA 12);
// port to cudaTextureObject_t if this must build on a modern toolkit.
texture<unsigned int, 1, cudaReadModeElementType> tex;

// Every kernel performs N_ITERS * LOOKUPS_PER_ITER serial table reads.
#define N_ITERS          1000
#define LOOKUPS_PER_ITER 10

// Chase indices through the constant-memory table.
// Expects blockDim.x == 256 so threadIdx.x indexes the table directly.
__global__ void const_speedtest(unsigned int *d_outvals)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int cur = c_vals[threadIdx.x];
    for (int i = 0; i < N_ITERS; i++) {
#pragma unroll
        for (int j = 0; j < LOOKUPS_PER_ITER; j++)
            cur = c_vals[cur] & 0xff;   // mask keeps index in [0, 255]
    }
    d_outvals[idx] = cur;   // store result so the chain is not optimized away
}

// Chase indices through the table via the texture cache.
// Expects blockDim.x == 256; tex must be bound to the 256-entry table.
__global__ void tex_speedtest(unsigned int *d_outvals)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int cur = tex1Dfetch(tex, threadIdx.x);
    for (int i = 0; i < N_ITERS; i++) {
#pragma unroll
        for (int j = 0; j < LOOKUPS_PER_ITER; j++)
            cur = tex1Dfetch(tex, cur) & 0xff;
    }
    d_outvals[idx] = cur;
}

// Chase indices through a shared-memory copy of the table.
// Expects blockDim.x == 256 so the block cooperatively fills all 256 slots.
__global__ void shmem_speedtest(unsigned int *d_outvals, unsigned int *d_invals)
{
    __shared__ unsigned int shared[256];
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    shared[threadIdx.x] = d_invals[threadIdx.x];
    // BUG FIX: the chase below reads slots written by *other* threads, so the
    // block must synchronize after the cooperative fill or it races.
    __syncthreads();
    unsigned int cur = shared[threadIdx.x];
    for (int i = 0; i < N_ITERS; i++) {
#pragma unroll
        for (int j = 0; j < LOOKUPS_PER_ITER; j++)
            cur = shared[cur] & 0xff;
    }
    d_outvals[idx] = cur;
}

// Chase indices through the table directly in global memory.
// Expects blockDim.x == 256.
__global__ void global_speedtest(unsigned int *d_outvals, unsigned int *d_invals)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int cur = d_invals[threadIdx.x];
    for (int i = 0; i < N_ITERS; i++) {
#pragma unroll
        for (int j = 0; j < LOOKUPS_PER_ITER; j++)
            cur = d_invals[cur] & 0xff;
    }
    d_outvals[idx] = cur;
}

// Copy the host table into both the global buffer (also backing the texture)
// and the constant symbol so all four kernels see the same data.
static void upload_table(const unsigned int *vals, unsigned int *d_vals)
{
    CUDA_CHECK(cudaMemcpy(d_vals, vals, sizeof(unsigned int) * 256,
                          cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpyToSymbol(c_vals, vals, sizeof(unsigned int) * 256));
}

// Launch each kernel 10x back-to-back, then wait, matching the original
// per-kernel batching (launch overlap within a battery, sync between them).
static void run_all_kernels(int n_blocks, int block_size,
                            unsigned int *d_outvals, unsigned int *d_vals)
{
    for (int i = 0; i < 10; i++)
        const_speedtest<<< n_blocks, block_size >>>(d_outvals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < 10; i++)
        tex_speedtest<<< n_blocks, block_size >>>(d_outvals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < 10; i++)
        shmem_speedtest<<< n_blocks, block_size >>>(d_outvals, d_vals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    for (int i = 0; i < 10; i++)
        global_speedtest<<< n_blocks, block_size >>>(d_outvals, d_vals);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());
}

// Usage: prog [device-index]
int main(int argc, char *argv[])
{
    int n_blocks = 500;
    int block_size = 256;   // kernels require blockDim.x == 256
    int device = 0;
    if (argc > 1)
        device = atoi(argv[1]);

    cudaDeviceProp deviceProp;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProp, device));
    printf("Selecting Device %d: %s\n", device, deviceProp.name);
    CUDA_CHECK(cudaSetDevice(device));

    // Global memory for kernel output (one word per thread).
    unsigned int *d_outvals;
    CUDA_CHECK(cudaMalloc((void**)&d_outvals,
                          sizeof(unsigned int) * n_blocks * block_size));

    // Host copy of the lookup table.
    unsigned int vals[256];

    // RANDOM MODE: Fisher-Yates shuffle of the identity table.
    // rand() is intentionally unseeded, so runs are reproducible.
    for (int i = 0; i < 256; i++)
        vals[i] = i;
    for (int i = 0; i < 256; i++) {
        int swap = vals[i];
        // NEVER EVER DO THIS IF YOU CARE ABOUT RAND QUALITY
        int j = i + (rand() % (256 - i));
        vals[i] = vals[j];
        vals[j] = swap;
    }

    // Device-global copy of the table (also backs the texture).
    unsigned int *d_vals;
    CUDA_CHECK(cudaMalloc((void**)&d_vals, sizeof(unsigned int) * 256));
    upload_table(vals, d_vals);
    CUDA_CHECK(cudaBindTexture(0, tex, d_vals, sizeof(unsigned int) * 256));

    run_all_kernels(n_blocks, block_size, d_outvals, d_vals);

    // WARP-BROADCAST MODE: each aligned group of 32 slots holds one value,
    // so all lanes of a warp chase the same index.
    for (int i = 0; i < 256; i += 32) {
        vals[i] = rand() & 0xff;
        for (int j = i + 1; j < i + 32; j++)
            vals[j] = vals[i];
    }
    upload_table(vals, d_vals);

    run_all_kernels(n_blocks, block_size, d_outvals, d_vals);

    // LINEAR MODE: identity table; cur is fixed after the first lookup.
    for (int i = 0; i < 256; i++)
        vals[i] = i;
    upload_table(vals, d_vals);

    run_all_kernels(n_blocks, block_size, d_outvals, d_vals);

    // Release device resources (the original leaked both allocations).
    CUDA_CHECK(cudaUnbindTexture(tex));
    CUDA_CHECK(cudaFree(d_vals));
    CUDA_CHECK(cudaFree(d_outvals));
    return 0;
}