Random loads of small unsigned integer values, __constant__ vs. __shared__ vs. global

I have an application in which all threads load small unsigned values (0 to 15) from a table in memory in a random, disparate pattern, so I wrote a test function to determine the ‘optimal’ implementation.
Since the values are small I opted for the native CUDA uchar1 type, but I assumed there would not be much difference in performance between loading single-byte values from constant memory and loading regular 4-byte integer values from constant memory.

It turns out that using the uchar1 type in constant memory (casting to a 32-bit int after the load) is about 2x faster than the same implementation using 32-bit integers.

This ‘dummy’ test was performed on a mobile laptop GTX 980M, which is not known for good memory performance, but it still gives an idea of the runtime difference.

Using uchar1 type:

best test val=1023
==4392== Profiling application: testApp.exe
==4392== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
498.45ms  1.3120us                    -               -         -         -         -  1.2120KB  923.78MB/s  GeForce GTX 980         1         7  [CUDA memcpy HtoD]
499.40ms  3.4234ms                    -               -         -         -         -  36.373MB  10.625GB/s  GeForce GTX 980         1         7  [CUDA memcpy HtoD]
502.96ms  2.4386ms          (16384 1 1)        (64 1 1)        25  2.2280KB        0B         -           -  GeForce GTX 980         1         7  test_read(int const *, int*) [100]
505.47ms  7.8080us                    -               -         -         -         -  65.536KB  8.3934GB/s  GeForce GTX 980         1         7  [CUDA memcpy DtoH]

In this ‘dummy’ test application 2^20 threads are launched. Each 64-thread block loads 555 pre-filled random index values (int32) from global memory and stores them in shared memory. Then each individual thread reads 555 randomly chosen values from constant memory, performs some silly calculations, and finishes with a silly max-value reduction using shared memory and __shfl().

This exploratory test code is here:

#include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <utility>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <string>
#include <cmath>
//#include <map>
#include <ctime>
#include <cuda.h>
#include <math_functions.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <MMSystem.h>
#pragma comment(lib, "winmm.lib")
#define _CRTDBG_MAP_ALLOC
#include <crtdbg.h>
using namespace std;

typedef long long ll;
#define NUM_ELEM 1212
#define INNER_LOOP_INDICES 555

#define _DTH cudaMemcpyDeviceToHost
#define _DTD cudaMemcpyDeviceToDevice
#define _HTD cudaMemcpyHostToDevice

#define THREADS 64

__constant__ uchar1 Cache[NUM_ELEM];

__global__ void test_read(
			const int *indices,
			int *blk_result){

	const int tid=threadIdx.x+blockIdx.x*blockDim.x;
	const int warpIdx=threadIdx.x%32;

	__shared__ int loc_idx_cache[INNER_LOOP_INDICES];
	__shared__ int bst_val[2];

	// stage this block's 555 random indices from global into shared memory
	for(int i=threadIdx.x;i<INNER_LOOP_INDICES;i+=THREADS){
		loc_idx_cache[i]=__ldg(&indices[blockIdx.x*INNER_LOOP_INDICES+i]);
	}
	__syncthreads();

	int vvv=tid%INNER_LOOP_INDICES,temp;
	
	// 555 random reads from the constant memory table; on each iteration the
	// address is uniform across the warp
	for(int i=0;i<INNER_LOOP_INDICES;i++){
		vvv^=(int(Cache[loc_idx_cache[i]].x)|i);
	}

	// warp-level max reduction via shuffle
#pragma unroll
	for(int i=16;i>0;i>>=1){
		temp=__shfl(vvv,warpIdx+i);
		vvv=max(vvv,temp);
	}

	if(warpIdx==0){
		bst_val[threadIdx.x>>5]=vvv;
	}
	__syncthreads();

	if(threadIdx.x==0){
		blk_result[blockIdx.x]=max(bst_val[0],bst_val[1]);
	}
}



int main(){

	srand(time(NULL));
	cudaError_t err;
	const int tot_threads=(1<<20);
	const int num_blocks=tot_threads/THREADS;
	int *H_idx,*H_result;
	err=cudaHostAlloc((void**)&H_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int),cudaHostAllocDefault);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaHostAlloc((void**)&H_result,num_blocks*sizeof(int),cudaHostAllocDefault);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	uchar1 *H_Cache;
	err=cudaHostAlloc((void**)&H_Cache,NUM_ELEM*sizeof(uchar1),cudaHostAllocDefault);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
		
	for(int i=0;i<NUM_ELEM;i++){
		H_Cache[i].x=(unsigned char)(rand()%256);
	}
	for(int i=0;i<num_blocks;i++){
		for(int j=0;j<INNER_LOOP_INDICES;j++){
			H_idx[i*INNER_LOOP_INDICES+j]=(rand()%NUM_ELEM);
		}
	}
	
	err=cudaMemcpyToSymbol(Cache,H_Cache,NUM_ELEM*sizeof(uchar1));
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	int *D_idx,*D_result;

	err=cudaMalloc((void**)&D_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int));
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaMalloc((void**)&D_result,num_blocks*sizeof(int));
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMemcpy(D_idx,H_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int),_HTD);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	test_read<<<num_blocks,THREADS>>>(D_idx,D_result);
	err=cudaDeviceSynchronize();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMemcpy(H_result,D_result,num_blocks*sizeof(int),_DTH);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	int best_val=-(1<<20);
	for(int i=0;i<num_blocks;i++){
		best_val=max(best_val,H_result[i]);
	}

	printf("\n best test val=%d \n",best_val);

	err=cudaFree(D_idx);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaFree(D_result);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	
	err=cudaFreeHost(H_idx);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaFreeHost(H_Cache);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaFreeHost(H_result);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}


	err=cudaDeviceReset();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	return 0;
}

bool InitMMTimer(UINT wTimerRes){
	TIMECAPS tc;
	if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
	wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
	timeBeginPeriod(wTimerRes); 
	return true;
}

void DestroyMMTimer(UINT wTimerRes, bool init){
	if(init)
		timeEndPeriod(wTimerRes);
}

The max-value reduction portion of this code was added just to make sure that the compiler did not get too smart and optimize the loads away.

I was lazy and performed the last reduction step on the host, but this is just a test to determine the best way to access a constant integer array.

When I change the type of the constant memory array from uchar1 to int, this is the nvprof output:

best test val=1023
==1084== Profiling application: testApp.exe
==1084== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
536.01ms  1.5360us                    -               -         -         -         -  4.8480KB  3.1563GB/s  GeForce GTX 980         1         7  [CUDA memcpy HtoD]
536.91ms  3.3561ms                    -               -         -         -         -  36.373MB  10.838GB/s  GeForce GTX 980         1         7  [CUDA memcpy HtoD]
540.40ms  5.0230ms          (16384 1 1)        (64 1 1)        32  2.2280KB        0B         -           -  GeForce GTX 980         1         7  test_read(int const *, int*) [100]
545.51ms  7.8400us                    -               -         -         -         -  65.536KB  8.3592GB/s  GeForce GTX 980         1         7  [CUDA memcpy DtoH]

Again, this is the mobile GTX 980M in my laptop, so I would expect desktop GPUs to perform much better on this test.

I also tried staging the table into shared memory from global, and that was the second-fastest method.
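That variant is just the usual cooperative staging pattern at the top of the kernel, something like this sketch (g_table here is a hypothetical global copy of the table, not a name from my actual code):

__global__ void test_read_shared(const uchar1 *g_table,const int *indices,int *blk_result){
	__shared__ uchar1 s_table[NUM_ELEM];
	// each thread copies a strided subset of the table, then the block syncs
	for(int i=threadIdx.x;i<NUM_ELEM;i+=blockDim.x){
		s_table[i]=g_table[i];
	}
	__syncthreads();
	// ... same random reads as before, but against s_table instead of Cache
}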

My confusion relates to the big performance difference between making those disparate loads from constant memory in 1-byte form vs. 4-byte form.

Should I expect any type of bank conflicts in constant memory using such a scheme?

I thought that loads smaller than 4 bytes were treated the same as 4-byte loads, due to the minimum register size in a kernel.

My objective is to determine the optimal method of loading common values used by all threads during the course of a simulation. I need this table to be as small as possible and the reads to be as fast as possible (but the reads will be in random order). I may even split each byte into two 4-bit fields, since the value range of the integers is 0 to 15, but I would like to decide whether that would be worth the effort.

What is the size of the data per block?
Does each block use the same data or separate data?
Statistically how many times is each data element read by a block?

On Maxwell, global, local, surface, and texture operations all go through the texture cache, which is larger than the constant cache. The texture unit has significantly higher performance than random constant accesses, especially if there is address divergence in the warp.

Shared memory is faster than texture at resolving random accesses.

If the data is small enough to fit in shared memory and each data element will be accessed multiple times, then shared memory probably offers the best performance; otherwise, I would use texture.
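For a table kept in global memory, the read-only (texture) path can be requested with const __restrict__ pointers or an explicit __ldg(); a minimal sketch, assuming an unsigned char table rather than the exact types used elsewhere in this thread:

__global__ void read_via_readonly(const unsigned char* __restrict__ table,
                                  const int* __restrict__ indices,int *out,int n){
	int tid=threadIdx.x+blockIdx.x*blockDim.x;
	if(tid<n){
		// __ldg routes the load through the read-only data (texture) cache
		out[tid]=(int)__ldg(&table[indices[tid]]);
	}
}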

If you read a 32-bit element from constant memory and have to select one byte from it, you incur an additional dependent bit-field extract, which is likely slower than the byte accesses. And if you were to pad the byte elements in constant memory out to 32 bits, the constant cache hit rate would drop, since you would be decreasing its efficiency.

Curious. Aren’t all 64 threads in a block doing exactly the same thing during the read of global, the write to shared memory, and the read of constant? They are all even writing to the same location in shared.

My guess is that the improved perf going from int to uchar is due to the fact that the constant cache holds 4 times as many uchar entries as int entries, and the same can be said for each cache line. Once you load a cache line, it’s 4x as likely that a subsequent load will come from the same line (assuming some sort of disparate/random loading pattern, and that you don’t change NUM_ELEM as you go from int to uchar).
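To put rough numbers on that: with NUM_ELEM at 1212, the uchar table is 1212 bytes (about 19 cache lines of 64 bytes) while the int table is 4848 bytes (about 76 lines), and each resident line covers 64 byte-sized entries vs. 16 int entries.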

That’s cool. I didn’t know you could make non warp uniform accesses to constant memory:

LDC.U8 R8, c[0x3][R8];

I can probably explain the discrepancy. The constant cache is fast enough to feed instructions at the same speed as register access, but it can only do this in 64-byte chunks at a time. By using an 8-bit format you compress more of your data into that level of cache that sits very close to the CUDA cores, so you end up with fewer cache misses than with 32-bit accesses. You’re probably not seeing a 4x speedup because the code is being slowed down by quarter-throughput I2I instructions:

I2I.U32.U8 R8, R10;

These are probably totally superfluous and it’s a shame the compiler thinks it needs them.

You can make non-uniform accesses. They serialize.

But I don’t see any evidence of non-uniform access in the source. Maybe I am missing it.

This is the load from constant:

vvv^=(int(Cache[loc_idx_cache[i]].x)|i);

I don’t see anything non-uniform about it.

Oh, I wasn’t studying the source code as closely as the sass. I was just surprised to see a register in the constant cache address. But if they serialize then that’s not useful.

But as a general rule, I’ve seen lots of instances where an x2x instruction is generated by the compiler that is an effective no-op.

Excessive use of I2Is used to be a common problem when operating on types narrower than ‘int’. The problem starts with the C/C++ expression evaluation semantics which require that any operand of a type narrower than ‘int’ is first converted to ‘int’. Compilers then need to apply significant effort to determine where such widening is not necessary to produce identical results under the “as-if” rule. Most uses of redundant I2Is in compiler-generated code should be eliminated by now.
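A minimal host-side illustration of those promotion semantics (plain C++, nothing CUDA-specific):

unsigned char a=0x0F,b=0xF0;
// C/C++ integer promotion: both operands are widened to int before the
// bitwise OR is evaluated, so the expression already has type int
int c=a|b;   // c == 0xFF; no further widening conversion should be needed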

Apparently the compiler uses the I2I to apply zero extension in this case, not knowing that the LDC.U8 already took care of that. Speculation: A PTX instruction for zero extension is emitted by NVVM, which knows next to nothing about machine specifics, and thus is not aware that the load instruction is zero extending. These two instructions (a load instruction, followed by a zero-extend instruction) are then handed to PTXAS, which does have the machine knowledge, but for lack of range tracking cannot tell the I2I is redundant.

If I have some time I will check the generated PTX to see whether my hypothesis holds.

[Later:] I compiled the code as posted, and don’t see any I2I instructions in the SASS. I used the CUDA 6.5 toolchain and compiled for sm_30 and sm_50.

I just tried this:

/usr/local/cuda-6.5/bin/nvcc -arch sm_52 -cubin const.cu

and the I2Is go away. So it looks like 7.5 has regressed somewhat in this respect.
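For anyone who wants to check their own toolchain, the SASS in the resulting cubin can be dumped with:

cuobjdump -sass const.cubin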

In case any of you were wondering, the n in njuffa is for ninja.

Each thread block got its own set of random indices, which were pre-computed on the host and copied over to device memory, but you are correct that each thread in a block ends up reading from the same set of locations 555 times.

That part of the kernel code was the same for both the uchar1 version and the int version, so it probably does not cause the performance difference.

So I added a bit more code to make each thread in a block load from a different, somewhat random set of locations, which had a big impact on the performance of both implementations. The uchar1 version is still faster, but by a smaller margin.

nvprof output for the updated uchar1 version, with each thread loading from a different location:

==4132== Profiling application: testApp.exe
==4132== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
542.98ms  1.2800us                    -               -         -         -         -  1.2120KB  946.88MB/s  GeForce GTX 980         1         7  [CUDA memcpy HtoD]
543.79ms  3.2015ms                    -               -         -         -         -  36.373MB  11.361GB/s  GeForce GTX 980         1         7  [CUDA memcpy HtoD]
547.11ms  48.092ms          (16384 1 1)        (64 1 1)        33  2.2280KB        0B         -           -  GeForce GTX 980         1         7  test_read(int const *, int*) [100]
595.27ms  7.6480us                    -               -         -         -         -  65.536KB  8.5690GB/s  GeForce GTX 980         1         7  [CUDA memcpy DtoH]

int version nvprof:

==3316== Profiling application: testApp.exe
==3316== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
482.84ms  1.5360us                    -               -         -         -         -  4.8480KB  3.1563GB/s  GeForce GTX 980         1         7  [CUDA memcpy HtoD]
483.69ms  3.2161ms                    -               -         -         -         -  36.373MB  11.310GB/s  GeForce GTX 980         1         7  [CUDA memcpy HtoD]
487.03ms  59.163ms          (16384 1 1)        (64 1 1)        36  2.2280KB        0B         -           -  GeForce GTX 980         1         7  test_read(int const *, int*) [100]
546.28ms  7.6480us                    -               -         -         -         -  65.536KB  8.5690GB/s  GeForce GTX 980         1         7  [CUDA memcpy DtoH]

Now there is only about a 23% performance difference favoring the uchar1 version.

Here is the updated code, which makes each loading thread access a different set of values from constant memory. It is not the most robust implementation, but it still illustrates the performance difference:

#include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <utility>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <string>
#include <cmath>

#include <ctime>
#include <cuda.h>
#include <math_functions.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <MMSystem.h>
#pragma comment(lib, "winmm.lib")
#define _CRTDBG_MAP_ALLOC
#include <crtdbg.h>
using namespace std;

typedef long long ll;
#define NUM_ELEM 1212
#define INNER_LOOP_INDICES 555

#define _DTH cudaMemcpyDeviceToHost
#define _DTD cudaMemcpyDeviceToDevice
#define _HTD cudaMemcpyHostToDevice

#define THREADS 64

__constant__ uchar1 Cache[NUM_ELEM];

__global__ void test_read(
			const int *indices,
			int *blk_result){

	const int tid=threadIdx.x+blockIdx.x*blockDim.x;
	const int warpIdx=threadIdx.x%32;

	__shared__ int loc_idx_cache[INNER_LOOP_INDICES];
	__shared__ int bst_val[2];

	for(int i=threadIdx.x;i<INNER_LOOP_INDICES;i+=THREADS){
		loc_idx_cache[i]=__ldg(&indices[blockIdx.x*INNER_LOOP_INDICES+i]);
	}
	__syncthreads();

	int vvv=tid%INNER_LOOP_INDICES,temp;
	
	// each thread now offsets the shared index by threadIdx.x, so a warp
	// presents up to 32 distinct constant memory addresses per iteration
	for(int i=0;i<INNER_LOOP_INDICES;i++){
		vvv^=(int(Cache[(loc_idx_cache[i]+threadIdx.x)%NUM_ELEM].x)|i);
	}

#pragma unroll
	for(int i=16;i>0;i>>=1){
		temp=__shfl(vvv,warpIdx+i);
		vvv=max(vvv,temp);
	}

	if(warpIdx==0){
		bst_val[threadIdx.x>>5]=vvv;
	}
	__syncthreads();

	if(threadIdx.x==0){
		blk_result[blockIdx.x]=max(bst_val[0],bst_val[1]);
	}
}



int main(){

	srand(time(NULL));
	cudaError_t err;
	const int tot_threads=(1<<20);
	const int num_blocks=tot_threads/THREADS;
	int *H_idx,*H_result;
	err=cudaHostAlloc((void**)&H_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int),cudaHostAllocDefault);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaHostAlloc((void**)&H_result,num_blocks*sizeof(int),cudaHostAllocDefault);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	uchar1 *H_Cache;
	err=cudaHostAlloc((void**)&H_Cache,NUM_ELEM*sizeof(uchar1),cudaHostAllocDefault);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
		
	for(int i=0;i<NUM_ELEM;i++){
		H_Cache[i].x=(unsigned char)(rand()%256);
	}
	for(int i=0;i<num_blocks;i++){
		for(int j=0;j<INNER_LOOP_INDICES;j++){
			H_idx[i*INNER_LOOP_INDICES+j]=(rand()%NUM_ELEM);
		}
	}
	
	err=cudaMemcpyToSymbol(Cache,H_Cache,NUM_ELEM*sizeof(uchar1));
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	int *D_idx,*D_result;

	err=cudaMalloc((void**)&D_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int));
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaMalloc((void**)&D_result,num_blocks*sizeof(int));
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMemcpy(D_idx,H_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int),_HTD);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	test_read<<<num_blocks,THREADS>>>(D_idx,D_result);
	err=cudaDeviceSynchronize();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	err=cudaMemcpy(H_result,D_result,num_blocks*sizeof(int),_DTH);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	int best_val=-(1<<20);
	for(int i=0;i<num_blocks;i++){
		best_val=max(best_val,H_result[i]);
	}

	printf("\n best test val=%d \n",best_val);

	err=cudaFree(D_idx);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaFree(D_result);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	
	err=cudaFreeHost(H_idx);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaFreeHost(H_Cache);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
	err=cudaFreeHost(H_result);
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}


	err=cudaDeviceReset();
	if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}

	return 0;
}

bool InitMMTimer(UINT wTimerRes){
	TIMECAPS tc;
	if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
	wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
	timeBeginPeriod(wTimerRes); 
	return true;
}

void DestroyMMTimer(UINT wTimerRes, bool init){
	if(init)
		timeEndPeriod(wTimerRes);
}

Once you break the uniform loading, constant memory doesn’t make much sense. You went from 2-5 ms to 40-60 ms because of the N-way serialization.

Under these circumstances, one of the other caching approaches that Greg mentioned will probably make more sense and be faster. Even just ordinary global access might be faster.

Constant memory usage is generally only recommended when the loads are uniform.

I would make a slightly weaker statement, since my experience is that constant memory can still be of good use if there is occasional serialization, or serialization is frequent but at most two-way (meaning that the threads in a warp present at most two distinct addresses). This assumes that data is resident in the cache, i.e. accesses are generally cache hits.
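The two kernels in this thread illustrate that distinction directly (lines taken from the posted code):

// uniform: the index is the same for every thread in the warp, so the
// constant cache can broadcast the value in a single access
vvv^=(int(Cache[loc_idx_cache[i]].x)|i);

// divergent: the index depends on threadIdx.x, so a warp can present up to
// 32 distinct addresses, which the hardware serializes via replays
vvv^=(int(Cache[(loc_idx_cache[i]+threadIdx.x)%NUM_ELEM].x)|i);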

Yes, these answers make sense. Thanks.

My current implementation of this simulation uses shared memory for this table, and this test kernel was trying to see if there was an improved approach.

I still find it interesting that there was a performance difference between the 4-byte and the 1-byte accesses.

I have very large shared memory requirements in the simulations and have to compress my data as much as possible. I had just assumed that accesses of 1-byte values in shared memory would take the same amount of time as 4-byte accesses, with an increased chance of bank conflicts. But for me the 4x reduction in shared memory space was worth the higher bank-conflict risk.
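As I understand it, shared memory banks on Maxwell are 4 bytes wide (bank = (byte offset / 4) mod 32), so four consecutive uchar1 entries share a bank, and random byte indices conflict somewhat more often than random int indices; each conflict only costs a replay, though, which fits with the compression still winning overall.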

I took another look at this and wrote a better test case, using cuRAND to generate the unsigned integer indices in the kernels, and used shared memory for the storage rather than constant memory.

The uchar1 storage method is still faster than the integer storage method, while using 1/4 of the memory.
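I have not posted the updated code, but judging from the kernel names in the profile below, the state setup is just the standard cuRAND device-API pattern, roughly like this sketch:

#include <curand_kernel.h>

// one Philox state per thread, each seeded with a distinct subsequence
__global__ void setup_rand_states(curandStatePhilox4_32_10_t *states,int n,long long seed){
	int tid=threadIdx.x+blockIdx.x*blockDim.x;
	if(tid<n){
		curand_init((unsigned long long)seed,tid,0,&states[tid]);
	}
}

// inside the test kernel each thread then draws its own random indices:
//   curandStatePhilox4_32_10_t st=states[tid];
//   int idx=(int)(curand(&st)%NUM_ELEM);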

Output for the new implementation using shared memory and the uchar1 type:

==6192== Profiling application: ConsoleApplication1.exe
==6192== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
1.06ms  1.6960us                    -               -         -         -         -       64B  37.736MB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
1.10ms  1.4080us                    -               -         -         -         -  1.2120KB  860.80MB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
1.18ms  19.104us                    -               -         -         -         -  4.1943MB  219.55GB/s  GeForce GTX TIT         1         7  [CUDA memset]
1.20ms  927.31us          (16384 1 1)        (64 1 1)        25        0B        0B         -           -  GeForce GTX TIT         1         7  setup_rand_states(curandStatePhilox4_32_10*, int, __int64) [191]
2.18ms  17.264ms          (16384 1 1)        (64 1 1)        29  1.2800KB        0B         -           -  GeForce GTX TIT         1         7  test_kern(uchar1 const *, curandStatePhilox4_32_10*, float const *, float*) [198]
9.48ms  324.58us                    -               -         -         -         -  4.1943MB  12.922GB/s  GeForce GTX TIT         1         7  [CUDA memcpy DtoH]

where each thread in a block performs the 555 random loads from shared memory, reading the uchar1 values and casting to unsigned integer.

For that version I needed 1280 bytes of shared memory per thread block and the main kernel time including random number generation was 17.264 ms.

Then I tested the 32 bit integer version using shared memory for storage:

==1264== Profiling application: ConsoleApplication1.exe
==1264== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
180.79ms  1.6640us                    -               -         -         -         -       64B  38.462MB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
180.83ms  1.6010us                    -               -         -         -         -  4.8480KB  3.0281GB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
180.92ms  18.944us                    -               -         -         -         -  4.1943MB  221.41GB/s  GeForce GTX TIT         1         7  [CUDA memset]
180.95ms  925.94us          (16384 1 1)        (64 1 1)        25        0B        0B         -           -  GeForce GTX TIT         1         7  setup_rand_states(curandStatePhilox4_32_10*, int, __int64) [191]
181.93ms  19.220ms          (16384 1 1)        (64 1 1)        29  4.9160KB        0B         -           -  GeForce GTX TIT         1         7  test_kern(int const *, curandStatePhilox4_32_10*, float const *, float*) [198]
201.20ms  325.22us                    -               -         -         -         -  4.1943MB  12.897GB/s  GeForce GTX TIT         1         7  [CUDA memcpy DtoH]

In that case 4916 bytes of shared memory was used per thread block, and the running time for the same main kernel was 19.22 ms.

And since I know ahead of time that the values will never exceed a 4-bit representation, I tested an even more memory-efficient version, which stores two values per uchar1 byte and extracts the correct value using bit manipulation:

==6592== Profiling application: ConsoleApplication1.exe
==6592== Profiling result:
   Start  Duration            Grid Size      Block Size     Regs*    SSMem*    DSMem*      Size  Throughput           Device   Context    Stream  Name
256.52ms  1.6320us                    -               -         -         -         -       64B  39.216MB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
256.56ms     864ns                    -               -         -         -         -      606B  701.39MB/s  GeForce GTX TIT         1         7  [CUDA memcpy HtoD]
256.66ms  20.160us                    -               -         -         -         -  4.1943MB  208.05GB/s  GeForce GTX TIT         1         7  [CUDA memset]
256.68ms  928.37us          (16384 1 1)        (64 1 1)        25        0B        0B         -           -  GeForce GTX TIT         1         7  setup_rand_states(curandStatePhilox4_32_10*, int, __int64) [191]
257.66ms  18.416ms          (16384 1 1)        (64 1 1)        29      674B        0B         -           -  GeForce GTX TIT         1         7  test_kern(uchar1 const *, curandStatePhilox4_32_10*, float const *, float*) [198]
276.12ms  325.29us                    -               -         -         -         -  4.1943MB  12.894GB/s  GeForce GTX TIT         1         7  [CUDA memcpy DtoH]

This version needed only 674 bytes of shared memory per thread block, which is 13.7% of the shared memory needed for the integer version. Even with the overhead of the bit-field extract from the uchar1 type and the casting, it was still faster than the integer version, and only slightly slower than the standard one-value-per-byte uchar1 version.
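The 4-bit packing amounts to something along these lines (a sketch of the scheme, not the exact code from my test):

// packing: element i lands in byte i/2, low nibble for even i, high nibble
// for odd i; packed[] is assumed to start out zeroed
packed[i>>1].x|=(unsigned char)((val&0xF)<<((i&1)<<2));

// unpacking in the kernel: one shared memory byte load plus a dependent
// shift and mask
int v=(s_packed[i>>1].x>>((i&1)<<2))&0xF;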

These recent tests were done on a Titan X, and the code is slightly different from the original version due to the use of cuRAND.