Have an application for which all threads load unsigned values (0 to 15) from a table in memory in a random disparate pattern, and I wrote a test function to determine the ‘optimal’ implementation.
Since the values are small I opted to use the native CUDA uchar1 type, but thought there would not be much difference in performance between loading single byte values from constant memory vs loading a regular 4 byte integer values from constant memory.
It turns out that using the uchar1 type in constant to get those integer values (casting to 32 bit int after load) is about 2x faster than the same implementation using 32 bit integers.
This ‘dummy’ test was performed on a mobile laptop GTX 980M, which is not known for good memory performance but still gives an idea of the runtime difference.
Using uchar1 type:
best test val=1023
==4392== Profiling application: testApp.exe
==4392== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
498.45ms 1.3120us - - - - - 1.2120KB 923.78MB/s GeForce GTX 980 1 7 [CUDA memcpy HtoD]
499.40ms 3.4234ms - - - - - 36.373MB 10.625GB/s GeForce GTX 980 1 7 [CUDA memcpy HtoD]
502.96ms 2.4386ms (16384 1 1) (64 1 1) 25 2.2280KB 0B - - GeForce GTX 980 1 7 test_read(int const *, int*) [100]
505.47ms 7.8080us - - - - - 65.536KB 8.3934GB/s GeForce GTX 980 1 7 [CUDA memcpy DtoH]
In this ‘dummy’ test application there are 2^20 threads launched, and each 64-thread block loads 555 pre-filled random index values (int32) from global memory and stores them in shared memory. Then each individual thread reads 555 randomly chosen values from constant memory, performs some silly calculations, and then continues to do some silly max-value reduction using shared memory and __shfl().
This exploratory test code is here:
#include <algorithm>
#include <iostream>
#include <fstream>
#include <sstream>
#include <utility>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <string>
#include <cmath>
//#include <map>
#include <ctime>
#include <cuda.h>
#include <math_functions.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <MMSystem.h>
#pragma comment(lib, "winmm.lib")
#define _CRTDBG_MAP_ALLOC
#include <crtdbg.h>
using namespace std;
typedef long long ll;
#define NUM_ELEM 1212
#define INNER_LOOP_INDICES 555
#define _DTH cudaMemcpyDeviceToHost
#define _DTD cudaMemcpyDeviceToDevice
#define _HTD cudaMemcpyHostToDevice
#define THREADS 64
__constant__ uchar1 Cache[NUM_ELEM];
// Benchmark kernel: each block cooperatively stages INNER_LOOP_INDICES
// random indices from global memory into shared memory, then every thread
// performs INNER_LOOP_INDICES scattered single-byte reads from the
// __constant__ Cache table, followed by a warp-level max reduction and a
// single per-block max written to blk_result[blockIdx.x].
//
// Launch layout assumed: 1-D grid, blockDim.x == THREADS (64), i.e. exactly
// two warps per block (bst_val only has room for 2 warp partials).
__global__ void test_read(
    const int *indices,
    int *blk_result){
    const int tid=threadIdx.x+blockIdx.x*blockDim.x;
    const int warpIdx=threadIdx.x%32;
    __shared__ int loc_idx_cache[INNER_LOOP_INDICES];
    __shared__ int bst_val[2];
    // Coalesced cooperative load of this block's index list through the
    // read-only data cache (__ldg).
    for(int i=threadIdx.x;i<INNER_LOOP_INDICES;i+=THREADS){
        loc_idx_cache[i]=__ldg(&indices[blockIdx.x*INNER_LOOP_INDICES+i]);
    }
    __syncthreads();
    int vvv=tid%INNER_LOOP_INDICES,temp;
    // "Silly" work to prevent the compiler from optimizing the scattered
    // constant-memory byte reads away.
    for(int i=0;i<INNER_LOOP_INDICES;i++){
        vvv^=(int(Cache[loc_idx_cache[i]].x)|i);
    }
    // Rotate-style warp max reduction: after the 5 steps every lane holds
    // the warp-wide max (srcLane wraps modulo warp width). All 64 threads
    // of the block are convergent here, so a full-warp mask is correct.
    // FIX: use __shfl_sync — the legacy mask-less __shfl is removed for
    // Volta and later architectures.
#pragma unroll
    for(int i=16;i>0;i>>=1){
        temp=__shfl_sync(0xffffffff,vvv,warpIdx+i);
        vvv=max(vvv,temp);
    }
    // Lane 0 of each of the two warps publishes its warp's max.
    if(warpIdx==0){
        bst_val[threadIdx.x>>5]=vvv;
    }
    __syncthreads();
    // Thread 0 combines the two warp partials into the block result.
    if(threadIdx.x==0){
        blk_result[blockIdx.x]=max(bst_val[0],bst_val[1]);
    }
}
// Host driver: fills the constant-memory byte table and the per-block random
// index lists, launches test_read over 2^20 threads (64 per block), then
// finishes the max reduction over the per-block results on the host.
int main(){
    srand(time(NULL));
    cudaError_t err;
    const int tot_threads=(1<<20);
    const int num_blocks=tot_threads/THREADS;
    int *H_idx,*H_result;
    // Pinned host buffers for faster host<->device transfers.
    err=cudaHostAlloc((void**)&H_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int),cudaHostAllocDefault);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaHostAlloc((void**)&H_result,num_blocks*sizeof(int),cudaHostAllocDefault);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    uchar1 *H_Cache;
    err=cudaHostAlloc((void**)&H_Cache,NUM_ELEM*sizeof(uchar1),cudaHostAllocDefault);
    // FIX: this allocation's return code was previously the only one left
    // unchecked.
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    for(int i=0;i<NUM_ELEM;i++){
        // FIX: `unsigned char(...)` is an ill-formed functional cast (the
        // type name is multi-token); use a C-style cast instead.
        // NOTE(review): the post says values are 0..15, but %256 is kept
        // here to preserve the test's original behavior — confirm intent.
        H_Cache[i].x=(unsigned char)(rand()%256);
    }
    // Pre-fill each block's 555 random table indices.
    for(int i=0;i<num_blocks;i++){
        for(int j=0;j<INNER_LOOP_INDICES;j++){
            H_idx[i*INNER_LOOP_INDICES+j]=(rand()%NUM_ELEM);
        }
    }
    err=cudaMemcpyToSymbol(Cache,H_Cache,NUM_ELEM*sizeof(uchar1));
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    int *D_idx,*D_result;
    err=cudaMalloc((void**)&D_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int));
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaMalloc((void**)&D_result,num_blocks*sizeof(int));
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaMemcpy(D_idx,H_idx,INNER_LOOP_INDICES*num_blocks*sizeof(int),_HTD);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    test_read<<<num_blocks,THREADS>>>(D_idx,D_result);
    // FIX: catch launch-configuration errors immediately; asynchronous
    // execution errors are surfaced by the cudaDeviceSynchronize() below.
    err=cudaGetLastError();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaDeviceSynchronize();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaMemcpy(H_result,D_result,num_blocks*sizeof(int),_DTH);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    // Final reduction step over per-block maxima on the host.
    int best_val=-(1<<20);
    for(int i=0;i<num_blocks;i++){
        best_val=max(best_val,H_result[i]);
    }
    printf("\n best test val=%d \n",best_val);
    err=cudaFree(D_idx);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaFree(D_result);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaFreeHost(H_idx);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaFreeHost(H_Cache);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaFreeHost(H_result);
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    return 0;
}
// Raises the Windows multimedia timer resolution for the process (paired
// with DestroyMMTimer). Returns false if the timer capabilities cannot be
// queried; otherwise begins a timer period clamped to the device's
// supported range and returns true.
// NOTE(review): wTimerRes is taken BY VALUE, so the resolution actually
// chosen here — min(max(tc.wPeriodMin, 1), tc.wPeriodMax) — is never seen
// by the caller. DestroyMMTimer may therefore call timeEndPeriod with a
// different value than the one passed to timeBeginPeriod here. Consider
// passing wTimerRes by reference/pointer to fix the pairing.
bool InitMMTimer(UINT wTimerRes){
TIMECAPS tc;
if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) {return false;}
wTimerRes = min(max(tc.wPeriodMin, 1), tc.wPeriodMax);
timeBeginPeriod(wTimerRes);
return true;
}
// Ends the multimedia timer period begun by InitMMTimer, but only when the
// initialization actually succeeded (init == true); otherwise a no-op.
void DestroyMMTimer(UINT wTimerRes, bool init){
    if (!init) {
        return;
    }
    timeEndPeriod(wTimerRes);
}
The max value reduction portion of this code was added just to make sure that the compiler did not get too smart.
I was lazy and performed the last reduction step on the host, but this is just a test to determine the best way to access a constant integer array.
When I change the type of constant memory from uchar1 to int this is the nvprof output:
best test val=1023
==1084== Profiling application: testApp.exe
==1084== Profiling result:
Start Duration Grid Size Block Size Regs* SSMem* DSMem* Size Throughput Device Context Stream Name
536.01ms 1.5360us - - - - - 4.8480KB 3.1563GB/s GeForce GTX 980 1 7 [CUDA memcpy HtoD]
536.91ms 3.3561ms - - - - - 36.373MB 10.838GB/s GeForce GTX 980 1 7 [CUDA memcpy HtoD]
540.40ms 5.0230ms (16384 1 1) (64 1 1) 32 2.2280KB 0B - - GeForce GTX 980 1 7 test_read(int const *, int*) [100]
545.51ms 7.8400us - - - - - 65.536KB 8.3592GB/s GeForce GTX 980 1 7 [CUDA memcpy DtoH]
Again, this is the mobile GTX 980M in my laptop, so I would expect desktop GPUs to perform much better on this test.
I also tried loading to shared memory from global and that was the second fastest method.
My confusion about this relates to the big performance difference between making those disparate load from constant memory in 1 byte form, vs loading in 4 byte form.
Should I expect any type of bank conflicts in constant memory using such a scheme?
I thought that loads smaller than 4 bytes were treated the same as 4 byte loads, due to minimum size of registers in kernel.
My objective is to determine the optimal method to load common values used by all threads during the course of a simulation. I need this table to be as small as possible and the reads to be as fast as possible (but the reads will be in random order). I may even split the uchar1 into two 4-bit fields, since the value range of the integers is 0 to 15, but I would like to decide whether that would be worth the effort.