Quick question
If three ints (32 bits) are expanded to a uint64 following some rules (complex bit operation) and packed staggered, how can the warp utilisation be improved?
Context
This MWE is based on a pixel render problem where a Morton code is introduced to map the pixel 3D coordinates.
Float coordinates of pixels are truncated into int ones (cell-wise). Then, cell coordinates will be fed into the Morton mask for a Morton coordinate.
The MASK of the Morton code is defined as,
(0x9249249249249249UL<<3) & 0xfffffffffffff000UL | (3<<2) // x
(0x2492492492492492UL<<3) & 0xfffffffffffff000UL | (3<<4) // y
(0x4924924924924924UL<<3) & 0xfffffffffffff000UL | (3<<6) // z
Morton coordinate in each axis = 52 higher bits + 12 lower bits
High one repeats from right to left in the stride of 3:
X: … 100 100 (x )
Y: … 010 010 (y)
Z: … 001 001 (z)
The lower one is fixed as:
X: 0000 11(x) 00(y) 00(z) 00
Y: 0000 00(x) 11(y) 00(z) 00
Z: 0000 00(x) 00(y) 11(z) 00
For some reason, 6 bits are reserved for additional information not shown in this MWE.
Given pixel holding cell position (5, 3, 6) in decimal and (101, 011, 110 ) in binary.
Since 5 and 6 are greater than 3 (0b11),
Higher bits are set:
X Higher bits are: 49 bits + 100
Z Higher bits are: 49 bits + 001
Not for Y:
Y Higher bits are: 49 bits + 000
X Lower bits are: 0000 01(x) 00(y) 00(z) 00
Y Lower bits are: 0000 00(x) 11(y) 00(z) 00
Z Lower bits are: 0000 00(x) 00(y) 10(z) 00
Finally, all three Morton X Y and Z will be Packed as:
49*0 + 101 + 0000 + 01 11 10 00
Sorry for the tedious explanation.
Codes
// testutil.cuh
// Defined the Morton transformation for each axis and packing kernel
#ifndef __TEST_UTIL__
#define __TEST_UTIL__
#include <cstdio>
#include <cstdint>
#define CHECK_CUDA_ERROR() \
{ \
cudaError_t cucheck_err = cudaGetLastError(); \
if(cucheck_err != cudaSuccess) { \
const char *err_str = cudaGetErrorString(cucheck_err); \
printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
} \
}
#define cudaErrChk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// Generation of Morton code
__device__
uint64_t BitExpand(const uint64_t mask,
const int data);
// Morton kernel
__global__
void formParticleFullOffset(const int datanum,
const float ratio,
float** data3d,
uint64_t* output);
#endif
// testutil.cu
#include "testutil.cuh"
extern __device__ __constant__ uint64_t glomemmasks[3];
__device__
uint64_t BitExpand(const uint64_t mask,
const int data)
{
uint64_t rmask = __brevll(mask);
int dat = data;
uint64_t result = 0;
unsigned char lz, offset = __clzll(rmask);
while (rmask) {
lz = __clzll(rmask) + 1;
result = result << lz | (dat & 1);
dat >>= 1, rmask <<= lz;
}
result = __brevll(result) >> __clzll(mask);
return result;
}
__global__
void formParticleFullOffset(const int datanum,
const float ratio,
float** data3d,
uint64_t* output)
{
int parid = blockIdx.x * blockDim.x + threadIdx.x;
if (parid < datanum)
{
output[parid] |= BitExpand(glomemmasks[0], (int)((data3d[0][parid]) * ratio + 0.5f) - 1);
output[parid] |= BitExpand(glomemmasks[1], (int)((data3d[1][parid]) * ratio + 0.5f) - 1);
output[parid] |= BitExpand(glomemmasks[2], (int)((data3d[2][parid]) * ratio + 0.5f) - 1);
}
}
// test.cu
#include "testutil.cuh"
__device__ __constant__ uint64_t glomemmasks[3];
int main()
{
uint64_t hostglo[3] = {
(uint64_t) (0x9249249249249249UL<<3) & 0xfffffffffffff000UL | (3<<2),
(uint64_t) (0x2492492492492492UL<<3) & 0xfffffffffffff000UL | (3<<4),
(uint64_t) (0x4924924924924924UL<<3) & 0xfffffffffffff000UL | (3<<6)
};
cudaErrChk(cudaMemcpyToSymbol(glomemmasks, hostglo, 3*sizeof(uint64_t)));
const int datanumx = 100;
const int datanumy = 25;
const int datanumz = 80;
const int datanum = datanumx * datanumy * datanumz;
const float dataspace = 0.002f;
// 3D DATA: SOA
float* data3d_flatten;
float** data3d;
uint64_t* output;
cudaErrChk(cudaMallocManaged((void**)&data3d_flatten, sizeof(float)*3*datanum));
cudaErrChk(cudaMallocManaged((void**)&data3d, sizeof(float*)*3));
cudaErrChk(cudaMalloc((void**)&output, sizeof(uint64_t)*datanum));
float* hostptrtmp[3];
for(int i =0; i < 3; ++i)
hostptrtmp[i] = reinterpret_cast<float*>((uint64_t)data3d_flatten
+ uint64_t(i*datanum*sizeof(float)));
cudaErrChk(cudaMemcpy(data3d, hostptrtmp, 3*sizeof(float*), cudaMemcpyHostToDevice));
// In my Morton code, data is filled in the order of z y and x
for(int dimx = 0; dimx < datanumx; ++dimx)
{
for(int dimy = 0; dimy < datanumy; ++dimy)
{
for(int dimz = 0; dimz < datanumz; ++dimz)
{
int ptloc = dimx*datanumy*datanumz + dimy*datanumz + dimz;
data3d[0][ptloc] = (dimx+0.5f)*dataspace;
data3d[1][ptloc] = (dimy+0.5f)*dataspace;
data3d[2][ptloc] = (dimz+0.5f)*dataspace;
}
}
}
int threadnum = 256;
int blocknum = (datanum + 255)/threadnum;
formParticleFullOffset<<<blocknum, threadnum>>>(datanum, 1.f/dataspace, data3d, output);
cudaDeviceSynchronize();
CHECK_CUDA_ERROR()
cudaFree(data3d_flatten);
cudaFree(data3d);
cudaFree(output);
return 0;
}
Test:
Built by
nvcc -o test *.cu -rdc=true
gcc 9.4.0 + cuda 12.2 + sm75 (Turing)
Results
Compute throughput: 75 % Memory throughput 23 %
Warps available: 7.21 warps per scheduler, but only 2.26 are eligible.
L1 hit: 56% L2 hit: 54%
How come the performance is sub-optimal (the utilization of warp is less than 50%)?
Thank you for any comments!