**Quick question**

If three **int**s (32 bits) are expanded to a uint64 following some rules (complex bit operation) and packed staggered, how can the warp utilisation be improved?

**Context**

This MWE is based on a pixel render problem where a Morton code is introduced to map the pixel 3D **coordinates**.

Float coordinates of pixels are truncated into int ones (cell-wise). Then, cell coordinates will be fed into the Morton mask for a Morton coordinate.

The **MASK** of the Morton code is defined as,

```
(0x9249249249249249UL<<3) & 0xfffffffffffff000UL | (3<<2) // x
(0x2492492492492492UL<<3) & 0xfffffffffffff000UL | (3<<4) // y
(0x4924924924924924UL<<3) & 0xfffffffffffff000UL | (3<<6) // z
```

Morton coordinate in each axis = 52 higher bits + 12 lower bits

High one repeats from right to left in the stride of 3:

X: … 100 100 (x )

Y: … 010 010 (y)

Z: … 001 001 (z)

The lower one is fixed as:

X: 0000 11(x) 00(y) 00(z) 00

Y: 0000 00(x) 11(y) 00(z) 00

Z: 0000 00(x) 00(y) 11(z) 00

*For some reason, 6 bits are reserved for additional information not shown in this MWE.*

Given pixel holding cell position (5, 3, 6) in decimal and (101, 011, 110 ) in binary.

Since 5 and 6 are greater than 3 (0b11),

Higher bits are set:

**X** Higher bits are: 49 bits + 100

**Z** Higher bits are: 49 bits + 001

Not for Y:

**Y** Higher bits are: 49 bits + 000

**X** Lower bits are: 0000 01(x) 00(y) 00(z) 00

**Y** Lower bits are: 0000 00(x) 11(y) 00(z) 00

**Z** Lower bits are: 0000 00(x) 00(y) 10(z) 00

Finally, all three Morton **X** **Y** and **Z** will be **Packed** as:

49*0 + 101 + 0000 + 01 11 10 00

Sorry for the tedious explanation.

**Codes**

```
// testutil.cuh
// Defined the Morton transformation for each axis and packing kernel
#ifndef __TEST_UTIL__
#define __TEST_UTIL__
#include <cstdio>
#include <cstdint>
#define CHECK_CUDA_ERROR() \
{ \
cudaError_t cucheck_err = cudaGetLastError(); \
if(cucheck_err != cudaSuccess) { \
const char *err_str = cudaGetErrorString(cucheck_err); \
printf("%s (%d): %s\n", __FILE__, __LINE__, err_str); \
} \
}
#define cudaErrChk(ans) { cudaAssert((ans), __FILE__, __LINE__); }
inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
if (code != cudaSuccess)
{
fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
if (abort) exit(code);
}
}
// Generation of Morton code
__device__
uint64_t BitExpand(const uint64_t mask,
const int data);
// Morton kernel
__global__
void formParticleFullOffset(const int datanum,
const float ratio,
float** data3d,
uint64_t* output);
#endif
// testutil.cu
#include "testutil.cuh"
extern __device__ __constant__ uint64_t glomemmasks[3];
__device__
uint64_t BitExpand(const uint64_t mask,
const int data)
{
uint64_t rmask = __brevll(mask);
int dat = data;
uint64_t result = 0;
unsigned char lz, offset = __clzll(rmask);
while (rmask) {
lz = __clzll(rmask) + 1;
result = result << lz | (dat & 1);
dat >>= 1, rmask <<= lz;
}
result = __brevll(result) >> __clzll(mask);
return result;
}
__global__
void formParticleFullOffset(const int datanum,
const float ratio,
float** data3d,
uint64_t* output)
{
int parid = blockIdx.x * blockDim.x + threadIdx.x;
if (parid < datanum)
{
output[parid] |= BitExpand(glomemmasks[0], (int)((data3d[0][parid]) * ratio + 0.5f) - 1);
output[parid] |= BitExpand(glomemmasks[1], (int)((data3d[1][parid]) * ratio + 0.5f) - 1);
output[parid] |= BitExpand(glomemmasks[2], (int)((data3d[2][parid]) * ratio + 0.5f) - 1);
}
}
// test.cu
#include "testutil.cuh"
__device__ __constant__ uint64_t glomemmasks[3];
int main()
{
uint64_t hostglo[3] = {
(uint64_t) (0x9249249249249249UL<<3) & 0xfffffffffffff000UL | (3<<2),
(uint64_t) (0x2492492492492492UL<<3) & 0xfffffffffffff000UL | (3<<4),
(uint64_t) (0x4924924924924924UL<<3) & 0xfffffffffffff000UL | (3<<6)
};
cudaErrChk(cudaMemcpyToSymbol(glomemmasks, hostglo, 3*sizeof(uint64_t)));
const int datanumx = 100;
const int datanumy = 25;
const int datanumz = 80;
const int datanum = datanumx * datanumy * datanumz;
const float dataspace = 0.002f;
// 3D DATA: SOA
float* data3d_flatten;
float** data3d;
uint64_t* output;
cudaErrChk(cudaMallocManaged((void**)&data3d_flatten, sizeof(float)*3*datanum));
cudaErrChk(cudaMallocManaged((void**)&data3d, sizeof(float*)*3));
cudaErrChk(cudaMalloc((void**)&output, sizeof(uint64_t)*datanum));
float* hostptrtmp[3];
for(int i =0; i < 3; ++i)
hostptrtmp[i] = reinterpret_cast<float*>((uint64_t)data3d_flatten
+ uint64_t(i*datanum*sizeof(float)));
cudaErrChk(cudaMemcpy(data3d, hostptrtmp, 3*sizeof(float*), cudaMemcpyHostToDevice));
// In my Morton code, data is filled in the order of z y and x
for(int dimx = 0; dimx < datanumx; ++dimx)
{
for(int dimy = 0; dimy < datanumy; ++dimy)
{
for(int dimz = 0; dimz < datanumz; ++dimz)
{
int ptloc = dimx*datanumy*datanumz + dimy*datanumz + dimz;
data3d[0][ptloc] = (dimx+0.5f)*dataspace;
data3d[1][ptloc] = (dimy+0.5f)*dataspace;
data3d[2][ptloc] = (dimz+0.5f)*dataspace;
}
}
}
int threadnum = 256;
int blocknum = (datanum + 255)/threadnum;
formParticleFullOffset<<<blocknum, threadnum>>>(datanum, 1.f/dataspace, data3d, output);
cudaDeviceSynchronize();
CHECK_CUDA_ERROR()
cudaFree(data3d_flatten);
cudaFree(data3d);
cudaFree(output);
return 0;
}
```

**Test**:

Built by

```
nvcc -o test *.cu -rdc=true
```

gcc 9.4.0 + cuda 12.2 + sm75 (Turing)

**Results**

Compute throughput: 75 % Memory throughput 23 %

Warps available: 7.21 warps per scheduler, but only 2.26 are eligible.

L1 hit: 56% L2 hit: 54%

How come the performance is sub-optimal (the utilization of warp is less than 50%)?

Thank you for any comments!