Low utilization of warp per scheduler for bit pack operation (Morton Code)

Quick question

If three ints (32 bits) are expanded to a uint64 following some rules (complex bit operation) and packed staggered, how can the warp utilization be improved?


This MWE is based on a pixel render problem where a Morton code is introduced to map the pixel 3D coordinates.

Float coordinates of pixels are truncated into int ones (cell-wise). Then, cell coordinates will be fed into the Morton mask for a Morton coordinate.

The MASK of the Morton code is defined as,

(0x9249249249249249UL<<3) & 0xfffffffffffff000UL  | (3<<2) // x
(0x2492492492492492UL<<3) & 0xfffffffffffff000UL  | (3<<4) // y
(0x4924924924924924UL<<3) & 0xfffffffffffff000UL  | (3<<6) // z

Morton coordinate in each axis = 52 higher bits + 12 lower bits

High one repeats from right to left in the stride of 3:

X: … 100 100 (x )
Y: … 010 010 (y)
Z: … 001 001 (z)

The lower one is fixed as:
X: 0000 11(x) 00(y) 00(z) 00
Y: 0000 00(x) 11(y) 00(z) 00
Z: 0000 00(x) 00(y) 11(z) 00

For some reason, 6 bits are reserved for additional information not shown in this MWE.

Given pixel holding cell position (5, 3, 6) in decimal and (101, 011, 110 ) in binary.

Since 5 and 6 are greater than 3 (0b11),

Higher bits are set:

X Higher bits are: 49 bits + 100
Z Higher bits are: 49 bits + 001

Not for Y:
Y Higher bits are: 49 bits + 000

X Lower bits are: 0000 01(x) 00(y) 00(z) 00
Y Lower bits are: 0000 00(x) 11(y) 00(z) 00
Z Lower bits are: 0000 00(x) 00(y) 10(z) 00

Finally, all three Morton X Y and Z will be Packed as:

49*0 + 101 + 0000 + 01 11 10 00

Sorry for the tedious explanation.


// testutil.cuh
// Defined the Morton transformation for each axis and packing kernel

#ifndef __TEST_UTIL__
#define __TEST_UTIL__

#include <cstdio>
#include <cstdint>

// Check for (and report) any pending CUDA error at this point in the code.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// is safe inside un-braced if/else. Note: cudaGetLastError() also CLEARS the
// sticky error state. The original was missing its closing brace.
#define CHECK_CUDA_ERROR()                                            \
    do {                                                              \
        cudaError_t cucheck_err = cudaGetLastError();                 \
        if (cucheck_err != cudaSuccess) {                             \
            const char *err_str = cudaGetErrorString(cucheck_err);    \
            printf("%s (%d): %s\n", __FILE__, __LINE__, err_str);     \
        }                                                             \
    } while (0)

#define cudaErrChk(ans) { cudaAssert((ans), __FILE__, __LINE__); }

// Report a CUDA runtime error with the file/line of the failing call and
// optionally terminate. Pass the return value of any CUDA API call through
// cudaErrChk(). Braces restored: without them, `if (abort) exit(code);`
// would run unconditionally, exiting even on cudaSuccess.
inline void cudaAssert(cudaError_t code, const char *file, int line, bool abort = true)
{
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

// Generation of Morton code: scatter the low bits of `data` into the bit
// positions marked by set bits in `mask` (a software PDEP). Must be
// __device__ — the implementation uses the __brevll/__clzll intrinsics.
__device__ uint64_t BitExpand(const uint64_t mask,
                              const int data);

// Morton kernel: one thread per particle. Launched with <<<blocks, threads>>>
// from the host, so it must be declared __global__.
__global__ void formParticleFullOffset(const int datanum,
                                       const float ratio,
                                       float** data3d,
                                       uint64_t* output);


// testutil.cu
#include "testutil.cuh"

extern __device__ __constant__ uint64_t glomemmasks[3];

// Scatter the low bits of `data` into the bit positions selected by `mask`
// (software PDEP / bit-deposit). The mask is bit-reversed so that __clzll
// locates the NEXT (lowest) mask bit each iteration; one source bit is
// deposited per set mask bit, then the reversal is undone at the end.
// Precondition: mask != 0 (otherwise the final shift by __clzll(0) == 64
// would be undefined). Braces restored; unused local `offset` removed.
__device__ uint64_t BitExpand(const uint64_t mask,
                              const int data)
{
    uint64_t rmask = __brevll(mask);   // reversed mask: lowest mask bit first
    int dat = data;
    uint64_t result = 0;
    while (rmask) {
        unsigned char lz = __clzll(rmask) + 1;  // distance to next mask bit
        result = result << lz | (dat & 1);      // deposit one source bit
        dat >>= 1;
        rmask <<= lz;                           // consume that mask bit
    }
    result = __brevll(result) >> __clzll(mask); // undo the bit reversal
    return result;
}

// Morton kernel: one thread per particle. For each axis, the float
// coordinate is quantized back to its integer cell index — with
// coord == (cell + 0.5f) * dataspace and ratio == 1/dataspace, the
// expression (int)(coord * ratio + 0.5f) truncates to cell + 1, hence
// the -1 — then expanded through the per-axis Morton mask.
// The three per-axis contributions are combined in a register and written
// with a single read-modify-write instead of three (less global traffic).
// NOTE(review): output[] must be zero-initialized by the caller (e.g.
// cudaMemset) before launch, since the result is OR-accumulated.
__global__ void formParticleFullOffset(const int datanum,
                                       const float ratio,
                                       float** data3d,
                                       uint64_t* output)
{
    int parid = blockIdx.x * blockDim.x + threadIdx.x;
    if (parid < datanum) {
        uint64_t code = 0;
        #pragma unroll
        for (int axis = 0; axis < 3; ++axis) {
            int cell = (int)(data3d[axis][parid] * ratio + 0.5f) - 1;
            code |= BitExpand(glomemmasks[axis], cell);
        }
        output[parid] |= code;  // one RMW instead of three
    }
}

// test.cu
#include "testutil.cuh"

__device__ __constant__ uint64_t glomemmasks[3];

int main()
{
    // Host copies of the three per-axis Morton masks (x, y, z).
    // Precedence note: '&' binds tighter than '|', so each entry is
    // (high stride-3 pattern, top 52 bits) | (2-bit pair in the low 12 bits).
    uint64_t hostglo[3] = {
        (uint64_t)(0x9249249249249249UL << 3) & 0xfffffffffffff000UL | (3 << 2),  // x
        (uint64_t)(0x2492492492492492UL << 3) & 0xfffffffffffff000UL | (3 << 4),  // y
        (uint64_t)(0x4924924924924924UL << 3) & 0xfffffffffffff000UL | (3 << 6)   // z
    };

    cudaErrChk(cudaMemcpyToSymbol(glomemmasks, hostglo, 3 * sizeof(uint64_t)));

    const int datanumx = 100;
    const int datanumy = 25;
    const int datanumz = 80;

    const int datanum = datanumx * datanumy * datanumz;
    const float dataspace = 0.002f;

    // 3D DATA: SOA — one contiguous buffer holding the X plane, then Y, then Z.
    float* data3d_flatten;
    float** data3d;

    uint64_t* output;

    cudaErrChk(cudaMallocManaged((void**)&data3d_flatten, sizeof(float) * 3 * datanum));
    cudaErrChk(cudaMallocManaged((void**)&data3d,         sizeof(float*) * 3));
    cudaErrChk(cudaMalloc((void**)&output,                sizeof(uint64_t) * datanum));

    // The kernel ORs into output[], so it must start out all-zero;
    // cudaMalloc does NOT initialize memory.
    cudaErrChk(cudaMemset(output, 0, sizeof(uint64_t) * datanum));

    // Point each row of data3d at its plane inside the flat buffer
    // (plain pointer arithmetic; no need for integer casts).
    float* hostptrtmp[3];
    for (int i = 0; i < 3; ++i)
        hostptrtmp[i] = data3d_flatten + (size_t)i * datanum;

    cudaErrChk(cudaMemcpy(data3d, hostptrtmp, 3 * sizeof(float*), cudaMemcpyHostToDevice));

    // Fill cell-center coordinates; data is laid out with z innermost.
    for (int dimx = 0; dimx < datanumx; ++dimx)
        for (int dimy = 0; dimy < datanumy; ++dimy)
            for (int dimz = 0; dimz < datanumz; ++dimz) {
                int ptloc = dimx * datanumy * datanumz + dimy * datanumz + dimz;
                data3d[0][ptloc] = (dimx + 0.5f) * dataspace;
                data3d[1][ptloc] = (dimy + 0.5f) * dataspace;
                data3d[2][ptloc] = (dimz + 0.5f) * dataspace;
            }

    int threadnum = 256;
    // Ceil-division tied to threadnum (was a hard-coded 255).
    int blocknum = (datanum + threadnum - 1) / threadnum;

    formParticleFullOffset<<<blocknum, threadnum>>>(datanum, 1.f / dataspace, data3d, output);
    cudaErrChk(cudaGetLastError());       // catch launch-configuration errors
    cudaErrChk(cudaDeviceSynchronize());  // surface asynchronous kernel errors

    cudaErrChk(cudaFree(output));
    cudaErrChk(cudaFree(data3d));
    cudaErrChk(cudaFree(data3d_flatten));

    return 0;
}


Built by

nvcc -o test *.cu -rdc=true

gcc 9.4.0 + cuda 12.2 + sm75 (Turing)


Compute throughput: 75 % Memory throughput 23 %

Warps available: 7.21 warps per scheduler, but only 2.26 are eligible.

L1 hit: 56% L2 hit: 54%

How come the performance is sub-optimal (the utilization of warp is less than 50%)?

Thank you for any comments!

Help me understand: You have full access to the code, can execute it on whatever GPU you are running this on, and can profile the code to identify bottlenecks. You now ask random strangers on the internet to explain the behavior of code that is only provided in part, code that they cannot run and cannot profile, and without knowledge of the hardware it is running on. You expect said strangers to explain why this code is not running as fast as you expect / hope.

Generating more insight with less information is usually a recipe for speculation rather than insightful analysis. I would suggest posting a minimal complete reproducer that third parties can cut&paste, compile, run, and profile. Specify how the code was compiled (nvcc command line) and what hardware this is running on.

Some general observations:

(1) GPU are 32-bit architectures with 64-bit addressing capabilities. All 64-bit integer operations are therefore emulated. Can you express this expansion process using 32-bit operations? What is this used for (context)?

(2) Memory access here is somewhat inefficient if one assumes that each thread accesses exactly three data items data3d[][]. Is this actually a jagged array, meaning the row vectors can have different length? If not, can you convert data3d into a true 2D array using contiguous storage? With that replacement there would be a single data access to get the array’s base pointer from .param space (constant memory), followed by index arithmetic, then three data accesses to retrieve the three ints. This would reduce memory traffic (one pointer fetch versus four pointer fetches).

(3) The expansion process looks … convoluted. I am wondering whether there is a more efficient way to do this. Is there a full specification that describes it? Why are the bit-reversals necessary? It looks like some sort of PDEP (parallel bit deposit) emulation? Maybe take a look here:

Actually, from the example provided in the question this looks more like a 3D Morton code than generalized PDEP functionality, which should make bit-extraction and deposit patterns entirely regular, making me wonder why clz is invoked inside the loop. Is the following the correct specification?

Given three 32-bit integers x, y, z, we are to generate an interleaved 64-bit result r such that for i in [0, 20] bit
r3i+0 = xi
r3i+1 = yi
r3i+2 = zi

Obviously if we expect the x, y, z to be mostly small in magnitude (as in the example), we can limit the range of i based on clz(x | y | z).

If the desired functionality is indeed a 3D Morton code, maybe take a look at this question (there seem to be some good answers there, but I don’t have the time right now to identify the best one; most of them look like variants of the same basic algorithm):

Thank you for your post.

You are right, it is a modified version of 3D Morton code.

The complete example requires a non-trivial Morton implementation, so I chose the BitExpand function in my question.

As for the data3d, I have updated the original post and provided the data3d structure.

What behind the data3d is a 1d array with contiguous distribution of X, Y and Z coordinates.

However I am not sure if it is a reasonable way to map the 1D data to 2D using a manual calculation of the offset.

Profiling results show that the compute pressure is much more than the memory feeding.

Is it possible that data transfer limits the overall performance?

Thank you for your time!

A 52-bit 3D Morton code will be computationally expensive to compute. But the bit extract and deposit patterns are entirely regular, no need to write loops, or reverse bits, or count leading zeros. There are many different approaches to computing a Morton code. I would suggest using 32-bit operations throughout. One could interleave bits <9:0> of the source integers separately from bits <19:10> for example, then perform a merge of the final result using 64-bit arithmetic.

It’s 2am here, and I am too tired to follow the specification of this particular Morton code. I would think that the masks shown are an implementation artifact, not part of the specification. It is usually best to specify bit-wise transformations by showing the equations for each bit:

r0 = ?
r1 = ?
r2 = ?

r51 = ?

That way it is completely unambiguous what source bits contribute to which destination bits. I assume the 52 bits are not arbitrarily chosen, but were picked because that's the number of stored significand bits in an IEEE-754 double?


Following your suggestions, maybe I can try an int32-bit shift:

Given an int32 (NUM),

My Morton pattern will separate the lower two bits (L) and the others (H),

L = NUM & 0x3
H = NUM >> 2

L will do as,

newL = L << (1 << 2 or 4 or 6), 2 4 6 for z, y and x

H will map to newH as,

for i in (0, 32)
    newH[i*3] = H[i] 

Then, newH does a left-shift (0+12, 1+12, 2+12 for z, y and x)

Results = newH | newL

There is no need for any bitwise intrinsic, and 64-bit operation is involved only in the last step.

I hope the code will be faster.

PS The BitExpand is developed as a factory and will be integrated into other colleagues’ code.

Anyway, I will remove it in this code.

To illustrate what I had in mind, here is some code I quickly hacked together on an x86 platform, where it works well. The basic building block is a function that spreads the least significant 10 bits of an integer a into a result r as follows:

for i in [0,9]:
r3i+0 = ai
r3i+1 = 0
r3i+2 = 0

A reference function would look like this:

/* Reference implementation: spread the ten least significant bits of a so
   that input bit i lands at result bit 3*i; all other result bits are 0. */
unsigned int spread_ten_bits_by_3_ref (unsigned int a)
{
    unsigned int r = 0;
    for (int i = 0; i < 10; i++) {
        r |= ((a >> i) & 1) << (3 * i);
    }
    return r;
}

A version optimized for x86 (11 instructions, 2-way instruction level parallelism) might look like this:

/* Spread the ten LSBs of a so bit i lands at bit 3*i (x86-friendly form:
   11 instructions, 2-way ILP). The multiplies replicate the masked bit
   groups toward their target positions; each result mask then keeps
   exactly one copy of every source bit. */
unsigned int spread_ten_bits_by_3 (unsigned int a)
{
    unsigned int lo_pairs = a & 0x333;  /* source bits 0,1,4,5,8,9 */
    unsigned int hi_pairs = a & 0x0cc;  /* source bits 2,3,6,7 */
    unsigned int part1 = ((lo_pairs * 0x50005) | (lo_pairs * 0x00500)) & 0x09009009;
    unsigned int part2 = (hi_pairs * 0x05050) & 0x00240240;
    return part1 | part2;
}


We can use this as a building block for a 30-bit 3D Morton code, based on the ten least significant bits of each input (with other bits assumed 0):

/* 30-bit 3D Morton code from the ten LSBs of each coordinate:
   r<3*i+0> = x<i>, r<3*i+1> = y<i>, r<3*i+2> = z<i>. */
unsigned int morton3d_ten_bits (unsigned int x, unsigned int y, unsigned int z)
{
    unsigned int xs = spread_ten_bits_by_3 (x);
    unsigned int ys = spread_ten_bits_by_3 (y);
    unsigned int zs = spread_ten_bits_by_3 (z);
    return xs | (ys << 1) | (zs << 2);
}

We can also produce a 60-bit 3D Morton code based on the 20 least significant bits of each input. If we expect many small inputs, we can provide an early-out:

/* 60-bit 3D Morton code from the 20 LSBs of each input. Early-out skips the
   high half when all inputs fit in ten bits (common for small coordinates).
   Fixes vs. posted snippet: `clz` was not a declared function — spelled here
   as gcc's __builtin_clz (the thread's stated toolchain) — and
   __builtin_clz(0) is undefined, so the all-zero case is handled explicitly.
   The missing closing brace of the if-block is also restored. */
unsigned long long int morton3d_twenty_bits (unsigned int x,
                                             unsigned int y,
                                             unsigned int z)
{
    unsigned int lo, hi = 0;
    unsigned int any = x | y | z;
    /* __builtin_clz(0) is undefined behavior; zero input needs zero bits */
    int bits_needed = any ? (32 - __builtin_clz (any)) : 0;
    lo = morton3d_ten_bits (x & 0x3ff, y & 0x3ff, z & 0x3ff);
    if (bits_needed > 10) {
        hi = morton3d_ten_bits ((x >> 10) & 0x3ff, (y >> 10) & 0x3ff,
                                (z >> 10) & 0x3ff);
    }
    return ((unsigned long long int)hi << 30) | lo;
}

This gives us some idea how many instructions we might expect for a 60-bit Morton code: around a hundred depending on processor architecture. This explains why you found that your code is mostly bound by compute throughput. The optimal decomposition of a 3D Morton code into building blocks likely depends on the GPU architecture. There are trade-offs between the integer operations used, potential use of small tables, etc.

Your advice is so helpful. Thank you!

Since a 30-bit 3D Morton index seems to be of potentially wider utility, I cleaned up the code I posted previously, added code for a de-interleaving building block and Morton index decoding, and added a test framework with an exhaustive test.

This code, with decorations added for device code as needed, should provide good performance on all GPU architectures with a full 32-bit integer multiplier.

  Copyright 2023, Norbert Juffa

  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions
  are met:

  1. Redistributions of source code must retain the above copyright
     notice, this list of conditions and the following disclaimer.

  2. Redistributions in binary form must reproduce the above copyright
     notice, this list of conditions and the following disclaimer in the
     documentation and/or other materials provided with the distribution.


#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Spread least significant ten bits of argument x in [0, 1023] to every third
   bit of result. For i in [0, 9]: r<3*i+0> = a<i>, r<3*i+1> = 0, r<3*i+2> = 0. */
uint32_t deposit_every_third (uint32_t x)
{
    /* bits 0,1,4,5,8,9: two multiplies fan the 2-bit groups out toward
       their target slots; the mask retains one copy of each bit */
    uint32_t pairs = x & 0x333u;
    uint32_t spread_a = ((pairs * 0x50005u) | (pairs * 0x00500u)) & 0x09009009u;
    /* bits 2,3,6,7: a single multiply suffices */
    uint32_t spread_b = ((x & 0x0ccu) * 0x05050u) & 0x00240240u;
    return spread_a | spread_b;
}

/* For argument x in which only bits x<3*i> are utilized and the remaining
   bits must be zero, right-compact the used bits into a 10-bit integer, i.e.
   r<i> = a<3*i>. */
uint32_t gather_every_third (uint32_t x)
{
    const uint32_t group_mult = 0x15;       /* (1<<4)|(1<<2)|(1<<0) */
    const uint32_t group_mask = 0x1c0e070;  /* (7<<22)|(7<<13)|(7<<4) */
    const uint32_t concat_mul = 0x4104;     /* (1<<14)|(1<<8)|(1<<2) */
    const uint32_t concat_msk = 0x1ffu << 18;
    const uint32_t bit27_mask = 1u << 27;

    /* pull triples of used bits together into 3-bit groups at <6:4>,
       <15:13>, <24:22> ... */
    uint32_t grouped = (x * group_mult) & group_mask;
    /* ... concatenate those three groups into bits <26:18> ... */
    uint32_t packed = (grouped * concat_mul) & concat_msk;
    /* ... merge in the ungrouped top bit <27> and shift down to <9:0> */
    return (packed | (x & bit27_mask)) >> 18;
}

/* For x, y, z in [0, 1023], compute 30-bit 3D Morton code by interleaving bits.
   For i in [0, 9]: r<3*i+0> = x<i>, r<3*i+1> = y<i>, r<3*i+2> = z<i>. */
uint32_t encode_morton3d_10_bits (uint32_t x, uint32_t y, uint32_t z)
{
    uint32_t xs = deposit_every_third (x);
    uint32_t ys = deposit_every_third (y);
    uint32_t zs = deposit_every_third (z);
    return xs | (ys << 1) | (zs << 2);
}

/* De-interleave a 30-bit 3D Morton code into units x, y, z each in [0,1023].
   For i in [0, 9]: x<i> = a<3*i+0>, y<i> = a<3*i+1>, z<i> = a<3*i+2>. */
void decode_morton3d_10_bits (uint32_t a, uint32_t *x, uint32_t *y, uint32_t *z)
{
    const uint32_t every_third = 0x9249249u; /* bits 0,3,6,...,27 set */
    *x = gather_every_third (a & every_third);
    *y = gather_every_third ((a >> 1) & every_third);
    *z = gather_every_third ((a >> 2) & every_third);
}

/* Exhaustive round-trip test: encode then decode every one of the 2^30
   (x, y, z) triples and verify the original coordinates come back. */
int main (void)
{
    for (uint32_t x = 0; x < 1024; x++) {
        for (uint32_t y = 0; y < 1024; y++) {
            for (uint32_t z = 0; z < 1024; z++) {
                uint32_t encoded = encode_morton3d_10_bits (x, y, z);
                uint32_t dx, dy, dz;
                decode_morton3d_10_bits (encoded, &dx, &dy, &dz);
                if ((x != dx) | (y != dy) | (z != dz)) {
                    printf ("error @ %03lx,%03lx,%03lx: encoded=%08lx decoded=%03lx,%03lx,%03lx\n",
                            (unsigned long int)x, (unsigned long int)y, (unsigned long int)z,
                            (unsigned long int)encoded,
                            (unsigned long int)dx, (unsigned long int)dy, (unsigned long int)dz);
                    return EXIT_FAILURE;
                }
            }
        }
    }
    return EXIT_SUCCESS;
}
1 Like

I checked the code generated for deposit_every_third() and gather_every_third() when building for sm_75, and there are no surprises. Thanks to support for one immediate operand in integer instructions, and thanks to LOP3 (three-input logic op), the code is nice and tight. And thanks to full-speed 32-bit integer multiplies for compute capability >= 7.0 it is zippy, too. Compared to x86-64 with BMI2, which provides dedicated PDEP and PEXT instructions, the GPU is at a bit of a disadvantage here.

deposit_every_third()                            // x in R4
   LOP3.LUT R0, R4.reuse, 0x333, RZ, 0xc0, !PT ; // x & sm1
   LOP3.LUT R4, R4, 0xcc, RZ, 0xc0, !PT ;        // x & sm2
   IMAD R2, R0.reuse, 0x50005, RZ ;              // s = t * ml1
   IMAD R3, R0, 0x500, RZ ;                      // t = t * ml2
   IMAD R5, R4, 0x5050, RZ ;                     // u = (x & sm2) * ml3
   LOP3.LUT R2, R2, 0x9009009, R3, 0xc8, !PT ;   // v = ((s | t) & rm1)
   LOP3.LUT R2, R2, 0x240240, R5, 0xf8, !PT ;    // (v | (u & rm2))

gather_every_third()                             // x in R3
   IMAD R0, R3, 0x15, RZ ;                       // t = x * group_mult;
   LOP3.LUT R0, R0, 0x1c0e070, RZ, 0xc0, !PT ;   // t = t & group_mask;
   IMAD R0, R0, 0x4104, RZ ;                     // t = t * concat_mul;
   LOP3.LUT R0, R0, 0x7fc0000, RZ, 0xc0, !PT ;   // t = t & concat_msk;
   LOP3.LUT R0, R0, 0x8000000, R3, 0xf8, !PT ;   // t = t | (x & bit27_mask);
   SHF.R.U32.HI R0, RZ, 0x12, R0 ;               // t = t >> 18

This topic was automatically closed 14 days after the last reply. New replies are no longer allowed.