cub sort error

Hi,

I get the following error when running cub sort on two GPUs: cudaErrorIllegalAddress. The logic is quite simple: 1. Generate a random array of numbers between 0 and 255. 2. Modify it a little and sort it on the two GPUs. 3. Merge the results.

Could anyone please take a look? Thanks.

Here is the code:

__global__ void gpu_modify(uint32_t * d_keys_in, int N, uint32_t d)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
    {
        d_keys_in[i] = (d_keys_in[i] + d) & 255;
    }
}

void modify(uint32_t * d_keys_in, int N, uint32_t d)
{
    int threads = 1024;
    int blocks = (N + threads - 1) / threads;
    gpu_modify<<<blocks, threads>>>(d_keys_in, N, d);
}

class Partition
{
public:
    int N;
    const int m_deviceId;

    uint32_t * d_keys_in = NULL;
    uint32_t * d_vals_in = NULL;
    uint32_t * d_keys_out = NULL;
    uint32_t * d_vals_out = NULL;
    void * d_temp = NULL;
    size_t temp_size = 0;
    
    Partition(int deviceId, int N, uint32_t * keys, uint32_t * vals) : m_deviceId(deviceId), N(N)
    {
        checkCudaErrors(cudaSetDevice(deviceId));
        checkCudaErrors(cudaMalloc((void **)&d_keys_in, N * sizeof(uint32_t)));
        checkCudaErrors(cudaMalloc((void **)&d_vals_in, N * sizeof(uint32_t)));
        checkCudaErrors(cudaMalloc((void **)&d_keys_out, N * sizeof(uint32_t)));
        checkCudaErrors(cudaMalloc((void **)&d_vals_out, N * sizeof(uint32_t)));

        checkCudaErrors(cudaMemcpy(d_keys_in, keys, N * sizeof(uint32_t), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_vals_in, vals, N * sizeof(uint32_t), cudaMemcpyHostToDevice));

        cub::DeviceRadixSort::SortPairs(d_temp, temp_size, d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaMalloc(&d_temp, temp_size));
    }

    void Sort(int i)
    {
        checkCudaErrors(cudaSetDevice(m_deviceId));
        modify(d_keys_in, N, i + m_deviceId);
        checkCudaErrors(cudaGetLastError());
        cub::DeviceRadixSort::SortPairs(d_temp, temp_size, d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);
        checkCudaErrors(cudaGetLastError());
    }

    void Release()
    {
        checkCudaErrors(cudaSetDevice(m_deviceId));
        if (d_keys_in) checkCudaErrors(cudaFree(d_keys_in));
        if (d_vals_in) checkCudaErrors(cudaFree(d_vals_in));
        if (d_keys_out) checkCudaErrors(cudaFree(d_keys_out));
        if (d_vals_out) checkCudaErrors(cudaFree(d_vals_out));
        if (d_temp) checkCudaErrors(cudaFree(d_temp));
    }
};

int main()
{
    const int N = 2 * 1000 * 1000;

    srand(time(NULL));

    uint32_t * keys_in = NULL, * vals_in = NULL, * keys_out = NULL, * vals_out = NULL;
    
    keys_in = new uint32_t[N];
    vals_in = new uint32_t[N];
    keys_out = new uint32_t[N];
    vals_out = new uint32_t[N];

    for (uint32_t i = 0; i < N; i++)
    {
        keys_in[i] = rand() & 255;
        vals_in[i] = i;
    }

    checkCudaErrors(cudaSetDevice(0));

    GPUStopWatch timer;

    vector<Partition> ps;

    for (int i = 0; i < 2; i++)
    {
        ps.push_back({ i, N, keys_in, vals_in });
    }

    thread * threads = new thread[2];

    for (uint32_t i = 0; i < 100000; i++)
    {
        checkCudaErrors(cudaSetDevice(0));
        timer.Reset();

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
            auto f = [&]() {
                ps[threadId].Sort(i);
            };

            threads[threadId] = thread(f);
        }

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
            threads[threadId].join();
        }

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
            checkCudaErrors(cudaSetDevice(ps[threadId].m_deviceId));
            checkCudaErrors(cudaMemcpy(keys_out + 2 * threadId, ps[threadId].d_keys_out, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
            checkCudaErrors(cudaMemcpy(vals_out + 2 * threadId, ps[threadId].d_vals_out, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
        }

        checkCudaErrors(cudaSetDevice(0));
        timer.Stop();
        
        printf("[%d] %d %d %d %d %d %d %d %d %f\n", i, keys_out[0], keys_out[1], vals_out[0], vals_out[1], keys_out[2], keys_out[3], vals_out[2], vals_out[3], timer.GetTimeElapsed());
    }

    for (int i = 0; i < 2; i++)
    {
        ps[i].Release();
    }

    if (keys_in) delete[] keys_in;
    if (vals_in) delete[] vals_in;
    if (keys_out) delete[] keys_out;
    if (vals_out) delete[] vals_out;
    if (threads) delete[] threads;

    checkCudaErrors(cudaDeviceReset());

    return 0;
}

Has anyone used cub sort on multiple GPUs?

I don’t think cub should be in any way affected by its use on multiple GPUs.

After I figured out via trial and error what header files you had chopped off, and removed the reference to GPUStopWatch, I was able to compile and run your code. When I run your code with cuda-memcheck, I get errors like this:

========= Invalid __global__ read of size 4
=========     at 0x00000058 in gpu_modify(unsigned int*, int, unsigned int)
=========     by thread (0,0,0) in block (0,0,0)
=========     Address 0x00000041 is misaligned
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x204505]
=========     Host Frame:./t1348 [0x2f171]
=========     Host Frame:./t1348 [0x4cd13]
=========     Host Frame:./t1348 [0x54c7]
=========     Host Frame:./t1348 [0x47a3]
=========     Host Frame:./t1348 [0x47cb]
=========     Host Frame:./t1348 [0x40ee]
=========     Host Frame:./t1348 [0x6901]
=========     Host Frame:./t1348 [0x4130]
=========     Host Frame:./t1348 [0x6312]
=========     Host Frame:./t1348 [0x6269]
=========     Host Frame:./t1348 [0x6202]
=========     Host Frame:/lib64/libstdc++.so.6 [0xb5220]
=========     Host Frame:/lib64/libpthread.so.0 [0x7dc5]
=========     Host Frame:/lib64/libc.so.6 (clone + 0x6d) [0xf6ced]
=========
CUDA error at t1348.cu:60 code=4(cudaErrorLaunchFailure) "cudaGetLastError()"

That error points at code you wrote (the gpu_modify kernel); it has nothing to do with CUB. It looks like a CUB issue to you only because your error checking after the modify function/kernel is not forcing the kernel to complete. If you add error checking on a cudaDeviceSynchronize() call at that point, you will find that the error is attributed to the modify function/kernel rather than to CUB.

Having said all that, the misaligned-address error is somewhat unusual, and nothing about your code immediately suggests that the CUDA pointers should be misaligned. In addition, if I run your code without cuda-memcheck, I get a different error: a report of an invalid configuration on the modify kernel launch. That is different from what I saw with cuda-memcheck, and different from what you report.
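For reference, this is roughly what I mean: a sketch of your Sort() with the extra synchronization added for debugging (you can remove it again once the problem is found, since it blocks the host thread):

    void Sort(int i)
    {
        checkCudaErrors(cudaSetDevice(m_deviceId));
        modify(d_keys_in, N, i + m_deviceId);
        checkCudaErrors(cudaGetLastError());          // catches launch/configuration errors
        checkCudaErrors(cudaDeviceSynchronize());     // forces the kernel to finish, so any asynchronous
                                                      // error is reported here instead of by the CUB call
        cub::DeviceRadixSort::SortPairs(d_temp, temp_size, d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);
        checkCudaErrors(cudaGetLastError());
    }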

After some more poking, I've come to the conclusion that your threading is broken. When I add a printf statement in modify to print out N, I get values of 0 for both threads.

After a bit more poking, I observed that if I change your lambda capture spec from this:

auto f = [&]() {

to this:

auto f = [&ps, threadId, i]() {

things started working without any runtime errors. See below for a description of why I think this is.

(By the way, you don't really need two separate host threads to launch concurrent kernels on two devices.)
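Something like this sketch would drive both devices from a single host thread (Sort() already calls cudaSetDevice() internally, and the kernel launch and, by default, the CUB call return to the host without waiting, so the two devices still overlap):

    for (size_t p = 0; p < 2; p++)
    {
        ps[p].Sort(i);                                // enqueues work on device p, returns to the host
    }
    for (size_t p = 0; p < 2; p++)
    {
        checkCudaErrors(cudaSetDevice(ps[p].m_deviceId));
        checkCudaErrors(cudaDeviceSynchronize());     // wait for device p to finish before copying results
    }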

Here’s a version of your code with all CUDA elements stripped out. It demonstrates that your lambda capture mechanism is not working the way you expect:

$ cat t1348.cpp
#include <stdio.h>
#include <vector>
#include <thread>

using namespace std;

void modify(uint32_t * d_keys_in, int N, uint32_t d)
{
    int threads = 1024;
    int blocks = (N + threads - 1) / threads;
}

class Partition
{
public:
    int N;
    const int m_deviceId;

    uint32_t * d_keys_in = NULL;
    uint32_t * d_vals_in = NULL;
    uint32_t * d_keys_out = NULL;
    uint32_t * d_vals_out = NULL;
    void * d_temp = NULL;
    size_t temp_size = 0;

    Partition(int deviceId, int N, uint32_t * keys, uint32_t * vals) : m_deviceId(deviceId), N(N)
    {
    }

    void Sort(int i)
    {
        printf("N = %d, device_id = %d\n", N, m_deviceId);
        modify(d_keys_in, N, i + m_deviceId);
    }

    void Release()
    {
    }
};

int main()
{
    const int N = 2 * 1000 * 1000;

    srand(time(NULL));

    uint32_t * keys_in = NULL, * vals_in = NULL, * keys_out = NULL, * vals_out = NULL;

    keys_in = new uint32_t[N];
    vals_in = new uint32_t[N];
    keys_out = new uint32_t[N];
    vals_out = new uint32_t[N];

    for (uint32_t i = 0; i < N; i++)
    {
        keys_in[i] = rand() & 255;
        vals_in[i] = i;
    }

    vector<Partition> ps;

    for (int i = 0; i < 2; i++)
    {
        ps.push_back({ i, N, keys_in, vals_in });
    }
    thread * threads = new thread[2];

    for (uint32_t i = 0; i < 1; i++)
    {

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
#ifndef FIX
            auto f = [&]() {
#else
            auto f = [&ps, threadId, i]() {

#endif
                ps[threadId].Sort(i);
            };

            threads[threadId] = thread(f);
        }

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
            threads[threadId].join();
        }

    }

    for (int i = 0; i < 2; i++)
    {
        ps[i].Release();
    }

    if (keys_in) delete[] keys_in;
    if (vals_in) delete[] vals_in;
    if (keys_out) delete[] keys_out;
    if (vals_out) delete[] vals_out;
    if (threads) delete[] threads;

    return 0;
}
$ g++ t1348.cpp -std=c++11 -o t1348cpu -pthread
$ ./t1348cpu
N = 0, device_id = 0
N = 0, device_id = 0
$ g++ t1348.cpp -std=c++11 -o t1348cpu -pthread -DFIX
$ ./t1348cpu
N = 2000000, device_id = 0
N = 2000000, device_id = 1
$

(g++ 4.8.5)

The takeaway according to my testing is that this has nothing to do with CUB or CUDA, but with a broken threading methodology. The theory I have to explain this is roughly described here:

https://stackoverflow.com/questions/36325039/starting-c11-thread-with-a-lambda-capturing-local-variable

When you capture variables by reference in a lambda and use that lambda as a thread function, the resolution of those captured-by-reference variables does not actually occur until the thread starts executing, asynchronously, some time later. Since threadId is your loop variable, capturing it by reference is clearly a bad idea: the loop iterates twice (at least) before the threads actually begin executing, so the threadId variable may get updated by the loop iteration before the threads consume it. According to my testing, it is sufficient to do this:

auto f = [&, threadId]() {

to work around this problem, which I believe is consistent with my description. For additional supporting evidence, we can modify our test case slightly to have Sort() accept an additional integer parameter and print it out. We will pass threadId to it:

$ cat t1348.cpp
#include <stdio.h>
#include <vector>
#include <thread>

using namespace std;

void modify(uint32_t * d_keys_in, int N, uint32_t d)
{
    int threads = 1024;
    int blocks = (N + threads - 1) / threads;
}

class Partition
{
public:
    int N;
    const int m_deviceId;

    uint32_t * d_keys_in = NULL;
    uint32_t * d_vals_in = NULL;
    uint32_t * d_keys_out = NULL;
    uint32_t * d_vals_out = NULL;
    void * d_temp = NULL;
    size_t temp_size = 0;

    Partition(int deviceId, int N, uint32_t * keys, uint32_t * vals) : m_deviceId(deviceId), N(N)
    {
    }

    void Sort(int i, int tID)
    {
        printf("N = %d, device_id = %d, tID = %d\n", N, m_deviceId, tID);
        modify(d_keys_in, N, i + m_deviceId);
    }

    void Release()
    {
    }
};

int main()
{
    const int N = 2 * 1000 * 1000;

    srand(time(NULL));

    uint32_t * keys_in = NULL, * vals_in = NULL, * keys_out = NULL, * vals_out = NULL;

    keys_in = new uint32_t[N];
    vals_in = new uint32_t[N];
    keys_out = new uint32_t[N];
    vals_out = new uint32_t[N];

    for (uint32_t i = 0; i < N; i++)
    {
        keys_in[i] = rand() & 255;
        vals_in[i] = i;
    }

    vector<Partition> ps;

    for (int i = 0; i < 2; i++)
    {
        ps.push_back({ i, N, keys_in, vals_in });
    }
    thread * threads = new thread[2];

    for (uint32_t i = 0; i < 1; i++)
    {

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
#ifndef FIX
            auto f = [&]() {
#else
            auto f = [&, threadId]() {

#endif
                ps[threadId].Sort(i,threadId);
            };

            threads[threadId] = thread(f);
        }

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
            threads[threadId].join();
        }

    }

    for (int i = 0; i < 2; i++)
    {
        ps[i].Release();
    }

    if (keys_in) delete[] keys_in;
    if (vals_in) delete[] vals_in;
    if (keys_out) delete[] keys_out;
    if (vals_out) delete[] vals_out;
    if (threads) delete[] threads;

    return 0;
}
$ g++ t1348.cpp -std=c++11 -o t1348cpu -pthread
$ ./t1348cpu
N = 0, device_id = 0, tID = 2
N = 0, device_id = 0, tID = 2
$ g++ t1348.cpp -std=c++11 -o t1348cpu -pthread -DFIX
$ ./t1348cpu
N = 2000000, device_id = 1, tID = 1
N = 2000000, device_id = 0, tID = 0
$

We now see that in the failing case the passed threadId value has the illegal index of 2 for both threads (the loop has already updated the loop variable twice by the time either thread actually starts executing), whereas in the passing case, with threadId captured by value, we get the expected results.

As a final test, when I take a suitably modified version of your original code:

  • add necessary headers and preamble
  • remove references to GPUStopWatch
  • reduce main loop count to a sane number
  • modify the lambda to capture threadId by value

your code runs without any runtime or CUDA API error for me:

$ cat t1348.cu
#include <cub/cub.cuh>
#include <stdio.h>
#include <helper_cuda.h>
#include <vector>
#include <thread>

using namespace std;
__global__ void gpu_modify(uint32_t * d_keys_in, int N, uint32_t d)
{
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    if (i < N)
    {
        d_keys_in[i] = (d_keys_in[i] + d) & 255;
    }
}

void modify(uint32_t * d_keys_in, int N, uint32_t d)
{
    int threads = 1024;
    int blocks = (N + threads - 1) / threads;
    gpu_modify<<<blocks, threads>>>(d_keys_in, N, d);
}

class Partition
{
public:
    int N;
    const int m_deviceId;

    uint32_t * d_keys_in = NULL;
    uint32_t * d_vals_in = NULL;
    uint32_t * d_keys_out = NULL;
    uint32_t * d_vals_out = NULL;
    void * d_temp = NULL;
    size_t temp_size = 0;

    Partition(int deviceId, int N, uint32_t * keys, uint32_t * vals) : m_deviceId(deviceId), N(N)
    {
        checkCudaErrors(cudaSetDevice(deviceId));
        checkCudaErrors(cudaMalloc((void **)&d_keys_in, N * sizeof(uint32_t)));
        checkCudaErrors(cudaMalloc((void **)&d_vals_in, N * sizeof(uint32_t)));
        checkCudaErrors(cudaMalloc((void **)&d_keys_out, N * sizeof(uint32_t)));
        checkCudaErrors(cudaMalloc((void **)&d_vals_out, N * sizeof(uint32_t)));

        checkCudaErrors(cudaMemcpy(d_keys_in, keys, N * sizeof(uint32_t), cudaMemcpyHostToDevice));
        checkCudaErrors(cudaMemcpy(d_vals_in, vals, N * sizeof(uint32_t), cudaMemcpyHostToDevice));

        cub::DeviceRadixSort::SortPairs(d_temp, temp_size, d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);
        checkCudaErrors(cudaGetLastError());
        checkCudaErrors(cudaMalloc(&d_temp, temp_size));
    }

    void Sort(int i)
    {
        checkCudaErrors(cudaSetDevice(m_deviceId));
        modify(d_keys_in, N, i + m_deviceId);
        checkCudaErrors(cudaGetLastError());
        cub::DeviceRadixSort::SortPairs(d_temp, temp_size, d_keys_in, d_keys_out, d_vals_in, d_vals_out, N);
        checkCudaErrors(cudaGetLastError());
    }

    void Release()
    {
        checkCudaErrors(cudaSetDevice(m_deviceId));
        if (d_keys_in) checkCudaErrors(cudaFree(d_keys_in));
        if (d_vals_in) checkCudaErrors(cudaFree(d_vals_in));
        if (d_keys_out) checkCudaErrors(cudaFree(d_keys_out));
        if (d_vals_out) checkCudaErrors(cudaFree(d_vals_out));
        if (d_temp) checkCudaErrors(cudaFree(d_temp));
    }
};

int main()
{
    const int N = 2 * 1000 * 1000;

    srand(time(NULL));

    uint32_t * keys_in = NULL, * vals_in = NULL, * keys_out = NULL, * vals_out = NULL;

    keys_in = new uint32_t[N];
    vals_in = new uint32_t[N];
    keys_out = new uint32_t[N];
    vals_out = new uint32_t[N];

    for (uint32_t i = 0; i < N; i++)
    {
        keys_in[i] = rand() & 255;
        vals_in[i] = i;
    }

    checkCudaErrors(cudaSetDevice(0));

    vector<Partition> ps;

    for (int i = 0; i < 2; i++)
    {
        ps.push_back({ i, N, keys_in, vals_in });
    }

    thread * threads = new thread[2];

    for (uint32_t i = 0; i < 10; i++)
    {
        checkCudaErrors(cudaSetDevice(0));

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
            auto f = [&, threadId]() {
                ps[threadId].Sort(i);
            };

            threads[threadId] = thread(f);
        }

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
            threads[threadId].join();
        }

        for (size_t threadId = 0; threadId < 2; threadId++)
        {
            checkCudaErrors(cudaSetDevice(ps[threadId].m_deviceId));
            checkCudaErrors(cudaMemcpy(keys_out + 2 * threadId, ps[threadId].d_keys_out, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
            checkCudaErrors(cudaMemcpy(vals_out + 2 * threadId, ps[threadId].d_vals_out, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
        }

        checkCudaErrors(cudaSetDevice(0));

        printf("[%d] %d %d %d %d %d %d %d %d\n", i, keys_out[0], keys_out[1], vals_out[0], vals_out[1], keys_out[2], keys_out[3], vals_out[2], vals_out[3]);
    }

    for (int i = 0; i < 2; i++)
    {
        ps[i].Release();
    }

    if (keys_in) delete[] keys_in;
    if (vals_in) delete[] vals_in;
    if (keys_out) delete[] keys_out;
    if (vals_out) delete[] vals_out;
    if (threads) delete[] threads;

    checkCudaErrors(cudaDeviceReset());

    return 0;
}
$ nvcc -arch=sm_35 -std=c++11 -o t1348 t1348.cu -I/usr/local/cuda/samples/common/inc
$ ./t1348
[0] 0 0 299 463 0 0 315 333
[1] 0 0 315 333 0 0 117 437
[2] 0 0 117 437 0 0 119 683
[3] 0 0 119 683 0 0 326 498
[4] 0 0 326 498 0 0 390 966
[5] 0 0 390 966 0 0 321 340
[6] 0 0 321 340 0 0 77 271
[7] 0 0 77 271 0 0 553 599
[8] 0 0 553 599 0 0 73 929
[9] 0 0 73 929 0 0 614 1164
$ cuda-memcheck ./t1348
========= CUDA-MEMCHECK
[0] 0 0 664 868 0 0 272 672
[1] 0 0 272 672 0 0 458 491
[2] 0 0 458 491 0 0 555 618
[3] 0 0 555 618 0 0 268 459
[4] 0 0 268 459 0 0 349 546
[5] 0 0 349 546 0 0 13 366
[6] 0 0 13 366 0 0 96 185
[7] 0 0 96 185 0 0 543 1004
[8] 0 0 543 1004 0 0 54 138
[9] 0 0 54 138 0 0 7 500
========= ERROR SUMMARY: 0 errors
$