About sort by key

Why is it not sorted? Here is the code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size);

global void addKernel(int *c, const int *a, const int *b)
int i = threadIdx.x;
c[i] = a[i] + b[i];

int main()
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 50, 20, 30, 40, 50 };
int c[arraySize] = { 0 };

// Add vectors in parallel.
cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, “addWithCuda failed!”);
return 1;

printf(“{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n”,
c[0], c[1], c[2], c[3], c[4]);

// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess) {
fprintf(stderr, “cudaDeviceReset failed!”);
return 1;
return 0;

// Helper function for using CUDA to add vectors in parallel and then run a
// key-value sort on the device.
//
// Steps:
//   1. Copies host arrays a and b (size ints each) to device buffers.
//   2. Launches addKernel with one block of `size` threads: dev_c = dev_a + dev_b.
//   3. Runs thrust::sort_by_key with dev_a (viewed as unsigned) as the keys and
//      dev_c as the values, so dev_c is permuted into the order that sorts
//      dev_a. If a is already ascending, dev_c is left unchanged.
//   4. Copies dev_c back into the host array c.
//
// Parameters:
//   c    - host output array with room for `size` ints.
//   a, b - host input arrays of `size` ints.
//   size - element count; also used as the thread count of the single block.
//
// Returns cudaSuccess, or the first CUDA error encountered. All device buffers
// are released on every path.
cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size)
{
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output).
    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, size>>>(dev_c, dev_a, dev_b);

    // Check for any errors launching the kernel. This is done immediately after
    // the launch so the message is not attributed to the Thrust call below.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // thrust::device_ptr< unsigned int> dev_data_ptr(dev_a);
    // thrust::device_ptr< unsigned int> dev_keys_ptr(dev_c);

    //thrust::sort_by_key(dev_keys_ptr, dev_keys_ptr +size, dev_data_ptr);
    //unsigned int * sc = thrust::raw_pointer_cast(dev_data_ptr);
    //unsigned int * sa = thrust::raw_pointer_cast(dev_keys_ptr);

    // Key-value sort: dev_a supplies the keys, dev_c the values. The wrappers
    // live in their own scope so that the earlier `goto Error` jumps do not
    // bypass the initialization of a local variable.
    {
        thrust::device_ptr<unsigned> keys((unsigned *)dev_a);
        thrust::device_ptr<int> values(dev_c);
        thrust::sort_by_key(keys, keys + size, values);
    }

    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // Shared cleanup for the success and failure paths; pointers that were
    // never allocated are still 0 here.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    return cudaStatus;
}

Your code is working as I would expect.

When I run your code, the output I get is:

{1,2,3,4,5} + {10,20,30,40,50} = {51,22,33,44,55}

This makes sense. The first thing you are doing is adding two vectors, a and b. Contrary to your printout, your vectors are defined like this:

const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 50, 20, 30, 40, 50 };

So the sum result of these in c is:

{51,22,33,44,55}

Then you do a sort-by-key operation (key-value sort) on a and c. Sort by key means use the a vector as the keys, the c vector as the values, and arrange both vectors so they are in the order that a would be if sorted.


However, a is already sorted, so no changes occur (in either a or c), and so the printout result:

{51,22,33,44,55}

is just the order of c produced by the vector add operation.

If you want the c vector itself to be sorted, just use an ordinary thrust::sort on c. No need to use sort-by-key.

Hello txbob,
thank you for your reply.
I know what the problem is now. Thank you very much.