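You could use thrust::gather. But you can also just use thrust::copy (or thrust::copy_n, as in the code below) with a permutation iterator. The idea is to sort only the key array together with an index sequence (thrust::sort_by_key), and then use the sorted indices to rearrange the remaining arrays in a single pass.
Here’s a worked example. For my test case (5 arrays each of 1M elements) my method is about 3x faster than your method (K40c, CUDA 7, CentOS 6.2). In the output, t1 is the time for your method (sorting a zip iterator over all 5 arrays) and t2 is the time for my method (sort_by_key plus the permuted copy):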
$ cat t746.cu
#include <stdio.h>
#include <stdlib.h>
#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/sequence.h>
#include <thrust/copy.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/permutation_iterator.h>
#define DSIZE 1048576
#include <time.h>
#include <sys/time.h>
// Number of microseconds in one second.
#define USECPSEC 1000000ULL
// Wall-clock helper built on gettimeofday().
// Returns the current time in microseconds minus `start`:
//   - pass 0 to obtain a timestamp,
//   - pass an earlier timestamp to obtain the microseconds elapsed since it.
unsigned long long dtime_usec(unsigned long long start){
  struct timeval now;
  gettimeofday(&now, 0);  // second argument (timezone) is unused
  // Fold the seconds and microseconds fields into one microsecond count.
  const unsigned long long whole_seconds_usec = now.tv_sec*USECPSEC;
  const unsigned long long now_usec = whole_seconds_usec + now.tv_usec;
  return now_usec - start;
}
// Comparison functor for 5-element tuples of T, usable on host and device.
// Only the first tuple element takes part in the ordering, and its storage is
// read as an `unsigned` value rather than as a T. For T = int this means
// negative keys compare greater than non-negative ones.
// NOTE(review): reading the key through an `unsigned` reference assumes
// sizeof(T) == sizeof(unsigned); confirm before instantiating with other types.
template<typename T>
struct tuple_comp
{
  __host__ __device__
  bool operator()(const thrust::tuple<T, T, T, T, T> t,
                  const thrust::tuple<T, T, T, T, T> v)
  {
    // Reinterpret (not convert) the first element of each tuple as unsigned.
    const unsigned &lhs_key = reinterpret_cast<const unsigned &>(thrust::get<0>(t));
    const unsigned &rhs_key = reinterpret_cast<const unsigned &>(thrust::get<0>(v));
    return lhs_key < rhs_key;
  }
};
// Sorts five equally long arrays in place as if they were the columns of one
// table: whole rows are reordered according to tuple_comp<int>, which looks
// only at the first column (pa).
//   pa, ta, la, fs, nm : raw pointers, wrapped in thrust::device_ptr below
//                        (so they are expected to point to device memory)
//   array_capacity     : number of elements in each array
void my_sort(int *pa, int *ta, int *la, int *fs, int *nm, size_t array_capacity){
  typedef thrust::device_ptr<int> dev_int_ptr;
  typedef thrust::tuple<dev_int_ptr, dev_int_ptr, dev_int_ptr,
                        dev_int_ptr, dev_int_ptr> column_ptrs;
  typedef thrust::zip_iterator<column_ptrs> row_iterator;

  // Wrap each raw pointer once.
  const dev_int_ptr col_pa(pa);
  const dev_int_ptr col_ta(ta);
  const dev_int_ptr col_la(la);
  const dev_int_ptr col_fs(fs);
  const dev_int_ptr col_nm(nm);

  // A zip iterator presents the five columns as a single range of 5-tuples.
  const row_iterator first_row = thrust::make_zip_iterator(
      thrust::make_tuple(col_pa, col_ta, col_la, col_fs, col_nm));
  // Advancing the zip iterator advances every column by array_capacity.
  const row_iterator last_row = first_row + array_capacity;

  thrust::sort(first_row, last_row, tuple_comp<int>());
}
// Abort with file/line context when a CUDA runtime call reports an error.
#define CUDA_CHECK(call)                                                    \
  do {                                                                      \
    cudaError_t cuda_check_status_ = (call);                                \
    if (cuda_check_status_ != cudaSuccess) {                                \
      fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,         \
              cudaGetErrorString(cuda_check_status_));                      \
      exit(1);                                                              \
    }                                                                       \
  } while (0)

// Allocate a host buffer of `bytes` bytes; print a message and exit on failure.
static int *alloc_host_column(size_t bytes){
  int *buf = (int *)malloc(bytes);
  if (buf == NULL) {
    fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)bytes);
    exit(1);
  }
  return buf;
}

// Copy `count` columns of `bytes` bytes each from host buffers to device buffers.
static void upload_columns(int **dev, int **host, int count, size_t bytes){
  for (int c = 0; c < count; c++)
    CUDA_CHECK(cudaMemcpy(dev[c], host[c], bytes, cudaMemcpyHostToDevice));
}

// Benchmark and cross-check two ways of sorting 5 arrays of DSIZE ints by the
// first array:
//   t1: my_sort(), i.e. thrust::sort over a zip iterator of all 5 arrays;
//   t2: thrust::sort_by_key on (keys, index sequence), then one thrust::copy_n
//       through a permutation iterator to reorder the other 4 arrays.
// Prints both timings, then compares the results element by element.
// Returns 0 when both methods agree, 1 on the first mismatch.
int main(){
  const int ncols = 5;
  const size_t dsize = DSIZE*sizeof(int);  // bytes per column
  int *h_in[ncols];        // random input columns
  int *h_ref[ncols];       // results of method 1 (my_sort)
  int *h_out[ncols];       // results of method 2 (sort_by_key + permuted copy)
  int *d_col[ncols];       // device columns; d_col[0] holds the keys
  int *d_idx = NULL;       // index sequence permuted alongside the keys
  int *d_perm[ncols - 1];  // permuted copies of d_col[1..4]
  int status = 0;

  for (int c = 0; c < ncols; c++) {
    h_in[c]  = alloc_host_column(dsize);
    h_ref[c] = alloc_host_column(dsize);
    h_out[c] = alloc_host_column(dsize);
    CUDA_CHECK(cudaMalloc(&d_col[c], dsize));
  }

  // Same rand() call order as filling h1[i]..h5[i] for each i in turn.
  for (int i = 0; i < DSIZE; i++)
    for (int c = 0; c < ncols; c++)
      h_in[c][i] = (rand() - RAND_MAX/2);

  // warm-up (on real data, so the sort never reads uninitialized device memory)
  upload_columns(d_col, h_in, ncols, dsize);
  my_sort(d_col[0], d_col[1], d_col[2], d_col[3], d_col[4], DSIZE);
  CUDA_CHECK(cudaDeviceSynchronize());

  // Method 1: the warm-up sorted in place, so restore the unsorted input first.
  upload_columns(d_col, h_in, ncols, dsize);
  unsigned long long mytime = dtime_usec(0);
  my_sort(d_col[0], d_col[1], d_col[2], d_col[3], d_col[4], DSIZE);
  CUDA_CHECK(cudaDeviceSynchronize());
  mytime = dtime_usec(mytime);
  for (int c = 0; c < ncols; c++)
    CUDA_CHECK(cudaMemcpy(h_ref[c], d_col[c], dsize, cudaMemcpyDeviceToHost));
  // Restore the unsorted input again for method 2.
  upload_columns(d_col, h_in, ncols, dsize);
  printf("t1: %fs\n", mytime/(float)USECPSEC);

  // Method 2. The scratch allocations are inside the timed region, so t2
  // includes their cost.
  mytime = dtime_usec(0);
  CUDA_CHECK(cudaMalloc(&d_idx, dsize));
  for (int c = 0; c < ncols - 1; c++)
    CUDA_CHECK(cudaMalloc(&d_perm[c], dsize));
  thrust::sequence(thrust::device_ptr<int>(d_idx), thrust::device_ptr<int>(d_idx+DSIZE));
  // Keys are sorted as unsigned, matching the comparison used by my_sort().
  thrust::sort_by_key(thrust::device_ptr<unsigned>((unsigned *)d_col[0]),
                      thrust::device_ptr<unsigned>((unsigned *)d_col[0]+DSIZE),
                      thrust::device_ptr<int>(d_idx));
  // Gather the 4 payload columns through the sorted indices in one pass.
  thrust::copy_n(
      thrust::make_permutation_iterator(
          thrust::make_zip_iterator(thrust::make_tuple(
              thrust::device_ptr<int>(d_col[1]), thrust::device_ptr<int>(d_col[2]),
              thrust::device_ptr<int>(d_col[3]), thrust::device_ptr<int>(d_col[4]))),
          thrust::device_ptr<int>(d_idx)),
      DSIZE,
      thrust::make_zip_iterator(thrust::make_tuple(
          thrust::device_ptr<int>(d_perm[0]), thrust::device_ptr<int>(d_perm[1]),
          thrust::device_ptr<int>(d_perm[2]), thrust::device_ptr<int>(d_perm[3]))));
  CUDA_CHECK(cudaDeviceSynchronize());
  mytime = dtime_usec(mytime);
  printf("t2: %fs\n", mytime/(float)USECPSEC);

  // Keys were sorted in place; the payload columns live in the scratch buffers.
  CUDA_CHECK(cudaMemcpy(h_out[0], d_col[0], dsize, cudaMemcpyDeviceToHost));
  for (int c = 1; c < ncols; c++)
    CUDA_CHECK(cudaMemcpy(h_out[c], d_perm[c - 1], dsize, cudaMemcpyDeviceToHost));

  // validate results: stop at the first element where the two methods differ
  for (int i = 0; i < DSIZE && status == 0; i++) {
    for (int c = 0; c < ncols; c++) {
      if (h_ref[c][i] != h_out[c][i]) {
        printf("h%d mismatch at %d, was: %d, should be: %d\n", c + 1, i, h_out[c][i], h_ref[c][i]);
        status = 1;
        break;
      }
    }
  }

  // Release everything on both the success and the mismatch path.
  CUDA_CHECK(cudaFree(d_idx));
  for (int c = 0; c < ncols - 1; c++)
    CUDA_CHECK(cudaFree(d_perm[c]));
  for (int c = 0; c < ncols; c++) {
    CUDA_CHECK(cudaFree(d_col[c]));
    free(h_in[c]);
    free(h_ref[c]);
    free(h_out[c]);
  }
  return status;
}
$ nvcc -o t746 t746.cu
$ ./t746
t1: 0.016679s
t2: 0.005404s
$