The concept is indirection. It’s a basic programming concept, not unique or specific to CUDA. Perhaps you should give it a try? I’m not sure how to explain it if you are not able to follow the code I provided. We use an index (an array offset) provided in b to select the item to retrieve from a, here:
float my_val = a[b_idx]; // retrieve value from a
^^^^^
note use of b_idx here instead of idx
Here is a completely worked example, using the exact code I already provided:
# cat t89.cu
#include <cstdlib>
#include <iostream>
// Gather kernel: r[i] = a[b[i]] for each i in [0, len_b).
// Expects a 1D grid of 1D blocks with at least len_b total threads.
//   b : read-only array of len_b indices into a; out-of-range entries
//       (>= len_a) are skipped, leaving r[i] unwritten.
//   a : read-only source array of len_a values.
//   r : output array of len_b values.
__global__ void my_kernel_b(const size_t * __restrict__ b, const float * __restrict__ a, size_t len_a, size_t len_b, float * __restrict__ r){
  // Widen before multiplying: blockDim.x*blockIdx.x in 32-bit can silently
  // overflow for very large grids.
  size_t idx = threadIdx.x + (size_t)blockDim.x * blockIdx.x; // flat global index
  if (idx < len_b){                // guard: grid may overshoot len_b
    size_t b_idx = b[idx];         // indirection: b holds an offset into a
    if (b_idx < len_a){            // guard against out-of-range indices in b
      float my_val = a[b_idx];     // retrieve value from a
      // do some work on the value from a
      r[idx] = my_val;
    }
  }
}
// Abort with a diagnostic on any CUDA runtime API failure. Kernel launches
// themselves return no status; use cudaGetLastError() right after the launch.
#define CUDA_CHECK(call)                                                   \
  do {                                                                     \
    cudaError_t err_ = (call);                                             \
    if (err_ != cudaSuccess) {                                             \
      std::cerr << "CUDA error " << __FILE__ << ":" << __LINE__ << ": "    \
                << cudaGetErrorString(err_) << std::endl;                  \
      std::exit(1);                                                        \
    }                                                                      \
  } while (0)

// Host driver: gathers a[b[i]] into r[i] on the GPU and prints the result.
int main(){
  float a[] = {100,101,102,200,201,202,300,301,302}; // source data
  size_t b[] = {0, 3, 6};                            // indices into a
  // Derive lengths from the arrays so they cannot drift out of sync.
  size_t len_a = sizeof(a)/sizeof(a[0]);
  size_t len_b = sizeof(b)/sizeof(b[0]);
  float *d_a, *d_r;
  size_t *d_b;
  float *r = new float[len_b];                       // host result buffer
  CUDA_CHECK(cudaMalloc(&d_a, len_a*sizeof(a[0])));
  CUDA_CHECK(cudaMalloc(&d_b, len_b*sizeof(b[0])));
  CUDA_CHECK(cudaMalloc(&d_r, len_b*sizeof(r[0])));
  CUDA_CHECK(cudaMemcpy(d_a, a, len_a*sizeof(a[0]), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(d_b, b, len_b*sizeof(b[0]), cudaMemcpyHostToDevice));
  my_kernel_b<<<(len_b + 255)/256, 256>>>(d_b, d_a, len_a, len_b, d_r);
  CUDA_CHECK(cudaGetLastError());  // catch launch-configuration errors
  // The blocking device-to-host copy also synchronizes with the kernel.
  CUDA_CHECK(cudaMemcpy(r, d_r, len_b*sizeof(r[0]), cudaMemcpyDeviceToHost));
  for (size_t i = 0; i < len_b; i++) std::cout << r[i] << ",";
  std::cout << std::endl;
  // Release host and device resources.
  delete [] r;
  CUDA_CHECK(cudaFree(d_a));
  CUDA_CHECK(cudaFree(d_b));
  CUDA_CHECK(cudaFree(d_r));
  return 0;
}
# nvcc -o t89 t89.cu
# compute-sanitizer ./t89
========= COMPUTE-SANITIZER
100,200,300,
========= ERROR SUMMARY: 0 errors
#