Hello,

I’m working on a C++ project that uses Thrust, and I have some code that can perhaps be optimized, but I am not sure. Below is a minimal working example. The input vector can be thought of as a matrix in row-major format. In the real application, the matrix dimensions are on the order of rows = 10, columns = 10^7. The functor should operate on the columns of the matrix and write the result back into the same column (number of threads = number of columns). The solution I have found uses the pointer passed to the functor (see “arr” below) to read and write the data. My questions are: will the code run faster if I use iterators (in combination with transform) instead of the pointer held by the functor? If so, which combination of iterators should I use? And how can I implement a functor that returns an array? Or is there perhaps a better approach?

Thank you.

```
#include <cassert>
#include <cstddef>
#include <iostream>

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/sort.h>
/**
 * Per-column threshold functor for a matrix stored in row-major order.
 *
 * Calling the functor with column index i:
 *   1. copies column i (elements arr[i], arr[i + C], ..., arr[i + (R-1)*C])
 *      into a private scratch buffer and sorts it ascending;
 *   2. computes thr = min*min - max from the sorted copy;
 *   3. overwrites every element v of the column with max(v - thr, 0).
 *
 * Each invocation reads and writes only its own column, so invocations for
 * distinct columns are independent of each other.
 *
 * Members:
 *   R   - number of rows (elements per column)
 *   C   - number of columns, i.e. the stride between consecutive rows
 *   arr - matrix data; indexed up to i + (R-1)*C, so it is expected to hold
 *         at least R*C floats and to be addressable from wherever the
 *         functor runs (device memory when launched with thrust::device)
 */
struct functor
{
    const int R;
    const int C;
    float *arr;

    functor(int _R, int _C, float *_arr) : R(_R), C(_C), arr(_arr) {}

    /**
     * Process column i in place.
     *
     * The scratch buffer is a local, so the call operator does not mutate the
     * functor and can be const; nothing is shared between invocations.
     */
    __host__ __device__
    void operator()(int i) const {
        // Nothing to do for an empty column; also avoids reading sorted[0] below.
        if (R <= 0) {
            return;
        }

        // Index arithmetic in size_t so that (R-1)*C cannot overflow int.
        const std::size_t stride = static_cast<std::size_t>(C);
        const std::size_t first = static_cast<std::size_t>(i);

        // One scratch allocation per call. In device code this comes from the
        // device heap, whose size is limited (see cudaLimitMallocHeapSize);
        // an exhausted heap is expected to yield a null pointer, so fail
        // with a clear diagnostic instead of dereferencing it.
        float *sorted = new float[R];
        assert(sorted != nullptr);

        // Gather the column into the scratch buffer.
        for (int r = 0; r < R; ++r) {
            sorted[r] = arr[first + r * stride];
        }

        /* The main body of the functor starts here; in a real application
           it is more complicated. */
        // thrust::seq sorts sequentially inside the calling thread. The
        // thrust::device policy must not be used here: this code already runs
        // inside a single thread and the buffer is private to it.
        thrust::sort(thrust::seq, sorted, sorted + R);
        const float thr = sorted[0] * sorted[0] - sorted[R - 1];
        /* End of main body */

        delete[] sorted;

        // Write the shifted, clamped values back. The column has not been
        // modified yet, so the original values are re-read from arr rather
        // than kept in a second scratch buffer.
        for (int r = 0; r < R; ++r) {
            float &cell = arr[first + r * stride];
            const float shifted = cell - thr;
            cell = (shifted < 0.0f) ? 0.0f : shifted;
        }
    }
};
/**
 * Demo driver: builds a 3 x 4 row-major matrix on the host, copies it to the
 * device, applies `functor` once per column, and prints the resulting matrix
 * elements on one line separated by spaces.
 *
 * Returns 0 on success, 1 if the device reports an error after the launch.
 */
int main() {
    // constexpr makes vecL a constant expression, so x below is an ordinary
    // fixed-size array rather than a (non-standard) variable-length array.
    constexpr int R = 3;  // rows
    constexpr int C = 4;  // columns
    constexpr int vecL = C * R;

    // initialize host array (row-major)
    const float x[vecL] = {1.0f, 2.0f, 3.0f, 1.5f,
                           4.0f, 6.0f, 3.0f, 3.0f,
                           2.0f, 5.4f, 1.3f, 3.2f};

    // transfer to device
    thrust::device_vector<float> d_x(x, x + vecL);

    // One functor call per column; the counting iterator yields int so the
    // index matches the functor's parameter type without narrowing.
    thrust::for_each_n(thrust::device, thrust::counting_iterator<int>(0), C,
                       functor(R, C, thrust::raw_pointer_cast(d_x.data())));

    // Report any device-side failure instead of printing garbage.
    const cudaError_t status = cudaDeviceSynchronize();
    if (status != cudaSuccess) {
        std::cerr << "CUDA error: " << cudaGetErrorString(status) << "\n";
        return 1;
    }

    // copy result to host
    thrust::host_vector<float> result_host = d_x;

    // print out result
    for (int i = 0; i < vecL; i++)
        std::cout << result_host[i] << " ";
    std::cout << "\n";
    // expected output: 4 4 4.31 2.45 7 8 4.31 3.95 5 7.4 2.61 4.15
    return 0;
}
```