Here’s a slightly different approach.

- Flatten your A matrix into a thrust vector.
- Replicate your B matrix into a vector of the same length:

A = [0 0 0 1 0 0 1 0 0 0 1 1]

B = [0 1 1 1 0 1 1 1 0 1 1 1]

- Create a C vector that is 1 if there is a match between the two:

C = [1 0 0 1 1 0 1 0 1 0 1 1]

- Do an ordinary sum reduction on C:

res = 7

- Divide by N, the length of the template

ans = 7/4 = 1.75

Note that steps 2 and 3, the creation of B and C vectors, don’t have to be done explicitly, they can be handled through fusion. Also, since transform_reduce can only accept a single input vector, we must zip the A and (implicit) B vectors together. Here’s a fully worked example:

```
#include <iostream>
#include <stdlib.h>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/functional.h>
#include <thrust/iterator/permutation_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#define A_ROWS 3
#define A_COLS 4
#define RANGE 2
struct my_modulo_functor : public thrust::unary_function<int, int>
{
__host__ __device__
int operator() (int idx) {
return idx%(A_COLS);}
};
struct my_tuple_functor : public thrust::unary_function<thrust::tuple<int, int>, int>
{
__host__ __device__
int operator() (thrust::tuple<int, int> a) {
if (a.get<0>() == a.get<1>()) return 1;
return 0;}
};
typedef thrust::transform_iterator<my_modulo_functor, thrust::counting_iterator<int>, int> my_titer;
typedef thrust::permutation_iterator<thrust::device_vector<int>::iterator, my_titer> my_piter;
typedef thrust::zip_iterator<thrust::tuple<thrust::device_vector<int>::iterator, my_piter> > zip2int;
int main(){
int A_mat[A_ROWS*A_COLS] = { 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1 };
int B_mat[A_COLS] = {0, 1, 1, 1};
thrust::host_vector<int> A(A_ROWS*A_COLS); // the data
thrust::host_vector<int> B(A_COLS); // the template
// synthetic data:
// for (int i = 0; i < A_ROWS*A_COLS; i++) A[i] = rand()%RANGE;
// for (int i = 0; i < A_COLS; i++) B[i] = rand()%RANGE;
// instead flatten/copy matrices into vectors A, B
for (int i = 0; i < A_ROWS*A_COLS; i++) A[i] = A_mat[i];
for (int i = 0; i < A_COLS; i++) B[i] = B_mat[i];
thrust::device_vector<int> d_A = A;
thrust::device_vector<int> d_B = B;
zip2int first = thrust::make_zip_iterator(thrust::make_tuple(d_A.begin(), thrust::make_permutation_iterator(d_B.begin(), thrust::make_transform_iterator(thrust::counting_iterator<int>(0), my_modulo_functor()))));
zip2int last = first + (A_ROWS*A_COLS);
int red_sum = thrust::transform_reduce(first, last, my_tuple_functor(), 0, thrust::plus<int>());
float res = (float)red_sum/(float)(A_COLS);
std::cout << "result = " << res << std::endl;
return 0;
}
```