reduction of dynamic array

I have two dynamic arrays A and B each of size 1024:

I want to calculate (A-B)^2. I have written a code to calculate this problem using the kernel call : sumdiffsquare<<<1, dim3(8,4)>>>
global void

sumdiffsquare(float* C, float* A, float * pop, int wA, int hA)
{
// Block index
int bx = blockIdx.x;
int by = blockIdx.y;

// Thread index
int tx = threadIdx.x;
int ty = threadIdx.y;

// Accumulate row i of A and column j of B
int j = by * blockDim.y + ty;
int i = bx * blockDim.x + tx;

if (i < wA && j < hA)
	C[j * wA + i] = (A[j * wA + i] - pop[j * wA + i])*(A[j * wA + i] - pop[j * wA + i]);
__syncthreads();

for (int stride = 1; stride < 1024; stride *= 2)
{
	if (((j*wA + i) % (2 * stride)) == 0)
	{
		C[j*wA + i] += C[j*wA + i + stride];
	}

	// synchronize within threadblock
	__syncthreads();
}

}

Is there any easy method to perform this calculation like thrust or using the kernel call sumdiffsquare<<< 2, dim3(4,4)>>>

note that wA=256 and hA=4

I’ve tried to calc. C = (A-B)^2 using Thrust:

// DO NOT FORGET OPTION: --expt-extended-lambda

#include <cuda_runtime.h>
#include <thrust/transform.h>
#include <thrust/device_ptr.h>
#include <algorithm>
#include <random>

#include <iostream>

int main() {
  using namespace std;
  const int M = 256; // row
  const int N =   4; // col

  float hA[M][N];
  float hB[M][N];
  float hC[M][N];

  mt19937 gen;
  uniform_real_distribution<float> dist;
  auto rand = [&]() { return dist(gen); };

  // fill hA and hB with random values 
  generate_n(&hA[0][0], M*N, rand);
  generate_n(&hB[0][0], M*N, rand);

  // C = (A-B)^2 on CPU
  for ( int row = 0; row < M; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      float va = hA[row][col];
      float vb = hB[row][col];
      hC[row][col] = (va-vb)*(va-vb);
    }
  }

  float (*dA)[M][N];
  float (*dB)[M][N];
  float (*dC)[M][N];
  cudaMalloc(&dA, M*N*sizeof(float));
  cudaMalloc(&dB, M*N*sizeof(float));
  cudaMalloc(&dC, M*N*sizeof(float));

  cudaMemcpy(dA, hA, M*N*sizeof(float), cudaMemcpyDefault);
  cudaMemcpy(dB, hB, M*N*sizeof(float), cudaMemcpyDefault);

  thrust::device_ptr<float> pA((float*)dA);
  thrust::device_ptr<float> pB((float*)dB);
  thrust::device_ptr<float> pC((float*)dC);
  // C = (A-B)^2 on GPU using thrust::transform
  thrust::transform(pA, pA+M*N, pB, pC,
                    [] __device__ (float va, float vb) { return (va-vb)*(va-vb);});

  float CC[M][N]; // copy of dC
  cudaMemcpy(&CC[0][0], dC, M*N*sizeof(float), cudaMemcpyDefault);
  // compare CPU & GPU result
  for ( int row = 0; row < M; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      float vh = hC[row][col];
      float vd = CC[row][col];
      if ( vh != vd ) 
        cout << "oops! " << vh << " != " << vd 
             << " at " << row << "," << col << endl;
    }
  }

  cudaFree(dA);
  cudaFree(dB);
  cudaFree(dC);
  cudaDeviceReset();

}

Ha, I was about to post a solution then I see that episteme did the same thing XD

I have to ask though, why on God’s green Earth are you using that much vanilla CUDA? Thrust’s built-in vectors are perfectly capable of this and make the code actually palatable. CUDA C is just like cigarettes, it eventually gives you cancer.

I see.

I don’t wanna explain how to use thrust::device_vector. that’s not my purpose.

nicotine-free version.

// DO NOT FORGET OPTION: --expt-extended-lambda

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>

#include <algorithm>
#include <random>

#include <iostream>

int main() {
  using namespace std;
  const int M = 256; // row
  const int N =   4; // col

  float hA[M][N];
  float hB[M][N];
  float hC[M][N];

  mt19937 gen;
  uniform_real_distribution<float> dist;
  auto rand = [&]() { return dist(gen); };

  // fill hA and hB with random values 
  generate_n(&hA[0][0], M*N, rand);
  generate_n(&hB[0][0], M*N, rand);

  // C = (A-B)^2 on CPU
  for ( int row = 0; row < M; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      float va = hA[row][col];
      float vb = hB[row][col];
      hC[row][col] = (va-vb)*(va-vb);
    }
  }

  thrust::device_vector<float> dA((float*)hA, (float*)hA+M*N);
  thrust::device_vector<float> dB((float*)hB, (float*)hB+M*N);
  thrust::device_vector<float> dC(M*N);

  // C = (A-B)^2 on GPU using thrust::transform
  thrust::transform(begin(dA), end(dA), begin(dB), begin(dC),
                    [] __device__ (float va, float vb) { return (va-vb)*(va-vb);});

  float CC[M][N]; // copy of dC
  thrust::copy(begin(dC), end(dC), (float*)&CC);
  // compare CPU & GPU result
  for ( int row = 0; row < M; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      float vh = hC[row][col];
      float vd = CC[row][col];
      if ( vh != vd ) 
        cout << "oops! " << vh << " != " << vd 
             << " at " << row << "," << col << endl;
    }
  }

}

thank you all.
is the thrust has the ability to do any calculations?

is the thrust has the ability to do any calculations?

depends on memory-layout.
if the data can be treated as linear/contiguous(1D), almost any calc.

this solution calculate the expression but how to add the result array elements into one element
let say that the input is (256*4) array and needed to be reduced into 1 element only by adding all the elements

piece of cake :)

// DO NOT FORGET OPTION: --expt-extended-lambda

#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>

#include <algorithm>
#include <random>

#include <iostream>

int main() {
  using namespace std;
  const int M = 256; // row
  const int N =   4; // col

  float hA[M][N];
  float hB[M][N];

  mt19937 gen;
  uniform_real_distribution<float> dist;
  auto rand = [&]() { return dist(gen); };

  // fill hA and hB with random values 
  generate_n(&hA[0][0], M*N, rand);
  generate_n(&hB[0][0], M*N, rand);

  // C = (A-B)^2 on CPU
  float sum = 0.0;
  for ( int row = 0; row < M; ++row ) {
    for ( int col = 0; col < N; ++col ) {
      float va = hA[row][col];
      float vb = hB[row][col];
      sum += (va-vb)*(va-vb);
    }
  }
  cout << "sum = " << sum << " (CPU)" <<endl;

  thrust::device_vector<float> dA((float*)hA, (float*)hA+M*N);
  thrust::device_vector<float> dB((float*)hB, (float*)hB+M*N);

  // sum = sigma((A-B)^2) on GPU using thrust::transform_reduce
  sum = thrust::transform_reduce(thrust::make_zip_iterator(thrust::make_tuple(begin(dA), begin(dB))),
                                 thrust::make_zip_iterator(thrust::make_tuple(end(dA)  , end(dB)  )),
                                 [] __device__ (const auto& ab) {
                                   float va = thrust::get<0>(ab);
                                   float vb = thrust::get<1>(ab);
                                   return (va-vb)*(va-vb);
                                 },
                                 0.0f,
                                 [] __device__ (float x, float y) { return  x + y; });
  cout << "sum = " << sum << " (GPU)" << endl;

}

thank you
a wonderful cake :)

sorry episteme,

kernel.cu(44): error: “auto” is not allowed here

kernel.cu(45): error: no instance of overloaded function “thrust::get” matches the argument list
argument types are: ()

kernel.cu(46): error: no instance of overloaded function “thrust::get” matches the argument list
argument types are: ()

3 errors detected in the compilation

watch diabetes.
thrust tends to consume bigger overhead to invoke than simple kernel-call.

3 errors detected in the compilation

oops… this code have been compiled/executed successfuly on:
Windows 10(x64), Visual C++ 2015 update 1, CUDA 8.0

you use g++?
I’m not familier with Linux/g++, but I heard “-std=c++11” needs to be specified on compilation.

i am using also Windows 7(x64), Visual Studio 2013 , CUDA 7.5

try to replace ‘auto&’ with ‘thrust::tuple<float,float>&’

Done
thank you very much.
how can learn thrust with CUDA. any recommended books?
and another question
in your opinion, if i have an array 4*4 it is easy to calculate the sigma of (diagonal elements)^2 and sigma of (nondiagonal elements)^2
finally, thank you again for your cake.

  1. I usually refer online docs on http://thrust.github.io/

  2. how about to prepare ‘mask’ as below then calc (A*MASK)^2?
    1 0 0 0
    0 1 0 0
    0 0 1 0
    0 0 0 1

however CUDA is not suitable for such small task. better to calc on CPU I think.