# reduction of dynamic array

I have two dynamic arrays A and B each of size 1024:

## I want to calculate (A-B)^2. I have written code to solve this problem using the kernel call `sumdiffsquare<<<1, dim3(8,4)>>>`:

__global__ void

sumdiffsquare(float* C, float* A, float* pop, int wA, int hA)
{
    // Elementwise: C[j*wA+i] = (A - pop)^2 over a wA x hA array, and the
    // total sum of all squared differences is accumulated into C[0].
    //
    // Preconditions (caller's responsibility):
    //  - C[0] is zero-initialized (e.g. cudaMemset) before launch, because
    //    each block atomically adds its partial sum into it.
    //  - blockDim.x * blockDim.y is a power of two and <= 1024
    //    (true for the dim3(8,4) launch used in this thread).
    //  - The 2D grid covers the whole wA x hA domain.
    // After the kernel, C[0] holds the reduced sum (not the elementwise
    // value of element (0,0)); every other entry of C is elementwise.

    // Shared scratch for the per-block tree reduction (max 1024 threads).
    __shared__ float partial[1024];

    // Thread/element coordinates. Note: the original code used tx/ty
    // without declaring them, and its reduction loop raced on global
    // memory with no synchronization and read past the end of C.
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int i = blockIdx.x * blockDim.x + tx;   // column index
    int j = blockIdx.y * blockDim.y + ty;   // row index
    int tid = ty * blockDim.x + tx;         // linear thread id within block
    int nthreads = blockDim.x * blockDim.y;

    float sq = 0.0f;                        // 0 for out-of-range threads
    if (i < wA && j < hA) {
        float d = A[j * wA + i] - pop[j * wA + i];
        sq = d * d;
        if (i != 0 || j != 0)               // C[0] is reserved for the sum
            C[j * wA + i] = sq;
    }
    partial[tid] = sq;
    __syncthreads();                        // all partials written

    // Block-level tree reduction in shared memory; barrier is outside any
    // divergent branch so every thread reaches it.
    for (int stride = nthreads / 2; stride > 0; stride /= 2) {
        if (tid < stride)
            partial[tid] += partial[tid + stride];
        __syncthreads();
    }

    // One atomic per block combines the per-block sums grid-wide.
    if (tid == 0)
        atomicAdd(&C[0], partial[0]);
}

Is there any easy method to perform this calculation like thrust or using the kernel call sumdiffsquare<<< 2, dim3(4,4)>>>

note that wA=256 and hA=4

I’ve tried to calc. C = (A-B)^2 using Thrust:

``````// DO NOT FORGET OPTION: --expt-extended-lambda

#include <cuda_runtime.h>
#include <thrust/transform.h>
#include <thrust/device_ptr.h>
#include <algorithm>
#include <random>

#include <iostream>

// Computes C = (A-B)^2 elementwise on the CPU and on the GPU (via
// thrust::transform over raw device allocations) and reports mismatches.
// Fix over the original: every CUDA runtime call is now error-checked;
// the original silently ignored all cudaError_t return values.
int main() {
    using namespace std;
    const int M = 256; // rows
    const int N =   4; // columns

    // Local helper: abort with a readable message on any CUDA API failure.
    auto check = [](cudaError_t err) {
        if (err != cudaSuccess) {
            cerr << "CUDA error: " << cudaGetErrorString(err) << endl;
            exit(EXIT_FAILURE);
        }
    };

    float hA[M][N];
    float hB[M][N];
    float hC[M][N];

    // Deterministic (default-seeded) uniform [0,1) random fill.
    mt19937 gen;
    uniform_real_distribution<float> dist;
    auto rand = [&]() { return dist(gen); };
    generate_n(&hA[0][0], M*N, rand);
    generate_n(&hB[0][0], M*N, rand);

    // Reference result on the CPU.
    for ( int row = 0; row < M; ++row ) {
        for ( int col = 0; col < N; ++col ) {
            float va = hA[row][col];
            float vb = hB[row][col];
            hC[row][col] = (va-vb)*(va-vb);
        }
    }

    // Device buffers, typed as pointer-to-2D-array for symmetry with the
    // host arrays; thrust works on the flat float* view below.
    float (*dA)[M][N];
    float (*dB)[M][N];
    float (*dC)[M][N];
    check(cudaMalloc(&dA, M*N*sizeof(float)));
    check(cudaMalloc(&dB, M*N*sizeof(float)));
    check(cudaMalloc(&dC, M*N*sizeof(float)));

    check(cudaMemcpy(dA, hA, M*N*sizeof(float), cudaMemcpyDefault));
    check(cudaMemcpy(dB, hB, M*N*sizeof(float), cudaMemcpyDefault));

    thrust::device_ptr<float> pA((float*)dA);
    thrust::device_ptr<float> pB((float*)dB);
    thrust::device_ptr<float> pC((float*)dC);
    // C = (A-B)^2 on GPU using thrust::transform (thrust itself reports
    // failures by throwing, so no extra check is needed here).
    thrust::transform(pA, pA+M*N, pB, pC,
        [] __device__ (float va, float vb) { return (va-vb)*(va-vb);});

    // Copy the GPU result back (blocking copy also synchronizes).
    float CC[M][N];
    check(cudaMemcpy(&CC[0][0], dC, M*N*sizeof(float), cudaMemcpyDefault));

    // Compare CPU & GPU results; identical float ops in identical order,
    // so an exact comparison is intended here.
    for ( int row = 0; row < M; ++row ) {
        for ( int col = 0; col < N; ++col ) {
            float vh = hC[row][col];
            float vd = CC[row][col];
            if ( vh != vd )
                cout << "oops! " << vh << " != " << vd
                     << " at " << row << "," << col << endl;
        }
    }

    check(cudaFree(dA));
    check(cudaFree(dB));
    check(cudaFree(dC));

}
``````

Ha, I was about to post a solution then I see that episteme did the same thing XD

I have to ask though, why on God’s green Earth are you using that much vanilla CUDA? Thrust’s built-in vectors are perfectly capable of this and make the code actually palatable. CUDA C is just like cigarettes, it eventually gives you cancer.

I see.

I don’t wanna explain how to use thrust::device_vector. that’s not my purpose.

nicotine-free version.

``````// DO NOT FORGET OPTION: --expt-extended-lambda

#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>

#include <algorithm>
#include <random>

#include <iostream>

// Verifies that thrust::transform on device_vectors reproduces the
// CPU-computed elementwise squared difference (A-B)^2.
int main() {
    using namespace std;
    const int M = 256; // number of rows
    const int N =   4; // number of columns
    const int total = M * N;

    float srcA[M][N];
    float srcB[M][N];
    float refC[M][N];

    // Deterministic default-seeded uniform [0,1) generator.
    mt19937 engine;
    uniform_real_distribution<float> uni;
    generate_n(&srcA[0][0], total, [&]() { return uni(engine); });
    generate_n(&srcB[0][0], total, [&]() { return uni(engine); });

    // CPU reference: refC = (srcA - srcB)^2, computed over the flat view.
    const float* flatA = &srcA[0][0];
    const float* flatB = &srcB[0][0];
    float* flatC = &refC[0][0];
    for ( int k = 0; k < total; ++k ) {
        float d = flatA[k] - flatB[k];
        flatC[k] = d * d;
    }

    // Same computation on the GPU; device_vector handles the transfers.
    thrust::device_vector<float> devA(flatA, flatA + total);
    thrust::device_vector<float> devB(flatB, flatB + total);
    thrust::device_vector<float> devC(total);
    thrust::transform(devA.begin(), devA.end(), devB.begin(), devC.begin(),
        [] __device__ (float va, float vb) { return (va-vb)*(va-vb);});

    // Pull the GPU result back and compare element by element.
    float gpuC[M][N];
    thrust::copy(devC.begin(), devC.end(), &gpuC[0][0]);
    for ( int row = 0; row < M; ++row ) {
        for ( int col = 0; col < N; ++col ) {
            float vh = refC[row][col];
            float vd = gpuC[row][col];
            if ( vh != vd )
                cout << "oops! " << vh << " != " << vd
                     << " at " << row << "," << col << endl;
        }
    }

}
``````

thank you all.
Does Thrust have the ability to do any calculation?

Does Thrust have the ability to do any calculation?

depends on memory-layout.
if the data can be treated as linear/contiguous(1D), almost any calc.

This solution calculates the expression, but how do I add the elements of the result array into one element?
Let's say the input is a (256*4) array and it needs to be reduced to a single element by adding all the elements together.

piece of cake :)

``````// DO NOT FORGET OPTION: --expt-extended-lambda

#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>

#include <algorithm>
#include <random>

#include <iostream>

// Computes sum((A-B)^2) on the CPU and on the GPU (via
// thrust::transform_reduce over a zip iterator) and prints both sums.
// Fix over the original: the device lambda's parameter type is spelled
// out as thrust::tuple<float,float> instead of 'auto', because generic
// lambda parameters are rejected by older toolchains (the exact errors
// reported downthread on CUDA 7.5 / VS2013).
int main() {
    using namespace std;
    const int M = 256; // rows
    const int N =   4; // columns

    float hA[M][N];
    float hB[M][N];

    // Deterministic (default-seeded) uniform [0,1) random fill.
    mt19937 gen;
    uniform_real_distribution<float> dist;
    auto rand = [&]() { return dist(gen); };
    generate_n(&hA[0][0], M*N, rand);
    generate_n(&hB[0][0], M*N, rand);

    // Reference: sum of squared differences on the CPU.
    float sum = 0.0f;   // float literal avoids a double temporary
    for ( int row = 0; row < M; ++row ) {
        for ( int col = 0; col < N; ++col ) {
            float va = hA[row][col];
            float vb = hB[row][col];
            sum += (va-vb)*(va-vb);
        }
    }
    cout << "sum = " << sum << " (CPU)" << endl;

    thrust::device_vector<float> dA((float*)hA, (float*)hA+M*N);
    thrust::device_vector<float> dB((float*)hB, (float*)hB+M*N);

    // GPU: zip A and B together, square the difference of each pair,
    // then reduce with + starting from 0.
    sum = thrust::transform_reduce(
        thrust::make_zip_iterator(thrust::make_tuple(begin(dA), begin(dB))),
        thrust::make_zip_iterator(thrust::make_tuple(end(dA),   end(dB))),
        [] __device__ (const thrust::tuple<float,float>& ab) {
            float va = thrust::get<0>(ab);
            float vb = thrust::get<1>(ab);
            return (va-vb)*(va-vb);
        },
        0.0f,
        [] __device__ (float x, float y) { return x + y; });
    cout << "sum = " << sum << " (GPU)" << endl;

}
``````

thank you
a wonderful cake :)

sorry episteme,

kernel.cu(44): error: “auto” is not allowed here

kernel.cu(45): error: no instance of overloaded function “thrust::get” matches the argument list
argument types are: ()

kernel.cu(46): error: no instance of overloaded function “thrust::get” matches the argument list
argument types are: ()

3 errors detected in the compilation

watch diabetes.
thrust tends to consume bigger overhead to invoke than simple kernel-call.

3 errors detected in the compilation

oops… this code has been compiled/executed successfully on:
Windows 10(x64), Visual C++ 2015 update 1, CUDA 8.0

you use g++?
I’m not familiar with Linux/g++, but I heard “-std=c++11” needs to be specified on compilation.

i am using also Windows 7(x64), Visual Studio 2013 , CUDA 7.5

try to replace ‘auto&’ with ‘thrust::tuple<float,float>&’

Done
thank you very much.
How can I learn Thrust with CUDA — any recommended books?
And another question:
In your opinion, given a 4*4 array, is it easy to calculate the sigma of the (diagonal elements)^2 and the sigma of the (non-diagonal elements)^2?
finally, thank you again for your cake.

1. I usually refer online docs on http://thrust.github.io/