I have two dynamic arrays A and B each of size 1024:
I want to calculate (A-B)^2. I have written a code to calculate this problem using the kernel call : sumdiffsquare<<<1, dim3(8,4)>>>
// Element-wise squared difference plus reduction:
//   C[k] = (A[k] - pop[k])^2 for every k in [0, wA*hA), then C[0] is
//   overwritten with the sum of all squared differences (the original
//   in-place reduction intended the same final semantics).
//
// Preconditions:
//  - Launch with a SINGLE block (e.g. sumdiffsquare<<<1, dim3(8,4)>>>):
//    __syncthreads() only synchronizes threads of one block, so a
//    multi-block launch cannot perform this reduction correctly.
//  - blockDim.x * blockDim.y <= 1024 (size of the shared partial buffer).
__global__ void
sumdiffsquare(float* C, float* A, float* pop, int wA, int hA)
{
    __shared__ float partial[1024];   // one partial sum per thread

    const int n        = wA * hA;     // total number of elements
    const int tid      = threadIdx.y * blockDim.x + threadIdx.x;
    const int nthreads = blockDim.x * blockDim.y;

    // Block-stride loop: each thread processes several elements, so any
    // block shape covers all n elements and every access is bounds-checked.
    // (The original only touched as many elements as there were threads
    // and read C[... + stride] out of bounds during its reduction.)
    float local = 0.0f;
    for (int k = tid; k < n; k += nthreads) {
        float d = A[k] - pop[k];
        C[k] = d * d;                 // keep the element-wise result
        local += d * d;
    }

    partial[tid] = local;
    __syncthreads();                  // all partials written before reducing

    // Tree reduction over the per-thread partials in shared memory.
    // The "tid + stride < nthreads" guard keeps it correct even for
    // non-power-of-two block sizes.
    for (int stride = 1; stride < nthreads; stride *= 2) {
        if ((tid % (2 * stride)) == 0 && tid + stride < nthreads)
            partial[tid] += partial[tid + stride];
        __syncthreads();              // barrier outside the divergent branch
    }

    if (tid == 0)
        C[0] = partial[0];            // publish the total
}
Is there any easy method to perform this calculation like thrust or using the kernel call sumdiffsquare<<< 2, dim3(4,4)>>>
note that wA=256 and hA=4
I’ve tried to calc. C = (A-B)^2 using Thrust:
// DO NOT FORGET OPTION: --expt-extended-lambda
#include <cuda_runtime.h>
#include <thrust/transform.h>
#include <thrust/device_ptr.h>
#include <algorithm>
#include <random>
#include <iostream>
int main() {
    using namespace std;
    const int M = 256; // rows
    const int N = 4;   // columns
    float hA[M][N];
    float hB[M][N];
    float hC[M][N];

    // Deterministic input: default-seeded mt19937, uniform floats in [0,1).
    mt19937 gen;
    uniform_real_distribution<float> dist;
    auto rand = [&]() { return dist(gen); };
    // fill hA and hB with random values
    generate_n(&hA[0][0], M*N, rand);
    generate_n(&hB[0][0], M*N, rand);

    // CPU reference: C = (A-B)^2, element-wise.
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float va = hA[row][col];
            float vb = hB[row][col];
            hC[row][col] = (va-vb)*(va-vb);
        }
    }

    // Report CUDA API failures instead of silently ignoring return codes.
    auto check = [](cudaError_t status, const char* what) {
        if (status != cudaSuccess)
            cerr << what << ": " << cudaGetErrorString(status) << endl;
    };

    // Plain float* is simpler than pointer-to-2D-array for a linear buffer.
    float* dA = nullptr;
    float* dB = nullptr;
    float* dC = nullptr;
    check(cudaMalloc(&dA, M*N*sizeof(float)), "cudaMalloc dA");
    check(cudaMalloc(&dB, M*N*sizeof(float)), "cudaMalloc dB");
    check(cudaMalloc(&dC, M*N*sizeof(float)), "cudaMalloc dC");
    check(cudaMemcpy(dA, hA, M*N*sizeof(float), cudaMemcpyDefault), "cudaMemcpy hA->dA");
    check(cudaMemcpy(dB, hB, M*N*sizeof(float), cudaMemcpyDefault), "cudaMemcpy hB->dB");

    thrust::device_ptr<float> pA(dA);
    thrust::device_ptr<float> pB(dB);
    thrust::device_ptr<float> pC(dC);
    // C = (A-B)^2 on GPU using thrust::transform
    thrust::transform(pA, pA+M*N, pB, pC,
        [] __device__ (float va, float vb) { return (va-vb)*(va-vb); });

    float CC[M][N]; // copy of dC
    // cudaMemcpy from device blocks until the transform has finished.
    check(cudaMemcpy(&CC[0][0], dC, M*N*sizeof(float), cudaMemcpyDefault), "cudaMemcpy dC->CC");

    // Compare CPU & GPU results with a small absolute tolerance —
    // bit-exact float equality across host and device is not guaranteed.
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float vh = hC[row][col];
            float vd = CC[row][col];
            float diff = vh - vd;
            if (diff < 0.0f) diff = -diff;
            if (diff > 1e-6f)
                cout << "oops! " << vh << " != " << vd
                     << " at " << row << "," << col << endl;
        }
    }

    check(cudaFree(dA), "cudaFree dA");
    check(cudaFree(dB), "cudaFree dB");
    check(cudaFree(dC), "cudaFree dC");
    cudaDeviceReset();
}
Ha, I was about to post a solution then I see that episteme did the same thing XD
I have to ask though, why on God’s green Earth are you using that much vanilla CUDA? Thrust’s built-in vectors are perfectly capable of this and make the code actually palatable. CUDA C is just like cigarettes, it eventually gives you cancer.
I see.
I don’t wanna explain how to use thrust::device_vector. that’s not my purpose.
nicotine-free version.
// DO NOT FORGET OPTION: --expt-extended-lambda
#include <thrust/device_vector.h>
#include <thrust/transform.h>
#include <thrust/copy.h>
#include <algorithm>
#include <random>
#include <iostream>
int main() {
    using namespace std;
    const int M = 256; // rows
    const int N = 4;   // columns
    float hA[M][N];
    float hB[M][N];
    float hC[M][N];

    // Deterministic input: default-seeded mt19937, uniform floats in [0,1).
    mt19937 gen;
    uniform_real_distribution<float> dist;
    auto rand = [&]() { return dist(gen); };
    // fill hA and hB with random values
    generate_n(&hA[0][0], M*N, rand);
    generate_n(&hB[0][0], M*N, rand);

    // CPU reference: C = (A-B)^2, element-wise.
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float va = hA[row][col];
            float vb = hB[row][col];
            hC[row][col] = (va-vb)*(va-vb);
        }
    }

    // device_vector's range constructor performs the host->device copy,
    // and its destructor frees the device memory (no cudaMalloc/cudaFree).
    thrust::device_vector<float> dA((float*)hA, (float*)hA + M*N);
    thrust::device_vector<float> dB((float*)hB, (float*)hB + M*N);
    thrust::device_vector<float> dC(M*N);

    // C = (A-B)^2 on GPU using thrust::transform
    thrust::transform(begin(dA), end(dA), begin(dB), begin(dC),
        [] __device__ (float va, float vb) { return (va-vb)*(va-vb); });

    float CC[M][N]; // copy of dC
    thrust::copy(begin(dC), end(dC), (float*)&CC);

    // Compare CPU & GPU results with a small absolute tolerance —
    // bit-exact float equality across host and device is not guaranteed.
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float vh = hC[row][col];
            float vd = CC[row][col];
            float diff = vh - vd;
            if (diff < 0.0f) diff = -diff;
            if (diff > 1e-6f)
                cout << "oops! " << vh << " != " << vd
                     << " at " << row << "," << col << endl;
        }
    }
}
thank you all.
Does Thrust have the ability to do any kind of calculation?
Does Thrust have the ability to do any kind of calculation?
depends on memory-layout.
if the data can be treated as linear/contiguous(1D), almost any calc.
This solution calculates the expression, but how do I add the elements of the result array together into one element?
Let's say the input is a (256*4) array that needs to be reduced to a single element by adding all of the elements.
piece of cake :)
// DO NOT FORGET OPTION: --expt-extended-lambda
#include <thrust/device_vector.h>
#include <thrust/transform_reduce.h>
#include <algorithm>
#include <random>
#include <iostream>
int main() {
    using namespace std;
    const int M = 256; // rows
    const int N = 4;   // columns
    float hA[M][N];
    float hB[M][N];

    // Deterministic input: default-seeded mt19937, uniform floats in [0,1).
    mt19937 gen;
    uniform_real_distribution<float> dist;
    auto rand = [&]() { return dist(gen); };
    // fill hA and hB with random values
    generate_n(&hA[0][0], M*N, rand);
    generate_n(&hB[0][0], M*N, rand);

    // sum = sigma((A-B)^2) on CPU as a reference.
    float sum = 0.0f;   // float literal: avoid a silent double conversion
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col) {
            float va = hA[row][col];
            float vb = hB[row][col];
            sum += (va-vb)*(va-vb);
        }
    }
    cout << "sum = " << sum << " (CPU)" << endl;

    thrust::device_vector<float> dA((float*)hA, (float*)hA + M*N);
    thrust::device_vector<float> dB((float*)hB, (float*)hB + M*N);

    // sum = sigma((A-B)^2) on GPU using thrust::transform_reduce over a zip
    // iterator of (A, B) pairs.
    //
    // NOTE: the unary functor spells out the tuple type instead of using
    // 'auto' — generic lambdas in device code are rejected by older
    // toolchains (e.g. CUDA 7.5 / VS2013: "auto" is not allowed here).
    sum = thrust::transform_reduce(
        thrust::make_zip_iterator(thrust::make_tuple(begin(dA), begin(dB))),
        thrust::make_zip_iterator(thrust::make_tuple(end(dA),   end(dB))),
        [] __device__ (const thrust::tuple<float, float>& ab) {
            float va = thrust::get<0>(ab);
            float vb = thrust::get<1>(ab);
            return (va-vb)*(va-vb);
        },
        0.0f,                                             // identity for +
        [] __device__ (float x, float y) { return x + y; });
    cout << "sum = " << sum << " (GPU)" << endl;
}
thank you
a wonderful cake :)
sorry episteme,
kernel.cu(44): error: “auto” is not allowed here
kernel.cu(45): error: no instance of overloaded function “thrust::get” matches the argument list
argument types are: ()
kernel.cu(46): error: no instance of overloaded function “thrust::get” matches the argument list
argument types are: ()
3 errors detected in the compilation
watch diabetes.
thrust tends to consume bigger overhead to invoke than simple kernel-call.
3 errors detected in the compilation
oops… this code has been compiled/executed successfully on:
Windows 10(x64), Visual C++ 2015 update 1, CUDA 8.0
you use g++?
I’m not familiar with Linux/g++, but I heard “-std=c++11” needs to be specified on compilation.
i am using also Windows 7(x64), Visual Studio 2013 , CUDA 7.5
try to replace ‘auto&’ with ‘thrust::tuple<float,float>&’
Done
thank you very much.
How can I learn Thrust with CUDA? Any recommended books?
and another question
In your opinion, if I have a 4*4 array, is it easy to calculate the sigma of (diagonal elements)^2 and the sigma of (non-diagonal elements)^2?
finally, thank you again for your cake.
-
I usually refer online docs on [url]http://thrust.github.io/[/url]
-
how about to prepare ‘mask’ as below then calc (A*MASK)^2?
1 0 0 0
0 1 0 0
0 0 1 0
0 0 0 1
however CUDA is not suitable for such small task. better to calc on CPU I think.