I am attempting to execute division with two complex numbers such that a reduction is employed via CUDA CUB. However, the value I get back from the computation using CUB does not match that computed on HOST.
I am hoping it is something straight forward that I am just missing due to my inexperience with CUB. Can anyone tell me what I am doing wrong?
The code is copied below:
#include <stdio.h>
#include <cuda.h>
#include <cuComplex.h>
#include "cub/cub.cuh" // assumes in same directory
struct ComplexDiv{
__device__ __forceinline__
cuComplex operator()(cuComplex &a, cuComplex &b){
cuComplex tmp;
tmp.x = (a.x*b.x + a.y*b.y) / (b.x*b.x + b.y*b.y);
tmp.y = (a.y*b.x - a.x*b.y) / (b.x*b.x + b.y*b.y);
a.x = tmp.x;
a.y = tmp.y;
return a;
}
};
int main(int argc, char **argv){
unsigned int N = 123;
unsigned int r = N % 2;
unsigned int L = (N - r) / 2;
cuComplex *h_a, h_b;
cuComplex *d_a, *d_b;
cuComplex chk;
h_a = (cuComplex*)malloc(2*L*sizeof(cuComplex));
cudaMalloc((void**)&d_a, 2*L*sizeof(cuComplex));
cudaMalloc((void**)&d_b, sizeof(cuComplex));
// initialize with some "dummy data"
for(unsigned int i = 0; i < 2*L; ++i){
h_a[i].x = (float)(i + 2.3);
h_a[i].y = (float)(i + 1);
}
// copy to DEVICE
cudaMemcpy(d_a, h_a, 2*L*sizeof(cuComplex), cudaMemcpyHostToDevice);
h_b.x = 1.0f;
h_b.y = 0.0f;
// compute on HOST
for(i = 0; i < 2*L; i++){
cuComplex tmp;
tmp.x = (h_b.x*h_a.x + h_b.y*h_a.y) / (h_a.x*h_a.x + h_a.y*h_a.y);
tmp.y = (h_b.y*h_a.x - h_b.x*h_a.y) / (h_a.x*h_a.x + h_a.y*h_a.y);
h_b.x = tmp.x;
h_b.y = tmp.y;
}
// compute on DEVICE using CUB
ComplexDiv div;
cuComplex init;
void *tmpStorageDiv = NULL;
size_t tmpStorageBytesDiv = 0;
init.x = 1.0f;
init.y = 0.0f;
cub::DeviceReduce::Reduce(tmpStorageDiv, tmpStorageBytesDiv, d_a, d_b, 2*L, div, init);
cudaMalloc(&tmpStorageDiv, tmpStorageBytesDiv);
cub::DeviceReduce::Reduce(tmpStorageDiv, tmpStorageBytesDiv, d_a, d_b, 2*L, div, init);
cudaFree(&tmpStorageDiv);
// copy result from DEVICE to HOST
cudaMemcpy(&chk, d_b, sizeof(cuComplex), cudaMemcpyDeviceToHost);
// Problem: HOST and DEVICE values do not match
printf("\nHOST: (%0.4f, %0.4f), DEVICE: (%0.4f, %0.4f)\n", h_b.x h_b.y, chk.x, chk.y);
free(h_a);
cudaFree(d_a);
cudaFree(d_b);
return 0;
}
After doing some more investigation, I noticed unexpected results for the addition portion of the custom complex division functor which didn’t occur in HOST.
I modified the original code such that random values are generated for h_a and the N value was set to 5 so I could see what occurs on a small sample size. It is really weird that the custom CUB functor will output non-zeros whereas the HOST-side version does not - perhaps CUB is doing some calculations behind the scenes I am unaware of? Any ideas/hints would be greatly appreciated.
The below is code snippet that is modification from original posted code:
...
// called by CUDA CUB as a custom functor, same as in above (original) code
struct ComplexDiv{
__device__ __forceinline__
cuComplex operator()(cuComplex &a, cuComplex &b){
float x = a.y * b.y;
// The following outputs non-zero(s) when zero(s) are expected - weird?
if(x != 0.0f) printf("DEVICE: %0.4f\n", x);
cuComplex tmp;
tmp.x = (a.x*bx + a.y*b.y); // a.x*b.x is same as HOST, adding '+ a.y*b.y' yields different from HOST
tmp.y = 0.0f;
a.x = tmp.x;
a.y = tmp.y;
return a;
/*
cuComplex tmp;
tmp.x = (a.x*b.x + a.y*b.y) / (b.x*b.x + b.y*b.y);
tmp.y = (a.y*b.x - a.x*b.y) / (b.x*b.x + b.y*b.y);
a.x = tmp.x;
a.y = tmp.y;
return a;
*/
}
};
...
// compute on HOST
for(i = 0; i < 2*L; i++){
float x = h_b.y * h_a.y;
// Never outputs from HOST
if(x != 0.0f) printf("HOST: %0.4f\n", x);
cuComplex tmp;
tmp.x = (h_b.x*h_a.x + h_b.y*h_a.y);
tmp.y = 0.0f;
h_b.x = tmp.x;
h_b.y = tmp.y;
/*
cuComplex tmp;
tmp.x = (h_b.x*h_a.x + h_b.y*h_a.y) / (h_a.x*h_a.x + h_a.y*h_a.y);
tmp.y = (h_b.y*h_a.x - h_b.x*h_a.y) / (h_a.x*h_a.x + h_a.y*h_a.y);
h_b.x = tmp.x;
h_b.y = tmp.y;
*/
}
...
Does anyone have experience using a custom functor as part of CUB DeviceReduce, such that complex number division was employed? Or this this completely unexplored territory?
Does the code in the original posting look right as far as complex division using a custom functor? I seem to get different values for HOST and DEVICE (that uses a CUB DeviceReduce).
I can’t seem to find why the values wouldn’t match.
I’m not sure in what way a division operation could be a sensible reduction operation. I know what it means to add a set of numbers together. I know what it means to multiply a set of numbers together. I don’t know what it means to divide a set of numbers (together).
I don’t know what this means, arithmetically (or any other way):
“I am attempting to execute division with two complex numbers such that a reduction is employed via CUDA CUB.”
I don’t understand what you are doing. But to go beyond that, a parallel reduction, in my experience, requires an operator that is commutative. In fact, wikipedia states this right up front:
In a parallel reduction, the general assumption is that there is no ordering of operations. That is, the operations can be carried out in any order, and you must guarantee that the code is correct under that proviso.
Yes, the fact that division is not commutative is likely why the HOST and DEVICE results don’t match when employing the CUB DeviceReduce operation. My attempt at using a reduction in this case was incorrect.
I am trying to parallelize a for loop from C code using CUDA that does the following:
#include <stdio.h>
#include <cuda.h>
#include <cuComplex.h>
int main(){
cuComplex *array;
cuComplex val;
const unsigned int N = 128;
val.x = 1.0f; val.y = 1.5f;
array = (cuComplex*)malloc(N*size of(cuComplex));
// initialize with dummy data
for(unsigned int i = 0; i < N; i++){
array[i].x = (float) i + 1.0f;
array[i].y = 0.1f + i;
}
// This is what I would like to create as a CUDA kernel if possible:
for(unsigned int i = 0; i < N; i++){
val = val / array[i];
}
return 0;
}
Any hints or help for how something like the above C code could be created using a CUDA kernel would be great.
Again, thank you txtbob for all your patient assistance.