Hello, I want to use the function __reduce_add_sync() with type float, but the function takes type int as input. I don't know if this is possible; I tried a conversion or cast, but it does not give the right result.
__global__ void KernelTest()
{
    int tid = threadIdx.x;
    __shared__ float v[32];
    v[tid] = tid + 0.1;
    float mySum = __reduce_add_sync(0xffffffff, (int)v[tid]);
}

int main() {
    KernelTest<<<1, 32>>>();
}
float is not supported by __reduce_add_sync() at this time. You won't be able to convert the float to int to get the right answer in the general case, either, since the cast discards the fractional part before the reduction.
You can create a warp reduction that works on float using warp-shuffle operations:
$ cat t59.cu
#include <stdio.h>
__global__ void warpReduce() {
    int laneId = threadIdx.x & 0x1f;
    // Seed the starting value from the lane ID, with a fractional part
    float value = laneId + 0.1f;
    // Use XOR mode to perform a butterfly reduction across the warp
    for (int i = 16; i >= 1; i /= 2)
        value += __shfl_xor_sync(0xffffffff, value, i, 32);
    // "value" now contains the sum across all threads
    printf("Thread %d final value = %f\n", threadIdx.x, value);
}

int main() {
    warpReduce<<< 1, 32 >>>();
    cudaDeviceSynchronize();
    return 0;
}
$ nvcc -o t59 t59.cu
$ cuda-memcheck ./t59
========= CUDA-MEMCHECK
Thread 0 final value = 499.200012
Thread 1 final value = 499.200012
Thread 2 final value = 499.200012
Thread 3 final value = 499.200012
Thread 4 final value = 499.200012
Thread 5 final value = 499.200012
Thread 6 final value = 499.200012
Thread 7 final value = 499.200012
Thread 8 final value = 499.200012
Thread 9 final value = 499.200012
Thread 10 final value = 499.200012
Thread 11 final value = 499.200012
Thread 12 final value = 499.200012
Thread 13 final value = 499.200012
Thread 14 final value = 499.200012
Thread 15 final value = 499.200012
Thread 16 final value = 499.200012
Thread 17 final value = 499.200012
Thread 18 final value = 499.200012
Thread 19 final value = 499.200012
Thread 20 final value = 499.200012
Thread 21 final value = 499.200012
Thread 22 final value = 499.200012
Thread 23 final value = 499.200012
Thread 24 final value = 499.200012
Thread 25 final value = 499.200012
Thread 26 final value = 499.200012
Thread 27 final value = 499.200012
Thread 28 final value = 499.200012
Thread 29 final value = 499.200012
Thread 30 final value = 499.200012
Thread 31 final value = 499.200012
========= ERROR SUMMARY: 0 errors
$
As a bonus, this method does not require cc8.0 or higher, which the reduce intrinsics require.
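Applied to your original kernel, a minimal sketch (keeping your shared-memory setup, and assuming a full 32-thread warp is launched) might look like this:

#include <cstdio>

__global__ void KernelTest()
{
    int tid = threadIdx.x;
    __shared__ float v[32];
    v[tid] = tid + 0.1f;
    // each thread reads back its own element, so no __syncthreads() is needed here
    float mySum = v[tid];
    // butterfly reduction across the warp, as in the example above
    for (int i = 16; i >= 1; i /= 2)
        mySum += __shfl_xor_sync(0xffffffff, mySum, i, 32);
    // every thread in the warp now holds the full sum in mySum
    if (tid == 0) printf("sum = %f\n", mySum);
}

int main() {
    KernelTest<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}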
Depending on what the actual use case is, using __reduce_add_sync() in conjunction with a fixed-point representation (i.e. an integer associated with an implicit scale factor) may be one way to address the lack of float support.
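For example, a minimal sketch of that idea (the kernel name and the scale factor of 1000 are just illustrative assumptions; the values must fit in an int after scaling, and a cc8.0+ GPU is required):

#include <cstdio>

__global__ void fixedPointReduce()
{
    int tid = threadIdx.x;
    float value = tid + 0.1f;
    // convert to fixed point: an integer count of 1/1000 units (assumed scale factor)
    int scaled = (int)roundf(value * 1000.0f);
    // integer warp reduction (requires cc8.0 or higher)
    int sum = __reduce_add_sync(0xffffffff, scaled);
    // convert back to float by undoing the scale factor
    float result = sum / 1000.0f;
    if (tid == 0) printf("sum = %f\n", result);
}

int main() {
    fixedPointReduce<<<1, 32>>>();
    cudaDeviceSynchronize();
    return 0;
}

This has to be compiled for cc8.0 or higher (e.g. nvcc -arch=sm_80), and the scale factor, value range, and rounding behavior need to be chosen to suit the actual data.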