Hi,
The sample code below calls powf() on both the host and the device. I suspect the results differ because the device powf() function has a maximum error of 8 ULP.
Is there a way to produce the same results on the device? (Fractional exponents are required.)
Thank you for your time.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
// Element-wise power kernel: b[k] = powf(a[k], 1.67f) for every k in [0, size).
// Each thread handles at most one element, selected by its flat 1D global
// index; threads whose index is at or beyond `size` return without touching
// memory.
//   a    - input values (read)
//   b    - output values (written)
//   size - number of elements to process
__global__ void powKernel(float *a, float *b, const int size)
{
    const int idx = threadIdx.x + blockDim.x * blockIdx.x;
    if (idx >= size) {
        return;  // surplus thread in the last block
    }
    const float base = a[idx];
    b[idx] = powf(base, 1.67f);
}
// Prints a diagnostic on stderr when a CUDA runtime call did not succeed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Fills an array with random values in [0, 100], computes powf(x, 1.67f) for
// each value on the host and on the device, and prints every element whose
// host and device results are not bit-identical, followed by a summary line.
// Returns 0 when the whole pipeline ran, 1 on any allocation or CUDA failure.
int main()
{
    const int size = 10000;
    const size_t bytes = size * sizeof(float);
    const int threadsPerBlock = 1024;
    // Ceil-division: just enough blocks to cover `size` elements.
    const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;

    // Everything is declared (and nulled) up front so that `goto cleanup`
    // never jumps over an initialization and cleanup can run unconditionally.
    float *h_a = NULL, *h_b = NULL, *h_c = NULL;
    float *d_a = NULL, *d_b = NULL;
    int count = 0;
    int exitCode = 1;

    if (cudaSetDevice(0) != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?\n");
        return 1;
    }

    h_a = (float *)malloc(bytes);
    h_b = (float *)malloc(bytes);
    h_c = (float *)malloc(bytes);
    if (h_a == NULL || h_b == NULL || h_c == NULL) {
        fprintf(stderr, "host allocation failed!\n");
        goto cleanup;
    }

    // Random inputs plus the host-side reference results.
    srand((unsigned int)time(NULL));
    for (int i = 0; i < size; i++) {
        h_a[i] = ((float)rand() / (float)RAND_MAX) * 100.0f;
        h_b[i] = powf(h_a[i], 1.67f);
    }

    if (!cudaOk(cudaMalloc(&d_a, bytes), "cudaMalloc(d_a)")) goto cleanup;
    if (!cudaOk(cudaMalloc(&d_b, bytes), "cudaMalloc(d_b)")) goto cleanup;
    if (!cudaOk(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice),
                "cudaMemcpy host->device")) goto cleanup;

    powKernel<<<blocks, threadsPerBlock>>>(d_a, d_b, size);
    // A launch does not return a status; launch-configuration errors are
    // reported through cudaGetLastError().
    if (!cudaOk(cudaGetLastError(), "powKernel launch")) goto cleanup;

    // Checking this copy's status matters: without it a failed kernel would
    // leave h_c uninitialized and the comparison below would be meaningless.
    if (!cudaOk(cudaMemcpy(h_c, d_b, bytes, cudaMemcpyDeviceToHost),
                "cudaMemcpy device->host")) goto cleanup;

    // Deliberately an exact comparison: the purpose of this program is to
    // expose host/device powf() results that are not bit-identical.
    for (int i = 0; i < size; i++) {
        if (h_b[i] != h_c[i]) {
            printf("%f vs %f\n", h_b[i], h_c[i]);
            count++;
        }
    }
    printf("Total: %d, wrong: %d\n", size, count);
    exitCode = 0;

cleanup:
    // Device buffers must be released BEFORE cudaDeviceReset(); the reset
    // destroys all allocations, so freeing afterwards would act on stale
    // pointers. cudaFree(NULL) and free(NULL) are no-ops.
    cudaFree(d_a);
    cudaFree(d_b);
    free(h_a);
    free(h_b);
    free(h_c);

    if (cudaDeviceReset() != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!\n");
        exitCode = 1;
    }
    return exitCode;
}