I thought nvcc would deal with **__powf(x,2)** by compile it to assembly that equals to **x * x**

But here is my test code

```
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>
__global__ void pow_test(float *input, float *output)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
output[idx] = __powf(input[idx], 2);
}
__global__ void mul_test(float *input, float *output)
{
int idx = threadIdx.x + blockDim.x * blockIdx.x;
output[idx] = input[idx] * input[idx];
}
int main()
{
int M;
std::cin >> M;
float *a;
cudaMallocManaged((void **)&a, M * sizeof(float));
float *b;
cudaMallocManaged((void **)&b, M * sizeof(float));
float *c;
cudaMallocManaged((void **)&c, M * sizeof(float));
for(int i = 0; i < M; i++)
{
a[i] = rand() % 11;
}
int b_num = (M + 1023) / 1024;
cudaEvent_t start, mid, stop;
cudaEventCreate(&start);
cudaEventCreate(&mid);
cudaEventCreate(&stop);
cudaEventRecord(start);
pow_test<<<b_num, 1024>>>(a, b);
cudaDeviceSynchronize();
cudaEventRecord(mid);
mul_test<<<b_num, 1024>>>(a, c);
cudaDeviceSynchronize();
cudaEventRecord(stop);
cudaEventSynchronize(stop);
float u_t;
cudaEventElapsedTime(&u_t, start, mid);
printf("pow_time:%g\n", u_t);
cudaEventElapsedTime(&u_t, mid, stop);
printf("mul_time:%g\n", u_t);
float max = - 100;
for(int i = 0; i < M; i++)
{
max = (max < (b[i] - c[i])) ? (b[i] - c[i]) : max;
}
printf("%f\n", max);
return 0;
}
```

And here is a running test

```
$ ./test
10000
pow_time:0.254016
mul_time:0.006464
0.000004
```

So is my test right? In fact, the nvcc wouldn’t treat __powf(x,2) as x * x, right?

If my test is right, I wonder why this would happen?

Because I thought this pow(x,2) compile optimization is not a difficult thing.