ptx assembly in cuda for calculating square root

In my cuda project, I have used a math.h function sqrt to find the square root of a number which is supposed to be slow so while searching in the web to increase speed I found to use the assembly language from here

Here in visual studio I tried to implement simple program to find square root as

#include <stdio.h>
#include <cuda.h>
#include "math.h"

// Kernel that executes on the CUDA device
__global__ void square_array(float *a, int N)
{
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx<N) a[idx] =sqrt14(a[idx]) ;
}
__device__ double inline __declspec (naked) __fastcall sqrt14(double n)
    {
        _asm fld qword ptr [esp+4]
        _asm fsqrt
        _asm ret 8
    }

// main routine that executes on the host
int main(void)
{
  float *a_h, *a_d;  // Pointer to host & device arrays
  const int N = 10;  // Number of elements in arrays
  size_t size = N * sizeof(float);
  a_h = (float *)malloc(size);        // Allocate array on host
  cudaMalloc((void **) &a_d, size);   // Allocate array on device
  // Initialize host array and copy it to CUDA device
  for (int i=0; i<N; i++) a_h[i] = (float)i;
  cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);
  // Do calculation on device:
  int block_size = 4;
  int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);
  square_array <<< n_blocks, block_size >>> (a_d, N);
  // Retrieve result from device and store it in host array
  cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);
  // Print results
  for (int i=0; i<N; i++) printf("%d %f\n", i, a_h[i]);
  // Cleanup
  free(a_h); cudaFree(a_d);
}

since asm is not supported we need PTX of the asm . what could be the equivalent PTx of the above asm??

No need for assembly, just use the sqrt() and sqrtf() functions. See also the CUDA programming guide.