Can't compile simple test CUDA kernel

Good afternoon all,

I have been looking at converting some C++ code that I've had good success with into a CUDA kernel, to see if I can squeeze even more speed out of it. I decided to try my hand at a simple kernel first: a slight tweak on the example from https://developer.nvidia.com/blog/even-easier-introduction-cuda/.

However, when I try to compile this, nvcc just hangs forever.
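
For reference, I'm invoking nvcc along these lines (the libtorch paths and file name below are placeholders, so this is just the shape of the command rather than the exact one):

nvcc -std=c++17 \
    -I/path/to/libtorch/include \
    -I/path/to/libtorch/include/torch/csrc/api/include \
    -L/path/to/libtorch/lib -ltorch -ltorch_cpu -lc10 \
    add.cu -o add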

I don't think the issue is with the code itself, but just in case, here it is:

#include <torch/script.h>

using namespace torch;

__global__
void add_kernel(int64_t N, const float* x, const float* y, float* z) {

    // Grid-stride loop: start at this thread's global index and step by the
    // total number of threads in the grid, so every element is covered
    // regardless of how many blocks are launched.
    int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (; i < N; i += stride) {
        z[i] = x[i] + y[i];
    }
}

Tensor add_gpu(Tensor a, Tensor b) {

    int64_t N = a.size(0);
    size_t bytes = N * sizeof(float);

    Tensor result = torch::empty(N);

    float* d_a;
    float* d_b;    
    float* d_z;
    float* h_z = result.data_ptr<float>();
    cudaMallocManaged(&d_a, bytes);
    cudaMallocManaged(&d_b, bytes);
    cudaMallocManaged(&d_z, bytes);

    cudaMemcpy(d_a, a.data_ptr<float>(), bytes, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b.data_ptr<float>(), bytes, cudaMemcpyHostToDevice);

    int64_t block_size = 256;
    // Round up so all N elements are covered even when N is not a multiple
    // of block_size (plain integer division would truncate before ceil ran).
    int64_t grid_size = (N + block_size - 1) / block_size;

    add_kernel<<<grid_size, block_size>>>(N, d_a, d_b, d_z);
    cudaMemcpy(h_z, d_z, bytes, cudaMemcpyDeviceToHost);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_z);

    return result;
}

int main(void) {
    int N = 100000;
    // arange with an integer end yields an int64 tensor by default, so ask
    // for float32 explicitly to match the data_ptr<float>() calls in add_gpu.
    torch::Tensor x = torch::arange(N, torch::kFloat32);
    torch::Tensor y = x + 1;
    Tensor z = add_gpu(x, y);

    return 0;
}
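
Has anyone run into nvcc hanging like this, or is there something in my setup that would explain it? Any pointers would be appreciated.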