Good afternoon all,
I have been looking at converting some C++ code that I’ve had good success with into a CUDA kernel, to maybe squeeze even more speed out of it. I decided to try my hand at a simple kernel first, a slight tweak on the example found here https://developer.nvidia.com/blog/even-easier-introduction-cuda/.
However, when I try to compile this, nvcc just hangs forever:
I think that it’s not an issue with my code itself, but just in case this is it:
#include <torch/script.h>
using namespace torch;
// Element-wise vector addition: z[i] = x[i] + y[i] for i in [0, N).
//
// Expects a 1-D grid of 1-D blocks. Uses a grid-stride loop so any
// grid/block configuration covers all N elements exactly once.
// (The original declared an unused loop variable `ix`, ignored blockIdx,
// and strided by blockDim.x only — so every block redundantly wrote the
// entire output array.)
__global__
void add_kernel(int N, float x[], float y[], float z[]) {
  // Global thread index and total thread count across the whole grid;
  // widen to int64_t before multiplying to avoid 32-bit overflow.
  int64_t i = (int64_t)blockIdx.x * blockDim.x + threadIdx.x;
  int64_t stride = (int64_t)gridDim.x * blockDim.x;
  for (; i < N; i += stride) {
    z[i] = x[i] + y[i];
  }
}
// Adds two 1-D float32 tensors element-wise on the GPU and returns the sum
// as a CPU tensor.
//
// Preconditions: `a` and `b` are contiguous float32 tensors with at least
// a.size(0) elements each (only the first a.size(0) elements are used).
// Throws (via TORCH_CHECK) if the kernel launch fails.
Tensor add_gpu(Tensor a, Tensor b) {
  int64_t N = a.size(0);
  size_t bytes = N * sizeof(float);

  // torch::empty defaults to float32 on CPU, matching data_ptr<float>().
  Tensor result = torch::empty(N);
  float* h_z = result.data_ptr<float>();

  float* d_a = nullptr;
  float* d_b = nullptr;
  float* d_z = nullptr;
  // NOTE(review): plain cudaMalloc would suffice since we copy explicitly;
  // managed memory is kept to preserve the original behavior.
  cudaMallocManaged(&d_a, bytes);
  cudaMallocManaged(&d_b, bytes);
  cudaMallocManaged(&d_z, bytes);
  cudaMemcpy(d_a, a.data_ptr<float>(), bytes, cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, b.data_ptr<float>(), bytes, cudaMemcpyHostToDevice);

  int64_t block_size = 256;
  // Bug fix: ceil(N / block_size) performed *integer* division before the
  // (no-op) ceil, silently dropping the tail block whenever
  // N % block_size != 0. Use integer ceiling division instead.
  int64_t grid_size = (N + block_size - 1) / block_size;
  add_kernel<<<grid_size, block_size>>>(N, d_a, d_b, d_z);

  // Kernel launches return no status; surface launch-configuration errors
  // here. The blocking cudaMemcpy below synchronizes with the kernel, so
  // d_z is fully written before the host reads it.
  cudaError_t err = cudaGetLastError();
  TORCH_CHECK(err == cudaSuccess,
              "add_kernel launch failed: ", cudaGetErrorString(err));

  cudaMemcpy(h_z, d_z, bytes, cudaMemcpyDeviceToHost);
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_z);
  return result;
}
// Smoke test: x = [0, 1, ..., N-1], y = x + 1, z = x + y on the GPU.
int main(void) {
  int N = 100000;
  // Bug fix: torch::arange with an integer end defaults to an int64 tensor,
  // so data_ptr<float>() inside add_gpu would throw. Request float32.
  torch::Tensor x = torch::arange(N, torch::kFloat);
  torch::Tensor y = x + 1;
  Tensor z = add_gpu(x, y);
  return 0;
}