Hello,
I am compiling some code that runs saxpy on a GPU. The code is from a university assignment that I am trying out. Unfortunately, the build fails at the link step and I don’t know how to debug it.
Here’s the link to the code and the Makefile: https://github.com/stanford-cs149/asst3/tree/master/saxpy
I have edited the saxpy.cu file:
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise; `what` names
// the call that produced `status` so the message points at the culprit.
static bool saxpyCudaCallOk(cudaError_t status, const char* what) {
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "WARNING: %s failed: code=%d, %s\n",
            what, static_cast<int>(status), cudaGetErrorString(status));
    return false;
}

// Runs saxpy_kernel on the GPU for N-element arrays.
//
// Parameters:
//   N           - number of elements in each array; N <= 0 is a no-op.
//   alpha       - scalar forwarded to saxpy_kernel.
//   xarray      - host input array, N floats, copied to the device.
//   yarray      - host input array, N floats, copied to the device.
//   resultarray - host output array, N floats, filled from the device.
//
// Every CUDA runtime call is checked. On the first failure a warning is
// written to stderr, the remaining steps are skipped, no timing figures are
// printed (they would describe a run that did not complete), and all device
// buffers are released before returning.
void saxpyCuda(int N, float alpha, float* xarray, float* yarray, float* resultarray) {
    // Nothing to compute; a zero-block launch would be an invalid
    // configuration and a negative N would turn into a huge size_t below.
    if (N <= 0) {
        return;
    }

    // must read both input arrays (xarray and yarray) and write to
    // output array (resultarray)
    // NOTE(review): stored in an int, so this wraps for very large N;
    // left as-is because GBPerSec's parameter type is not visible here.
    int totalBytes = sizeof(float) * 3 * N;

    // Size of one array in bytes, computed in size_t.
    const size_t bytesPerArray = static_cast<size_t>(N) * sizeof(float);

    // compute number of blocks and threads per block. In this
    // application we've hardcoded thread blocks to contain 512 CUDA
    // threads.
    const int threadsPerBlock = 512;

    // Round up so that there is one thread per array element even when N
    // is not a multiple of threadsPerBlock. Written as quotient plus
    // remainder test so the intermediate value cannot overflow int.
    const int blocks = N / threadsPerBlock + ((N % threadsPerBlock) != 0 ? 1 : 0);

    // These pointers refer to memory allocated *on the GPU* via
    // cudaMalloc. The buffers can be accessed from CUDA device kernel
    // code (see saxpy_kernel()) but not from this CPU thread.
    // Initialised to NULL so the cudaFree calls at the end are safe even
    // when an allocation failed (cudaFree(NULL) performs no operation).
    float* device_x = NULL;
    float* device_y = NULL;
    float* device_result = NULL;

    // allocate device memory buffers on the GPU
    bool ok = saxpyCudaCallOk(cudaMalloc(&device_x, bytesPerArray), "cudaMalloc(device_x)")
           && saxpyCudaCallOk(cudaMalloc(&device_y, bytesPerArray), "cudaMalloc(device_y)")
           && saxpyCudaCallOk(cudaMalloc(&device_result, bytesPerArray), "cudaMalloc(device_result)");

    if (ok) {
        // start timing after allocation of device memory
        double startTime = CycleTimer::currentSeconds();

        // copy input arrays to the GPU
        ok = saxpyCudaCallOk(cudaMemcpy(device_x, xarray, bytesPerArray, cudaMemcpyHostToDevice),
                             "cudaMemcpy(xarray -> device_x)")
          && saxpyCudaCallOk(cudaMemcpy(device_y, yarray, bytesPerArray, cudaMemcpyHostToDevice),
                             "cudaMemcpy(yarray -> device_y)");

        if (ok) {
            // run CUDA kernel. (notice the <<< >>> brackets indicating a
            // CUDA kernel launch) Execution on the GPU occurs here.
            double startRunTime = CycleTimer::currentSeconds();
            saxpy_kernel<<<blocks, threadsPerBlock>>>(N, alpha, device_x, device_y, device_result);

            // A launch does not return a status itself: cudaGetLastError
            // surfaces launch-configuration errors, and the synchronize
            // surfaces errors raised while the kernel was executing.
            ok = saxpyCudaCallOk(cudaGetLastError(), "saxpy_kernel launch")
              && saxpyCudaCallOk(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
            double endRunTime = CycleTimer::currentSeconds();

            if (ok) {
                printf("RunTime: %.3f ms\n", 1000.0f*(endRunTime - startRunTime));
            }
        }

        if (ok) {
            // copy result from GPU back to CPU
            ok = saxpyCudaCallOk(cudaMemcpy(resultarray, device_result, bytesPerArray, cudaMemcpyDeviceToHost),
                                 "cudaMemcpy(device_result -> resultarray)");
        }

        // end timing after result has been copied back into host memory
        double endTime = CycleTimer::currentSeconds();

        if (ok) {
            double overallDuration = endTime - startTime;
            printf("Effective BW by CUDA saxpy: %.3f ms\t\t[%.3f GB/s]\n", 1000.f * overallDuration, GBPerSec(totalBytes, overallDuration));
        }
    }

    // free memory buffers on the GPU (reached on both success and failure)
    saxpyCudaCallOk(cudaFree(device_x), "cudaFree(device_x)");
    saxpyCudaCallOk(cudaFree(device_y), "cudaFree(device_y)");
    saxpyCudaCallOk(cudaFree(device_result), "cudaFree(device_result)");
}
The compilation error output:
mkdir -p objs/
g++ -m64 -O3 -Wall -o cudaSaxpy objs/main.o objs/saxpy.o -L/usr/local/cuda/lib64/ -lcudart
objs/saxpy.o: In function `saxpy_kernel(int, float, float*, float*, float*)':
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x4a): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x80): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x98): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0xb0): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0xc8): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0xd6): undefined reference to `cudaLaunch'
objs/saxpy.o: In function `saxpyCuda(int, float, float*, float*, float*)':
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x276): undefined reference to `cudaConfigureCall'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x2ba): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x4c0): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x4dc): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x4f8): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x514): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x526): undefined reference to `cudaLaunch'
objs/saxpy.o: In function `__device_stub__Z12saxpy_kernelifPfS_S_(int, float, float*, float*, float*)':
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x6b9): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x6e0): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x6f8): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x710): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x726): undefined reference to `cudaSetupArgument'
tmpxft_0000320c_00000000-4_saxpy.cudafe1.cpp:(.text+0x734): undefined reference to `cudaLaunch'
collect2: error: ld returned 1 exit status
Makefile:42: recipe for target 'cudaSaxpy' failed
make: *** [cudaSaxpy] Error 1
I have also tried the simple saxpy example from this blog: https://devblogs.nvidia.com/easy-introduction-cuda-c-and-c/ and it works fine.
My machine specifications:
OS: Ubuntu 16.04
CUDA: 10.1
Driver: 418.56
GPU: GTX 1080 TI
NVCC: 7.5.17
gcc: 5.4