Hi,
I’ve just migrated to a new laptop and am having lots of problems getting CUDA 7.5 code to work. The new laptop is an Alienware 17 with a mobile GeForce GTX 1080 graphics card. Unfortunately, the current install of CUDA 8.0 does not support this card, and using the laptop’s existing driver did not work with Visual Studio Nsight (you could run but not debug the kernels).
The Dell/Alienware drivers and the latest NVIDIA drivers do not recognise the 1080 card either, so my only way to get a newer version of the driver was to amend an Acer 1080M laptop driver within the latest NVIDIA driver (376.09 mobile) so that the hardware ID matches the device in my machine (PCI\VEN_10DE&DEV_1BE0&SUBSYS_07C21028). This seemed to work, but I still have some problems debugging CUDA code. For example, the following sample crashes with the error “Internal debugger error occurred while attempting to launch _Z12setup_kernelP19curandStateMRG32k3am in CUcontext 0x1c7444bf190, CUmodule 0x1c751a7d7c0:
code patching failed for unknown reason.” This code works fine on my other machine. Other CUDA code seems to debug okay, but this cuRAND code is not happy with the new setup.
Does anyone have ideas on how to make CUDA work on the mobile GTX 1080 chipset?
Many thanks
#include "cuda_runtime.h"
#include "curand_kernel.h"
#include <stdio.h>
#include <stdlib.h>
// Queries (and clears) the CUDA runtime's last recorded error. When an error
// is pending, prints a diagnostic to stderr, resets the device and terminates
// the process with EXIT_FAILURE; otherwise returns without side effects.
//
//   errorMessage  caller-supplied description inserted into the diagnostic
//   file, line    source location to report (normally the call site)
inline void __getLastCudaError(const char *errorMessage, const char *file, const int line)
{
    const cudaError_t status = cudaGetLastError();

    // Nothing pending: leave quietly.
    if (status == cudaSuccess)
    {
        return;
    }

    fprintf(stderr, "%s(%i) : getLastCudaError() CUDA error : %s : (%d) %s.\n",
            file, line, errorMessage, static_cast<int>(status), cudaGetErrorString(status));

    // Reset the device before exiting so the failure is not left half-handled.
    cudaDeviceReset();
    exit(EXIT_FAILURE);
}
// This will output the proper error string when calling cudaGetLastError.
// __FILE__ and __LINE__ are expanded where the macro is used, so the report
// names the location of the failing check rather than this definition.
#define getLastCudaError(msg) __getLastCudaError (msg, __FILE__, __LINE__)
// Initialises one MRG32k3a generator state per thread.
//
// Launch layout: 1D grid of 1D blocks. There is no bounds check, so `state`
// must hold at least gridDim.x * blockDim.x elements.
//
//   state  device array of generator states, one slot per launched thread
//   seed   seed shared by every thread; each thread passes its global index
//          as the subsequence argument and 0 as the offset
__global__ void setup_kernel(curandStateMRG32k3a * state, unsigned long seed)
{
    const unsigned tid = threadIdx.x;
    const unsigned bid = blockIdx.x;
    const unsigned bsz = blockDim.x;

    // Flat global thread index; kept unsigned to match its operands.
    const unsigned index = tid + bid * bsz;

    curand_init(seed, index, 0, &state[index]);
}
// Allocates one MRG32k3a state per launched thread, initialises them on the
// device with setup_kernel, waits for the kernel to finish and releases the
// buffer. Returns EXIT_SUCCESS, or EXIT_FAILURE if a CUDA call reports an error.
int main()
{
    const unsigned minNumSims = 10000;
    const unsigned BLOCK_SIZE = 256; // this is number of threads per block

    // Integer ceil-division: the smallest block count whose total thread
    // count is at least minNumSims.
    const unsigned GRID_SIZE = (minNumSims + BLOCK_SIZE - 1) / BLOCK_SIZE;

    // The kernel has no bounds check, so allocate a state for every thread.
    const size_t numStates = (size_t)BLOCK_SIZE * GRID_SIZE;

    curandStateMRG32k3a *devStates = NULL;
    cudaError_t err = cudaMalloc((void **)&devStates, numStates * sizeof(curandStateMRG32k3a));
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaMalloc failed : (%d) %s.\n", (int)err, cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    setup_kernel <<< GRID_SIZE, BLOCK_SIZE >>> (devStates, 1);
    // Catches launch-time errors (for example an invalid configuration).
    getLastCudaError("rngSetupStates kernel failed.\n");

    // Errors raised while the kernel executes only surface at a synchronising call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaDeviceSynchronize failed : (%d) %s.\n", (int)err, cudaGetErrorString(err));
        cudaFree(devStates);
        return EXIT_FAILURE;
    }

    err = cudaFree(devStates);
    if (err != cudaSuccess)
    {
        fprintf(stderr, "cudaFree failed : (%d) %s.\n", (int)err, cudaGetErrorString(err));
        return EXIT_FAILURE;
    }

    return EXIT_SUCCESS;
}