I don’t have a jetson device to test on.
Here is what I see on Tesla V100:
$ cat t6.cu
#include <stdio.h>
#include <cufft.h>
void checkStatus(int status, const char *activity)
{
// If there is no error then just return
if (status == 0)
{
return;
}
// Otherwise print the error and exit the application
printf("CUDA Error %d performing %s\n", status, activity);
exit(-1);
}
__global__ void window(int n, float *input, float *output, float *window, int fftSize)
{
// !!!! no check against n so expect this to reveal a memory access error
int index = blockIdx.x * blockDim.x + threadIdx.x;
output[index] = input[index] * window[index % fftSize];
}
cudaStream_t testStream = 0;
int main()
{
cudaStreamCreate(&testStream);
float *fftInputCUDA;
float *fftInputWindowedCUDA;
float *windowCUDA;
cufftHandle plan;
int size = 64;
int numChannels = 1;
// With these next couple of lines commented out we see the expected errors
// With these not commented out the errors are silent
#ifdef USE_BUG
checkStatus(cufftPlan1d(&plan, size, CUFFT_R2C, numChannels), "fftInit");
checkStatus(cufftSetStream(plan, testStream),"fftInit");
#endif
checkStatus(cudaMalloc((void **)&fftInputCUDA, sizeof(float) * numChannels * size), "fftInit");
checkStatus(cudaMalloc((void **)&fftInputWindowedCUDA, sizeof(float) * numChannels * size), "fftInit");
checkStatus(cudaMalloc(&windowCUDA, size * sizeof(float)), "fftInit");
int blockSize = 128;
int numBlocks = ((size * numChannels) + (blockSize - 1)) / blockSize;
window<<<numBlocks, blockSize, 0, testStream>>>(size * numChannels, (float *)fftInputCUDA, (float *)fftInputWindowedCUDA, (float *)windowCUDA, size);
checkStatus(cudaPeekAtLastError(), "fft");
checkStatus(cudaStreamSynchronize(testStream), "Test");
}
$ nvcc -o t6 t6.cu
t6.cu(32): warning: variable "plan" was declared but never referenced
$ cuda-memcheck ./t6
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 4
========= at 0x000001d0 in window(int, float*, float*, float*, int)
========= by thread (95,0,0) in block (0,0,0)
========= Address 0x7fdc7c20017c is out of bounds
========= Device Frame:window(int, float*, float*, float*, int) (window(int, float*, float*, float*, int) : 0x1d0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x346) [0x2af0b6]
========= Host Frame:./t6 [0x10649]
========= Host Frame:./t6 [0x106d7]
========= Host Frame:./t6 [0x46a35]
========= Host Frame:./t6 [0x3820]
========= Host Frame:./t6 [0x36b1]
========= Host Frame:./t6 [0x36ea]
========= Host Frame:./t6 [0x34ce]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21b15]
========= Host Frame:./t6 [0x3259]
=========
========= Invalid __global__ read of size 4
========= at 0x000001d0 in window(int, float*, float*, float*, int)
========= by thread (94,0,0) in block (0,0,0)
========= Address 0x7fdc7c200178 is out of bounds
========= Device Frame:window(int, float*, float*, float*, int) (window(int, float*, float*, float*, int) : 0x1d0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x346) [0x2af0b6]
========= Host Frame:./t6 [0x10649]
========= Host Frame:./t6 [0x106d7]
========= Host Frame:./t6 [0x46a35]
========= Host Frame:./t6 [0x3820]
========= Host Frame:./t6 [0x36b1]
========= Host Frame:./t6 [0x36ea]
========= Host Frame:./t6 [0x34ce]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21b15]
========= Host Frame:./t6 [0x3259]
=========
========= Invalid __global__ read of size 4
========= at 0x000001d0 in window(int, float*, float*, float*, int)
========= by thread (93,0,0) in block (0,0,0)
========= Address 0x7fdc7c200174 is out of bounds
========= Device Frame:window(int, float*, float*, float*, int) (window(int, float*, float*, float*, int) : 0x1d0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x346) [0x2af0b6]
========= Host Frame:./t6 [0x10649]
========= Host Frame:./t6 [0x106d7]
========= Host Frame:./t6 [0x46a35]
========= Host Frame:./t6 [0x3820]
========= Host Frame:./t6 [0x36b1]
========= Host Frame:./t6 [0x36ea]
========= Host Frame:./t6 [0x34ce]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21b15]
========= Host Frame:./t6 [0x3259]
=========
========= Invalid __global__ read of size 4
========= at 0x000001d0 in window(int, float*, float*, float*, int)
========= by thread (92,0,0) in block (0,0,0)
========= Address 0x7fdc7c200170 is out of bounds
========= Device Frame:window(int, float*, float*, float*, int) (window(int, float*, float*, float*, int) : 0x1d0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x346) [0x2af0b6]
========= Host Frame:./t6 [0x10649]
========= Host Frame:./t6 [0x106d7]
...
========= ERROR SUMMARY: 33 errors
$ nvcc -o t6 t6.cu -lcufft -DUSE_BUG
$ cuda-memcheck ./t6
========= CUDA-MEMCHECK
========= Internal Memcheck Error: Initialization failed
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/lib64/libcuda.so.1 [0x13f42c]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x3d887a]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x3cb9a0]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x3d7bca]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x3db8cf]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x3dc03a]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x3cf66c]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x3bf16e]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x3f138c]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x37b82]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x38186]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 [0x39cd2]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 (cufftXtMakePlanMany + 0x63a) [0x4d2aa]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 (cufftMakePlanMany64 + 0x157) [0x4e267]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 (cufftMakePlanMany + 0x193) [0x4acd3]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 (cufftPlanMany + 0xd2) [0x4b262]
========= Host Frame:/usr/local/cuda/lib64/libcufft.so.10 (cufftPlan1d + 0x48) [0x4b388]
========= Host Frame:./t6 [0x3492]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21b15]
========= Host Frame:./t6 [0x3329]
=========
========= ERROR SUMMARY: 1 error
$ CUDA_MEMCHECK_PATCH_MODULE=“1” cuda-memcheck ./t6
========= CUDA-MEMCHECK
========= Invalid __global__ read of size 4
========= at 0x000001d0 in window(int, float*, float*, float*, int)
========= by thread (95,0,0) in block (0,0,0)
========= Address 0x7f7ef280077c is out of bounds
========= Device Frame:window(int, float*, float*, float*, int) (window(int, float*, float*, float*, int) : 0x1d0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x346) [0x2af0b6]
========= Host Frame:./t6 [0x10759]
========= Host Frame:./t6 [0x107e7]
========= Host Frame:./t6 [0x46b45]
========= Host Frame:./t6 [0x3935]
========= Host Frame:./t6 [0x37c6]
========= Host Frame:./t6 [0x37ff]
========= Host Frame:./t6 [0x35e3]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21b15]
========= Host Frame:./t6 [0x3329]
=========
========= Invalid __global__ read of size 4
========= at 0x000001d0 in window(int, float*, float*, float*, int)
========= by thread (94,0,0) in block (0,0,0)
========= Address 0x7f7ef2800778 is out of bounds
========= Device Frame:window(int, float*, float*, float*, int) (window(int, float*, float*, float*, int) : 0x1d0)
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/lib64/libcuda.so.1 (cuLaunchKernel + 0x346) [0x2af0b6]
========= Host Frame:./t6 [0x10759]
========= Host Frame:./t6 [0x107e7]
========= Host Frame:./t6 [0x46b45]
========= Host Frame:./t6 [0x3935]
========= Host Frame:./t6 [0x37c6]
========= Host Frame:./t6 [0x37ff]
========= Host Frame:./t6 [0x35e3]
========= Host Frame:/lib64/libc.so.6 (__libc_start_main + 0xf5) [0x21b15]
========= Host Frame:./t6 [0x3329]
=========
...
========= ERROR SUMMARY: 33 errors
$