I have a Tesla C2070 in a HPC system.
I am trying to get cudamalloc(&longpointer, size) to run, but am getting the runtime error report below -
---------- ERROR REPORT ----------------
user@computer:/sata2/NVIDIA_GPU_Computing_SDK/C/src/MagicSquares$ …/…/bin/linux/release/magic
[magic] starting…
Using CUDA device [0]: Tesla C2070
Device 0: “Tesla C2070” with Compute 2.0 capability
magic: malloc.c:3096: sYSMALLOc: Assertion `(old_top == (((mbinptr) (((char *) &((av)->bins[((1) - 1) * 2])) - __builtin_offsetof (struct malloc_chunk, fd)))) && old_size == 0) || ((unsigned long) (old_size) >= (unsigned long)((((__builtin_offsetof (struct malloc_chunk, fd_nextsize))+((2 * (sizeof(size_t))) - 1)) & ~((2 * (sizeof(size_t))) - 1))) && ((old_top)->size & 0x1) && ((unsigned long)old_end & pagemask) == 0)’ failed.
Aborted
I carefully followed Chapter 3 Programming Interface page 20 of the Programming Guide 00 Nvidia CUDA C Programming Guide v4.0 and added some support code around it so that the module would compile.
Why is cudaMalloc() failing? I even followed the other code examples of wrapping it inside a cutilSafeCall() and used a (void**) to change the pointer type given to the cudaMalloc function, but still got the same failure as above.
Program_Listing:
---------------------- test.cu -----------------------------------
include <stdio.h>
include <shrUtils.h>
include <shrQATest.h>
include <cutil_inline.h>
global void myfunction()
{
}
int main(int argc, char **argv)
{
int devID;
cudaDeviceProp props;
shrQAStart(argc, argv);
// Check which GPU is used
cutilChooseCudaDevice(argc, argv);
// Get GPU information
cutilSafeCall(cudaGetDevice(&devID));
cutilSafeCall(cudaGetDeviceProperties(&props, devID));
printf(“Device %d: "%s" with Compute %d.%d capability\n”, devID, props.name, props.major, props.minor);
size_t size = 1000;
// Allocate Host Memory
long* host_mem = (long*) malloc(size);
// Zeroize the Answers
int i;
for(i=0;i<size;i++) { host_mem[i] = 0; }
// Allocate Device Memory
int* dev_mem = 0;
cudaMalloc(&dev_mem, size);
// Copy (zeroized) answers from host memory to device memory
cudaMemcpy(dev_mem, host_mem, size, cudaMemcpyHostToDevice);
// Invoke kernel
myfunction<<<1,1>>>();
cutilDeviceSynchronize();
// Copy result from device memory to host memory
cudaMemcpy(host_mem, dev_mem, size, cudaMemcpyDeviceToHost);
// Check Results;
// … (to be filled in)
// Free device memory
cudaFree(dev_mem);
// Free host memory
free(host_mem);
cutilDeviceReset();
shrQAFinishExit(argc, (const char **)argv, QA_PASSED);
}
---------------- setup lines in Makefile ---------------------
Add source files here
EXECUTABLE := magic
Cuda source files (compiled with cudacc)
CUFILES := test.cu
CUDEPS :=
C/C++ source files (compiled with gcc / c++)
CCFILES :=
add command line parameters so we can target multiple architectures
GENCODE_ARCH := -gencode=arch=compute_20,code="sm_20,compute_20"