Can't get cuMemcpy2D to Work

Hi,

I am trying to use cuMemcpy2D to copy a matrix from host memory into device memory but cannot get it to work for matrices with more than 128 rows of type float or 64 rows of type double. The corresponding pitches are 512 in both cases which is well below the maximum pitch reported by deviceQuery from the SDK. I am using CUDA Toolkit and SDK versions 3.2 and driver version 260.24.

Below is a sample program to illustrate the issue. It compiles with “gcc -std=c99 -I/opt/cuda/include -o memcpy2d memcpy2d.c -L/opt/cuda/lib64 -lcuda”. I keep getting “Error code 700 at line 48” when I run it. Error code 700 is “Launch failed”.

#include <stdio.h>

#include <stdbool.h>

#include <cuda.h>

#define N 1024

#define T float

#define SAFE_CALL(call) \

 do { \

   CUresult error = call; \

   if (error != CUDA_SUCCESS) { \

	 fprintf(stderr, "Error code %d at line %d\n", error, __LINE__); \

	 return -1; \

   } \

 } while (false)

int main(int argc, char * argv[]) {

  CUdevice device;

  CUcontext context;

SAFE_CALL(cuInit(0));

  SAFE_CALL(cuDeviceGet(&device, 0));

  SAFE_CALL(cuCtxCreate(&context, 0, device));

T * h_ptr;

  CUdeviceptr d_ptr;

  unsigned int pitch;

  SAFE_CALL(cuMemAllocPitch(&d_ptr, &pitch, N * sizeof(T), N, sizeof(T)));

  SAFE_CALL(cuMemAllocHost((void **)&h_ptr, N * N * sizeof(T)));

CUDA_MEMCPY2D p;

p.srcXInBytes = 0;

  p.srcY = 0;

  p.srcMemoryType = CU_MEMORYTYPE_HOST;

  p.srcHost = h_ptr;

  p.srcPitch = N * sizeof(T);

p.dstXInBytes = 0;

  p.dstY = 0;

  p.dstMemoryType = CU_MEMORYTYPE_DEVICE;

  p.dstDevice = d_ptr;

  p.dstPitch = pitch * sizeof(T);

p.WidthInBytes = N * sizeof(T);

  p.Height = N;

SAFE_CALL(cuMemcpy2D(&p));

SAFE_CALL(cuMemFree(d_ptr));

  SAFE_CALL(cuMemFreeHost(h_ptr));

SAFE_CALL(cuCtxDestroy(context));

return 0;

}

Hi,

I am trying to use cuMemcpy2D to copy a matrix from host memory into device memory but cannot get it to work for matrices with more than 128 rows of type float or 64 rows of type double. The corresponding pitches are 512 in both cases which is well below the maximum pitch reported by deviceQuery from the SDK. I am using CUDA Toolkit and SDK versions 3.2 and driver version 260.24.

Below is a sample program to illustrate the issue. It compiles with “gcc -std=c99 -I/opt/cuda/include -o memcpy2d memcpy2d.c -L/opt/cuda/lib64 -lcuda”. I keep getting “Error code 700 at line 48” when I run it. Error code 700 is “Launch failed”.

#include <stdio.h>

#include <stdbool.h>

#include <cuda.h>

#define N 1024

#define T float

#define SAFE_CALL(call) \

 do { \

   CUresult error = call; \

   if (error != CUDA_SUCCESS) { \

	 fprintf(stderr, "Error code %d at line %d\n", error, __LINE__); \

	 return -1; \

   } \

 } while (false)

int main(int argc, char * argv[]) {

  CUdevice device;

  CUcontext context;

SAFE_CALL(cuInit(0));

  SAFE_CALL(cuDeviceGet(&device, 0));

  SAFE_CALL(cuCtxCreate(&context, 0, device));

T * h_ptr;

  CUdeviceptr d_ptr;

  unsigned int pitch;

  SAFE_CALL(cuMemAllocPitch(&d_ptr, &pitch, N * sizeof(T), N, sizeof(T)));

  SAFE_CALL(cuMemAllocHost((void **)&h_ptr, N * N * sizeof(T)));

CUDA_MEMCPY2D p;

p.srcXInBytes = 0;

  p.srcY = 0;

  p.srcMemoryType = CU_MEMORYTYPE_HOST;

  p.srcHost = h_ptr;

  p.srcPitch = N * sizeof(T);

p.dstXInBytes = 0;

  p.dstY = 0;

  p.dstMemoryType = CU_MEMORYTYPE_DEVICE;

  p.dstDevice = d_ptr;

  p.dstPitch = pitch * sizeof(T);

p.WidthInBytes = N * sizeof(T);

  p.Height = N;

SAFE_CALL(cuMemcpy2D(&p));

SAFE_CALL(cuMemFree(d_ptr));

  SAFE_CALL(cuMemFreeHost(h_ptr));

SAFE_CALL(cuCtxDestroy(context));

return 0;

}