CUDA Runtime API error 1: invalid argument (cudaMemcpy)

I am moving GPU intensive operation from python to Cuda using Cython. But when I am trying to copy data from CPU (host) to GPU(device), I get CUDA Runtime API error 1 when performing the cudaMemcpy operation. I have tried everything for the last few days but I can not pass this error. Since the code I am working on is so large, So I managed to create a simple prototype for here. The main entry for code is from test.pyx file which is mentioned below

import numpy as np
cimport numpy as np


cdef extern from "test.h":
    cdef cppclass Test:
        float* data
        int msg_type

    void test_call(Test vol)

def py_test():
    cdef Test test
    data = np.ones((1, 256, 256, 256), dtype=np.float32)
    cdef float[:, :, :, ::1] data_view = data = &(data_view[0, 0, 0, 0])
    test.msg_type = 0


def test_gpu():

Similarly, the cuda file is

#include "test.h"

void test_call(Test &test_cpu)
     * Allocate memory for GPU
    Test test_gpu;

    mem_alloc_test(test_gpu, test_cpu);

    printf("%f\n", test_cpu.msg_type);
    printf("%f\n", test_gpu.msg_type);



, the header file is test.h

#include <iostream>
#include <math.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>

#define GLOBAL_2_HOST __host__ __device__

#define gpuErrchk(err)  __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line )
    if(cudaSuccess != err)
        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );

const int THREADS_PER_BLOCK = 1024;

inline int getNumBlock(int N) { return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; }

class Test {
    float *data;
    int msg_type;

void test_call(Test &test);

void host_2_device(const float *host, float *device, int N) {
    printf("%d", N);
    gpuErrchk(cudaMemcpy(device, host, N*sizeof(float), cudaMemcpyHostToDevice));

void device_malloc(float *device, int N) {
    gpuErrchk(cudaMalloc(&device, N*sizeof(float)));

void mem_alloc_test(Test &test_gpu, Test &test_cpu) {
    int N = 256 * 256 * 256;
    device_malloc(, N);
    host_2_device(,, N);

    test_gpu.msg_type = test_cpu.msg_type;

void mem_free_test(Test &test_gpu) {

To build cuda code run from the root folder

mkdir -p build
cmake ..

To build cython code run python build_ext --inplace from the root folder.

To test run python -c "import pytest; pytest.test_gpu()" . You will be thrown a error CUDA Runtime API error 1: invalid argument.

Any help regarding this will be appriciated. If need any extra info, I’ll be happy to provide that.

Your device_malloc routine is broken as described in your cross-posting