CUDA Runtime API error 1: invalid argument (cudaMemcpy)

I am moving GPU intensive operation from python to Cuda using Cython. But when I am trying to copy data from CPU (host) to GPU(device), I get CUDA Runtime API error 1 when performing the cudaMemcpy operation. I have tried everything for the last few days but I can not pass this error. Since the code I am working on is so large, So I managed to create a simple prototype for here. The main entry for code is from test.pyx file which is mentioned below

import numpy as np
cimport numpy as np

np.import_array()

cdef extern from "test.h":
    cdef cppclass Test:
        float* data
        int msg_type

    void test_call(Test vol)


def py_test():
    cdef Test test
    data = np.ones((1, 256, 256, 256), dtype=np.float32)
    cdef float[:, :, :, ::1] data_view = data
    test.data = &(data_view[0, 0, 0, 0])
    test.msg_type = 0

    test_call(test)

def test_gpu():
    py_test()

Similarly, the cuda file is test.cu

#include "test.h"

void test_call(Test &test_cpu)
{
    /**
     * Allocate memory for GPU
     */
    Test test_gpu;

    mem_alloc_test(test_gpu, test_cpu);

    printf("%f\n", test_cpu.msg_type);
    printf("%f\n", test_gpu.msg_type);
    printf("%f\n", test_cpu.data[0]);
    printf("%f\n", test_gpu.data[0]);

    mem_free_test(test_gpu);

}

, the header file is test.h

#include <iostream>
#include <math.h>
#include <cuda.h>
#include <cublas_v2.h>
#include <cuda_runtime.h>


#define GLOBAL_2_HOST __host__ __device__

#define gpuErrchk(err)  __checkCudaErrors (err, __FILE__, __LINE__)

inline void __checkCudaErrors(cudaError err, const char *file, const int line )
{
    if(cudaSuccess != err)
    {
        fprintf(stderr, "%s(%i) : CUDA Runtime API error %d: %s.\n",file, line, (int)err, cudaGetErrorString( err ) );
        exit(-1);
    }
}


const int THREADS_PER_BLOCK = 1024;

inline int getNumBlock(int N) { return (N + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; }

class Test {
public:
    float *data;
    int msg_type;
};

void test_call(Test &test);

void host_2_device(const float *host, float *device, int N) {
    printf("%d", N);
    gpuErrchk(cudaMemcpy(device, host, N*sizeof(float), cudaMemcpyHostToDevice));
}

void device_malloc(float *device, int N) {
    gpuErrchk(cudaMalloc(&device, N*sizeof(float)));
}

void mem_alloc_test(Test &test_gpu, Test &test_cpu) {
    int N = 256 * 256 * 256;
    device_malloc(test_gpu.data, N);
    host_2_device(test_cpu.data, test_gpu.data, N);

    test_gpu.msg_type = test_cpu.msg_type;
}

void mem_free_test(Test &test_gpu) {
    cudaFree(&test_gpu.data);
}

To build cuda code run from the root folder

mkdir -p build
cmake ..
make

To build cython code run python setup.py build_ext --inplace from the root folder.

To test run python -c "import pytest; pytest.test_gpu()" . You will be thrown a error CUDA Runtime API error 1: invalid argument.

Any help regarding this will be appriciated. If need any extra info, I’ll be happy to provide that.

Your device_malloc routine is broken as described in your cross-posting