Copy array from host to device in CUDA

Hello everyone,

i am new to CUDA and i got an error when i try to copy the array from host to device. error: no instance of overloaded function “cudaMalloc” matches the argument list argument types are: (int (*)[1048576], unsigned long)

#include <assert.h>
#include <cuda.h>
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <time.h>
#include <unistd.h>
#include <curand.h>
#include <curand_kernel.h>



#define N (1024*1024)
  #define M (1000000)

/**************************************************/
/* this GPU kernel function is used to initialize the random states */
__global__ void init(unsigned int seed, curandState_t* states) {

    /* we have to initialize the state */
    curand_init(seed, /* the seed can be the same for each core, here we pass the time in from the CPU */
                blockIdx.x, /* the sequence number should be different for each core (unless you want all
                               cores to get the same sequence of numbers for some reason - use thread id! */
                0, /* the offset is how much extra we advance in the sequence for each call, can be 0 */
                &states[blockIdx.x]);
}

/* this GPU kernel takes an array of states, and an array of ints, and puts a random int into each */
__global__ void randoms(curandState_t* states, unsigned int* numbers) {
    /* curand works like rand - except that it takes a state as a parameter */
    numbers[blockIdx.x] = curand(&states[blockIdx.x]) %2000;
};

/*******************************************************/

  __global__ void cudakernel(int *buf)
  {
     int i = threadIdx.x + blockIdx.x * blockDim.x;
    // buf[i] = rand();
     for(int j = 0; j < M; j++)
        buf[i] = buf[i] * buf[i] - 0.25f;
  }

  int main()

  {
/*****************************************************/
/* CUDA's random number library uses curandState_t to keep track of the seed value
       we will store a random state for every thread  */
    curandState_t* states;

    /* allocate space on the GPU for the random states */
    cudaMalloc((void**) &states, N * sizeof(curandState_t));

    /* invoke the GPU to initialize all of the random states */
    init<<<N, 1>>>(time(0), states);

    /* allocate an array of unsigned ints on the CPU and GPU */
   // unsigned int cpu_nums[N];//getting error in median relared to type of int
    unsigned int* gpu_nums;
    int cpu_nums[N];
    cudaMalloc((void**) &gpu_nums, N * sizeof(unsigned int));

    /* invoke the kernel to get some random numbers */
    randoms<<<N, 1>>>(states, gpu_nums);

    /* copy the random numbers back */
    cudaMemcpy(cpu_nums, gpu_nums, N * sizeof(unsigned int), cudaMemcpyDeviceToHost);

/******************************************************************************/ 

     int data[N];// int count = 0;

     int cpunums[N],i;
     for (i=0;i<=N;i++)

     cpunums[i]=cpu_nums[i];


     cudaMalloc(&cpunums, N * sizeof(int));
     cudakernel<<<N/256, 256>>>(cpunums);
     cudaMemcpy(data, cpunums, N * sizeof(int), cudaMemcpyDeviceToHost);
     cudaFree(cpunums); 

     int sel;
     printf("Enter an index: ");
     scanf("%d", &sel);
     printf("data[%d] = %f\n", sel, data[sel]);
  }

i am trying to copy cpunums[i] array from host to device again after i generate a random numbers from device.

i tried to call the device function but i got many errors.so i tried this way.

int cpunums[N],i; // ※1
...
cudaMalloc(&cpunums, N * sizeof(int)); // ※2

※1 allocates N*sizeof(int) bytes on Host-memory.
Why you allocate again on Device at ※2 ?