How to copy list defined with "__device__" back to host?

I use cudaMemcpy to do the process, but i failed. Here is my code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cuda.h>


using namespace std;

#define ThreadPerBlock 512

__device__ double tt[ThreadPerBlock];

__global__ void t1()
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;
    if (i < ThreadPerBlock)
    {
        tt[i] = i;
    }
}

int main()
{
    int size = ThreadPerBlock;  
    double* a = (double*)malloc(size * sizeof(double));

    t1 << <1, ThreadPerBlock >> > ();

    cudaMemcpy(a, tt, size * sizeof(double), cudaMemcpyDeviceToHost);

    for (int i = 0; i < size; i++)
    {
        cout << a[i] << endl;
    }

    free(a);
    return 0;
}

The result i got showed that it didn’t succeed by outputing 512 times"-6.27744e+66", indicating the copying process failed.

Use cudaMemcpyFromSymbol instead.
I would also suggest to do proper cuda api error checking.