I use cudaMemcpy to do the process, but i failed. Here is my code:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <cuda.h>
using namespace std;
#define ThreadPerBlock 512
__device__ double tt[ThreadPerBlock];
__global__ void t1()
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
if (i < ThreadPerBlock)
{
tt[i] = i;
}
}
int main()
{
int size = ThreadPerBlock;
double* a = (double*)malloc(size * sizeof(double));
t1 << <1, ThreadPerBlock >> > ();
cudaMemcpy(a, tt, size * sizeof(double), cudaMemcpyDeviceToHost);
for (int i = 0; i < size; i++)
{
cout << a[i] << endl;
}
free(a);
return 0;
}
The result i got showed that it didn’t succeed by outputing 512 times"-6.27744e+66", indicating the copying process failed.