OK, thanks for the fast reply.
I'll try to give you an overview:
c-code header:
/* X receives the ADDRESS of the caller's pointer (float**): its definition
 * writes the device address obtained from cudaMalloc into *a_d. */
extern "C" void X(…, float** a_d);
/* X2 receives the device pointer by value (float*), matching its definition
 * and the call X2(…, a_d). */
extern "C" void X2(…, float* a_d);
c-code:
// Pass the address of the pointer so that X() can store the device address
// it gets from cudaMalloc in a_d.
X(…, &a_d);
// Presumably a placeholder for whatever synchronization the application
// performs between the two calls — not shown in the post.
“sync()”
// Pass the device pointer itself; X2() hands it on to X2_kernel.
X2(…, a_d);
cuda.cu code:
/*
 * Host wrapper that launches X2_kernel.
 *
 * a_d is forwarded unchanged to the kernel, which dereferences it on the
 * GPU, so it must be a device pointer.
 *
 * Launch layout: 8x8 threads per block, mesh_width/8 by mesh_height/8 blocks.
 * Assuming integer mesh dimensions, the divisions truncate: a dimension
 * smaller than 8 yields a zero-sized grid (the launch is then rejected), and
 * a dimension that is not a multiple of 8 leaves the remainder uncovered.
 */
extern "C" void X2(…, float* a_d)
{
    // execute the kernel
    const dim3 block(8, 8, 1);
    const dim3 grid(mesh_width / block.x, mesh_height / block.y, 1);
    X2_kernel<<<grid, block>>>(…, a_d);

    // A kernel launch returns no status of its own; an invalid configuration
    // is only visible through cudaGetLastError().
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "X2: X2_kernel launch failed: %s\n",
                cudaGetErrorString(launchStatus));
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool X_cudaOk(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "X: %s failed: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Allocates a device buffer of width*height floats, uploads inputBuffer into
 * it, runs X_kernel on it and downloads the result into outputBuffer.
 *
 * a_d: out-parameter; receives the device address of the new buffer. The
 *      buffer is NOT freed here, so it is still allocated when X returns.
 *      Every call allocates a new buffer. On allocation failure *a_d is set
 *      to 0.
 *
 * Each CUDA call is checked; on the first failure a message is written to
 * stderr and the function returns without performing the remaining steps.
 */
extern "C" void X(…, float **a_d)
{
    // Do the byte-count arithmetic in size_t so it cannot wrap in a narrower type.
    const size_t MEMsize = (size_t)width * (size_t)height * sizeof(float);
    const unsigned int ArraySize = width * height;

    // allocate array on Device
    if (!X_cudaOk(cudaMalloc((void **) a_d, MEMsize), "cudaMalloc"))
    {
        *a_d = 0;
        return;
    }

    // copy data from host to device
    if (!X_cudaOk(cudaMemcpy(*a_d, inputBuffer, MEMsize, cudaMemcpyHostToDevice),
                  "cudaMemcpy (host to device)"))
    {
        return;
    }

    // execute kernel
    X_kernel <<< nBlocks, BLOCK_DIM >>> (....., *a_d, (ArraySize));
    // A launch returns no status; configuration errors show up here.
    if (!X_cudaOk(cudaGetLastError(), "X_kernel launch"))
    {
        return;
    }

    // copy the result back to the host
    X_cudaOk(cudaMemcpy(outputBuffer, *a_d, MEMsize, cudaMemcpyDeviceToHost),
             "cudaMemcpy (device to host)");
}
cuda_kernel.cu:
/*
 * Scales the first `size` elements of `a` by 0.5 in place, one element per
 * thread.
 *
 * Uses only the x dimension of the block and grid for indexing; threads whose
 * index is >= size do nothing.
 *
 * Spelled X_kernel (lower-case k) to match the launch site in X() and the
 * naming of X2_kernel.
 */
__global__ void X_kernel(…, float* a, int const size)
{
    const int Xidx = blockIdx.x * blockDim.x + threadIdx.x;
    if (Xidx < size)
    {
        a[Xidx] = a[Xidx] * 0.5f; // for example
    }
}
/*
 * Second kernel; reads from the buffer a_d on the GPU.
 *
 * Computes a 2D thread coordinate (x, y) from the x and y components of the
 * block and thread indices. The remainder of the body is elided in the post.
 */
__global__ void X2_kernel(…, float* a_d)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    float test = a_d[0]; // accessing memory on the GPU that was allocated in X()
    …
}
In emulation mode I can set breakpoints and watch the values of the memory pointers in X() and X2(), and I can see that they are the same,
but it is not working in CUDA mode.
I copy some audio data, and I think that part is working OK, because I get the correct sound output from the first kernel, X().
But the second one isn't working correctly. The audio data copied is 2 (channels) * 1024 (block length) * sizeof(float);
that shouldn't be too much.