Your device side code is pretty much how it should be done, but your host side code isn’t. You need to use a linear memory allocation for the host side array, rather than using an array of pointers, then copy with cudaMemcpy. There is no automatic “deep copying” or “flattening” of arrays of host pointers in the CUDA API. cudaMemcpy2D is for copying into a special type of opaque 2D memory allocation for textures. It can’t be used for anything else.
Yes, something like that. All you need to do is decide whether to use column-major or row-major ordering in the array, and then make sure that both the host and device code follow the same indexing convention and use the same pitch (if you decide to use padding), and you should be in business.
You can’t use cudaMemcpy2D, it is intended for copying into texture arrays which can only be accessed via texture reads inside kernels. Just use the standard cudaMemcpy to transfer the host data to device memory.
OK, I realise this is a lot to ask, but if anyone could turn this into a working example for me I would be eternally grateful. I've tried numerous different ways to make this work, but I still haven't managed it. I think if I had one example of a working kernel I could write the rest of them myself, but I can't even manage that.
The purpose of the kernel is to take two arrays of size A×1 and B×1 and create an array of size A×B where each element is something like exp[i(A*B)].
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <iostream>
#include <vector>

#include <cuComplex.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>
using namespace std;
/*
 * CreatePropsKernel
 *
 * Fills a 2D table of complex values where element (x, y) is
 *     cos(phase) + i*sin(phase),  phase = Pi * dz * (kxIn[x]^2 + kyIn[y]^2).
 *
 * Launch layout: 2D blocks in a 2D grid; the thread with global coordinates
 * (x, y) writes element (x, y). The grid must cover at least nkx threads in x
 * and nky threads in y; threads outside that range return without touching
 * memory.
 *
 * Parameters:
 *   kxIn  - device array of nkx floats, indexed by x
 *   kyIn  - device array of nky floats, indexed by y
 *   nkx   - number of elements per row (x extent)
 *   nky   - number of rows (y extent)
 *   dz    - scalar factor applied to the phase
 *   Props - device output buffer holding nky rows of nkx elements
 *   pitch - distance between consecutive rows of Props in BYTES (the value
 *           cudaMallocPitch reports), not in elements
 */
__global__ void CreatePropsKernel( float * kxIn, float * kyIn, int nkx, int nky, float dz, cuFloatComplex * Props, size_t pitch ){
    const int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    const int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // Guard against the grid tail: the launch grid rarely divides the data evenly.
    if (xIndex >= nkx || yIndex >= nky)
        return;

    // pitch is a byte stride, so the row start must be computed on a char
    // pointer before converting back to the element type.
    cuFloatComplex * row = (cuFloatComplex *)((char *)Props + (size_t)yIndex * pitch);

    const float k0x = kxIn[xIndex];
    const float k0y = kyIn[yIndex];
    const float Pi = 3.14159265f;
    const float phase = Pi * dz * (k0x * k0x + k0y * k0y);

    // Single-precision sine and cosine of the same angle in one call.
    float sinPhase;
    float cosPhase;
    sincosf(phase, &sinPhase, &cosPhase);

    row[xIndex] = make_cuFloatComplex(cosPhase, sinPhase);
}
// Print a diagnostic and terminate the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Build n frequency samples: index i maps to i/extent up to and including the
// mid index, and to (i - n)/extent beyond it.
static std::vector<float> makeFrequencies(int n, float extent)
{
    // n / 2 is deliberately an integer division, matching how the mid index
    // was computed before this helper was extracted.
    const int mid = (int)round(n / 2 + 0.5);

    std::vector<float> freq;
    freq.reserve(n);
    for (int i = 0; i < n; ++i) {
        const int signedIndex = (i > mid) ? (i - n) : i;
        freq.push_back(signedIndex / extent);
    }
    return freq;
}

/*
 * Builds the kx / ky frequency vectors on the host, uploads them, runs
 * CreatePropsKernel over an nkx-by-nky grid, copies the result back into a
 * flat row-major host array and prints the first element.
 */
int main(){
    const int SampleinX = 300;
    const int SampleinY = 400;
    const float dz = 2.0f;
    const float sizex = 30;
    const float sizey = 40;
    const int nkx = SampleinX;
    const int nky = SampleinY;

    // Put correct frequencies into vectors
    const std::vector<float> k0x = makeFrequencies(nkx, sizex);
    const std::vector<float> k0y = makeFrequencies(nky, sizey);

    // Get k0x and k0y onto device memory to use in Kernel.
    float* k0xdev = NULL;
    float* k0ydev = NULL;
    checkCuda(cudaMalloc((void**)&k0xdev, nkx * sizeof(float)), "cudaMalloc k0x");
    checkCuda(cudaMalloc((void**)&k0ydev, nky * sizeof(float)), "cudaMalloc k0y");
    checkCuda(cudaMemcpy(k0xdev, &k0x[0], nkx * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy k0x");
    checkCuda(cudaMemcpy(k0ydev, &k0y[0], nky * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy k0y");

    // Host result buffer: ONE contiguous row-major allocation. An array of
    // per-row pointers cannot be the target of a device-to-host copy, because
    // the copy would overwrite the pointers rather than the rows they point to.
    const size_t rowBytes = nkx * sizeof(cuFloatComplex);
    std::vector<cuFloatComplex> hostprops((size_t)nkx * nky, make_cuFloatComplex(0.0f, 0.0f));

    // Device result buffer: pitched allocation, rows are `pitch` bytes apart.
    cuFloatComplex* props = NULL;
    size_t pitch = 0;
    checkCuda(cudaMallocPitch((void**)&props, &pitch, rowBytes, nky), "cudaMallocPitch props");

    // The kernel derives (x, y) from threadIdx/blockIdx in both dimensions,
    // so the launch has to be 2D and cover nkx by nky threads (ceil-div).
    const dim3 dimBlock(16, 16);
    const dim3 dimGrid((nkx + dimBlock.x - 1) / dimBlock.x,
                       (nky + dimBlock.y - 1) / dimBlock.y);
    CreatePropsKernel<<<dimGrid, dimBlock>>>(k0xdev, k0ydev, nkx, nky, dz, props, pitch);
    checkCuda(cudaGetLastError(), "CreatePropsKernel launch");
    checkCuda(cudaDeviceSynchronize(), "CreatePropsKernel execution");

    // Copy array back from device to host array. The width argument is the
    // number of payload bytes per row (rowBytes), not the padded device pitch.
    checkCuda(cudaMemcpy2D(&hostprops[0], rowBytes, props, pitch, rowBytes, nky, cudaMemcpyDeviceToHost),
              "cudaMemcpy2D props");

    // Check to see if it actually did anything: real and imaginary part of
    // the element at row 0, column 0.
    std::cout << "hostprops[0][0].x is " << hostprops[0].x << std::endl;
    std::cout << "hostprops[0][0].y is " << hostprops[0].y << std::endl;

    checkCuda(cudaFree(props), "cudaFree props");
    checkCuda(cudaFree(k0ydev), "cudaFree k0y");
    checkCuda(cudaFree(k0xdev), "cudaFree k0x");

    // Keeps a Windows console window open; on other platforms the shell just
    // reports that the PAUSE command was not found.
    system("PAUSE");
    return 0;
}