Very confused with 2d arrays

Just wondering if anyone can tell me where I am going wrong with this.

I have a 2D array on host side

// Host-side "2D array" built as a table of nky row pointers.
cuFloatComplex** hostprops = NULL;

		

	// Allocate the table of row pointers (one pointer per row).
	hostprops = (cuFloatComplex**)malloc(sizeof(cuFloatComplex*) * nky);

	

	for(int i = 0; i < nky; i++)

	{

         // Each row is its own malloc'd buffer of nkx complex values, so the
         // rows are separate allocations rather than one linear block.
         hostprops[i] = (cuFloatComplex*)malloc(nkx * sizeof(cuFloatComplex) );

         // Zero the row (all-zero bytes give 0.0f in both float members).
         memset(hostprops[i] , 0 , nkx * sizeof(cuFloatComplex));

	}

And hopefully a 2d array on device side

cudaMallocPitch((void**)&props,&pitch ,nkx * sizeof(cuFloatComplex),nky);

And I want to access individual elements of the 2d array with each thread. I am just not sure how to go about this, never used pitch before.

I used to just do something like this

// Global 2D coordinates of this thread within the launch grid.
int xIndex = threadIdx.x + blockIdx.x * blockDim.x;

int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

// Flat row-major offset; `width` is the row length in elements.
int Index = yIndex * width + xIndex;

Also, I am unsure if this is the right way to copy things back.

cudaMemcpy2D(hostprops,pitch,props,pitch,nkx,nky,cudaMemcpyDeviceToHost);

If anyone could give me some advice on this that would be awesome

Your device side code is pretty much how it should be done, but your host side code isn’t. You need to use a linear memory allocation for the host side array, rather than using an array of pointers, then copy with cudaMemcpy. There is no automatic “deep copying” or “flattening” of arrays of host pointers in the CUDA API. cudaMemcpy2D is for copying into a special type of opaque 2D memory allocation for textures. It can’t be used for anything else.

So for my host side array I should just use this?

hostprops = (cuFloatComplex*)malloc(nkx*nky*sizeof(cuFloatComplex));

And then copy this back using cudaMemcpy not the 2d one?

Sorry I’m pretty new to this and in a little over my head at the moment.

Yes, something like that. All you need to do is decide whether to use column-major or row-major ordering in the array, and then make sure that both the host and device code follow the same indexing convention and use the same pitch (if you decide to use padding), and you should be in business.

Managed to remove errors on copying memory back by changing to

cudaMemcpy2D(hostprops,nkx*sizeof(cuFloatComplex),props,pitch,pitch,nky,cudaMemcpyDeviceToHost);

Unfortunately all of the elements still seem to be zero so my kernel probably isn’t working.

You can’t use cudaMemcpy2D, it is intended for copying into texture arrays which can only be accessed via texture reads inside kernels. Just use the standard cudaMemcpy to transfer the host data to device memory.

Ok I’ll try it the other way then.

Thanks for all the help :)

Ok, I realise this is a lot to ask, but if anyone could turn this into a working example for me I would be eternally grateful. I’ve tried numerous different ways to make this work but I still haven’t managed it. I think if I had one example of a working kernel I could make the rest of them myself, but I can’t even manage that.

The purpose of the kernel is to take two arrays of size A×1 and B×1 and create an array of size A×B where each element is something like exp[i(A*B)];

#include <stdio.h>

#include <vector>

#include <iostream>

#include <cuda.h>

#include <math.h>

#include <cufft.h>

#include <cuComplex.h>

#include <cuda_runtime.h>

using namespace std;

// Fills a pitched nky-by-nkx complex array with
//     Props[y][x] = exp(i * Pi * dz * (kxIn[x]^2 + kyIn[y]^2)).
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) handles column x of
// row y. Threads outside the nkx-by-nky extent return without touching memory.
//
// kxIn  : nkx values, indexed by column.
// kyIn  : nky values, indexed by row.
// Props : base pointer of the output array.
// pitch : distance between consecutive rows of Props in BYTES, i.e. the value
//         returned by cudaMallocPitch (not an element count).
__global__ void CreatePropsKernel( float * kxIn, float * kyIn, int nkx, int nky, float dz, cuFloatComplex * Props, size_t pitch ){

	int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
	int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

	// The grid rarely divides the data evenly, so guard against overshoot.
	if (xIndex >= nkx || yIndex >= nky)
		return ;

	// Rows are `pitch` bytes apart, so step to the row in bytes first and
	// only then index by element within the row.
	cuFloatComplex * row = (cuFloatComplex *)((char *)Props + yIndex * pitch);

	float k0x = kxIn[xIndex];
	float k0y = kyIn[yIndex];
	const float Pi = 3.14159265f;

	k0x *= k0x;
	k0y *= k0y;

	const float phase = Pi*dz*(k0x+k0y);

	row[xIndex].x = cosf(phase);
	row[xIndex].y = sinf(phase);

}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char * what)
{
	if (status == cudaSuccess)
		return true;
	cerr << what << " failed: " << cudaGetErrorString(status) << endl;
	return false;
}

// Builds the kx/ky frequency vectors on the host, evaluates the propagator
// array on the device with CreatePropsKernel, copies it back into a linear
// row-major host buffer and prints two elements as a sanity check.
// Returns 0 on success and 1 if any CUDA call fails.
int main(){

	int	SampleinY	= 400;
	float	dz		= 2.0f;
	int	SampleinX	= 300;
	float   sizex		= 30;
	float 	sizey		= 40;

	int nkx = SampleinX;
	int nky = SampleinY;

	// NOTE(review): indices 0..imid keep non-negative frequencies and the
	// rest wrap to negative ones; confirm this threshold matches the FFT
	// layout you intend.
	int imidx = round(nkx/2 + 0.5);
	int imidy = round(nky/2 + 0.5);

	vector<float> k0x;
	vector<float> k0y;
	k0x.reserve(nkx);
	k0y.reserve(nky);

	// Put correct frequencies into vectors

	for(int i = 0 ; i < nkx ; i++)
		k0x.push_back ((i > imidx) ? (i - nkx)/sizex : i/sizex);

	for(int i = 0 ; i < nky ; i++)
		k0y.push_back ((i > imidy) ? (i - nky)/sizey : i/sizey);

	// Get k0x and k0y onto device memory to use in Kernel.

	float* k0xdev=NULL;
	float* k0ydev=NULL;

	if (!cudaOk(cudaMalloc((void**)&k0xdev,nkx*sizeof(float)), "cudaMalloc(k0xdev)")) return 1;
	if (!cudaOk(cudaMalloc((void**)&k0ydev,nky*sizeof(float)), "cudaMalloc(k0ydev)")) return 1;

	if (!cudaOk(cudaMemcpy(k0xdev, &k0x[0], nkx*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(k0xdev)")) return 1;
	if (!cudaOk(cudaMemcpy(k0ydev, &k0y[0], nky*sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(k0ydev)")) return 1;

	// Make array to hold propagation array on host and device.
	// The host copy is ONE linear, row-major buffer (element (y, x) lives at
	// y*nkx + x): the CUDA copy routines move flat memory and cannot scatter
	// into an array of separately allocated row pointers.

	const size_t rowBytes = nkx * sizeof(cuFloatComplex);
	vector<cuFloatComplex> hostprops(static_cast<size_t>(nkx) * nky);	// zero-initialised
	cuFloatComplex* props = NULL;
	size_t pitch = 0;

	// Pitched device allocation: rows are `pitch` bytes apart (pitch >= rowBytes).
	if (!cudaOk(cudaMallocPitch((void**)&props,&pitch ,rowBytes,nky), "cudaMallocPitch(props)")) return 1;

	// The kernel indexes with threadIdx/blockIdx in both x and y, so the
	// launch must be 2D; ceil-divide so the grid covers every element.
	dim3 dimBlock(16,16);
	dim3 dimGrid((nkx + dimBlock.x - 1) / dimBlock.x,
	             (nky + dimBlock.y - 1) / dimBlock.y);

	CreatePropsKernel<<<dimGrid,dimBlock>>>(k0xdev,k0ydev,nkx,nky,dz,props,pitch);

	if (!cudaOk(cudaGetLastError(), "CreatePropsKernel launch")) return 1;
	if (!cudaOk(cudaDeviceSynchronize(), "CreatePropsKernel execution")) return 1;

	// Copy array back from device to host array.
	// Destination rows are tightly packed (rowBytes apart), source rows are
	// `pitch` bytes apart, and only rowBytes of each row hold real data.

	if (!cudaOk(cudaMemcpy2D(&hostprops[0],rowBytes,props,pitch,rowBytes,nky,cudaMemcpyDeviceToHost), "cudaMemcpy2D(hostprops)")) return 1;

	// Check to see if it actually did anything :(

	cout << "hostprops[0][0] is " << hostprops[0 * nkx + 0].x << endl;
	cout << "hostprops[0][1] is " << hostprops[0 * nkx + 1].x << endl;

	cudaFree(props);
	cudaFree(k0ydev);
	cudaFree(k0xdev);

	system("PAUSE");

	return 0;

}

Would this help at all?

#include <math.h>

#include <assert.h>

#include <stdio.h>

#include <stdlib.h>

#include <cuComplex.h>

const float Pi = 3.14159265f;

// Checks the status of a CUDA runtime call.
// On any non-success code the error string is written to stderr; when Abort
// is true (the default) the process then exits with that code as its status.
inline void GPUassert(cudaError_t code, bool Abort=true)
{
        if (code == 0)
                return;

        fprintf(stderr, "GPUassert: %s\n", cudaGetErrorString(code));

        if (!Abort)
                return;

        exit(code);
}

// Approximate equality test for two single-precision complex numbers.
// Returns 1 when |a.x - b.x| + |a.y - b.y| is strictly below 2 * tol,
// and 0 otherwise.
inline int cmplxCompare( const cuFloatComplex a, const cuFloatComplex b, float tol = 1e-6f)
{
    const float realGap = fabs(a.x - b.x);
    const float imagGap = fabs(a.y - b.y);
    const float budget  = 2.0f * tol;

    if ((realGap + imagGap) < budget)
        return 1;
    return 0;
}

// Fills a linear, row-major complex array with
//     Props[y * pitch + x] = exp(i * Pi * dz * (kxIn[x]^2 + kyIn[y]^2)).
//
// Launch layout: 2D grid of 2D blocks; thread (x, y) handles column x of row y.
//
// kxIn  : per-column input, read at index x (x < pitch).
// kyIn  : per-row input, read at index y.
// pitch : row length of Props in ELEMENTS (not bytes).
// Nmax  : total number of elements in Props; valid flat indices are 0..Nmax-1.
// NOTE(review): assumes kxIn holds at least `pitch` values and kyIn at least
// one value per row that fits in Nmax elements -- confirm against the caller.
__global__ void CreatePropsKernel( const float * kxIn, const float * kyIn, const float dz, 
                cuFloatComplex * Props, const size_t pitch, const int Nmax )
{
    const size_t xIndex = threadIdx.x + size_t(blockIdx.x) * blockDim.x;
    const size_t yIndex = threadIdx.y + size_t(blockIdx.y) * blockDim.y;

    // A column past the end of the row would alias into the next row and
    // read kxIn out of bounds, so reject it explicitly.
    if (xIndex >= pitch) return;

    const size_t Index = yIndex * pitch + xIndex;

    // Index == Nmax is already one past the last element, hence >=.
    if (Nmax <= 0 || Index >= size_t(Nmax)) return;

    float k0x = kxIn[xIndex];
    float k0y = kyIn[yIndex];

    k0x *= k0x;
    k0y *= k0y;

    const float phase = Pi*dz*(k0x+k0y);

    cuFloatComplex result;
    result.x = cosf(phase);
    result.y = sinf(phase);

    Props[Index] = result;
}

// Self-checking driver: fills kx/ky with random values, evaluates the
// propagator array on the device, copies it back and compares every element
// against a host-side reference with assert().
// Note: the comparison is compiled out if NDEBUG is defined.
int main(void)
{
    const int nx = 128, ny = 256;
    const int Nmax = nx * ny;

    const size_t ctsize = sizeof(cuFloatComplex) * size_t(Nmax);
    const size_t fxtsize = sizeof(float) * size_t(nx);
    const size_t fytsize = sizeof(float) * size_t(ny);

    const float dz = 2.0f;

    // Host buffers: the two inputs and the linear (row-major) result.
    float * kx = (float *)malloc(fxtsize);
    float * ky = (float *)malloc(fytsize);
    cuFloatComplex * cmplex2D = (cuFloatComplex *)malloc( ctsize );

    if (kx == NULL || ky == NULL || cmplex2D == NULL) {
        fprintf(stderr, "host allocation failed\n");
        free(cmplex2D);
        free(ky);
        free(kx);
        return EXIT_FAILURE;
    }

    for(int i = 0; i < nx; i++) {
        kx[i] = (float)drand48();
    }

    for(int i = 0; i < ny; i++) {
        ky[i] = (float)drand48();
    }

    float * _kx, * _ky;
    cuFloatComplex * _cmplex2D;

    GPUassert( cudaMalloc((void **)&_cmplex2D, ctsize) );
    GPUassert( cudaMalloc((void **)&_kx, fxtsize) );
    GPUassert( cudaMalloc((void **)&_ky, fytsize) );

    GPUassert( cudaMemcpy(_kx, kx, fxtsize, cudaMemcpyHostToDevice) );
    GPUassert( cudaMemcpy(_ky, ky, fytsize, cudaMemcpyHostToDevice) );

    // One thread per element; the grid is derived from the problem size by
    // ceil-division instead of being hard-coded.
    const dim3 blocksize(8, 8);
    const dim3 gridsize((nx + blocksize.x - 1) / blocksize.x,
                        (ny + blocksize.y - 1) / blocksize.y);

    // The device array is unpadded, so the row pitch in elements is just nx.
    CreatePropsKernel <<< gridsize, blocksize >>> ( _kx, _ky, dz, _cmplex2D, size_t(nx), Nmax);

    GPUassert( cudaGetLastError() );        // launch-configuration errors
    GPUassert( cudaDeviceSynchronize() );   // errors raised while the kernel ran

    GPUassert( cudaMemcpy( cmplex2D, _cmplex2D, ctsize, cudaMemcpyDeviceToHost) );

    // Verification on Host 
    for(int i = 0; i < nx; i++) {
        float k0x = kx[i] * kx[i];

        for(int j = 0; j < ny; j++) {
            float k0y = ky[j] * ky[j];

            cuFloatComplex resulth;
            resulth.x = cos(Pi*dz*(k0x+k0y));
            resulth.y = sin(Pi*dz*(k0x+k0y));

            assert( cmplxCompare( resulth, cmplex2D[i + j*nx] ) );
        }
    }

    // Release everything that was allocated above.
    GPUassert( cudaFree(_ky) );
    GPUassert( cudaFree(_kx) );
    GPUassert( cudaFree(_cmplex2D) );

    free(cmplex2D);
    free(ky);
    free(kx);

    // The process exit status is the result of the device reset, as before.
    return cudaDeviceReset();
}