Very confused with 2d arrays

AdamDyson · February 16, 2011, 3:42pm

Just wondering if anyone can tell me where I am going wrong with this.

I have a 2D array on host side

// Host-side 2D array built as a table of nky row pointers; every row holds
// nkx complex values and is cleared to zero bytes right after allocation.
cuFloatComplex** hostprops = (cuFloatComplex**)malloc(nky * sizeof(cuFloatComplex*));

for (int row = 0; row < nky; ++row)
{
	const size_t rowBytes = nkx * sizeof(cuFloatComplex);
	cuFloatComplex* line = (cuFloatComplex*)malloc(rowBytes);

	memset(line, 0, rowBytes);
	hostprops[row] = line;
}

And hopefully a 2d array on device side

cudaMallocPitch((void**)&props,&pitch ,nkx * sizeof(cuFloatComplex),nky);

And I want to access individual elements of the 2D array with each thread. I am just not sure how to go about this, as I have never used pitch before.

I used to just do something like this

// Global 2D coordinates of the calling thread within the launch grid.
int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

// Flattened offset: one full row of `width` elements per y step.
int Index = yIndex * width + xIndex;

Also, I am unsure if this is the right way to copy things back.

cudaMemcpy2D(hostprops,pitch,props,pitch,nkx,nky,cudaMemcpyDeviceToHost);

If anyone could give me some advice on this that would be awesome

avidday · February 16, 2011, 3:53pm

Your device side code is pretty much how it should be done, but your host side code isn’t. You need to use a linear memory allocation for the host side array, rather than using an array of pointers, then copy with cudaMemcpy. There is no automatic “deep copying” or “flattening” of arrays of host pointers in the CUDA API. cudaMemcpy2D is for copying into a special type of opaque 2D memory allocation for textures. It can’t be used for anything else.

AdamDyson · February 16, 2011, 4:10pm

So for my host-side array I should just use this?

hostprops = (cuFloatComplex*)malloc(nkx*nky*sizeof(cuFloatComplex));

And then copy this back using cudaMemcpy not the 2d one?

Sorry I’m pretty new to this and in a little over my head at the moment.

avidday · February 16, 2011, 4:30pm

Yes, something like that. All you need to do is decide whether to use column-major or row-major ordering in the array, and then make sure that both the host and device code follow the same indexing convention and use the same pitch (if you decide to use padding), and you should be in business.

AdamDyson · February 16, 2011, 5:05pm

Managed to remove errors on copying memory back by changing to

cudaMemcpy2D(hostprops,nkx*sizeof(cuFloatComplex),props,pitch,pitch,nky,cudaMemcpyDeviceToHost);

Unfortunately all of the elements still seem to be zero so my kernel probably isn’t working.

avidday · February 16, 2011, 5:45pm

You can’t use cudaMemcpy2D, it is intended for copying into texture arrays which can only be accessed via texture reads inside kernels. Just use the standard cudaMemcpy to transfer the host data to device memory.

AdamDyson · February 16, 2011, 6:22pm

Ok I’ll try it the other way then.

Thanks for all the help :)

AdamDyson · February 16, 2011, 8:48pm

Ok, I realise this is a lot to ask, but if anyone could turn this into a working example for me I would be eternally grateful. I’ve tried numerous different ways to make this work but I still haven’t managed it. I think if I had one example of a working kernel I could write the rest of them myself, but I can’t even manage that.

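The purpose of the kernel is to take two arrays of length A and B (i.e. A×1 and B×1) and create a 2D array of size A×B where each element is something like exp[i·f(a, b)], with a and b taken from the two input arrays (in the code below, f(a, b) = π·dz·(a² + b²)).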

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <iostream>
#include <vector>

#include <cuComplex.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cufft.h>

using namespace std;

/**
 * Fills a pitched 2D complex array with exp(i * Pi * dz * (kx^2 + ky^2)).
 *
 * Launch layout: 2D grid of 2D blocks; the x dimension walks the nkx columns
 * and the y dimension walks the nky rows. Threads outside nkx x nky exit
 * without touching memory, so the grid may overshoot the array.
 *
 * kxIn   device array read at [column], nkx entries expected
 * kyIn   device array read at [row], nky entries expected
 * nkx    number of columns
 * nky    number of rows
 * dz     scale factor applied to the phase
 * Props  output array whose rows start `pitch` bytes apart
 * pitch  row stride in BYTES, i.e. the value returned by cudaMallocPitch
 */
__global__ void CreatePropsKernel( float * kxIn, float * kyIn, int nkx, int nky, float dz, cuFloatComplex * Props, size_t pitch ){

	const int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
	const int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

	// Guard against the kernel's own size parameters (the original compared
	// against undeclared samplex/sampley).
	if (xIndex >= nkx || yIndex >= nky)
		return;

	// pitch counts bytes, not elements: locate the row through a char* and
	// only then index by column.
	cuFloatComplex * row = (cuFloatComplex *)((char *)Props + (size_t)yIndex * pitch);

	const float Pi  = 3.14159265f;
	const float k0x = kxIn[xIndex];
	const float k0y = kyIn[yIndex];

	// Evaluate the phase once and reuse it for the real and imaginary parts.
	const float phase = Pi * dz * (k0x * k0x + k0y * k0y);

	row[xIndex].x = cosf(phase);
	row[xIndex].y = sinf(phase);
}

// Prints the CUDA error text and terminates when a runtime call did not
// return cudaSuccess. `what` names the failing step in the message.
static void checkCuda(cudaError_t status, const char * what)
{
	if (status != cudaSuccess) {
		fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// Builds n frequency samples: index j maps to j/extent up to and including
// the midpoint index round(n/2 + 0.5), and to (j - n)/extent beyond it.
static vector<float> makeFrequencies(int n, float extent)
{
	const int mid = (int)round(n / 2 + 0.5);
	vector<float> freq(n);

	for (int j = 0; j < n; j++)
		freq[j] = (j > mid ? j - n : j) / extent;

	return freq;
}

// Computes the propagator array on the GPU and prints its first element.
int main(){

	const int	SampleinY	= 400;
	const float	dz		= 2.0f;
	const int	SampleinX	= 300;
	const float	sizex		= 30;
	const float	sizey		= 40;

	const int nkx = SampleinX;
	const int nky = SampleinY;

	// Put correct frequencies into vectors
	const vector<float> k0x = makeFrequencies(nkx, sizex);
	const vector<float> k0y = makeFrequencies(nky, sizey);

	// Get k0x and k0y onto device memory to use in Kernel.
	float* k0xdev = NULL;
	float* k0ydev = NULL;

	checkCuda(cudaMalloc((void**)&k0xdev, nkx * sizeof(float)), "cudaMalloc(k0xdev)");
	checkCuda(cudaMalloc((void**)&k0ydev, nky * sizeof(float)), "cudaMalloc(k0ydev)");
	checkCuda(cudaMemcpy(k0xdev, &k0x[0], nkx * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(k0x)");
	checkCuda(cudaMemcpy(k0ydev, &k0y[0], nky * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(k0y)");

	// Make array to hold propagation array on host and device.
	// The host side is ONE contiguous, zero-initialised buffer: the copy
	// routines move raw bytes and cannot follow a table of row pointers.
	const size_t rowBytes = nkx * sizeof(cuFloatComplex);
	vector<cuFloatComplex> hostprops((size_t)nkx * nky);

	// Device side: pitched allocation; `pitch` receives the row stride in bytes.
	cuFloatComplex* props = NULL;
	size_t pitch = 0;
	checkCuda(cudaMallocPitch((void**)&props, &pitch, rowBytes, nky), "cudaMallocPitch");

	// The kernel derives (x, y) from a 2D launch, so cover the nkx x nky
	// array with 2D blocks, rounding the grid up in both directions.
	const dim3 dimBlock(16, 16);
	const dim3 dimGrid((nkx + dimBlock.x - 1) / dimBlock.x,
	                   (nky + dimBlock.y - 1) / dimBlock.y);

	CreatePropsKernel<<<dimGrid,dimBlock>>>(k0xdev,k0ydev,nkx,nky,dz,props,pitch);
	checkCuda(cudaGetLastError(), "kernel launch");
	checkCuda(cudaDeviceSynchronize(), "kernel execution");

	// Copy array back from device to host array.
	// dpitch = packed host row size, spitch = padded device row size,
	// width = bytes actually copied per row (must not exceed either pitch).
	checkCuda(cudaMemcpy2D(&hostprops[0], rowBytes, props, pitch, rowBytes, nky,
	                       cudaMemcpyDeviceToHost), "cudaMemcpy2D");

	// Element (row 0, column 0) lives at flat offset 0; print its real and
	// imaginary parts.
	cout << "hostprops[0][0].x is " << hostprops[0].x << endl;
	cout << "hostprops[0][0].y is " << hostprops[0].y << endl;

	checkCuda(cudaFree(props), "cudaFree(props)");
	checkCuda(cudaFree(k0ydev), "cudaFree(k0ydev)");
	checkCuda(cudaFree(k0xdev), "cudaFree(k0xdev)");

	system("PAUSE");

	return 0;

}

avidday · February 17, 2011, 3:44pm

Would this help at all?

#include <math.h>

#include <assert.h>

#include <stdio.h>

#include <stdlib.h>

#include <cuComplex.h>

const float Pi = 3.14159265f;

// Checks the status of a CUDA runtime call. On failure the error text is
// written to stderr and, unless Abort is false, the process exits with the
// error code as its status. Success is a silent no-op.
inline void GPUassert(cudaError_t code, bool Abort=true)
{
    if (code == cudaSuccess)
        return;

    fprintf(stderr, "GPUassert: %s\n", cudaGetErrorString(code));

    if (!Abort)
        return;

    exit(code);
}

// Approximate equality of two complex numbers: returns 1 when the sum of the
// absolute differences of the real and imaginary parts is below 2 * tol,
// otherwise 0.
inline int cmplxCompare( const cuFloatComplex a, const cuFloatComplex b, float tol = 1e-6f)
{
    const float realGap = fabs(a.x - b.x);
    const float imagGap = fabs(a.y - b.y);
    const float budget  = 2.0f * tol;

    if ((realGap + imagGap) < budget)
        return 1;

    return 0;
}

/**
 * Writes exp(i * Pi * dz * (kx^2 + ky^2)) into a linear, row-major array.
 *
 * Launch layout: 2D grid of 2D blocks, x across columns and y across rows.
 * The grid may overshoot the array; surplus threads return without reading
 * or writing anything.
 *
 * kxIn   device array read at [column]; assumed to hold at least `pitch`
 *        entries, as it does in the accompanying main -- confirm for other callers
 * kyIn   device array read at [row]
 * dz     scale factor applied to the phase
 * Props  output array of Nmax elements
 * pitch  row length of Props in ELEMENTS (not bytes)
 * Nmax   total number of elements in Props; valid offsets are 0 .. Nmax-1
 */
__global__ void CreatePropsKernel( const float * kxIn, const float * kyIn, const float dz,
                cuFloatComplex * Props, const size_t pitch, const int Nmax )
{
    const size_t xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    const size_t yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // A column at or beyond the row length would alias the start of the next
    // row and read past the end of kxIn.
    if (xIndex >= pitch) return;

    // size_t arithmetic keeps the offset from overflowing int on large arrays.
    const size_t Index = yIndex * pitch + xIndex;

    // Offset Nmax itself is one past the end, so the test must be >=, not >.
    if (Nmax <= 0 || Index >= (size_t)Nmax) return;

    const float k0x = kxIn[xIndex];
    const float k0y = kyIn[yIndex];

    // Evaluate the phase once and reuse it for both components.
    const float phase = Pi * dz * (k0x * k0x + k0y * k0y);

    cuFloatComplex result;
    result.x = cosf(phase);
    result.y = sinf(phase);

    Props[Index] = result;
}

// Self-checking example: fills an nx x ny complex array on the GPU with
// CreatePropsKernel, copies it back with a plain cudaMemcpy and compares
// every element against the same formula evaluated on the host.
int main(void)
{
    const int nx = 128, ny = 256;
    const int Nmax = nx * ny;
    const size_t ctsize = sizeof(cuFloatComplex) * size_t(Nmax);
    const size_t fxtsize = sizeof(float) * size_t(nx);
    const size_t fytsize = sizeof(float) * size_t(ny);

    const float dz = 2.0f;

    // Host buffers: the two input vectors and the linear result array.
    float * kx = (float *)malloc(fxtsize);
    float * ky = (float *)malloc(fytsize);
    cuFloatComplex * cmplex2D = (cuFloatComplex *)malloc( ctsize );
    if (kx == NULL || ky == NULL || cmplex2D == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    for(int i = 0; i < nx; i++) {
        kx[i] = (float)drand48();
    }

    for(int i = 0; i < ny; i++) {
        ky[i] = (float)drand48();
    }

    // Device buffers (leading underscore marks device pointers).
    float * _kx, * _ky;
    cuFloatComplex * _cmplex2D;

    GPUassert( cudaMalloc((void **)&_cmplex2D, ctsize) );
    GPUassert( cudaMalloc((void **)&_kx, fxtsize) );
    GPUassert( cudaMalloc((void **)&_ky, fytsize) );

    GPUassert( cudaMemcpy(_kx, kx, fxtsize, cudaMemcpyHostToDevice) );
    GPUassert( cudaMemcpy(_ky, ky, fytsize, cudaMemcpyHostToDevice) );

    // Derive the grid from the problem size (rounding up) instead of
    // hard-coding it, so changing nx or ny cannot leave elements uncovered.
    const dim3 blocksize(8, 8);
    const dim3 gridsize((nx + blocksize.x - 1) / blocksize.x,
                        (ny + blocksize.y - 1) / blocksize.y);

    // The array is unpadded, so the pitch in elements is simply nx.
    CreatePropsKernel <<< gridsize, blocksize >>> ( _kx, _ky, dz, _cmplex2D, size_t(nx), Nmax);
    GPUassert( cudaGetLastError() );        // launch-configuration errors
    GPUassert( cudaDeviceSynchronize() );   // errors raised while the kernel ran

    GPUassert( cudaMemcpy( cmplex2D, _cmplex2D, ctsize, cudaMemcpyDeviceToHost) );

    // Verification on Host
    for(int i = 0; i < nx; i++) {
        float k0x = kx[i] * kx[i];

        for(int j = 0; j < ny; j++) {
            float k0y = ky[j] * ky[j];

            cuFloatComplex resulth;
            resulth.x = cosf(Pi*dz*(k0x+k0y));
            resulth.y = sinf(Pi*dz*(k0x+k0y));

            // Same row-major layout as the kernel: column i, row j.
            assert( cmplxCompare( resulth, cmplex2D[i + j*nx] ) );
        }
    }

    // Release everything that was allocated above.
    GPUassert( cudaFree(_ky) );
    GPUassert( cudaFree(_kx) );
    GPUassert( cudaFree(_cmplex2D) );
    free(cmplex2D);
    free(ky);
    free(kx);

    // cudaDeviceReset replaces the deprecated cudaThreadExit.
    GPUassert( cudaDeviceReset() );

    return 0;
}

Topic		Replies	Views
help cudaMemcpy2d Trying to modify a 2d array on cuda device CUDA Programming and Performance	8	5087	September 11, 2010
2D array & Memory space Mostly about cudaMallocPitch & cudaMemcpy2D CUDA Programming and Performance	1	1516	October 15, 2009
2D arrays with cuda confusion CUDA Programming and Performance	2	1131	May 9, 2010
CUDA 2D Array Problem Need help to manipulate 2D arrays in CUDA CUDA Programming and Performance	4	26518	March 17, 2011
Can't get copyDeviceToHost to work with cudaMemcpy2D CUDA Programming and Performance	0	3648	November 13, 2009
CudaMallocPitch and CudaMemcpy2D CUDA Programming and Performance	7	5737	August 3, 2015
2D Array Not Updated CUDA Programming and Performance	6	5322	May 4, 2010
2 dimensional array CUDA Programming and Performance	7	5736	January 2, 2009
2D Array CUDA Programming and Performance	16	77316	January 20, 2012
Problems using/copying 2D arrays cudaMallocPitch, cudaMemcpy2D CUDA Programming and Performance	0	3553	December 8, 2008

Very confused with 2d arrays

Related topics