Working with array in CUDA

Paritosh · July 16, 2011, 2:00pm

Hi
I am trying to transpose n x n array in GPU using CUDA but i am having trouble with it. I can not transfer the array to GPU using cudamemcpy() or cudaMemCpy2D(). Nothing worked out. Can you plz post the code to transfer n x n array in GPU using CUDA? The array is multi-dimensional and consists of many rows and columns…
Please someone help me out.

veda87 · July 16, 2011, 8:10pm

How do you allocated memory for the multidimensional array. Post your code.

tera · July 16, 2011, 8:54pm

Or check out the transpose example in the SDK.

Paritosh · July 17, 2011, 4:05pm

Here’s a code…Actually i read an xls file which was in comma separated format and put that in array…I tried to send the array to GPU but failed to do so…
When i transpose the array outside of GPU i.e. outside CUDA function, it works properly but i am unable to send the array to GPU for transpose…Can you help me out plz…

#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdlib.h>
#include <stdafx.h>
#define MAX_LEN 5000

FILE *streamread, *streamwrite;

global void transpose(float ** qp, int numrows, int numcols)
{
float temp = 0;
for (int i = 0; i < numcols; ++i)
{
for (int j = 0; j < numrows; ++j)
{
temp = qp[i][j];
qp[i][j] = qp[j][i];
qp[j][i] = temp;
}
}
}

int main()
{
char s[MAX_LEN + 1];
int numrows = 0, numcols = 0;
float **queryPoint = 0;
char *str;
char delims = “\t”;
char *result;

if( fopen_s( &streamread, “Raw Data 16.xls”, “r” ) != 0 )
{
printf( “The file Raw Data 16.xls was not opened\n” );
getchar();
exit(1);
}

if( fopen_s( &streamwrite, “Transposed Raw Data 16.xls”, “w” ) != 0 )
{
printf( “The file Transposed Raw Data 16.xls was not opened\n” );
getchar();
exit(1);
}

queryPoint = (float**) malloc(sizeof(float **));

while( !feof( streamread ) )
{
fgets(s, MAX_LEN + 1, streamread);
if (strlen(s) <= 1) continue;
queryPoint = (float **) realloc(queryPoint, sizeof(float) * (numrows + 1));
str = s;
result = strtok( str, delims );
if (numcols != 0)
queryPoint[numrows] = (float *) malloc(sizeof(float) * (numcols));
else
queryPoint[numrows] = (float *) malloc(sizeof(float *) * MAX_LEN);
numcols = 0;
while( result != NULL )
{
numcols += 1;
queryPoint[numrows][numcols - 1] = atof(result);
result = strtok( NULL, delims );
}
numrows += 1;
s[0] = ‘\0’;
}

float** devPtr;
size_t pitch;
cudaError_t err = cudaSuccess;
float ** resultset = (float**) malloc(sizeof(float **) * numrows);
for (int i = 0; i < numrows; i++)
{
resultset[i] = (float *) malloc(sizeof(float *) * numcols);
}

cudaMallocPitch((void**)&devPtr, &pitch, numcols * sizeof(float), numrows);
cudaMemcpy(devPtr, queryPoint, numrows * numcols * sizeof(float*), cudaMemcpyHostToDevice);

transpose <<< 1, 1 >>> (devPtr, numrows, numcols); //Executes the function on device kernel
err = cudaMemcpy(queryPoint, devPtr, numrows * sizeof(float*), cudaMemcpyDeviceToHost);

fclose( streamread );
fclose( streamwrite );
free(queryPoint);
free(resultset);
cudaFree(devPtr);
printf(“Transposed successfully.”);
getchar();
}

I am using Visual Studio 2008…Please help me out…

veda87 · July 17, 2011, 8:44pm

I guess the cudaMemcpy will fail because queryPoint is not contiguous. since you are allocating the cols as

The queryPoint won’t be contiguous. So, cudaMemcpy will fail.

What I would suggest is After you cudaMemcpy(devPtr, queryPoint, …, todevice), Memcopy the devPtr back to host and check its values. If it doesn’t have the right values then the error is due to what I said above.

Paritosh · July 18, 2011, 2:42am

It is not showing a right value…it is showing different values…so what shall i do now??

veda87 · July 18, 2011, 4:12am

As I told you before, this error is due to your memory allocation. Your memory allocation is not contiguous.

What I would suggest is to read the file and store the values in temporary array (say temp). Then create a contiguous array (queryPoint) and copy the data from temp to querypoint. Then copy the follow your remaining steps[cudaMemcpy and etc…]

Paritosh · July 19, 2011, 3:04am

Here’s my new code. The problem is it is working fine with limited rows and columns for xls file test.xls and test1.xls but when I try to transpose large amount of rows and columns from xls file Raw data 16.xls(attached below), it is writing the value -0.0013270393 in every row and column. What might be the reason behind this? Can you please help me out?

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <stdafx.h>

#include <stdio.h>

#define MAX_LEN 10000

__global__ void kernel(float *odata, float *idata, int width, int height)

{

	int xIndex = blockIdx.x*blockDim.x + threadIdx.x;

	int yIndex = blockIdx.y*blockDim.y + threadIdx.y;

	int index_in = xIndex + width * yIndex;

	int index_out = yIndex + height * xIndex;

	for (int i=0; i<height; i+=1) 

	{

		odata[index_out+i] = idata[index_in+i*width];

	}

}

FILE *streamread, *streamwrite;

int main()

{

	char *arrayInFiles[] = {

      "test.xls", "test1.xls"

};

	char *arrayOutFiles[] = {

      "Ttest.xls", "Ttest1.xls"

};

	int numElements = sizeof(arrayInFiles) / sizeof(arrayInFiles[0]);

for(int i = 0; i <numElements; i++)

	  {       

	char s[MAX_LEN + 1];

	int numelems = 0, numrows = 0;

	float *queryPoint = 0;

	char *str;

	char delims[] = "\t";

	char *result;

	if( fopen_s( &streamread, arrayInFiles[i], "r" ) != 0 )

	{

		printf( "The file was not opened\n" );

		getchar();

		exit(1);

	}

	if( fopen_s( &streamwrite, arrayOutFiles[i], "w" ) != 0 )

	{

		printf( "The file was not opened for writing\n" );

		getchar();

		exit(1);

	}

	//int curprod;

	while( !feof( streamread ) )

	{	  

		fgets(s, MAX_LEN + 1, streamread);

		if (strlen(s) <= 1) continue;

		//queryPoint = (float *) realloc(queryPoint, sizeof(float) * (numelems + 1));

		str = s;		

		result = strtok( str, delims );

		if (numelems == 0)		

		{

			queryPoint = (float *) malloc(sizeof(float *) * MAX_LEN);

		}

		/*curprod = numelems * numcols;

		numcols = 0;*/

		while( result != NULL ) 

		{

			queryPoint = (float *) realloc(queryPoint, sizeof(float) * (numelems + 1));	

			queryPoint[numelems] =  atof(result);

			//printf("%d = %f ",numelems, atof(result));

			result = strtok( NULL, delims );	

			numelems += 1;

		}		

		numrows += 1;

		s[0] = '

#include “cuda_runtime.h”

#include “device_launch_parameters.h”

#include <stdlib.h>

#include <stdafx.h>

#include <stdio.h>

#define MAX_LEN 10000

global void kernel(float *odata, float *idata, int width, int height)

{

int xIndex = blockIdx.x*blockDim.x + threadIdx.x;

int yIndex = blockIdx.y*blockDim.y + threadIdx.y;

int index_in = xIndex + width * yIndex;

int index_out = yIndex + height * xIndex;

for (int i=0; i<height; i+=1) 

{

	odata[index_out+i] = idata[index_in+i*width];

}

}

FILE *streamread, *streamwrite;

int main()

{

char *arrayInFiles[] = {

  "test.xls", "test1.xls"

};

char *arrayOutFiles[] = {

  "Ttest.xls", "Ttest1.xls"

};

int numElements = sizeof(arrayInFiles) / sizeof(arrayInFiles[0]);

for(int i = 0; i <numElements; i++)

  {       

char s[MAX_LEN + 1];

int numelems = 0, numrows = 0;

float *queryPoint = 0;

char *str;

char delims[] = "\t";

char *result;

if( fopen_s( &streamread, arrayInFiles[i], "r" ) != 0 )

{

	printf( "The file was not opened\n" );

	getchar();

	exit(1);

}

if( fopen_s( &streamwrite, arrayOutFiles[i], "w" ) != 0 )

{

	printf( "The file was not opened for writing\n" );

	getchar();

	exit(1);

}

//int curprod;

while( !feof( streamread ) )

{	  

	fgets(s, MAX_LEN + 1, streamread);

	if (strlen(s) <= 1) continue;

	//queryPoint = (float *) realloc(queryPoint, sizeof(float) * (numelems + 1));

	str = s;		

	result = strtok( str, delims );

	if (numelems == 0)		

	{

		queryPoint = (float *) malloc(sizeof(float *) * MAX_LEN);

	}

	/*curprod = numelems * numcols;

	numcols = 0;*/

	while( result != NULL ) 

	{

		queryPoint = (float *) realloc(queryPoint, sizeof(float) * (numelems + 1));	

		queryPoint[numelems] =  atof(result);

		//printf("%d = %f ",numelems, atof(result));

		result = strtok( NULL, delims );	

		numelems += 1;

	}		

	numrows += 1;

	s[0] = '\0';

}

//printf("%d\n", numrows);

float mem_size = (numelems) * sizeof(float);

float *h_odata = (float *) malloc(mem_size);

float *d_idata, *d_odata;

cudaMalloc( (void**) &d_idata, mem_size);

cudaMalloc( (void**) &d_odata, mem_size);

cudaMemcpy(d_idata, queryPoint, mem_size, cudaMemcpyHostToDevice);

//printf("%d %d\n", numelems/numrows, numrows);

kernel<<<numrows, numelems/numrows>>>(d_odata, d_idata, numelems/numrows, numrows);	

cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);

int traceline = 0;

for (int j = 0; j < numelems; ++j)

{

	//fprintf(streamwrite, "%.10f\n", queryPoint[j]);

	traceline += 1;

	if (traceline == numrows)

	{

		fprintf(streamwrite, "%.10f\n", h_odata[j]);

		traceline = 0;

	}

	else

		fprintf(streamwrite, "%.10f\t", h_odata[j]);

}

fflush(streamwrite);	  

fclose( streamread );

fclose( streamwrite );

free(queryPoint);

}

printf("Transposed successfully.");

getchar();	

return 0;

}

';

	}

	//printf("%d\n", numrows);

	float mem_size = (numelems) * sizeof(float);

	float *h_odata = (float *) malloc(mem_size);

	float *d_idata, *d_odata;

	cudaMalloc( (void**) &d_idata, mem_size);

	cudaMalloc( (void**) &d_odata, mem_size);

	cudaMemcpy(d_idata, queryPoint, mem_size, cudaMemcpyHostToDevice);

	//printf("%d %d\n", numelems/numrows, numrows);

	kernel<<<numrows, numelems/numrows>>>(d_odata, d_idata, numelems/numrows, numrows);	

	cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);

	int traceline = 0;

	for (int j = 0; j < numelems; ++j)

	{

		//fprintf(streamwrite, "%.10f\n", queryPoint[j]);

		traceline += 1;

		if (traceline == numrows)

		{

			fprintf(streamwrite, "%.10f\n", h_odata[j]);

			traceline = 0;

		}

		else

			fprintf(streamwrite, "%.10f\t", h_odata[j]);

	}

	fflush(streamwrite);	  

	fclose( streamread );

	fclose( streamwrite );

	free(queryPoint);

}

	printf("Transposed successfully.");

	getchar();	

	return 0;

}

Raw Data 16.xls (1.66 MB)

kbam · July 19, 2011, 6:15am

One possible problem is that you are making the number of threads per block equal to the width of the array ( i.e. numelems/numrows), so could easily exceed the maximum number of threads per block defined for cuda.

You also need to test for possible error code after running the kernel e.g.

kernel<<<numrows, numelems/numrows>>>(d_odata, d_idata, numelems/numrows, numrows);

    checkCUDAError("kernel execution");

void checkCUDAError(const char *msg)

{

    cudaError_t err = cudaGetLastError();

    if( cudaSuccess != err) 

    {

        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );

        exit(-1);

    }                         

}

Also shouldn’t mem_size be of type size_t

if (traceline == numrows) << should this be numrows or on columns ?

Paritosh · July 21, 2011, 2:08pm

So for the attached xls file, what should be the parameter to be passed in the kernel? Plz help me out…

Topic		Replies	Views
Multi-threading in host(CPU) CUDA Programming and Performance	2	828	August 26, 2011
Parallelism or Threading problem CUDA Programming and Performance	4	2321	August 12, 2011
2D Array Not Updated CUDA Programming and Performance	6	5349	May 4, 2010
Transpose matrix like 8x1M in bytes by memcpy2d CUDA Programming and Performance cuda	10	212	November 13, 2024
2D matrix transfer and handling problem Help required CUDA Programming and Performance	7	1545	July 13, 2010
transposed matrix of size N*M CUDA Programming and Performance	0	764	August 12, 2011
Passing a multidimensional array to kernel how to allocate space in host and pass to device? CUDA Programming and Performance	12	16402	November 22, 2014
How to cudaMalloc two-dimensional array ? CUDA Programming and Performance	46	67535	September 7, 2023
how to allocate a 2d array in cudamalloc? CUDA Programming and Performance	0	2059	April 4, 2014
2d array testing in very simple code using CUDA CUDA Programming and Performance	29	30637	November 15, 2010

Working with array in CUDA

Related topics