Working with array in CUDA

Hi
I am trying to transpose n x n array in GPU using CUDA but i am having trouble with it. I can not transfer the array to GPU using cudamemcpy() or cudaMemCpy2D(). Nothing worked out. Can you plz post the code to transfer n x n array in GPU using CUDA? The array is multi-dimensional and consists of many rows and columns…
Please someone help me out.

How do you allocated memory for the multidimensional array. Post your code.

Or check out the transpose example in the SDK.

Here’s a code…Actually i read an xls file which was in comma separated format and put that in array…I tried to send the array to GPU but failed to do so…
When i transpose the array outside of GPU i.e. outside CUDA function, it works properly but i am unable to send the array to GPU for transpose…Can you help me out plz…

#include “cuda_runtime.h”
#include “device_launch_parameters.h”
#include <stdlib.h>
#include <stdafx.h>
#define MAX_LEN 5000

FILE *streamread, *streamwrite;

global void transpose(float ** qp, int numrows, int numcols)
{
float temp = 0;
for (int i = 0; i < numcols; ++i)
{
for (int j = 0; j < numrows; ++j)
{
temp = qp[i][j];
qp[i][j] = qp[j][i];
qp[j][i] = temp;
}
}
}

int main()
{
char s[MAX_LEN + 1];
int numrows = 0, numcols = 0;
float **queryPoint = 0;
char *str;
char delims = “\t”;
char *result;

if( fopen_s( &streamread, “Raw Data 16.xls”, “r” ) != 0 )
{
printf( “The file Raw Data 16.xls was not opened\n” );
getchar();
exit(1);
}

if( fopen_s( &streamwrite, “Transposed Raw Data 16.xls”, “w” ) != 0 )
{
printf( “The file Transposed Raw Data 16.xls was not opened\n” );
getchar();
exit(1);
}

queryPoint = (float**) malloc(sizeof(float **));

while( !feof( streamread ) )
{
fgets(s, MAX_LEN + 1, streamread);
if (strlen(s) <= 1) continue;
queryPoint = (float **) realloc(queryPoint, sizeof(float) * (numrows + 1));
str = s;
result = strtok( str, delims );
if (numcols != 0)
queryPoint[numrows] = (float *) malloc(sizeof(float) * (numcols));
else
queryPoint[numrows] = (float *) malloc(sizeof(float *) * MAX_LEN);
numcols = 0;
while( result != NULL )
{
numcols += 1;
queryPoint[numrows][numcols - 1] = atof(result);
result = strtok( NULL, delims );
}
numrows += 1;
s[0] = ‘\0’;
}

float** devPtr;
size_t pitch;
cudaError_t err = cudaSuccess;
float ** resultset = (float**) malloc(sizeof(float **) * numrows);
for (int i = 0; i < numrows; i++)
{
resultset[i] = (float *) malloc(sizeof(float *) * numcols);
}

cudaMallocPitch((void**)&devPtr, &pitch, numcols * sizeof(float), numrows);
cudaMemcpy(devPtr, queryPoint, numrows * numcols * sizeof(float*), cudaMemcpyHostToDevice);

transpose <<< 1, 1 >>> (devPtr, numrows, numcols); //Executes the function on device kernel
err = cudaMemcpy(queryPoint, devPtr, numrows * sizeof(float*), cudaMemcpyDeviceToHost);

fclose( streamread );
fclose( streamwrite );
free(queryPoint);
free(resultset);
cudaFree(devPtr);
printf(“Transposed successfully.”);
getchar();
}

I am using Visual Studio 2008…Please help me out…

I guess the cudaMemcpy will fail because queryPoint is not contiguous. since you are allocating the cols as

The queryPoint won’t be contiguous. So, cudaMemcpy will fail.

What I would suggest is After you cudaMemcpy(devPtr, queryPoint, …, todevice), Memcopy the devPtr back to host and check its values. If it doesn’t have the right values then the error is due to what I said above.

It is not showing a right value…it is showing different values…so what shall i do now??

As I told you before, this error is due to your memory allocation. Your memory allocation is not contiguous.

What I would suggest is to read the file and store the values in temporary array (say temp). Then create a contiguous array (queryPoint) and copy the data from temp to querypoint. Then copy the follow your remaining steps[cudaMemcpy and etc…]

Here’s my new code. The problem is it is working fine with limited rows and columns for xls file test.xls and test1.xls but when I try to transpose large amount of rows and columns from xls file Raw data 16.xls(attached below), it is writing the value -0.0013270393 in every row and column. What might be the reason behind this? Can you please help me out?

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <stdafx.h>

#include <stdio.h>

#define MAX_LEN 10000

__global__ void kernel(float *odata, float *idata, int width, int height)

{

	int xIndex = blockIdx.x*blockDim.x + threadIdx.x;

	int yIndex = blockIdx.y*blockDim.y + threadIdx.y;

	int index_in = xIndex + width * yIndex;

	int index_out = yIndex + height * xIndex;

	for (int i=0; i<height; i+=1) 

	{

		odata[index_out+i] = idata[index_in+i*width];

	}

}

FILE *streamread, *streamwrite;

int main()

{

	char *arrayInFiles[] = {

      "test.xls", "test1.xls"

};

	char *arrayOutFiles[] = {

      "Ttest.xls", "Ttest1.xls"

};

	int numElements = sizeof(arrayInFiles) / sizeof(arrayInFiles[0]);

for(int i = 0; i <numElements; i++)

	  {       

	char s[MAX_LEN + 1];

	int numelems = 0, numrows = 0;

	float *queryPoint = 0;

	char *str;

	char delims[] = "\t";

	char *result;

	if( fopen_s( &streamread, arrayInFiles[i], "r" ) != 0 )

	{

		printf( "The file was not opened\n" );

		getchar();

		exit(1);

	}

	if( fopen_s( &streamwrite, arrayOutFiles[i], "w" ) != 0 )

	{

		printf( "The file was not opened for writing\n" );

		getchar();

		exit(1);

	}

	//int curprod;

	while( !feof( streamread ) )

	{	  

		fgets(s, MAX_LEN + 1, streamread);

		if (strlen(s) <= 1) continue;

		//queryPoint = (float *) realloc(queryPoint, sizeof(float) * (numelems + 1));

		str = s;		

		result = strtok( str, delims );

		if (numelems == 0)		

		{

			queryPoint = (float *) malloc(sizeof(float *) * MAX_LEN);

		}

		/*curprod = numelems * numcols;

		numcols = 0;*/

		while( result != NULL ) 

		{

			queryPoint = (float *) realloc(queryPoint, sizeof(float) * (numelems + 1));	

			queryPoint[numelems] =  atof(result);

			//printf("%d = %f ",numelems, atof(result));

			result = strtok( NULL, delims );	

			numelems += 1;

		}		

		numrows += 1;

		s[0] = '

#include “cuda_runtime.h”

#include “device_launch_parameters.h”

#include <stdlib.h>

#include <stdafx.h>

#include <stdio.h>

#define MAX_LEN 10000

global void kernel(float *odata, float *idata, int width, int height)

{

int xIndex = blockIdx.x*blockDim.x + threadIdx.x;

int yIndex = blockIdx.y*blockDim.y + threadIdx.y;

int index_in = xIndex + width * yIndex;

int index_out = yIndex + height * xIndex;

for (int i=0; i<height; i+=1) 

{

	odata[index_out+i] = idata[index_in+i*width];

}

}

FILE *streamread, *streamwrite;

int main()

{

char *arrayInFiles[] = {

  "test.xls", "test1.xls"

};

char *arrayOutFiles[] = {

  "Ttest.xls", "Ttest1.xls"

};

int numElements = sizeof(arrayInFiles) / sizeof(arrayInFiles[0]);

for(int i = 0; i <numElements; i++)

  {       

char s[MAX_LEN + 1];

int numelems = 0, numrows = 0;

float *queryPoint = 0;

char *str;

char delims[] = "\t";

char *result;

if( fopen_s( &streamread, arrayInFiles[i], "r" ) != 0 )

{

	printf( "The file was not opened\n" );

	getchar();

	exit(1);

}

if( fopen_s( &streamwrite, arrayOutFiles[i], "w" ) != 0 )

{

	printf( "The file was not opened for writing\n" );

	getchar();

	exit(1);

}

//int curprod;

while( !feof( streamread ) )

{	  

	fgets(s, MAX_LEN + 1, streamread);

	if (strlen(s) <= 1) continue;

	//queryPoint = (float *) realloc(queryPoint, sizeof(float) * (numelems + 1));

	str = s;		

	result = strtok( str, delims );

	if (numelems == 0)		

	{

		queryPoint = (float *) malloc(sizeof(float *) * MAX_LEN);

	}

	/*curprod = numelems * numcols;

	numcols = 0;*/

	while( result != NULL ) 

	{

		queryPoint = (float *) realloc(queryPoint, sizeof(float) * (numelems + 1));	

		queryPoint[numelems] =  atof(result);

		//printf("%d = %f ",numelems, atof(result));

		result = strtok( NULL, delims );	

		numelems += 1;

	}		

	numrows += 1;

	s[0] = '\0';

}

//printf("%d\n", numrows);

float mem_size = (numelems) * sizeof(float);

float *h_odata = (float *) malloc(mem_size);

float *d_idata, *d_odata;

cudaMalloc( (void**) &d_idata, mem_size);

cudaMalloc( (void**) &d_odata, mem_size);

cudaMemcpy(d_idata, queryPoint, mem_size, cudaMemcpyHostToDevice);

//printf("%d %d\n", numelems/numrows, numrows);

kernel<<<numrows, numelems/numrows>>>(d_odata, d_idata, numelems/numrows, numrows);	

cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);

int traceline = 0;

for (int j = 0; j < numelems; ++j)

{

	//fprintf(streamwrite, "%.10f\n", queryPoint[j]);

	traceline += 1;

	if (traceline == numrows)

	{

		fprintf(streamwrite, "%.10f\n", h_odata[j]);

		traceline = 0;

	}

	else

		fprintf(streamwrite, "%.10f\t", h_odata[j]);

}

fflush(streamwrite);	  

fclose( streamread );

fclose( streamwrite );

free(queryPoint);

}

printf("Transposed successfully.");

getchar();	

return 0;

}

';

	}

	//printf("%d\n", numrows);

	float mem_size = (numelems) * sizeof(float);

	float *h_odata = (float *) malloc(mem_size);

	float *d_idata, *d_odata;

	cudaMalloc( (void**) &d_idata, mem_size);

	cudaMalloc( (void**) &d_odata, mem_size);

	cudaMemcpy(d_idata, queryPoint, mem_size, cudaMemcpyHostToDevice);

	//printf("%d %d\n", numelems/numrows, numrows);

	kernel<<<numrows, numelems/numrows>>>(d_odata, d_idata, numelems/numrows, numrows);	

	cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);

	int traceline = 0;

	for (int j = 0; j < numelems; ++j)

	{

		//fprintf(streamwrite, "%.10f\n", queryPoint[j]);

		traceline += 1;

		if (traceline == numrows)

		{

			fprintf(streamwrite, "%.10f\n", h_odata[j]);

			traceline = 0;

		}

		else

			fprintf(streamwrite, "%.10f\t", h_odata[j]);

	}

	fflush(streamwrite);	  

	fclose( streamread );

	fclose( streamwrite );

	free(queryPoint);

}

	printf("Transposed successfully.");

	getchar();	

	return 0;

}

Raw Data 16.xls (1.66 MB)

One possible problem is that you are making the number of threads per block equal to the width of the array ( i.e. numelems/numrows), so could easily exceed the maximum number of threads per block defined for cuda.

You also need to test for possible error code after running the kernel e.g.

kernel<<<numrows, numelems/numrows>>>(d_odata, d_idata, numelems/numrows, numrows);

    checkCUDAError("kernel execution");
void checkCUDAError(const char *msg)

{

    cudaError_t err = cudaGetLastError();

    if( cudaSuccess != err) 

    {

        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );

        exit(-1);

    }                         

}

Also shouldn’t mem_size be of type size_t

if (traceline == numrows) << should this be numrows or on columns ?

So for the attached xls file, what should be the parameter to be passed in the kernel? Plz help me out…