Parallelism or Threading problem

Paritosh · August 11, 2011, 4:22am

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <stdafx.h>

#include <stdio.h>

#include <string.h>

#include <time.h>

#define MAX_LEN 10000

/*
 * Transposes a row-major matrix on the device.
 *
 * idata  : input, indexed as idata[col + width * row]   (height rows of width floats)
 * odata  : output, indexed as odata[row + height * col] (width rows of height floats)
 * width  : number of columns of the input matrix
 * height : number of rows of the input matrix
 *
 * Each thread copies one input column (xIndex) into one output row, starting
 * at input row yIndex and walking down to the last row. With a 1-D launch
 * yIndex is 0, so every thread handles a full column. Threads whose indices
 * fall outside the matrix do nothing, so the grid may be larger than the
 * matrix (e.g. rounded up to a multiple of the block size).
 */
__global__ void kernel(float *odata, float *idata, int width, int height)
{
	int xIndex = blockIdx.x*blockDim.x + threadIdx.x;
	int yIndex = blockIdx.y*blockDim.y + threadIdx.y;

	// Surplus threads of an oversized grid must not touch memory.
	if (xIndex >= width || yIndex >= height)
		return;

	int index_in = xIndex + width * yIndex;
	int index_out = yIndex + height * xIndex;

	// Stop at the last row: row (yIndex + i) must stay below height so that
	// neither the read from idata nor the write to odata leaves the matrix.
	for (int i = 0; yIndex + i < height; i += 1)
	{
		odata[index_out+i] = idata[index_in+i*width];
	}
}

FILE *streamread, *streamwrite;

/* Reports a fatal host-side error on stderr, waits for a key press and exits with status 1. */
static void fatalError(const char *message)
{
	fprintf(stderr, "%s\n", message);
	getchar();
	exit(1);
}

/* Exits with a descriptive message when a CUDA runtime call did not return cudaSuccess. */
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		getchar();
		exit(1);
	}
}

/*
 * Transposes a fixed list of tab-separated text files on the GPU.
 *
 * For every input file the numbers are parsed row by row into one flat,
 * row-major float array, copied to the device, transposed by kernel() and
 * written back out as tab-separated text, one transposed row per line.
 * Files are processed one after another. The start and end wall-clock times
 * are printed, and the program waits for a key press before returning.
 *
 * Returns 0 on success; exits with status 1 on any I/O, memory or CUDA error.
 */
int main()
{
	time_t mytime = time(NULL);
	printf("Initial time: %s", ctime(&mytime));

	const char *arrayInFiles[] = {
		"raw data 0.xls", "raw data 1.xls", "raw data 2.xls", "raw data 3.xls", "raw data 4.xls", "raw data 5.xls",
		"raw data 6.xls", "raw data 7.xls", "raw data 8.xls", "raw data 9.xls", "raw data 10.xls", "raw data 11.xls",
		"raw data 12.xls", "raw data 13.xls", "raw data 14.xls", "raw data 15.xls", "raw data 16.xls", "raw data 17.xls"
	};

	const char *arrayOutFiles[] = {
		"t_raw data 0.xls", "t_raw data 1.xls", "t_raw data 2.xls", "t_raw data 3.xls", "t_raw data 4.xls", "t_raw data 5.xls",
		"t_raw data 6.xls", "t_raw data 7.xls", "t_raw data 8.xls", "t_raw data 9.xls", "t_raw data 10.xls", "t_raw data 11.xls",
		"t_raw data 12.xls", "t_raw data 13.xls", "t_raw data 14.xls", "t_raw data 15.xls", "t_raw data 16.xls", "t_raw data 17.xls"
	};

	int numElements = sizeof(arrayInFiles) / sizeof(arrayInFiles[0]);

	for (int i = 0; i < numElements; i++)
	{
		char s[MAX_LEN + 1];
		int numelems = 0, numrows = 0;
		size_t capacity = 0;		// number of floats queryPoint can currently hold
		float *queryPoint = NULL;
		const char delims[] = "\t";
		char *result;

		if( fopen_s( &streamread, arrayInFiles[i], "r" ) != 0 )
		{
			printf( "The file was not opened\n" );
			getchar();
			exit(1);
		}

		if( fopen_s( &streamwrite, arrayOutFiles[i], "w" ) != 0 )
		{
			printf( "The file was not opened for writing\n" );
			getchar();
			exit(1);
		}

		// Drive the loop with the result of fgets(): feof() only becomes true
		// after a read has already failed, so testing it first would process
		// a stale (or, for an empty file, uninitialised) buffer.
		while (fgets(s, MAX_LEN + 1, streamread) != NULL)
		{
			size_t len = strlen(s);

			// Skip blank lines.
			if (len <= 1) continue;

			// A full buffer without a newline means the line did not fit,
			// unless the file simply ends here. A split line would be counted
			// as two rows and could cut a number in half.
			if (len == MAX_LEN && s[len - 1] != '\n')
			{
				if (fgetc(streamread) != EOF)
					fatalError("Input line is longer than MAX_LEN characters");
			}

			result = strtok(s, delims);

			while (result != NULL)
			{
				// Grow geometrically instead of calling realloc() once per value.
				if ((size_t) numelems == capacity)
				{
					size_t newCapacity = (capacity == 0) ? 1024 : capacity * 2;
					float *grown = (float *) realloc(queryPoint, newCapacity * sizeof(float));
					if (grown == NULL)
						fatalError("Out of host memory while reading input");
					queryPoint = grown;
					capacity = newCapacity;
				}

				queryPoint[numelems] = (float) atof(result);
				result = strtok(NULL, delims);
				numelems += 1;
			}

			numrows += 1;
		}

		// Nothing to transpose: leave the (empty) output file and move on.
		// This also avoids the division by numrows below.
		if (numrows == 0 || numelems == 0)
		{
			fclose( streamread );
			fclose( streamwrite );
			free(queryPoint);
			continue;
		}

		// The flat array is only a matrix if every row has the same length.
		if (numelems % numrows != 0)
			fatalError("Input rows do not all have the same number of columns");

		int width = numelems / numrows;		// columns of the input matrix
		size_t mem_size = (size_t) numelems * sizeof(float);

		float *h_odata = (float *) malloc(mem_size);
		if (h_odata == NULL)
			fatalError("Out of host memory for the transposed matrix");

		float *d_idata = NULL, *d_odata = NULL;

		checkCuda(cudaMalloc( (void**) &d_idata, mem_size), "device allocation (input)");
		checkCuda(cudaMalloc( (void**) &d_odata, mem_size), "device allocation (output)");
		checkCuda(cudaMemcpy(d_idata, queryPoint, mem_size, cudaMemcpyHostToDevice), "host-to-device copy");

		// One single-thread block per input column, so every launched thread
		// maps to a valid column index in [0, width).
		kernel<<<width, 1>>>(d_odata, d_idata, width, numrows);
		checkCuda(cudaGetLastError(), "kernel launch");

		// The blocking copy also waits for the kernel and surfaces any error
		// raised while it was running.
		checkCuda(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost), "device-to-host copy");

		// A transposed row holds numrows values: end the line after every
		// numrows-th value, separate the others with tabs.
		for (int j = 0; j < numelems; ++j)
		{
			char separator = ((j + 1) % numrows == 0) ? '\n' : '\t';
			fprintf(streamwrite, "%.10f%c", h_odata[j], separator);
		}

		fflush(streamwrite);
		fclose( streamread );
		fclose( streamwrite );

		// Release everything allocated for this file before the next one.
		free(queryPoint);
		free(h_odata);
		checkCuda(cudaFree(d_idata), "device free (input)");
		checkCuda(cudaFree(d_odata), "device free (output)");
	}

	printf("Transposed successfully.\n");

	time_t mytime1 = time(NULL);
	printf("Final time: %s", ctime(&mytime1));

	getchar();

	return 0;
}

This is my code to transpose multiple xls files using CUDA, but the time taken for compilation is very long, as this code does not make use of parallelism. It transposes the raw data files one after another. Can you please tell me how to transpose all the files simultaneously? I have not been able to make use of the GPU’s multi-threading capabilities. Please help me out…

tera · August 11, 2011, 8:56am

See the SDK example, which comes with an instructive whitepaper, for advice on how to efficiently transpose a matrix in CUDA. However, in your case there is no point in using CUDA at all - you do not need to transpose the matrix in memory at all, you could just as well do that on the fly while writing the transposed matrix out to a file.

What likely eats most of the CPU time in your case is the realloc() called for each new row.

Paritosh · August 11, 2011, 2:24pm

But I need to do the transpose on the GPU for all the files simultaneously… I mean that my work requires me to use the GPU and transpose all the files in parallel in as little time as possible… Can you please suggest some ways to do that?

tera · August 11, 2011, 3:47pm

So is this a class assignment, or why do you need to do this on the GPU? There is no work for the GPU to do as the transpose can be done in zero cycles (just exchange the order of indices).

Paritosh · August 12, 2011, 2:23am

Isn’t GPU parallel processing the fastest? I need to do it as fast as possible in CUDA because I have to combine it with other projects done in CUDA on the GPU. The number of xls files to be transposed is also very large… What needs to be done? Please help me out…

Topic		Replies	Views
Multi-threading in host(CPU) CUDA Programming and Performance	2	828	August 26, 2011
Optimizing the transpose code CUDA Programming and Performance	0	2946	August 23, 2011
Working with array in CUDA CUDA Programming and Performance	9	1487	July 21, 2011
Question about tranpose CUDA Programming and Performance	19	7540	June 11, 2008
Doubling the speed of the SDK transpose CUDA Programming and Performance	16	6473	October 15, 2008
about __syncthreads() in SDK/project/transpose CUDA Programming and Performance	5	2806	September 18, 2009
Matrix multiplication CUDA CUDA Programming and Performance	7	3069	November 12, 2012
An Efficient Matrix Transpose in CUDA C/C++ Technical Blog	31	3106	October 30, 2020
Cuda Latency problems Slow Cuda CUDA Programming and Performance	15	14201	September 5, 2008
transpose demo: gpu vs cpu CUDA Programming and Performance	3	9487	August 9, 2007

Parallelism or Threading problem

Related topics