Parallelism or Threading problem

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdlib.h>

#include <stdafx.h>

#include <stdio.h>

#define MAX_LEN 10000

/*
 * Out-of-place transpose of a row-major matrix.
 *
 *   odata  - device buffer receiving the transposed matrix (width rows of
 *            height elements); must not overlap idata.
 *   idata  - device buffer holding the source matrix (height rows of
 *            width elements).
 *   width  - number of columns in idata.
 *   height - number of rows in idata.
 *
 * Element (row, col) of idata is written to element (col, row) of odata.
 *
 * Launch layout: any 1D or 2D grid/block shape is valid. The x dimension is
 * spread over columns and the y dimension over rows, and each thread strides
 * by the total number of threads launched in that dimension. Threads whose
 * starting index lies outside the matrix therefore do nothing instead of
 * touching memory out of bounds, and a launch with fewer threads than
 * elements still covers the whole matrix. With a 1D launch every thread
 * copies complete columns, one row at a time.
 */
__global__ void kernel(float *odata, float *idata, int width, int height)
{
	int colStart  = blockIdx.x*blockDim.x + threadIdx.x;
	int colStride = gridDim.x*blockDim.x;
	int rowStart  = blockIdx.y*blockDim.y + threadIdx.y;
	int rowStride = gridDim.y*blockDim.y;

	for (int col = colStart; col < width; col += colStride)
	{
		for (int row = rowStart; row < height; row += rowStride)
		{
			// size_t arithmetic keeps the flat index from overflowing int
			// when width * height is large.
			size_t index_in  = (size_t) col + (size_t) width  * row;
			size_t index_out = (size_t) row + (size_t) height * col;

			odata[index_out] = idata[index_in];
		}
	}
}

// File handles shared with main(): streamread is the input file currently
// being parsed, streamwrite the output file currently being written. Both are
// opened and closed once per input/output file pair.
FILE *streamread, *streamwrite;

/* Prints a message and terminates the program when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		printf( "CUDA error during %s: %s\n", what, cudaGetErrorString(status) );
		getchar();
		exit(1);
	}
}

/*
 * Reads every tab-separated value from `in` into one flat, row-major array.
 *
 *   in       - text stream positioned at the start of the data.
 *   numelems - receives the total number of values parsed.
 *   numrows  - receives the number of lines that were longer than one
 *              character (shorter lines are skipped).
 *
 * Returns a malloc()ed buffer the caller must free(), or NULL when no value
 * was parsed. Lines are read in pieces of at most MAX_LEN characters, so a
 * longer line is seen as several rows. The program exits if memory cannot be
 * obtained.
 */
static float *readTabSeparatedFloats(FILE *in, int *numelems, int *numrows)
{
	char s[MAX_LEN + 1];
	char delims[] = "\t";
	float *values = NULL;
	size_t capacity = 0;

	*numelems = 0;
	*numrows = 0;

	// Loop on the result of fgets() itself: feof() only turns true after a
	// read has already failed, which would leave `s` stale or uninitialised.
	while( fgets(s, MAX_LEN + 1, in) != NULL )
	{
		if (strlen(s) <= 1) continue;

		for (char *result = strtok( s, delims ); result != NULL; result = strtok( NULL, delims ))
		{
			if ((size_t) *numelems == capacity)
			{
				// Grow geometrically so the number of realloc() calls stays
				// logarithmic in the number of values.
				size_t newCapacity = (capacity == 0) ? MAX_LEN : capacity * 2;
				float *grown = (float *) realloc(values, sizeof(float) * newCapacity);

				if (grown == NULL)
				{
					printf( "Out of memory while reading the input file\n" );
					getchar();
					exit(1);
				}

				values = grown;
				capacity = newCapacity;
			}

			values[*numelems] = (float) atof(result);
			*numelems += 1;
		}

		*numrows += 1;
	}

	return values;
}

/*
 * Transposes each "raw data N.xls" file (tab-separated text) on the GPU and
 * writes the result to the matching "t_raw data N.xls" file. The files are
 * processed one after another; the start and end wall-clock times are printed.
 *
 * Returns 0 on success. Any file, memory or CUDA failure prints a message,
 * waits for a key press and exits with status 1.
 */
int main()
{
	time_t mytime;

	mytime = time(NULL);
	printf("Initial time: %s", ctime(&mytime));

	const char *arrayInFiles[] = {
		"raw data 0.xls", "raw data 1.xls", "raw data 2.xls", "raw data 3.xls", "raw data 4.xls", "raw data 5.xls",
		"raw data 6.xls", "raw data 7.xls", "raw data 8.xls", "raw data 9.xls", "raw data 10.xls", "raw data 11.xls",
		"raw data 12.xls", "raw data 13.xls", "raw data 14.xls", "raw data 15.xls", "raw data 16.xls", "raw data 17.xls"
	};

	const char *arrayOutFiles[] = {
		"t_raw data 0.xls", "t_raw data 1.xls", "t_raw data 2.xls", "t_raw data 3.xls", "t_raw data 4.xls", "t_raw data 5.xls",
		"t_raw data 6.xls", "t_raw data 7.xls", "t_raw data 8.xls", "t_raw data 9.xls", "t_raw data 10.xls", "t_raw data 11.xls",
		"t_raw data 12.xls", "t_raw data 13.xls", "t_raw data 14.xls", "t_raw data 15.xls", "t_raw data 16.xls", "t_raw data 17.xls"
	};

	int numElements = sizeof(arrayInFiles) / sizeof(arrayInFiles[0]);

	for(int i = 0; i <numElements; i++)
	{
		int numelems = 0, numrows = 0;

		if( fopen_s( &streamread, arrayInFiles[i], "r" ) != 0 )
		{
			printf( "The file was not opened\n" );
			getchar();
			exit(1);
		}

		if( fopen_s( &streamwrite, arrayOutFiles[i], "w" ) != 0 )
		{
			printf( "The file was not opened for writing\n" );
			getchar();
			exit(1);
		}

		float *queryPoint = readTabSeparatedFloats(streamread, &numelems, &numrows);

		// numelems > 0 implies numrows > 0, so the divisions below are safe.
		// An input without any values simply yields an empty output file.
		if (numelems > 0)
		{
			if (numelems % numrows != 0)
			{
				// Rows of different lengths cannot be transposed as a matrix.
				printf( "The file %s does not contain a rectangular matrix\n", arrayInFiles[i] );
				getchar();
				exit(1);
			}

			int numcols = numelems / numrows;
			size_t mem_size = (size_t) numelems * sizeof(float);

			float *h_odata = (float *) malloc(mem_size);
			if (h_odata == NULL)
			{
				printf( "Out of memory while allocating the output buffer\n" );
				getchar();
				exit(1);
			}

			float *d_idata = NULL, *d_odata = NULL;

			checkCuda( cudaMalloc( (void**) &d_idata, mem_size), "cudaMalloc (input)" );
			checkCuda( cudaMalloc( (void**) &d_odata, mem_size), "cudaMalloc (output)" );
			checkCuda( cudaMemcpy(d_idata, queryPoint, mem_size, cudaMemcpyHostToDevice), "host-to-device copy" );

			// Launch exactly one thread per column so that no thread index can
			// fall outside the matrix, regardless of whether the kernel guards
			// its indices.
			kernel<<<numcols, 1>>>(d_odata, d_idata, numcols, numrows);
			checkCuda( cudaGetLastError(), "kernel launch" );

			// The blocking copy also waits for the kernel and reports any
			// error raised while it was running.
			checkCuda( cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost), "device-to-host copy" );

			// Each output line holds numrows values: one column of the input.
			for (int j = 0; j < numelems; ++j)
			{
				char separator = ((j + 1) % numrows == 0) ? '\n' : '\t';

				fprintf(streamwrite, "%.10f%c", h_odata[j], separator);
			}

			checkCuda( cudaFree(d_idata), "cudaFree (input)" );
			checkCuda( cudaFree(d_odata), "cudaFree (output)" );
			free(h_odata);
		}

		fclose( streamread );
		fclose( streamwrite );
		free(queryPoint);
	}

	printf("Transposed successfully.\n");

	time_t mytime1;

	mytime1 = time(NULL);
	printf("Final time: %s", ctime(&mytime1));

	getchar();

	return 0;
}

This is my code to transpose multiple xls files using CUDA, but the time taken for compilation is very long, as this code does not make use of parallelism. It transposes the raw data files one after another. Can you please tell me how to transpose all the files simultaneously? I have not been able to make use of the multi-threading capabilities of the GPU. Please help me out.

See the SDK example, which comes with an instructive whitepaper, for advice on how to efficiently transpose a matrix in CUDA. However, in your case there is no point in using CUDA at all - you do not need to transpose the matrix in memory at all, you could just as well do that on the fly while writing the transposed matrix out to a file.

What likely eats most of the CPU time in your case is the realloc() called for each new row.

But I need to do the transpose on the GPU for all the files simultaneously. I mean that my work requires me to use the GPU and to transpose all the files in parallel in as little time as possible. Can you suggest some ways to do that, please?

So is this a class assignment, or why do you need to do this on the GPU? There is no work for the GPU to do as the transpose can be done in zero cycles (just exchange the order of indices).

Isn’t GPU parallel processing the fastest? I need it to be as fast as possible in CUDA because I have to integrate it with other projects written in CUDA that run on the GPU. The number of xls files to be transposed is also very large. What needs to be done? Please help me out.