Optimizing the transpose code

//#include <conio.h>

#include <cuda_runtime.h>

#include <device_launch_parameters.h>

#include <io.h>

#include <process.h>

//#include <stdafx.h>

#include <stdio.h>

//#include <stdlib.h>

//#include <string.h>

#include <windows.h>

#define MAX_LEN 3200 //Size, in chars, of the buffer that holds one row (line) of an input file at a time

#define MAX_ELEMS 100000 //Number of elements allocated for the array that holds all values of one file serially, one file at a time

#define THREADCOUNT 8 //Maximum number of worker threads running at a time

#define TOTALFILES 200 //Total number of files to process

#define REAL float //Element type used throughout. If the precision of float isn't sufficient, use double; double consumes twice as much memory as float

///Holds the name of one input file. main() keeps an array of TOTALFILES of these.

struct fileList 

{

	char* filename; //NUL-terminated file name; main() fills it in and hands it to a worker thread

}; 

///Device code. Transposes the given matrix.
///
///idata is read as a row-major matrix of `height` rows by `width` columns and
///odata is written as its row-major transpose (`width` rows by `height` columns):
///    odata[x * height + y] = idata[y * width + x]
///
///Launch layout: any 1D or 2D grid/block shape is accepted. Threads stride over
///columns (x) and rows (y) by the total number of threads launched in each
///dimension, so every element is covered exactly once per (x, y) thread and no
///thread ever indexes outside the matrix. With the historical launch
///`kernel<<<width, 1>>>` each block handles exactly one input column, as before.
///
///Preconditions: odata and idata each hold width * height elements and must not
///overlap (this is an out-of-place transpose). No shared memory is used.
__global__ void kernel(REAL *odata, REAL *idata, unsigned int width, unsigned int height)
{
	//size_t arithmetic keeps blockIdx * blockDim and row * width from wrapping at 32 bits
	const size_t strideX = (size_t) gridDim.x * blockDim.x;
	const size_t strideY = (size_t) gridDim.y * blockDim.y;
	const size_t firstX = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
	const size_t firstY = (size_t) blockIdx.y * blockDim.y + threadIdx.y;

	//The loop conditions double as the bounds guard: surplus threads simply do nothing
	for (size_t x = firstX; x < width; x += strideX)
		for (size_t y = firstY; y < height; y += strideY)
			odata[x * height + y] = idata[y * width + x];
}

///Reads a tab-delimited text file of numbers into elems, row after row.
///
///in_filename : path of the file to read.
///elems       : destination array with room for `capacity` elements.
///numelems    : receives the total number of elements read.
///numrows     : receives the number of non-empty rows read.
///Returns true on success. Returns false, after printing the reason, if the file
///cannot be opened or holds more than `capacity` elements.
static bool loadMatrix(const char *in_filename, REAL *elems, unsigned int capacity, unsigned int *numelems, unsigned int *numrows)
{
	FILE *streamread = NULL; //Stream for input file
	errno_t openerrcode = fopen_s( &streamread, in_filename, "r" ); //Assigned on its own so the real error code is kept
	if( openerrcode != 0 )
	{
		printf( "The file %s was not opened for reading. Error code: %d.\n", in_filename, openerrcode );
		return false;
	}

	char s[MAX_LEN]; //Holds one row of the file at a time
	char delims[] = "\t"; //File is tab-delimited. Change to comma or pipe accordingly if file-format changes
	bool fits = true; //Cleared as soon as the file turns out to hold more than capacity elements

	*numelems = 0;
	*numrows = 0;

	//fgets() returning NULL is the reliable end-of-input test; feof() only turns true after a read has already failed
	while( fits && fgets(s, MAX_LEN, streamread) != NULL )
	{
		if (strlen(s) <= 1) continue; //Bypass empty rows

		for (char *result = strtok( s, delims ); result != NULL; result = strtok( NULL, delims ))
		{
			if (*numelems >= capacity) //Never write past the end of elems
			{
				fits = false;
				break;
			}
			elems[(*numelems)++] = (REAL) atof(result); //Element is converted to numeric format and stored in array
		}

		++(*numrows);
	}

	fclose( streamread );

	if( !fits )
		printf( "The file %s holds more than %u elements and was not transposed.\n", in_filename, capacity );

	return fits;
}

///Transposes a row-major matrix of `height` rows by `width` columns on the GPU.
///
///h_idata : host input, width * height elements.
///h_odata : host output, width * height elements, receives the transpose.
///Returns true on success. On the first CUDA error the remaining steps are
///skipped, the error is printed, and false is returned. Device buffers are
///released on every path.
static bool transposeOnDevice(const REAL *h_idata, REAL *h_odata, unsigned int width, unsigned int height)
{
	const size_t mem_size = (size_t) width * height * sizeof(REAL); //Byte count, so it must be an integer type
	REAL *d_idata = NULL; //Holds the entire matrix content in device
	REAL *d_odata = NULL; //Holds the resulting transposed matrix content in device

	cudaError_t err = cudaMalloc( (void **) &d_idata, mem_size );

	if( err == cudaSuccess )
		err = cudaMalloc( (void **) &d_odata, mem_size );

	if( err == cudaSuccess )
		err = cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice);

	if( err == cudaSuccess )
	{
		//One single-thread block per input column: the launch shape this kernel has always been used with
		kernel<<<width, 1>>>(d_odata, d_idata, width, height);
		err = cudaGetLastError(); //The launch is asynchronous; this only catches launch-time errors
	}

	if( err == cudaSuccess )
		err = cudaDeviceSynchronize(); //Catches errors raised while the kernel runs

	if( err == cudaSuccess )
		err = cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost);

	if( err != cudaSuccess )
		printf( "Error: %s\n", cudaGetErrorString(err) );

	//cudaFree() ignores null pointers, so this is safe even when an allocation failed
	cudaFree( d_idata );
	cudaFree( d_odata );

	return err == cudaSuccess;
}

///Writes numelems values to a tab-delimited text file, numcols values per line.
///
///Returns true on success. Returns false, after printing the reason, if the file
///cannot be opened or the final flush fails.
static bool storeMatrix(const char *out_filename, const REAL *h_odata, unsigned int numelems, unsigned int numcols)
{
	FILE *streamwrite = NULL; //Stream for output file
	errno_t openerrcode = fopen_s( &streamwrite, out_filename, "w" );
	if( openerrcode != 0 )
	{
		printf( "The file %s was not opened for writing. Error code: %d.\n", out_filename, openerrcode );
		return false;
	}

	for (unsigned int i = 0; i < numelems; ++i)
	{
		//Every numcols-th value ends its line; all others are followed by a tab
		const char separator = ((i + 1) % numcols == 0) ? '\n' : '\t';
		fprintf(streamwrite, "%.10f%c", h_odata[i], separator);
	}

	if( fclose( streamwrite ) != 0 ) //Buffered data is flushed here, so a failure means the output is incomplete
	{
		printf( "The file %s could not be written completely.\n", out_filename );
		return false;
	}

	return true;
}

///The thread function. A total of THREADCOUNT (or less) instances are executed at a time.
///
///filename : NUL-terminated name of the input file (passed as void* by _beginthreadex).
///           The transposed matrix is written to "Trns-<filename>".
///Returns 1 when the file was transposed and written, 0 when the file does not
///exist or any step failed. All host buffers are released on every path.
unsigned __stdcall ThreadFunc( void *filename  )  
{ 	
	char *in_filename = (char *) filename; //Holds the input file. Format is "Raw data <number>.xls"

	///Checks if the file exists. If it doesn't, thread returns.
	if(_access(in_filename, 0) == -1)
		return 0;

	char out_filename[MAX_PATH + 8]; //Room for any input name plus the "Trns-" prefix
	sprintf_s(out_filename, "%s%s", "Trns-",  in_filename);	//Holds the output file. Format is "Trns-Raw data <number>.xls"

	unsigned int numelems = 0; //Total number of elements in the file
	unsigned int numrows = 0; //Total number of rows in the file
	unsigned status = 0; //Thread exit code; set to 1 only when every step succeeded

	REAL *allElems = (REAL *) malloc(sizeof(REAL) * MAX_ELEMS); //Holds all elements of the file, one file at a time
	REAL *h_odata = NULL; //Holds the transposed result as received from device

	if( allElems == NULL )
		printf( "Out of memory while processing the file %s.\n", in_filename );
	else if( !loadMatrix(in_filename, allElems, MAX_ELEMS, &numelems, &numrows) )
		; //loadMatrix() has already reported the problem
	else if( numrows == 0 || numelems == 0 || numelems % numrows != 0 )
		//Guards the division below and rejects rows of unequal length
		printf( "The file %s does not hold a non-empty rectangular matrix (%u elements in %u rows).\n", in_filename, numelems, numrows );
	else if( (h_odata = (REAL *) malloc((size_t) numelems * sizeof(REAL))) == NULL )
		printf( "Out of memory while processing the file %s.\n", in_filename );
	else if( transposeOnDevice(allElems, h_odata, numelems / numrows, numrows)
		&& storeMatrix(out_filename, h_odata, numelems, numrows) ) //The transpose has numrows values per line
		status = 1;

	//free() ignores null pointers, so a single cleanup path serves every outcome
	free( h_odata );
	free( allElems );

	return status;
}

///Program entry point.
///
///Builds the names "Raw data 0.xls" .. "Raw data <TOTALFILES-1>.xls" and runs
///ThreadFunc on each of them, at most THREADCOUNT files at a time. Each batch of
///threads is waited for before the next batch starts. Prints the start time,
///the end time and the elapsed seconds, then waits for a key press.
///Returns 0 normally; exits with code 1 if a thread cannot be created.
///
///THREADCOUNT must not exceed MAXIMUM_WAIT_OBJECTS, the limit of WaitForMultipleObjects.
int main( void ) 
{ 	
	unsigned int fileID = 0; //Holds the file's number as "Raw data <fileID>.xls"
	unsigned int dwThreadId; //Holds the thread id as returned by call to _beginthreadex()
	char filenames[TOTALFILES][30]; //Storage for every filename; no heap allocation, so nothing to release
	struct fileList ap[TOTALFILES]; //Holds name of all files that need to be transposed
	HANDLE hThread[THREADCOUNT]; //Handles of the threads of the current batch
	unsigned int threadCnt; //Holds the number of threads to spawn in the current batch
	unsigned int started; //Holds the number of threads actually created in the current batch
	unsigned int i; //Counter used for loops
	DWORD dwRet; //Holds the result of waiting for the current batch
	time_t start; //Record start time
	time_t end; //Record end time

	for (i = 0; i < TOTALFILES; ++i)
	{	
		ap[i].filename = filenames[i];
		sprintf_s(ap[i].filename, sizeof(filenames[i]), "%s%u%s", "Raw data ", i, ".xls"); //Format is "Raw data <number>.xls"
	}

	time (&start);
	printf("Initial time: %s", ctime(&start));	
	printf("Transposing file(s) started...\n");	

	///Begin transposing operations on multiple threads.
	while ( TOTALFILES > fileID )
	{		
		threadCnt = (THREADCOUNT < (TOTALFILES - fileID)) ? THREADCOUNT: (TOTALFILES - fileID);

		for (started = 0; started < threadCnt; ++started)
		{
			hThread[started] = (HANDLE) _beginthreadex(NULL, 0, &ThreadFunc, ap[fileID].filename, 0, &dwThreadId); //Creates a new executing thread
			if (hThread[started] == NULL) 
				break; //Handled below, after the threads already running have finished
			++fileID;
		}				

		if (started > 0)
		{
			///Waits until all started threads of this batch complete execution and signal.
			dwRet = WaitForMultipleObjects(started, hThread, TRUE, INFINITE);
			if (dwRet == WAIT_FAILED)
				printf("Waiting for the worker threads failed. Error code: %lu.\n", GetLastError());

			///Close all handles to threads.
			for(i = 0; i < started; ++i)
				CloseHandle(hThread[i]);
		}

		if (started < threadCnt) //A thread could not be created; stop only after the running ones were waited for
		{printf( "CreateThread failed." ); getchar(); exit(1);}
	}

	///Display completion time.
	time (&end);
	printf("Final time: %s", ctime(&end));	
	printf("Time taken: %.2lf seconds\n", difftime(end, start));

	getchar();
	return 0;
}

Here is my code, which reads multiple .xls files on the CPU, passes their contents to the GPU to be transposed, and returns the data to the CPU to be written to separate output files. The CPU side was taking too much time, so I used multi-threading. Can this code be optimized further to reduce execution time? (Speed is the priority here, rather than memory use at run time.) For instance, could I pass a single array to the GPU and do an in-place transpose, rather than passing two arrays for input and output? That would save memory, and I think it would also reduce the execution time. How can I optimize this code to speed up the total run time? Please help.