Multi-threading on the host (CPU)

#include "cuda_runtime.h"

#include "device_launch_parameters.h"

#include <stdafx.h>

#include <stdio.h>

#include <stdlib.h>

#define MAX_LEN 5000 //Size of memory to hold one row in one file at a time.

#define MAX_ELEMS 300000 //Size of memory allocated to the array that holds entire rows and columns serially, one file at a time

#define REAL float //Setting the type here. Precision of float wasn't sufficient so double was used. Consumed twice memory then float

FILE *streamlog; //Stream for log file

FILE *streamread; //Stream for input file

FILE *streamwrite; //Stream for output file

/**
 * Copies elements of idata into odata in transposed order.
 *
 * Each thread derives (xIndex, yIndex) from its block/thread coordinates and
 * then copies `height` elements: it reads idata with a stride of `width`
 * starting at xIndex + width * yIndex, and writes them to consecutive
 * positions of odata starting at yIndex + height * xIndex. For a thread with
 * yIndex == 0 this copies column xIndex of a row-major matrix with `width`
 * columns and `height` rows into row xIndex of the output.
 *
 * NOTE(review): the launch configuration is not visible from here. The loop
 * over `height` only stays inside a width * height buffer when yIndex == 0,
 * i.e. when the grid/block are one-dimensional in x -- confirm at the launch
 * site.
 *
 * @param odata   output buffer; presumably device memory holding at least
 *                width * height elements -- verify against caller
 * @param idata   input buffer; presumably device memory holding at least
 *                width * height elements -- verify against caller
 * @param width   row stride (number of columns) of the input
 * @param height  number of elements each thread copies (rows of the input)
 */
__global__ void kernel(REAL *odata, REAL *idata, unsigned int width, unsigned int height)
{
	unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;

	unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;

	// Threads beyond the last column have nothing to copy; without this guard
	// they would write at or past offset width * height in odata whenever the
	// grid is larger than the matrix width.
	if (xIndex >= width)
	{
		return;
	}

	unsigned int index_in = xIndex + width * yIndex;

	unsigned int index_out = yIndex + height * xIndex;

	// Walk down the input column (stride = width) and lay the values out
	// contiguously in the output.
	for (unsigned int i = 0; i < height; i += 1)
	{
		odata[index_out + i] = idata[index_in + i * width];
	}
}


int main(void)


	char log_filename[30] = "Logging.txt"; //Holds log filename

	char in_filename[30] = ""; //Holds input filename

	char out_filename[30] = ""; //Holds output filename	

	errno_t openerrcode; //Error code returned for errors occuring during file opening

	unsigned int totalfiles = 560; //Enter total number of files to transpose	

	bool reading_failedflag = false; //Flag to remember if any file reading caused error

	bool writing_failedflag = false; //Flag to remember if any file writing caused error

	bool global_failedflag = false; //Flag to remember if any error occured during whole program execution

	unsigned int filecount = 0; //Total count of files successfully processed

	///Opens a log stream to log all messages into log file for later review.

	if( openerrcode = fopen_s( &streamlog, log_filename, "w" ) != 0 )


		printf( "The log file %s was not opened for writing. Error code: %d\n", log_filename, openerrcode );		


		exit(1); //Exits the program


	///Initiate time-monitored transpose process.

	time_t start, end;

	time (&start);

	printf("Initial time: %s", ctime(&start));

	fprintf( streamlog, "Initial time: %s", ctime(&start));

	printf("Transposing file(s) started...\n");

	fprintf( streamlog, "Transposing file(s) started...\n");

	///Loop through each input files.

	for (unsigned int i = 0; i < totalfiles; ++i)


		char s[MAX_LEN]; //Holds one row at a time

		unsigned int numelems = 0; //Counter to hold the total number of elements in a file

		unsigned int numrows = 0; //Counter to hold the total number of rows in a file

		char *str; //This array will take command of the row read into s[MAX_LEN] to eventually tokenize it

		char delims[] = "\t"; //File is tab-delimited. Change to comma or pipe accordingly if file-format changes

		char *result; //The tokenized result (single elment) as broken down from str will be hold here

		REAL *allElems = (REAL *) malloc(sizeof(REAL *) * MAX_ELEMS); //Holds all elements of the file, one file at a time

		sprintf_s(in_filename, "%s%d%s", "raw data ", i, ".xls"); //Holds the input file. Format is "raw data <number>.xls"

		sprintf_s(out_filename, "%s%s", "trns-", in_filename);	//Holds the output file. Format is "trns-raw data <number>.xls"

		///Opens reading stream (for input file) and writing stream (for output file).

		if( openerrcode = fopen_s( &streamread, in_filename, "r" ) != 0 )


			fprintf( streamlog, "The file %s was not opened for reading. Error code: %d.\n", in_filename, openerrcode );

			reading_failedflag = true; //Recording error

			global_failedflag = true; //Recording error

			continue; //Continues to next file


		if( openerrcode = fopen_s( &streamwrite, out_filename, "w" ) != 0 )


			fprintf( streamlog, "The file %s was not opened for writing. Error code: %d.\n", out_filename, openerrcode );

			writing_failedflag = true; //Recording error

			global_failedflag = true; //Recording error

			continue; //Continues to next file


		///Reads till the end of the input file/stream.

		while( !feof( streamread ) )


			fgets(s, MAX_LEN, streamread); //Read a row from file

			if (strlen(s) <= 1) continue; //Bypass empty rows

			str = s; //Take command of the row, for tokenizing

			result = strtok( str, delims ); //First token (element) of the row obtained in result

			///Reads each element and stores all data sequentially into an array (single column, multiple rows).

			while( result != NULL ) //Loop until whole row is read


				allElems[numelems] = (REAL) atof(result); //Element is converted to numeric format and stored in array

				result = strtok( NULL, delims ); //Obtains second token onwards

				numelems += 1; //Recording total number of elements in the file


			numrows += 1; //Recording total number of rows in the file

			s[0] = '

Here’s my code to transpose xls files from a directory using CUDA on the GPU. However, it is taking much more time than it should. I found out that this may be due to a limitation of the CPU rather than the GPU: the CPU takes a long time to read the data from each xls file and then send it to the GPU. Can you suggest some ways in which I can reduce the execution time, by using CPU multi-threading (reading multiple files simultaneously and sending them to the GPU) or in any other way? Please help me out…

Your kernel is unnecessarily sequential. You probably don’t even need a for loop there.
You’re also launching a large number of blocks of 1 thread each.
Something tells me you need a better understanding of CUDA first.

I am new to CUDA, so I am not able to optimize the code. Can you please tell me how many threads I should define to make it run faster? Please help…