#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdlib.h>
#include <stdafx.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#define MAX_LEN 10000
/*
 * Out-of-place matrix transpose.
 *
 * Reads `idata` as a row-major matrix with `height` rows and `width` columns
 * and writes `odata` as a row-major matrix with `width` rows and `height`
 * columns, i.e. odata[x * height + y] = idata[y * width + x].
 *
 * odata  - device pointer, at least width * height floats, must not alias idata
 * idata  - device pointer, at least width * height floats
 * width  - number of columns of the input matrix
 * height - number of rows of the input matrix
 *
 * Launch layout: any 1D or 2D grid/block shape is valid. Both loops stride by
 * the total number of launched threads along their axis, so every element is
 * visited whether the grid is smaller or larger than the matrix, and no thread
 * touches memory outside [0, width * height). A launch of <<<width, 1>>>
 * makes each block copy exactly one input column. No shared memory is used.
 */
__global__ void kernel(float *odata, float *idata, int width, int height)
{
    // Nothing to do for an empty matrix; also keeps the size_t casts below safe.
    if (width <= 0 || height <= 0)
    {
        return;
    }

    // size_t arithmetic avoids int overflow in blockIdx * blockDim and in the
    // flattened element offsets for large matrices.
    const size_t cols = (size_t) width;
    const size_t rows = (size_t) height;
    const size_t xStart = (size_t) blockIdx.x * blockDim.x + threadIdx.x;
    const size_t yStart = (size_t) blockIdx.y * blockDim.y + threadIdx.y;
    const size_t xStride = (size_t) gridDim.x * blockDim.x;
    const size_t yStride = (size_t) gridDim.y * blockDim.y;

    for (size_t y = yStart; y < rows; y += yStride)
    {
        for (size_t x = xStart; x < cols; x += xStride)
        {
            odata[x * rows + y] = idata[y * cols + x];
        }
    }
}
FILE *streamread, *streamwrite;
/* Prints `message`, waits for a key press so a console window stays open,
 * and terminates the program with exit status 1. */
static void fatal(const char *message)
{
    printf("%s", message);
    getchar();
    exit(1);
}

/* Terminates the program with a readable message when a CUDA runtime call
 * did not return cudaSuccess. `what` names the failed operation. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        printf("CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        getchar();
        exit(1);
    }
}

/*
 * Reads a tab-separated table of numbers from `in` into one row-major array.
 *
 * Lines that are empty or contain no fields are skipped. Every remaining line
 * must have the same number of fields; otherwise the program terminates,
 * because a ragged table has no well-defined transpose.
 *
 * rowsOut / colsOut receive the table dimensions (both 0 for an empty file).
 * Returns a malloc'ed array of rows * cols floats that the caller must free,
 * or NULL when the file contains no data.
 */
static float *readTable(FILE *in, int *rowsOut, int *colsOut)
{
    char s[MAX_LEN + 1];
    /* Line terminators are delimiters too, so the last field of a line is
     * clean and a trailing tab does not create a bogus extra column. */
    const char delims[] = "\t\r\n";
    float *values = NULL;
    size_t count = 0, capacity = 0;
    int rows = 0, cols = 0;

    /* Loop on fgets itself: testing feof() first would process the buffer
     * once more after the final failed read. */
    while (fgets(s, MAX_LEN + 1, in) != NULL)
    {
        size_t len = strlen(s);

        /* A full buffer without a newline means the line continues in the
         * file; processing it in pieces would split a row (or a number). */
        if (len == MAX_LEN && s[len - 1] != '\n')
        {
            int next = fgetc(in);
            if (next != EOF && next != '\n')
            {
                fatal("A line of the input file is too long\n");
            }
        }
        if (len <= 1)
        {
            continue;
        }

        int rowCols = 0;
        for (char *result = strtok(s, delims); result != NULL; result = strtok(NULL, delims))
        {
            if (count == capacity)
            {
                /* Grow geometrically instead of reallocating per element. */
                size_t newCapacity = (capacity == 0) ? 1024 : capacity * 2;
                float *grown = (float *) realloc(values, newCapacity * sizeof(float));
                if (grown == NULL)
                {
                    fatal("Out of memory while reading the input file\n");
                }
                values = grown;
                capacity = newCapacity;
            }
            values[count++] = (float) atof(result);
            ++rowCols;
        }

        if (rowCols == 0)
        {
            continue;
        }
        if (rows == 0)
        {
            cols = rowCols;
        }
        else if (rowCols != cols)
        {
            fatal("The rows of the input file do not all have the same number of columns\n");
        }
        ++rows;
    }

    *rowsOut = rows;
    *colsOut = cols;
    return values;
}

/*
 * Transposes the row-major `height` x `width` host matrix `h_idata` into the
 * row-major `width` x `height` host matrix `h_odata` using the GPU.
 * Both host buffers must hold width * height floats; width and height must
 * be positive. All device memory allocated here is released before return.
 */
static void transposeOnGpu(const float *h_idata, float *h_odata, int width, int height)
{
    /* size_t, not float: a float byte count loses precision above 2^24. */
    const size_t mem_size = (size_t) width * (size_t) height * sizeof(float);
    float *d_idata = NULL, *d_odata = NULL;

    checkCuda(cudaMalloc((void **) &d_idata, mem_size), "cudaMalloc (input)");
    checkCuda(cudaMalloc((void **) &d_odata, mem_size), "cudaMalloc (output)");
    checkCuda(cudaMemcpy(d_idata, h_idata, mem_size, cudaMemcpyHostToDevice),
              "host-to-device copy");

    /* One single-thread block per input column: the kernel derives the column
     * from blockIdx.x * blockDim.x + threadIdx.x, so exactly `width` threads
     * along x are launched. */
    kernel<<<width, 1>>>(d_odata, d_idata, width, height);
    checkCuda(cudaGetLastError(), "kernel launch");

    /* This blocking copy waits for the kernel to finish and also reports any
     * error raised while it was running. */
    checkCuda(cudaMemcpy(h_odata, d_odata, mem_size, cudaMemcpyDeviceToHost),
              "device-to-host copy");

    checkCuda(cudaFree(d_idata), "cudaFree (input)");
    checkCuda(cudaFree(d_odata), "cudaFree (output)");
}

/*
 * Transposes the tab-separated tables "raw data 0.xls" .. "raw data 17.xls"
 * one after another, writing each result to the matching "t_raw data N.xls".
 * Start and end times are printed; the program waits for a key press before
 * returning. Any I/O, memory or CUDA failure terminates with exit status 1.
 */
int main()
{
    time_t mytime = time(NULL);
    printf("Initial time: %s", ctime(&mytime));

    const char *arrayInFiles[] = {
        "raw data 0.xls", "raw data 1.xls", "raw data 2.xls", "raw data 3.xls", "raw data 4.xls", "raw data 5.xls",
        "raw data 6.xls", "raw data 7.xls", "raw data 8.xls", "raw data 9.xls", "raw data 10.xls", "raw data 11.xls",
        "raw data 12.xls", "raw data 13.xls", "raw data 14.xls", "raw data 15.xls", "raw data 16.xls", "raw data 17.xls"
    };
    const char *arrayOutFiles[] = {
        "t_raw data 0.xls", "t_raw data 1.xls", "t_raw data 2.xls", "t_raw data 3.xls", "t_raw data 4.xls", "t_raw data 5.xls",
        "t_raw data 6.xls", "t_raw data 7.xls", "t_raw data 8.xls", "t_raw data 9.xls", "t_raw data 10.xls", "t_raw data 11.xls",
        "t_raw data 12.xls", "t_raw data 13.xls", "t_raw data 14.xls", "t_raw data 15.xls", "t_raw data 16.xls", "t_raw data 17.xls"
    };
    int numElements = sizeof(arrayInFiles) / sizeof(arrayInFiles[0]);

    for (int i = 0; i < numElements; i++)
    {
        if (fopen_s(&streamread, arrayInFiles[i], "r") != 0)
        {
            fatal("The file was not opened\n");
        }
        if (fopen_s(&streamwrite, arrayOutFiles[i], "w") != 0)
        {
            fatal("The file was not opened for writing\n");
        }

        int numrows = 0, numcols = 0;
        float *queryPoint = readTable(streamread, &numrows, &numcols);
        fclose(streamread);

        /* An empty input yields an empty output file; skipping the GPU step
         * also avoids launching a zero-sized grid. */
        if (numrows > 0)
        {
            const size_t numelems = (size_t) numrows * (size_t) numcols;
            float *h_odata = (float *) malloc(numelems * sizeof(float));
            if (h_odata == NULL)
            {
                fatal("Out of memory while allocating the output buffer\n");
            }

            transposeOnGpu(queryPoint, h_odata, numcols, numrows);

            /* The transposed table has `numrows` values per output line. */
            for (size_t j = 0; j < numelems; ++j)
            {
                fprintf(streamwrite, "%.10f%c", h_odata[j],
                        ((j + 1) % (size_t) numrows == 0) ? '\n' : '\t');
            }
            free(h_odata);
        }

        fclose(streamwrite);
        free(queryPoint);
    }

    printf("Transposed successfully.\n");
    time_t mytime1 = time(NULL);
    printf("Final time: %s", ctime(&mytime1));
    getchar();
    return 0;
}
This is my code to transpose multiple xls files using CUDA, but the time taken for compilation is very long, as this code does not make use of parallelism. It transposes the raw data files one after another. Can you please tell me how to transpose all the files simultaneously? I have not been able to make use of the GPU's multi-threading capabilities. Please help me out…