Hi, I decided to go back to basics and write a simple program to find the mean of a vector of numbers. I figured this would be a good way to learn. I’m finding that my GPU version of the code runs ~3x slower than the host version, primarily due to the cost of copying data from the host to the device. I’m not sure if there is anyway to get around this, or whether I am actually using cudaMallocPitch() correctly.
My algorithm is to divide the data into DATA_PER_THREAD sized chunks, and find the average of those chunks in parallel across a set of THREADS_PER_BLOCK threads running in a block. The average of each thread’s data is written to shared memory. Then each block writes the average of this temporary shared memory result to a global block_avg array. The host program then takes the average of the block_avg array to find the average of the entire data set.
Any pointers as to how I go about optimizing the following code would be appreciated.
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <cutil.h>
#include <time.h>
#define DATA_PER_THREAD 10000
#define THREADS_PER_BLOCK 64
__device__ float *d_data;
__device__ float *block_avg;
__global__ void cudaProcess(float *data, float *block_avg, size_t pitch)
{
extern __shared__ float sdata[];
int i;
float *dp;
float avg;
dp = (float *) ((char *) data + (blockIdx.x * THREADS_PER_BLOCK + threadIdx.x) * pitch);
avg = 0.0f;
for (i=0; i<DATA_PER_THREAD; i++)
{
avg += *dp;
dp++;
}
avg /= (float) DATA_PER_THREAD;
sdata[threadIdx.x] = avg;
__syncthreads();
if (threadIdx.x == 0)
{
// The block average now needs to be stored globally
// Only one thread needs to do this (idx = 0)
avg = 0.0f;
for (i=0; i<THREADS_PER_BLOCK; i++) avg += sdata[i];
avg /= (float) THREADS_PER_BLOCK;
block_avg[blockIdx.x] = avg;
}
}
float process (float *data, unsigned int N)
{
int blocks;
dim3 block(THREADS_PER_BLOCK, 1, 1);
dim3 grid;
int sbytes;
float *host_block_avg;
unsigned int i, j;
float avg;
size_t pitch;
blocks = N / (THREADS_PER_BLOCK * DATA_PER_THREAD); // Fails if N is not a multiple!
grid.x = blocks;
grid.y = 1;
grid.z = 1;
// In shared memory we store each thread's average
sbytes = sizeof(float) * THREADS_PER_BLOCK;
// In global memory we store each block's average
cudaMalloc((void **) &block_avg, sizeof(float) * blocks);
host_block_avg = (float *) malloc (sizeof(float) * blocks);
// Allocate & copy the data over to the device
// Using MallocPitch to facilitate coalesced memory access
cudaMallocPitch((void **) &d_data, &pitch, sizeof(float) * DATA_PER_THREAD, blocks * THREADS_PER_BLOCK);
for (j=0; j<blocks; j++)
for (i=0; i<THREADS_PER_BLOCK; i++)
{
void *d_vp;
d_vp = (void *) ((char *) d_data + (j*THREADS_PER_BLOCK + i) * pitch);
cudaMemcpy(d_vp,
&data[(j * THREADS_PER_BLOCK * DATA_PER_THREAD) + (i * DATA_PER_THREAD)],
sizeof(float) * DATA_PER_THREAD, cudaMemcpyHostToDevice);
}
cudaProcess <<<grid, block, sbytes>>> (d_data, block_avg, pitch);
// Copy the results back to the host
cudaMemcpy(host_block_avg, block_avg, sizeof(float) * blocks, cudaMemcpyDeviceToHost);
// Summarize the results
avg = 0.0f;
for (i=0; i<blocks; i++) avg += host_block_avg[i];
avg /= (float) blocks;
cudaFree(block_avg);
cudaFree(d_data);
return (avg);
}
int main(int argc, char **argv)
{
unsigned int N = 6400000;
unsigned int i,j;
int T = 100;
time_t ts, te;
float *h_data;
float avg;
h_data = (float *) malloc (sizeof(float) * N);
for (i=0; i<N; i++) h_data[i] = i;
// Do it on the host T times
ts = time(NULL);
for (j=0; j<T; j++)
{
avg = 0.0f;
for (i=0; i<N; i++) avg += h_data[i];
avg /= (float) N;
}
te = time(NULL);
printf ("%d items, average = %6.4f\n", N, avg);
printf ("Host time : %d sec\n", te - ts);
ts = time(NULL);
for (j=0; j<T; j++)
{
avg = process (h_data, N);
}
te = time(NULL);
printf ("CUDA result = %6.4f\n", avg);
printf ("GPU time : %d sec\n", te - ts);
return (0);
}