help with first cuda program

In short I copied a program written by someone else (from here: http://llpanorama.wordpress.com/2008/05/21…-cuda-program/) and added in a bit where the cpu carries out the same operation. The CPU runs much faster, and I’m not sure why? here is the code:

[codebox] // Kernel that executes on the CUDA device

global void square_array(float *a, int N)

{

int idx = blockIdx.x * blockDim.x + threadIdx.x;

if (idx<N) a[idx] = a[idx] * a[idx] ;

__syncthreads();

}

// main routine that executes on the host

int main(void)

{

float *a_h, *a_d; // Pointer to host & device arrays

float *a,*b;

unsigned int timer = 0;

cutCreateTimer( &timer );

unsigned int timercpu = 0;

cutCreateTimer( &timercpu );

const int N = 400; // Number of elements in arrays

size_t size = N * sizeof(float);

a_h = (float *)malloc(size); // Allocate array on host

a = (float *)malloc(size);

b = (float *)malloc(size);

cudaMalloc((void **) &a_d, size); // Allocate array on device

// Initialize host array and copy it to CUDA device

for (int i=0; i<N; i++)

   a_h[i] = (float)i;  

cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice);

// Do calculation on device:

int block_size = 1;

int n_blocks = N/block_size + (N%block_size == 0 ? 0:1);

cutStartTimer( timer ); // Start timer

square_array <<< n_blocks, block_size >>> (a_d, N);

cutStopTimer( timer ); // Stop timer

// Retrieve result from device and store it in host array

cudaMemcpy(a_h, a_d, sizeof(float)*N, cudaMemcpyDeviceToHost);

// Print results

printf(“CUDA execution time = %f ms\n”,cutGetTimerValue( timer ));

// Cleanup

free(a_h); cudaFree(a_d);

system(“pause”);

cutStartTimer( timercpu ); // Start timer

for (int i=0; i<N; i++) { a[i] = (float)i;

b[i]= a[i] * a[i] ;

}

cutStopTimer( timercpu ); // Stop timer

printf("CPU execution time = %f ms\n",cutGetTimerValue( timercpu )); 

system(“pause”);

return 0;

} [/codebox]

It seems that you only run 1 thread per block, you should run something like 256 - 512 threads per block, i.e. set block_size to 256. Then I think the “__syncthreads()” in the kernel is unnecessary.

It looks like your kernal is so simple that the CPU can finish the job in the time it takes to copy data over to the GPU! Give it more work to do to see it really shine. External Image

Sorted! I was building with the EmuDebug. When I changed it to release it ran much quicker.

Hi spp,

I’m wondering how you compiled this and what header files you used.

Thanks,

Matt

Hi matthewraymondwong,

I complied in visual studio and used the headers,
// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
#include <cutil_inline.h>
#include <cuda.h>