I wrote a small piece of test code; it is not optimized.
The purpose of the code is to find every index at which a query value occurs in an array of 50000000 elements.
/* Find the index positions of a query value in a one-dimensional array;
compares performance between the CPU and the GPU. */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#include <cutil_inline.h>
#include <cutil.h>
// Placeholder kernel: it performs no work, so timing a launch of it measures
// only the cost of launching a kernel and waiting for it to finish.
//   g_vector - pointer to the input values (currently unused)
//   g_result - pointer to the output buffer (currently unused)
// Because nothing is read or written, no particular grid/block shape is required.
__global__ void vectorcompare( int* g_vector, int* g_result)
{
// Intentionally empty: nothing to do for now.
}
// Abort with a readable diagnostic if a CUDA runtime call did not succeed.
//   status - value returned by the CUDA runtime call
//   what   - short description of the call, used in the error message
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Benchmark driver.
 *
 * 1. Fills a host array with pseudo-random values in [0, 999].
 * 2. Times a sequential CPU scan that records every index holding `query`.
 * 3. Copies the data to the device and times one launch of `vectorcompare`
 *    (launch + synchronize only; allocation and transfers are NOT timed).
 *
 * Returns 0 on success, EXIT_FAILURE if a host allocation fails; any CUDA
 * failure terminates the process through check_cuda().
 */
int main() {
    const int num_elements = 50000000;
    const size_t num_bytes = sizeof(int) * (size_t)num_elements;
    const int query = 7; // value to search for

    // Host buffers: input values and the indices found by the CPU scan.
    // Only the first `k` entries of h_result are meaningful after the scan.
    int *h_vector = (int*) malloc(num_bytes);
    int *h_result = (int*) malloc(num_bytes);
    if (h_vector == NULL || h_result == NULL) {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)num_bytes);
        free(h_vector);
        free(h_result);
        return EXIT_FAILURE;
    }

    for (int i = 0; i < num_elements; i++)
        h_vector[i] = rand() % 1000;

    // ---- CPU reference: sequential scan ----
    int k = 0; // number of matches found so far
    double start = (double)clock();
    for (int i = 0; i < num_elements; i++) {
        if (h_vector[i] == query) {
            h_result[k] = i;
            k++;
        }
    }
    double elapsed = ((double)clock() - start) / CLOCKS_PER_SEC;
    printf ("It took you %f seconds to analyze and find index of array where the query is.\n", elapsed );

    // ---- GPU: allocate and upload (deliberately outside the timed region) ----
    int *d_vector = NULL;
    int *d_result = NULL;
    check_cuda(cudaMalloc((void**) &d_vector, num_bytes), "cudaMalloc(d_vector)");
    check_cuda(cudaMemcpy(d_vector, h_vector, num_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(h_vector -> d_vector)");
    check_cuda(cudaMalloc((void**) &d_result, num_bytes), "cudaMalloc(d_result)");
    check_cuda(cudaMemcpy(d_result, h_result, num_bytes, cudaMemcpyHostToDevice),
               "cudaMemcpy(h_result -> d_result)");

    const int num_of_blocks = 1;
    const int num_of_threads_per_block = 1;

    // ---- GPU: time the kernel launch and its completion only ----
    double start2 = (double)clock();
    vectorcompare<<< num_of_blocks, num_of_threads_per_block >>>(d_vector, d_result);
    // A launch does not return a status; configuration errors are reported here.
    check_cuda(cudaGetLastError(), "vectorcompare launch");
    // Launches are asynchronous, so wait for completion before reading the clock.
    // cudaThreadSynchronize() is kept for toolkits of the cutil era; on current
    // toolkits the equivalent call is cudaDeviceSynchronize().
    check_cuda(cudaThreadSynchronize(), "vectorcompare execution");
    // Measure from start2, not start: using the CPU-phase timestamp would fold
    // the CPU scan, the allocations and the uploads into the reported GPU time.
    double elapsed2 = ((double)clock() - start2) / CLOCKS_PER_SEC;
    printf ("gpu time :%f seconds .\n", elapsed2);

    // Copy the result buffer back to the host.
    check_cuda(cudaMemcpy(h_result, d_result, num_bytes, cudaMemcpyDeviceToHost),
               "cudaMemcpy(d_result -> h_result)");

    // Release device and host memory.
    check_cuda(cudaFree(d_vector), "cudaFree(d_vector)");
    check_cuda(cudaFree(d_result), "cudaFree(d_result)");
    free(h_vector);
    free(h_result);
    return 0;
}
On my dual-core E4400 and GTS 250, the CPU version is 3x faster than a single kernel launch on its own (the GPU kernel does nothing).