Finding element on array gpu vs cpu why cpu is 3x time faster the gpu

icemanct · December 17, 2010, 11:15pm

Hi all,

I wrote a little portion of code for testing not optimized

The purpose of the code is to find the index of query on an array of 50000000 elements

/* find index position on an monodimensional array 

performance between cpu and gpu*/

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <cutil_inline.h>

#include <cutil.h>

__global__ void vectorcompare( int* g_vector, int* g_result) 

{	

//nothing to do for now

	

}

int main() {

int *h_vector = (int*) malloc(sizeof(int)*50000000); //vector for random number generated by cpu 0<x<1000 

int *h_result = (int*) malloc(sizeof(int)*50000000); //vector for result on cpu,not all result is true

int query=7; //query to search 

int i=0;

int k=0;

for (i=0;i<50000000;i++) 

		h_vector[i]=rand()%1000;

double start = (double)clock(); //time

double elapsed = 0; //time

for (i=0;i<50000000;i++) 

if (h_vector[i]==query) {

h_result[k]=i;

k++;

}

elapsed = ((double)clock() - start)/CLOCKS_PER_SEC; //time

printf ("It took you %f seconds to analyze and find index of array where the query is.\n", elapsed );

i=0;

int *d_vector;

	cudaMalloc( (void**) &d_vector, sizeof(int)*50000000) ; //alloc and copy to device

	cudaMemcpy( d_vector, h_vector, sizeof(int)*50000000, cudaMemcpyHostToDevice);

int *d_result; //alloc and copy to device

	cudaMalloc( (void**) &d_result, sizeof(int)*50000000);

	cudaMemcpy( d_result, h_result, sizeof(int)*50000000, cudaMemcpyHostToDevice) ;	

	

int num_of_blocks =1;

int num_of_threads_per_block = 1;

double start2 = (double)clock();

double elapsed2 = 0;

	

vectorcompare<<< num_of_blocks,num_of_threads_per_block>>>( d_vector,d_result);

	cudaThreadSynchronize() ;

elapsed2 = ((double)clock() - start)/CLOCKS_PER_SEC;

printf ("gpu time :%f seconds .\n", elapsed2);

cudaMemcpy( h_result, d_result, sizeof(int)*50000000, cudaMemcpyDeviceToHost) ; //copyng result to host

		

}

on my dual core e4400 and gts 250 the cpu version in 3x time faster than the only call of a kernel(the gpu does nothing)

why there is this large difference in time?

thanks in advance

Girolamo

Lev · December 18, 2010, 4:47am

Hi all,

I wrote a little portion of code for testing not optimized

The purpose of the code is to find the index of query on an array of 50000000 elements

/* find index position on an monodimensional array 

performance between cpu and gpu*/

#include <stdlib.h>

#include <stdio.h>

#include <cuda.h>

#include <cutil_inline.h>

#include <cutil.h>

__global__ void vectorcompare( int* g_vector, int* g_result) 

{	

//nothing to do for now

	

}

int main() {

int *h_vector = (int*) malloc(sizeof(int)*50000000); //vector for random number generated by cpu 0<x<1000 

int *h_result = (int*) malloc(sizeof(int)*50000000); //vector for result on cpu,not all result is true

int query=7; //query to search 

int i=0;

int k=0;

for (i=0;i<50000000;i++) 

		h_vector[i]=rand()%1000;

double start = (double)clock(); //time

double elapsed = 0; //time

for (i=0;i<50000000;i++) 

if (h_vector[i]==query) {

h_result[k]=i;

k++;

}

elapsed = ((double)clock() - start)/CLOCKS_PER_SEC; //time

printf ("It took you %f seconds to analyze and find index of array where the query is.\n", elapsed );

i=0;

int *d_vector;

	cudaMalloc( (void**) &d_vector, sizeof(int)*50000000) ; //alloc and copy to device

	cudaMemcpy( d_vector, h_vector, sizeof(int)*50000000, cudaMemcpyHostToDevice);

int *d_result; //alloc and copy to device

	cudaMalloc( (void**) &d_result, sizeof(int)*50000000);

	cudaMemcpy( d_result, h_result, sizeof(int)*50000000, cudaMemcpyHostToDevice) ;	

	

int num_of_blocks =1;

int num_of_threads_per_block = 1;

double start2 = (double)clock();

double elapsed2 = 0;

	

vectorcompare<<< num_of_blocks,num_of_threads_per_block>>>( d_vector,d_result);

	cudaThreadSynchronize() ;

elapsed2 = ((double)clock() - start)/CLOCKS_PER_SEC;

printf ("gpu time :%f seconds .\n", elapsed2);

cudaMemcpy( h_result, d_result, sizeof(int)*50000000, cudaMemcpyDeviceToHost) ; //copyng result to host

		

}

on my dual core e4400 and gts 250 the cpu version in 3x time faster than the only call of a kernel(the gpu does nothing)

why there is this large difference in time?

thanks in advance

Girolamo

elapsed2 = ((double)clock() - start

should be start2?

icemanct · December 18, 2010, 8:26am

i’m sorry i’m a stupid coder External Image External Image External Image

you are right

many thanks