Hello,
I’m evaluating CUDA using my old GeForce 9600GT. I’ve created a simple test and got some results. Can someone help me estimate what those results would be on modern devices like the Tesla M2090 or better?
For the evaluation, I generate an array of 32-bit integers and count those that contain bits specified by a mask. I found that the 9600GT can process 4 different masks over 256 MB of numbers in 180 ms. What speed can be achieved on modern devices?
Here is my test code:
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
// Block size used for the kernel launch, i.e. the XXX in <<<1, XXX>>>.
const int THREADS = 512;
// Total amount of input data to scan, expressed in bytes (256 * 1 MiB).
const int DATASIZE = 256 * 1024 * 1024;
// Counts, per thread, how many of the N 32-bit words in `page` share at least
// one set bit with `mask`.
//
// Launch layout: 1D grid of 1D blocks, any size. Each thread walks the data
// with a stride equal to the total number of launched threads and writes its
// private count to Results[global thread index]; the caller sums those
// partial counts.
//
// Preconditions:
//   - Results must hold at least gridDim.x * blockDim.x ints.
//   - page is read through 16-byte uint4 loads, so it must be 16-byte aligned
//     (NOTE(review): pointers returned by cudaMalloc are expected to satisfy
//     this; confirm for any other allocation source).
//   - N <= 0 is treated as an empty input (every thread reports 0).
__global__ void processPage(unsigned int * page, int * Results, int N, int mask)
{
    const uint4 * vpage = reinterpret_cast<const uint4 *>(page);

    // Test bits on unsigned values: with signed ints, a match on bit 31 makes
    // (v & mask) negative and a "> 0" comparison would miss it.
    const unsigned int bits = static_cast<unsigned int>(mask);

    const size_t total    = (N > 0) ? static_cast<size_t>(N) : 0;
    const size_t vecCount = total / 4;

    // Derive index and stride from the actual launch configuration instead of
    // a host-side constant, so the kernel stays correct for any grid/block size.
    const size_t tid    = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;

    int c = 0;

    // Main part: four words per load.
    for ( size_t i = tid; i < vecCount; i += stride )
    {
        const uint4 v = vpage[i];
        c += ((v.x & bits) != 0u) ? 1 : 0;
        c += ((v.y & bits) != 0u) ? 1 : 0;
        c += ((v.z & bits) != 0u) ? 1 : 0;
        c += ((v.w & bits) != 0u) ? 1 : 0;
    }

    // Tail: the last N % 4 words that do not fill a whole uint4.
    for ( size_t i = vecCount * 4 + tid; i < total; i += stride )
    {
        c += ((page[i] & bits) != 0u) ? 1 : 0;
    }

    Results[tid] = c;
}
// --- Host-side state ---
// Number of 32-bit words in the data set (assigned in initTest()).
int N;
// Staging buffer used on the host while the input is being generated.
unsigned int * data;
// Per-thread partial counts copied back from the GPU after each kernel run.
int Results[THREADS];

// --- Device-side state ---
// Input words resident in GPU memory.
unsigned int * cData;
// GPU buffer the kernel writes its per-thread counts into.
int * cResults;
// Terminates the program with a diagnostic if a CUDA runtime call issued by
// initTest() did not succeed.
static void initTestCheck(cudaError_t status, const char * what)
{
    if ( status != cudaSuccess )
    {
        fprintf(stderr, "initTest: %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Allocates the device buffers, fills the input with the sequence 0..N-1 and
// uploads it to the GPU. Sets the globals N, cResults and cData; the host
// staging buffer `data` is released again before returning.
// Any allocation or copy failure aborts the program, since every later step
// depends on these buffers.
void initTest()
{
    N = DATASIZE / sizeof(int);
    const size_t bytes = (size_t)N * sizeof(int);

    initTestCheck(cudaMalloc(&cResults, THREADS*sizeof(int)), "cudaMalloc(cResults)");
    initTestCheck(cudaMalloc(&cData, bytes), "cudaMalloc(cData)");

    data = (unsigned int*)malloc(bytes);
    if ( data == NULL )
    {
        fprintf(stderr, "initTest: malloc of %lu bytes failed\n", (unsigned long)bytes);
        exit(EXIT_FAILURE);
    }

    for ( int i = 0; i < N; i++ )
        data[i] = i;

    initTestCheck(cudaMemcpy(cData, data, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(cData)");

    free(data);
    data = NULL; // do not leave the global pointing at freed memory
}
// Terminates the program with a diagnostic if a CUDA runtime call issued by
// doTest() did not succeed.
static void doTestCheck(cudaError_t status, const char * what)
{
    if ( status != cudaSuccess )
    {
        fprintf(stderr, "doTest: %s failed: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Runs processPage over the uploaded data for one mask and returns the total
// number of matching words (the sum of all per-thread counts).
// Aborts the program if the launch or the result copy fails, because the
// contents of Results would be meaningless in that case.
int doTest(int mask)
{
    // NOTE: this launches a single block of THREADS threads.
    processPage<<<1, THREADS>>>(cData, cResults, N, mask);

    // A kernel launch returns no status itself; launch errors are reported
    // through cudaGetLastError().
    doTestCheck(cudaGetLastError(), "processPage launch");

    // Blocking copy: waits for the kernel to finish and also reports errors
    // that occurred while it was executing.
    doTestCheck(cudaMemcpy(Results, cResults, THREADS*sizeof(int), cudaMemcpyDeviceToHost),
                "cudaMemcpy(Results)");

    int c = 0;
    for ( int i = 0; i < THREADS; i++ )
        c += Results[i];
    return c;
}
// Releases the device buffers held in cData and cResults.
// Failures are reported on stderr but do not abort, since this runs during
// teardown after the measurements have been taken.
void finishTest()
{
    // cudaFree expects the device pointer itself, not the address of the host
    // variable that stores it.
    cudaError_t status = cudaFree(cData);
    if ( status != cudaSuccess )
        fprintf(stderr, "finishTest: cudaFree(cData) failed: %s\n", cudaGetErrorString(status));
    cData = NULL;

    status = cudaFree(cResults);
    if ( status != cudaSuccess )
        fprintf(stderr, "finishTest: cudaFree(cResults) failed: %s\n", cudaGetErrorString(status));
    cResults = NULL;
}
// Benchmark driver: uploads the data once, runs the count for four masks,
// and prints "<sum of counts> - <elapsed>ms".
int main()
{
    initTest();

    // Keep the timestamps as clock_t; narrowing them to int can truncate.
    // Note that clock() reports processor time consumed by the program.
    const clock_t start = clock();

    int r = 0;
    r += doTest(87234);
    r += doTest(45786);
    r += doTest(923569726);
    r += doTest(51465123);

    const clock_t elapsed = clock() - start;

    finishTest();

    // Convert in floating point: multiplying the tick count by 1000 in int
    // overflows after roughly two seconds when CLOCKS_PER_SEC is 1000000.
    const long ms = (long)((double)elapsed * 1000.0 / (double)CLOCKS_PER_SEC);

    // %ld matches the long argument (the original passed a clock_t-typed
    // expression to %d).
    printf("%d - %ldms\n", r, ms);
    return 0;
}