Hi,
I am trying to convert our code from Visual Studio 2008 + CUDA 3.0 to Visual Studio 2010 + CUDA 4.0.
Unfortunately, after building the code in the new environment, it runs very slowly.
I reduced our code to the sample below.
When built in the old environment (VS 2008 + CUDA 3.0) it completes after 125 milliseconds.
When built with VS 2010 + CUDA 4.0 it completes after about 2500 milliseconds (20 times slower!), and most of the time the display stops responding during that period.
Could someone help me and tell me what I did wrong?
Thanks,
Israel
#include <stdio.h>
// Per-object record stored in the device array that Kernel reads from.
// The sample never interprets the individual members; it only copies whole
// records, so the member order and types below are what matters.
struct DataStruct
{
    float  m[6];          // six scalar values
    float3 data1, data2;  // two 3-component float vectors, in this order
};
// Extent, in threads, that the launch grid must cover in both x and y.
// RunKernel divides it by the block dimensions (rounding up) to get the grid size.
#define MAX_X_Y 700
// Number of elements in each device array allocated by RunKernel, and the
// trip count of the loop inside Kernel.
#define NUM_DATA_OBJECTS 10000
// Reproduction kernel for the reported slowdown.
//
// Every launched thread does the same work: it walks all NUM_DATA_OBJECTS
// entries of `indices` and, whenever an entry equals 0, copies data[0] into a
// thread-local DataStruct. Thread and block indices are not used and nothing
// is written back, so the launch geometry only determines how many threads
// repeat this loop.
//
// Parameters:
//   data    - array of DataStruct; element 0 is read when an index equals 0.
//   indices - array of at least NUM_DATA_OBJECTS ints.
//
// NOTE(review): the loaded value is never used, so a compiler is free to drop
// the load entirely. Different toolchains may therefore produce very different
// timings for this kernel - confirm by inspecting the generated PTX/SASS.
__global__ void Kernel ( DataStruct* data
                       , int* indices
                       )
{
    for (int i = 0; i < NUM_DATA_OBJECTS; ++i)
    {
        int tiptr_i = indices[i];
        if ( tiptr_i == 0 )
        {
            // Scope the copy explicitly; it is intentionally unused afterwards.
            DataStruct tri = data[tiptr_i];
        }
    }
}
// Integer division of a by b, rounded up when a is non-negative and b is
// positive. Used to compute how many blocks are needed to cover an extent.
inline int iDivUp(int a, int b)
{
    // Bias the numerator so that truncating division rounds up.
    const int padded = a + b - 1;
    return padded / b;
}
// Thread-block dimensions used by RunKernel for the launch (x by y threads per block).
#define NUM_THREADS_X (8)
#define NUM_THREADS_Y (8)
// Prints `err` in the sample's error format when it denotes a failure.
// Returns true if an error was reported, false for cudaSuccess.
static bool ReportCudaError(cudaError_t err)
{
    if (err == cudaSuccess)
        return false;
    printf("Cuda error: (%d) %s.\n", (int)err, cudaGetErrorString(err));
    return true;
}

// Allocates and zero-fills the two device arrays, launches Kernel over a 2D
// grid covering MAX_X_Y x MAX_X_Y threads, waits for it to finish, reports any
// CUDA error on stdout, and releases the device memory again.
void RunKernel(void)
{
    DataStruct* m_data = NULL;
    int* m_indices = NULL;

    const size_t dataBytes  = NUM_DATA_OBJECTS * sizeof(DataStruct);
    const size_t indexBytes = NUM_DATA_OBJECTS * sizeof(int);

    // Stop at the first failing setup call; later calls would only repeat the error.
    bool failed = ReportCudaError(cudaMalloc((void**)&m_data, dataBytes));
    if (!failed)
        failed = ReportCudaError(cudaMalloc((void**)&m_indices, indexBytes));
    if (!failed)
        failed = ReportCudaError(cudaMemset((void*)m_data, 0, dataBytes));
    if (!failed)
        failed = ReportCudaError(cudaMemset((void*)m_indices, 0, indexBytes));

    if (!failed)
    {
        dim3 dimBlock(NUM_THREADS_X, NUM_THREADS_Y);
        dim3 dimGrid(iDivUp(MAX_X_Y, NUM_THREADS_X), iDivUp(MAX_X_Y, NUM_THREADS_Y));

        cudaGetLastError(); // reset the error code

        Kernel<<<dimGrid, dimBlock>>>(m_data, m_indices);

        // A launch does not return a status; an invalid configuration is only
        // visible through cudaGetLastError().
        failed = ReportCudaError(cudaGetLastError());

        if (!failed)
        {
            // Block until the kernel has finished so execution errors surface here.
            // cudaThreadSynchronize is kept so the sample still builds with CUDA 3.0;
            // newer toolkits provide cudaDeviceSynchronize as its replacement.
            ReportCudaError(cudaThreadSynchronize());
        }
    }

    // Release the device buffers on every path; cudaFree accepts a null pointer.
    cudaFree(m_indices);
    cudaFree(m_data);
}