I’ve got some code that isn’t working as expected. It may be that I’ve been staring at it for far too long, but I cannot get it working right. NB: this is my first CUDA application, so any help is appreciated. Please let me know if more details are required; I’m still just a newbie :).
The symptoms:

When compiled with the -deviceemu flag, it:
- works as expected when run in the VS2008 debugger
- works as expected when run from the command line

When compiled without the -deviceemu flag, it:
- crashes when run in the VS2008 debugger (the crash happens when the kernel is executed)
- fills h_sum_out with 0.0000 values when run from the command line
#define BLOCK_SIZE 200

struct svm_node
{
    int index;
    float value;
};

struct svm_problem
{
    unsigned int rowCount;          // number of rows
    float* y;                       // target value for each vector (only used in a classifier; can be ignored for now)
    unsigned int* columnCount;      // number of columns in each row
    unsigned long* columnOffsets;   // starting offset of each row's columns in x
    svm_node* x;                    // all rows flattened into one 1D array
};
The structs above represent a collection of sparse N-dimensional vectors; the code below only ever works on a single svm_problem at a time.
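To make the flattened layout concrete, here's a tiny made-up example (the numbers are purely illustrative, not from my real data):

// Hypothetical example: 3 rows with 2, 3 and 1 columns respectively.
// columnCount   = { 2, 3, 1 }
// columnOffsets = { 0, 2, 5 }   // running sum of the counts
// x then holds 6 svm_nodes back to back; row i occupies
// x[columnOffsets[i]] .. x[columnOffsets[i] + columnCount[i] - 1]
svm_node* row1 = &problem.x[problem.columnOffsets[1]]; // the 3 nodes of row 1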
The program flow is as follows:

1. Create an svm_problem, problem, on the host and fill it with some data:
   a. malloc for the members y, columnCount and columnOffsets
   b. set the number of columns/dimensions of every vector in problem to something like 10
   c. calculate the offset of each row in the 1D array problem.x
   d. malloc for the 1D array problem.x
   e. fill problem.x with vectors
2. Create an svm_problem, d_problem_host, on the host whose members point to device memory (the marshalling code; a condensed sketch of steps 2 and 3 follows this list):
   a. cudaMalloc for the d_problem_host members
   b. cudaMemcpy the problem members to the d_problem_host members
3. Create an svm_problem, d_problem, which is stored on the device:
   a. cudaMalloc for d_problem
   b. cudaMemcpy d_problem_host to d_problem
4. Prepare d_sum_out and h_sum_out.
5. Execute the kernel.
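Condensed, steps 2 and 3 boil down to this pattern (a minimal sketch of my marshalling with the cutil error checking stripped; the full code is further down):

svm_problem d_problem_host;                 // host copy whose members are device pointers
cudaMalloc((void**)&d_problem_host.x, memsize);
cudaMemcpy(d_problem_host.x, problem.x, memsize, cudaMemcpyHostToDevice);
// ... same cudaMalloc/cudaMemcpy for y, columnCount and columnOffsets ...
d_problem_host.rowCount = problem.rowCount;

svm_problem* d_problem;                     // the struct itself, living on the device
cudaMalloc((void**)&d_problem, sizeof(svm_problem));
cudaMemcpy(d_problem, &d_problem_host, sizeof(svm_problem), cudaMemcpyHostToDevice);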
The kernel svm_dot_kernel calculates the dot product of each vector in xNodes with itself (its squared norm) using the device_dot function. It isn't specific to an svm_problem, and therefore it takes an array of svm_nodes instead of an svm_problem. It's worth mentioning that svm_dot_kernel worked before I put the svm_nodes into an svm_problem.
// outputs a single float to the "float* sum_out" parameter
__device__ void device_dot(svm_node* px, svm_node* py,
                           const unsigned int indexI, const unsigned int indexJ, float* sum_out,
                           unsigned long* integerOffsets, unsigned int* columnCount)
{
    float sum = 0;
    unsigned int i = integerOffsets[indexI];
    unsigned int j = integerOffsets[indexJ];
    // walk both sparse vectors until either runs out of columns
    while (i < (integerOffsets[indexI] + columnCount[indexI])
        && j < (integerOffsets[indexJ] + columnCount[indexJ]))
    {
        if (px[i].index != -1 && py[j].index != -1)
        {
            // both nodes valid: accumulate and advance both sides
            sum += px[i].value * py[j].value;
            ++i;
            ++j;
        }
        else
        {
            // otherwise advance the side with the smaller index (skips -1 sentinel nodes)
            if (px[i].index > py[j].index)
                ++j;
            else
                ++i;
        }
    }
    *sum_out = sum;
}
__global__ void svm_dot_kernel(svm_node* xNodes, float* sum_out,
                               unsigned long* integerOffsets, unsigned int* columnCount,
                               unsigned int rowCount)
{
    unsigned int i = threadIdx.x;
    unsigned int j = blockIdx.x * BLOCK_SIZE;
    unsigned int index = j + i;   // global row handled by this thread
    if (index < rowCount)
    {
        float value = 0.0f;
        // dot each vector with itself
        device_dot(xNodes, xNodes, index, index, &value, integerOffsets, columnCount);
        sum_out[index] = value;
    }
}
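Since the launch below always uses BLOCK_SIZE threads per block, the index computation should be equivalent to the usual idiom (assuming blockDim.x == BLOCK_SIZE holds):

unsigned int index = blockIdx.x * blockDim.x + threadIdx.x;   // same value as j + i above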
void runTest(int argc, char** argv)
{
    // initialize a problem
    svm_problem problem = {0};
    problem.rowCount = 500;
    problem.y = (float*) malloc(sizeof(float) * problem.rowCount);
    problem.columnCount = (unsigned int*) malloc(sizeof(unsigned int) * problem.rowCount);
    problem.columnOffsets = (unsigned long*) malloc(sizeof(unsigned long) * problem.rowCount);
    unsigned long size = 0;
    unsigned int memsize;

    // use the command-line specified CUDA device, otherwise use the device with the highest Gflops/s
    if (cutCheckCmdLineFlag(argc, (const char**)argv, "device"))
        cutilDeviceInit(argc, argv);
    else
        cudaSetDevice(cutGetMaxGflopsDeviceId());

    unsigned int timer = 0;
    cutilCheckError(cutCreateTimer(&timer));
    cutilCheckError(cutStartTimer(timer));

    // Create a list of column counts to test; in reality this will be
    // random and created by parsing a file
    for (unsigned int i = 0; i < problem.rowCount; i++)
    {
        problem.columnCount[i] = 40;
    }

    // Calculate the offset of each row
    for (unsigned int i = 0; i < problem.rowCount; i++)
    {
        problem.columnOffsets[i] = size;
        size += problem.columnCount[i];
    }

    memsize = sizeof(svm_node) * size;
    problem.x = (svm_node*) malloc(memsize);

    // Fill problem.x with data
    for (unsigned int i = 0; i < problem.rowCount; i++)
    {
        for (unsigned int j = 0; j < problem.columnCount[i]; j++)
        {
            svm_node item;
            item.index = j;
            item.value = 0.5f;
            problem.x[problem.columnOffsets[i] + j] = item;
        }
    }

    // allocate device memory for d_problem - marshalling
    svm_problem* d_problem;
    svm_problem d_problem_host;   // used for marshalling
    d_problem_host.rowCount = problem.rowCount;
    cutilSafeCall(cudaMalloc((void**) &d_problem, sizeof(svm_problem)));
    cutilSafeCall(cudaMalloc((void**) &d_problem_host.y, sizeof(float) * problem.rowCount));
    cutilSafeCall(cudaMalloc((void**) &d_problem_host.columnCount, sizeof(unsigned int) * problem.rowCount));
    cutilSafeCall(cudaMalloc((void**) &d_problem_host.columnOffsets, sizeof(unsigned long) * problem.rowCount));
    cutilSafeCall(cudaMalloc((void**) &d_problem_host.x, memsize));

    // copy problem to the device
    cutilSafeCall(cudaMemcpy(d_problem_host.y, problem.y, sizeof(float) * problem.rowCount, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_problem_host.columnCount, problem.columnCount, sizeof(unsigned int) * problem.rowCount, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_problem_host.columnOffsets, problem.columnOffsets, sizeof(unsigned long) * problem.rowCount, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_problem_host.x, problem.x, memsize, cudaMemcpyHostToDevice));
    cutilSafeCall(cudaMemcpy(d_problem, &d_problem_host, sizeof(svm_problem), cudaMemcpyHostToDevice));

    float* d_sum_out;
    cutilSafeCall(cudaMalloc((void**) &d_sum_out, sizeof(float) * problem.rowCount));

    // Overwrite problem.x with different data to verify that d_problem has its
    // own copy of the values rather than pointing at problem's memory
    for (unsigned int i = 0; i < problem.rowCount; i++)
    {
        for (unsigned int j = 0; j < problem.columnCount[i]; j++)
        {
            svm_node item;
            item.index = j;
            item.value = 1.5f;
            problem.x[problem.columnOffsets[i] + j] = item;
        }
    }
    free(problem.x);

    // kernel parameters
    unsigned int numBlocks = problem.rowCount / BLOCK_SIZE + 1;
    dim3 blocks(numBlocks, 1, 1);
    dim3 threads(BLOCK_SIZE, 1, 1);

    // execute kernel
    svm_dot_kernel<<< blocks, threads >>>(d_problem->x, d_sum_out, d_problem->columnOffsets, d_problem->columnCount, d_problem->rowCount);
    cutilCheckMsg("Kernel execution failed");

    float* h_sum_out = (float*) malloc(sizeof(float) * problem.rowCount);
    cutilSafeCall(cudaMemcpy(h_sum_out, d_sum_out, sizeof(float) * problem.rowCount, cudaMemcpyDeviceToHost));

    cutilCheckError(cutStopTimer(timer));
    printf("Processing time: %f (ms)\n", cutGetTimerValue(timer));
    cutilCheckError(cutDeleteTimer(timer));

    for (unsigned int i = 0; i < problem.rowCount; i++)
    {
        printf("%f \n", h_sum_out[i]);
    }

    free(problem.y);
    free(problem.columnCount);
    free(problem.columnOffsets);
    free(h_sum_out);
    cutilSafeCall(cudaFree(d_sum_out));
}
What do you guys think?