Help with strange error

Hi eveoryone,

I’m running the listed kernel with emulation and it works perfectly. I get the correct

results for the vector scan, but when I run it without emulation I get very strange output

like nothing is being saved. I get all 0.0s. I think the temp array is not getting copied into

g_odata.

I was wondering if someone might have an idea what may be going wrong when it’s

running concurrently in the GPU :.

Any help appreciated,

Ted.

#include <iostream>

#include <ctime>

using namespace std;

const int blocksize = 16;

time_t seconds;

__global__ void scan(float *g_odata, float *g_idata, int n)

{

	extern __shared__ float temp[]; // allocated on invocation

	int thid = threadIdx.x;

	int pout = 0, pin = 1;

// load input into shared memory.

// This is exclusive scan, so shift right by one and set first elt to 0

	temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;

	__syncthreads();

	for ( int offset = 1; offset < n; offset *= 2 )

	{

		pout = 1 - pout; // swap double buffer indices

		pin = 1 - pout;

		if ( thid >= offset )

			temp[pout*n+thid] += temp[pin*n+thid - offset];

		else

			temp[pout*n+thid] = temp[pin*n+thid];

		__syncthreads();

	}

	g_odata[thid] = temp[pout*n+thid]; // write output

	// printf("g_odata = %f\n", g_odata[thid]);

}

Have you tried checking to see whether any errors are returned by the CUDA functions?

You may have run out of registers. Try reducing the number of threads per block increasing the number of blocks per grid instead.

How do I check if there are errors returned by the CUDA function, do they appear

when I compile? Because when I’ve compiled the program no error shows up.

Ted.

try to put

CUERR;

after the kernel launch

define CUERR do{ cudaError_t err; \

    cudaThreadSynchronize();        \         

    if ((err = cudaGetLastError()) != cudaSuccess) { \

    printf("ERROR: CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \

    exit(1);}}while(0)

Hmmm. I don’t get any errors :\

I’m just posting the whole code:

#define CUERR do{ cudaError_t err; \

cudaThreadSynchronize(); \

if ((err = cudaGetLastError()) != cudaSuccess) { \

printf("ERROR: CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \

exit(1);}}while(0)

#include <iostream>

#include <ctime>

using namespace std;

/**

 * Number of threads per block

 */

const int blocksize = 8;

time_t seconds;

__global__ void scan(float *g_odata, float *g_idata, int n)

{

	extern __shared__ float temp[]; // allocated on invocation

	int thid = threadIdx.x;

	int pout = 0, pin = 1;

// load input into shared memory.

// This is exclusive scan, so shift right by one and set first elt to 0

	temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;

	__syncthreads();

	for ( int offset = 1; offset < n; offset *= 2 )

	{

		pout = 1 - pout; // swap double buffer indices

		pin = 1 - pout;

		if ( thid >= offset )

			temp[pout*n+thid] += temp[pin*n+thid - offset];

		else

			temp[pout*n+thid] = temp[pin*n+thid];

		__syncthreads();

	}

	g_odata[thid] = temp[pout*n+thid]; // write output

	// g_odata[thid] = 3.0;

//  printf("g_odata = %f\n", g_odata[thid]);

}

__global__ void prescan(float *g_odata, float *g_idata, int n)

{

	extern __shared__ float temp[];// allocated on invocation

	int thid = threadIdx.x;

	int offset = 1;

	temp[2*thid] = g_idata[2*thid]; // load input into shared memory

	temp[2*thid+1] = g_idata[2*thid+1];

	for ( int d = n>>1; d > 0; d >>= 1 ) // build sum in place up the tree

	{

		__syncthreads();

		if ( thid < d )

		{

			int ai = offset*(2*thid+1)-1;

			int bi = offset*(2*thid+2)-1;

			temp[bi] += temp[ai];

		}

		offset *= 2;

	}

	if ( thid == 0 )

	{

		temp[n - 1] = 0;

	} // clear the last element

	for ( int d = 1; d < n; d *= 2 ) // traverse down tree & build scan

	{

		offset >>= 1;

		__syncthreads();

		if ( thid < d )

		{

			int ai = offset*(2*thid+1)-1;

			int bi = offset*(2*thid+2)-1;

			float t = temp[ai];

			temp[ai] = temp[bi];

			temp[bi] += t;

		}

	}

	__syncthreads();

	g_odata[2*thid] = temp[2*thid]; // write results to device memory

	g_odata[2*thid+1] = temp[2*thid+1];

}

int main ( int argc, char *argv[] )

{

	/**

	 * Command line arguments must be 1 which is the number of rows 

	 * and columns for a matrix and the size of the vector. 

	 */

//  if ( argc != 2 )

//  {

//	  cout<<"usage: "<< argv[0] <<" <size n>\n";

//	  return EXIT_FAILURE;

//  }

	int N = 1024;

	float *a = new float[N];

	float *b = new float[N];

//  float *c = new float[N];

	seconds = time (NULL);

	srand(seconds);

	for ( int i = 0; i < N; ++i )

	{

		a[i] = (float) i*2.0;

		//cout << "a[" << i << "]: " << a[i] << endl; 

	}

	for ( int i = 0; i < N; ++i )

	{

		// calculate a random number between 0 and 1000

		b[i] = (float) 10;

	}

	float *ad, *bd;

	const int sizeVec = N*sizeof(float);

	cudaMalloc( (void**)&ad, sizeVec );

	cudaMalloc( (void**)&bd, sizeVec );

	cudaMemcpy( ad, a, sizeVec, cudaMemcpyHostToDevice );

	CUERR;

	dim3 dimBlock(blocksize);

	dim3 dimGrid(ceil(N/(float)blocksize));

	cout << endl;

	prescan<<<dimGrid, dimBlock>>>( bd, ad, N );

	CUERR;

	cudaMemcpy( b, bd, sizeVec, cudaMemcpyDeviceToHost );

	CUERR;

	/**

	 * GPU Output.

	 */

	/*for ( int i = 0; i < N; ++i )

	{

		cout << "b[" << i << "]: " << b[i] << endl; 

	}*/

	cudaFree( ad ); 

	cudaFree( bd ); 

	delete[] a;

	delete[] b;

	return 0;

}

I’m using this command to compile it:

Ted.

Hmmm. I don’t get any errors :\

I’m just posting the whole code:

#define CUERR do{ cudaError_t err; \

cudaThreadSynchronize(); \

if ((err = cudaGetLastError()) != cudaSuccess) { \

printf("ERROR: CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \

exit(1);}}while(0)

#include <iostream>

#include <ctime>

using namespace std;

/**

 * Number of threads per block

 */

const int blocksize = 8;

time_t seconds;

__global__ void scan(float *g_odata, float *g_idata, int n)

{

	extern __shared__ float temp[]; // allocated on invocation

	int thid = threadIdx.x;

	int pout = 0, pin = 1;

// load input into shared memory.

// This is exclusive scan, so shift right by one and set first elt to 0

	temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;

	__syncthreads();

	for ( int offset = 1; offset < n; offset *= 2 )

	{

		pout = 1 - pout; // swap double buffer indices

		pin = 1 - pout;

		if ( thid >= offset )

			temp[pout*n+thid] += temp[pin*n+thid - offset];

		else

			temp[pout*n+thid] = temp[pin*n+thid];

		__syncthreads();

	}

	g_odata[thid] = temp[pout*n+thid]; // write output

	// g_odata[thid] = 3.0;

//  printf("g_odata = %f\n", g_odata[thid]);

}

__global__ void prescan(float *g_odata, float *g_idata, int n)

{

	extern __shared__ float temp[];// allocated on invocation

	int thid = threadIdx.x;

	int offset = 1;

	temp[2*thid] = g_idata[2*thid]; // load input into shared memory

	temp[2*thid+1] = g_idata[2*thid+1];

	for ( int d = n>>1; d > 0; d >>= 1 ) // build sum in place up the tree

	{

		__syncthreads();

		if ( thid < d )

		{

			int ai = offset*(2*thid+1)-1;

			int bi = offset*(2*thid+2)-1;

			temp[bi] += temp[ai];

		}

		offset *= 2;

	}

	if ( thid == 0 )

	{

		temp[n - 1] = 0;

	} // clear the last element

	for ( int d = 1; d < n; d *= 2 ) // traverse down tree & build scan

	{

		offset >>= 1;

		__syncthreads();

		if ( thid < d )

		{

			int ai = offset*(2*thid+1)-1;

			int bi = offset*(2*thid+2)-1;

			float t = temp[ai];

			temp[ai] = temp[bi];

			temp[bi] += t;

		}

	}

	__syncthreads();

	g_odata[2*thid] = temp[2*thid]; // write results to device memory

	g_odata[2*thid+1] = temp[2*thid+1];

}

int main ( int argc, char *argv[] )

{

	/**

	 * Command line arguments must be 1 which is the number of rows 

	 * and columns for a matrix and the size of the vector. 

	 */

//  if ( argc != 2 )

//  {

//	  cout<<"usage: "<< argv[0] <<" <size n>\n";

//	  return EXIT_FAILURE;

//  }

	int N = 1024;

	float *a = new float[N];

	float *b = new float[N];

//  float *c = new float[N];

	seconds = time (NULL);

	srand(seconds);

	for ( int i = 0; i < N; ++i )

	{

		a[i] = (float) i*2.0;

		//cout << "a[" << i << "]: " << a[i] << endl; 

	}

	for ( int i = 0; i < N; ++i )

	{

		// calculate a random number between 0 and 1000

		b[i] = (float) 10;

	}

	float *ad, *bd;

	const int sizeVec = N*sizeof(float);

	cudaMalloc( (void**)&ad, sizeVec );

	cudaMalloc( (void**)&bd, sizeVec );

	cudaMemcpy( ad, a, sizeVec, cudaMemcpyHostToDevice );

	CUERR;

	dim3 dimBlock(blocksize);

	dim3 dimGrid(ceil(N/(float)blocksize));

	cout << endl;

	prescan<<<dimGrid, dimBlock>>>( bd, ad, N );

	CUERR;

	cudaMemcpy( b, bd, sizeVec, cudaMemcpyDeviceToHost );

	CUERR;

	/**

	 * GPU Output.

	 */

	/*for ( int i = 0; i < N; ++i )

	{

		cout << "b[" << i << "]: " << b[i] << endl; 

	}*/

	cudaFree( ad ); 

	cudaFree( bd ); 

	delete[] a;

	delete[] b;

	return 0;

}

I’m using this command to compile it:

Ted.

Hmmm. I don’t get any errors :\

I’m just posting the whole code:

#define CUERR do{ cudaError_t err; \

cudaThreadSynchronize(); \

if ((err = cudaGetLastError()) != cudaSuccess) { \

printf("ERROR: CUDA error: %s, line %d\n", cudaGetErrorString(err), __LINE__); \

exit(1);}}while(0)

#include <iostream>

#include <ctime>

using namespace std;

/**

 * Number of threads per block

 */

const int blocksize = 8;

time_t seconds;

__global__ void scan(float *g_odata, float *g_idata, int n)

{

	extern __shared__ float temp[]; // allocated on invocation

	int thid = threadIdx.x;

	int pout = 0, pin = 1;

// load input into shared memory.

// This is exclusive scan, so shift right by one and set first elt to 0

	temp[pout*n + thid] = (thid > 0) ? g_idata[thid-1] : 0;

	__syncthreads();

	for ( int offset = 1; offset < n; offset *= 2 )

	{

		pout = 1 - pout; // swap double buffer indices

		pin = 1 - pout;

		if ( thid >= offset )

			temp[pout*n+thid] += temp[pin*n+thid - offset];

		else

			temp[pout*n+thid] = temp[pin*n+thid];

		__syncthreads();

	}

	g_odata[thid] = temp[pout*n+thid]; // write output

	// g_odata[thid] = 3.0;

//  printf("g_odata = %f\n", g_odata[thid]);

}

__global__ void prescan(float *g_odata, float *g_idata, int n)

{

	extern __shared__ float temp[];// allocated on invocation

	int thid = threadIdx.x;

	int offset = 1;

	temp[2*thid] = g_idata[2*thid]; // load input into shared memory

	temp[2*thid+1] = g_idata[2*thid+1];

	for ( int d = n>>1; d > 0; d >>= 1 ) // build sum in place up the tree

	{

		__syncthreads();

		if ( thid < d )

		{

			int ai = offset*(2*thid+1)-1;

			int bi = offset*(2*thid+2)-1;

			temp[bi] += temp[ai];

		}

		offset *= 2;

	}

	if ( thid == 0 )

	{

		temp[n - 1] = 0;

	} // clear the last element

	for ( int d = 1; d < n; d *= 2 ) // traverse down tree & build scan

	{

		offset >>= 1;

		__syncthreads();

		if ( thid < d )

		{

			int ai = offset*(2*thid+1)-1;

			int bi = offset*(2*thid+2)-1;

			float t = temp[ai];

			temp[ai] = temp[bi];

			temp[bi] += t;

		}

	}

	__syncthreads();

	g_odata[2*thid] = temp[2*thid]; // write results to device memory

	g_odata[2*thid+1] = temp[2*thid+1];

}

int main ( int argc, char *argv[] )

{

	/**

	 * Command line arguments must be 1 which is the number of rows 

	 * and columns for a matrix and the size of the vector. 

	 */

//  if ( argc != 2 )

//  {

//	  cout<<"usage: "<< argv[0] <<" <size n>\n";

//	  return EXIT_FAILURE;

//  }

	int N = 1024;

	float *a = new float[N];

	float *b = new float[N];

//  float *c = new float[N];

	seconds = time (NULL);

	srand(seconds);

	for ( int i = 0; i < N; ++i )

	{

		a[i] = (float) i*2.0;

		//cout << "a[" << i << "]: " << a[i] << endl; 

	}

	for ( int i = 0; i < N; ++i )

	{

		// calculate a random number between 0 and 1000

		b[i] = (float) 10;

	}

	float *ad, *bd;

	const int sizeVec = N*sizeof(float);

	cudaMalloc( (void**)&ad, sizeVec );

	cudaMalloc( (void**)&bd, sizeVec );

	cudaMemcpy( ad, a, sizeVec, cudaMemcpyHostToDevice );

	CUERR;

	dim3 dimBlock(blocksize);

	dim3 dimGrid(ceil(N/(float)blocksize));

	cout << endl;

	prescan<<<dimGrid, dimBlock>>>( bd, ad, N );

	CUERR;

	cudaMemcpy( b, bd, sizeVec, cudaMemcpyDeviceToHost );

	CUERR;

	/**

	 * GPU Output.

	 */

	/*for ( int i = 0; i < N; ++i )

	{

		cout << "b[" << i << "]: " << b[i] << endl; 

	}*/

	cudaFree( ad ); 

	cudaFree( bd ); 

	delete[] a;

	delete[] b;

	return 0;

}

I’m using this command to compile it:

Ted.

That means your kernel is working fine but you need to debug it to find out why the results are all zeros.

You can probably start by assigning some constant number to the output device memory g_odata inside the kernel

and see if the number is printed out correctly