Logic works on CPU but not GPU

I have this code that runs perfectly fine on the CPU, however it does compute results on the GPU. It does compile for the GPU and it does not throw a segmentation fault it just doesn’t do its job.

The CPU testfile:

[codebox] /*

  • Dominik Gothe

  • overlapArea.cu

  • Compute the overlapArea of two polygons and smile : )

  • ASSUMPTIONS ==== ASSUMPTIONS ==== ASSUMPTIONS ====

  • Polygon 1 is to the right of Polygon 0

  • Both poygons have the same number of sides nv_g

  • Will calculate convex hull of overlap, assume concave is not possible

  • overlapping verticies have not been adressed

  • the point (0,0) is illegitimate

  • dug says they can do dot and cross real fast

*/

#include<stdio.h>

#include<math.h>

#include

#include

//#include <cutil_inline.h> // cutilSafeCall

#include // pow( float base, float exp);

using namespace std;

float invalid = 6546746146874; // This serves to illimina

float *xval_0; float *yval_0; // Polygon 0

float *xval_1; float *yval_1; // Polygon 1

float *xval_n; float *yval_n; // Newly Created Polygon

float *overlapArea; // Area of New Polygon

int nv_g; // Number of Verticies of Input Polygons

int nv_n = 0; // Number of verticies of New Polygon

float *angle; // device can’t allocate memory

void FindIntersections() { // ============================================================

============================

// stupid ass computer can't deal with infinity nor does he understant the line x=2; freakin dumbass nvidea piece o crap	

float infinitio = 10e20;

for(int i = 0; i<nv_g; i++){

	int j = (i+1)%nv_g;

	for(int k = 0; k<nv_g; k++){

		int l=(k+1)%nv_g;

		float ris_0   = yval_0[j] - yval_0[i];

		float run_0   = xval_0[j] - xval_0[i];

		float sloap_0 = infinitio;

		float inter_0 = 0;



		float ris_1   = yval_1[k] - yval_1[l];

		float run_1   = xval_1[k] - xval_1[l];

		float sloap_1 = infinitio;

		float inter_1 = 0;

		if( run_0 != 0 ){sloap_0 = ris_0/run_0;}

		if( run_1 != 0 ){sloap_1 = ris_1/run_1;}

		inter_0 = yval_0[i] - xval_0[i]*sloap_0;

		inter_1 = yval_1[k] - xval_1[k]*sloap_1;

		if( ( sloap_0 - sloap_1 != 0 ) && sloap_0 < infinitio && sloap_1 < infinitio ){

			xval_n[i+k*4] = ( inter_0 - inter_1 ) / ( sloap_1 - sloap_0 );

			yval_n[i+k*4] = xval_n[i]*sloap_0+inter_0;

		} else if( ( sloap_0 < infinitio ) && !( sloap_1 < infinitio ) ){

			xval_n[i+k*4] = xval_1[k];

			yval_n[i+k*4] = xval_1[l]*sloap_0+inter_0;

		} else if( ( sloap_1 < infinitio ) && !( sloap_0 < infinitio ) ){

			xval_n[i+k*4] = xval_0[i];

			yval_n[i+k*4] = xval_0[j]*sloap_1+inter_1; 

		} // the parralell case?? mhh do nothing

		if(

		     (//----------------------------------------------------------------------------

			 ( //============================================================

===========

			      (    ( (xval_n[i+k*4] >= xval_0[i]) && (yval_n[i+k*4] >= yval_0[i]) )

				&& ( (xval_n[i+k*4] <= xval_0[j]) && (yval_n[i+k*4] <= yval_0[j]) )

			      ) 

			   ||

			      (    ( (xval_n[i+k*4] >= xval_0[i]) && (yval_n[i+k*4] <= yval_0[i]) )

				&& ( (xval_n[i+k*4] <= xval_0[j]) && (yval_n[i+k*4] >= yval_0[j]) )

			      )

			   ||

			      (    ( (xval_n[i+k*4] <= xval_0[i]) && (yval_n[i+k*4] >= yval_0[i]) )

				&& ( (xval_n[i+k*4] >= xval_0[j]) && (yval_n[i+k*4] <= yval_0[j]) )

			      ) 

			   ||

			      (    ( (xval_n[i+k*4] <= xval_0[i]) && (yval_n[i+k*4] <= yval_0[i]) )

				&& ( (xval_n[i+k*4] >= xval_0[j]) && (yval_n[i+k*4] >= yval_0[j]) )

			      )

			 ) //============================================================

===========

		    )//----------------------------------------------------------------------------- 

		 &&

		    (//-----------------------------------------------------------------------------

			 ( //============================================================

===========

			      (

				   ( (xval_n[i+k*4] >= xval_1[k]) && (yval_n[i+k*4] >= yval_1[k]) )

				&& ( (xval_n[i+k*4] <= xval_1[l]) && (yval_n[i+k*4] <= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] >= xval_1[k]) && (yval_n[i+k*4] <= yval_1[k]) )

				&& ( (xval_n[i+k*4] <= xval_1[l]) && (yval_n[i+k*4] >= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] <= xval_1[k]) && (yval_n[i+k*4] >= yval_1[k]) )

				&& ( (xval_n[i+k*4] >= xval_1[l]) && (yval_n[i+k*4] <= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] <= xval_1[k]) && (yval_n[i+k*4] <= yval_1[k]) )

				&& ( (xval_n[i+k*4] >= xval_1[l]) && (yval_n[i+k*4] >= yval_1[l]) )

			      )

			 ) //============================================================

==========

		    )//----------------------------------------------------------------------------

		) { nv_n++; }

		else { xval_n[i+k*4] = invalid; yval_n[i+k*4] = invalid; }

	}

}

} // ============================================================

==========================================

int main(){

nv_g = 4;

xval_0 = (float*) malloc( sizeof(float)*nv_g );

yval_0 = (float*) malloc( sizeof(float)*nv_g );

// Allocate memory for Polygon_1 on the _host;

xval_1 = (float*) malloc( sizeof(float)*nv_g );

yval_1 = (float*) malloc( sizeof(float)*nv_g );

// Create the polygon_0 on _host

xval_0[0] = -.5;	yval_0[0] = -.5;

xval_0[1] =  .5;	yval_0[1] = -.5;

xval_0[2] =  .5;	yval_0[2] =  .5;

xval_0[3] = -.5;	yval_0[3] =  .5;

// Create the polygon_1 on _host

xval_1[0] =  0;			yval_1[0] = -0.707106781;

xval_1[1] =  0.707106781;	yval_1[1] =  0;

xval_1[2] =  0;			yval_1[2] =  0.707106781;

xval_1[3] = -0.707106781;	yval_1[3] =  0;



xval_n = (float*) malloc( sizeof(float)*nv_g*nv_g );

yval_n = (float*) malloc( sizeof(float)*nv_g*nv_g );

FindIntersections();

for(int i = 0; i < 4; i++){

	for(int j = 0; j < 4; j++){

		cout<<"("<<i<<","<<j<<"): ";

		cout.precision(2);

		cout.setf(ios::fixed,ios::floatfield);

		if(xval_n[i+j*4] != invalid || true){cout<<xval_n[i+j*4]<<" , "<<yval_n[i+j*4];}

		cout<<endl;

	}

}

}[/codebox]

The same on the GPU:

[codebox] /*

  • Dominik Gothe

  • overlapArea.cu

  • Compute the overlapArea of two polygons and smile : )

    1. This pipeline can be easily adjusted to recive pointers or values
  • Ofcourse a pointer to the storage area for the Overlap is necessary

    1. define invalid point, a point that can not possibly occur
  • ASSUMPTIONS ==== ASSUMPTIONS ==== ASSUMPTIONS ====

    1. The Overlap Area is Convex
    1. Both poygons have the same number of sides nv_g
    1. overlapping verticies have not been adressed ???

*/

#include <stdio.h>

#include <math.h>

#include

#include

#include <cutil_inline.h> // cutilSafeCall

#include // pow( float base, float exp);

using namespace std;

device float invalid = 6416415;

device float *xval_0; device float *yval_0; // Polygon 0

device float *xval_1; device float *yval_1; // Polygon 1

device float *xval_n; device float *yval_n; // Newly Created Polygon

device float *overlapArea; // Area of New Polygon

device int nv_g; // Number of Verticies of Input Polygons

device int nv_n = 0; // Number of verticies of New Polygon

device float *angle; // device can’t allocate memory

device void FindIntersections() { // ============================================================

============================

float infinitio = 10e20;

for(int i = 0; i<nv_g; i++){

	int j = (i+1)%nv_g;

	for(int k = 0; k<nv_g; k++){

		int l=(k+1)%nv_g;

		float ris_0   = yval_0[j] - yval_0[i];

		float run_0   = xval_0[j] - xval_0[i];

		float sloap_0 = infinitio;

		float inter_0 = 0;



		float ris_1   = yval_1[k] - yval_1[l];

		float run_1   = xval_1[k] - xval_1[l];

		float sloap_1 = infinitio;

		float inter_1 = 0;

		if( run_0 != 0 ){sloap_0 = ris_0/run_0;}

		if( run_1 != 0 ){sloap_1 = ris_1/run_1;}

		inter_0 = yval_0[i] - xval_0[i]*sloap_0;

		inter_1 = yval_1[k] - xval_1[k]*sloap_1;

		if( ( sloap_0 - sloap_1 != 0 ) && sloap_0 < infinitio && sloap_1 < infinitio ){

			xval_n[i+k*4] = ( inter_0 - inter_1 ) / ( sloap_1 - sloap_0 );

			yval_n[i+k*4] = xval_n[i]*sloap_0+inter_0;

		} else if( ( sloap_0 < infinitio ) && !( sloap_1 < infinitio ) ){

			xval_n[i+k*4] = xval_1[k];

			yval_n[i+k*4] = xval_1[l]*sloap_0+inter_0;

		} else if( ( sloap_1 < infinitio ) && !( sloap_0 < infinitio ) ){

			xval_n[i+k*4] = xval_0[i];

			yval_n[i+k*4] = xval_0[j]*sloap_1+inter_1; 

		} // the parralell case?? mhh do nothing

		if(

		     (//----------------------------------------------------------------------------

			 ( //============================================================

===========

			      (    ( (xval_n[i+k*4] >= xval_0[i]) && (yval_n[i+k*4] >= yval_0[i]) )

				&& ( (xval_n[i+k*4] <= xval_0[j]) && (yval_n[i+k*4] <= yval_0[j]) )

			      ) 

			   ||

			      (    ( (xval_n[i+k*4] >= xval_0[i]) && (yval_n[i+k*4] <= yval_0[i]) )

				&& ( (xval_n[i+k*4] <= xval_0[j]) && (yval_n[i+k*4] >= yval_0[j]) )

			      )

			   ||

			      (    ( (xval_n[i+k*4] <= xval_0[i]) && (yval_n[i+k*4] >= yval_0[i]) )

				&& ( (xval_n[i+k*4] >= xval_0[j]) && (yval_n[i+k*4] <= yval_0[j]) )

			      ) 

			   ||

			      (    ( (xval_n[i+k*4] <= xval_0[i]) && (yval_n[i+k*4] <= yval_0[i]) )

				&& ( (xval_n[i+k*4] >= xval_0[j]) && (yval_n[i+k*4] >= yval_0[j]) )

			      )

			 ) //============================================================

===========

		    )//----------------------------------------------------------------------------- 

		 &&

		    (//-----------------------------------------------------------------------------

			 ( //============================================================

===========

			      (

				   ( (xval_n[i+k*4] >= xval_1[k]) && (yval_n[i+k*4] >= yval_1[k]) )

				&& ( (xval_n[i+k*4] <= xval_1[l]) && (yval_n[i+k*4] <= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] >= xval_1[k]) && (yval_n[i+k*4] <= yval_1[k]) )

				&& ( (xval_n[i+k*4] <= xval_1[l]) && (yval_n[i+k*4] >= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] <= xval_1[k]) && (yval_n[i+k*4] >= yval_1[k]) )

				&& ( (xval_n[i+k*4] >= xval_1[l]) && (yval_n[i+k*4] <= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] <= xval_1[k]) && (yval_n[i+k*4] <= yval_1[k]) )

				&& ( (xval_n[i+k*4] >= xval_1[l]) && (yval_n[i+k*4] >= yval_1[l]) )

			      )

			 ) //============================================================

==========

		    )//----------------------------------------------------------------------------

		) { nv_n++; }

		else { xval_n[i+k*4] = invalid; yval_n[i+k*4] = invalid; }

	}

}

} // ============================================================

==========================================

global void ComputeOverlap( //float *xval_0_p, float *yval_0_p,

			//float *xval_1_p, float *yval_1_p,

			float *xval_n_p, float *yval_n_p,

			int nv_g_p,

			float *overlapArea_p,

			float *angle_p                    ){ // =========================================================



//xval_0 = xval_0_p; 	yval_0 = yval_0_p;

//xval_1 = xval_1_p; 	yval_1 = yval_1_p;

xval_n = xval_n_p; 	yval_n = yval_n_p;

overlapArea = overlapArea_p;

nv_g = nv_g_p;

angle = angle_p;

FindIntersections();

// RemoveInvalids();

// Merge();

// Order();

// PolyArea();

} // ============================================================

==========================================

int main(){

// Create the Polygon on _host and copy to _device =============================

const int nv_g_h = 4;        // number of verticies for each polygon (global)

// Allocate memory for Polygon_0 on the _host;

float *xval_0_h; cudaMallocHost ( (void**)&xval_0_h, sizeof(float)*nv_g_h );

float *yval_0_h; cudaMallocHost ( (void**)&yval_0_h, sizeof(float)*nv_g_h );

// Allocate memory for Polygon_1 on the _host;

float *xval_1_h; cudaMallocHost ( (void**)&xval_1_h, sizeof(float)*nv_g_h );

float *yval_1_h; cudaMallocHost ( (void**)&yval_1_h, sizeof(float)*nv_g_h );

// Allocate memory for Polygon_1 on the _host;

float *xval_n_h; cudaMallocHost ( (void**)&xval_n_h, sizeof(float)*nv_g_h*nv_g_h );

float *yval_n_h; cudaMallocHost ( (void**)&yval_n_h, sizeof(float)*nv_g_h*nv_g_h );

// Allocate memory for Polygon_0 on the _device;

float *xval_0_d; cudaMalloc	( (void**)&xval_0_d, sizeof(float)*nv_g_h );

float *yval_0_d; cudaMalloc	( (void**)&yval_0_d, sizeof(float)*nv_g_h );

// Allocate memory for Polygon_1 on the _device;

float *xval_1_d; cudaMalloc	( (void**)&xval_1_d, sizeof(float)*nv_g_h );

float *yval_1_d; cudaMalloc	( (void**)&yval_1_d, sizeof(float)*nv_g_h );

// Allocate memory for Polygon_1 on the _device;

float *xval_n_d; cudaMalloc	( (void**)&xval_n_d, sizeof(float)*nv_g_h*nv_g_h );

float *yval_n_d; cudaMalloc	( (void**)&yval_n_d, sizeof(float)*nv_g_h*nv_g_h );

// Allocate memory for the Overlap Area on the host

float *overlap_h; cudaMallocHost ( (void**)&overlap_h, sizeof(float) );

// Allocate memory for the Overlap Area on the device

float *overlap_d; cudaMalloc     ( (void**)&overlap_d, sizeof(float) );

// Create the polygon_0 on _host

xval_0_h[0] = -.5;	yval_0_h[0] = -.5;

xval_0_h[1] =  .5;	yval_0_h[1] = -.5;

xval_0_h[2] =  .5;	yval_0_h[2] =  .5;

xval_0_h[3] = -.5;	yval_0_h[3] =  .5;

// Create the polygon_1 on _host

xval_1_h[0] =  0;		yval_1_h[0] = -0.707106781;

xval_1_h[1] =  0.707106781;	yval_1_h[1] =  0;

xval_1_h[2] =  0;		yval_1_h[2] =  0.707106781;

xval_1_h[3] = -0.707106781;	yval_1_h[3] =  0;

// Initilize the new polygon to zero

for(int i = 0; i < nv_g_h*nv_g_h; i++){

	xval_n_h[i] = 0;

	yval_n_h[i] = 0;

}

cutilSafeCall( cudaMemcpy( xval_n_d, xval_n_h, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyHostToDevice ) );

cutilSafeCall( cudaMemcpy( yval_n_d, yval_n_h, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyHostToDevice ) );

// Initilize the overlap area

overlap_h[0] = 0;

cutilSafeCall( cudaMemcpy( overlap_d, overlap_h, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyHostToDevice ) );

// Create the Angle array that is needed for the device

float *angle_d; cudaMalloc ( (void**)&angle_d, sizeof(float)*nv_g*nv_g );

ComputeOverlap<<<1,1>>>(

			 xval_0_d, yval_0_d,

			 xval_1_d, yval_1_d,

			 xval_n_d, yval_n_d,

			 nv_g_h,

			 overlap_d,

			 angle_d            );

cutilSafeCall( cudaMemcpy( overlap_h, overlap_d, sizeof(float), cudaMemcpyDeviceToHost ) );

cout<<overlap_h[0]<<endl<<endl;

// BEGIN DEBUG

cutilSafeCall( cudaMemcpy( xval_n_h, xval_n_d, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyDeviceToHost ) );

cutilSafeCall( cudaMemcpy( yval_n_h, yval_n_d, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyDeviceToHost ) );

for(int i = 0; i < nv_g_h*nv_g_h; i++){

	cout.precision(2);

	cout.setf(ios::fixed,ios::floatfield);

	cout<<xval_n_h[i]<<" , "<<yval_n_h[i]<<endl;

}

// END DEBUG

}

// END DEBUG

}[/codebox]

please help, coffee no longer does :wacko:

my suspects are either copying crap back and forth from the device or the the fact that I declared global variables in my kernal.

thanks

thanks - Dominik Gothe

add a cudaGetLastError and check its return value immediately after the kernel call–maybe the kernel isn’t even being launched for some reason.

gdiamos@cuda:~/temp$ LD_LIBRARY_PATH=/home/gdiamos/checkout/gpuocelot/trunk/ocelot/.libs/ ./test

terminate called after throwing an instance of 'hydrazine::Exception'

  what():  Invalid destination 0xf00220 (64 bytes) in host to device memcpy.

Device 0 : Ocelot PTX Emulator

 Nearby Global Variable Allocations

  [0x61a3b8] - [0x61a3c0] (8 bytes) (global)

  [0x61a3c0] - [0x61a3c8] (8 bytes) (global)

  [0x61a3c8] - [0x61a3d0] (8 bytes) (global)

  [0x61a3d0] - [0x61a3d4] (4 bytes) (global)

  [0x61a3d4] - [0x61a3d8] (4 bytes) (global)

  [0x61a3d8] -  [0x61a3e0] (8 bytes) (global)

   ****0xf00220****

Nearby Device Memory Allocations

  [0xf00160] - [0xf001a0] (64 bytes) (global)

  [0xf00220] -	****0xf00220**** - [0xf00224] (4 bytes) (global)

  [0xf01030] - [0xf01070] (64 bytes) (global)

  [0xf19070] - [0xf19080] (16 bytes) (global)

  [0xf20160] - [0xf20170] (16 bytes) (global)

  [0xfa3ed0] - [0xfa3ee0] (16 bytes) (global)

  [0xfa8ac0] - [0xfa8ad0] (16 bytes) (global)

Aborted

Your code wrote 64 bytes to an array of size 4 in a cudaMemcpyHostToDevice at line 189: cutilSafeCall( cudaMemcpy( overlap_d, overlap_h, sizeof(float)nv_g_hnv_g_h, cudaMemcpyHostToDevice ) );