Logic works on CPU but not GPU

gothed · August 6, 2009, 10:57am

I have this code that runs perfectly fine on the CPU, however it does compute results on the GPU. It does compile for the GPU and it does not throw a segmentation fault it just doesn’t do its job.

The CPU testfile:

[codebox] /*

Dominik Gothe
overlapArea.cu
Compute the overlapArea of two polygons and smile : )
ASSUMPTIONS ==== ASSUMPTIONS ==== ASSUMPTIONS ====
Polygon 1 is to the right of Polygon 0
Both poygons have the same number of sides nv_g
Will calculate convex hull of overlap, assume concave is not possible
overlapping verticies have not been adressed
the point (0,0) is illegitimate
dug says they can do dot and cross real fast

*/

#include<stdio.h>

#include<math.h>

#include

//#include <cutil_inline.h> // cutilSafeCall

#include // pow( float base, float exp);

using namespace std;

float invalid = 6546746146874; // This serves to illimina

float *xval_0; float *yval_0; // Polygon 0

float *xval_1; float *yval_1; // Polygon 1

float *xval_n; float *yval_n; // Newly Created Polygon

float *overlapArea; // Area of New Polygon

int nv_g; // Number of Verticies of Input Polygons

int nv_n = 0; // Number of verticies of New Polygon

float *angle; // device can’t allocate memory

void FindIntersections() { // ============================================================

============================

// stupid ass computer can't deal with infinity nor does he understant the line x=2; freakin dumbass nvidea piece o crap	

float infinitio = 10e20;

for(int i = 0; i<nv_g; i++){

	int j = (i+1)%nv_g;

	for(int k = 0; k<nv_g; k++){

		int l=(k+1)%nv_g;

		float ris_0   = yval_0[j] - yval_0[i];

		float run_0   = xval_0[j] - xval_0[i];

		float sloap_0 = infinitio;

		float inter_0 = 0;



		float ris_1   = yval_1[k] - yval_1[l];

		float run_1   = xval_1[k] - xval_1[l];

		float sloap_1 = infinitio;

		float inter_1 = 0;

		if( run_0 != 0 ){sloap_0 = ris_0/run_0;}

		if( run_1 != 0 ){sloap_1 = ris_1/run_1;}

		inter_0 = yval_0[i] - xval_0[i]*sloap_0;

		inter_1 = yval_1[k] - xval_1[k]*sloap_1;

		if( ( sloap_0 - sloap_1 != 0 ) && sloap_0 < infinitio && sloap_1 < infinitio ){

			xval_n[i+k*4] = ( inter_0 - inter_1 ) / ( sloap_1 - sloap_0 );

			yval_n[i+k*4] = xval_n[i]*sloap_0+inter_0;

		} else if( ( sloap_0 < infinitio ) && !( sloap_1 < infinitio ) ){

			xval_n[i+k*4] = xval_1[k];

			yval_n[i+k*4] = xval_1[l]*sloap_0+inter_0;

		} else if( ( sloap_1 < infinitio ) && !( sloap_0 < infinitio ) ){

			xval_n[i+k*4] = xval_0[i];

			yval_n[i+k*4] = xval_0[j]*sloap_1+inter_1; 

		} // the parralell case?? mhh do nothing

		if(

		     (//----------------------------------------------------------------------------

			 ( //============================================================

===========

			      (    ( (xval_n[i+k*4] >= xval_0[i]) && (yval_n[i+k*4] >= yval_0[i]) )

				&& ( (xval_n[i+k*4] <= xval_0[j]) && (yval_n[i+k*4] <= yval_0[j]) )

			      ) 

			   ||

			      (    ( (xval_n[i+k*4] >= xval_0[i]) && (yval_n[i+k*4] <= yval_0[i]) )

				&& ( (xval_n[i+k*4] <= xval_0[j]) && (yval_n[i+k*4] >= yval_0[j]) )

			      )

			   ||

			      (    ( (xval_n[i+k*4] <= xval_0[i]) && (yval_n[i+k*4] >= yval_0[i]) )

				&& ( (xval_n[i+k*4] >= xval_0[j]) && (yval_n[i+k*4] <= yval_0[j]) )

			      ) 

			   ||

			      (    ( (xval_n[i+k*4] <= xval_0[i]) && (yval_n[i+k*4] <= yval_0[i]) )

				&& ( (xval_n[i+k*4] >= xval_0[j]) && (yval_n[i+k*4] >= yval_0[j]) )

			      )

			 ) //============================================================

===========

		    )//----------------------------------------------------------------------------- 

		 &&

		    (//-----------------------------------------------------------------------------

			 ( //============================================================

===========

			      (

				   ( (xval_n[i+k*4] >= xval_1[k]) && (yval_n[i+k*4] >= yval_1[k]) )

				&& ( (xval_n[i+k*4] <= xval_1[l]) && (yval_n[i+k*4] <= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] >= xval_1[k]) && (yval_n[i+k*4] <= yval_1[k]) )

				&& ( (xval_n[i+k*4] <= xval_1[l]) && (yval_n[i+k*4] >= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] <= xval_1[k]) && (yval_n[i+k*4] >= yval_1[k]) )

				&& ( (xval_n[i+k*4] >= xval_1[l]) && (yval_n[i+k*4] <= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] <= xval_1[k]) && (yval_n[i+k*4] <= yval_1[k]) )

				&& ( (xval_n[i+k*4] >= xval_1[l]) && (yval_n[i+k*4] >= yval_1[l]) )

			      )

			 ) //============================================================

==========

		    )//----------------------------------------------------------------------------

		) { nv_n++; }

		else { xval_n[i+k*4] = invalid; yval_n[i+k*4] = invalid; }

	}

}

} // ============================================================

==========================================

int main(){

nv_g = 4;

xval_0 = (float*) malloc( sizeof(float)*nv_g );

yval_0 = (float*) malloc( sizeof(float)*nv_g );

// Allocate memory for Polygon_1 on the _host;

xval_1 = (float*) malloc( sizeof(float)*nv_g );

yval_1 = (float*) malloc( sizeof(float)*nv_g );

// Create the polygon_0 on _host

xval_0[0] = -.5;	yval_0[0] = -.5;

xval_0[1] =  .5;	yval_0[1] = -.5;

xval_0[2] =  .5;	yval_0[2] =  .5;

xval_0[3] = -.5;	yval_0[3] =  .5;

// Create the polygon_1 on _host

xval_1[0] =  0;			yval_1[0] = -0.707106781;

xval_1[1] =  0.707106781;	yval_1[1] =  0;

xval_1[2] =  0;			yval_1[2] =  0.707106781;

xval_1[3] = -0.707106781;	yval_1[3] =  0;



xval_n = (float*) malloc( sizeof(float)*nv_g*nv_g );

yval_n = (float*) malloc( sizeof(float)*nv_g*nv_g );

FindIntersections();

for(int i = 0; i < 4; i++){

	for(int j = 0; j < 4; j++){

		cout<<"("<<i<<","<<j<<"): ";

		cout.precision(2);

		cout.setf(ios::fixed,ios::floatfield);

		if(xval_n[i+j*4] != invalid || true){cout<<xval_n[i+j*4]<<" , "<<yval_n[i+j*4];}

		cout<<endl;

	}

}

}[/codebox]

The same on the GPU:

[codebox] /*

Dominik Gothe
overlapArea.cu
Compute the overlapArea of two polygons and smile : )
1. This pipeline can be easily adjusted to recive pointers or values
Ofcourse a pointer to the storage area for the Overlap is necessary
1. define invalid point, a point that can not possibly occur
ASSUMPTIONS ==== ASSUMPTIONS ==== ASSUMPTIONS ====
1. The Overlap Area is Convex
1. Both poygons have the same number of sides nv_g
1. overlapping verticies have not been adressed ???

*/

#include <stdio.h>

#include <math.h>

#include

#include <cutil_inline.h> // cutilSafeCall

#include // pow( float base, float exp);

using namespace std;

device float invalid = 6416415;

device float *xval_0; device float *yval_0; // Polygon 0

device float *xval_1; device float *yval_1; // Polygon 1

device float *xval_n; device float *yval_n; // Newly Created Polygon

device float *overlapArea; // Area of New Polygon

device int nv_g; // Number of Verticies of Input Polygons

device int nv_n = 0; // Number of verticies of New Polygon

device float *angle; // device can’t allocate memory

device void FindIntersections() { // ============================================================

============================

float infinitio = 10e20;

for(int i = 0; i<nv_g; i++){

	int j = (i+1)%nv_g;

	for(int k = 0; k<nv_g; k++){

		int l=(k+1)%nv_g;

		float ris_0   = yval_0[j] - yval_0[i];

		float run_0   = xval_0[j] - xval_0[i];

		float sloap_0 = infinitio;

		float inter_0 = 0;



		float ris_1   = yval_1[k] - yval_1[l];

		float run_1   = xval_1[k] - xval_1[l];

		float sloap_1 = infinitio;

		float inter_1 = 0;

		if( run_0 != 0 ){sloap_0 = ris_0/run_0;}

		if( run_1 != 0 ){sloap_1 = ris_1/run_1;}

		inter_0 = yval_0[i] - xval_0[i]*sloap_0;

		inter_1 = yval_1[k] - xval_1[k]*sloap_1;

		if( ( sloap_0 - sloap_1 != 0 ) && sloap_0 < infinitio && sloap_1 < infinitio ){

			xval_n[i+k*4] = ( inter_0 - inter_1 ) / ( sloap_1 - sloap_0 );

			yval_n[i+k*4] = xval_n[i]*sloap_0+inter_0;

		} else if( ( sloap_0 < infinitio ) && !( sloap_1 < infinitio ) ){

			xval_n[i+k*4] = xval_1[k];

			yval_n[i+k*4] = xval_1[l]*sloap_0+inter_0;

		} else if( ( sloap_1 < infinitio ) && !( sloap_0 < infinitio ) ){

			xval_n[i+k*4] = xval_0[i];

			yval_n[i+k*4] = xval_0[j]*sloap_1+inter_1; 

		} // the parralell case?? mhh do nothing

		if(

		     (//----------------------------------------------------------------------------

			 ( //============================================================

===========

			      (    ( (xval_n[i+k*4] >= xval_0[i]) && (yval_n[i+k*4] >= yval_0[i]) )

				&& ( (xval_n[i+k*4] <= xval_0[j]) && (yval_n[i+k*4] <= yval_0[j]) )

			      ) 

			   ||

			      (    ( (xval_n[i+k*4] >= xval_0[i]) && (yval_n[i+k*4] <= yval_0[i]) )

				&& ( (xval_n[i+k*4] <= xval_0[j]) && (yval_n[i+k*4] >= yval_0[j]) )

			      )

			   ||

			      (    ( (xval_n[i+k*4] <= xval_0[i]) && (yval_n[i+k*4] >= yval_0[i]) )

				&& ( (xval_n[i+k*4] >= xval_0[j]) && (yval_n[i+k*4] <= yval_0[j]) )

			      ) 

			   ||

			      (    ( (xval_n[i+k*4] <= xval_0[i]) && (yval_n[i+k*4] <= yval_0[i]) )

				&& ( (xval_n[i+k*4] >= xval_0[j]) && (yval_n[i+k*4] >= yval_0[j]) )

			      )

			 ) //============================================================

===========

		    )//----------------------------------------------------------------------------- 

		 &&

		    (//-----------------------------------------------------------------------------

			 ( //============================================================

===========

			      (

				   ( (xval_n[i+k*4] >= xval_1[k]) && (yval_n[i+k*4] >= yval_1[k]) )

				&& ( (xval_n[i+k*4] <= xval_1[l]) && (yval_n[i+k*4] <= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] >= xval_1[k]) && (yval_n[i+k*4] <= yval_1[k]) )

				&& ( (xval_n[i+k*4] <= xval_1[l]) && (yval_n[i+k*4] >= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] <= xval_1[k]) && (yval_n[i+k*4] >= yval_1[k]) )

				&& ( (xval_n[i+k*4] >= xval_1[l]) && (yval_n[i+k*4] <= yval_1[l]) )

			      )

			   ||

			      (

				   ( (xval_n[i+k*4] <= xval_1[k]) && (yval_n[i+k*4] <= yval_1[k]) )

				&& ( (xval_n[i+k*4] >= xval_1[l]) && (yval_n[i+k*4] >= yval_1[l]) )

			      )

			 ) //============================================================

==========

		    )//----------------------------------------------------------------------------

		) { nv_n++; }

		else { xval_n[i+k*4] = invalid; yval_n[i+k*4] = invalid; }

	}

}

} // ============================================================

==========================================

global void ComputeOverlap( //float *xval_0_p, float *yval_0_p,

			//float *xval_1_p, float *yval_1_p,

			float *xval_n_p, float *yval_n_p,

			int nv_g_p,

			float *overlapArea_p,

			float *angle_p                    ){ // =========================================================



//xval_0 = xval_0_p; 	yval_0 = yval_0_p;

//xval_1 = xval_1_p; 	yval_1 = yval_1_p;

xval_n = xval_n_p; 	yval_n = yval_n_p;

overlapArea = overlapArea_p;

nv_g = nv_g_p;

angle = angle_p;

FindIntersections();

// RemoveInvalids();

// Merge();

// Order();

// PolyArea();

} // ============================================================

==========================================

int main(){

// Create the Polygon on _host and copy to _device =============================

const int nv_g_h = 4;        // number of verticies for each polygon (global)

// Allocate memory for Polygon_0 on the _host;

float *xval_0_h; cudaMallocHost ( (void**)&xval_0_h, sizeof(float)*nv_g_h );

float *yval_0_h; cudaMallocHost ( (void**)&yval_0_h, sizeof(float)*nv_g_h );

// Allocate memory for Polygon_1 on the _host;

float *xval_1_h; cudaMallocHost ( (void**)&xval_1_h, sizeof(float)*nv_g_h );

float *yval_1_h; cudaMallocHost ( (void**)&yval_1_h, sizeof(float)*nv_g_h );

// Allocate memory for Polygon_1 on the _host;

float *xval_n_h; cudaMallocHost ( (void**)&xval_n_h, sizeof(float)*nv_g_h*nv_g_h );

float *yval_n_h; cudaMallocHost ( (void**)&yval_n_h, sizeof(float)*nv_g_h*nv_g_h );

// Allocate memory for Polygon_0 on the _device;

float *xval_0_d; cudaMalloc	( (void**)&xval_0_d, sizeof(float)*nv_g_h );

float *yval_0_d; cudaMalloc	( (void**)&yval_0_d, sizeof(float)*nv_g_h );

// Allocate memory for Polygon_1 on the _device;

float *xval_1_d; cudaMalloc	( (void**)&xval_1_d, sizeof(float)*nv_g_h );

float *yval_1_d; cudaMalloc	( (void**)&yval_1_d, sizeof(float)*nv_g_h );

// Allocate memory for Polygon_1 on the _device;

float *xval_n_d; cudaMalloc	( (void**)&xval_n_d, sizeof(float)*nv_g_h*nv_g_h );

float *yval_n_d; cudaMalloc	( (void**)&yval_n_d, sizeof(float)*nv_g_h*nv_g_h );

// Allocate memory for the Overlap Area on the host

float *overlap_h; cudaMallocHost ( (void**)&overlap_h, sizeof(float) );

// Allocate memory for the Overlap Area on the device

float *overlap_d; cudaMalloc     ( (void**)&overlap_d, sizeof(float) );

// Create the polygon_0 on _host

xval_0_h[0] = -.5;	yval_0_h[0] = -.5;

xval_0_h[1] =  .5;	yval_0_h[1] = -.5;

xval_0_h[2] =  .5;	yval_0_h[2] =  .5;

xval_0_h[3] = -.5;	yval_0_h[3] =  .5;

// Create the polygon_1 on _host

xval_1_h[0] =  0;		yval_1_h[0] = -0.707106781;

xval_1_h[1] =  0.707106781;	yval_1_h[1] =  0;

xval_1_h[2] =  0;		yval_1_h[2] =  0.707106781;

xval_1_h[3] = -0.707106781;	yval_1_h[3] =  0;

// Initilize the new polygon to zero

for(int i = 0; i < nv_g_h*nv_g_h; i++){

	xval_n_h[i] = 0;

	yval_n_h[i] = 0;

}

cutilSafeCall( cudaMemcpy( xval_n_d, xval_n_h, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyHostToDevice ) );

cutilSafeCall( cudaMemcpy( yval_n_d, yval_n_h, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyHostToDevice ) );

// Initilize the overlap area

overlap_h[0] = 0;

cutilSafeCall( cudaMemcpy( overlap_d, overlap_h, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyHostToDevice ) );

// Create the Angle array that is needed for the device

float *angle_d; cudaMalloc ( (void**)&angle_d, sizeof(float)*nv_g*nv_g );

ComputeOverlap<<<1,1>>>(

			 xval_0_d, yval_0_d,

			 xval_1_d, yval_1_d,

			 xval_n_d, yval_n_d,

			 nv_g_h,

			 overlap_d,

			 angle_d            );

cutilSafeCall( cudaMemcpy( overlap_h, overlap_d, sizeof(float), cudaMemcpyDeviceToHost ) );

cout<<overlap_h[0]<<endl<<endl;

// BEGIN DEBUG

cutilSafeCall( cudaMemcpy( xval_n_h, xval_n_d, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyDeviceToHost ) );

cutilSafeCall( cudaMemcpy( yval_n_h, yval_n_d, sizeof(float)*nv_g_h*nv_g_h, cudaMemcpyDeviceToHost ) );

for(int i = 0; i < nv_g_h*nv_g_h; i++){

	cout.precision(2);

	cout.setf(ios::fixed,ios::floatfield);

	cout<<xval_n_h[i]<<" , "<<yval_n_h[i]<<endl;

}

// END DEBUG

}

// END DEBUG

}[/codebox]

please help, coffee no longer does :wacko:

my suspects are either copying crap back and forth from the device or the the fact that I declared global variables in my kernal.

thanks

thanks - Dominik Gothe

tmurray · August 6, 2009, 4:40pm

add a cudaGetLastError and check its return value immediately after the kernel call–maybe the kernel isn’t even being launched for some reason.

Gregory_Diamos · August 6, 2009, 4:59pm

gdiamos@cuda:~/temp$ LD_LIBRARY_PATH=/home/gdiamos/checkout/gpuocelot/trunk/ocelot/.libs/ ./test

terminate called after throwing an instance of 'hydrazine::Exception'

  what():  Invalid destination 0xf00220 (64 bytes) in host to device memcpy.

Device 0 : Ocelot PTX Emulator

 Nearby Global Variable Allocations

  [0x61a3b8] - [0x61a3c0] (8 bytes) (global)

  [0x61a3c0] - [0x61a3c8] (8 bytes) (global)

  [0x61a3c8] - [0x61a3d0] (8 bytes) (global)

  [0x61a3d0] - [0x61a3d4] (4 bytes) (global)

  [0x61a3d4] - [0x61a3d8] (4 bytes) (global)

  [0x61a3d8] -  [0x61a3e0] (8 bytes) (global)

   ****0xf00220****

Nearby Device Memory Allocations

  [0xf00160] - [0xf001a0] (64 bytes) (global)

  [0xf00220] -	****0xf00220**** - [0xf00224] (4 bytes) (global)

  [0xf01030] - [0xf01070] (64 bytes) (global)

  [0xf19070] - [0xf19080] (16 bytes) (global)

  [0xf20160] - [0xf20170] (16 bytes) (global)

  [0xfa3ed0] - [0xfa3ee0] (16 bytes) (global)

  [0xfa8ac0] - [0xfa8ad0] (16 bytes) (global)

Aborted

Your code wrote 64 bytes to an array of size 4 in a cudaMemcpyHostToDevice at line 189: cutilSafeCall( cudaMemcpy( overlap_d, overlap_h, sizeof(float)nv_g_hnv_g_h, cudaMemcpyHostToDevice ) );

Topic		Replies	Views
Device Memeroy allocation and data transfer Data transfer between host and device CUDA Programming and Performance	5	2646	June 16, 2011
Why doesn't overlapping data transfers and kernel execution work here? CUDA Programming and Performance	88	1371	April 8, 2025
Can't overlap streams CUDA Programming and Performance	2	993	December 16, 2021
CUDA C++ Segmentation Fault CUDA Programming and Performance	7	14976	October 1, 2017
compile segmentation fault CUDA Programming and Performance	14	14648	September 2, 2010
Possibilities to further optimize PoC programme using custom copy kernels CUDA Programming and Performance cuda	34	501	June 16, 2025
nvcc Segfault CUDA Programming and Performance	6	11505	October 14, 2010
cpu function and gpu kernel overlap CUDA Programming and Performance	12	1574	July 25, 2017
Overlapping CPU and GPU operations using streams. Total failure. Any help? CUDA Programming and Performance	6	6153	April 2, 2013
Deep subroutine use causes cudaError_enum? CUDA Programming and Performance	14	14517	February 6, 2010

Logic works on CPU but not GPU

Related topics