Invalid __global__ write of size 4 Error on kernel Launch

Hi all,
I am writing a simple CUDA nbody program based on examples in Nvidia SDK and also on nbody example in WebCL.
But I keep getting this error in cuda-memcheck.

========= Invalid global write of size 4
========= at 0x00000130 in nbodySamsung_kernel
========= by thread (32,0,0) in block (0,0,0)
========= Address 0x00000200 is out of bounds

I am unable to figure out what’s going on here. Any help/pointers would be really appreciated.
Note, I am new to CUDA programming

Here is my kernel code

#include <cuda_runtime.h>
#include <math.h>

#define PI 3.14159265 
#define cospi(x) cos(x * PI)
#define sinpi(x) cos(x * PI)

#define localMemSize 256 * 4
#define sqrt_val 1.5
#define sinpi_val 2.5
#define cospi_val 3.5

extern "C"
__global__ void nbodySamsung_kernel(float* curPos,float* curVel,
		int numBodies,float deltaTime,int epsSqr, float* nxtPos,
		float* nxtVel)
	{
		/*unsigned int tid = get_local_id(0);
		unsigned int 0 = get_global_id(0);
		unsigned int localSize = get_local_size(0);*/

		__shared__ float localPos[localMemSize];

		unsigned int tid = __builtin_ptx_read_tid_x();
		

		unsigned blockNumInGrid_0   = __builtin_ptx_read_ctaid_x() ;


		unsigned threadsPerBlock_0  = __builtin_ptx_read_ntid_x();


		unsigned threadNumInBlock_0 = __builtin_ptx_read_tid_x() ;

		unsigned int gid            =   blockNumInGrid_0 * threadsPerBlock_0 + threadNumInBlock_0;

		unsigned int localSize      =   __builtin_ptx_read_ntid_x();
                

		// Number of tiles we need to iterate
		unsigned int numTiles = numBodies / localSize;

		// position of this work-item
		float4 myPos = (make_float4) (curPos[4*0 + 0], curPos[4*0 + 1], curPos[4*0 + 2], curPos[4*0 + 3]);
		float4 acc = (make_float4) (0.0f, 0.0f, 0.0f, 0.0f);

		for(int i = 0; i < numTiles; ++i)
		{
			// load one tile into local memory
			int idx = i * localSize + tid;
			for(int k=0; k<4; k++)
			{
					localPos[4*tid+k] = curPos[4*idx+k];
					
			}
			// Synchronize to make sure data is available for processing
			//barrier(CLK_LOCAL_MEM_FENCE);
			//__threadfence_block();
			__syncthreads();
			// calculate acceleration effect due to each body
			// a[i->j] = m[j] * r[i->j] / (r^2 + epsSqr)^(3/2)
			for(int j = 0; j < localSize; ++j)
			{
				// Calculate acceleration caused by particle j on particle i
				float4 aLocalPos = (make_float4) (localPos[4*j + 0], localPos[4*j + 1], localPos[4*j + 2], localPos[4*j + 3]);
				
				float4 r;
				r.x = aLocalPos.x - myPos.x;
				r.y = aLocalPos.y - myPos.y;
				r.z = aLocalPos.z - myPos.z;
				r.w = aLocalPos.w - myPos.w;
				
				float distSqr = r.x * r.x  +  r.y * r.y  +  r.z * r.z;
				//float invDist = 1.0f / sqrt(distSqr + epsSqr);
				float invDist = 1.0f / sqrt_val;
				float invDistCube = invDist * invDist * invDist;
				float s = aLocalPos.w * invDistCube;
				// accumulate effect of all particles
				acc.x += s * r.x;
				acc.y += s * r.y;
				acc.z += s * r.z;
				acc.x += s * r.w;
			}
			// Synchronize so that next tile can be loaded
			//barrier(CLK_LOCAL_MEM_FENCE);
			//__threadfence_block();
			__syncthreads();
		}

		float4 oldVel = (make_float4) (curVel[4*0 + 0], curVel[4*0 + 1], curVel[4*0 + 2], curVel[4*0 + 3]);

		// updated position and velocity
		float4 newPos;	
		newPos.x = myPos.x + oldVel.x * deltaTime + acc.x * 0.5f * deltaTime * deltaTime;
		newPos.y = myPos.y + oldVel.y * deltaTime + acc.y * 0.5f * deltaTime * deltaTime;
		newPos.z = myPos.z + oldVel.z * deltaTime + acc.z * 0.5f * deltaTime * deltaTime;
		newPos.w = myPos.w + oldVel.w * deltaTime + acc.w * 0.5f * deltaTime * deltaTime;
		

		newPos.w = myPos.w;
		float4 newVel;;
		newVel.x = oldVel.x + acc.x * deltaTime;
		newVel.y = oldVel.y + acc.y * deltaTime;
		newVel.z = oldVel.z + acc.z * deltaTime;
		newVel.w = oldVel.w + acc.w * deltaTime;

		// check boundry
		if(newPos.x > 1.0f || newPos.x < -1.0f || newPos.y > 1.0f || newPos.y < -1.0f || newPos.z > 1.0f || newPos.z < -1.0f) {
			float rand = (1.0f * 0) / numBodies;
			float r = 0.05f *  rand;
			float theta = rand;
			float phi = 2 * rand;
			/*newPos.x = r * sinpi(theta) * cospi(phi);
			newPos.y = r * sinpi(theta) * sinpi(phi);
			newPos.x = r * cospi(theta);*/
			newPos.x = r * sinpi_val * cospi_val;
			newPos.y = r * sinpi_val * sinpi_val;
			newPos.x = r * cospi_val;
			newVel.x = 0.0f;
			newVel.y = 0.0f;
			newVel.z = 0.0f;
		}

		// write to global memory
		nxtPos[4*0 + 0] = newPos.x;
		nxtPos[4*0 + 1] = newPos.y;
		nxtPos[4*0 + 2] = newPos.z;
		nxtPos[4*0 + 3] = newPos.w;

		nxtVel[4*0 + 0] = newVel.x;
		nxtVel[4*0 + 1] = newVel.y;
		nxtVel[4*0 + 2] = newVel.z;
		nxtVel[4*0 + 3] = newVel.w;
}

And my host code

#include <stdio.h>
#include <string.h>
#include <iostream>
#include <cstring>

// includes, project
#include <sdkHelper.h>
#include <shrQATest.h>

// includes, CUDA
#include <cuda.h>
#include <builtin_types.h>
#include <drvapi_error_string.h>

#define NBODY 256
#define POS_ATTRIB_SIZE 4            
#define VEL_ATTRIB_SIZE 4   
#define EPSSQR 50

using namespace std;

// Variables
CUdevice cuDevice;
CUcontext cuContext;
CUmodule cuModule;
CUfunction nbodySamsung_kernel;

float* hostcurPos;
float* hostcurVel;
float* hostnxtPos;
float* hostnxtVel;

CUdeviceptr devcurPos;
CUdeviceptr devcurVel;
CUdeviceptr devnxtPos;
CUdeviceptr devnxtVel;

bool noprompt = false;

// Functions
void Cleanup(bool);
CUresult CleanupNoFailure();
void RandomInit(float*, int);
bool findModulePath(const char*, string &, char**, string &);
void ParseArguments(int, char**);

int *pArgc = NULL;
char **pArgv = NULL;


////////////////////////////////////////////////////////////////////////////////
// These are CUDA Helper functions

// This will output the proper CUDA error strings in the event that a CUDA host call returns an error
#define checkCudaErrors(err)  __checkCudaErrors (err, __FILE__, __LINE__)

// These are the inline versions for all of the SDK helper functions
inline void __checkCudaErrors( CUresult err, const char *file, const int line )
{
    if( CUDA_SUCCESS != err) {
        fprintf(stderr, "checkCudaErrors() Driver API error = %04d \"%s\" from file <%s>, line %i.\n",
                err, getCudaDrvErrorString(err), file, line );
        exit(-1);
    }
}


inline int cudaDeviceInit(int ARGC, char ** ARGV) 
{
    int cuDevice = 0;
    int deviceCount = 0;
    CUresult err = cuInit(0);
    if (CUDA_SUCCESS == err)
        checkCudaErrors(cuDeviceGetCount(&deviceCount));
    if (deviceCount == 0) {
        fprintf(stderr, "cudaDeviceInit error: no devices supporting CUDA\n");
        exit(-1);
    }
    int dev = 0;
    dev = getCmdLineArgumentInt(ARGC, (const char **) ARGV, "device=");
    if (dev < 0) dev = 0;
    if (dev > deviceCount-1) {
		fprintf(stderr, "\n");
		fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", deviceCount);
        fprintf(stderr, ">> cudaDeviceInit (-device=%d) is not a valid GPU device. <<\n", dev);
		fprintf(stderr, "\n");
        return -dev;
    }
    checkCudaErrors(cuDeviceGet(&cuDevice, dev));
    char name[100];
    cuDeviceGetName(name, 100, cuDevice);
    if (checkCmdLineFlag(ARGC, (const char **) ARGV, "quiet") == false) {
       printf("> Using CUDA Device [%d]: %s\n", dev, name);
   	}
    return dev;
}

// This function returns the best GPU based on performance
inline int getMaxGflopsDeviceId()
{
    CUdevice current_device = 0, max_perf_device = 0;
    int device_count     = 0, sm_per_multiproc = 0;
    int max_compute_perf = 0, best_SM_arch     = 0;
    int major = 0, minor = 0, multiProcessorCount, clockRate;

    cuInit(0);
    checkCudaErrors(cuDeviceGetCount(&device_count));

	// Find the best major SM Architecture GPU device
	while ( current_device < device_count ) {
		checkCudaErrors( cuDeviceComputeCapability(&major, &minor, current_device ) );
		if (major > 0 && major < 9999) {
			best_SM_arch = MAX(best_SM_arch, major);
		}
		current_device++;
	}

    // Find the best CUDA capable GPU device
	current_device = 0;
	while( current_device < device_count ) {
		checkCudaErrors( cuDeviceGetAttribute( &multiProcessorCount, 
                                                CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 
                                                current_device ) );
        checkCudaErrors( cuDeviceGetAttribute( &clockRate, 
                                                CU_DEVICE_ATTRIBUTE_CLOCK_RATE, 
                                                current_device ) );
		checkCudaErrors( cuDeviceComputeCapability(&major, &minor, current_device ) );

		if (major == 9999 && minor == 9999) {
		    sm_per_multiproc = 1;
		} else {
		    sm_per_multiproc = _ConvertSMVer2Cores(major, minor);
		}

		int compute_perf  = multiProcessorCount * sm_per_multiproc * clockRate;
		if( compute_perf  > max_compute_perf ) {
            // If we find GPU with SM major > 2, search only these
			if ( best_SM_arch > 2 ) {
				// If our device==dest_SM_arch, choose this, or else pass
				if (major == best_SM_arch) {	
                    max_compute_perf  = compute_perf;
                    max_perf_device   = current_device;
				}
			} else {
				max_compute_perf  = compute_perf;
				max_perf_device   = current_device;
			}
		}
		++current_device;
	}
	return max_perf_device;
}
//==--- Utility Functions --------------------------------------------------== //
const char * statusToString(CUresult error)
{
  switch (error) {
    case CUDA_SUCCESS: return "No errors";
    case CUDA_ERROR_INVALID_VALUE: return "Invalid value";
    case CUDA_ERROR_OUT_OF_MEMORY: return "Out of memory";
    case CUDA_ERROR_NOT_INITIALIZED: return "Driver not initialized";
    case CUDA_ERROR_DEINITIALIZED: return "Driver deinitialized";

    case CUDA_ERROR_NO_DEVICE: return "No CUDA-capable device available";
    case CUDA_ERROR_INVALID_DEVICE: return "Invalid device";

    case CUDA_ERROR_INVALID_IMAGE: return "Invalid kernel image";
    case CUDA_ERROR_INVALID_CONTEXT: return "Invalid context";
    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: return "Context already current";
    case CUDA_ERROR_MAP_FAILED: return "Map failed";
    case CUDA_ERROR_UNMAP_FAILED: return "Unmap failed";
    case CUDA_ERROR_ARRAY_IS_MAPPED: return "Array is mapped";
    case CUDA_ERROR_ALREADY_MAPPED: return "Already mapped";
    case CUDA_ERROR_NO_BINARY_FOR_GPU: return "No binary for GPU";
    case CUDA_ERROR_ALREADY_ACQUIRED: return "Already acquired";
    case CUDA_ERROR_NOT_MAPPED: return "Not mapped";

    case CUDA_ERROR_INVALID_SOURCE: return "Invalid source";
    case CUDA_ERROR_FILE_NOT_FOUND: return "File not found";

    case CUDA_ERROR_INVALID_HANDLE: return "Invalid handle";

    case CUDA_ERROR_NOT_FOUND: return "Not found";

    case CUDA_ERROR_NOT_READY: return "CUDA not ready";

    case CUDA_ERROR_LAUNCH_FAILED: return "Launch failed";
    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: return "Launch exceeded resources";
    case CUDA_ERROR_LAUNCH_TIMEOUT: return "Launch exceeded timeout";
    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: return "Launch with incompatible texturing";

    case CUDA_ERROR_UNKNOWN: return "Unknown error";
    default: return "Unknown error ID ";
  }
}

void checkSuccess(CUresult    status,
                  const char *func,
                  const char *errorBuffer = 0)
{
  if (status != CUDA_SUCCESS) {
    if (errorBuffer != 0) {
      std::cerr << "ERROR LOG:" << std::endl
                << errorBuffer << std::endl;
    }

    std::cerr << "ERROR: Could not execute '" << func << "', error ("
              << status << ") " << statusToString(status) << std::endl;
    exit(1);
  }
}
// General initialization call to pick the best CUDA Device
inline CUdevice findCudaDevice(int argc, char **argv, int *p_devID)
{
    CUdevice cuDevice;
    int devID = 0;
    // If the command-line has a device number specified, use it
    if( checkCmdLineFlag(argc, (const char**)argv, "device") ) {
        devID = cudaDeviceInit(argc, argv);
        if (devID < 0) {
            printf("exiting...\n");
            exit(0);
        }
    } else {
        // Otherwise pick the device with highest Gflops/s
        char name[100];
        devID = getMaxGflopsDeviceId();
        checkCudaErrors(cuDeviceGet(&cuDevice, devID));
        cuDeviceGetName(name, 100, cuDevice);
        printf("> Using CUDA Device [%d]: %s\n", devID, name);
    }
    cuDeviceGet(&cuDevice, devID);
    if (p_devID) *p_devID = devID;
    return cuDevice;
}
// end of CUDA Helper Functions


// Host code
int main(int argc, char** argv)
{
    pArgc = &argc;
    pArgv = argv;

    shrQAStart(argc, argv);

    printf("nbody Samsung (Driver API)\n");
    int N = NBODY , devID = 0;
    size_t  size = NBODY * POS_ATTRIB_SIZE * sizeof(float);

    CUresult error;
    ParseArguments(argc, argv);

    // Initialize
    checkCudaErrors ( cuInit(0) );

   // This assumes that the user is attempting to specify a explicit device -device=n
   if (argc > 1) {
       bool bFound = false;
       for (int param=0; param < argc; param++) {
       int string_start = 0;
            while (argv[param][string_start] == '-') {
               string_start++;
            }
            char *string_argv = &argv[param][string_start];
            if (!strncmp(string_argv, "device", 6)) {
                int len=(int)strlen(string_argv);
                while (string_argv[len] != '=') {
                   len--;
                }
                devID = atoi(&string_argv[++len]);
                bFound = true;
            }
            if (bFound) 
                break;
        }
    }

    // Get number of devices supporting CUDA
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    if (error != CUDA_SUCCESS) Cleanup(false);
    if (deviceCount == 0) {
        printf("There is no device supporting CUDA.\n");
        Cleanup(false);
    }
    if (devID < 0) {
        devID = 0;
    }
    if (devID > deviceCount-1) {
       fprintf(stderr, "(Device=%d) invalid GPU device.  %d GPU device(s) detected.\nexiting...\n", devID, deviceCount);
       CleanupNoFailure();
       shrQAFinishExit(argc, (const char **)argv, QA_WAIVED);
    } else {
       int major, minor;
       char deviceName[100];
       checkCudaErrors( cuDeviceComputeCapability(&major, &minor, devID) );
       checkCudaErrors( cuDeviceGetName(deviceName, 256, devID) );
       printf("> Using Device %d: \"%s\" with Compute %d.%d capability\n", devID, deviceName, major, minor);
    }

    // pick up device with zero ordinal (default, or devID)
    error = cuDeviceGet(&cuDevice, devID);
    if (error != CUDA_SUCCESS) Cleanup(false);

    // Create context
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    if (error != CUDA_SUCCESS) Cleanup(false);

    // first search for the module path before we load the results
    string module_path, ptx_source;

    if (!findModulePath ("nbodySamsung.kernel.ptx", module_path, argv, ptx_source)) {
       if (!findModulePath ("nbodySamsung.kernel.cubin", module_path, argv, ptx_source)) {
           printf("> findModulePath could not find <nbodySamsung> ptx or cubin\n");
           Cleanup(false);
       }
    } else {
       printf("> initCUDA loading module: <%s>\n", module_path.c_str());
    }

    // Create module from binary file (PTX or CUBIN)
    if (module_path.rfind("ptx") != string::npos) {
        // in this branch we use compilation with parameters
       /* const unsigned int jitNumOptions = 3;
        CUjit_option *jitOptions = new CUjit_option[jitNumOptions];
        void **jitOptVals = new void*[jitNumOptions];

        // set up size of compilation log buffer
        jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;
        int jitLogBufferSize = 1024;
        jitOptVals[0] = (void *)(size_t)jitLogBufferSize;

        // set up pointer to the compilation log buffer
        jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;
        char *jitLogBuffer = new char[jitLogBufferSize];
        jitOptVals[1] = jitLogBuffer;

	// set up pointer to set the Maximum # of registers for a particular kernel
	jitOptions[2] = CU_JIT_MAX_REGISTERS;
	int jitRegCount = 32;
	jitOptVals[2] = (void *)(size_t)jitRegCount;
	//printf(" %s",ptx_source.c_str());*/
  
	const int kLogSize = 1024;
  	char      logBuffer[kLogSize];

	// Read the PTX kernel from disk
	  std::ifstream kernelFile("nbodySamsung.kernel.ptx");
	  if (!kernelFile.is_open()) {
	    std::cerr << "Failed to open nbody.kernel.ptx\n";
	    return 1;
	  }

	  // Load entire kernel into a string
	  std::string source(std::istreambuf_iterator<char>(kernelFile),
		             (std::istreambuf_iterator<char>()));


	// Configure JIT options
	  CUjit_option jitOptions[] = { CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
		                        CU_JIT_ERROR_LOG_BUFFER };
	  void* jitOptionValues[]   = { reinterpret_cast<void*>(kLogSize), logBuffer };

	  // Load the kernel onto the device
	  error = cuModuleLoadDataEx(&cuModule, source.c_str(),
		                      sizeof(jitOptions)/sizeof(jitOptions[0]),
		                      jitOptions, jitOptionValues);
	  checkSuccess(error, "cuModuleLoadDataEx", logBuffer);

	printf("> PTX JIT log:\n%s\n", logBuffer);
    }  else {
        error = cuModuleLoad(&cuModule, module_path.c_str());
    }
    if (error != CUDA_SUCCESS) Cleanup(false);

    // Get function handle from module
    error = cuModuleGetFunction(&nbodySamsung_kernel, cuModule, "nbodySamsung_kernel");
    checkSuccess(error, "cuModuleGetFunction");
    if (error != CUDA_SUCCESS) Cleanup(false);
    
    printf("\n Got function Handle");
    // Allocate input vectors h_A and h_B in host memory
    hostcurPos = (float*)malloc(size);
    if (hostcurPos == 0) Cleanup(false);

    hostcurVel = (float*)malloc(size);
    if (hostcurVel == 0) Cleanup(false);

    hostnxtPos = (float*)malloc(size);
    if (hostnxtPos == 0) Cleanup(false);

    hostnxtVel = (float*)malloc(size);
    if (hostnxtPos == 0) Cleanup(false);		
	
    printf("\n Host Buffers Allocated");
    // Initialize input vectors
    RandomInit(hostcurPos, NBODY * POS_ATTRIB_SIZE);
    RandomInit(hostcurVel, NBODY * VEL_ATTRIB_SIZE);
    printf("\n Host Buffers inited");

    // Allocate vectors in device memory
    error = cuMemAlloc(&devcurPos, size);
    checkSuccess(error, "cuMemAlloc ");
    if (error != CUDA_SUCCESS) Cleanup(false);

    error = cuMemAlloc(&devcurVel, size);
    checkSuccess(error, "cuMemAlloc "); 
    if (error != CUDA_SUCCESS) Cleanup(false);

    error = cuMemAlloc(&devnxtPos, size);
    checkSuccess(error, "cuMemAlloc ");
    if (error != CUDA_SUCCESS) Cleanup(false);

    error = cuMemAlloc(&devnxtVel, size);
    checkSuccess(error, "cuMemAlloc ");
    if (error != CUDA_SUCCESS) Cleanup(false);	

    printf("\n Dev Buffers Allocated");

    // Copy vectors from host memory to device memory
    error = cuMemcpyHtoD(devcurPos, hostcurPos, size);
    checkSuccess(error, "cuMemcpyHtoD ");
    if (error != CUDA_SUCCESS) Cleanup(false);

    error = cuMemcpyHtoD(devcurVel, hostcurVel, size);
    checkSuccess(error, "cuMemcpyHtoD ");	
    if (error != CUDA_SUCCESS) Cleanup(false);

    int numBodies   = NBODY;
    float deltaTime =  0.005;
    int epsSqr      = EPSSQR;
   
    int threadsPerBlock = 256;
    unsigned int localMemSize = threadsPerBlock * POS_ATTRIB_SIZE * sizeof(float);

#if 1
    if (1) {
    // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch (simpler method)

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

        void *args[] = { &devcurPos, &devcurVel, &numBodies, &deltaTime, &epsSqr, &devnxtPos, &devnxtVel };

        // Launch the CUDA kernel
        error = cuLaunchKernel( nbodySamsung_kernel,  blocksPerGrid, 1, 1, 
                                threadsPerBlock, 1, 1,
                                localMemSize, 
                                NULL, args, NULL);
	
        checkSuccess(error, "cuLaunchKernel - 1");

        if (error != CUDA_SUCCESS) Cleanup(false);
    } else {
    // This is the new CUDA 4.0 API for Kernel Parameter Passing and Kernel Launch (advanced method)
        int offset = 0;
        void *argBuffer[16];
       

	*((CUdeviceptr *)&argBuffer[offset])   = devcurPos;     offset += sizeof(devcurPos);
        *((CUdeviceptr *)&argBuffer[offset])   = devcurVel;     offset += sizeof(devcurVel);
	*((int         *)&argBuffer[offset])   = numBodies;     offset += sizeof(numBodies);
	*((float         *)&argBuffer[offset]) = deltaTime;     offset += sizeof(deltaTime);
	*((int         *)&argBuffer[offset])   = epsSqr;        offset += sizeof(epsSqr);
        *((CUdeviceptr *)&argBuffer[offset])   = devnxtPos;     offset += sizeof(devnxtPos);
        *((CUdeviceptr *)&argBuffer[offset])   = devnxtVel;     offset += sizeof(devnxtVel);

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the CUDA kernel
        error = cuLaunchKernel( nbodySamsung_kernel,  blocksPerGrid, 1, 1, 
                                threadsPerBlock, 1, 1,
                                localMemSize,
                                NULL, NULL, argBuffer);
        checkSuccess(error, "cuLaunchKernel - 2");

        if (error != CUDA_SUCCESS) Cleanup(false);
    }
#else
    {
	int offset = 0;
        char argBuffer[256];

        // pass in launch parameters (not actually de-referencing CUdeviceptr).  CUdeviceptr is
        // storing the value of the parameters
        *((CUdeviceptr *)&argBuffer[offset])   = devcurPos;     offset += sizeof(devcurPos);
        *((CUdeviceptr *)&argBuffer[offset])   = devcurVel;     offset += sizeof(devcurVel);
	*((int         *)&argBuffer[offset])   = numBodies;     offset += sizeof(numBodies);
	*((float         *)&argBuffer[offset]) = deltaTime;     offset += sizeof(deltaTime);
	*((int         *)&argBuffer[offset])   = epsSqr;        offset += sizeof(epsSqr);
        *((CUdeviceptr *)&argBuffer[offset])   = devnxtPos;     offset += sizeof(devnxtPos);
        *((CUdeviceptr *)&argBuffer[offset])   = devnxtVel;     offset += sizeof(devnxtVel);

        void *kernel_launch_config[5] = {
            CU_LAUNCH_PARAM_BUFFER_POINTER, argBuffer,
            CU_LAUNCH_PARAM_BUFFER_SIZE,    &offset,
            CU_LAUNCH_PARAM_END
        };

        // Grid/Block configuration
        int threadsPerBlock = 256;
        int blocksPerGrid   = (N + threadsPerBlock - 1) / threadsPerBlock;

        // Launch the CUDA kernel
        error = cuLaunchKernel( nbodySamsung_kernel,  blocksPerGrid, 1, 1, 
                                threadsPerBlock, 1, 1,
                                localMemSize, 0,
                                NULL, (void **)&kernel_launch_config);
	checkSuccess(error, "cuLaunchKernel - 3");
        if (error != CUDA_SUCCESS) Cleanup(false);
    }
#endif

#ifdef _DEBUG
    error = cuCtxSynchronize();
    if (error != CUDA_SUCCESS) Cleanup(false);
#endif
     
    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    error = cuMemcpyDtoH(hostcurPos, devcurPos, size);
    checkSuccess(error, "cuMemcpyDtoH");
    if (error != CUDA_SUCCESS) Cleanup(false);
  
    error = cuMemcpyDtoH(hostcurVel, devcurVel, size);
    checkSuccess(error, "cuMemcpyDtoH");	
    if (error != CUDA_SUCCESS) Cleanup(false);

    error = cuMemcpyDtoH(hostnxtPos, devnxtPos, size);
    checkSuccess(error, "cuMemcpyDtoH");
    if (error != CUDA_SUCCESS) Cleanup(false);

    error = cuMemcpyDtoH(hostnxtVel, devnxtVel, size);
    checkSuccess(error, "cuMemcpyDtoH");
    if (error != CUDA_SUCCESS) Cleanup(false);    
	
    
    // Verify result
    /*int i;
    for (i = 0; i < N; ++i) {
        float sum = h_A[i] + h_B[i];
        if (fabs(h_C[i] - sum) > 1e-7f)
            break;
    }*/
    shrQAFinishExit(argc, (const char **)argv, 1 ? QA_PASSED : QA_FAILED);
}

CUresult CleanupNoFailure()
{
    CUresult error;
        
    // Free device memory
    if (devcurPos)
        error = cuMemFree(devcurPos);
    if (devcurVel)
        error = cuMemFree(devcurVel);
    if (devnxtPos)
        error = cuMemFree(devnxtPos);
    if (devnxtVel)
        error = cuMemFree(devnxtVel);

    // Free host memory
    if (hostcurPos)
        free(hostcurPos);
    if (hostcurVel)
        free(hostcurVel);
    if (hostnxtPos)
        free(hostnxtPos);
    if (hostnxtVel)
        free(hostnxtVel);  	
        
    error = cuCtxDetach(cuContext);

    return error;
}

void Cleanup(bool noError)
{
    CUresult error;
    error = CleanupNoFailure();

    if (!noError || error != CUDA_SUCCESS) {
        printf("Function call failed\nFAILED\n");
        shrQAFinish2(true, *pArgc, (const char **)pArgv, QA_FAILED);
    }
    
    if (!noprompt) {
        printf("\nPress ENTER to exit...\n");
        fflush( stdout);
        fflush( stderr);
        getchar();
    }
}


// Allocates an array with random float entries.
void RandomInit(float* data, int n)
{
    for (int i = 0; i < n; ++i)
        data[i] = rand() / (float)RAND_MAX;
}

bool inline
findModulePath(const char *module_file, string & module_path, char **argv, string & ptx_source)
{
    char *actual_path = sdkFindFilePath(module_file, argv[0]);
    if (actual_path) {
       module_path = actual_path;
    } else {
       printf("> findModulePath file not found: <%s> \n", module_file); 
       return false;
    }

    if (module_path.empty()) {
       printf("> findModulePath could not find file: <%s> \n", module_file); 
       return false;
    } else {
       printf("> findModulePath found file at <%s>\n", module_path.c_str());

       if (module_path.rfind(".ptx") != string::npos) {
           FILE *fp = fopen(module_path.c_str(), "rb");
           fseek(fp, 0, SEEK_END);
           int file_size = ftell(fp);
           char *buf = new char[file_size+1];
           fseek(fp, 0, SEEK_SET);
           fread(buf, sizeof(char), file_size, fp);
           fclose(fp);
           buf[file_size] = '\0';
           ptx_source = buf;
           delete[] buf;
       }
       return true;
    }
}

// Parse program arguments
void ParseArguments(int argc, char** argv)
{
    for (int i = 0; i < argc; ++i) {
        if (strcmp(argv[i], "--noprompt") == 0 ||
            strcmp(argv[i], "-noprompt") == 0) 
        {
            noprompt = true;
            break;
        }
    }
}

Would anyone help me figure out the mistake here ? thanks in advance

If you run with a debug instead of release build, cuda-memcheck should return the offending line in the source code, rather than the relevant instruction pointer value.

As an orthogonal question, why does the code have #defines for sinpi() and cospi(). These functions are already provided by the CUDA math library, with better numerical properties than the #defined versions.

I using LLVM-3.2 to build the host and kernel code(to PTX). It is built using debug options and also my program. I am unable to get line number in cuda-memcheck by that. Does that work only if it is built with nvcc ?

I did not find separate sinpi & cospi in math lib. Although it did have sincospi but I wasn;t sure how ot use it

I am not sure what you mean by “using LLVM-3.2 to build”. Are you using CUDA 5.0 and the nvcc compiler that comes with it? The fact that sincospi() is available would indicate you are using CUDA 5.0, because this function was introduced in that CUDA release. sinpi() and cospi() were introduced in CUDA versions prior to CUDA 5.0, so if sincospi() is available, so should sinpi() and cospi(). You can see the prototypes in the header file math_functions.h. The single precision implementation of these functions is located in that same file, the corresponding double-precision implementation in the file math_functions_dbl_ptx3.h. These header files with the inline functions of the CUDA math library are automatically included when nvcc compiles source file from a .cu file.

OK. I will check that. By saying I use “LLVM-3.2”, I am saying I don’t use nvcc compiler that comes with it. I am using the LLVM compiler for nvptx backend

I am not familiar with the setup. This probably also explains why sinpi() and cospi() don’t seem to be available. I would suggest installing CUDA 5.0 and using nvcc to compile the program in debug mode (using the switches -g -G, I think) to generate debug info that cuda-memcheck can use to pin-point the relevant line in the source code. Make sure to also install the latest drivers.

Is there a particular reason you don’t use the CUDA toolchain to compile CUDA code?

Yes, I am working on a project that requires me to use LLVM. Sorry, I should have mentioned the above code works fine when built with nvcc. But I was kinda thinking that there is some bug in the program that nvcc is unable to catch. I was wondering if there is anything I am overlooking in my code or if there is anything I should take care of that is causing this segfault.

A compiler’s static analysis has only limited chances of catching out-of-bounds accesses and the like. This is why tools providing runtime checking like valgrind and cuda-memcheck exist. If the program compiles fine with nvcc and running the resulting executable with cuda-memcheck reports no errors, the likely cause of the problem you are seeing is your custom toolchain. Debug from first principles. Check the generated machine code, double-check the data (pointers in particular) when running with a debugger, etc.