driver.cu(255) : cudaSafeCall() Runtime API error : unspecified launch failure.

q1) how shall i allocate in device memory a sturct ALLNODES which itself contains pointer to array of struct NODES

//real_t means FLOAT

typedef struct {

real_t *x; // X coordinate of the points

real_t *y; // Y coordinate of the points

real_t *z; // Z coordinate of the points

real_t *den_pot; // Density/Potential of the points

int num_pts; // Number of points in the node

} Node;

typedef struct {

int num_nodes; // Total number of nodes

Node *N;

real_t *xbuffer;

real_t *ybuffer;

real_t *zbuffer;

real_t *den_potbuffer;

int *counts;

} AllNodes;

q2) How to debug cuda code in MAC…

my code is

[codebox]#include <assert.h>

#include <stdio.h>

#include <stdlib.h>

//#include <sys/time.h>

#include “node.h”

#include “reals.h”

#include “reals_aligned.h”

#include “evaluate.h”

// includes, kernels

#include <japnik_kernel.cu>

// includes, project

#include <cutil_inline.h>

static void usage (const char* use) {

fprintf (stderr, “usage: %s <source_filename> <target_filename>”

" <Ulist_filename> \n", use);

}

int main (int argc, char* argv)

{

AllNodes *src_node;

AllNodes *trg_node;

AllLists *ulist;

const char* input_src_filename;

const char* input_trg_filename;

const char* input_ulist_filename;

int src_node_count, trg_node_count;

int global_count;

FILE *fout;

if (argc != 4) {

usage (argv[0]);

return -1;

}

/** Command line input */

input_src_filename = argv[1];

input_trg_filename = argv[2];

input_ulist_filename = argv[3];

/* /CLOCKING; //removed

struct stopwatch_t* timer = NULL;

long double t_u;

stopwatch_init ();

timer = stopwatch_create ();

*/

/** Setup to load points and create data structure for nodes/lists */

src_node = load_nodes (input_src_filename);

trg_node = load_nodes (input_trg_filename);

trg_node_count = trg_node->num_nodes;

src_node_count = src_node->num_nodes;

ulist = load_lists (input_ulist_filename, trg_node_count);

fprintf(stderr, “Finished setup\n”);

/* Assign random values to all the target points */

global_count = 0;

for (int i = 0; i < trg_node_count; i++) {

get_value(trg_node->counts[i], trg_node->xbuffer + global_count);

get_value(trg_node->counts[i], trg_node->ybuffer + global_count);

get_value(trg_node->counts[i], trg_node->zbuffer + global_count);

set_zero(trg_node->counts[i], trg_node->den_potbuffer + global_count);

global_count += trg_node->counts[i];

}

global_count = 0;

for (int i = 0; i < trg_node_count; i++) {

trg_node->N[i].x = trg_node->xbuffer + global_count;

trg_node->N[i].y = trg_node->ybuffer + global_count;

trg_node->N[i].z = trg_node->zbuffer + global_count;

trg_node->N[i].den_pot = trg_node->den_potbuffer + global_count;

trg_node->N[i].num_pts = trg_node->counts[i];

global_count += trg_node->counts[i];

}

/* Assign random values to all the source points */

global_count = 0;

for (int i = 0; i < src_node_count; i++) {

get_value(src_node->counts[i], src_node->xbuffer + global_count);

get_value(src_node->counts[i], src_node->ybuffer + global_count);

get_value(src_node->counts[i], src_node->zbuffer + global_count);

get_value(src_node->counts[i], src_node->den_potbuffer + global_count);

global_count += src_node->counts[i];

}

global_count = 0;

for (int i = 0; i < src_node_count; i++) {

src_node->N[i].x = src_node->xbuffer + global_count;

src_node->N[i].y = src_node->ybuffer + global_count;

src_node->N[i].z = src_node->zbuffer + global_count;

src_node->N[i].den_pot = src_node->den_potbuffer + global_count;

src_node->N[i].num_pts = src_node->counts[i];

global_count += src_node->counts[i];

}

for (int i = 0; i < trg_node_count; i++) {

ulist->L[i].node_list = ulist->nodelist_buffer + ulist->offsets[i];

ulist->L[i].num_listnodes = ulist->counts[i];

}

printf(“random number assigned inhost memory”);

//…for allocating and copying memory to device

int BLOCK_SIZE=8;

int threadsPerBlock= BLOCK_SIZE * BLOCK_SIZE;

//for target

size_t size=0; //just chk size_t is predefined

Node N[trg_node_count]; //to be passed as argument to multiplicat kernel

for(int i=0; i<trg_node_count;i++)

{

size=trg_node->N[i].num_pts*sizeof(real_t); //assigning sapce to node to the nearest multiple of thredsperblock

N[i].num_pts=trg_node->N[i].num_pts;

cutilSafeCall(cudaMalloc((void**)&N[i].x,size));

cutilSafeCall(cudaMalloc((void**)&N[i].y,size));

cutilSafeCall(cudaMalloc((void**)&N[i].z,size));

cutilSafeCall(cudaMalloc((void**)&N[i].den_pot,size));

cutilSafeCall(cudaMemcpy(N[i].x,trg_node->N[i].x,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(N[i].y,trg_node->N[i].y,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(N[i].z,trg_node->N[i].z,size,cudaMemcpyHostToDevice));

//cudaMemcpy(dev_ptr_trg_den[i],trg_node->N[i].den_pot,size,cudaMemcpyHostToDevice); //to be calculted

}

//…

//for source

AllNodes src; //to be passed to kernel

//src.counts=src_node->counts;

src.num_nodes=src_node->num_nodes;

size=src.num_nodes*sizeof(Node);

//printf (“%s \n”, “A string”); //debuggin tym

cutilSafeCall(cudaMalloc((void**)&src.N,size));

//cutilSafeCall(cudaMemcpy((void**)&src.N,src_node->N,size,cudaMemcpyHostToDevice)); //ques if num_pts in each node gets copied

for(int i=0; i<src_node_count;i++)

{

size=src_node->N[i].num_pts*sizeof(real_t);

cutilSafeCall(cudaMalloc((void**)&src.N[i].x,size));

cutilSafeCall(cudaMalloc((void**)&src.N[i].y,size));

cutilSafeCall(cudaMalloc((void**)&src.N[i].z,size));

cutilSafeCall(cudaMalloc((void**)&src.N[i].den_pot,size));

cutilSafeCall(cudaMemcpy(src.N[i].x,src_node->N[i].x,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(src.N[i].y,src_node->N[i].y,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(src.N[i].z,src_node->N[i].z,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(src.N[i].den_pot,src_node->N[i].den_pot,size,cudaMemcpyHostToDevice));

}

//…

//for ulist

List L[trg_node_count];

for(int i=0;i<trg_node_count;i++)

{

L[i].num_listnodes=ulist->L[i].num_listnodes;

size=ulist->L[i].num_listnodes*sizeof(int);

cutilSafeCall(cudaMalloc((void**)&L[i].node_list,size));

cutilSafeCall(cudaMemcpy(L[i].node_list,ulist->L[i].node_list,size,cudaMemcpyHostToDevice));

}

printf(“\n memory allocation done for device \n”);

//memory allocation done…

dim3 dimBlock;

dimBlock.x=BLOCK_SIZE;

dimBlock.y=BLOCK_SIZE;

for(int i=0;i<trg_node_count;i++)

{

printf(“enterin for loop %i”,i);

size=src_node->N[i].num_pts * sizeof(real_t);

dim3 dimgrid;

dimgrid.x= (trg_node->N[i].num_pts + threadsPerBlock - 1) / threadsPerBlock ; //need to ckeck again…if num blocks

dir_eval<<<dimgrid,dimBlock>>>(N[i],L[i],src,threadsPerBlock);

printf(" copying memeory from device to host \n");

cutilSafeCall(cudaMemcpy(trg_node->N[i].den_pot,N[i].den_pot,size,cudaMemcpyDeviceToHost));

}

/*****************

IN LAST LINE OF ABOVE FOR LOOP I M GETTING THE ERROR…driver.cu(255) : cudaSafeCall() Runtime API error : unspecified launch failure.

********************/

//freeing memory

//targets and ulists

for(int i=0; i<trg_node_count;i++)

{

	cudaFree(N[i].x);

	cudaFree(N[i].y);

	cudaFree(N[i].z);

	cudaFree(N[i].den_pot);

	cudaFree(L[i].node_list);

}

//source

cudaFree(src.N);

for (int i=0; i<src_node_count; i++) 

{

	cudaFree(src.N[i].x);		

	cudaFree(src.N[i].y);		

	cudaFree(src.N[i].z);		

	cudaFree(src.N[i].den_pot);		

}

//***********************************************

//kernel its another file but i am pasting it here

#ifndef M_PI

#define M_PI 3.1415926535897932385

#endif

#define OOFP 1.0/(4.0 * M_PI)

host device int nearthreadsPerBlock(int num,int threadsPerBlock)

{

while (num%threadsPerBlock!=0) {

	num++;

}

return num;

}

global void dir_eval(Node A,List B,AllNodes C,int threadsPerBlock)

{

int i=blockIdx.x * threadsPerBlock + (threadIdx.y*blockDim.x)+threadIdx.x;

//printf(“%i”,i);

if(i<A.num_pts)

{

A.den_pot[i]=7; //initializing it to 0… actually this is to be found

}

for(int j=0;j<B.num_listnodes;j++)

{

A.den_pot[i]=A.den_pot[i]+1;

for(int k=0;k<nearthreadsPerBlock(C.N[B.node_list[j]].num_pts,threadsPe

rBlock)/threadsPerBlock;k++)

{

shared float4 point_src[64] ; //number of threads in a block

int l=i%threadsPerBlock;

point_src[l].x=C.N[B.node_list[j]].x[(threadsPerBlock*k)+l]; // each thread puts one paoints stuff in shared memory

point_src[l].y=C.N[B.node_list[j]].y[k*threadsPerBlock+l]; // each thread puts one paoints stuff in shared memory

point_src[l].z=C.N[B.node_list[j]].z[k*threadsPerBlock+l]; // each thread puts one paoints stuff in shared memory

point_src[l].w=C.N[B.node_list[j]].den_pot[k*threadsPerBlock

+l]; // each thread puts one paoints stuff in shared memory

__syncthreads(); //to enure that all threads 've loaded shard memory

for(int m=0;m< threadsPerBlock;m++)

{

if(i<A.num_pts){

real_t x,y,z,r,r2;

//oofp = (real_t) (1.0/(4.0 * M_PI));

x=A.x[i]-point_src[m].x;

y=A.y[i]-point_src[m].y;

z=A.z[i]-point_src[m].z;

r2=xx + yy + z*z;

r=sqrt(r2); //assuming sqrt predefined

A.den_pot[i]+=(OOFP/r)*point_src[m].w; //finall answer

}

}

__syncthreads(); //to ensure all threads are done with this calc

}

}

}

printf(“done”);

fout = fopen (“direct_japnik”, “w”);

for (int i = 0; i < trg_node_count; i++) {

for (int j = 0; j < trg_node->N[i].num_pts; j++) {

fprintf (fout, "%lf ", trg_node->N[i].den_pot[j]);

}

fprintf (fout, “\n”);

}

fclose (fout);

//stopwatch_destroy (timer);

free_lists(ulist);

free_nodes(src_node); free_nodes(trg_node);

free(ulist);

free(src_node); free(trg_node);

return 0;

}

/* ----------------------------------------------------------------------------------------------------------

  • eof

*/

/*

  • japnik.h

  • Created by Japnik Singh on 5/26/10.

  • Copyright 2010 MyCompanyName. All rights reserved.

*/

[/codebox]