driver.cu(255) : cudaSafeCall() Runtime API error : unspecified launch failure.

japnik_Singh · June 3, 2010, 8:50pm

q1) how shall i allocate in device memory a sturct ALLNODES which itself contains pointer to array of struct NODES

//real_t means FLOAT

typedef struct {

real_t *x; // X coordinate of the points

real_t *y; // Y coordinate of the points

real_t *z; // Z coordinate of the points

real_t *den_pot; // Density/Potential of the points

int num_pts; // Number of points in the node

} Node;

typedef struct {

int num_nodes; // Total number of nodes

Node *N;

real_t *xbuffer;

real_t *ybuffer;

real_t *zbuffer;

real_t *den_potbuffer;

int *counts;

} AllNodes;

q2) How to debug cuda code in MAC…

my code is

[codebox]#include <assert.h>

#include <stdio.h>

#include <stdlib.h>

//#include <sys/time.h>

#include “node.h”

#include “reals.h”

#include “reals_aligned.h”

#include “evaluate.h”

// includes, kernels

#include <japnik_kernel.cu>

// includes, project

#include <cutil_inline.h>

static void usage (const char* use) {

fprintf (stderr, “usage: %s <source_filename> <target_filename>”

" <Ulist_filename> \n", use);

}

int main (int argc, char* argv)

{

AllNodes *src_node;

AllNodes *trg_node;

AllLists *ulist;

const char* input_src_filename;

const char* input_trg_filename;

const char* input_ulist_filename;

int src_node_count, trg_node_count;

int global_count;

FILE *fout;

if (argc != 4) {

usage (argv[0]);

return -1;

}

/** Command line input */

input_src_filename = argv[1];

input_trg_filename = argv[2];

input_ulist_filename = argv[3];

/* /CLOCKING; //removed

struct stopwatch_t* timer = NULL;

long double t_u;

stopwatch_init ();

timer = stopwatch_create ();

*/

/** Setup to load points and create data structure for nodes/lists */

src_node = load_nodes (input_src_filename);

trg_node = load_nodes (input_trg_filename);

trg_node_count = trg_node->num_nodes;

src_node_count = src_node->num_nodes;

ulist = load_lists (input_ulist_filename, trg_node_count);

fprintf(stderr, “Finished setup\n”);

/* Assign random values to all the target points */

global_count = 0;

for (int i = 0; i < trg_node_count; i++) {

get_value(trg_node->counts[i], trg_node->xbuffer + global_count);

get_value(trg_node->counts[i], trg_node->ybuffer + global_count);

get_value(trg_node->counts[i], trg_node->zbuffer + global_count);

set_zero(trg_node->counts[i], trg_node->den_potbuffer + global_count);

global_count += trg_node->counts[i];

}

global_count = 0;

for (int i = 0; i < trg_node_count; i++) {

trg_node->N[i].x = trg_node->xbuffer + global_count;

trg_node->N[i].y = trg_node->ybuffer + global_count;

trg_node->N[i].z = trg_node->zbuffer + global_count;

trg_node->N[i].den_pot = trg_node->den_potbuffer + global_count;

trg_node->N[i].num_pts = trg_node->counts[i];

global_count += trg_node->counts[i];

}

/* Assign random values to all the source points */

global_count = 0;

for (int i = 0; i < src_node_count; i++) {

get_value(src_node->counts[i], src_node->xbuffer + global_count);

get_value(src_node->counts[i], src_node->ybuffer + global_count);

get_value(src_node->counts[i], src_node->zbuffer + global_count);

get_value(src_node->counts[i], src_node->den_potbuffer + global_count);

global_count += src_node->counts[i];

}

global_count = 0;

for (int i = 0; i < src_node_count; i++) {

src_node->N[i].x = src_node->xbuffer + global_count;

src_node->N[i].y = src_node->ybuffer + global_count;

src_node->N[i].z = src_node->zbuffer + global_count;

src_node->N[i].den_pot = src_node->den_potbuffer + global_count;

src_node->N[i].num_pts = src_node->counts[i];

global_count += src_node->counts[i];

}

for (int i = 0; i < trg_node_count; i++) {

ulist->L[i].node_list = ulist->nodelist_buffer + ulist->offsets[i];

ulist->L[i].num_listnodes = ulist->counts[i];

}

printf(“random number assigned inhost memory”);

//…for allocating and copying memory to device

int BLOCK_SIZE=8;

int threadsPerBlock= BLOCK_SIZE * BLOCK_SIZE;

//for target

size_t size=0; //just chk size_t is predefined

Node N[trg_node_count]; //to be passed as argument to multiplicat kernel

for(int i=0; i<trg_node_count;i++)

{

size=trg_node->N[i].num_pts*sizeof(real_t); //assigning sapce to node to the nearest multiple of thredsperblock

N[i].num_pts=trg_node->N[i].num_pts;

cutilSafeCall(cudaMalloc((void**)&N[i].x,size));

cutilSafeCall(cudaMalloc((void**)&N[i].y,size));

cutilSafeCall(cudaMalloc((void**)&N[i].z,size));

cutilSafeCall(cudaMalloc((void**)&N[i].den_pot,size));

cutilSafeCall(cudaMemcpy(N[i].x,trg_node->N[i].x,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(N[i].y,trg_node->N[i].y,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(N[i].z,trg_node->N[i].z,size,cudaMemcpyHostToDevice));

//cudaMemcpy(dev_ptr_trg_den[i],trg_node->N[i].den_pot,size,cudaMemcpyHostToDevice); //to be calculted

}

//…

//for source

AllNodes src; //to be passed to kernel

//src.counts=src_node->counts;

src.num_nodes=src_node->num_nodes;

size=src.num_nodes*sizeof(Node);

//printf (“%s \n”, “A string”); //debuggin tym

cutilSafeCall(cudaMalloc((void**)&src.N,size));

//cutilSafeCall(cudaMemcpy((void**)&src.N,src_node->N,size,cudaMemcpyHostToDevice)); //ques if num_pts in each node gets copied

for(int i=0; i<src_node_count;i++)

{

size=src_node->N[i].num_pts*sizeof(real_t);

cutilSafeCall(cudaMalloc((void**)&src.N[i].x,size));

cutilSafeCall(cudaMalloc((void**)&src.N[i].y,size));

cutilSafeCall(cudaMalloc((void**)&src.N[i].z,size));

cutilSafeCall(cudaMalloc((void**)&src.N[i].den_pot,size));

cutilSafeCall(cudaMemcpy(src.N[i].x,src_node->N[i].x,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(src.N[i].y,src_node->N[i].y,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(src.N[i].z,src_node->N[i].z,size,cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(src.N[i].den_pot,src_node->N[i].den_pot,size,cudaMemcpyHostToDevice));

}

//…

//for ulist

List L[trg_node_count];

for(int i=0;i<trg_node_count;i++)

{

L[i].num_listnodes=ulist->L[i].num_listnodes;

size=ulist->L[i].num_listnodes*sizeof(int);

cutilSafeCall(cudaMalloc((void**)&L[i].node_list,size));

cutilSafeCall(cudaMemcpy(L[i].node_list,ulist->L[i].node_list,size,cudaMemcpyHostToDevice));

}

printf(“\n memory allocation done for device \n”);

//memory allocation done…

dim3 dimBlock;

dimBlock.x=BLOCK_SIZE;

dimBlock.y=BLOCK_SIZE;

for(int i=0;i<trg_node_count;i++)

{

printf(“enterin for loop %i”,i);

size=src_node->N[i].num_pts * sizeof(real_t);

dim3 dimgrid;

dimgrid.x= (trg_node->N[i].num_pts + threadsPerBlock - 1) / threadsPerBlock ; //need to ckeck again…if num blocks

dir_eval<<<dimgrid,dimBlock>>>(N[i],L[i],src,threadsPerBlock);

printf(" copying memeory from device to host \n");

cutilSafeCall(cudaMemcpy(trg_node->N[i].den_pot,N[i].den_pot,size,cudaMemcpyDeviceToHost));

}

/*****************

IN LAST LINE OF ABOVE FOR LOOP I M GETTING THE ERROR…driver.cu(255) : cudaSafeCall() Runtime API error : unspecified launch failure.

********************/

//freeing memory

//targets and ulists

for(int i=0; i<trg_node_count;i++)

{

	cudaFree(N[i].x);

	cudaFree(N[i].y);

	cudaFree(N[i].z);

	cudaFree(N[i].den_pot);

	cudaFree(L[i].node_list);

}

//source

cudaFree(src.N);

for (int i=0; i<src_node_count; i++) 

{

	cudaFree(src.N[i].x);		

	cudaFree(src.N[i].y);		

	cudaFree(src.N[i].z);		

	cudaFree(src.N[i].den_pot);		

}

//***********************************************

//kernel its another file but i am pasting it here

#ifndef M_PI

#define M_PI 3.1415926535897932385

#endif

#define OOFP 1.0/(4.0 * M_PI)

host device int nearthreadsPerBlock(int num,int threadsPerBlock)

{

while (num%threadsPerBlock!=0) {

	num++;

}

return num;

}

global void dir_eval(Node A,List B,AllNodes C,int threadsPerBlock)

{

int i=blockIdx.x * threadsPerBlock + (threadIdx.y*blockDim.x)+threadIdx.x;

//printf(“%i”,i);

if(i<A.num_pts)

{

A.den_pot[i]=7; //initializing it to 0… actually this is to be found

}

for(int j=0;j<B.num_listnodes;j++)

{

A.den_pot[i]=A.den_pot[i]+1;

for(int k=0;k<nearthreadsPerBlock(C.N[B.node_list[j]].num_pts,threadsPe

rBlock)/threadsPerBlock;k++)

{

shared float4 point_src[64] ; //number of threads in a block

int l=i%threadsPerBlock;

point_src[l].x=C.N[B.node_list[j]].x[(threadsPerBlock*k)+l]; // each thread puts one paoints stuff in shared memory

point_src[l].y=C.N[B.node_list[j]].y[k*threadsPerBlock+l]; // each thread puts one paoints stuff in shared memory

point_src[l].z=C.N[B.node_list[j]].z[k*threadsPerBlock+l]; // each thread puts one paoints stuff in shared memory

point_src[l].w=C.N[B.node_list[j]].den_pot[k*threadsPerBlock

+l]; // each thread puts one paoints stuff in shared memory

__syncthreads(); //to enure that all threads 've loaded shard memory

for(int m=0;m< threadsPerBlock;m++)

{

if(i<A.num_pts){

real_t x,y,z,r,r2;

//oofp = (real_t) (1.0/(4.0 * M_PI));

x=A.x[i]-point_src[m].x;

y=A.y[i]-point_src[m].y;

z=A.z[i]-point_src[m].z;

r2=xx + yy + z*z;

r=sqrt(r2); //assuming sqrt predefined

A.den_pot[i]+=(OOFP/r)*point_src[m].w; //finall answer

}

__syncthreads(); //to ensure all threads are done with this calc

}

printf(“done”);

fout = fopen (“direct_japnik”, “w”);

for (int i = 0; i < trg_node_count; i++) {

for (int j = 0; j < trg_node->N[i].num_pts; j++) {

fprintf (fout, "%lf ", trg_node->N[i].den_pot[j]);

}

fprintf (fout, “\n”);

}

fclose (fout);

//stopwatch_destroy (timer);

free_lists(ulist);

free_nodes(src_node); free_nodes(trg_node);

free(ulist);

free(src_node); free(trg_node);

return 0;

}

/* ----------------------------------------------------------------------------------------------------------

eof

*/

/*

japnik.h
Created by Japnik Singh on 5/26/10.

*/

[/codebox]

Topic		Replies	Views
cudaMalloc issue CUDA Programming and Performance	4	948	January 23, 2018
How to allocation in cudaMalloc...? Please help me... CUDA Programming and Performance	7	846	January 15, 2018
cudaMalloc error in big loop CUDA Programming and Performance	12	15782	May 21, 2008
Device Memeroy allocation and data transfer Data transfer between host and device CUDA Programming and Performance	5	2661	June 16, 2011
Problem With GPU Memory CUDA Programming and Performance	4	3082	May 29, 2009
CUDA complex memory allocation problems Problem allocating pointer to pointer me CUDA Programming and Performance	4	3166	March 3, 2008
kernel fails to launch (example included) stripped down self-contained example CUDA Programming and Performance	5	2688	August 27, 2007
A naive question about dereferenced pointers unspecified launch failure CUDA Programming and Performance	10	3077	September 24, 2008
How get in host the memory allocated from device CUDA Programming and Performance	10	3227	August 16, 2017
Allocating device memory for an struc inside an std::vector<struct> CUDA Programming and Performance	2	136	September 28, 2024

driver.cu(255) : cudaSafeCall() Runtime API error : unspecified launch failure.

Related topics