q1) how shall i allocate in device memory a sturct ALLNODES which itself contains pointer to array of struct NODES
//real_t means FLOAT
typedef struct {
real_t *x; // X coordinate of the points
real_t *y; // Y coordinate of the points
real_t *z; // Z coordinate of the points
real_t *den_pot; // Density/Potential of the points
int num_pts; // Number of points in the node
} Node;
typedef struct {
int num_nodes; // Total number of nodes
Node *N;
real_t *xbuffer;
real_t *ybuffer;
real_t *zbuffer;
real_t *den_potbuffer;
int *counts;
} AllNodes;
q2) How to debug cuda code in MAC…
my code is
[codebox]#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
//#include <sys/time.h>
#include “node.h”
#include “reals.h”
#include “reals_aligned.h”
#include “evaluate.h”
// includes, kernels
#include <japnik_kernel.cu>
// includes, project
#include <cutil_inline.h>
static void usage (const char* use) {
fprintf (stderr, “usage: %s <source_filename> <target_filename>”
" <Ulist_filename> \n", use);
}
int main (int argc, char* argv)
{
AllNodes *src_node;
AllNodes *trg_node;
AllLists *ulist;
const char* input_src_filename;
const char* input_trg_filename;
const char* input_ulist_filename;
int src_node_count, trg_node_count;
int global_count;
FILE *fout;
if (argc != 4) {
usage (argv[0]);
return -1;
}
/** Command line input */
input_src_filename = argv[1];
input_trg_filename = argv[2];
input_ulist_filename = argv[3];
/* /CLOCKING; //removed
struct stopwatch_t* timer = NULL;
long double t_u;
stopwatch_init ();
timer = stopwatch_create ();
*/
/** Setup to load points and create data structure for nodes/lists */
src_node = load_nodes (input_src_filename);
trg_node = load_nodes (input_trg_filename);
trg_node_count = trg_node->num_nodes;
src_node_count = src_node->num_nodes;
ulist = load_lists (input_ulist_filename, trg_node_count);
fprintf(stderr, “Finished setup\n”);
/* Assign random values to all the target points */
global_count = 0;
for (int i = 0; i < trg_node_count; i++) {
get_value(trg_node->counts[i], trg_node->xbuffer + global_count);
get_value(trg_node->counts[i], trg_node->ybuffer + global_count);
get_value(trg_node->counts[i], trg_node->zbuffer + global_count);
set_zero(trg_node->counts[i], trg_node->den_potbuffer + global_count);
global_count += trg_node->counts[i];
}
global_count = 0;
for (int i = 0; i < trg_node_count; i++) {
trg_node->N[i].x = trg_node->xbuffer + global_count;
trg_node->N[i].y = trg_node->ybuffer + global_count;
trg_node->N[i].z = trg_node->zbuffer + global_count;
trg_node->N[i].den_pot = trg_node->den_potbuffer + global_count;
trg_node->N[i].num_pts = trg_node->counts[i];
global_count += trg_node->counts[i];
}
/* Assign random values to all the source points */
global_count = 0;
for (int i = 0; i < src_node_count; i++) {
get_value(src_node->counts[i], src_node->xbuffer + global_count);
get_value(src_node->counts[i], src_node->ybuffer + global_count);
get_value(src_node->counts[i], src_node->zbuffer + global_count);
get_value(src_node->counts[i], src_node->den_potbuffer + global_count);
global_count += src_node->counts[i];
}
global_count = 0;
for (int i = 0; i < src_node_count; i++) {
src_node->N[i].x = src_node->xbuffer + global_count;
src_node->N[i].y = src_node->ybuffer + global_count;
src_node->N[i].z = src_node->zbuffer + global_count;
src_node->N[i].den_pot = src_node->den_potbuffer + global_count;
src_node->N[i].num_pts = src_node->counts[i];
global_count += src_node->counts[i];
}
for (int i = 0; i < trg_node_count; i++) {
ulist->L[i].node_list = ulist->nodelist_buffer + ulist->offsets[i];
ulist->L[i].num_listnodes = ulist->counts[i];
}
printf(“random number assigned inhost memory”);
//…for allocating and copying memory to device
int BLOCK_SIZE=8;
int threadsPerBlock= BLOCK_SIZE * BLOCK_SIZE;
//for target
size_t size=0; //just chk size_t is predefined
Node N[trg_node_count]; //to be passed as argument to multiplicat kernel
for(int i=0; i<trg_node_count;i++)
{
size=trg_node->N[i].num_pts*sizeof(real_t); //assigning sapce to node to the nearest multiple of thredsperblock
N[i].num_pts=trg_node->N[i].num_pts;
cutilSafeCall(cudaMalloc((void**)&N[i].x,size));
cutilSafeCall(cudaMalloc((void**)&N[i].y,size));
cutilSafeCall(cudaMalloc((void**)&N[i].z,size));
cutilSafeCall(cudaMalloc((void**)&N[i].den_pot,size));
cutilSafeCall(cudaMemcpy(N[i].x,trg_node->N[i].x,size,cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(N[i].y,trg_node->N[i].y,size,cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(N[i].z,trg_node->N[i].z,size,cudaMemcpyHostToDevice));
//cudaMemcpy(dev_ptr_trg_den[i],trg_node->N[i].den_pot,size,cudaMemcpyHostToDevice); //to be calculted
}
//…
//for source
AllNodes src; //to be passed to kernel
//src.counts=src_node->counts;
src.num_nodes=src_node->num_nodes;
size=src.num_nodes*sizeof(Node);
//printf (“%s \n”, “A string”); //debuggin tym
cutilSafeCall(cudaMalloc((void**)&src.N,size));
//cutilSafeCall(cudaMemcpy((void**)&src.N,src_node->N,size,cudaMemcpyHostToDevice)); //ques if num_pts in each node gets copied
for(int i=0; i<src_node_count;i++)
{
size=src_node->N[i].num_pts*sizeof(real_t);
cutilSafeCall(cudaMalloc((void**)&src.N[i].x,size));
cutilSafeCall(cudaMalloc((void**)&src.N[i].y,size));
cutilSafeCall(cudaMalloc((void**)&src.N[i].z,size));
cutilSafeCall(cudaMalloc((void**)&src.N[i].den_pot,size));
cutilSafeCall(cudaMemcpy(src.N[i].x,src_node->N[i].x,size,cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(src.N[i].y,src_node->N[i].y,size,cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(src.N[i].z,src_node->N[i].z,size,cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(src.N[i].den_pot,src_node->N[i].den_pot,size,cudaMemcpyHostToDevice));
}
//…
//for ulist
List L[trg_node_count];
for(int i=0;i<trg_node_count;i++)
{
L[i].num_listnodes=ulist->L[i].num_listnodes;
size=ulist->L[i].num_listnodes*sizeof(int);
cutilSafeCall(cudaMalloc((void**)&L[i].node_list,size));
cutilSafeCall(cudaMemcpy(L[i].node_list,ulist->L[i].node_list,size,cudaMemcpyHostToDevice));
}
printf(“\n memory allocation done for device \n”);
//memory allocation done…
dim3 dimBlock;
dimBlock.x=BLOCK_SIZE;
dimBlock.y=BLOCK_SIZE;
for(int i=0;i<trg_node_count;i++)
{
printf(“enterin for loop %i”,i);
size=src_node->N[i].num_pts * sizeof(real_t);
dim3 dimgrid;
dimgrid.x= (trg_node->N[i].num_pts + threadsPerBlock - 1) / threadsPerBlock ; //need to ckeck again…if num blocks
dir_eval<<<dimgrid,dimBlock>>>(N[i],L[i],src,threadsPerBlock);
printf(" copying memeory from device to host \n");
cutilSafeCall(cudaMemcpy(trg_node->N[i].den_pot,N[i].den_pot,size,cudaMemcpyDeviceToHost));
}
/*****************
IN LAST LINE OF ABOVE FOR LOOP I M GETTING THE ERROR…driver.cu(255) : cudaSafeCall() Runtime API error : unspecified launch failure.
********************/
//freeing memory
//targets and ulists
for(int i=0; i<trg_node_count;i++)
{
cudaFree(N[i].x);
cudaFree(N[i].y);
cudaFree(N[i].z);
cudaFree(N[i].den_pot);
cudaFree(L[i].node_list);
}
//source
cudaFree(src.N);
for (int i=0; i<src_node_count; i++)
{
cudaFree(src.N[i].x);
cudaFree(src.N[i].y);
cudaFree(src.N[i].z);
cudaFree(src.N[i].den_pot);
}
//***********************************************
//kernel its another file but i am pasting it here
#ifndef M_PI
#define M_PI 3.1415926535897932385
#define OOFP 1.0/(4.0 * M_PI)
host device int nearthreadsPerBlock(int num,int threadsPerBlock)
{
while (num%threadsPerBlock!=0) {
num++;
}
return num;
}
global void dir_eval(Node A,List B,AllNodes C,int threadsPerBlock)
{
int i=blockIdx.x * threadsPerBlock + (threadIdx.y*blockDim.x)+threadIdx.x;
//printf(“%i”,i);
if(i<A.num_pts)
{
A.den_pot[i]=7; //initializing it to 0… actually this is to be found
}
for(int j=0;j<B.num_listnodes;j++)
{
A.den_pot[i]=A.den_pot[i]+1;
for(int k=0;k<nearthreadsPerBlock(C.N[B.node_list[j]].num_pts,threadsPe
rBlock)/threadsPerBlock;k++)
{
shared float4 point_src[64] ; //number of threads in a block
int l=i%threadsPerBlock;
point_src[l].x=C.N[B.node_list[j]].x[(threadsPerBlock*k)+l]; // each thread puts one paoints stuff in shared memory
point_src[l].y=C.N[B.node_list[j]].y[k*threadsPerBlock+l]; // each thread puts one paoints stuff in shared memory
point_src[l].z=C.N[B.node_list[j]].z[k*threadsPerBlock+l]; // each thread puts one paoints stuff in shared memory
point_src[l].w=C.N[B.node_list[j]].den_pot[k*threadsPerBlock
+l]; // each thread puts one paoints stuff in shared memory
__syncthreads(); //to enure that all threads 've loaded shard memory
for(int m=0;m< threadsPerBlock;m++)
{
if(i<A.num_pts){
real_t x,y,z,r,r2;
//oofp = (real_t) (1.0/(4.0 * M_PI));
x=A.x[i]-point_src[m].x;
y=A.y[i]-point_src[m].y;
z=A.z[i]-point_src[m].z;
r2=xx + yy + z*z;
r=sqrt(r2); //assuming sqrt predefined
A.den_pot[i]+=(OOFP/r)*point_src[m].w; //finall answer
}
}
__syncthreads(); //to ensure all threads are done with this calc
}
}
}
printf(“done”);
fout = fopen (“direct_japnik”, “w”);
for (int i = 0; i < trg_node_count; i++) {
for (int j = 0; j < trg_node->N[i].num_pts; j++) {
fprintf (fout, "%lf ", trg_node->N[i].den_pot[j]);
}
fprintf (fout, “\n”);
}
fclose (fout);
//stopwatch_destroy (timer);
free_lists(ulist);
free_nodes(src_node); free_nodes(trg_node);
free(ulist);
free(src_node); free(trg_node);
return 0;
}
/* ----------------------------------------------------------------------------------------------------------
- eof
*/
/*
-
japnik.h
-
Created by Japnik Singh on 5/26/10.
-
Copyright 2010 MyCompanyName. All rights reserved.
*/
[/codebox]