I'm trying to get some CUDA code running on a Tesla C870 with CUDA 1.0, but after I launch the kernel, printf("cudaMalloc: %s\n",cudaGetErrorString(cudaGetLastError())); prints "unspecified launch error". I've minimized the kernel to the point where it essentially does nothing: the host allocates device memory, calls the empty kernel, and then reads some data back out. Here's the code:
[codebox]#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <cutil.h>
#define NTHREADS 4
#include <kernel.cu>
/* Host-side setup routine defined in another translation unit.
 * It receives the host position and velocity arrays, the lattice parameter
 * mm, pointers through which ts and tscale are returned, the two host force
 * buffers, and a thread count.
 * NOTE(review): what it writes into each array is not visible here -- confirm
 * against its definition. */
extern void initialize(float *x, float *y, float *z,
                       float *v_x, float *v_y, float *v_z,
                       int mm, float *ts, float *tscale,
                       float *sh_force, float *sh_force2,
                       int nthreads);
/*
 * Host driver for the mdsim kernel.
 *
 * Steps:
 *   1. Allocate host and device buffers for positions, velocities and the two
 *      force arrays.
 *   2. Fill the host buffers via initialize() and upload them to the device.
 *   3. Launch mdsim with one block of NTHREADS threads.
 *   4. Copy positions and velocities back and print the first 20 entries.
 *
 * Fixes relative to the posted version:
 *   - d_sh_force2 is now allocated with 3*PARTSIZE*NTHREADS floats, matching
 *     the size of the host-to-device copy. It was previously allocated with
 *     only 3*PARTSIZE floats, so the copy overran the device buffer.
 *   - Every buffer is allocated exactly once (the duplicated block of
 *     malloc/cudaMalloc calls leaked the first set of buffers).
 *   - Identifiers, operators and string quotes mangled by the forum markup
 *     are restored.
 *
 * NOTE(review): in the SDK's cutil.h, CUDA_SAFE_CALL is believed to check the
 * return code only in _DEBUG builds -- confirm, and build with debugging
 * enabled when hunting errors like this one.
 */
__host__ int main(int argc, char **argv) {
    CUT_DEVICE_INIT();

    const int mm = 8;
    const int PARTSIZE = mm * mm * mm * 4;
    const float den = 0.83134f;
    const float side = pow(PARTSIZE / den, 0.3333333f);
    float ts, tscale;   /* filled in by initialize() */
    int i;

    /* Byte sizes shared by the matching host and device buffers. */
    const size_t coordBytes  = sizeof(float) * PARTSIZE;
    const size_t forceBytes  = sizeof(float) * 3 * PARTSIZE;
    const size_t force2Bytes = sizeof(float) * 3 * PARTSIZE * NTHREADS;

    /* Host buffers. */
    float *final_x   = (float *)malloc(coordBytes);
    float *final_y   = (float *)malloc(coordBytes);
    float *final_z   = (float *)malloc(coordBytes);
    float *x         = (float *)malloc(coordBytes);
    float *y         = (float *)malloc(coordBytes);
    float *z         = (float *)malloc(coordBytes);
    float *v_x       = (float *)malloc(coordBytes);
    float *v_y       = (float *)malloc(coordBytes);
    float *v_z       = (float *)malloc(coordBytes);
    float *sh_force  = (float *)malloc(forceBytes);
    float *sh_force2 = (float *)malloc(force2Bytes);

    if (!final_x || !final_y || !final_z || !x || !y || !z ||
        !v_x || !v_y || !v_z || !sh_force || !sh_force2) {
        fprintf(stderr, "host memory allocation failed\n");
        return 1;
    }

    /* Device buffers; each one has the same size as its host counterpart. */
    float *d_sh_force, *d_sh_force2, *d_x, *d_y, *d_z, *d_vx, *d_vy, *d_vz;
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_sh_force,  forceBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_sh_force2, force2Bytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_x,  coordBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_y,  coordBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_z,  coordBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_vx, coordBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_vy, coordBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_vz, coordBytes));
    printf("cudaMalloc: %s\n", cudaGetErrorString(cudaGetLastError()));

    initialize(x, y, z, v_x, v_y, v_z, mm, &ts, &tscale, sh_force, sh_force2, NTHREADS);

    /* Upload the initial state. */
    CUDA_SAFE_CALL(cudaMemcpy(d_x,  x,   coordBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_y,  y,   coordBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_z,  z,   coordBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_vx, v_x, coordBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_vy, v_y, coordBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_vz, v_z, coordBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_sh_force,  sh_force,  forceBytes,  cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_sh_force2, sh_force2, force2Bytes, cudaMemcpyHostToDevice));
    printf("cudaMemcpy: %s\n", cudaGetErrorString(cudaGetLastError()));

    dim3 grid(1, 1, 1);
    dim3 threads(NTHREADS, 1, 1);
    printf("Entering kernel.\n");
    mdsim<<<grid, threads, 0>>>(mm, side, d_x, d_y, d_z, d_vx, d_vy, d_vz,
                                d_sh_force, d_sh_force2, ts, tscale);
    /* The first query reports launch-configuration errors. */
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    /* Errors raised while the kernel runs only become visible after a sync. */
    cudaThreadSynchronize();
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));

    /* Read results back; the digits are progress markers for each copy. */
    printf("0");
    CUDA_SAFE_CALL(cudaMemcpy(final_x, d_x, coordBytes, cudaMemcpyDeviceToHost)); printf("1");
    CUDA_SAFE_CALL(cudaMemcpy(final_y, d_y, coordBytes, cudaMemcpyDeviceToHost)); printf("2");
    CUDA_SAFE_CALL(cudaMemcpy(final_z, d_z, coordBytes, cudaMemcpyDeviceToHost)); printf("3");
    /* The velocities are read back into x, y, z, reusing those host buffers. */
    CUDA_SAFE_CALL(cudaMemcpy(x, d_vx, coordBytes, cudaMemcpyDeviceToHost)); printf("4");
    CUDA_SAFE_CALL(cudaMemcpy(y, d_vy, coordBytes, cudaMemcpyDeviceToHost)); printf("5");
    CUDA_SAFE_CALL(cudaMemcpy(z, d_vz, coordBytes, cudaMemcpyDeviceToHost)); printf("6");
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    printf("Kernel complete\n");

    printf("\n");
    for (i = 0; i < 20; i++) printf("%f %f %f\n", final_x[i], final_y[i], final_z[i]);
    printf("\n");
    for (i = 0; i < 20; i++) printf("%f %f %f\n", x[i], y[i], z[i]);

    CUDA_SAFE_CALL(cudaFree(d_x));
    CUDA_SAFE_CALL(cudaFree(d_y));
    CUDA_SAFE_CALL(cudaFree(d_z));
    CUDA_SAFE_CALL(cudaFree(d_vx));
    CUDA_SAFE_CALL(cudaFree(d_vy));
    CUDA_SAFE_CALL(cudaFree(d_vz));
    CUDA_SAFE_CALL(cudaFree(d_sh_force2));
    CUDA_SAFE_CALL(cudaFree(d_sh_force));

    free(sh_force);
    free(sh_force2);
    free(x); free(y); free(z);
    free(v_x); free(v_y); free(v_z);
    free(final_x); free(final_y); free(final_z);

    printf("run complete\n");
    CUT_EXIT(argc, argv);
    return (0);
}
[/codebox]
The kernel code is:
[codebox]/*
 * mdsim -- kernel stub, reduced to an empty body for debugging.
 *
 * Parameters: the scalars mm, side, ts and tscale, plus device pointers for
 * positions (x, y, z), velocities (v_x, v_y, v_z) and the two force buffers
 * (sh_force, sh_force2). None of them is read or written here.
 *
 * The __global__ qualifier is required for the <<<...>>> launch syntax; the
 * forum markup had reduced it to "global".
 */
__global__ void mdsim(int mm, float side, float *x, float *y, float *z,
                      float *v_x, float *v_y, float *v_z,
                      float *sh_force, float *sh_force2,
                      float ts, float tscale) {
    /* Intentionally empty. */
}[/codebox]
Does anyone know what could cause this? Could it be that too much memory is being allocated?