unspecified launch failure

I'm trying to get some CUDA code running on a Tesla C870 with CUDA 1.0, but after I launch the kernel, printf("cudaMalloc: %s\n",cudaGetErrorString(cudaGetLastError())); prints "unspecified launch failure". I've minimized the kernel to the point where it essentially does nothing: the host allocates device memory, calls the empty kernel, and then reads some data back out. Here's the code:

[codebox]#include <stdio.h>

#include <stdlib.h>

#include <malloc.h>

#include <cutil.h>

#define NTHREADS 4

#include <kernel.cu>

/* Host-side setup routine defined outside this file; main() calls it with
 * host buffers before anything is uploaded to the device. */
extern void initialize(float *x, float *y, float *z,
                       float *v_x, float *v_y, float *v_z,
                       int mm, float *ts, float *tscale,
                       float *sh_force, float *sh_force2,
                       int ntheads);

/*
 * Host driver for the minimal reproduction.
 *
 * Allocates host and device buffers for PARTSIZE particles, fills the host
 * buffers via initialize(), uploads them, launches mdsim on one block of
 * NTHREADS threads, copies positions and velocities back, and prints the
 * first 20 entries of each.
 *
 * Corrections relative to the posted version:
 *  - d_sh_force2 is allocated with the same byte count that is later copied
 *    into it (3 * PARTSIZE * NTHREADS floats). It used to be allocated
 *    without the NTHREADS factor, so the upload asked for four times more
 *    bytes than the device allocation holds.
 *  - Every buffer is allocated exactly once; the duplicated allocation block
 *    leaked the first set of host and device buffers.
 *  - Unused locals (ek, nthreads) are removed; NTHREADS is the only thread
 *    count actually used.
 */
__host__ int main(int argc, char **argv) {
    CUT_DEVICE_INIT();

    const int mm = 8;
    const int PARTSIZE = mm * mm * mm * 4;
    const float den = 0.83134f;
    const float side = pow(PARTSIZE / den, 0.3333333f);
    float ts, tscale;
    int i;

    /* Byte counts shared by the matching host/device allocations and copies,
     * so an allocation and its transfer can never disagree on size. */
    const size_t vecBytes    = sizeof(float) * PARTSIZE;
    const size_t forceBytes  = sizeof(float) * 3 * PARTSIZE;
    const size_t force2Bytes = sizeof(float) * 3 * PARTSIZE * NTHREADS;

    float *sh_force, *sh_force2, *x, *y, *z, *v_x, *v_y, *v_z;
    float *d_sh_force, *d_sh_force2, *d_x, *d_y, *d_z, *d_vx, *d_vy, *d_vz;
    float *final_x, *final_y, *final_z;

    /* Host buffers. */
    final_x   = (float *)malloc(vecBytes);
    final_y   = (float *)malloc(vecBytes);
    final_z   = (float *)malloc(vecBytes);
    sh_force  = (float *)malloc(forceBytes);
    sh_force2 = (float *)malloc(force2Bytes);
    x         = (float *)malloc(vecBytes);
    y         = (float *)malloc(vecBytes);
    z         = (float *)malloc(vecBytes);
    v_x       = (float *)malloc(vecBytes);
    v_y       = (float *)malloc(vecBytes);
    v_z       = (float *)malloc(vecBytes);

    /* Device buffers; each size matches the host buffer copied into it. */
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_sh_force,  forceBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_sh_force2, force2Bytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_x,  vecBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_y,  vecBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_z,  vecBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_vx, vecBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_vy, vecBytes));
    CUDA_SAFE_CALL(cudaMalloc((void **)&d_vz, vecBytes));
    printf("cudaMalloc: %s\n", cudaGetErrorString(cudaGetLastError()));

    initialize(x, y, z, v_x, v_y, v_z, mm, &ts, &tscale, sh_force, sh_force2, NTHREADS);

    /* Upload the initial state. */
    CUDA_SAFE_CALL(cudaMemcpy(d_x,  x,   vecBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_y,  y,   vecBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_z,  z,   vecBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_vx, v_x, vecBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_vy, v_y, vecBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_vz, v_z, vecBytes, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_sh_force,  sh_force,  forceBytes,  cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(d_sh_force2, sh_force2, force2Bytes, cudaMemcpyHostToDevice));
    printf("cudaMemcpy: %s\n", cudaGetErrorString(cudaGetLastError()));

    dim3 grid(1, 1, 1);
    dim3 threads(NTHREADS, 1, 1);

    printf("Entering kernel.\n");
    mdsim<<<grid, threads, 0>>>(mm, side, d_x, d_y, d_z, d_vx, d_vy, d_vz,
                                d_sh_force, d_sh_force2, ts, tscale);
    /* First query reports launch-configuration errors; the query after the
     * synchronize reports errors raised while the kernel was executing. */
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    cudaThreadSynchronize();
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));

    /* Read results back; the digits are progress markers showing which copy
     * was reached. Velocities are read back into x, y and z. */
    printf("0");
    CUDA_SAFE_CALL(cudaMemcpy(final_x, d_x, vecBytes, cudaMemcpyDeviceToHost));
    printf("1");
    CUDA_SAFE_CALL(cudaMemcpy(final_y, d_y, vecBytes, cudaMemcpyDeviceToHost));
    printf("2");
    CUDA_SAFE_CALL(cudaMemcpy(final_z, d_z, vecBytes, cudaMemcpyDeviceToHost));
    printf("3");
    CUDA_SAFE_CALL(cudaMemcpy(x, d_vx, vecBytes, cudaMemcpyDeviceToHost));
    printf("4");
    CUDA_SAFE_CALL(cudaMemcpy(y, d_vy, vecBytes, cudaMemcpyDeviceToHost));
    printf("5");
    CUDA_SAFE_CALL(cudaMemcpy(z, d_vz, vecBytes, cudaMemcpyDeviceToHost));
    printf("6");
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));

    printf("Kernel complete\n");

    printf("\n");
    for (i = 0; i < 20; i++) printf("%f %f %f\n", final_x[i], final_y[i], final_z[i]);
    printf("\n");
    for (i = 0; i < 20; i++) printf("%f %f %f\n", x[i], y[i], z[i]);

    CUDA_SAFE_CALL(cudaFree(d_x));
    CUDA_SAFE_CALL(cudaFree(d_y));
    CUDA_SAFE_CALL(cudaFree(d_z));
    CUDA_SAFE_CALL(cudaFree(d_vx));
    CUDA_SAFE_CALL(cudaFree(d_vy));
    CUDA_SAFE_CALL(cudaFree(d_vz));
    CUDA_SAFE_CALL(cudaFree(d_sh_force2));
    CUDA_SAFE_CALL(cudaFree(d_sh_force));

    free(sh_force);
    free(sh_force2);
    free(x); free(y); free(z);
    free(v_x); free(v_y); free(v_z);
    free(final_x); free(final_y); free(final_z);

    printf("run complete\n");
    CUT_EXIT(argc, argv);
    return 0;
}

[/codebox]

The kernel code is:

[codebox]
/*
 * Kernel stripped down to an empty body for the reproduction: it accepts the
 * full argument list that main() passes but reads and writes nothing.
 */
__global__ void mdsim(int mm, float side,
                      float *x, float *y, float *z,
                      float *v_x, float *v_y, float *v_z,
                      float *sh_force, float *sh_force2,
                      float ts, float tscale) {
}[/codebox]

Anyone know what could cause this? Could it be too much memory allocated?

Does this reproduce with CUDA 2.1?

Have you tried it? What problem do you get with this code?