K20c: problem with dynamic parallelism

I have a device with compute capability 3.5, and I’m testing dynamic parallelism. I have written a simple example that compiles but appears to do nothing when run. I have CUDA 5 installed.

The example is contained in a single .cu file:

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>

using namespace std;

// Forward declarations

// Parent kernel: launches process_kernel from device code.
__global__ void process (int *data);
// Child kernel: each thread prints one element of data.
__global__ void process_kernel (int *data);

// NOTE(review): declared but neither defined nor called in the visible code;
// confirm whether it is still needed.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);

// Terminates the program with a readable message when a CUDA runtime call
// did not return cudaSuccess.
//   status: value returned by the CUDA call
//   what:   short label identifying the call, used in the message
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Host entry point: selects device 0, prints its name, uploads the values
// 1..10 and launches the parent kernel `process` with one block of 10 threads.
// Returns 0 on success; exits with EXIT_FAILURE on any CUDA error.
int main()
{
    const int kCount = 10;
    int data[kCount];
    for (int ii = 0; ii < kCount; ii++) data[ii] = ii + 1;

    // Select the device before touching device memory so that every
    // allocation below is guaranteed to live on the chosen GPU.
    checkCuda(cudaSetDevice(0), "cudaSetDevice");

    int devicecurrent = 0;
    checkCuda(cudaGetDevice(&devicecurrent), "cudaGetDevice");

    cudaDeviceProp prop;
    checkCuda(cudaGetDeviceProperties(&prop, devicecurrent), "cudaGetDeviceProperties");
    printf("\n%s\n", prop.name);

    // Kernel launches from device code need compute capability 3.5 or newer.
    if (prop.major < 3 || (prop.major == 3 && prop.minor < 5))
    {
        fprintf(stderr, "Device %d has compute capability %d.%d; dynamic parallelism needs 3.5+\n",
                devicecurrent, prop.major, prop.minor);
        return EXIT_FAILURE;
    }

    int *data_g = NULL;
    checkCuda(cudaMalloc((void**)&data_g, sizeof(int) * kCount), "cudaMalloc");
    checkCuda(cudaMemcpy(data_g, data, sizeof(int) * kCount, cudaMemcpyHostToDevice), "cudaMemcpy");

    process <<<1, kCount>>> (data_g);
    // A launch does not return a status: configuration errors show up in
    // cudaGetLastError(), execution errors at the next synchronizing call.
    checkCuda(cudaGetLastError(), "process launch");
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");

    checkCuda(cudaFree(data_g), "cudaFree");

    return 0;
}

// Child kernel: every thread prints its thread index and the element of
// `data` found at that index.
//
// Indexing uses threadIdx.x only, so the kernel expects a 1-D block and
// `data` must hold at least blockDim.x device-readable ints.
__global__ void process_kernel (int *data)
{
    int idx = threadIdx.x;
    printf("Hilo %d Data %d\n",idx,data[idx]);
}

// Parent kernel: threads of block 0 print a marker, launch process_kernel as
// a child grid of one block with two threads, and wait for it to finish.
//
// Launching a kernel from device code needs a device of compute capability
// 3.5+ and a build with relocatable device code linked against the device
// runtime (nvcc -rdc=true ... -lcudadevrt).
//
// data: device pointer forwarded unchanged to the child kernel.
__global__ void process (int *data)
{
    // NOTE(review): the guard tests blockIdx.x only, so each thread of block 0
    // performs its own child launch; add a threadIdx.x test if a single
    // launch per block is intended.
    if (blockIdx.x == 0)
    {
        printf("Block 0\n");
        process_kernel <<<1,2>>> (data);

        // Device-side launches report failures through cudaGetLastError(),
        // just like host-side ones; print the numeric code so it is visible.
        cudaError_t launchStatus = cudaGetLastError();
        if (launchStatus != cudaSuccess)
        {
            printf("process: child launch failed with error %d\n", (int)launchStatus);
        }

        cudaError_t syncStatus = cudaDeviceSynchronize();
        if (syncStatus != cudaSuccess)
        {
            printf("process: cudaDeviceSynchronize failed with error %d\n", (int)syncStatus);
        }
    }
}

The command I use to compile this code is:

$ nvcc -arch=compute_35 kernel.cu -o kernel

I wonder whether this command is correct, or whether something is missing or wrong, because I cannot get this important feature of the K20c to work.

Thank you.

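Check whether your cudaDeviceSynchronize() calls return errors (print cudaGetErrorString() of the returned cudaError_t on the host), and call cudaGetLastError() right after each kernel launch. Note also that dynamic parallelism requires relocatable device code and linking against the device runtime library, for example: nvcc -arch=sm_35 -rdc=true kernel.cu -o kernel -lcudadevrt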