I have a device with computability 3.5, and I’m testing the dynamic parallelism. I have programmed a simple example that compiles but does nothing to run. I have installed CUDA 5.
The example is hosted in a file .cu
#include
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
using namespace std;
// Definitions
global void process (int *data);
global void process_kernel (int *data);
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);
int main()
{
int *devicecurrent = new int;
int *data_g;
cudaDeviceProp *prop = new cudaDeviceProp();
int data[10];
for (int ii=0;ii<10;ii++) data[ii]=ii+1;
cudaMalloc((void**)&data_g,sizeof(int)*10);
cudaMemcpy(data_g,data,sizeof(int)*10,cudaMemcpyHostToDevice);
cudaSetDevice(0);
cudaGetDevice(devicecurrent);
cudaGetDeviceProperties(prop,*devicecurrent);
printf("\n%s\n",prop->name);
process <<<1,10>>> (data_g);
cudaDeviceSynchronize();
cudaFree(data_g);
}
global void process_kernel (int *data)
{
int idx = threadIdx.x;
printf("Hilo %d Data %d\n",idx,data[idx]);
}
global void process (int *data)
{
if (blockIdx.x == 0)
{
printf("Block 0\n");
process_kernel <<<1,2>>> (data);
cudaDeviceSynchronize();
}
}
The instruccion for compiling this code is:
$ nvcc -arch=compute_35 kernel.cu -o kernel
I wonder if this instruccion is correct, things missing or any errors because I can not use this important feature of K20C.
Thank you.