Hello,
I have just started developing CUDA applications. I compiled the code example posted at the end of this message with
Nsight and NVIDIA CUDA Toolkit version 5.0 and libcudart.so version 4.1.28. Running it on the same machine it was built on (GeForce GTX 580M) works without problems.
However, running it on a cluster (Tesla C2075; the installed libcudart.so version is 4.2.9) fails in the cudaMalloc and cudaMemcpy calls.
The error that occurs is: “device kernel image is invalid”
How can we solve this?
// example1.cpp : Defines the entry point for the console application.
//
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <cuda.h>
#include <cutil.h>
// Kernel: squares the first N elements of `a` in place.
//
// Each thread handles exactly one element, selected by its flat 1-D index
// blockIdx.x * blockDim.x + threadIdx.x. Threads whose index is at or beyond
// N return without touching memory, so the launch may contain more threads
// than there are elements.
//
//   a  array of at least N floats, dereferenced from device code
//   N  number of elements to square
__global__ void square_array(float *a, int N)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;

    // Tail guard: surplus threads in the last block have nothing to do.
    if (gid >= N) return;

    const float v = a[gid];
    a[gid] = v * v;
}
// Host entry point.
//
// Usage: <program> N block_size [-p]
//   N           number of float elements to process (must be > 0)
//   block_size  threads per block used for the kernel launch (must be > 0)
//   -p          optional third argument; print every element of the result
//
// Fills two host arrays with 0..N-1, copies both to the device, squares the
// first one in place with square_array, copies both back and optionally
// prints the first. The second array is only round-tripped through the
// device; no kernel touches it.
//
// Returns EXIT_FAILURE on bad arguments or host allocation failure,
// EXIT_SUCCESS otherwise. CUDA API failures are handed to CUDA_SAFE_CALL
// (from cutil.h).
// NOTE(review): confirm CUDA_SAFE_CALL still checks the result in non-debug
// builds of the cutil version in use.
int main(int argc, char *argv[])
{
    // Both positional arguments are required; reading argv[1]/argv[2]
    // without this check dereferences a NULL pointer when they are missing.
    if (argc < 3) {
        fprintf(stderr, "Usage: %s N block_size [-p]\n",
                argc > 0 ? argv[0] : "example1");
        return EXIT_FAILURE;
    }

    const int N = atoi(argv[1]);
    const int block_size = atoi(argv[2]);
    // atoi yields 0 for non-numeric text. A zero block size would divide by
    // zero below, and a non-positive N makes the byte count meaningless.
    if (N <= 0 || block_size <= 0) {
        fprintf(stderr, "N and block_size must be positive integers\n");
        return EXIT_FAILURE;
    }

    const size_t size = (size_t)N * sizeof(float);

    // Host arrays.
    float *a_h = (float *)malloc(size);
    float *b_h = (float *)malloc(size);
    if (a_h == NULL || b_h == NULL) {
        fprintf(stderr, "Failed to allocate %lu bytes of host memory\n",
                (unsigned long)size);
        free(a_h);
        free(b_h);
        return EXIT_FAILURE;
    }

    // Device arrays.
    float *a_d = NULL;
    float *b_d = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **) &a_d, size));
    CUDA_SAFE_CALL(cudaMalloc((void **) &b_d, size));

    // Initialize host arrays and copy them to the device.
    for (int i = 0; i < N; i++) {
        a_h[i] = (float)i;
        b_h[i] = (float)i;
    }
    CUDA_SAFE_CALL(cudaMemcpy(a_d, a_h, size, cudaMemcpyHostToDevice));
    CUDA_SAFE_CALL(cudaMemcpy(b_d, b_h, size, cudaMemcpyHostToDevice));

    // Launch enough blocks to cover all N elements (ceiling division written
    // without an addition that could overflow int).
    const int n_blocks = N / block_size + (N % block_size != 0 ? 1 : 0);
    square_array <<< n_blocks, block_size >>> (a_d, N);
    // A kernel launch does not return a status; launch failures have to be
    // fetched explicitly, otherwise they go unnoticed here.
    CUDA_SAFE_CALL(cudaGetLastError());

    // Retrieve results from the device into the host arrays.
    CUDA_SAFE_CALL(cudaMemcpy(a_h, a_d, size, cudaMemcpyDeviceToHost));
    CUDA_SAFE_CALL(cudaMemcpy(b_h, b_d, size, cudaMemcpyDeviceToHost));

    // Print results only when exactly one extra argument, "-p", was given.
    if (argc == 4 && strcmp(argv[3], "-p") == 0) {
        printf("\nThe results are:\n");
        for (int i = 0; i < N; i++) {
            printf("a[%d] = %f\n", i, a_h[i]);
        }
    }

    // Cleanup: release every host and device allocation made above.
    free(a_h);
    free(b_h);
    CUDA_SAFE_CALL(cudaFree(a_d));
    CUDA_SAFE_CALL(cudaFree(b_d));

    return EXIT_SUCCESS;
}