I’m trying to use cudaLaunch to be able to specify which kernel I’d like to execute on a particular piece of data, and every time I call cudaLaunch, I get a cudaErrorInvalidDeviceFunction error. Any help anyone can provide would be much appreciated.
System/CUDA Info
32-bit machine
CentOS release 5.2
8GB RAM
Cuda 2.2
GeForce 8800 GTX
Code
All the files are in the same directory & compile fine w/no errors. The code should just generate numbers from 0 to 255, print them out, pass them to the kernel to be scaled by 3, then print the output values.
launchTest.cc (main function)
#include <iostream>
#include <stdlib.h>
#include <string>
#include "dat_gpu.h"
#define N_ELEM 256
int main(int argc, char *argv[])
{
int inBuff[N_ELEM];
int outBuff[N_ELEM];
int i = 0;
int dataSize = N_ELEM * sizeof(int);
memset(inBuff, 0, dataSize);
memset(outBuff, 0, dataSize);
printf("Input Vector: ");
for (i = 0; i < N_ELEM; i++) {
inBuff[i] = i;
printf("%d ",inBuff[i]);
}
printf("\n\n");
dat_gpu("scale_vect_op", inBuff, outBuff);
printf("Output Vector: ");
for (i = 0; i < N_ELEM; i++) {
printf("%d ", outBuff[i]);
}
printf("\n\n");
return 0;
}
not shown: dat_gpu.h (just declares dat_gpu function)
dat_gpu.cu
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
#include "scale_vect_op.cu"
extern "C" int dat_gpu(const char * func, int * inData, int * outData) {
int data_size = 256 * sizeof(int);
int * gpu_in_data;
int * gpu_out_data;
cudaSetDevice( cutGetMaxGflopsDeviceId() );
cudaMalloc((void**)&gpu_in_data, data_size);
cudaMalloc((void**)&gpu_out_data, data_size);
cudaMemset(gpu_in_data, 0, data_size);
cudaMemset(gpu_out_data, 0, data_size);
cudaMemcpy(gpu_in_data, inData, data_size, cudaMemcpyHostToDevice);
dim3 gridDim(1, 1, 1);
dim3 blockDim(256, 1, 1);
size_t offset = 0;
cudaConfigureCall(gridDim, blockDim, data_size);
cudaSetupArgument(gpu_in_data, offset);
offset += sizeof(gpu_in_data);
cudaSetupArgument(gpu_out_data, offset);
cudaLaunch(func);
cudaThreadSynchronize();
cudaError_t err = cudaGetLastError();
if(err != cudaSuccess) {
const char * err_str = cudaGetErrorString(err);
printf("ERROR: ");
printf(err_str);
printf("\n");
}
cudaMemcpy(outData, gpu_out_data, data_size, cudaMemcpyDeviceToHost);
cudaFree(gpu_in_data);
cudaFree(gpu_out_data);
cudaThreadExit();
return 0;
}
scale_vect_op.cu
extern "C" __global__ static void scale_vect_op( int * inValues, int * outValues, int scale ) {
extern __shared__ int shared[];
const unsigned int tid = threadIdx.x;
shared[tid] = inValues[tid];
outValues[tid] = shared[tid] * scale;
__syncthreads();
}
Thanks for any help!