Hello,
Here is the code, which works fine with -deviceemu option (result is 17) and
doesn’t work on real GPU. Please help, what’s wrong and how to fix it?
[codebox]#include <stdio.h>
#include <cuda.h>
#define MAX_SIZE 255
global void strlenDevice(char *str, int *len) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
// Process each char in each thread
// If char is ‘\0’ then it’s possibly end-of-string
// Choose a minimal of that chars
if (i < MAX_SIZE)
if (str[i] == '\0') if ( i< *len) *len =i;
}
int main(void) {
char *device_str; // pointers to device memory
char str[MAX_SIZE]; // string
int *pLen = NULL;
int len;
// allocate string on device
cudaMalloc((void **) &device_str, MAX_SIZE);
// allocate length
cudaMalloc( (void**) &pLen, 4 );
// set string
strcpy(str, “Hello CUDA World!”);
// len = maximal value
len = MAX_SIZE;
// send data from host to device
cudaMemcpy (device_str, str, MAX_SIZE, cudaMemcpyHostToDevice);
cudaMemcpy (pLen, &len, 4, cudaMemcpyHostToDevice );
// data copied on device, invoking kernel
strlenDevice <<< 1, 256 >>> (device_str, pLen);
// retrieve data from device
cudaMemcpy( &len, pLen, 4, cudaMemcpyDeviceToHost );
// print strlen
printf ("strlen = %d", len);
}[/codebox]