when i copy from " Unified Memory for CUDA Beginners" the code and compile with cmake
# CMakeLists.txt to build hellocuda.cu
cmake_minimum_required(VERSION 2.8)
find_package(CUDA QUIET REQUIRED)
# Specify binary name and source file to build it from
cuda_add_executable(
current
m4emtest.cu) << renamed the democode
the program crashes between "ST wo " AND "ST hree "
int main(void)
{
int N = 1<<20;
float *x, *y;
printf("ST art \n");
// Allocate Unified Memory -- accessible from CPU or GPU
cudaMallocManaged(&x, N*sizeof(float));
cudaMallocManaged(&y, N*sizeof(float));//1048576
// initialize x and y arrays on the host
printf("ST wo \n");
for (int i = 0; i < N; i++) {
x[i] = 1.0f;
y[i] = 2.0f;
}
printf("ST hree \n");
cudaMallocManaged seems not implemented (yet),
work around use “cudaMalloc” and the couple “cudaMemcpyHostToDevice” and “cudaMemcpyDeviceToHost”
demo program does work on Orin Nano Developerkit
#include <unistd.h>
#include <stdio.h>
/* we need these includes for CUDA's random number stuff */
#include <curand.h>
#include <curand_kernel.h>
#define N 25
#define MAX 501
/* this GPU kernel function is used to initialize the random states */
__global__ void init(unsigned int seed, curandState_t* states) {
/* we have to initialize the state */
curand_init(seed, /* the seed can be the same for each core, here we pass the time in from the CPU */
blockIdx.x, /* the sequence number should be different for each core (unless you want all
cores to get the same sequence of numbers for some reason - use thread id! */
0, /* the offset is how much extra we advance in the sequence for each call, can be 0 */
&states[blockIdx.x]);
}
/* this GPU kernel takes an array of states, and an array of ints, and puts a random int into each */
__global__ void randoms(curandState_t* states, unsigned int* numbers) {
/* curand works like rand - except that it takes a state as a parameter */
numbers[blockIdx.x] = curand(&states[blockIdx.x]) % 501;
}
int main() {
/* CUDA's random number library uses curandState_t to keep track of the seed value
we will store a random state for every thread */
curandState_t* states;
/* allocate space on the GPU for the random states */
cudaMalloc((void**) &states, N * sizeof(curandState_t));
/* invoke the GPU to initialize all of the random states */
init<<<N, 1>>>(time(0), states);
/* allocate an array of unsigned ints on the CPU and GPU */
unsigned int cpu_nums[N];
unsigned int* gpu_nums;
cudaMalloc((void**) &gpu_nums, N * sizeof(unsigned int));
/* invoke the kernel to get some random numbers */
randoms<<<N, 1>>>(states, gpu_nums);
/* copy the random numbers back */
cudaMemcpy(cpu_nums, gpu_nums, N * sizeof(unsigned int), cudaMemcpyDeviceToHost);
/* print them out */
for (int i = 0; i < N; i++) {
printf("%u\n", cpu_nums[i]);
}
/* free the memory we allocated for the states and numbers */
cudaFree(states);
cudaFree(gpu_nums);
return 0;
}