Hi yesterday I updated to MAC OS X to version 10.6.5, after that my old programs are having Bus Error. All of them were working fine. If someone has a resolution to this problem I would really appreciate.
Regards,
Arup
Hi yesterday I updated to MAC OS X to version 10.6.5, after that my old programs are having Bus Error. All of them were working fine. If someone has a resolution to this problem I would really appreciate.
Regards,
Arup
Hi yesterday I updated to MAC OS X to version 10.6.5, after that my old programs are having Bus Error. All of them were working fine. If someone has a resolution to this problem I would really appreciate.
Regards,
Arup
Just wanted to update the forum where exactly I am getting the error, it is when I am doing cudaMemcpy of cudaMemcpyDeviceToHost. Please see my code below. Compilation is working fine, but during execution it is failing with “Bus Error”.
If I comment the following line, I do not get any error.
cudaMemcpy(h_a,d_a,sizeof(float)*N,cudaMemcpyDeviceToHost);
I would really appreciate if someone could help me.
#include <stdio.h>
#include "ArrayProcessing.cuh"
#include <cuda_runtime.h>
#include <cuda.h>
#include "cuPrintf.cu"
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
//faster kernel execution using shared memory
__global__ void reverseArrayBlock(float * d_out, float * d_in)
{
extern __shared__ float s_data[];
int inOffset = blockDim.x * blockIdx.x;
int in = inOffset + threadIdx.x;
//print in varable
cuPrintf("Thread Number: %f\n", in);
//check the array index
int arrayIdx = gridDim.x;
cuPrintf("Array Index: %f \n", arrayIdx);
//Load one element per thread from device memory and store it
//in reverse order into temporary shared memory
s_data[blockDim.x - 1 - threadIdx.x] = d_in[in];
//Block untill all threads in the block have written their data in shared memory
__syncthreads();
// write data from shared memory in forward order, but to the reverse block offset as before
int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);
int out = outOffset + threadIdx.x;
d_out[out]= s_data[threadIdx.x];
}
__global__ void device_Array_Increment_Kernel(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N)
{
cuPrintf("Thread Index: %d\n", idx);
a[idx]=a[idx]+1.f;
}
}
float * Array_Processing(float * a_d, int N)
{
float * b_d; //array to be assigned to device
float * c_d;
float * d_a;
float * h_a;
int blocksize = 4;
int nBlocks = N/blocksize + (N%blocksize == 0?0:1);
int sharedMemSize = blocksize * sizeof(float);
printf("Number of blocks in the grid:%d\n", nBlocks);
printf("Number of threads in each block:%d\n", blocksize);
//pointer memory allocation for incrementArray
cudaMalloc((void **) &b_d, sizeof(float)*N);
cudaMalloc((void **) &c_d, sizeof(float)*N);
//copy the pointer from c++ code to device pointer
cudaMemcpy(b_d,a_d, sizeof(float)*N,cudaMemcpyHostToDevice);
//Initialize the display cuPrintf
cudaPrintfInit();
//call the kernel to increment array by one unit
//device_Array_Increment_Kernel <<< nBlocks, blocksize >>> (b_d,N);
//pointer memory allocation for reverseArray
cudaMalloc((void **) &d_a, sizeof(float)*N);
cudaMalloc((void **) &h_a, sizeof(float)*N);
//call kernel to reverse the array.
reverseArrayBlock <<< nBlocks, blocksize, sharedMemSize >>> (d_a, b_d);
cudaMemcpy(h_a,d_a,sizeof(float)*N,cudaMemcpyDeviceToHost);
// display the device's greeting
cudaPrintfDisplay();
cudaPrintfEnd();
printf("Array Processing from device complete. Transferring control to cpp file.!!!\n");
return c_d;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,
cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
Just wanted to update the forum where exactly I am getting the error, it is when I am doing cudaMemcpy of cudaMemcpyDeviceToHost. Please see my code below. Compilation is working fine, but during execution it is failing with “Bus Error”.
If I comment the following line, I do not get any error.
cudaMemcpy(h_a,d_a,sizeof(float)*N,cudaMemcpyDeviceToHost);
I would really appreciate if someone could help me.
#include <stdio.h>
#include "ArrayProcessing.cuh"
#include <cuda_runtime.h>
#include <cuda.h>
#include "cuPrintf.cu"
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
//faster kernel execution using shared memory
__global__ void reverseArrayBlock(float * d_out, float * d_in)
{
extern __shared__ float s_data[];
int inOffset = blockDim.x * blockIdx.x;
int in = inOffset + threadIdx.x;
//print in varable
cuPrintf("Thread Number: %f\n", in);
//check the array index
int arrayIdx = gridDim.x;
cuPrintf("Array Index: %f \n", arrayIdx);
//Load one element per thread from device memory and store it
//in reverse order into temporary shared memory
s_data[blockDim.x - 1 - threadIdx.x] = d_in[in];
//Block untill all threads in the block have written their data in shared memory
__syncthreads();
// write data from shared memory in forward order, but to the reverse block offset as before
int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);
int out = outOffset + threadIdx.x;
d_out[out]= s_data[threadIdx.x];
}
__global__ void device_Array_Increment_Kernel(float *a, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx<N)
{
cuPrintf("Thread Index: %d\n", idx);
a[idx]=a[idx]+1.f;
}
}
float * Array_Processing(float * a_d, int N)
{
float * b_d; //array to be assigned to device
float * c_d;
float * d_a;
float * h_a;
int blocksize = 4;
int nBlocks = N/blocksize + (N%blocksize == 0?0:1);
int sharedMemSize = blocksize * sizeof(float);
printf("Number of blocks in the grid:%d\n", nBlocks);
printf("Number of threads in each block:%d\n", blocksize);
//pointer memory allocation for incrementArray
cudaMalloc((void **) &b_d, sizeof(float)*N);
cudaMalloc((void **) &c_d, sizeof(float)*N);
//copy the pointer from c++ code to device pointer
cudaMemcpy(b_d,a_d, sizeof(float)*N,cudaMemcpyHostToDevice);
//Initialize the display cuPrintf
cudaPrintfInit();
//call the kernel to increment array by one unit
//device_Array_Increment_Kernel <<< nBlocks, blocksize >>> (b_d,N);
//pointer memory allocation for reverseArray
cudaMalloc((void **) &d_a, sizeof(float)*N);
cudaMalloc((void **) &h_a, sizeof(float)*N);
//call kernel to reverse the array.
reverseArrayBlock <<< nBlocks, blocksize, sharedMemSize >>> (d_a, b_d);
cudaMemcpy(h_a,d_a,sizeof(float)*N,cudaMemcpyDeviceToHost);
// display the device's greeting
cudaPrintfDisplay();
cudaPrintfEnd();
printf("Array Processing from device complete. Transferring control to cpp file.!!!\n");
return c_d;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg,
cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}