Bus Error after MAC OS X update

arupsarkar · November 19, 2010, 5:52am

Hi yesterday I updated to MAC OS X to version 10.6.5, after that my old programs are having Bus Error. All of them were working fine. If someone has a resolution to this problem I would really appreciate.

Regards,
Arup

arupsarkar · November 19, 2010, 5:52am

Hi yesterday I updated to MAC OS X to version 10.6.5, after that my old programs are having Bus Error. All of them were working fine. If someone has a resolution to this problem I would really appreciate.

Regards,
Arup

arupsarkar · November 24, 2010, 5:06am

Just wanted to update the forum where exactly I am getting the error, it is when I am doing cudaMemcpy of cudaMemcpyDeviceToHost. Please see my code below. Compilation is working fine, but during execution it is failing with “Bus Error”.

If I comment the following line, I do not get any error.

cudaMemcpy(h_a,d_a,sizeof(float)*N,cudaMemcpyDeviceToHost);

I would really appreciate if someone could help me.

#include <stdio.h>

#include "ArrayProcessing.cuh"

#include <cuda_runtime.h>

#include <cuda.h>

#include "cuPrintf.cu"

// Simple utility function to check for CUDA runtime errors

void checkCUDAError(const char* msg);

//faster kernel execution using shared memory

__global__ void reverseArrayBlock(float * d_out, float * d_in)

{

   extern __shared__ float s_data[];

   int inOffset = blockDim.x * blockIdx.x;

   int in = inOffset + threadIdx.x;

//print in varable

   cuPrintf("Thread Number: %f\n", in);

//check the array index

   int arrayIdx = gridDim.x;

   cuPrintf("Array Index: %f \n", arrayIdx);

//Load one element per thread from device memory and store it 

   //in reverse order into temporary shared memory

   s_data[blockDim.x - 1 - threadIdx.x] = d_in[in];

//Block untill all threads in the block have written their data in shared memory

   __syncthreads();

// write data from shared memory in forward order, but to the reverse block offset as before

   int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);

int out = outOffset + threadIdx.x;

   d_out[out]= s_data[threadIdx.x];   

}

__global__ void device_Array_Increment_Kernel(float *a, int N)

{

   int idx = blockIdx.x * blockDim.x + threadIdx.x;

   if (idx<N)

   {

      cuPrintf("Thread Index: %d\n", idx);

      a[idx]=a[idx]+1.f;

   } 

}

float * Array_Processing(float * a_d, int N)

{

   float * b_d; //array to be assigned to device

   float * c_d;

   float * d_a;

   float * h_a;

int blocksize = 4;

   int nBlocks = N/blocksize + (N%blocksize == 0?0:1);

int sharedMemSize = blocksize * sizeof(float);

printf("Number of blocks in the grid:%d\n", nBlocks);

   printf("Number of threads in each block:%d\n", blocksize);

//pointer memory allocation for incrementArray

   cudaMalloc((void **) &b_d, sizeof(float)*N);

   cudaMalloc((void **) &c_d, sizeof(float)*N);

   //copy the pointer from c++ code to device pointer

   cudaMemcpy(b_d,a_d, sizeof(float)*N,cudaMemcpyHostToDevice);

//Initialize the display cuPrintf

   cudaPrintfInit();

//call the kernel to increment array by one unit

   //device_Array_Increment_Kernel <<< nBlocks, blocksize >>> (b_d,N);

//pointer memory allocation for reverseArray

   cudaMalloc((void **) &d_a, sizeof(float)*N);

   cudaMalloc((void **) &h_a, sizeof(float)*N);

//call kernel to reverse the array.

   reverseArrayBlock <<< nBlocks, blocksize, sharedMemSize >>> (d_a, b_d);

cudaMemcpy(h_a,d_a,sizeof(float)*N,cudaMemcpyDeviceToHost);

// display the device's greeting

   cudaPrintfDisplay();

   cudaPrintfEnd();

printf("Array Processing from device complete. Transferring control to cpp file.!!!\n");

   return c_d;

}

void checkCUDAError(const char *msg)

{

    cudaError_t err = cudaGetLastError();

    if( cudaSuccess != err) 

    {

        fprintf(stderr, "Cuda error: %s: %s.\n", msg, 

                                  cudaGetErrorString( err) );

        exit(EXIT_FAILURE);

    }                         

}

arupsarkar · November 24, 2010, 5:06am

Just wanted to update the forum where exactly I am getting the error, it is when I am doing cudaMemcpy of cudaMemcpyDeviceToHost. Please see my code below. Compilation is working fine, but during execution it is failing with “Bus Error”.

If I comment the following line, I do not get any error.

cudaMemcpy(h_a,d_a,sizeof(float)*N,cudaMemcpyDeviceToHost);

I would really appreciate if someone could help me.

#include <stdio.h>

#include "ArrayProcessing.cuh"

#include <cuda_runtime.h>

#include <cuda.h>

#include "cuPrintf.cu"

// Simple utility function to check for CUDA runtime errors

void checkCUDAError(const char* msg);

//faster kernel execution using shared memory

__global__ void reverseArrayBlock(float * d_out, float * d_in)

{

   extern __shared__ float s_data[];

   int inOffset = blockDim.x * blockIdx.x;

   int in = inOffset + threadIdx.x;

//print in varable

   cuPrintf("Thread Number: %f\n", in);

//check the array index

   int arrayIdx = gridDim.x;

   cuPrintf("Array Index: %f \n", arrayIdx);

//Load one element per thread from device memory and store it 

   //in reverse order into temporary shared memory

   s_data[blockDim.x - 1 - threadIdx.x] = d_in[in];

//Block untill all threads in the block have written their data in shared memory

   __syncthreads();

// write data from shared memory in forward order, but to the reverse block offset as before

   int outOffset = blockDim.x * (gridDim.x - 1 - blockIdx.x);

int out = outOffset + threadIdx.x;

   d_out[out]= s_data[threadIdx.x];   

}

__global__ void device_Array_Increment_Kernel(float *a, int N)

{

   int idx = blockIdx.x * blockDim.x + threadIdx.x;

   if (idx<N)

   {

      cuPrintf("Thread Index: %d\n", idx);

      a[idx]=a[idx]+1.f;

   } 

}

float * Array_Processing(float * a_d, int N)

{

   float * b_d; //array to be assigned to device

   float * c_d;

   float * d_a;

   float * h_a;

int blocksize = 4;

   int nBlocks = N/blocksize + (N%blocksize == 0?0:1);

int sharedMemSize = blocksize * sizeof(float);

printf("Number of blocks in the grid:%d\n", nBlocks);

   printf("Number of threads in each block:%d\n", blocksize);

//pointer memory allocation for incrementArray

   cudaMalloc((void **) &b_d, sizeof(float)*N);

   cudaMalloc((void **) &c_d, sizeof(float)*N);

   //copy the pointer from c++ code to device pointer

   cudaMemcpy(b_d,a_d, sizeof(float)*N,cudaMemcpyHostToDevice);

//Initialize the display cuPrintf

   cudaPrintfInit();

//call the kernel to increment array by one unit

   //device_Array_Increment_Kernel <<< nBlocks, blocksize >>> (b_d,N);

//pointer memory allocation for reverseArray

   cudaMalloc((void **) &d_a, sizeof(float)*N);

   cudaMalloc((void **) &h_a, sizeof(float)*N);

//call kernel to reverse the array.

   reverseArrayBlock <<< nBlocks, blocksize, sharedMemSize >>> (d_a, b_d);

cudaMemcpy(h_a,d_a,sizeof(float)*N,cudaMemcpyDeviceToHost);

// display the device's greeting

   cudaPrintfDisplay();

   cudaPrintfEnd();

printf("Array Processing from device complete. Transferring control to cpp file.!!!\n");

   return c_d;

}

void checkCUDAError(const char *msg)

{

    cudaError_t err = cudaGetLastError();

    if( cudaSuccess != err) 

    {

        fprintf(stderr, "Cuda error: %s: %s.\n", msg, 

                                  cudaGetErrorString( err) );

        exit(EXIT_FAILURE);

    }                         

}