Segmentation fault (core dumped)

mahyarelc · September 8, 2011, 8:20am

Hi,

I just started programming with CUDA and I’m totally new with the environment.

as i execute the following code (adding to vectors), there is a problem: “Segmentation fault (core dumped)”

how can I fix the problem?

thanks

/* Vector addition: C = A + B.

This sample is a very basic sample that implements element by element

vector addition. It is the same as the sample illustrating Chapter 3

of the programming guide with some additions like error checking.

*/

// Includes

include <stdio.h>

include <cutil_inline.h>

// Variables

float* h_A;

float* h_B;

float* h_C;

float* d_A;

float* d_B;

float* d_C;

bool noprompt = false;

// Functions

void Cleanup(void);

void RandomInit(float*, int);

void ParseArguments(int, char**);

// Device code

global void VecAdd(const float* A, const float* B, float* C, int N)

{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < N)

    C[i] = A[i] + B[i];
}

int GlobalSize = 50000;

// Host code

int main(int argc, char** argv)

{
ParseArguments(argc, argv);
int N = GlobalSize;
printf("Vector addition : size %d\n", N);

size_t size = N * sizeof(float);
// Allocate input vectors h_A and h_B in host memory

malloc((void**)&h_A,size);

malloc((void**)&h_B,size);

// Initialize input vectors
RandomInit(h_A, N);

RandomInit(h_B, N);
// Allocate vectors in device memory

cudaMalloc((void**)&d_A,size);

cudaMalloc((void**)&d_B,size);

// Copy vectors from host memory to device memory

cudaMemcpy(d_A,&h_A,size,cudaMemcpyHostToDevice);

cudaMemcpy(d_B,&h_B,size,cudaMemcpyHostToDevice);

// Set the kernel arguments
int threadsPerBlock = 256;

int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;
// Invoke kernel
VecAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, N);
cutilCheckMsg(“kernel launch failure”);

// Copy result from device memory to host memory
// h_C contains the result in host memory
cudaMemcpy(&h_C,d_C,size,cudaMemcpyDeviceToHost);

// Verify result
int i;

for (i = 0; i < N; ++i) {

    float sum = h_A[i] + h_B[i];

    if (fabs(h_C[i] - sum) > 1e-5)

        break;

}

printf("%s \n", (i == N) ? "PASSED" : "FAILED");
Cleanup();

}

void Cleanup(void)

{
// Free device memory

if (d_A)

    cudaFree(d_A);

if (d_B)

    cudaFree(d_B);

if (d_C)

    cudaFree(d_C);
// Free host memory
if (h_A)

    free(h_A);

if (h_B)

    free(h_B);

if (h_C)

    free(h_C);
cutilSafeCall( cudaThreadExit() );

exit(0);

}

// Allocates an array with random float entries.

void RandomInit(float* data, int n)

{
for (int i = 0; i < n; ++i)

    data[i] = rand() / (float)RAND_MAX;
}

// Parse program arguments

void ParseArguments(int argc, char** argv)

{
for (int i = 0; i < argc; ++i) {

    if (strcmp(argv[i], "--size") == 0 || strcmp(argv[i], "-size") == 0) {

              GlobalSize = atoi(argv[i+1]);

	  i = i + 1;

              break;

    }
}

}

ARom_nsk · September 8, 2011, 9:10am

Hi!

remove “&” sign in cudaMemcpy calls.

mahyarelc · September 8, 2011, 4:34pm

I removed them, but that wasn’t solved…

tera · September 8, 2011, 6:06pm

Allocate h_A and h_B with

h_A = malloc(size);

 h_B = malloc(size);

Properly #including <stdlib.h> would have caught this. Along the same lines, you should provide prototypes for functions in your code. If nothing else helps, a debugger will point out which source code line causes the segmentation fault.

duubey · May 13, 2017, 3:00am

I have similar problem while running cuda sdk 6.5 vector add program.

Vector addition: C = A + B.
This sample is a very basic sample that implements element by element
vector addition. It is the same as the sample illustrating Chapter 2
of the programming guide with some additions like error checking.
*/

#include <stdio.h>

// For the CUDA runtime routines (prefixed with “cuda_”)
#include <cuda_runtime.h>

/**

CUDA Kernel Device code
Computes the vector addition of A and B into C. The 3 vectors have the same
number of elements numElements.
*/
global void
vectorAdd(const float *A, const float *B, float *C, int numElements)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;

if (i < numElements)
{
C[i] = A[i] + B[i];
}
}

/**

Host main routine
*/
int
main(void)
{
// Error code to check return values for CUDA calls
cudaError_t err = cudaSuccess;

// Print the vector length to be used, and compute its size
int numElements = 50000;
size_t size = numElements * sizeof(float);
printf(“[Vector addition of %d elements]\n”, numElements);

// Allocate the host input vector A
float *h_A = (float *)malloc(size);

// Allocate the host input vector B
float *h_B = (float *)malloc(size);

// Allocate the host output vector C
float *h_C = (float *)malloc(size);

// Verify that allocations succeeded
if (h_A == NULL || h_B == NULL || h_C == NULL)
{
fprintf(stderr, “Failed to allocate host vectors!\n”);
exit(EXIT_FAILURE);
}

// Initialize the host input vectors
for (int i = 0; i < numElements; ++i)
{
h_A[i] = rand()/(float)RAND_MAX;
h_B[i] = rand()/(float)RAND_MAX;
}

// Allocate the device input vector A
float *d_A = NULL;
err = cudaMalloc((void **)&d_A, size);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to allocate device vector A (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Allocate the device input vector B
float *d_B = NULL;
err = cudaMalloc((void **)&d_B, size);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to allocate device vector B (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Allocate the device output vector C
float *d_C = NULL;
err = cudaMalloc((void **)&d_C, size);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to allocate device vector C (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Copy the host input vectors A and B in host memory to the device input vectors in
// device memory
printf(“Copy input data from the host memory to the CUDA device\n”);
err = cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to copy vector A from host to device (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

err = cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to copy vector B from host to device (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Launch the Vector Add CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid =(numElements + threadsPerBlock - 1) / threadsPerBlock;
printf(“CUDA kernel launch with %d blocks of %d threads\n”, blocksPerGrid, threadsPerBlock);
vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(d_A, d_B, d_C, numElements);
err = cudaGetLastError();

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to launch vectorAdd kernel (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Copy the device result vector in device memory to the host result vector
// in host memory.
printf(“Copy output data from the CUDA device to the host memory\n”);
err = cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to copy vector C from device to host (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Verify that the result vector is correct
for (int i = 0; i < numElements; ++i)
{
if (fabs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
{
fprintf(stderr, “Result verification failed at element %d!\n”, i);
exit(EXIT_FAILURE);
}
}

printf(“Test PASSED\n”);

// Free device global memory
err = cudaFree(d_A);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to free device vector A (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

err = cudaFree(d_B);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to free device vector B (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

err = cudaFree(d_C);

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to free device vector C (error code %s)!\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

// Free host memory
free(h_A);
free(h_B);
free(h_C);

// Reset the device and exit
// cudaDeviceReset causes the driver to clean up all state. While
// not mandatory in normal operation, it is good practice. It is also
// needed to ensure correct operation when the application is being
// profiled. Calling cudaDeviceReset causes all profile data to be
// flushed before the application exits
err = cudaDeviceReset();

if (err != cudaSuccess)
{
fprintf(stderr, “Failed to deinitialize the device! error=%s\n”, cudaGetErrorString(err));
exit(EXIT_FAILURE);
}

printf(“Done\n”);
return 0;
}

ERROR: Segmentation fault (core dumped)

Topic		Replies	Views
Segmentation fault when calling virtual function on host CUDA Programming and Performance	9	2490	September 10, 2019
The Cuda Programming Guide Samples Errors CUDA Programming and Performance	5	2155	August 26, 2009
Segmentation fault (core dumped) CUDA Programming and Performance	5	8133	September 14, 2021
segmentation fault at the first cudaMalloc with --device-emulation everything was fine CUDA Programming and Performance	10	4326	January 25, 2010
Segmentation fault (core dumped) CUDA Programming and Performance	5	4549	July 2, 2012
CUDA C++ Segmentation Fault CUDA Programming and Performance	7	14814	October 1, 2017
Problem with a kernel example and SDK please help - it's very important for me CUDA Programming and Performance	1	7954	October 13, 2010
compilation CUDA Programming and Performance	3	7874	March 25, 2010
Strange memory gremlins Getting pwned by pointers CUDA Programming and Performance	9	12180	July 1, 2009
CUDA segmentation Fault Error in Cudastream CUDA Programming and Performance	3	2748	April 24, 2017

Segmentation fault (core dumped)

Related topics