So I am writing a program to operate on matrices on the graphics card, and I have run into a problem. When I run this program on the my card, it works perfectly, and completes everything. When I run it in device emulation mode, however, it crashes. This is odd because this is the opposite behavior of what I would normally expect.
I have written a test case that demonstrates my problem. What it does, is it initializes the matrix, fills it, and prints it. The problem is in the filling step. It seems that the fill operation somehow goes out of bounds in emulation mode. Perhaps I am doing something wrong with the pitch? Here is the code, together with a zip file containing the code and a makefile:
// -----------------------------------------------------------------------------
//
// example.cu
//
// An example to illustrate a problem with segfaulting and such.
//
//
//
// -----------------------------------------------------------------------------
#include <cuda.h>
#include <stdio.h>
// -----------------------------------------------------------------------------
// The size of the square block.
#define FILL_BLOCK 16
// The size of the matrix we are working on.
#define MATRIX_ROWS 32
#define MATRIX_COLS 32
// -----------------------------------------------------------------------------
// This kernel fills each entry in the selected matrix with its index.
// The index is calculated as though the matrix was represented as one long
// array, rather than a padded, pitched array.
__global__ void fill (int * to_fill, int height, int width, int pitch) {
// Positional data
unsigned int row = blockIdx.x * FILL_BLOCK + threadIdx.x;
unsigned int col = blockIdx.y * FILL_BLOCK + threadIdx.y;
if ((row < height) && (col < width)) {
// The index in the matrix of the current element
unsigned int index = row * width + col;
// Pointer arithmetic as demonstrated in the cuda programming guide
int * row_p = to_fill + pitch * row;
#ifdef __DEVICE_EMULATION__
if (threadIdx.x == 0) {
printf("------- > Operating on column %d\n", col);
}
printf("Writing index %d to position %lp\n", index, &row_p[col]);
#endif // __DEVICE_EMULATION__
row_p[col] = index;
}
}
// -----------------------------------------------------------------------------
// The main function creates a matrix, fills it, and then attempts to print it.
int main() {
// ---------------------------------------------------------------------------
// Init phase
// ---------------------------------------------------------------------------
printf("\n\n ------- Entering program\n\n");
// The matrix structure itself
int * matrix;
size_t pitch;
int rows = MATRIX_ROWS;
int cols = MATRIX_COLS;
printf(" --- Allocating memory on device\n");
// Initialize the memory on the device
cudaError_t err = cudaMallocPitch((void **) &matrix, &pitch, sizeof(int) * cols, rows);
if (err != cudaSuccess) {
printf(" ************ Matrix Allocation Failed ************\n");
exit(-1);
}
printf(" --- The matrix is as such: \n");
printf(" --- pitch : %d\n", pitch);
printf(" --- rows : %d\n", rows);
printf(" --- cols : %d\n", cols);
printf(" --- location : %lp\n\n", matrix);
printf(" ------- Init phase complete\n\n");
// ---------------------------------------------------------------------------
// Filling phase
// ---------------------------------------------------------------------------
printf(" --- Calculating grid and block dimensions\n");
// Calculate the grid and block dimensions for the fill kernel
dim3 grid (rows / FILL_BLOCK + ((rows % FILL_BLOCK) == 0 ? 0 : 1),
cols / FILL_BLOCK + ((cols % FILL_BLOCK) == 0 ? 0 : 1),
1);
dim3 block (FILL_BLOCK, FILL_BLOCK, 1);
printf(" --- Calling fill with dimensions : \n");
printf(" --- Grid : %d x %d\n", grid.x, grid.y);
printf(" --- Block : %d x %d\n\n", block.x, block.y);
// Perform the fill
fill <<<grid, block>>> (matrix, rows, cols, pitch);
printf(" ------- Filling phase complete\n\n");
// ---------------------------------------------------------------------------
// Printing phase
// ---------------------------------------------------------------------------
// Create a local copy of the matrix and copy into it
int * local = (int *) malloc (sizeof(int) * pitch * rows);
printf(" --- Copying matrix into host memory space\n");
cudaMemcpy((void *) local, (void *) matrix, sizeof (int) * pitch * rows,
cudaMemcpyDeviceToHost);
printf(" --- Printing\n\n");
// Determine what the longest interget output will be.
int longest = 0;
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
char print[100];
sprintf(print, "%d", local[pitch * i + j]);
if (strlen(print) > longest) {
longest = strlen(print);
}
}
}
// Then, using the longest variable, we go ahead and print.
for (int i = 0; i < rows; i++) {
for (int j = 0; j < cols; j++) {
char printf_string[100];
sprintf(printf_string, "%%%dd ", longest);
printf(printf_string, local[pitch * i + j]);
}
printf("\n");
}
printf("\n\n");
printf(" ------- Printing phase complete\n");
// ---------------------------------------------------------------------------
// Done
// ---------------------------------------------------------------------------
printf(" --- Cleaning up\n\n\n");
free(local);
return 0;
}
// -----------------------------------------------------------------------------
example.zip (2.15 KB)