Would any of you please mind running or having a look at this code and seeing if it works for you? I’m not even calling a kernel.
I’m struggling with this one and am beginning to think that my implementation must be buggy or unstable. Weird things are happening here on x86_64 in linux newest 3.1, and also with a stable 3.0. Or, maybe I’m just coding something wrong.
The operations seem so basic and yet the results are so weird…I must be missing something silly!?
Or, is this a bug?
[codebox]/*
-
File: testHMAX.cu
-
Author: martin
-
Created on August 14, 2010, 3:29 AM
*/
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#define TYPE float
void checkError(char* file, int line);
/*
*/
int main(int argc, char** argv) {
int image_width = 5;
int image_height = 5;
int patch_width = 3;
int patch_height = 3;
int filter_width = 3;
int filter_height = 3;
TYPE image[image_height][image_width];
TYPE filters[1][3][3];
TYPE patches[1][3][3];
printf(“Here is the image: \n”);
for(int i=0;i<image_height;i++) { //new image and filter
for(int j=0;j<image_width;j++) {
image[i][j]=(TYPE)(rand()%256);
printf("%d ", (int) (image[i][j]));
}
printf("\n");
}
printf(“Here is the patch: \n”);
for(int i=0;i<patch_height;i++) { //new image and filter
for(int j=0;j<patch_width;j++) {
patches[1][i][j]=(TYPE)(rand()%256);
printf("%d ", (int) (patches[1][i][j]));
}
printf("\n");
}
printf(“Here is the filter: \n”);
for(int i=0;i<patch_height;i++) { //new image and filter
for(int j=0;j<patch_width;j++) {
filters[1][i][j]=(TYPE)(rand()%256);
printf("%d ", (int) (filters[1][i][j]));
}
printf("\n");
}
int num_filters = 1;
int num_patches = 1;
int filter_rows[num_filters];
int filter_columns[num_filters];
for(int i=0;i<num_filters;i++) {
filter_rows[i] = 3;
filter_columns[i] =3;
}
/images/
TYPE *imageGPU;
/*filters*/
TYPE *filtersGPU[num_filters];
/*patches*/
TYPE *patchesGPU[num_patches];
size_t filterpitches[num_filters];
size_t patchpitches[num_patches];
/* Allocate and copy memory to GPU */
for(int i=0;i<num_filters;i++) {
printf("Trying to allocate space for a filter on the GPU...\n");
cudaMallocPitch((void**) &(filtersGPU[i]), &filterpitches[i],filter_columns[i] * sizeof(TYPE),filter_rows[i]);
checkError(__FILE__,__LINE__);
/*
printf("Trying to delete space to zero for a filter on the GPU...\n");
cudaMemset2D(filtersGPU[i], filterpitches[i],0, filterpitches[i] * filter_columns[i],filter_rows[i]);
checkError(__FILE__,__LINE__);
*/
printf(“Copying filter to GPU!\n”);
cudaMemcpy2D((void**) filtersGPU[i],filterpitches[i],filters[i],filter_columns[i] * sizeof(TYPE ),filter_columns[i] * sizeof(TYPE),filter_rows[i],cudaMemcpyHostToDevice);
checkError(__FILE__,__LINE__);
cudaThreadSynchronize();
}
cudaThreadSynchronize();
printf("Finished allocating filters on GPU with pitches!\n");
for(int i=0;i<num_filters;i++) {
printf("Here is the filter from the CPU before copying!!\n");
for(int k=0;k<filter_height;k++) { //new image and filter
for(int j=0;j<filter_width;j++) {
printf("%f ", (TYPE) filters[1][k][j]);
}
printf("\n");
}
printf(“Here is the filter from the GPU after copying!!\n”);
TYPE tmp[filter_rows[i]][filter_columns[i]];
cudaMemcpy2D((void**) tmp,filter_columns[i] * sizeof(TYPE),filtersGPU[i],filterpitches[i],filter_columns[i
] * sizeof(TYPE),filter_rows[i],cudaMemcpyDeviceToHost);
cudaThreadSynchronize();
checkError(__FILE__,__LINE__);
for(int k=0;k<filter_height;k++) { //new image and filter
for(int j=0;j<filter_width;j++) {
printf("%d ", (int) tmp[k][j]);
}
printf("\n");
}
for(int j=0;j<filter_rows[i];j++) {
printf("Freeing row %d\n",j);
cudaFree(tmp[j]);
}
cudaFree(tmp);
/*
TYPE **tmp = (TYPE**)malloc(filter_rows[i]*sizeof(TYPE*));
for(int j=0;j<filter_rows[i];j++)
tmp[j] = (TYPE*)malloc(filter_columns[i]*sizeof(TYPE));
cudaMemcpy2D(tmp,filter_columns[i] * sizeof(TYPE),filtersGPU[i],filterpitches[i],filter_columns[i
] * sizeof(TYPE),filter_rows[i],cudaMemcpyDeviceToHost);
checkError(__FILE__,__LINE__);
print(tmp,filter_columns[i],filter_rows[i]);
*/
/* for(j=0;j<filter_rows[i];j++)
free(tmp[j]);
free(tmp);
*/
}
fflush(stdout);
fflush(stderr);
return (EXIT_SUCCESS);
}
void checkError(char* file,int line) {
cudaError_t err = cudaGetLastError();
if (cudaSuccess != err) {
printf("There was an error!\n");
fprintf(stderr,"Cuda error: %s in file '%s' in line %i : %s.\n",
"CUDA operation failed",file,line, cudaGetErrorString(err));
printf("error code = %d\n", err);
fflush(stdout);
fflush(stderr);
fflush ((FILE*)0);
exit(EXIT_FAILURE);
}
}[/codebox]
And here is the output I get:
[codebox]Here is the image:
103 198 105 115 81
255 74 236 41 205
186 171 242 251 227
70 124 194 84 248
27 232 231 141 118
Here is the patch:
90 46 99
51 159 201
154 102 50
Here is the filter:
13 183 49
88 163 90
37 93 5
Trying to allocate space for a filter on the GPU…
Copying filter to GPU!
Finished allocating filters on GPU with pitches!
Here is the filter from the CPU before copying!!
13.000000 183.000000 49.000000
88.000000 163.000000 -1719463078730986946560.000000
0.000000 -1719458575131359576064.000000 0.000000
Here is the filter from the GPU after copying!!
51 159 201
154 102 50
0 0 -2147483648
Freeing row 0
Freeing row 1
Freeing row 2
[/codebox]
Then, the next time I run the same compiled code, it’ll be something really weird like this:
[codebox]Here is the image:
103 198 105 115 81
255 74 236 41 205
186 171 242 251 227
70 124 194 84 248
27 232 231 141 118
Here is the patch:
90 46 99
51 159 201
154 102 50
Here is the filter:
13.000000 183.000000 49.000000
88.000000 163.000000 90.000000
37.000000 93.000000 5.000000
Trying to allocate space for a filter on the GPU…
Copying filter to GPU!
Finished allocating filters on GPU with pitches!
Here is the filter from the CPU before copying!!
13.000000 183.000000 49.000000
88.000000 163.000000 -3492112793945374751089657643008.000000
0.000000 -3492103122538817834056259993600.000000 0.000000
Here is the filter from the GPU after copying!!
51 159 201
154 102 50
0 0 -2147483648
Freeing row 0
Freeing row 1
Freeing row 2
[/codebox]