Corruption in memory copy?

Hello,

I have a problem and I have no idea what’s going wrong. After commenting out some lines to isolate the origin of the problem, my code looks like this:

[codebox]#define SURF_DESCRIPTOR_SIZE 64

// Not referenced by the code shown in this excerpt.
#define CORRELATION_THRESH 0.99

// Match pads the point counts up to a multiple of this value.
#define TILE_WIDTH 16

// Thread budget per block from which MEAN_SDV_THREADS is derived.
#define MAX_THREADS_PER_BLOCK 512

// Number of descriptors handled per block (blockDim.y of the calcMean launch).
// The expansion is parenthesized so that it behaves as a single value inside
// larger expressions: without the parentheses, `x / MEAN_SDV_THREADS` would
// expand to `x / 512 / 64` instead of `x / (512 / 64)`.
#define MEAN_SDV_THREADS (MAX_THREADS_PER_BLOCK / SURF_DESCRIPTOR_SIZE)

/*
 * Debug-reduced kernel: for each descriptor, copies the first component of
 * desc1 into the matching slot of m1. desc2 and m2 are accepted but not
 * touched in this version.
 *
 * Launch layout used by Match:
 *   blockDim = (SURF_DESCRIPTOR_SIZE, MEAN_SDV_THREADS)
 *   gridDim  = (size / MEAN_SDV_THREADS, 1)
 * Each (blockIdx.x, threadIdx.y) pair selects one descriptor; only the thread
 * with threadIdx.x == 0 performs the copy.
 *
 * There is no bounds check: for every descriptor index produced by the launch,
 * m1 must have a slot and desc1 must hold SURF_DESCRIPTOR_SIZE floats.
 */
__global__
void calcMean(float * desc1, float * desc2, float * m1, float * m2) {
    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int bx = blockIdx.x;

    // Index of the descriptor this thread row is responsible for. The macro is
    // wrapped in parentheses so the expression does not depend on how
    // MEAN_SDV_THREADS itself is written.
    const unsigned int row = bx * (MEAN_SDV_THREADS) + ty;

    if (tx == 0) {
        // tx is 0 here, so this reads the first float of descriptor `row`.
        m1[row] = desc1[row * SURF_DESCRIPTOR_SIZE + tx];
    }
}

// Debug-reduced host routine: pads the descriptor sets to a common size, uploads
// them to the device, launches calcMean on the first set, and prints the first
// component of the first 65 host descriptors followed by the first 65 values
// read back from d_m1.
//
// points1, points2: keypoints; only size() and .response are read here.
// desc1, desc2:     flattened descriptor values, copied element by element.
// matches, timings: not used in the code shown.
//
// NOTE(review): the `vector` parameters without template arguments look like the
// result of markup stripping (e.g. `vector<float>`) - confirm against the real file.
// NOTE(review): none of the h_* buffers or device allocations are released in
// this excerpt; only m1 is freed.
void Match(vector<t_Point> points1, vector<t_Point> points2, vector desc1, vector desc2, vector &matches, t_Timings &timings) {

cout << "Points1 " << points1.size() << endl;

cout << "Points2 " << points2.size() << endl;

// Sizes must be multiples of TILE_WIDTH

// (n / TILE_WIDTH + 1) * TILE_WIDTH: always adds one tile, so a count that is
// already an exact multiple is still padded by a full TILE_WIDTH.
int size1 = (int(points1.size() / TILE_WIDTH) + 1) * TILE_WIDTH;

int size2 = (int(points2.size() / TILE_WIDTH) + 1) * TILE_WIDTH;

// Both descriptor buffers are sized for the larger of the two padded counts.
int size = max(size1, size2);

// Host memory pointers

// Descriptor buffers hold size * SURF_DESCRIPTOR_SIZE floats each.
float * h_desc1 = (float *)malloc(size * SURF_DESCRIPTOR_SIZE * sizeof(float));

float * h_desc2 = (float *)malloc(size * SURF_DESCRIPTOR_SIZE * sizeof(float));

// Response buffers are filled below but not read again in this excerpt.
float * h_response1 = (float *)malloc(size1 * sizeof(float));

float * h_response2 = (float *)malloc(size2 * sizeof(float));

// Initialization

for (int i = 0; i < desc1.size(); i++) {

    h_desc1[i] = desc1.at(i);

}

for (int i = 0; i < desc2.size(); i++) {

    h_desc2[i] = desc2.at(i);

}

for (int i = 0; i < points1.size(); i++) {

    h_response1[i] = points1.at(i).response;

}

for (int i = 0; i < points2.size(); i++) {

    h_response2[i] = points2.at(i).response;

}

// Extra values are set to 0

// Zero-fills the padding from the end of the real data up to the full
// size * SURF_DESCRIPTOR_SIZE floats of each host descriptor buffer.
for (int i = desc1.size(); i < size * SURF_DESCRIPTOR_SIZE; i++) {

    h_desc1[i] = 0;

}

for (int i = desc2.size(); i < size * SURF_DESCRIPTOR_SIZE; i++) {

    h_desc2[i] = 0;

}    

// Device pointers

// Only d_m1, d_m2, d_sdv1 and d_sdv2 are allocated below; the remaining
// pointers declared here are unused in this excerpt.
// NOTE(review): d_desc1 and d_desc2 are used below but not declared in the
// code shown - presumably declared elsewhere; confirm.
float * d_corr;

float * d_m1;

float * d_m2;

float * d_sdv1;

float * d_sdv2;

bool * d_response1;

bool * d_response2;

int * d_bestCorr1;

int * d_bestCorr2;

int * d_matches;    

// Memory allocation

cutilSafeCall(cudaMalloc(&d_m1, size * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_sdv1, size * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_m2, size * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_sdv2, size * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_desc1, size * SURF_DESCRIPTOR_SIZE * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_desc2, size * SURF_DESCRIPTOR_SIZE * sizeof(float)));

// Memory copy

// NOTE(review): the byte count here is size * sizeof(float), but the buffers
// were allocated (and filled on the host) with
// size * SURF_DESCRIPTOR_SIZE * sizeof(float). Only the first `size` floats,
// i.e. the first size / SURF_DESCRIPTOR_SIZE descriptors, reach the device;
// the rest of d_desc1 / d_desc2 stays uninitialized.
cutilSafeCall(cudaMemcpy(d_desc1, h_desc1, size * sizeof(float), cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(d_desc2, h_desc2, size * sizeof(float), cudaMemcpyHostToDevice));    

// Mean is calculated

// One block covers MEAN_SDV_THREADS descriptors (y) with SURF_DESCRIPTOR_SIZE
// threads each (x); the grid has size / MEAN_SDV_THREADS blocks.
dim3 dimBlockMeanSdv(SURF_DESCRIPTOR_SIZE, MEAN_SDV_THREADS);

dim3 dimGridMeanSdv(size / dimBlockMeanSdv.y, 1);

calcMean <<< dimGridMeanSdv, dimBlockMeanSdv >>> (d_desc1, d_desc2, d_m1, d_m2);

// NOTE(review): the return value is ignored and no launch-error check follows
// the kernel call. cudaThreadSynchronize is deprecated in later CUDA releases
// in favour of cudaDeviceSynchronize.
cudaThreadSynchronize();

// Results are shown

// Prints the first component of host descriptors 0..64 (hard-coded count).
for (int i = 0; i < 65; i++) {

    cout << i << "[" << h_desc1[i * SURF_DESCRIPTOR_SIZE] << "]";

}

// Reads back the kernel output so it can be compared with the host values above.
float * m1 = (float *)malloc(size * sizeof(float));

cutilSafeCall(cudaMemcpy(m1, d_m1, size * sizeof(float), cudaMemcpyDeviceToHost));

cout << dimBlockMeanSdv.x << endl;

for (int i = 0; i < 65; i++) {

    cout << i << "[" << m1[i] << "]";

}

cout << endl;

free(m1);

}[/codebox]

The output for a given example is this:

[codebox]Points1 2136

Points2 2090

64

0[-9.84887e-07][-9.84887e-07]

1[-9.92876e-07][-9.92876e-07]

2[7.63803e-07][7.63803e-07]

3[-5.75615e-06][-5.75615e-06]

4[5.28887e-06][5.28887e-06]

32[0.00377774][0.00377774]

33[-2.09816e-06][-2.09816e-06]

34[0.00188525][-3.02126e-06] <<- At this point, the two arrays have different values. Why not before?

35[-0.00295363][-7.57153e-05]

36[0.00315898][-2.0971e-05]

37[-0.00191124][5.81767e-07]

38[-0.00143699][0.000399498]

39[0.000764901][2.25177e-06]

40[-0.0060265][2.95596e-07]

41[-0.0490147][-0.00174616]

42[-0.0559343][8.19524e-06]

43[nan][-0.00516252]

44[nan][-7.84284e-06]

45[nan][-2.47702e-06]

46[nan][-3.99062e-06]

47[nan][5.18736e-06]

48[nan][-4.69436e-06]

49[nan][0.000917098]

50[nan][5.49397e-05]

51[nan][1.4918e-06]

52[nan][0.00240714]

53[nan][-6.25982e-06]

54[nan][0.000783656]

55[nan][0.0161487]

56[nan][0.00147562]

57[nan][-1.23446e-06]

58[nan][-2.06568e-07]

59[nan][-1.09316e-06]

60[nan][-2.11338e-06]

61[nan][0.0115776]

62[nan][0.00340284]

63[nan][-5.88755e-07]

64[nan][-0.00109655][/codebox]

I have tried a lot of tests, but I don’t understand the cause of the problem. Does anybody know why? I’m working with a GeForce 9800 GT and an Intel Core 2 Duo CPU E8400 at 3.00GHz with 4GB of RAM.

Thank you in advance,

Néstor

Hello,

I have a problem and I have no idea what’s going wrong. After commenting out some lines to isolate the origin of the problem, my code looks like this:

[codebox]#define SURF_DESCRIPTOR_SIZE 64

// Not referenced by the code shown in this excerpt.
#define CORRELATION_THRESH 0.99

// Match pads the point counts up to a multiple of this value.
#define TILE_WIDTH 16

// Thread budget per block from which MEAN_SDV_THREADS is derived.
#define MAX_THREADS_PER_BLOCK 512

// Number of descriptors handled per block (blockDim.y of the calcMean launch).
// The expansion is parenthesized so that it behaves as a single value inside
// larger expressions: without the parentheses, `x / MEAN_SDV_THREADS` would
// expand to `x / 512 / 64` instead of `x / (512 / 64)`.
#define MEAN_SDV_THREADS (MAX_THREADS_PER_BLOCK / SURF_DESCRIPTOR_SIZE)

/*
 * Debug-reduced kernel: for each descriptor, copies the first component of
 * desc1 into the matching slot of m1. desc2 and m2 are accepted but not
 * touched in this version.
 *
 * Launch layout used by Match:
 *   blockDim = (SURF_DESCRIPTOR_SIZE, MEAN_SDV_THREADS)
 *   gridDim  = (size / MEAN_SDV_THREADS, 1)
 * Each (blockIdx.x, threadIdx.y) pair selects one descriptor; only the thread
 * with threadIdx.x == 0 performs the copy.
 *
 * There is no bounds check: for every descriptor index produced by the launch,
 * m1 must have a slot and desc1 must hold SURF_DESCRIPTOR_SIZE floats.
 */
__global__
void calcMean(float * desc1, float * desc2, float * m1, float * m2) {
    const unsigned int tx = threadIdx.x;
    const unsigned int ty = threadIdx.y;
    const unsigned int bx = blockIdx.x;

    // Index of the descriptor this thread row is responsible for. The macro is
    // wrapped in parentheses so the expression does not depend on how
    // MEAN_SDV_THREADS itself is written.
    const unsigned int row = bx * (MEAN_SDV_THREADS) + ty;

    if (tx == 0) {
        // tx is 0 here, so this reads the first float of descriptor `row`.
        m1[row] = desc1[row * SURF_DESCRIPTOR_SIZE + tx];
    }
}

// Debug-reduced host routine: pads the descriptor sets to a common size, uploads
// them to the device, launches calcMean on the first set, and prints the first
// component of the first 65 host descriptors followed by the first 65 values
// read back from d_m1.
//
// points1, points2: keypoints; only size() and .response are read here.
// desc1, desc2:     flattened descriptor values, copied element by element.
// matches, timings: not used in the code shown.
//
// NOTE(review): the `vector` parameters without template arguments look like the
// result of markup stripping (e.g. `vector<float>`) - confirm against the real file.
// NOTE(review): none of the h_* buffers or device allocations are released in
// this excerpt; only m1 is freed.
void Match(vector<t_Point> points1, vector<t_Point> points2, vector desc1, vector desc2, vector &matches, t_Timings &timings) {

cout << "Points1 " << points1.size() << endl;

cout << "Points2 " << points2.size() << endl;

// Sizes must be multiples of TILE_WIDTH

// (n / TILE_WIDTH + 1) * TILE_WIDTH: always adds one tile, so a count that is
// already an exact multiple is still padded by a full TILE_WIDTH.
int size1 = (int(points1.size() / TILE_WIDTH) + 1) * TILE_WIDTH;

int size2 = (int(points2.size() / TILE_WIDTH) + 1) * TILE_WIDTH;

// Both descriptor buffers are sized for the larger of the two padded counts.
int size = max(size1, size2);

// Host memory pointers

// Descriptor buffers hold size * SURF_DESCRIPTOR_SIZE floats each.
float * h_desc1 = (float *)malloc(size * SURF_DESCRIPTOR_SIZE * sizeof(float));

float * h_desc2 = (float *)malloc(size * SURF_DESCRIPTOR_SIZE * sizeof(float));

// Response buffers are filled below but not read again in this excerpt.
float * h_response1 = (float *)malloc(size1 * sizeof(float));

float * h_response2 = (float *)malloc(size2 * sizeof(float));

// Initialization

for (int i = 0; i < desc1.size(); i++) {

    h_desc1[i] = desc1.at(i);

}

for (int i = 0; i < desc2.size(); i++) {

    h_desc2[i] = desc2.at(i);

}

for (int i = 0; i < points1.size(); i++) {

    h_response1[i] = points1.at(i).response;

}

for (int i = 0; i < points2.size(); i++) {

    h_response2[i] = points2.at(i).response;

}

// Extra values are set to 0

// Zero-fills the padding from the end of the real data up to the full
// size * SURF_DESCRIPTOR_SIZE floats of each host descriptor buffer.
for (int i = desc1.size(); i < size * SURF_DESCRIPTOR_SIZE; i++) {

    h_desc1[i] = 0;

}

for (int i = desc2.size(); i < size * SURF_DESCRIPTOR_SIZE; i++) {

    h_desc2[i] = 0;

}    

// Device pointers

// Only d_m1, d_m2, d_sdv1 and d_sdv2 are allocated below; the remaining
// pointers declared here are unused in this excerpt.
// NOTE(review): d_desc1 and d_desc2 are used below but not declared in the
// code shown - presumably declared elsewhere; confirm.
float * d_corr;

float * d_m1;

float * d_m2;

float * d_sdv1;

float * d_sdv2;

bool * d_response1;

bool * d_response2;

int * d_bestCorr1;

int * d_bestCorr2;

int * d_matches;    

// Memory allocation

cutilSafeCall(cudaMalloc(&d_m1, size * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_sdv1, size * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_m2, size * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_sdv2, size * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_desc1, size * SURF_DESCRIPTOR_SIZE * sizeof(float)));

cutilSafeCall(cudaMalloc(&d_desc2, size * SURF_DESCRIPTOR_SIZE * sizeof(float)));

// Memory copy

// NOTE(review): the byte count here is size * sizeof(float), but the buffers
// were allocated (and filled on the host) with
// size * SURF_DESCRIPTOR_SIZE * sizeof(float). Only the first `size` floats,
// i.e. the first size / SURF_DESCRIPTOR_SIZE descriptors, reach the device;
// the rest of d_desc1 / d_desc2 stays uninitialized.
cutilSafeCall(cudaMemcpy(d_desc1, h_desc1, size * sizeof(float), cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(d_desc2, h_desc2, size * sizeof(float), cudaMemcpyHostToDevice));    

// Mean is calculated

// One block covers MEAN_SDV_THREADS descriptors (y) with SURF_DESCRIPTOR_SIZE
// threads each (x); the grid has size / MEAN_SDV_THREADS blocks.
dim3 dimBlockMeanSdv(SURF_DESCRIPTOR_SIZE, MEAN_SDV_THREADS);

dim3 dimGridMeanSdv(size / dimBlockMeanSdv.y, 1);

calcMean <<< dimGridMeanSdv, dimBlockMeanSdv >>> (d_desc1, d_desc2, d_m1, d_m2);

// NOTE(review): the return value is ignored and no launch-error check follows
// the kernel call. cudaThreadSynchronize is deprecated in later CUDA releases
// in favour of cudaDeviceSynchronize.
cudaThreadSynchronize();

// Results are shown

// Prints the first component of host descriptors 0..64 (hard-coded count).
for (int i = 0; i < 65; i++) {

    cout << i << "[" << h_desc1[i * SURF_DESCRIPTOR_SIZE] << "]";

}

// Reads back the kernel output so it can be compared with the host values above.
float * m1 = (float *)malloc(size * sizeof(float));

cutilSafeCall(cudaMemcpy(m1, d_m1, size * sizeof(float), cudaMemcpyDeviceToHost));

cout << dimBlockMeanSdv.x << endl;

for (int i = 0; i < 65; i++) {

    cout << i << "[" << m1[i] << "]";

}

cout << endl;

free(m1);

}[/codebox]

The output for a given example is this:

[codebox]Points1 2136

Points2 2090

64

0[-9.84887e-07][-9.84887e-07]

1[-9.92876e-07][-9.92876e-07]

2[7.63803e-07][7.63803e-07]

3[-5.75615e-06][-5.75615e-06]

4[5.28887e-06][5.28887e-06]

32[0.00377774][0.00377774]

33[-2.09816e-06][-2.09816e-06]

34[0.00188525][-3.02126e-06] <<- At this point, the two arrays have different values. Why not before?

35[-0.00295363][-7.57153e-05]

36[0.00315898][-2.0971e-05]

37[-0.00191124][5.81767e-07]

38[-0.00143699][0.000399498]

39[0.000764901][2.25177e-06]

40[-0.0060265][2.95596e-07]

41[-0.0490147][-0.00174616]

42[-0.0559343][8.19524e-06]

43[nan][-0.00516252]

44[nan][-7.84284e-06]

45[nan][-2.47702e-06]

46[nan][-3.99062e-06]

47[nan][5.18736e-06]

48[nan][-4.69436e-06]

49[nan][0.000917098]

50[nan][5.49397e-05]

51[nan][1.4918e-06]

52[nan][0.00240714]

53[nan][-6.25982e-06]

54[nan][0.000783656]

55[nan][0.0161487]

56[nan][0.00147562]

57[nan][-1.23446e-06]

58[nan][-2.06568e-07]

59[nan][-1.09316e-06]

60[nan][-2.11338e-06]

61[nan][0.0115776]

62[nan][0.00340284]

63[nan][-5.88755e-07]

64[nan][-0.00109655][/codebox]

I have tried a lot of tests, but I don’t understand the cause of the problem. Does anybody know why? I’m working with a GeForce 9800 GT and an Intel Core 2 Duo CPU E8400 at 3.00GHz with 4GB of RAM.

Thank you in advance,

Néstor

I’ve solved it. Sorry, I hadn’t checked my code carefully enough. The problem was in these two lines:

[codebox]// Memory copy

cutilSafeCall(cudaMemcpy(d_desc1, h_desc1, size * sizeof(float), cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(d_desc2, h_desc2, size * sizeof(float), cudaMemcpyHostToDevice));[/codebox]

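The copy size was missing the SURF_DESCRIPTOR_SIZE factor, so only the first `size` floats of each descriptor buffer were copied to the device and the rest of the device buffer was left uninitialized. The lines should look like this instead: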

[codebox]// Memory copy

cutilSafeCall(cudaMemcpy(d_desc1, h_desc1, size * SURF_DESCRIPTOR_SIZE * sizeof(float), cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(d_desc2, h_desc2, size * SURF_DESCRIPTOR_SIZE * sizeof(float), cudaMemcpyHostToDevice));[/codebox]

Now it works. :rolleyes:

I’ve solved it. Sorry, I hadn’t checked my code carefully enough. The problem was in these two lines:

[codebox]// Memory copy

cutilSafeCall(cudaMemcpy(d_desc1, h_desc1, size * sizeof(float), cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(d_desc2, h_desc2, size * sizeof(float), cudaMemcpyHostToDevice));[/codebox]

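The copy size was missing the SURF_DESCRIPTOR_SIZE factor, so only the first `size` floats of each descriptor buffer were copied to the device and the rest of the device buffer was left uninitialized. The lines should look like this instead: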

[codebox]// Memory copy

cutilSafeCall(cudaMemcpy(d_desc1, h_desc1, size * SURF_DESCRIPTOR_SIZE * sizeof(float), cudaMemcpyHostToDevice));

cutilSafeCall(cudaMemcpy(d_desc2, h_desc2, size * SURF_DESCRIPTOR_SIZE * sizeof(float), cudaMemcpyHostToDevice));[/codebox]

Now it works. :rolleyes: