Hi,
Previously I posted a similar topic in the OFED forum. In case it’s a hardware bug, I am also reposting it here. Sorry for any inconvenience.
We found that when we issue a large direct write to the target with NVMeoF offloading, the content is not written to the disk correctly: some blocks are replaced by other blocks in the result. For example, we constructed a 17MB buffer by filling every block with a formatted string (‘blk%05d’, block_index) and wrote the buffer all at once to the target. When reading it back, we found some out-of-order and repeated blocks. For example, in the attachment ‘output.txt’, if you grep for ‘wrong’, you can see that blk27135 appears right after blk16382 (instead of blk16383) and again at its own position (following blk27134). Moreover, if we issue smaller writes, e.g., 16KB, this does not happen (but the performance is much lower).
For more information, we installed MLNX_OFED_LINUX-5.6-2.0.9.0-ubuntu20.04-x86_64 drivers on both host and target. The OS is Ubuntu 20.04 LTS, and we use Ext4 as the filesystem. The NIC is Dual-port Mellanox ConnectX-6 100 Gb NIC (PCIe v4.0). We also pasted our testing program after this post (I can only attach one link as a new user).
The way we reproduce the bug is to run ‘./simulator /mnt/remote-sst/2/45 17825792 > output.txt’ on the host, where /mnt/remote-sst is an LVM partition on the target NVMe disk connected via NVMeoF.
Does anyone know why this behavior would happen and how to fix/avoid it? Any thoughts are highly appreciated!
Testing program:
/**
* gcc simulator.c -o simulator -D_GNU_SOURCE
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/file.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#include <time.h>
#include <assert.h>
/* Total size in bytes of the test buffer that is written and read back (17 MiB).
 * Parenthesized so the macro expands safely inside any expression
 * (e.g. `x % BUF_SIZE`). */
#define BUF_SIZE (17 * 1024 * 1024)
/* Size in bytes of one tagged block; also used as the buffer alignment
 * passed to posix_memalign(). */
#define BLK_SIZE (512)
/* Length in bytes of one "blkNNNNN" tag, excluding the NUL terminator. */
#define TAG_LEN 8

/* Format the tag for block number `blk` into `tag` (always NUL-terminated). */
static void format_tag(char tag[TAG_LEN + 1], int blk)
{
    snprintf(tag, TAG_LEN + 1, "blk%05d", blk);
}

/* Return the wrapping unsigned sum of the `len` bytes starting at `buf`. */
static unsigned int byte_sum(const unsigned char *buf, size_t len)
{
    unsigned int sum = 0;
    for (size_t i = 0; i < len; i++) {
        sum += buf[i];
    }
    return sum;
}

/* Allocate `size` bytes aligned to `align`; on failure print `what` plus the
 * reason and exit(1). posix_memalign() returns the error number instead of
 * setting errno, so strerror() is used rather than perror(). */
static unsigned char *alloc_aligned(size_t align, size_t size, const char *what)
{
    void *mem = NULL;
    int err = posix_memalign(&mem, align, size);
    if (err != 0) {
        fprintf(stderr, "%s: %s\n", what, strerror(err));
        exit(1);
    }
    return mem;
}

/* Write `len` bytes from `buf` to `fd`, at most `chunk` bytes per write().
 * Returns the number of bytes actually written; stops early, after printing
 * a diagnostic, if write() fails or makes no progress. */
static size_t write_chunks(int fd, const unsigned char *buf, size_t len, size_t chunk)
{
    size_t done = 0;
    while (done < len) {
        /* Never hand write() more than what is left in the buffer. */
        size_t want = (len - done < chunk) ? len - done : chunk;
        ssize_t n = write(fd, buf + done, want);
        if (n < 0) {
            perror("write failed");
            break;
        }
        if (n == 0) {
            fprintf(stderr, "write made no progress at offset %zu\n", done);
            break;
        }
        done += (size_t)n;
    }
    return done;
}

/* Read up to `len` bytes from `fd` into `buf`, at most `chunk` bytes per
 * read(). Returns the number of bytes actually read; stops early, after
 * printing a diagnostic, on a read() error or end of file. */
static size_t read_chunks(int fd, unsigned char *buf, size_t len, size_t chunk)
{
    size_t done = 0;
    while (done < len) {
        /* Never let read() store past the end of the buffer. */
        size_t want = (len - done < chunk) ? len - done : chunk;
        ssize_t n = read(fd, buf + done, want);
        if (n < 0) {
            perror("read failed");
            break;
        }
        if (n == 0) {
            fprintf(stderr, "unexpected end of file at offset %zu\n", done);
            break;
        }
        done += (size_t)n;
    }
    return done;
}

/*
 * Write a BUF_SIZE buffer of tagged blocks to argv[1] with O_DIRECT in
 * transfers of argv[2] bytes, read it back the same way, dump every block to
 * stdout (flagging blocks whose content does not match their own tag), and
 * print an overall OK/Bad verdict.
 *
 * Returns 0 when byte counts, checksums and every block match; non-zero on a
 * usage error, an I/O failure, or any mismatch.
 */
int main(int argc, char * argv[])
{
    const size_t buf_size = BUF_SIZE;
    const size_t blk_count = buf_size / BLK_SIZE;

    if (argc < 3) {
        fprintf(stderr, "Usage: ./simulator file_name io_size\n");
        return EXIT_FAILURE;
    }

    /* Reject anything that is not a complete positive decimal number; a
     * non-positive transfer size would never make progress. */
    char *end = NULL;
    long parsed = strtol(argv[2], &end, 10);
    if (end == argv[2] || *end != '\0' || parsed <= 0) {
        fprintf(stderr, "invalid io_size '%s': must be a positive integer\n", argv[2]);
        return EXIT_FAILURE;
    }
    const size_t io_size = (size_t)parsed;

    /* ---- Build the pattern: every 8 bytes of block i hold "blkNNNNN". ---- */
    unsigned char *w_buf = alloc_aligned(BLK_SIZE, buf_size, "write posix_memalign failed");
    char tag[TAG_LEN + 1];
    for (size_t i = 0; i < blk_count; i++) {
        format_tag(tag, (int)i);
        /* Copy only the 8 tag bytes: writing the terminator as well would
         * store one byte past the end of the buffer for the final tag. */
        for (size_t j = 0; j < BLK_SIZE; j += TAG_LEN) {
            memcpy(w_buf + i * BLK_SIZE + j, tag, TAG_LEN);
        }
    }
    const unsigned int w_sum = byte_sum(w_buf, buf_size);

    /* ---- Write phase. ---- */
    int fd = open(argv[1], O_WRONLY | O_DIRECT | O_CREAT | O_SYNC, 0755);
    if (fd < 0) {
        perror("write open failed");
        exit(1);
    }
    const size_t w_bytes = write_chunks(fd, w_buf, buf_size, io_size);
    free(w_buf);
    if (close(fd) != 0) {
        perror("write close failed");
    }
    if (w_bytes != buf_size) {
        /* The reason was already reported; there is nothing to verify. */
        return EXIT_FAILURE;
    }

    /* ---- Read phase. ---- */
    unsigned char *r_buf = alloc_aligned(BLK_SIZE, buf_size, "read posix_memalign failed");
    /* Zero the buffer so that a short read is verified against defined data. */
    memset(r_buf, 0, buf_size);
    fd = open(argv[1], O_RDONLY | O_DIRECT);
    if (fd < 0) {
        perror("read open failed");
        exit(1);
    }
    const size_t r_bytes = read_chunks(fd, r_buf, buf_size, io_size);
    const unsigned int r_sum = byte_sum(r_buf, buf_size);

    /* ---- Dump each block and flag the ones that do not carry their tag. ---- */
    size_t bad_blocks = 0;
    for (size_t i = 0; i < blk_count; i++) {
        const unsigned char *blk = r_buf + i * BLK_SIZE;
        int wrong = 0;

        format_tag(tag, (int)i);
        for (size_t j = 0; j < BLK_SIZE; j += TAG_LEN) {
            if (memcmp(blk + j, tag, TAG_LEN) != 0) {
                wrong = 1;
            }
        }
        fwrite(blk, 1, BLK_SIZE, stdout);
        if (wrong) {
            printf("\twrong! should be %s\n", tag);
            bad_blocks++;
        } else {
            printf("\n");
        }
    }
    free(r_buf);
    close(fd);

    /* The additive checksum cannot detect reordered blocks, so the per-block
     * comparison also takes part in the verdict. */
    if (w_bytes == r_bytes && w_sum == r_sum && bad_blocks == 0) {
        printf("%s OK\n", argv[1]);
        return EXIT_SUCCESS;
    }
    printf("%s Bad!\n", argv[1]);
    printf("bytes written: %zu checksum: %u\n", w_bytes, w_sum);
    printf("bytes read: %zu checksum: %u\n", r_bytes, r_sum);
    printf("mismatched blocks: %zu\n", bad_blocks);
    return EXIT_FAILURE;
}