diff --git a/tests/basic/ec/ec-sparsefile-heal.t b/tests/basic/ec/ec-sparsefile-heal.t
new file mode 100644
index 00000000000..86a7c585af9
--- /dev/null
+++ b/tests/basic/ec/ec-sparsefile-heal.t
@@ -0,0 +1,136 @@
+#!/bin/bash
+
+# Verifies that healing sparse files on a disperse volume restores the data
+# while keeping the size and allocated block count identical on every brick.
+
+. $(dirname $0)/../../include.rc
+. $(dirname $0)/../../volume.rc
+. $(dirname $0)/../../ec.rc
+
+# Succeeds when the file has the same size and allocated block count on each
+# of the six bricks as it has on brick 0.
+function compare_brick_stats {
+    file_name=$1
+
+    SIZE=$(stat -c%s $B0/${V0}0/$file_name)
+    BLOCKS=$(stat -c%b $B0/${V0}0/$file_name)
+
+    echo $SIZE
+    echo $BLOCKS
+
+    for b in {0..5}; do
+        SIZE_IN_BRICK=$(stat -c%s $B0/${V0}${b}/$file_name)
+        BLOCKS_IN_BRICK=$(stat -c%b $B0/${V0}${b}/$file_name)
+
+        if [[ "$SIZE" -ne "$SIZE_IN_BRICK" || "$BLOCKS" -ne "$BLOCKS_IN_BRICK" ]]; then
+            return 1
+        fi
+    done
+
+    return 0
+}
+
+# Succeeds only when both checksums are non-empty and equal, so a failed
+# md5sum (empty output) cannot make the comparison pass.
+function compare_md5sum {
+    if [[ -n "$1" && "$1" == "$2" ]]; then
+        return 0
+    fi
+
+    return 1
+}
+
+cleanup
+
+TEST glusterd
+TEST pidof glusterd
+
+TEST $CLI volume create $V0 disperse 6 redundancy 2 $H0:$B0/${V0}{0..5}
+TEST $CLI volume start $V0
+
+TEST $GFS -s $H0 --volfile-id $V0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+
+TEST_FILE="sparse_test_file"
+
+TEST dd if=/dev/zero of=$M0/$TEST_FILE bs=1024 count=1024 seek=0
+TEST dd if=/dev/urandom of=$M0/$TEST_FILE bs=1024 count=1 seek=0 conv=notrunc
+TEST dd if=/dev/urandom of=$M0/$TEST_FILE bs=1024 count=1 seek=512 conv=notrunc
+TEST dd if=/dev/urandom of=$M0/$TEST_FILE bs=1024 count=1 seek=1023 conv=notrunc
+
+# Create another sparse file with different pattern
+TEST_FILE2="sparse_test_file2"
+TEST truncate -s 5M $M0/$TEST_FILE2
+TEST dd if=/dev/urandom of=$M0/$TEST_FILE2 bs=4096 count=1 seek=100 conv=notrunc
+TEST dd if=/dev/urandom of=$M0/$TEST_FILE2 bs=4096 count=1 seek=500 conv=notrunc
+
+sleep 2
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
+
+TEST compare_brick_stats $TEST_FILE
+TEST compare_brick_stats $TEST_FILE2
+
+# Remount with different ec-read-mask values (expected to select which bricks
+# serve reads) and check that the checksum is the same each time.
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --xlator-option="*.ec-read-mask=5:2:3:4" -s $H0 --volfile-id $V0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+checksum1=$(md5sum $M0/$TEST_FILE | cut -d' ' -f1)
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --xlator-option="*.ec-read-mask=0:2:3:4" -s $H0 --volfile-id $V0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+checksum2=$(md5sum $M0/$TEST_FILE | cut -d' ' -f1)
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --xlator-option="*.ec-read-mask=1:2:3:4" -s $H0 --volfile-id $V0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+checksum3=$(md5sum $M0/$TEST_FILE | cut -d' ' -f1)
+
+TEST compare_md5sum $checksum1 $checksum2
+TEST compare_md5sum $checksum1 $checksum3
+
+# Test hole punching case
+TEST_FILE3="to_be_sparsefile"
+TEST dd if=/dev/urandom of=$M0/$TEST_FILE3 bs=1M count=100
+
+TEST kill_brick $V0 $H0 $B0/${V0}0
+TEST kill_brick $V0 $H0 $B0/${V0}1
+
+TEST fallocate --punch-hole --keep-size -o 6144 -l 15728640 $M0/$TEST_FILE3
+
+TEST $CLI volume start $V0 force
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+
+TEST $CLI volume heal $V0
+EXPECT_WITHIN $HEAL_TIMEOUT "0" get_pending_heal_count $V0
+
+# TEST compare_brick_stats $TEST_FILE3
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --xlator-option="*.ec-read-mask=5:2:3:4" -s $H0 --volfile-id $V0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+checksum1=$(md5sum $M0/$TEST_FILE3 | cut -d' ' -f1)
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --xlator-option="*.ec-read-mask=0:2:3:4" -s $H0 --volfile-id $V0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+checksum2=$(md5sum $M0/$TEST_FILE3 | cut -d' ' -f1)
+
+EXPECT_WITHIN $UMOUNT_TIMEOUT "Y" force_umount $M0
+TEST $GFS --xlator-option="*.ec-read-mask=1:2:3:4" -s $H0 --volfile-id $V0 $M0
+EXPECT_WITHIN $CHILD_UP_TIMEOUT "6" ec_child_up_count $V0 0
+checksum3=$(md5sum $M0/$TEST_FILE3 | cut -d' ' -f1)
+
+TEST compare_md5sum $checksum1 $checksum2
+TEST compare_md5sum $checksum1 $checksum3
+
+cleanup
diff --git a/xlators/cluster/ec/src/ec-heal.c b/xlators/cluster/ec/src/ec-heal.c
index d0b66803f6c..04378203ad1 100644
--- a/xlators/cluster/ec/src/ec-heal.c
+++ b/xlators/cluster/ec/src/ec-heal.c
@@ -1839,8 +1839,7 @@ __ec_heal_data_prepare(call_frame_t *frame, ec_t *ec, fd_t *fd,
 
     for (i = 0; i < ec->nodes; i++) {
         if (healed_sinks[i]) {
-            if (replies[i].stat.ia_size)
-                trim[i] = 1;
+            trim[i] = 1;
         }
     }
 
@@ -2045,9 +2044,105 @@ ec_sync_heal_block(call_frame_t *frame, xlator_t *this, ec_heal_t *heal)
     return 0;
 }
 
+/* Callback of the GF_SEEK_HOLE request: stores the returned offset in
+ * heal->hole_offset (or the error in heal->error) and wakes the waiter. */
+static int32_t
+ec_heal_seek_hole_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, off_t offset,
+                      dict_t *xdata)
+{
+    ec_fop_data_t *fop = cookie;
+    ec_heal_t *heal = fop->data;
+
+    if (op_ret < 0) {
+        heal->error = op_errno;
+        goto out;
+    }
+    heal->hole_offset = offset;
+
+out:
+    syncbarrier_wake(&heal->barrier);
+    return 0;
+}
+
+/* Callback of the GF_SEEK_DATA request: stores the returned offset in
+ * heal->offset (or the error in heal->error) and wakes the waiter. */
+static int32_t
+ec_heal_seek_data_cbk(call_frame_t *frame, void *cookie, xlator_t *this,
+                      int32_t op_ret, int32_t op_errno, off_t offset,
+                      dict_t *xdata)
+{
+    ec_fop_data_t *fop = cookie;
+    ec_heal_t *heal = fop->data;
+
+    if (op_ret < 0) {
+        heal->error = op_errno;
+        goto out;
+    }
+    heal->offset = offset;
+
+out:
+    syncbarrier_wake(&heal->barrier);
+    return 0;
+}
+
+/* Heals one data region of a sparse file. Seeks from heal->offset to the next
+ * data and then to the hole that follows it, and rebuilds the blocks between
+ * both offsets. ENXIO from the data seek (nothing left to heal) sets
+ * heal->done. Returns 0 on success or a negative errno on failure. */
+static int32_t
+ec_sync_heal_sparse_region(call_frame_t *frame, ec_t *ec, ec_heal_t *heal)
+{
+    int ret = 0;
+    ec_seek(frame, ec->xl, heal->good, EC_MINIMUM_ONE, ec_heal_seek_data_cbk,
+            heal, heal->fd, heal->offset, GF_SEEK_DATA, NULL);
+    syncbarrier_wait(&heal->barrier, 1);
+
+    if (heal->error != 0) {
+        if (heal->error == ENXIO) {
+            heal->done = _gf_true;
+            goto out;
+        }
+        ret = -heal->error;
+        goto out;
+    }
+
+    ec_seek(frame, ec->xl, heal->good, EC_MINIMUM_ONE, ec_heal_seek_hole_cbk,
+            heal, heal->fd, heal->offset, GF_SEEK_HOLE, NULL);
+    syncbarrier_wait(&heal->barrier, 1);
+
+    if (heal->error != 0) {
+        ret = -heal->error;
+        goto out;
+    }
+
+    for (; (heal->offset < heal->hole_offset) && (!heal->done);
+         heal->offset += heal->size) {
+        uint64_t data_block_size = heal->hole_offset - heal->offset;
+        data_block_size = (data_block_size > ec->stripe_size) ? data_block_size
+                                                              : ec->stripe_size;
+
+        uint64_t original_heal_size = heal->size;
+
+        if (data_block_size < heal->size)
+            heal->size = data_block_size;
+
+        ret = ec_sync_heal_block(frame, ec->xl, heal);
+        if (ret < 0)
+            goto out;
+
+        heal->size = original_heal_size;
+    }
+    heal->offset = heal->hole_offset;
+
+out:
+    return ret;
+}
+
 int
 ec_rebuild_data(call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
-                unsigned char *sources, unsigned char *healed_sinks)
+                unsigned char *sources, unsigned char *healed_sinks,
+                int hole_exists)
 {
     ec_heal_t obj, *heal = &obj;
     int ret = 0;
@@ -2072,29 +2167,46 @@ ec_rebuild_data(call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
     heal->ia_type = IA_IFREG;
     LOCK_INIT(&heal->lock);
 
-    for (heal->offset = 0; (heal->offset < size) && !heal->done;
-         heal->offset += heal->size) {
-        /* We immediately abort any heal if a shutdown request has been
-         * received to avoid delays. The healing of this file will be
-         * restarted by another SHD or other client that accesses the
-         * file. */
-        if (ec->shutdown) {
-            gf_msg_debug(ec->xl->name, 0,
-                         "Cancelling heal because "
-                         "EC is stopping.");
-            ret = -ENOTCONN;
-            break;
+    if (!hole_exists) {
+        for (heal->offset = 0; (heal->offset < size) && !heal->done;
+             heal->offset += heal->size) {
+            /* We immediately abort any heal if a shutdown request has been
+             * received to avoid delays. The healing of this file will be
+             * restarted by another SHD or other client that accesses the
+             * file. */
+            if (ec->shutdown) {
+                gf_msg_debug(ec->xl->name, 0,
+                             "Cancelling heal because "
+                             "EC is stopping.");
+                ret = -ENOTCONN;
+                break;
+            }
+
+            gf_msg_debug(
+                ec->xl->name, 0,
+                "%s: sources: %d, sinks: "
+                "%d, offset: %" PRIu64 " bsize: %" PRIu64,
+                uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
+                EC_COUNT(healed_sinks, ec->nodes), heal->offset, heal->size);
+            ret = ec_sync_heal_block(frame, ec->xl, heal);
+            if (ret < 0)
+                break;
         }
 
-        gf_msg_debug(ec->xl->name, 0,
-                     "%s: sources: %d, sinks: "
-                     "%d, offset: %" PRIu64 " bsize: %" PRIu64,
-                     uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
-                     EC_COUNT(healed_sinks, ec->nodes), heal->offset,
-                     heal->size);
-        ret = ec_sync_heal_block(frame, ec->xl, heal);
-        if (ret < 0)
-            break;
+    } else {
+        heal->offset = 0;
+        while (!heal->done) {
+            if (ec->shutdown) {
+                gf_msg_debug(ec->xl->name, 0,
+                             "Cancelling heal because "
+                             "EC is stopping.");
+                ret = -ENOTCONN;
+                break;
+            }
+            ret = ec_sync_heal_sparse_region(frame, ec, heal);
+            if (ret < 0)
+                break;
+        }
     }
     memset(healed_sinks, 0, ec->nodes);
     ec_mask_to_char_array(heal->bad, healed_sinks, ec->nodes);
@@ -2103,14 +2215,14 @@ ec_rebuild_data(call_frame_t *frame, ec_t *ec, fd_t *fd, uint64_t size,
     syncbarrier_destroy(&heal->barrier);
     if (ret < 0)
         gf_msg_debug(ec->xl->name, -ret, "%s: heal failed",
                      uuid_utoa(fd->inode->gfid));
     return ret;
 }
 
 int
 __ec_heal_trim_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd,
                      unsigned char *healed_sinks, unsigned char *trim,
-                     uint64_t size)
+                     uint64_t size, int file_has_holes)
 {
     default_args_cbk_t *replies = NULL;
     unsigned char *output = NULL;
@@ -2127,6 +2239,14 @@ __ec_heal_trim_sinks(call_frame_t *frame, ec_t *ec, fd_t *fd,
     }
     trim_offset = size;
     ec_adjust_offset_up(ec, &trim_offset, _gf_true);
+    if (file_has_holes) {
+        ret = cluster_ftruncate(ec->xl_list, trim, ec->nodes, replies, output,
+                                frame, ec->xl, fd, 0, NULL);
+        for (i = 0; i < ec->nodes; i++) {
+            if (!output[i] && trim[i])
+                healed_sinks[i] = 0;
+        }
+    }
     ret = cluster_ftruncate(ec->xl_list, trim, ec->nodes, replies, output,
                             frame, ec->xl, fd, trim_offset, NULL);
     for (i = 0; i < ec->nodes; i++) {
@@ -2348,6 +2468,8 @@ __ec_heal_data(call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
     default_args_cbk_t *replies = NULL;
     int ret = 0;
     int source = 0;
+    int file_has_holes = 0;
+    struct iatt source_buf = {0};
 
     locked_on = alloca0(ec->nodes);
     output = alloca0(ec->nodes);
@@ -2371,10 +2493,16 @@ __ec_heal_data(call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
     }
 
     ret = __ec_heal_data_prepare(frame, ec, fd, locked_on, versions, dirty,
-                                 size, sources, healed_sinks, trim, NULL);
+                                 size, sources, healed_sinks, trim,
+                                 &source_buf);
     if (ret < 0)
         goto unlock;
 
+    /* ia_blocks is in 512-byte units: less allocated than ia_size => holes */
+    if (source_buf.ia_blocks * 512 < source_buf.ia_size) {
+        file_has_holes = 1;
+    }
+
     if (EC_COUNT(healed_sinks, ec->nodes) == 0) {
         ret = __ec_fd_data_adjust_versions(
             frame, ec, fd, sources, healed_sinks, versions, dirty, size);
@@ -2387,7 +2515,7 @@ __ec_heal_data(call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
             goto unlock;
 
         ret = __ec_heal_trim_sinks(frame, ec, fd, healed_sinks, trim,
-                                   size[source]);
+                                   size[source], file_has_holes);
     }
 unlock:
     cluster_uninodelk(ec->xl_list, locked_on, ec->nodes, replies, output, frame,
@@ -2404,7 +2532,8 @@ __ec_heal_data(call_frame_t *frame, ec_t *ec, fd_t *fd, unsigned char *heal_on,
                  uuid_utoa(fd->inode->gfid), EC_COUNT(sources, ec->nodes),
                  EC_COUNT(healed_sinks, ec->nodes));
 
-    ret = ec_rebuild_data(frame, ec, fd, size[source], sources, healed_sinks);
+    ret = ec_rebuild_data(frame, ec, fd, size[source], sources, healed_sinks,
+                          file_has_holes);
     if (ret < 0)
         goto out;
 
diff --git a/xlators/cluster/ec/src/ec-types.h b/xlators/cluster/ec/src/ec-types.h
index c89af3d3556..04b681f699c 100644
--- a/xlators/cluster/ec/src/ec-types.h
+++ b/xlators/cluster/ec/src/ec-types.h
@@ -577,6 +577,7 @@ struct _ec_heal {
     uint64_t offset;
     uint64_t size;
     uint64_t total_size;
+    uint64_t hole_offset;
 };
 
 struct subvol_healer {