diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 1fe9a553c37b..831f19a32e08 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: devices that support receiving integrity metadata. +What: /sys/block/<disk>/partscan +Date: May 2024 +Contact: Christoph Hellwig +Description: + The /sys/block/<disk>/partscan file reports if partition + scanning is enabled for the disk. It returns "1" if partition + scanning is enabled, or "0" if not. The value type is a 32-bit + unsigned integer, but only "0" and "1" are valid values. + + What: /sys/block/<disk>/<partition>/alignment_offset Date: April 2009 Contact: Martin K. Petersen @@ -584,18 +594,6 @@ Description: the data. If no such restriction exists, this file will contain '0'. This file is writable for testing purposes. - -What: /sys/block/<disk>/queue/throttle_sample_time -Date: March 2017 -Contact: linux-block@vger.kernel.org -Description: - [RW] This is the time window that blk-throttle samples data, in - millisecond. blk-throttle makes decision based on the - samplings. Lower time means cgroups have more smooth throughput, - but higher CPU overhead. This exists only when - CONFIG_BLK_DEV_THROTTLING_LOW is enabled. - - What: /sys/block/<disk>/queue/virt_boundary_mask Date: April 2021 Contact: linux-block@vger.kernel.org diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index f18c2ba871ef..fc0d89d4c1c5 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -76,7 +76,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y CONFIG_BLK_CGROUP_FC_APPID=y diff --git a/block/Kconfig b/block/Kconfig index 1de4682d48cc..dc12af58dbae 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -100,7 +100,6 @@ config BLK_DEV_WRITE_MOUNTED config BLK_DEV_ZONED bool "Zoned block device support" - select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables support for ZAC/ZBC/ZNS host-managed and host-aware zoned block @@ -120,17 +119,6 @@ config BLK_DEV_THROTTLING See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. -config BLK_DEV_THROTTLING_LOW - bool "Block throttling .low limit interface support (EXPERIMENTAL)" - depends on BLK_DEV_THROTTLING - help - Add .low limit interface for block throttling. The low limit is a best - effort limit to prioritize cgroups. Depending on the setting, the limit - can be used to protect cgroups in terms of bandwidth/iops and better - utilize disk resource. - - Note, this is an experimental interface and could be changed someday. - config BLK_WBT bool "Enable support for block device writeback throttling" help @@ -198,10 +186,6 @@ config BLK_DEBUG_FS Unless you are building a kernel for a tiny system, you should say Y here.
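For the new partscan attribute documented above, a minimal userspace reader could look like the sketch below. This is illustrative only and not part of the patch; the device name "sda" is an arbitrary placeholder and error handling is kept to a bare minimum.

#include <stdio.h>

int main(void)
{
	unsigned int enabled;
	FILE *f = fopen("/sys/block/sda/partscan", "r");	/* "sda" is a placeholder */

	if (!f)
		return 1;
	if (fscanf(f, "%u", &enabled) == 1)
		printf("partition scanning is %s\n", enabled ? "enabled" : "disabled");
	fclose(f);
	return 0;
}
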
-config BLK_DEBUG_FS_ZONED - bool - default BLK_DEBUG_FS && BLK_DEV_ZONED - config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" depends on KEYS diff --git a/block/Makefile b/block/Makefile index 46ada9dc8bbf..168150b9c510 100644 --- a/block/Makefile +++ b/block/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o -obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ diff --git a/block/bio.c b/block/bio.c index d24420ed1c4c..53f608028c78 100644 --- a/block/bio.c +++ b/block/bio.c @@ -345,18 +345,29 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); -struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, - unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +/** + * bio_chain_and_submit - submit a bio after chaining it to another one + * @prev: bio to chain and submit + * @new: bio to chain to + * + * If @prev is non-NULL, chain it to @new and submit it. + * + * Return: @new. + */ +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { - struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); - - if (bio) { - bio_chain(bio, new); - submit_bio(bio); + if (prev) { + bio_chain(prev, new); + submit_bio(prev); } - return new; } + +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +{ + return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); +} EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) @@ -1384,6 +1395,26 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); +static void bio_wait_end_io(struct bio *bio) +{ + complete(bio->bi_private); + bio_put(bio); +} + +/* + * bio_await_chain - ends @bio and waits for every chained bio to complete + */ +void bio_await_chain(struct bio *bio) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = bio_wait_end_io; + bio_endio(bio); + blk_wait_io(&done); +} + void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) @@ -1576,6 +1607,8 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; + blk_zone_bio_endio(bio); + rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { @@ -1596,7 +1629,6 @@ void bio_endio(struct bio *bio) goto again; } - blk_throtl_bio_endio(bio); /* release cgroup info */ bio_uninit(bio); if (bio->bi_end_io) diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index 3304e841df7c..a55fb0c53558 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { int i, ret; - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); - if (ret) { - while (--i >= 0) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); - return ret; - } + ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR); + if (ret) + return ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) atomic64_set(&rwstat->aux_cnt[i], 0); - } return 0; } EXPORT_SYMBOL_GPL(blkg_rwstat_init); void blkg_rwstat_exit(struct blkg_rwstat *rwstat) { - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - 
percpu_counter_destroy(&rwstat->cpu_cnt[i]); + percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR); } EXPORT_SYMBOL_GPL(blkg_rwstat_exit); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 059467086b13..4b1a35ab0ea4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work) /* as long as there are pending bios, @blkg can't go away */ spin_lock(&blkg->async_bio_lock); - bio_list_merge(&bios, &blkg->async_bios); - bio_list_init(&blkg->async_bios); + bio_list_merge_init(&bios, &blkg->async_bios); spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ @@ -1444,14 +1443,8 @@ int blkcg_init_disk(struct gendisk *disk) if (ret) goto err_destroy_all; - ret = blk_throtl_init(disk); - if (ret) - goto err_ioprio_exit; - return 0; -err_ioprio_exit: - blk_ioprio_exit(disk); err_destroy_all: blkg_destroy_all(disk); return ret; diff --git a/block/blk-core.c b/block/blk-core.c index b795ac177281..01186333c88e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -591,8 +591,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ - if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) || - !bio_zone_is_seq(bio)) + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* @@ -604,7 +603,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ - if (nr_sectors > q->limits.max_zone_append_sectors) + if (nr_sectors > queue_max_zone_append_sectors(q)) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; @@ -649,11 +648,13 @@ static void __submit_bio(struct bio *bio) static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; + struct blk_plug plug; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; + blk_start_plug(&plug); do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -687,19 +688,23 @@ static void __submit_bio_noacct(struct bio *bio) bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); + blk_finish_plug(&plug); current->bio_list = NULL; } static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; + struct blk_plug plug; current->bio_list = bio_list; + blk_start_plug(&plug); do { __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); + blk_finish_plug(&plug); current->bio_list = NULL; } @@ -910,12 +915,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; - /* - * As the requests that require a zone lock are not plugged in the - * first place, directly accessing the plug instead of using - * blk_mq_plug() should not have any consequences during flushing for - * zoned devices. - */ blk_flush_plug(current->plug, false); /* @@ -987,10 +986,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end) unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); - if (unlikely(time_after(now, stamp))) { - if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) - __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); - } + if (unlikely(time_after(now, stamp)) && + likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && + (end || part_in_flight(part))) + __part_stat_add(part, io_ticks, now - stamp); + if (part->bd_partno) { part = bdev_whole(part); goto again; diff --git a/block/blk-flush.c b/block/blk-flush.c index b0f314f4bc14..c17cf8ed8113 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,6 +130,8 @@ static void blk_flush_restore_request(struct request *rq) * original @rq->bio. Restore it. */ rq->bio = rq->biotail; + if (rq->bio) + rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; diff --git a/block/blk-lib.c b/block/blk-lib.c index a6954eafb8c8..442da9dad042 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -35,51 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } -int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) +struct bio *blk_alloc_discard_bio(struct block_device *bdev, + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask) { - struct bio *bio = *biop; - sector_t bs_mask; - - if (bdev_read_only(bdev)) - return -EPERM; - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; - - /* In case the discard granularity isn't set by buggy device driver */ - if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { - pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n", - bdev); - return -EOPNOTSUPP; - } - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; + sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector)); + struct bio *bio; - if (!nr_sects) - return -EINVAL; + if (!bio_sects) + return NULL; - while (nr_sects) { - sector_t req_sects = - min(nr_sects, bio_discard_limit(bdev, sector)); + bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask); + if (!bio) + return NULL; + bio->bi_iter.bi_sector = *sector; + bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT; + *sector += bio_sects; + *nr_sects -= bio_sects; + /* + * We can loop for a long time in here if someone does full device + * discards (like mkfs). Be nice and allow us to schedule out to avoid + * softlocking if preempt is disabled. + */ + cond_resched(); + return bio; +} - bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); - bio->bi_iter.bi_sector = sector; - bio->bi_iter.bi_size = req_sects << 9; - sector += req_sects; - nr_sects -= req_sects; - - /* - * We can loop for a long time in here, if someone does - * full device discards (like mkfs). Be nice and allow - * us to schedule out to avoid softlocking if preempt - * is disabled. 
- */ - cond_resched(); - } +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) +{ + struct bio *bio; - *biop = bio; + while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, + gfp_mask))) + *biop = bio_chain_and_submit(*biop, bio); return 0; } EXPORT_SYMBOL(__blkdev_issue_discard); diff --git a/block/blk-merge.c b/block/blk-merge.c index 4e3483a16b75..8534c35e0497 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, blkcg_bio_issue_init(split); bio_chain(split, bio); trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); submit_bio_noacct(bio); return split; } @@ -779,6 +780,8 @@ static void blk_account_io_merge_request(struct request *req) if (blk_do_io_stat(req)) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } @@ -972,13 +975,7 @@ static void blk_account_io_merge_bio(struct request *req) part_stat_unlock(); } -enum bio_merge_status { - BIO_MERGE_OK, - BIO_MERGE_NONE, - BIO_MERGE_FAILED, -}; - -static enum bio_merge_status bio_attempt_back_merge(struct request *req, +enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { const blk_opf_t ff = bio_failfast(bio); @@ -994,6 +991,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, blk_update_mixed_merge(req, bio, false); + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) + blk_zone_write_plug_bio_merged(bio); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -1009,6 +1009,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, { const blk_opf_t ff = bio_failfast(bio); + /* + * A front merge for writes to sequential zones of a zoned block device + * can happen only if the user submitted writes out of order. Do not + * merge such write to let it fail. + */ + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) + return BIO_MERGE_FAILED; + if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; @@ -1107,10 +1115,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct blk_plug *plug; + struct blk_plug *plug = current->plug; struct request *rq; - plug = blk_mq_plug(bio); if (!plug || rq_list_empty(plug->mq_list)) return false; diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c deleted file mode 100644 index a77b099c34b7..000000000000 --- a/block/blk-mq-debugfs-zoned.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2017 Western Digital Corporation or its affiliates.
- */ - -#include -#include "blk-mq-debugfs.h" - -int queue_zone_wlock_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - unsigned int i; - - if (!q->disk->seq_zones_wlock) - return 0; - - for (i = 0; i < q->disk->nr_zones; i++) - if (test_bit(i, q->disk->seq_zones_wlock)) - seq_printf(m, "%u\n", i); - - return 0; -} diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 94668e72ab09..770c0c2b72fa 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -160,7 +160,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, - { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, + { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL }, { }, }; @@ -256,7 +256,6 @@ static const char *const rqf_name[] = { RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), - RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(TIMED_OUT), RQF_NAME(RESV), }; diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 9c7d4b6117d4..c80e453e3014 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) } #endif -#ifdef CONFIG_BLK_DEBUG_FS_ZONED -int queue_zone_wlock_show(void *data, struct seq_file *m); +#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) +int queue_zone_wplugs_show(void *data, struct seq_file *m); #else -static inline int queue_zone_wlock_show(void *data, struct seq_file *m) +static inline int queue_zone_wplugs_show(void *data, struct seq_file *m) { return 0; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 32afb87efbd0..8e01e4b32e10 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -690,6 +691,8 @@ static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; + blk_zone_finish_request(rq); + if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* @@ -761,31 +764,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, blk_status_t error) -{ - if (unlikely(error)) { - bio->bi_status = error; - } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { - /* - * Partial zone append completions cannot be supported as the - * BIO fragments may end up not being written sequentially. 
- */ - if (bio->bi_iter.bi_size != nbytes) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_iter.bi_sector = rq->__sector; - } - - bio_advance(bio, nbytes); - - if (unlikely(rq->rq_flags & RQF_QUIET)) - bio_set_flag(bio, BIO_QUIET); - /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) - bio_endio(bio); -} - static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { @@ -845,8 +823,7 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - if (req_op(req) == REQ_OP_ZONE_APPEND) - bio->bi_iter.bi_sector = req->__sector; + blk_zone_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); @@ -889,6 +866,8 @@ static void blk_complete_request(struct request *req) bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { + bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; + bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); @@ -909,9 +888,8 @@ bool blk_update_request(struct request *req, blk_status_t error, if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); - if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET)) && - !test_bit(GD_DEAD, &req->q->disk->state)) { + if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && + !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } @@ -923,12 +901,33 @@ bool blk_update_request(struct request *req, blk_status_t error, struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - if (bio_bytes == bio->bi_iter.bi_size) + if (unlikely(error)) + bio->bi_status = error; + + if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; + } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { + /* + * Partial zone append completions cannot be supported + * as the BIO fragments may end up not being written + * sequentially. + */ + bio->bi_status = BLK_STS_IOERR; + } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - req_bio_endio(req, bio, bio_bytes, error); + if (unlikely(quiet)) + bio_set_flag(bio, BIO_QUIET); + + bio_advance(bio, bio_bytes); + + /* Don't actually finish bio if it's part of flush sequence */ + if (!bio->bi_iter.bi_size) { + blk_zone_update_request_bio(req, bio); + if (!is_flush) + bio_endio(bio); + } total_bytes += bio_bytes; nr_bytes -= bio_bytes; @@ -997,6 +996,8 @@ static inline void blk_account_io_done(struct request *req, u64 now) update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } @@ -1019,6 +1020,8 @@ static inline void blk_account_io_start(struct request *req) part_stat_lock(); update_io_ticks(req->part, jiffies, false); + part_stat_local_inc(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } @@ -1330,11 +1333,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) blk_account_io_start(rq); - /* - * As plugging can be enabled for passthrough requests on a zoned - * device, directly accessing the plug instead of using blk_mq_plug() - * should not have any consequences. 
- */ if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; @@ -1921,19 +1919,6 @@ static void blk_mq_handle_dev_resource(struct request *rq, __blk_mq_requeue_request(rq); } -static void blk_mq_handle_zone_resource(struct request *rq, - struct list_head *zone_list) -{ - /* - * If we end up here it is because we cannot dispatch a request to a - * specific zone due to LLD level zone-write locking or other zone - * related resource not being available. In this case, set the request - * aside in zone_list for retrying it later. - */ - list_add(&rq->queuelist, zone_list); - __blk_mq_requeue_request(rq); -} - enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, @@ -2019,7 +2004,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; - LIST_HEAD(zone_list); bool needs_resource = false; if (list_empty(list)) @@ -2061,23 +2045,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; - case BLK_STS_ZONE_RESOURCE: - /* - * Move the request to zone_list and keep going through - * the dispatch list to find more requests the drive can - * accept. - */ - blk_mq_handle_zone_resource(rq, &zone_list); - needs_resource = true; - break; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: - if (!list_empty(&zone_list)) - list_splice_tail_init(&zone_list, list); - /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ @@ -2163,6 +2135,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) return cpu; } +/* + * ->next_cpu is always calculated from hctx->cpumask, so simply use + * it for speeding up the check + */ +static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) +{ + return hctx->next_cpu >= nr_cpu_ids; +} + /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. @@ -2174,7 +2155,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) bool tried = false; int next_cpu = hctx->next_cpu; - if (hctx->queue->nr_hw_queues == 1) + /* Switch to unbound if no allowable CPUs in this hctx */ + if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { @@ -2948,22 +2930,37 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - struct blk_plug *plug = blk_mq_plug(bio); + struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; unsigned int nr_segs = 1; struct request *rq; blk_status_t ret; + /* + * If the plug has a cached request for this queue, try to use it. + */ + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); + + /* + * A BIO that was released from a zone write plug has already been + * through the preparation in this function, already holds a reference + * on the queue usage counter, and is the only write BIO in-flight for + * the target zone. Go straight to preparing a request for it. + */ + if (bio_zone_write_plugging(bio)) { + nr_segs = bio->__bi_nr_segments; + if (rq) + blk_queue_exit(q); + goto new_request; + } + bio = blk_queue_bounce(bio, q); /* - * If the plug has a cached request for this queue, try use it. 
- * * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. */ - rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); if (!rq) { if (unlikely(bio_queue_enter(bio))) return; @@ -2980,6 +2977,10 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; + if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + +new_request: if (!rq) { rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) @@ -3002,6 +3003,9 @@ void blk_mq_submit_bio(struct bio *bio) return; } + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_init_request(rq); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; @@ -3483,14 +3487,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) return data.has_rq; } -static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, - struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, + unsigned int this_cpu) { - if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) - return false; - if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) - return false; - return true; + enum hctx_type type = hctx->type; + int cpu; + + /* + * hctx->cpumask has to rule out isolated CPUs, but userspace still + * might submit IOs on these isolated CPUs, so use the queue map to + * check if all CPUs mapped to this hctx are offline + */ + for_each_online_cpu(cpu) { + struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, + type, cpu); + + if (h != hctx) + continue; + + /* this hctx has at least one online CPU */ + if (this_cpu != cpu) + return true; + } + + return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) @@ -3498,8 +3518,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); - if (!cpumask_test_cpu(cpu, hctx->cpumask) || - !blk_mq_last_cpu_in_hctx(cpu, hctx)) + if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* @@ -3907,6 +3926,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) } queue_for_each_hw_ctx(q, hctx, i) { + int cpu; + /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -3933,6 +3954,15 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); + /* + * Rule out isolated CPUs from hctx->cpumask to avoid + * running block kworker on isolated CPUs + */ + for_each_cpu(cpu, hctx->cpumask) { + if (cpu_is_isolated(cpu)) + cpumask_clear_cpu(cpu, hctx->cpumask); + } + /* * Initialize batch roundrobin counts */ diff --git a/block/blk-mq.h b/block/blk-mq.h index f75a9ecfebde..260beea8e332 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -365,37 +365,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) qmap->mq_map[cpu] = 0; } -/* - * blk_mq_plug() - Get caller context plug - * @bio : the bio being submitted by the caller context - * - * Plugging, by design, may delay the insertion of BIOs into the elevator in - * order to increase BIO merging opportunities. This however can cause BIO - * insertion order to change from the order in which submit_bio() is being - * executed in the case of multiple contexts concurrently issuing BIOs to a - * device, even if these context are synchronized to tightly control BIO issuing - * order. 
While this is not a problem with regular block devices, this ordering - * change can cause write BIO failures with zoned block devices as these - * require sequential write patterns to zones. Prevent this from happening by - * ignoring the plug state of a BIO issuing context if it is for a zoned block - * device and the BIO to plug is a write operation. - * - * Return current->plug if the bio can be plugged and NULL otherwise - */ -static inline struct blk_plug *blk_mq_plug( struct bio *bio) -{ - /* Zoned block device write operation case: do not plug the BIO */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) - return NULL; - - /* - * For regular block devices or read operations, use the context plug - * which may be NULL if blk_start_plug() was not executed. - */ - return current->plug; -} - /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { diff --git a/block/blk-settings.c b/block/blk-settings.c index 9d6033e01f2e..ebba05a2bc7f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -411,24 +411,32 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); * blk_queue_max_zone_append_sectors - set max sectors for a single zone append * @q: the request queue for the device * @max_zone_append_sectors: maximum number of sectors to write per command + * + * Sets the maximum number of sectors allowed for zone append commands. If + * Specifying 0 for @max_zone_append_sectors indicates that the queue does + * not natively support zone append operations and that the block layer must + * emulate these operations using regular writes. **/ void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors) { - unsigned int max_sectors; + unsigned int max_sectors = 0; if (WARN_ON(!blk_queue_is_zoned(q))) return; - max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); - max_sectors = min(q->limits.chunk_sectors, max_sectors); + if (max_zone_append_sectors) { + max_sectors = min(q->limits.max_hw_sectors, + max_zone_append_sectors); + max_sectors = min(q->limits.chunk_sectors, max_sectors); - /* - * Signal eventual driver bugs resulting in the max_zone_append sectors limit - * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, - * or the max_hw_sectors limit not set. - */ - WARN_ON(!max_sectors); + /* + * Signal eventual driver bugs resulting in the max_zone_append + * sectors limit being 0 due to the chunk_sectors limit (zone + * size) not set or the max_hw_sectors limit not set. 
+ */ + WARN_ON_ONCE(!max_sectors); + } q->limits.max_zone_append_sectors = max_sectors; } @@ -755,8 +763,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); - t->max_zone_append_sectors = min(t->max_zone_append_sectors, - b->max_zone_append_sectors); + t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), + queue_limits_max_zone_append_sectors(b)); t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, @@ -1043,22 +1051,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); -/** - * blk_queue_required_elevator_features - Set a queue required elevator features - * @q: the request queue for the target device - * @features: Required elevator features OR'ed together - * - * Tell the block layer that for the device controlled through @q, only the - * only elevators that can be used are those that implement at least the set of - * features specified by @features. - */ -void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features) -{ - q->required_elevator_features = features; -} -EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); - /** * blk_queue_can_use_dma_map_merging - configure queue for merging segments. * @q: the request queue for the device diff --git a/block/blk-stat.c b/block/blk-stat.c index e42c263e53fb..eaf60097bbe1 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -57,9 +57,6 @@ void blk_stat_add(struct request *rq, u64 now) value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; - if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE) - blk_throtl_stat_add(rq, value); - rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8c8f69d8ba48..f0f9314ab65c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -224,7 +224,7 @@ static ssize_t queue_zone_write_granularity_show(struct request_queue *q, static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { - unsigned long long max_sectors = q->limits.max_zone_append_sectors; + unsigned long long max_sectors = queue_max_zone_append_sectors(q); return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); } @@ -516,10 +516,6 @@ QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); -#endif - /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, @@ -640,9 +636,6 @@ static struct attribute *queue_attrs[] = { &queue_fua_entry.attr, &queue_dax_entry.attr, &queue_poll_delay_entry.attr, -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - &blk_throtl_sample_time_entry.attr, -#endif &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, NULL, @@ -814,7 +807,6 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk); - blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); diff --git a/block/blk-throttle.c 
b/block/blk-throttle.c index f4850a6f860b..80aaca18bfb0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -25,18 +25,6 @@ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) -#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ -#define MIN_THROTL_BPS (320 * 1024) -#define MIN_THROTL_IOPS (10) -#define DFL_LATENCY_TARGET (-1L) -#define DFL_IDLE_THRESHOLD (0) -#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */ -#define LATENCY_FILTERED_SSD (0) -/* - * For HD, very small latency comes from sequential IO. Such IO is helpless to - * help determine if its IO is impacted by others, hence we ignore the IO - */ -#define LATENCY_FILTERED_HD (1000L) /* 1ms */ /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; @@ -70,19 +58,6 @@ struct throtl_data /* Work for dispatching throttled bios */ struct work_struct dispatch_work; - unsigned int limit_index; - bool limit_valid[LIMIT_CNT]; - - unsigned long low_upgrade_time; - unsigned long low_downgrade_time; - - unsigned int scale; - - struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; - struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; - struct latency_bucket __percpu *latency_buckets[2]; - unsigned long last_calculate_time; - unsigned long filtered_latency; bool track_bio_latency; }; @@ -126,84 +101,24 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) return container_of(sq, struct throtl_data, service_queue); } -/* - * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to - * make the IO dispatch more smooth. - * Scale up: linearly scale up according to elapsed time since upgrade. For - * every throtl_slice, the limit scales up 1/2 .low limit till the - * limit hits .max limit - * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit - */ -static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) -{ - /* arbitrary value to avoid too big scale */ - if (td->scale < 4096 && time_after_eq(jiffies, - td->low_upgrade_time + td->scale * td->throtl_slice)) - td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; - - return low + (low >> 1) * td->scale; -} - static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td; - uint64_t ret; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; - td = tg->td; - ret = tg->bps[rw][td->limit_index]; - if (ret == 0 && td->limit_index == LIMIT_LOW) { - /* intermediate node or iops isn't 0 */ - if (!list_empty(&blkg->blkcg->css.children) || - tg->iops[rw][td->limit_index]) - return U64_MAX; - else - return MIN_THROTL_BPS; - } - - if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && - tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { - uint64_t adjusted; - - adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); - ret = min(tg->bps[rw][LIMIT_MAX], adjusted); - } - return ret; + return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td; - unsigned int ret; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; - td = tg->td; - ret = tg->iops[rw][td->limit_index]; - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { - /* intermediate node or bps isn't 0 */ - if (!list_empty(&blkg->blkcg->css.children) || - tg->bps[rw][td->limit_index]) - return UINT_MAX; - else - return 
MIN_THROTL_IOPS; - } - - if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && - tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { - uint64_t adjusted; - - adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); - if (adjusted > UINT_MAX) - adjusted = UINT_MAX; - ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); - } - return ret; + return tg->iops[rw]; } #define request_bucket_index(sectors) \ @@ -359,20 +274,10 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, } RB_CLEAR_NODE(&tg->rb_node); - tg->bps[READ][LIMIT_MAX] = U64_MAX; - tg->bps[WRITE][LIMIT_MAX] = U64_MAX; - tg->iops[READ][LIMIT_MAX] = UINT_MAX; - tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; - tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; - tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; - tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; - tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; - /* LIMIT_LOW will have default value 0 */ - - tg->latency_target = DFL_LATENCY_TARGET; - tg->latency_target_conf = DFL_LATENCY_TARGET; - tg->idletime_threshold = DFL_IDLE_THRESHOLD; - tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; + tg->bps[READ] = U64_MAX; + tg->bps[WRITE] = U64_MAX; + tg->iops[READ] = UINT_MAX; + tg->iops[WRITE] = UINT_MAX; return &tg->pd; @@ -418,18 +323,15 @@ static void throtl_pd_init(struct blkg_policy_data *pd) static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); - struct throtl_data *td = tg->td; int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || - (td->limit_valid[td->limit_index] && - tg_iops_limit(tg, rw) != UINT_MAX); + tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || - (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX)); + tg_bps_limit(tg, rw) != U64_MAX; } } @@ -443,49 +345,6 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static void blk_throtl_update_limit_valid(struct throtl_data *td) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - bool low_valid = false; - - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || - tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { - low_valid = true; - break; - } - } - rcu_read_unlock(); - - td->limit_valid[LIMIT_LOW] = low_valid; -} -#else -static inline void blk_throtl_update_limit_valid(struct throtl_data *td) -{ -} -#endif - -static void throtl_upgrade_state(struct throtl_data *td); -static void throtl_pd_offline(struct blkg_policy_data *pd) -{ - struct throtl_grp *tg = pd_to_tg(pd); - - tg->bps[READ][LIMIT_LOW] = 0; - tg->bps[WRITE][LIMIT_LOW] = 0; - tg->iops[READ][LIMIT_LOW] = 0; - tg->iops[WRITE][LIMIT_LOW] = 0; - - blk_throtl_update_limit_valid(tg->td); - - if (!tg->td->limit_valid[tg->td->limit_index]) - throtl_upgrade_state(tg->td); -} - static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); @@ -1151,8 +1010,6 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) return nr_disp; } -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg); /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being 
serviced @@ -1189,9 +1046,6 @@ static void throtl_pending_timer_fn(struct timer_list *t) if (!q->root_blkg) goto out_unlock; - if (throtl_can_upgrade(td, NULL)) - throtl_upgrade_state(td); - again: parent_sq = sq->parent_sq; dispatched = false; @@ -1331,22 +1185,12 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) blkg_for_each_descendant_pre(blkg, pos_css, global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); - struct throtl_grp *parent_tg; tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; - parent_tg = blkg_to_tg(blkg->parent); - /* - * make sure all children has lower idle time threshold and - * higher latency target - */ - this_tg->idletime_threshold = min(this_tg->idletime_threshold, - parent_tg->idletime_threshold); - this_tg->latency_target = max(this_tg->latency_target, - parent_tg->latency_target); } rcu_read_unlock(); @@ -1367,6 +1211,53 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) } } +static int blk_throtl_init(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct throtl_data *td; + int ret; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); + throtl_service_queue_init(&td->service_queue); + + /* + * Freeze queue before activating policy, to synchronize with IO path, + * which is protected by 'q_usage_counter'. + */ + blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + + q->td = td; + td->queue = q; + + /* activate policy */ + ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); + if (ret) { + q->td = NULL; + kfree(td); + goto out; + } + + if (blk_queue_nonrot(q)) + td->throtl_slice = DFL_THROTL_SLICE_SSD; + else + td->throtl_slice = DFL_THROTL_SLICE_HD; + td->track_bio_latency = !queue_is_mq(q); + if (!td->track_bio_latency) + blk_stat_enable_accounting(q); + +out: + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + + return ret; +} + + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1378,6 +1269,16 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, blkg_conf_init(&ctx, buf); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out_finish; + + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { + ret = blk_throtl_init(ctx.bdev->bd_disk); + if (ret) + goto out_finish; + } + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; @@ -1444,25 +1345,25 @@ static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", - .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", - .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", - .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", - .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), + .private = 
offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, @@ -1494,61 +1395,43 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); - char bufs[4][21] = { "max", "max", "max", "max" }; u64 bps_dft; unsigned int iops_dft; - char idle_time[26] = ""; - char latency_time[26] = ""; if (!dname) return 0; - if (off == LIMIT_LOW) { - bps_dft = 0; - iops_dft = 0; - } else { - bps_dft = U64_MAX; - iops_dft = UINT_MAX; - } + bps_dft = U64_MAX; + iops_dft = UINT_MAX; - if (tg->bps_conf[READ][off] == bps_dft && - tg->bps_conf[WRITE][off] == bps_dft && - tg->iops_conf[READ][off] == iops_dft && - tg->iops_conf[WRITE][off] == iops_dft && - (off != LIMIT_LOW || - (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && - tg->latency_target_conf == DFL_LATENCY_TARGET))) + if (tg->bps_conf[READ] == bps_dft && + tg->bps_conf[WRITE] == bps_dft && + tg->iops_conf[READ] == iops_dft && + tg->iops_conf[WRITE] == iops_dft) return 0; - if (tg->bps_conf[READ][off] != U64_MAX) - snprintf(bufs[0], sizeof(bufs[0]), "%llu", - tg->bps_conf[READ][off]); - if (tg->bps_conf[WRITE][off] != U64_MAX) - snprintf(bufs[1], sizeof(bufs[1]), "%llu", - tg->bps_conf[WRITE][off]); - if (tg->iops_conf[READ][off] != UINT_MAX) - snprintf(bufs[2], sizeof(bufs[2]), "%u", - tg->iops_conf[READ][off]); - if (tg->iops_conf[WRITE][off] != UINT_MAX) - snprintf(bufs[3], sizeof(bufs[3]), "%u", - tg->iops_conf[WRITE][off]); - if (off == LIMIT_LOW) { - if (tg->idletime_threshold_conf == ULONG_MAX) - strcpy(idle_time, " idle=max"); - else - snprintf(idle_time, sizeof(idle_time), " idle=%lu", - tg->idletime_threshold_conf); + seq_printf(sf, "%s", dname); + if (tg->bps_conf[READ] == U64_MAX) + seq_printf(sf, " rbps=max"); + else + seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]); - if (tg->latency_target_conf == ULONG_MAX) - strcpy(latency_time, " latency=max"); - else - snprintf(latency_time, sizeof(latency_time), - " latency=%lu", tg->latency_target_conf); - } + if (tg->bps_conf[WRITE] == U64_MAX) + seq_printf(sf, " wbps=max"); + else + seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]); + + if (tg->iops_conf[READ] == UINT_MAX) + seq_printf(sf, " riops=max"); + else + seq_printf(sf, " riops=%u", tg->iops_conf[READ]); - seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", - dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, - latency_time); + if (tg->iops_conf[WRITE] == UINT_MAX) + seq_printf(sf, " wiops=max"); + else + seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]); + + seq_printf(sf, "\n"); return 0; } @@ -1566,13 +1449,20 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; - unsigned long idle_time; - unsigned long latency_time; int ret; - int index = of_cft(of)->private; blkg_conf_init(&ctx, buf); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out_finish; + + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { + ret = blk_throtl_init(ctx.bdev->bd_disk); + if (ret) + goto out_finish; + } + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; @@ -1580,13 +1470,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); - v[0] = tg->bps_conf[READ][index]; - v[1] = tg->bps_conf[WRITE][index]; - v[2] = tg->iops_conf[READ][index]; - v[3] = tg->iops_conf[WRITE][index]; + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = 
tg->iops[READ]; + v[3] = tg->iops[WRITE]; - idle_time = tg->idletime_threshold_conf; - latency_time = tg->latency_target_conf; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; @@ -1618,60 +1506,16 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops") && val > 1) v[3] = min_t(u64, val, UINT_MAX); - else if (off == LIMIT_LOW && !strcmp(tok, "idle")) - idle_time = val; - else if (off == LIMIT_LOW && !strcmp(tok, "latency")) - latency_time = val; else goto out_finish; } - tg->bps_conf[READ][index] = v[0]; - tg->bps_conf[WRITE][index] = v[1]; - tg->iops_conf[READ][index] = v[2]; - tg->iops_conf[WRITE][index] = v[3]; + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; - if (index == LIMIT_MAX) { - tg->bps[READ][index] = v[0]; - tg->bps[WRITE][index] = v[1]; - tg->iops[READ][index] = v[2]; - tg->iops[WRITE][index] = v[3]; - } - tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], - tg->bps_conf[READ][LIMIT_MAX]); - tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], - tg->bps_conf[WRITE][LIMIT_MAX]); - tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], - tg->iops_conf[READ][LIMIT_MAX]); - tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], - tg->iops_conf[WRITE][LIMIT_MAX]); - tg->idletime_threshold_conf = idle_time; - tg->latency_target_conf = latency_time; - - /* force user to configure all settings for low limit */ - if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || - tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || - tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || - tg->latency_target_conf == DFL_LATENCY_TARGET) { - tg->bps[READ][LIMIT_LOW] = 0; - tg->bps[WRITE][LIMIT_LOW] = 0; - tg->iops[READ][LIMIT_LOW] = 0; - tg->iops[WRITE][LIMIT_LOW] = 0; - tg->idletime_threshold = DFL_IDLE_THRESHOLD; - tg->latency_target = DFL_LATENCY_TARGET; - } else if (index == LIMIT_LOW) { - tg->idletime_threshold = tg->idletime_threshold_conf; - tg->latency_target = tg->latency_target_conf; - } - - blk_throtl_update_limit_valid(tg->td); - if (tg->td->limit_valid[LIMIT_LOW]) { - if (index == LIMIT_LOW) - tg->td->limit_index = LIMIT_LOW; - } else - tg->td->limit_index = LIMIT_MAX; - tg_conf_updated(tg, index == LIMIT_LOW && - tg->td->limit_valid[LIMIT_LOW]); + tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); @@ -1679,21 +1523,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, } static struct cftype throtl_files[] = { -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - { - .name = "low", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = tg_print_limit, - .write = tg_set_limit, - .private = LIMIT_LOW, - }, -#endif { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, - .private = LIMIT_MAX, }, { } /* terminate */ }; @@ -1712,7 +1546,6 @@ struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; @@ -1722,6 +1555,9 @@ void blk_throtl_cancel_bios(struct gendisk *disk) struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; + if (!blk_throtl_activated(q)) + return; + spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. 
@@ -1761,418 +1597,6 @@ void blk_throtl_cancel_bios(struct gendisk *disk) spin_unlock_irq(&q->queue_lock); } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) -{ - unsigned long rtime = jiffies, wtime = jiffies; - - if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) - rtime = tg->last_low_overflow_time[READ]; - if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) - wtime = tg->last_low_overflow_time[WRITE]; - return min(rtime, wtime); -} - -static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) -{ - struct throtl_service_queue *parent_sq; - struct throtl_grp *parent = tg; - unsigned long ret = __tg_last_low_overflow_time(tg); - - while (true) { - parent_sq = parent->service_queue.parent_sq; - parent = sq_to_tg(parent_sq); - if (!parent) - break; - - /* - * The parent doesn't have low limit, it always reaches low - * limit. Its overflow time is useless for children - */ - if (!parent->bps[READ][LIMIT_LOW] && - !parent->iops[READ][LIMIT_LOW] && - !parent->bps[WRITE][LIMIT_LOW] && - !parent->iops[WRITE][LIMIT_LOW]) - continue; - if (time_after(__tg_last_low_overflow_time(parent), ret)) - ret = __tg_last_low_overflow_time(parent); - } - return ret; -} - -static bool throtl_tg_is_idle(struct throtl_grp *tg) -{ - /* - * cgroup is idle if: - * - single idle is too long, longer than a fixed value (in case user - * configure a too big threshold) or 4 times of idletime threshold - * - average think time is more than threshold - * - IO latency is largely below threshold - */ - unsigned long time; - bool ret; - - time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); - ret = tg->latency_target == DFL_LATENCY_TARGET || - tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (blk_time_get_ns() >> 10) - tg->last_finish_time > time || - tg->avg_idletime > tg->idletime_threshold || - (tg->latency_target && tg->bio_cnt && - tg->bad_bio_cnt * 5 < tg->bio_cnt); - throtl_log(&tg->service_queue, - "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", - tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, - tg->bio_cnt, ret, tg->td->scale); - return ret; -} - -static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw) -{ - struct throtl_service_queue *sq = &tg->service_queue; - bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW]; - - /* - * if low limit is zero, low limit is always reached. - * if low limit is non-zero, we can check if there is any request - * is queued to determine if low limit is reached as we throttle - * request according to limit. 
- */ - return !limit || sq->nr_queued[rw]; -} - -static bool throtl_tg_can_upgrade(struct throtl_grp *tg) -{ - /* - * cgroup reaches low limit when low limit of READ and WRITE are - * both reached, it's ok to upgrade to next limit if cgroup reaches - * low limit - */ - if (throtl_low_limit_reached(tg, READ) && - throtl_low_limit_reached(tg, WRITE)) - return true; - - if (time_after_eq(jiffies, - tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && - throtl_tg_is_idle(tg)) - return true; - return false; -} - -static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) -{ - while (true) { - if (throtl_tg_can_upgrade(tg)) - return true; - tg = sq_to_tg(tg->service_queue.parent_sq); - if (!tg || !tg_to_blkg(tg)->parent) - return false; - } - return false; -} - -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - if (td->limit_index != LIMIT_LOW) - return false; - - if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) - return false; - - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - if (tg == this_tg) - continue; - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) - continue; - if (!throtl_hierarchy_can_upgrade(tg)) { - rcu_read_unlock(); - return false; - } - } - rcu_read_unlock(); - return true; -} - -static void throtl_upgrade_check(struct throtl_grp *tg) -{ - unsigned long now = jiffies; - - if (tg->td->limit_index != LIMIT_LOW) - return; - - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) - return; - - tg->last_check_time = now; - - if (!time_after_eq(now, - __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) - return; - - if (throtl_can_upgrade(tg->td, NULL)) - throtl_upgrade_state(tg->td); -} - -static void throtl_upgrade_state(struct throtl_data *td) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - throtl_log(&td->service_queue, "upgrade to max"); - td->limit_index = LIMIT_MAX; - td->low_upgrade_time = jiffies; - td->scale = 0; - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_service_queue *sq = &tg->service_queue; - - tg->disptime = jiffies - 1; - throtl_select_dispatch(sq); - throtl_schedule_next_dispatch(sq, true); - } - rcu_read_unlock(); - throtl_select_dispatch(&td->service_queue); - throtl_schedule_next_dispatch(&td->service_queue, true); - queue_work(kthrotld_workqueue, &td->dispatch_work); -} - -static void throtl_downgrade_state(struct throtl_data *td) -{ - td->scale /= 2; - - throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); - if (td->scale) { - td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; - return; - } - - td->limit_index = LIMIT_LOW; - td->low_downgrade_time = jiffies; -} - -static bool throtl_tg_can_downgrade(struct throtl_grp *tg) -{ - struct throtl_data *td = tg->td; - unsigned long now = jiffies; - - /* - * If cgroup is below low limit, consider downgrade and throttle other - * cgroups - */ - if (time_after_eq(now, tg_last_low_overflow_time(tg) + - td->throtl_slice) && - (!throtl_tg_is_idle(tg) || - !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) - return true; - return false; -} - -static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) -{ - struct throtl_data *td = tg->td; - - if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice)) - return false; - 
- while (true) { - if (!throtl_tg_can_downgrade(tg)) - return false; - tg = sq_to_tg(tg->service_queue.parent_sq); - if (!tg || !tg_to_blkg(tg)->parent) - break; - } - return true; -} - -static void throtl_downgrade_check(struct throtl_grp *tg) -{ - uint64_t bps; - unsigned int iops; - unsigned long elapsed_time; - unsigned long now = jiffies; - - if (tg->td->limit_index != LIMIT_MAX || - !tg->td->limit_valid[LIMIT_LOW]) - return; - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) - return; - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) - return; - - elapsed_time = now - tg->last_check_time; - tg->last_check_time = now; - - if (time_before(now, tg_last_low_overflow_time(tg) + - tg->td->throtl_slice)) - return; - - if (tg->bps[READ][LIMIT_LOW]) { - bps = tg->last_bytes_disp[READ] * HZ; - do_div(bps, elapsed_time); - if (bps >= tg->bps[READ][LIMIT_LOW]) - tg->last_low_overflow_time[READ] = now; - } - - if (tg->bps[WRITE][LIMIT_LOW]) { - bps = tg->last_bytes_disp[WRITE] * HZ; - do_div(bps, elapsed_time); - if (bps >= tg->bps[WRITE][LIMIT_LOW]) - tg->last_low_overflow_time[WRITE] = now; - } - - if (tg->iops[READ][LIMIT_LOW]) { - iops = tg->last_io_disp[READ] * HZ / elapsed_time; - if (iops >= tg->iops[READ][LIMIT_LOW]) - tg->last_low_overflow_time[READ] = now; - } - - if (tg->iops[WRITE][LIMIT_LOW]) { - iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; - if (iops >= tg->iops[WRITE][LIMIT_LOW]) - tg->last_low_overflow_time[WRITE] = now; - } - - /* - * If cgroup is below low limit, consider downgrade and throttle other - * cgroups - */ - if (throtl_hierarchy_can_downgrade(tg)) - throtl_downgrade_state(tg->td); - - tg->last_bytes_disp[READ] = 0; - tg->last_bytes_disp[WRITE] = 0; - tg->last_io_disp[READ] = 0; - tg->last_io_disp[WRITE] = 0; -} - -static void blk_throtl_update_idletime(struct throtl_grp *tg) -{ - unsigned long now; - unsigned long last_finish_time = tg->last_finish_time; - - if (last_finish_time == 0) - return; - - now = blk_time_get_ns() >> 10; - if (now <= last_finish_time || - last_finish_time == tg->checked_last_finish_time) - return; - - tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; - tg->checked_last_finish_time = last_finish_time; -} - -static void throtl_update_latency_buckets(struct throtl_data *td) -{ - struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; - int i, cpu, rw; - unsigned long last_latency[2] = { 0 }; - unsigned long latency[2]; - - if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW]) - return; - if (time_before(jiffies, td->last_calculate_time + HZ)) - return; - td->last_calculate_time = jiffies; - - memset(avg_latency, 0, sizeof(avg_latency)); - for (rw = READ; rw <= WRITE; rw++) { - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; - - for_each_possible_cpu(cpu) { - struct latency_bucket *bucket; - - /* this isn't race free, but ok in practice */ - bucket = per_cpu_ptr(td->latency_buckets[rw], - cpu); - tmp->total_latency += bucket[i].total_latency; - tmp->samples += bucket[i].samples; - bucket[i].total_latency = 0; - bucket[i].samples = 0; - } - - if (tmp->samples >= 32) { - int samples = tmp->samples; - - latency[rw] = tmp->total_latency; - - tmp->total_latency = 0; - tmp->samples = 0; - latency[rw] /= samples; - if (latency[rw] == 0) - continue; - avg_latency[rw][i].latency = latency[rw]; - } - } - } - - for (rw = READ; rw <= WRITE; rw++) { - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - if (!avg_latency[rw][i].latency) { - if 
(td->avg_buckets[rw][i].latency < last_latency[rw]) - td->avg_buckets[rw][i].latency = - last_latency[rw]; - continue; - } - - if (!td->avg_buckets[rw][i].valid) - latency[rw] = avg_latency[rw][i].latency; - else - latency[rw] = (td->avg_buckets[rw][i].latency * 7 + - avg_latency[rw][i].latency) >> 3; - - td->avg_buckets[rw][i].latency = max(latency[rw], - last_latency[rw]); - td->avg_buckets[rw][i].valid = true; - last_latency[rw] = td->avg_buckets[rw][i].latency; - } - } - - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) - throtl_log(&td->service_queue, - "Latency bucket %d: read latency=%ld, read valid=%d, " - "write latency=%ld, write valid=%d", i, - td->avg_buckets[READ][i].latency, - td->avg_buckets[READ][i].valid, - td->avg_buckets[WRITE][i].latency, - td->avg_buckets[WRITE][i].valid); -} -#else -static inline void throtl_update_latency_buckets(struct throtl_data *td) -{ -} - -static void blk_throtl_update_idletime(struct throtl_grp *tg) -{ -} - -static void throtl_downgrade_check(struct throtl_grp *tg) -{ -} - -static void throtl_upgrade_check(struct throtl_grp *tg) -{ -} - -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg) -{ - return false; -} - -static void throtl_upgrade_state(struct throtl_data *td) -{ -} -#endif - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -2185,21 +1609,12 @@ bool __blk_throtl_bio(struct bio *bio) struct throtl_data *td = tg->td; rcu_read_lock(); - spin_lock_irq(&q->queue_lock); - - throtl_update_latency_buckets(td); - - blk_throtl_update_idletime(tg); - sq = &tg->service_queue; -again: while (true) { if (tg->last_low_overflow_time[rw] == 0) tg->last_low_overflow_time[rw] = jiffies; - throtl_downgrade_check(tg); - throtl_upgrade_check(tg); /* throtl is FIFO - if bios are already queued, should queue */ if (sq->nr_queued[rw]) break; @@ -2207,10 +1622,6 @@ bool __blk_throtl_bio(struct bio *bio) /* if above limits, break to queue */ if (!tg_may_dispatch(tg, bio, NULL)) { tg->last_low_overflow_time[rw] = jiffies; - if (throtl_can_upgrade(td, tg)) { - throtl_upgrade_state(td); - goto again; - } break; } @@ -2270,215 +1681,25 @@ bool __blk_throtl_bio(struct bio *bio) } out_unlock: -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (throttled || !td->track_bio_latency) - bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; -#endif spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static void throtl_track_latency(struct throtl_data *td, sector_t size, - enum req_op op, unsigned long time) -{ - const bool rw = op_is_write(op); - struct latency_bucket *latency; - int index; - - if (!td || td->limit_index != LIMIT_LOW || - !(op == REQ_OP_READ || op == REQ_OP_WRITE) || - !blk_queue_nonrot(td->queue)) - return; - - index = request_bucket_index(size); - - latency = get_cpu_ptr(td->latency_buckets[rw]); - latency[index].total_latency += time; - latency[index].samples++; - put_cpu_ptr(td->latency_buckets[rw]); -} - -void blk_throtl_stat_add(struct request *rq, u64 time_ns) -{ - struct request_queue *q = rq->q; - struct throtl_data *td = q->td; - - throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), - time_ns >> 10); -} - -void blk_throtl_bio_endio(struct bio *bio) -{ - struct blkcg_gq *blkg; - struct throtl_grp *tg; - u64 finish_time_ns; - unsigned long finish_time; - unsigned long start_time; - unsigned long lat; - int rw = bio_data_dir(bio); - - blkg = bio->bi_blkg; - if (!blkg) - return; - tg = blkg_to_tg(blkg); - if 
(!tg->td->limit_valid[LIMIT_LOW]) - return; - - finish_time_ns = blk_time_get_ns(); - tg->last_finish_time = finish_time_ns >> 10; - - start_time = bio_issue_time(&bio->bi_issue) >> 10; - finish_time = __bio_issue_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) - return; - - lat = finish_time - start_time; - /* this is only for bio based driver */ - if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY)) - throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue), - bio_op(bio), lat); - - if (tg->latency_target && lat >= tg->td->filtered_latency) { - int bucket; - unsigned int threshold; - - bucket = request_bucket_index(bio_issue_size(&bio->bi_issue)); - threshold = tg->td->avg_buckets[rw][bucket].latency + - tg->latency_target; - if (lat > threshold) - tg->bad_bio_cnt++; - /* - * Not race free, could get wrong count, which means cgroups - * will be throttled - */ - tg->bio_cnt++; - } - - if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { - tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; - tg->bio_cnt /= 2; - tg->bad_bio_cnt /= 2; - } -} -#endif - -int blk_throtl_init(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - struct throtl_data *td; - int ret; - - td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); - if (!td) - return -ENOMEM; - td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * - LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets[READ]) { - kfree(td); - return -ENOMEM; - } - td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * - LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets[WRITE]) { - free_percpu(td->latency_buckets[READ]); - kfree(td); - return -ENOMEM; - } - - INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue); - - q->td = td; - td->queue = q; - - td->limit_valid[LIMIT_MAX] = true; - td->limit_index = LIMIT_MAX; - td->low_upgrade_time = jiffies; - td->low_downgrade_time = jiffies; - - /* activate policy */ - ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); - if (ret) { - free_percpu(td->latency_buckets[READ]); - free_percpu(td->latency_buckets[WRITE]); - kfree(td); - } - return ret; -} - void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; - BUG_ON(!q->td); + if (!blk_throtl_activated(q)) + return; + del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); - free_percpu(q->td->latency_buckets[READ]); - free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } -void blk_throtl_register(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - struct throtl_data *td; - int i; - - td = q->td; - BUG_ON(!td); - - if (blk_queue_nonrot(q)) { - td->throtl_slice = DFL_THROTL_SLICE_SSD; - td->filtered_latency = LATENCY_FILTERED_SSD; - } else { - td->throtl_slice = DFL_THROTL_SLICE_HD; - td->filtered_latency = LATENCY_FILTERED_HD; - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; - td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; - } - } -#ifndef CONFIG_BLK_DEV_THROTTLING_LOW - /* if no low limit, use previous default */ - td->throtl_slice = DFL_THROTL_SLICE_HD; - -#else - td->track_bio_latency = !queue_is_mq(q); - if (!td->track_bio_latency) - blk_stat_enable_accounting(q); -#endif -} - -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -ssize_t blk_throtl_sample_time_show(struct 
request_queue *q, char *page) -{ - if (!q->td) - return -EINVAL; - return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); -} - -ssize_t blk_throtl_sample_time_store(struct request_queue *q, - const char *page, size_t count) -{ - unsigned long v; - unsigned long t; - - if (!q->td) - return -EINVAL; - if (kstrtoul(page, 10, &v)) - return -EINVAL; - t = msecs_to_jiffies(v); - if (t == 0 || t > MAX_THROTL_SLICE) - return -EINVAL; - q->td->throtl_slice = t; - return count; -} -#endif - static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index bffbc9cfc8ab..393c3d134b96 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -58,12 +58,6 @@ enum tg_state_flags { THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ }; -enum { - LIMIT_LOW, - LIMIT_MAX, - LIMIT_CNT, -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -102,14 +96,14 @@ struct throtl_grp { bool has_rules_iops[2]; /* internally used bytes per second rate limits */ - uint64_t bps[2][LIMIT_CNT]; + uint64_t bps[2]; /* user configured bps limits */ - uint64_t bps_conf[2][LIMIT_CNT]; + uint64_t bps_conf[2]; /* internally used IOPS limits */ - unsigned int iops[2][LIMIT_CNT]; + unsigned int iops[2]; /* user configured IOPS limits */ - unsigned int iops_conf[2][LIMIT_CNT]; + unsigned int iops_conf[2]; /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2]; @@ -132,22 +126,10 @@ struct throtl_grp { unsigned long last_check_time; - unsigned long latency_target; /* us */ - unsigned long latency_target_conf; /* us */ /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - unsigned long last_finish_time; /* ns / 1024 */ - unsigned long checked_last_finish_time; /* ns / 1024 */ - unsigned long avg_idletime; /* ns / 1024 */ - unsigned long idletime_threshold; /* us */ - unsigned long idletime_threshold_conf; /* us */ - - unsigned int bio_cnt; /* total bios */ - unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ - unsigned long bio_cnt_reset_time; - struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -168,23 +150,33 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING -static inline int blk_throtl_init(struct gendisk *disk) { return 0; } static inline void blk_throtl_exit(struct gendisk *disk) { } -static inline void blk_throtl_register(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING */ -int blk_throtl_init(struct gendisk *disk); void blk_throtl_exit(struct gendisk *disk); -void blk_throtl_register(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct gendisk *disk); +static inline bool blk_throtl_activated(struct request_queue *q) +{ + return q->td != NULL; +} + static inline bool blk_should_throtl(struct bio *bio) { - struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + struct throtl_grp *tg; int rw = bio_data_dir(bio); + /* + * This is called under bio_queue_enter(), and it's synchronized with + * the activation of blk-throtl, which is protected by + * blk_mq_freeze_queue(). 
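/*
 * Illustrative sketch, not part of this patch: throttle data (q->td) is no
 * longer guaranteed to exist for every queue, so any code dereferencing it
 * is expected to be guarded by blk_throtl_activated(), as blk_throtl_exit()
 * above and blk_should_throtl() here now do. A hypothetical helper following
 * the same pattern:
 */
static inline void sketch_touch_throtl_data(struct request_queue *q)
{
	if (!blk_throtl_activated(q))
		return;		/* q->td was never allocated for this queue */
	/* ... q->td may be dereferenced safely here ... */
}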
+ */ + if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) + return false; + + tg = blkg_to_tg(bio->bi_blkg); if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index da0f4b2a8fa0..48e5e3bbb89c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -7,6 +7,7 @@ * * Copyright (c) 2016, Damien Le Moal * Copyright (c) 2016, Western Digital + * Copyright (c) 2024, Western Digital Corporation or its affiliates. */ #include @@ -16,8 +17,13 @@ #include #include #include +#include +#include +#include #include "blk.h" +#include "blk-mq-sched.h" +#include "blk-mq-debugfs.h" #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name static const char *const zone_cond_name[] = { @@ -32,6 +38,64 @@ static const char *const zone_cond_name[] = { }; #undef ZONE_COND_NAME +/* + * Per-zone write plug. + * @node: hlist_node structure for managing the plug using a hash table. + * @link: To list the plug in the zone write plug error list of the disk. + * @ref: Zone write plug reference counter. A zone write plug reference is + * always at least 1 when the plug is hashed in the disk plug hash table. + * The reference is incremented whenever a new BIO needing plugging is + * submitted and when a function needs to manipulate a plug. The + * reference count is decremented whenever a plugged BIO completes and + * when a function that referenced the plug returns. The initial + * reference is dropped whenever the zone of the zone write plug is reset, + * finished and when the zone becomes full (last write BIO to the zone + * completes). + * @lock: Spinlock to atomically manipulate the plug. + * @flags: Flags indicating the plug state. + * @zone_no: The number of the zone the plug is managing. + * @wp_offset: The zone write pointer location relative to the start of the zone + * as a number of 512B sectors. + * @bio_list: The list of BIOs that are currently plugged. + * @bio_work: Work struct to handle issuing of plugged BIOs + * @rcu_head: RCU head to free zone write plugs with an RCU grace period. + * @disk: The gendisk the plug belongs to. + */ +struct blk_zone_wplug { + struct hlist_node node; + struct list_head link; + atomic_t ref; + spinlock_t lock; + unsigned int flags; + unsigned int zone_no; + unsigned int wp_offset; + struct bio_list bio_list; + struct work_struct bio_work; + struct rcu_head rcu_head; + struct gendisk *disk; +}; + +/* + * Zone write plug flags bits: + * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, + * that is, that write BIOs are being throttled due to a write BIO already + * being executed or the zone write plug bio list is not empty. + * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be + * recovered with a report zone to update the zone write pointer offset. + * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed + * from the disk hash table and that the initial reference to the zone + * write plug set when the plug was first added to the hash table has been + * dropped. This flag is set when a zone is reset, finished or become full, + * to prevent new references to the zone write plug to be taken for + * newly incoming BIOs. A zone write plug flagged with this flag will be + * freed once all remaining references from BIOs or functions are dropped. 
+ */ +#define BLK_ZONE_WPLUG_PLUGGED (1U << 0) +#define BLK_ZONE_WPLUG_ERROR (1U << 1) +#define BLK_ZONE_WPLUG_UNHASHED (1U << 2) + +#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR) + /** * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. * @zone_cond: BLK_ZONE_COND_XXX. @@ -51,52 +115,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -/* - * Return true if a request is a write requests that needs zone write locking. - */ -bool blk_req_needs_zone_write_lock(struct request *rq) -{ - if (!rq->q->disk->seq_zones_wlock) - return false; - - return blk_rq_is_seq_zoned_write(rq); -} -EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); - -bool blk_req_zone_write_trylock(struct request *rq) -{ - unsigned int zno = blk_rq_zone_no(rq); - - if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock)) - return false; - - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; - - return true; -} -EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); - -void __blk_req_zone_write_lock(struct request *rq) -{ - if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), - rq->q->disk->seq_zones_wlock))) - return; - - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; -} -EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); - -void __blk_req_zone_write_unlock(struct request *rq) -{ - rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; - if (rq->q->disk->seq_zones_wlock) - WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), - rq->q->disk->seq_zones_wlock)); -} -EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); - /** * bdev_nr_zones - Get number of zones * @bdev: Target device @@ -425,189 +443,1483 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, return ret; } -void disk_free_zone_bitmaps(struct gendisk *disk) +static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) { - kfree(disk->conv_zones_bitmap); - disk->conv_zones_bitmap = NULL; - kfree(disk->seq_zones_wlock); - disk->seq_zones_wlock = NULL; + if (!disk->conv_zones_bitmap) + return false; + return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); } -struct blk_revalidate_zone_args { - struct gendisk *disk; - unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; - unsigned int nr_zones; - sector_t sector; -}; +static bool disk_insert_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + struct blk_zone_wplug *zwplg; + unsigned long flags; + unsigned int idx = + hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); -/* - * Helper function to check the validity of zones of a zoned block device. - */ -static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) + /* + * Add the new zone write plug to the hash table, but carefully as we + * are racing with other submission context, so we may already have a + * zone write plug for the same zone. 
+ */ + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { + if (zwplg->zone_no == zwplug->zone_no) { + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + return false; + } + } + hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + return true; +} + +static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) { - struct blk_revalidate_zone_args *args = data; - struct gendisk *disk = args->disk; - struct request_queue *q = disk->queue; - sector_t capacity = get_capacity(disk); - sector_t zone_sectors = q->limits.chunk_sectors; + unsigned int zno = disk_zone_no(disk, sector); + unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); + struct blk_zone_wplug *zwplug; - /* Check for bad zones and holes in the zone report */ - if (zone->start != args->sector) { - pr_warn("%s: Zone gap at sectors %llu..%llu\n", - disk->disk_name, args->sector, zone->start); - return -ENODEV; + rcu_read_lock(); + + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { + if (zwplug->zone_no == zno && + atomic_inc_not_zero(&zwplug->ref)) { + rcu_read_unlock(); + return zwplug; + } } - if (zone->start >= capacity || !zone->len) { - pr_warn("%s: Invalid zone start %llu, length %llu\n", - disk->disk_name, zone->start, zone->len); - return -ENODEV; + rcu_read_unlock(); + + return NULL; +} + +static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) +{ + struct blk_zone_wplug *zwplug = + container_of(rcu_head, struct blk_zone_wplug, rcu_head); + + mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); +} + +static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) +{ + if (atomic_dec_and_test(&zwplug->ref)) { + WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); + WARN_ON_ONCE(!list_empty(&zwplug->link)); + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); + + call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); } +} + +static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* If the zone write plug was already removed, we are done. */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) + return false; + + /* If the zone write plug is still busy, it cannot be removed. */ + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + return false; /* - * All zones must have the same size, with the exception on an eventual - * smaller last zone. + * Completions of BIOs with blk_zone_write_plug_bio_endio() may + * happen after handling a request completion with + * blk_zone_write_plug_finish_request() (e.g. with split BIOs + * that are chained). In such case, disk_zone_wplug_unplug_bio() + * should not attempt to remove the zone write plug until all BIO + * completions are seen. Check by looking at the zone write plug + * reference count, which is 2 when the plug is unused (one reference + * taken when the plug was allocated and another reference taken by the + * caller context). */ - if (zone->start + zone->len < capacity) { - if (zone->len != zone_sectors) { - pr_warn("%s: Invalid zoned device with non constant zone size\n", - disk->disk_name); - return -ENODEV; + if (atomic_read(&zwplug->ref) > 2) + return false; + + /* We can remove zone write plugs for zones that are empty or full. 
*/ + return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; +} + +static void disk_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + /* If the zone write plug was already removed, we have nothing to do. */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) + return; + + /* + * Mark the zone write plug as unhashed and drop the extra reference we + * took when the plug was inserted in the hash table. + */ + zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + hlist_del_init_rcu(&zwplug->node); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + disk_put_zone_wplug(zwplug); +} + +static void blk_zone_wplug_bio_work(struct work_struct *work); + +/* + * Get a reference on the write plug for the zone containing @sector. + * If the plug does not exist, it is allocated and hashed. + * Return a pointer to the zone write plug with the plug spinlock held. + */ +static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, + sector_t sector, gfp_t gfp_mask, + unsigned long *flags) +{ + unsigned int zno = disk_zone_no(disk, sector); + struct blk_zone_wplug *zwplug; + +again: + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + /* + * Check that a BIO completion or a zone reset or finish + * operation has not already removed the zone write plug from + * the hash table and dropped its reference count. In such case, + * we need to get a new plug so start over from the beginning. + */ + spin_lock_irqsave(&zwplug->lock, *flags); + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { + spin_unlock_irqrestore(&zwplug->lock, *flags); + disk_put_zone_wplug(zwplug); + goto again; } - } else if (zone->len > zone_sectors) { - pr_warn("%s: Invalid zoned device with larger last zone size\n", - disk->disk_name); - return -ENODEV; + return zwplug; } - /* Check zone type */ - switch (zone->type) { - case BLK_ZONE_TYPE_CONVENTIONAL: - if (!args->conv_zones_bitmap) { - args->conv_zones_bitmap = - blk_alloc_zone_bitmap(q->node, args->nr_zones); - if (!args->conv_zones_bitmap) - return -ENOMEM; - } - set_bit(idx, args->conv_zones_bitmap); - break; - case BLK_ZONE_TYPE_SEQWRITE_REQ: - if (!args->seq_zones_wlock) { - args->seq_zones_wlock = - blk_alloc_zone_bitmap(q->node, args->nr_zones); - if (!args->seq_zones_wlock) - return -ENOMEM; - } - break; - case BLK_ZONE_TYPE_SEQWRITE_PREF: - default: - pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", - disk->disk_name, (int)zone->type, zone->start); - return -ENODEV; + /* + * Allocate and initialize a zone write plug with an extra reference + * so that it is not freed when the zone write plug becomes idle without + * the zone being full. + */ + zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); + if (!zwplug) + return NULL; + + INIT_HLIST_NODE(&zwplug->node); + INIT_LIST_HEAD(&zwplug->link); + atomic_set(&zwplug->ref, 2); + spin_lock_init(&zwplug->lock); + zwplug->flags = 0; + zwplug->zone_no = zno; + zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1); + bio_list_init(&zwplug->bio_list); + INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + zwplug->disk = disk; + + spin_lock_irqsave(&zwplug->lock, *flags); + + /* + * Insert the new zone write plug in the hash table. This can fail only + * if another context already inserted a plug. Retry from the beginning + * in such case. 
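/*
 * Illustrative sketch, not part of this patch: because the zone size
 * (chunk_sectors) must be a power of two (enforced in
 * blk_revalidate_disk_zones() below), the zone number and the initial
 * wp_offset computed in disk_get_and_lock_zone_wplug() above reduce to a
 * shift and a mask of the BIO sector. For example, with 524288-sector
 * (256 MiB) zones, sector 1573888 maps to zone 3 at offset 1024.
 * The helper names below are hypothetical.
 */
static inline unsigned int sketch_zone_no(sector_t sector, sector_t zone_sectors)
{
	return sector >> ilog2(zone_sectors);
}

static inline unsigned int sketch_zone_offset(sector_t sector, sector_t zone_sectors)
{
	return sector & (zone_sectors - 1);
}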
+ */ + if (!disk_insert_zone_wplug(disk, zwplug)) { + spin_unlock_irqrestore(&zwplug->lock, *flags); + mempool_free(zwplug, disk->zone_wplugs_pool); + goto again; } - args->sector += zone->len; - return 0; + return zwplug; } -/** - * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps - * @disk: Target disk - * @update_driver_data: Callback to update driver data on the frozen disk - * - * Helper function for low-level device drivers to check and (re) allocate and - * initialize a disk request queue zone bitmaps. This functions should normally - * be called within the disk ->revalidate method for blk-mq based drivers. - * Before calling this function, the device driver must already have set the - * device zone size (chunk_sector limit) and the max zone append limit. - * For BIO based drivers, this function cannot be used. BIO based device drivers - * only need to set disk->nr_zones so that the sysfs exposed value is correct. - * If the @update_driver_data callback function is not NULL, the callback is - * executed with the device request queue frozen after all zones have been - * checked. +static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, + struct bio *bio) +{ + struct request_queue *q = zwplug->disk->queue; + + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + bio_io_error(bio); + disk_put_zone_wplug(zwplug); + blk_queue_exit(q); +} + +/* + * Abort (fail) all plugged BIOs of a zone write plug. */ -int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)) +static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { - struct request_queue *q = disk->queue; - sector_t zone_sectors = q->limits.chunk_sectors; - sector_t capacity = get_capacity(disk); - struct blk_revalidate_zone_args args = { }; - unsigned int noio_flag; - int ret; + struct bio *bio; - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) - return -EIO; - if (WARN_ON_ONCE(!queue_is_mq(q))) - return -EIO; + while ((bio = bio_list_pop(&zwplug->bio_list))) + blk_zone_wplug_bio_io_error(zwplug, bio); +} - if (!capacity) - return -ENODEV; +/* + * Abort (fail) all plugged BIOs of a zone write plug that are not aligned + * with the assumed write pointer location of the zone when the BIO will + * be unplugged. + */ +static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned int zone_capacity = disk->zone_capacity; + unsigned int wp_offset = zwplug->wp_offset; + struct bio_list bl = BIO_EMPTY_LIST; + struct bio *bio; + + while ((bio = bio_list_pop(&zwplug->bio_list))) { + if (wp_offset >= zone_capacity || + (bio_op(bio) != REQ_OP_ZONE_APPEND && + bio_offset_from_zone_start(bio) != wp_offset)) { + blk_zone_wplug_bio_io_error(zwplug, bio); + continue; + } + + wp_offset += bio_sectors(bio); + bio_list_add(&bl, bio); + } + + bio_list_merge(&zwplug->bio_list, &bl); +} + +static inline void disk_zone_wplug_set_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) + return; /* - * Checks that the device driver indicated a valid zone size and that - * the max zone append limit is set. + * At this point, we already have a reference on the zone write plug. + * However, since we are going to add the plug to the disk zone write + * plugs work list, increase its reference count. 
This reference will + * be dropped in disk_zone_wplugs_work() once the error state is + * handled, or in disk_zone_wplug_clear_error() if the zone is reset or + * finished. */ - if (!zone_sectors || !is_power_of_2(zone_sectors)) { - pr_warn("%s: Invalid non power of two zone size (%llu)\n", - disk->disk_name, zone_sectors); - return -ENODEV; + zwplug->flags |= BLK_ZONE_WPLUG_ERROR; + atomic_inc(&zwplug->ref); + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} + +static inline void disk_zone_wplug_clear_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) + return; + + /* + * We are racing with the error handling work which drops the reference + * on the zone write plug after handling the error state. So remove the + * plug from the error list and drop its reference count only if the + * error handling has not yet started, that is, if the zone write plug + * is still listed. + */ + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + if (!list_empty(&zwplug->link)) { + list_del_init(&zwplug->link); + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; + disk_put_zone_wplug(zwplug); } + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} - if (!q->limits.max_zone_append_sectors) { - pr_warn("%s: Invalid 0 maximum zone append limit\n", - disk->disk_name); - return -ENODEV; +/* + * Set a zone write plug write pointer offset to either 0 (zone reset case) + * or to the zone size (zone finish case). This aborts all plugged BIOs, which + * is fine to do as doing a zone reset or zone finish while writes are in-flight + * is a mistake from the user which will most likely cause all plugged BIOs to + * fail anyway. + */ +static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, + struct blk_zone_wplug *zwplug, + unsigned int wp_offset) +{ + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * Make sure that a BIO completion or another zone reset or finish + * operation has not already removed the plug from the hash table. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { + spin_unlock_irqrestore(&zwplug->lock, flags); + return; } + /* Update the zone write pointer and abort all plugged BIOs. */ + zwplug->wp_offset = wp_offset; + disk_zone_wplug_abort(zwplug); + /* - * Ensure that all memory allocations in this context are done as if - * GFP_NOIO was specified. + * Updating the write pointer offset puts back the zone + * in a good state. So clear the error flag and decrement the + * error count if we were in error state. */ - args.disk = disk; - args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); - noio_flag = memalloc_noio_save(); - ret = disk->fops->report_zones(disk, 0, UINT_MAX, - blk_revalidate_zone_cb, &args); - if (!ret) { - pr_warn("%s: No zones reported\n", disk->disk_name); - ret = -ENODEV; + disk_zone_wplug_clear_error(disk, zwplug); + + /* + * The zone write plug now has no BIO plugged: remove it from the + * hash table so that it cannot be seen. The plug will be freed + * when the last reference is dropped. 
+ */ + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, + unsigned int wp_offset) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + + /* Conventional zones cannot be reset nor finished. */ + if (disk_zone_is_conv(disk, sector)) { + bio_io_error(bio); + return true; } - memalloc_noio_restore(noio_flag); /* - * If zones where reported, make sure that the entire disk capacity - * has been checked. + * If we have a zone write plug, set its write pointer offset to 0 + * (reset case) or to the zone size (finish case). This will abort all + * BIOs plugged for the target zone. It is fine as resetting or + * finishing zones while writes are still in-flight will result in the + * writes failing anyway. */ - if (ret > 0 && args.sector != capacity) { - pr_warn("%s: Missing zones from sector %llu\n", - disk->disk_name, args.sector); - ret = -ENODEV; + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + disk_put_zone_wplug(zwplug); } + return false; +} + +static bool blk_zone_wplug_handle_reset_all(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + sector_t sector; + /* - * Install the new bitmaps and update nr_zones only once the queue is - * stopped and all I/Os are completed (i.e. a scheduler is not - * referencing the bitmaps). + * Set the write pointer offset of all zone write plugs to 0. This will + * abort all plugged BIOs. It is fine as resetting zones while writes + * are still in-flight will result in the writes failing anyway. */ - blk_mq_freeze_queue(q); - if (ret > 0) { - disk->nr_zones = args.nr_zones; - swap(disk->seq_zones_wlock, args.seq_zones_wlock); - swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); - if (update_driver_data) - update_driver_data(disk); - ret = 0; - } else { - pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - disk_free_zone_bitmaps(disk); + for (sector = 0; sector < get_capacity(disk); + sector += disk->queue->limits.chunk_sectors) { + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); + disk_put_zone_wplug(zwplug); + } } - blk_mq_unfreeze_queue(q); - kfree(args.seq_zones_wlock); - kfree(args.conv_zones_bitmap); - return ret; + return false; } -EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); + +static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug, + struct bio *bio, unsigned int nr_segs) +{ + /* + * Grab an extra reference on the BIO request queue usage counter. + * This reference will be reused to submit a request for the BIO for + * blk-mq devices and dropped when the BIO is failed and after + * it is issued in the case of BIO-based devices. + */ + percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); + + /* + * The BIO is being plugged and thus will have to wait for the on-going + * write and for all other writes already plugged. So polling makes + * no sense. + */ + bio_clear_polled(bio); + + /* + * Reuse the poll cookie field to store the number of segments when + * split to the hardware limits. + */ + bio->__bi_nr_segments = nr_segs; + + /* + * We always receive BIOs after they are split and ready to be issued. 
+ * The block layer passes the parts of a split BIO in order, and the + * user must also issue write sequentially. So simply add the new BIO + * at the tail of the list to preserve the sequential write order. + */ + bio_list_add(&zwplug->bio_list, bio); +} + +/* + * Called from bio_attempt_back_merge() when a BIO was merged with a request. + */ +void blk_zone_write_plug_bio_merged(struct bio *bio) +{ + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * If the BIO was already plugged, then we were called through + * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). + * For this case, we already hold a reference on the zone write plug for + * the BIO and blk_zone_write_plug_init_request() will handle the + * zone write pointer offset update. + */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return; + + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * Get a reference on the zone write plug of the target zone and advance + * the zone write pointer offset. Given that this is a merge, we already + * have at least one request and one BIO referencing the zone write + * plug. So this should not fail. + */ + zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, + bio->bi_iter.bi_sector); + if (WARN_ON_ONCE(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + zwplug->wp_offset += bio_sectors(bio); + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +/* + * Attempt to merge plugged BIOs with a newly prepared request for a BIO that + * already went through zone write plugging (either a new BIO or one that was + * unplugged). + */ +void blk_zone_write_plug_init_request(struct request *req) +{ + sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); + struct request_queue *q = req->q; + struct gendisk *disk = q->disk; + unsigned int zone_capacity = disk->zone_capacity; + struct blk_zone_wplug *zwplug = + disk_get_zone_wplug(disk, blk_rq_pos(req)); + unsigned long flags; + struct bio *bio; + + if (WARN_ON_ONCE(!zwplug)) + return; + + /* + * Indicate that completion of this request needs to be handled with + * blk_zone_write_plug_finish_request(), which will drop the reference + * on the zone write plug we took above on entry to this function. + */ + req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; + + if (blk_queue_nomerges(q)) + return; + + /* + * Walk through the list of plugged BIOs to check if they can be merged + * into the back of the request. + */ + spin_lock_irqsave(&zwplug->lock, flags); + while (zwplug->wp_offset < zone_capacity) { + bio = bio_list_peek(&zwplug->bio_list); + if (!bio) + break; + + if (bio->bi_iter.bi_sector != req_back_sector || + !blk_rq_merge_ok(req, bio)) + break; + + WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES && + !bio->__bi_nr_segments); + + bio_list_pop(&zwplug->bio_list); + if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != + BIO_MERGE_OK) { + bio_list_add_head(&zwplug->bio_list, bio); + break; + } + + /* + * Drop the extra reference on the queue usage we got when + * plugging the BIO and advance the write pointer offset. + */ + blk_queue_exit(q); + zwplug->wp_offset += bio_sectors(bio); + + req_back_sector += bio_sectors(bio); + } + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +/* + * Check and prepare a BIO for submission by incrementing the write pointer + * offset of its zone write plug and changing zone append operations into + * regular write when zone append emulation is needed. 
+ */ +static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, + struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + + /* + * Check that the user is not attempting to write to a full zone. + * We know such BIO will fail, and that would potentially overflow our + * write pointer offset beyond the end of the zone. + */ + if (zwplug->wp_offset >= disk->zone_capacity) + goto err; + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + /* + * Use a regular write starting at the current write pointer. + * Similarly to native zone append operations, do not allow + * merging. + */ + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; + bio->bi_iter.bi_sector += zwplug->wp_offset; + + /* + * Remember that this BIO is in fact a zone append operation + * so that we can restore its operation code on completion. + */ + bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); + } else { + /* + * Check for non-sequential writes early because we avoid a + * whole lot of error handling trouble if we don't send it off + * to the driver. + */ + if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) + goto err; + } + + /* Advance the zone write pointer offset. */ + zwplug->wp_offset += bio_sectors(bio); + + return true; + +err: + /* We detected an invalid write BIO: schedule error recovery. */ + disk_zone_wplug_set_error(disk, zwplug); + kblockd_schedule_work(&disk->zone_wplugs_work); + return false; +} + +static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + gfp_t gfp_mask = GFP_NOIO; + unsigned long flags; + + /* + * BIOs must be fully contained within a zone so that we use the correct + * zone write plug for the entire BIO. For blk-mq devices, the block + * layer should already have done any splitting required to ensure this + * and this BIO should thus not be straddling zone boundaries. For + * BIO-based devices, it is the responsibility of the driver to split + * the bio before submitting it. + */ + if (WARN_ON_ONCE(bio_straddles_zones(bio))) { + bio_io_error(bio); + return true; + } + + /* Conventional zones do not need write plugging. */ + if (disk_zone_is_conv(disk, sector)) { + /* Zone append to conventional zones is not allowed. */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + bio_io_error(bio); + return true; + } + return false; + } + + if (bio->bi_opf & REQ_NOWAIT) + gfp_mask = GFP_NOWAIT; + + zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); + if (!zwplug) { + bio_io_error(bio); + return true; + } + + /* Indicate that this BIO is being handled using zone write plugging. */ + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * If the zone is already plugged or has a pending error, add the BIO + * to the plug BIO list. Otherwise, plug and let the BIO execute. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + goto plug; + + /* + * If an error is detected when preparing the BIO, add it to the BIO + * list so that error recovery can deal with it. 
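/*
 * Worked example, not part of this patch, of the emulation performed by
 * blk_zone_wplug_prepare_bio() above: a zone append BIO is submitted with
 * its sector set to the start of the target zone (Z); while the plug tracks
 * write pointer offset W, the BIO is re-issued as
 *
 *	bio->bi_opf:            REQ_OP_ZONE_APPEND  ->  REQ_OP_WRITE | REQ_NOMERGE
 *	bio->bi_iter.bi_sector: Z                   ->  Z + W
 *
 * with BIO_EMULATES_ZONE_APPEND set so that completion handling (see
 * blk_zone_write_plug_bio_endio() below) can restore the original operation
 * code once the write finishes.
 */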
+ */ + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) + goto plug; + + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + + spin_unlock_irqrestore(&zwplug->lock, flags); + + return false; + +plug: + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + blk_zone_wplug_add_bio(zwplug, bio, nr_segs); + + spin_unlock_irqrestore(&zwplug->lock, flags); + + return true; +} + +/** + * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging + * @bio: The BIO being submitted + * @nr_segs: The number of physical segments of @bio + * + * Handle write, write zeroes and zone append operations requiring emulation + * using zone write plugging. + * + * Return true whenever @bio execution needs to be delayed through the zone + * write plug. Otherwise, return false to let the submission path process + * @bio normally. + */ +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + struct block_device *bdev = bio->bi_bdev; + + if (!bdev->bd_disk->zone_wplugs_hash) + return false; + + /* + * If the BIO already has the plugging flag set, then it was already + * handled through this path and this is a submission from the zone + * plug bio submit work. + */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return false; + + /* + * We do not need to do anything special for empty flush BIOs, e.g + * BIOs such as issued by blkdev_issue_flush(). The is because it is + * the responsibility of the user to first wait for the completion of + * write operations for flush to have any effect on the persistence of + * the written data. + */ + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + + /* + * Regular writes and write zeroes need to be handled through the target + * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH + * which may need to go through the flush machinery depending on the + * target device capabilities. Plugging such writes is fine as the flush + * machinery operates at the request level, below the plug, and + * completion of the flush sequence will go through the regular BIO + * completion, which will handle zone write plugging. + * Zone append operations for devices that requested emulation must + * also be plugged so that these BIOs can be changed into regular + * write BIOs. + * Zone reset, reset all and finish commands need special treatment + * to correctly track the write pointer offset of zones. These commands + * are not plugged as we do not need serialization with write + * operations. It is the responsibility of the user to not issue reset + * and finish commands when write operations are in flight. + */ + switch (bio_op(bio)) { + case REQ_OP_ZONE_APPEND: + if (!bdev_emulates_zone_append(bdev)) + return false; + fallthrough; + case REQ_OP_WRITE: + case REQ_OP_WRITE_ZEROES: + return blk_zone_wplug_handle_write(bio, nr_segs); + case REQ_OP_ZONE_RESET: + return blk_zone_wplug_handle_reset_or_finish(bio, 0); + case REQ_OP_ZONE_FINISH: + return blk_zone_wplug_handle_reset_or_finish(bio, + bdev_zone_sectors(bdev)); + case REQ_OP_ZONE_RESET_ALL: + return blk_zone_wplug_handle_reset_all(bio); + default: + return false; + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_zone_plug_bio); + +static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* + * Take a reference on the zone write plug and schedule the submission + * of the next plugged BIO. blk_zone_wplug_bio_work() will release the + * reference we take here. 
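/*
 * Hypothetical caller sketch, not part of this patch: how a BIO-based zoned
 * driver's submit path would be expected to use the blk_zone_plug_bio()
 * entry point above. The segment count is only consumed by request merging
 * on blk-mq devices, so a BIO-based caller is assumed here to pass 0.
 */
static void sketch_driver_submit_bio(struct bio *bio)
{
	if (blk_zone_plug_bio(bio, 0))
		return;	/* plugged: resubmitted later by the zone wplug work */

	/* ... hand @bio to the underlying device ... */
}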
+ */ + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + atomic_inc(&zwplug->ref); + queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); +} + +static void disk_zone_wplug_unplug_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * If we had an error, schedule error recovery. The recovery work + * will restart submission of plugged BIOs. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { + spin_unlock_irqrestore(&zwplug->lock, flags); + kblockd_schedule_work(&disk->zone_wplugs_work); + return; + } + + /* Schedule submission of the next plugged BIO if we have one. */ + if (!bio_list_empty(&zwplug->bio_list)) { + disk_zone_wplug_schedule_bio_work(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + return; + } + + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If the zone is full (it was fully written or finished, or empty + * (it was reset), remove its zone write plug from the hash table. + */ + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +void blk_zone_write_plug_bio_endio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug = + disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + unsigned long flags; + + if (WARN_ON_ONCE(!zwplug)) + return; + + /* Make sure we do not see this BIO again by clearing the plug flag. */ + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * If this is a regular write emulating a zone append operation, + * restore the original operation code. + */ + if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + } + + /* + * If the BIO failed, mark the plug as having an error to trigger + * recovery. + */ + if (bio->bi_status != BLK_STS_OK) { + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_error(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + } + + /* Drop the reference we took when the BIO was issued. */ + disk_put_zone_wplug(zwplug); + + /* + * For BIO-based devices, blk_zone_write_plug_finish_request() + * is not called. So we need to schedule execution of the next + * plugged BIO here. + */ + if (bio->bi_bdev->bd_has_submit_bio) + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* Drop the reference we took when entering this function. */ + disk_put_zone_wplug(zwplug); +} + +void blk_zone_write_plug_finish_request(struct request *req) +{ + struct gendisk *disk = req->q->disk; + struct blk_zone_wplug *zwplug; + + zwplug = disk_get_zone_wplug(disk, req->__sector); + if (WARN_ON_ONCE(!zwplug)) + return; + + req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; + + /* + * Drop the reference we took when the request was initialized in + * blk_zone_write_plug_init_request(). + */ + disk_put_zone_wplug(zwplug); + + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* Drop the reference we took when entering this function. */ + disk_put_zone_wplug(zwplug); +} + +static void blk_zone_wplug_bio_work(struct work_struct *work) +{ + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); + struct block_device *bdev; + unsigned long flags; + struct bio *bio; + + /* + * Submit the next plugged BIO. If we do not have any, clear + * the plugged flag. 
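/*
 * Editorial summary, not part of this patch: which completion path kicks
 * the next plugged BIO.
 *  - blk-mq devices: blk_zone_write_plug_finish_request() above runs at
 *    request completion, drops the request reference and calls
 *    disk_zone_wplug_unplug_bio().
 *  - BIO-based devices: no request exists, so
 *    blk_zone_write_plug_bio_endio() calls disk_zone_wplug_unplug_bio()
 *    itself (the bd_has_submit_bio check above).
 */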
+ */ + spin_lock_irqsave(&zwplug->lock, flags); + + bio = bio_list_pop(&zwplug->bio_list); + if (!bio) { + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + spin_unlock_irqrestore(&zwplug->lock, flags); + goto put_zwplug; + } + + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { + /* Error recovery will decide what to do with the BIO. */ + bio_list_add_head(&zwplug->bio_list, bio); + spin_unlock_irqrestore(&zwplug->lock, flags); + goto put_zwplug; + } + + spin_unlock_irqrestore(&zwplug->lock, flags); + + bdev = bio->bi_bdev; + submit_bio_noacct_nocheck(bio); + + /* + * blk-mq devices will reuse the extra reference on the request queue + * usage counter we took when the BIO was plugged, but the submission + * path for BIO-based devices will not do that. So drop this extra + * reference here. + */ + if (bdev->bd_has_submit_bio) + blk_queue_exit(bdev->bd_disk->queue); + +put_zwplug: + /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ + disk_put_zone_wplug(zwplug); +} + +static unsigned int blk_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + return 0; + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. + */ + return UINT_MAX; + } +} + +static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, + unsigned int idx, void *data) +{ + struct blk_zone *zonep = data; + + *zonep = *zone; + return 0; +} + +static void disk_zone_wplug_handle_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + sector_t zone_start_sector = + bdev_zone_sectors(disk->part0) * zwplug->zone_no; + unsigned int noio_flag; + struct blk_zone zone; + unsigned long flags; + int ret; + + /* Get the current zone information from the device. */ + noio_flag = memalloc_noio_save(); + ret = disk->fops->report_zones(disk, zone_start_sector, 1, + blk_zone_wplug_report_zone_cb, &zone); + memalloc_noio_restore(noio_flag); + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * A zone reset or finish may have cleared the error already. In such + * case, do nothing as the report zones may have seen the "old" write + * pointer value before the reset/finish operation completed. + */ + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) + goto unlock; + + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; + + if (ret != 1) { + /* + * We failed to get the zone information, meaning that something + * is likely really wrong with the device. Abort all remaining + * plugged BIOs as otherwise we could endup waiting forever on + * plugged BIOs to complete if there is a queue freeze on-going. + */ + disk_zone_wplug_abort(zwplug); + goto unplug; + } + + /* Update the zone write pointer offset. */ + zwplug->wp_offset = blk_zone_wp_offset(&zone); + disk_zone_wplug_abort_unaligned(disk, zwplug); + + /* Restart BIO submission if we still have any BIO left. 
*/ + if (!bio_list_empty(&zwplug->bio_list)) { + disk_zone_wplug_schedule_bio_work(disk, zwplug); + goto unlock; + } + +unplug: + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + +unlock: + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +static void disk_zone_wplugs_work(struct work_struct *work) +{ + struct gendisk *disk = + container_of(work, struct gendisk, zone_wplugs_work); + struct blk_zone_wplug *zwplug; + unsigned long flags; + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + + while (!list_empty(&disk->zone_wplugs_err_list)) { + zwplug = list_first_entry(&disk->zone_wplugs_err_list, + struct blk_zone_wplug, link); + list_del_init(&zwplug->link); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + disk_zone_wplug_handle_error(disk, zwplug); + disk_put_zone_wplug(zwplug); + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + } + + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} + +static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) +{ + return 1U << disk->zone_wplugs_hash_bits; +} + +void disk_init_zone_resources(struct gendisk *disk) +{ + spin_lock_init(&disk->zone_wplugs_lock); + INIT_LIST_HEAD(&disk->zone_wplugs_err_list); + INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work); +} + +/* + * For the size of a disk zone write plug hash table, use the size of the + * zone write plug mempool, which is the maximum of the disk open zones and + * active zones limits. But do not exceed 4KB (512 hlist head entries), that is, + * 9 bits. For a disk that has no limits, mempool size defaults to 128. + */ +#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9 +#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128 + +static int disk_alloc_zone_resources(struct gendisk *disk, + unsigned int pool_size) +{ + unsigned int i; + + disk->zone_wplugs_hash_bits = + min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); + + disk->zone_wplugs_hash = + kcalloc(disk_zone_wplugs_hash_size(disk), + sizeof(struct hlist_head), GFP_KERNEL); + if (!disk->zone_wplugs_hash) + return -ENOMEM; + + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) + INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); + + disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, + sizeof(struct blk_zone_wplug)); + if (!disk->zone_wplugs_pool) + goto free_hash; + + disk->zone_wplugs_wq = + alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI, + pool_size, disk->disk_name); + if (!disk->zone_wplugs_wq) + goto destroy_pool; + + return 0; + +destroy_pool: + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; +free_hash: + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; + return -ENOMEM; +} + +static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + unsigned int i; + + if (!disk->zone_wplugs_hash) + return; + + /* Free all the zone write plugs we have. 
*/ + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + while (!hlist_empty(&disk->zone_wplugs_hash[i])) { + zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, + struct blk_zone_wplug, node); + atomic_inc(&zwplug->ref); + disk_remove_zone_wplug(disk, zwplug); + disk_put_zone_wplug(zwplug); + } + } + + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; +} + +void disk_free_zone_resources(struct gendisk *disk) +{ + cancel_work_sync(&disk->zone_wplugs_work); + + if (disk->zone_wplugs_wq) { + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; + } + + disk_destroy_zone_wplugs_hash_table(disk); + + /* + * Wait for the zone write plugs to be RCU-freed before + * destorying the mempool. + */ + rcu_barrier(); + + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; + + kfree(disk->conv_zones_bitmap); + disk->conv_zones_bitmap = NULL; + disk->zone_capacity = 0; + disk->nr_zones = 0; +} + +static inline bool disk_need_zone_resources(struct gendisk *disk) +{ + /* + * All mq zoned devices need zone resources so that the block layer + * can automatically handle write BIO plugging. BIO-based device drivers + * (e.g. DM devices) are normally responsible for handling zone write + * ordering and do not need zone resources, unless the driver requires + * zone append emulation. + */ + return queue_is_mq(disk->queue) || + queue_emulates_zone_append(disk->queue); +} + +static int disk_revalidate_zone_resources(struct gendisk *disk, + unsigned int nr_zones) +{ + struct queue_limits *lim = &disk->queue->limits; + unsigned int pool_size; + + if (!disk_need_zone_resources(disk)) + return 0; + + /* + * If the device has no limit on the maximum number of open and active + * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. + */ + pool_size = max(lim->max_open_zones, lim->max_active_zones); + if (!pool_size) + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); + + if (!disk->zone_wplugs_hash) + return disk_alloc_zone_resources(disk, pool_size); + + return 0; +} + +struct blk_revalidate_zone_args { + struct gendisk *disk; + unsigned long *conv_zones_bitmap; + unsigned int nr_zones; + unsigned int zone_capacity; + sector_t sector; +}; + +/* + * Update the disk zone resources information and device queue limits. + * The disk queue is frozen when this is executed. + */ +static int disk_update_zone_resources(struct gendisk *disk, + struct blk_revalidate_zone_args *args) +{ + struct request_queue *q = disk->queue; + unsigned int nr_seq_zones, nr_conv_zones = 0; + unsigned int pool_size; + struct queue_limits lim; + + disk->nr_zones = args->nr_zones; + disk->zone_capacity = args->zone_capacity; + swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); + if (disk->conv_zones_bitmap) + nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, + disk->nr_zones); + if (nr_conv_zones >= disk->nr_zones) { + pr_warn("%s: Invalid number of conventional zones %u / %u\n", + disk->disk_name, nr_conv_zones, disk->nr_zones); + return -ENODEV; + } + + if (!disk->zone_wplugs_pool) + return 0; + + /* + * If the device has no limit on the maximum number of open and active + * zones, set its max open zone limit to the mempool size to indicate + * to the user that there is a potential performance impact due to + * dynamic zone write plug allocation when simultaneously writing to + * more zones than the size of the mempool. 
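/*
 * Worked example, not part of this patch, for the sizing rules described in
 * the comment above and implemented just below, assuming a disk with 1000
 * sequential zones, no conventional zones and no open/active zone limits:
 *
 *	initial pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, 1000) = 128
 *	hash_bits         = min(ilog2(128) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS)
 *	                  = min(8, 9) = 8, i.e. 256 hash buckets
 *	resized pool_size = min(128, nr_seq_zones = 1000) = 128
 *	reported max_open_zones = 128 (since 128 < 1000 and no limits were
 *	set), signalling that writing to more than 128 zones simultaneously
 *	falls back to dynamic zone write plug allocation.
 */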
+ */ + lim = queue_limits_start_update(q); + + nr_seq_zones = disk->nr_zones - nr_conv_zones; + pool_size = max(lim.max_open_zones, lim.max_active_zones); + if (!pool_size) + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); + + mempool_resize(disk->zone_wplugs_pool, pool_size); + + if (!lim.max_open_zones && !lim.max_active_zones) { + if (pool_size < nr_seq_zones) + lim.max_open_zones = pool_size; + else + lim.max_open_zones = 0; + } + + return queue_limits_commit_update(q, &lim); +} + +static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + struct gendisk *disk = args->disk; + struct request_queue *q = disk->queue; + + if (zone->capacity != zone->len) { + pr_warn("%s: Invalid conventional zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + if (!disk_need_zone_resources(disk)) + return 0; + + if (!args->conv_zones_bitmap) { + args->conv_zones_bitmap = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->conv_zones_bitmap) + return -ENOMEM; + } + + set_bit(idx, args->conv_zones_bitmap); + + return 0; +} + +static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + struct gendisk *disk = args->disk; + struct blk_zone_wplug *zwplug; + unsigned int wp_offset; + unsigned long flags; + + /* + * Remember the capacity of the first sequential zone and check + * if it is constant for all zones. + */ + if (!args->zone_capacity) + args->zone_capacity = zone->capacity; + if (zone->capacity != args->zone_capacity) { + pr_warn("%s: Invalid variable zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + /* + * We need to track the write pointer of all zones that are not + * empty nor full. So make sure we have a zone write plug for + * such zone if the device has a zone write plug hash table. + */ + if (!disk->zone_wplugs_hash) + return 0; + + wp_offset = blk_zone_wp_offset(zone); + if (!wp_offset || wp_offset >= zone->capacity) + return 0; + + zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); + if (!zwplug) + return -ENOMEM; + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + + return 0; +} + +/* + * Helper function to check the validity of zones of a zoned block device. + */ +static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct blk_revalidate_zone_args *args = data; + struct gendisk *disk = args->disk; + sector_t capacity = get_capacity(disk); + sector_t zone_sectors = disk->queue->limits.chunk_sectors; + int ret; + + /* Check for bad zones and holes in the zone report */ + if (zone->start != args->sector) { + pr_warn("%s: Zone gap at sectors %llu..%llu\n", + disk->disk_name, args->sector, zone->start); + return -ENODEV; + } + + if (zone->start >= capacity || !zone->len) { + pr_warn("%s: Invalid zone start %llu, length %llu\n", + disk->disk_name, zone->start, zone->len); + return -ENODEV; + } + + /* + * All zones must have the same size, with the exception on an eventual + * smaller last zone. 
+ */ + if (zone->start + zone->len < capacity) { + if (zone->len != zone_sectors) { + pr_warn("%s: Invalid zoned device with non constant zone size\n", + disk->disk_name); + return -ENODEV; + } + } else if (zone->len > zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); + return -ENODEV; + } + + if (!zone->capacity || zone->capacity > zone->len) { + pr_warn("%s: Invalid zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + /* Check zone type */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + ret = blk_revalidate_conv_zone(zone, idx, args); + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + ret = blk_revalidate_seq_zone(zone, idx, args); + break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: + default: + pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", + disk->disk_name, (int)zone->type, zone->start); + ret = -ENODEV; + } + + if (!ret) + args->sector += zone->len; + + return ret; +} + +/** + * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs + * @disk: Target disk + * + * Helper function for low-level device drivers to check, (re) allocate and + * initialize resources used for managing zoned disks. This function should + * normally be called by blk-mq based drivers when a zoned gendisk is probed + * and when the zone configuration of the gendisk changes (e.g. after a format). + * Before calling this function, the device driver must already have set the + * device zone size (chunk_sector limit) and the max zone append limit. + * BIO based drivers can also use this function as long as the device queue + * can be safely frozen. + */ +int blk_revalidate_disk_zones(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + sector_t zone_sectors = q->limits.chunk_sectors; + sector_t capacity = get_capacity(disk); + struct blk_revalidate_zone_args args = { }; + unsigned int noio_flag; + int ret = -ENOMEM; + + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return -EIO; + + if (!capacity) + return -ENODEV; + + /* + * Checks that the device driver indicated a valid zone size and that + * the max zone append limit is set. + */ + if (!zone_sectors || !is_power_of_2(zone_sectors)) { + pr_warn("%s: Invalid non power of two zone size (%llu)\n", + disk->disk_name, zone_sectors); + return -ENODEV; + } + + if (!queue_max_zone_append_sectors(q)) { + pr_warn("%s: Invalid 0 maximum zone append limit\n", + disk->disk_name); + return -ENODEV; + } + + /* + * Ensure that all memory allocations in this context are done as if + * GFP_NOIO was specified. + */ + args.disk = disk; + args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); + noio_flag = memalloc_noio_save(); + ret = disk_revalidate_zone_resources(disk, args.nr_zones); + if (ret) { + memalloc_noio_restore(noio_flag); + return ret; + } + ret = disk->fops->report_zones(disk, 0, UINT_MAX, + blk_revalidate_zone_cb, &args); + if (!ret) { + pr_warn("%s: No zones reported\n", disk->disk_name); + ret = -ENODEV; + } + memalloc_noio_restore(noio_flag); + + /* + * If zones were reported, make sure that the entire disk capacity + * has been checked. + */ + if (ret > 0 && args.sector != capacity) { + pr_warn("%s: Missing zones from sector %llu\n", + disk->disk_name, args.sector); + ret = -ENODEV; + } + + /* + * Set the new disk zone parameters only once the queue is frozen and + * all I/Os are completed. 
+ */ + blk_mq_freeze_queue(q); + if (ret > 0) + ret = disk_update_zone_resources(disk, &args); + else + pr_warn("%s: failed to revalidate zones\n", disk->disk_name); + if (ret) + disk_free_zone_resources(disk); + blk_mq_unfreeze_queue(q); + + kfree(args.conv_zones_bitmap); + + return ret; +} +EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); + +#ifdef CONFIG_BLK_DEBUG_FS + +int queue_zone_wplugs_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct gendisk *disk = q->disk; + struct blk_zone_wplug *zwplug; + unsigned int zwp_wp_offset, zwp_flags; + unsigned int zwp_zone_no, zwp_ref; + unsigned int zwp_bio_list_size, i; + unsigned long flags; + + if (!disk->zone_wplugs_hash) + return 0; + + rcu_read_lock(); + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + hlist_for_each_entry_rcu(zwplug, + &disk->zone_wplugs_hash[i], node) { + spin_lock_irqsave(&zwplug->lock, flags); + zwp_zone_no = zwplug->zone_no; + zwp_flags = zwplug->flags; + zwp_ref = atomic_read(&zwplug->ref); + zwp_wp_offset = zwplug->wp_offset; + zwp_bio_list_size = bio_list_size(&zwplug->bio_list); + spin_unlock_irqrestore(&zwplug->lock, flags); + + seq_printf(m, "%u 0x%x %u %u %u\n", + zwp_zone_no, zwp_flags, zwp_ref, + zwp_wp_offset, zwp_bio_list_size); + } + } + rcu_read_unlock(); + + return 0; +} + +#endif diff --git a/block/blk.h b/block/blk.h index d9f584984bc4..6e94c10af798 100644 --- a/block/blk.h +++ b/block/blk.h @@ -38,6 +38,7 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); +void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { @@ -269,6 +270,14 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); +enum bio_merge_status { + BIO_MERGE_OK, + BIO_MERGE_NONE, + BIO_MERGE_FAILED, +}; + +enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, unsigned int nr_segs); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, @@ -357,6 +366,7 @@ static inline bool blk_do_io_stat(struct request *rq) } void update_io_ticks(struct block_device *part, unsigned long now, bool end); +unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { @@ -378,17 +388,6 @@ static inline void ioc_clear_queue(struct request_queue *q) } #endif /* CONFIG_BLK_ICQ */ -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); -extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, - const char *page, size_t count); -extern void blk_throtl_bio_endio(struct bio *bio); -extern void blk_throtl_stat_add(struct request *rq, u64 time); -#else -static inline void blk_throtl_bio_endio(struct bio *bio) { } -static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } -#endif - struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); static inline bool blk_queue_may_bounce(struct request_queue *q) @@ -407,13 +406,85 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, } #ifdef CONFIG_BLK_DEV_ZONED -void disk_free_zone_bitmaps(struct gendisk *disk); +void 
disk_init_zone_resources(struct gendisk *disk); +void disk_free_zone_resources(struct gendisk *disk); +static inline bool bio_zone_write_plugging(struct bio *bio) +{ + return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); +} +static inline bool bio_is_zone_append(struct bio *bio) +{ + return bio_op(bio) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); +} +void blk_zone_write_plug_bio_merged(struct bio *bio); +void blk_zone_write_plug_init_request(struct request *rq); +static inline void blk_zone_update_request_bio(struct request *rq, + struct bio *bio) +{ + /* + * For zone append requests, the request sector indicates the location + * at which the BIO data was written. Return this value to the BIO + * issuer through the BIO iter sector. + * For plugged zone writes, which include emulated zone append, we need + * the original BIO sector so that blk_zone_write_plug_bio_endio() can + * lookup the zone write plug. + */ + if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) + bio->bi_iter.bi_sector = rq->__sector; +} +void blk_zone_write_plug_bio_endio(struct bio *bio); +static inline void blk_zone_bio_endio(struct bio *bio) +{ + /* + * For write BIOs to zoned devices, signal the completion of the BIO so + * that the next write BIO can be submitted by zone write plugging. + */ + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_bio_endio(bio); +} + +void blk_zone_write_plug_finish_request(struct request *rq); +static inline void blk_zone_finish_request(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) + blk_zone_write_plug_finish_request(rq); +} int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ -static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} +static inline void disk_init_zone_resources(struct gendisk *disk) +{ +} +static inline void disk_free_zone_resources(struct gendisk *disk) +{ +} +static inline bool bio_zone_write_plugging(struct bio *bio) +{ + return false; +} +static inline bool bio_is_zone_append(struct bio *bio) +{ + return false; +} +static inline void blk_zone_write_plug_bio_merged(struct bio *bio) +{ +} +static inline void blk_zone_write_plug_init_request(struct request *rq) +{ +} +static inline void blk_zone_update_request_bio(struct request *rq, + struct bio *bio) +{ +} +static inline void blk_zone_bio_endio(struct bio *bio) +{ +} +static inline void blk_zone_finish_request(struct request *rq) +{ +} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { diff --git a/block/elevator.c b/block/elevator.c index 5ff093cb3cf8..f64ebd726e58 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,13 +83,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static inline bool elv_support_features(struct request_queue *q, - const struct elevator_type *e) -{ - return (q->required_elevator_features & e->elevator_features) == - q->required_elevator_features; -} - /** * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test @@ -120,7 +113,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q, spin_lock(&elv_list_lock); e = __elevator_find(name); - if (e && (!elv_support_features(q, e) || !elevator_tryget(e))) + if (e && (!elevator_tryget(e))) e = NULL; 
spin_unlock(&elv_list_lock); return e; @@ -580,34 +573,8 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) } /* - * Get the first elevator providing the features required by the request queue. - * Default to "none" if no matching elevator is found. - */ -static struct elevator_type *elevator_get_by_features(struct request_queue *q) -{ - struct elevator_type *e, *found = NULL; - - spin_lock(&elv_list_lock); - - list_for_each_entry(e, &elv_list, list) { - if (elv_support_features(q, e)) { - found = e; - break; - } - } - - if (found && !elevator_tryget(found)) - found = NULL; - - spin_unlock(&elv_list_lock); - return found; -} - -/* - * For a device queue that has no required features, use the default elevator - * settings. Otherwise, use the first elevator available matching the required - * features. If no suitable elevator is find or if the chosen elevator - * initialization fails, fall back to the "none" elevator (no elevator). + * Use the default elevator settings. If the chosen elevator initialization + * fails, fall back to the "none" elevator (no elevator). */ void elevator_init_mq(struct request_queue *q) { @@ -622,10 +589,7 @@ void elevator_init_mq(struct request_queue *q) if (unlikely(q->elevator)) return; - if (!q->required_elevator_features) - e = elevator_get_default(q); - else - e = elevator_get_by_features(q); + e = elevator_get_default(q); if (!e) return; @@ -781,7 +745,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) list_for_each_entry(e, &elv_list, list) { if (e == cur) len += sprintf(name+len, "[%s] ", e->elevator_name); - else if (elv_support_features(q, e)) + else len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); diff --git a/block/elevator.h b/block/elevator.h index 7ca3d7b6ed82..e9a050a96e53 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -74,7 +74,6 @@ struct elevator_type struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; - const unsigned int elevator_features; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; diff --git a/block/fops.c b/block/fops.c index af6c244314af..7a163f7fe2d8 100644 --- a/block/fops.c +++ b/block/fops.c @@ -44,18 +44,15 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, - struct iov_iter *iter, unsigned int nr_pages) + struct iov_iter *iter, struct block_device *bdev, + unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -161,9 +158,8 @@ static void blkdev_bio_end_io(struct bio *bio) } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; @@ -172,9 +168,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = 
bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -302,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; @@ -312,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -368,18 +358,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; + if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter)) + return -EINVAL; + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - return __blkdev_direct_IO_async(iocb, iter, nr_pages); + return __blkdev_direct_IO_simple(iocb, iter, bdev, + nr_pages); + return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); } - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); + return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -390,7 +385,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); - if (iomap->offset >= isize) + if (offset >= isize) return -EIO; iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; diff --git a/block/genhd.c b/block/genhd.c index bb29a68e1d67..7f39fbe60753 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part, } } -static unsigned int part_in_flight(struct block_device *part) +unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -345,9 +345,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) struct file *file; int ret = 0; - if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) - return -EINVAL; - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -503,8 +501,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_unregister_bdi; /* Make sure the first partition scan will be proceed */ - if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) && - !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); @@ -954,15 +951,10 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); - struct request_queue *q = bdev_get_queue(bdev); struct disk_stats stat; unsigned int inflight; - if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, bdev); - else - inflight = part_in_flight(bdev); - + inflight = part_in_flight(bdev); if (inflight) 
{ part_stat_lock(); update_io_ticks(bdev, jiffies, true); @@ -1047,6 +1039,12 @@ static ssize_t diskseq_show(struct device *dev, return sprintf(buf, "%llu\n", disk->diskseq); } +static ssize_t partscan_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -1060,6 +1058,7 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); +static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1106,6 +1105,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, + &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1182,7 +1182,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); - disk_free_zone_bitmaps(disk); + disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; @@ -1251,11 +1251,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); - else - inflight = part_in_flight(hd); + inflight = part_in_flight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); @@ -1364,6 +1361,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (blkcg_init_disk(disk)) goto out_erase_part0; + disk_init_zone_resources(disk); rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; diff --git a/block/ioctl.c b/block/ioctl.c index f505f9c341eb..c7db3bd2d653 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -33,7 +33,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, if (op == BLKPG_DEL_PARTITION) return bdev_del_partition(disk, p.pno); - if (p.start < 0 || p.length <= 0 || p.start + p.length < 0) + if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start) return -EINVAL; /* Check that the partition is aligned to the block size */ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) @@ -95,9 +95,12 @@ static int compat_blkpg_ioctl(struct block_device *bdev, static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { - uint64_t range[2]; - uint64_t start, len, end; + unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; struct inode *inode = bdev->bd_inode; + uint64_t range[2], start, len, end; + struct bio *prev = NULL, *bio; + sector_t sector, nr_sects; + struct blk_plug plug; int err; if (!(mode & BLK_OPEN_WRITE)) @@ -105,6 +108,8 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; + if (bdev_read_only(bdev)) + return -EPERM; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; @@ -112,9 +117,9 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, start = range[0]; len = range[1]; - if (start & 511) + if (!len) return 
-EINVAL; - if (len & 511) + if ((start | len) & bs_mask) return -EINVAL; if (check_add_overflow(start, len, &end) || @@ -125,7 +130,32 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + + sector = start >> SECTOR_SHIFT; + nr_sects = len >> SECTOR_SHIFT; + + blk_start_plug(&plug); + while (1) { + if (fatal_signal_pending(current)) { + if (prev) + bio_await_chain(prev); + err = -EINTR; + goto out_unplug; + } + bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, + GFP_KERNEL); + if (!bio) + break; + prev = bio_chain_and_submit(prev, bio); + } + if (prev) { + err = submit_bio_wait(prev); + if (err == -EOPNOTSUPP) + err = 0; + bio_put(prev); + } +out_unplug: + blk_finish_plug(&plug); fail: filemap_invalidate_unlock(inode->i_mapping); return err; } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee..94eede4fb9eb 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -102,7 +102,6 @@ struct deadline_data { int prio_aging_expire; spinlock_t lock; - spinlock_t zone_lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ @@ -129,36 +128,7 @@ static u8 dd_rq_ioclass(struct request *rq) } /* - * get the request before `rq' in sector-sorted order - */ -static inline struct request * -deadline_earlier_request(struct request *rq) -{ - struct rb_node *node = rb_prev(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -/* - * get the request after `rq' in sector-sorted order - */ -static inline struct request * -deadline_latter_request(struct request *rq) -{ - struct rb_node *node = rb_next(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -/* - * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, - * return the first request after the start of the zone containing @pos. + * Return the first request for which blk_rq_pos() >= @pos. */ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) @@ -170,14 +140,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, return NULL; rq = rb_entry_rq(node); - /* - * A zoned write may have been requeued with a starting position that - * is below that of the most recently dispatched request. Hence, for - * zoned writes, start searching from the start of a zone. - */ - if (blk_rq_is_seq_zoned_write(rq)) - pos = round_down(pos, rq->q->limits.chunk_sectors); - while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { @@ -308,36 +270,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } -/* - * Check if rq has a sequential request preceding it. - */ -static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) -{ - struct request *prev = deadline_earlier_request(rq); - - if (!prev) - return false; - - return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); -} - -/* - * Skip all write requests that are sequential from @rq, even if we cross - * a zone boundary. 
- */ -static struct request *deadline_skip_seq_writes(struct deadline_data *dd, - struct request *rq) -{ - sector_t pos = blk_rq_pos(rq); - - do { - pos += blk_rq_sectors(rq); - rq = deadline_latter_request(rq); - } while (rq && blk_rq_pos(rq) == pos); - - return rq; -} - /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. @@ -346,40 +278,10 @@ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq, *rb_rq, *next; - unsigned long flags; - if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; - rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. For some HDDs, breaking a sequential - * write stream can lead to lower throughput, so make sure to preserve - * sequential write streams, even if that stream crosses into the next - * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], - queuelist) { - /* Check whether a prior request exists for the same zone. */ - rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); - if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) - rq = rb_rq; - if (blk_req_can_dispatch_to_zone(rq) && - (blk_queue_nonrot(rq->q) || - !deadline_is_seq_write(dd, rq))) - goto out; - } - rq = NULL; -out: - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; + return rq_entry_fifo(per_prio->fifo_list[data_dir].next); } /* @@ -390,36 +292,8 @@ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq; - unsigned long flags; - - rq = deadline_from_pos(per_prio, data_dir, - per_prio->latest_pos[data_dir]); - if (!rq) - return NULL; - - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. For some HDDs, breaking a sequential - * write stream can lead to lower throughput, so make sure to preserve - * sequential write streams, even if that stream crosses into the next - * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - break; - if (blk_queue_nonrot(rq->q)) - rq = deadline_latter_request(rq); - else - rq = deadline_skip_seq_writes(dd, rq); - } - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; + return deadline_from_pos(per_prio, data_dir, + per_prio->latest_pos[data_dir]); } /* @@ -525,10 +399,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, rq = next_rq; } - /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. - */ if (!rq) return NULL; @@ -549,10 +419,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, prio = ioprio_class_to_prio[ioprio_class]; dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; - /* - * If the request needs its target zone locked, do it. 
- */ - blk_req_zone_write_lock(rq); rq->rq_flags |= RQF_STARTED; return rq; } @@ -722,7 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); - spin_lock_init(&dd->zone_lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); @@ -804,12 +669,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, lockdep_assert_held(&dd->lock); - /* - * This may be a requeue of a write request that has locked its - * target zone. If it is the case, this releases the zone lock. - */ - blk_req_zone_write_unlock(rq); - prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; if (!rq->elv.priv[0]) { @@ -841,18 +700,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; insert_before = &per_prio->fifo_list[data_dir]; -#ifdef CONFIG_BLK_DEV_ZONED - /* - * Insert zoned writes such that requests are sorted by - * position per zone. - */ - if (blk_rq_is_seq_zoned_write(rq)) { - struct request *rq2 = deadline_latter_request(rq); - - if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) - insert_before = &rq2->queuelist; - } -#endif list_add_tail(&rq->queuelist, insert_before); } } @@ -887,33 +734,8 @@ static void dd_prepare_request(struct request *rq) rq->elv.priv[0] = NULL; } -static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) -{ - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - enum dd_prio p; - - for (p = 0; p <= DD_PRIO_MAX; p++) - if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) - return true; - - return false; -} - /* * Callback from inside blk_mq_free_request(). - * - * For zoned block devices, write unlock the target zone of - * completed write requests. Do this while holding the zone lock - * spinlock so that the zone is never unlocked while deadline_fifo_request() - * or deadline_next_request() are executing. This function is called for - * all requests, whether or not these requests complete successfully. - * - * For a zoned block device, __dd_dispatch_request() may have stopped - * dispatching requests if all the queued requests are write requests directed - * at zones that are already locked due to on-going write requests. To ensure - * write request dispatch progress in this case, mark the queue as needing a - * restart to ensure that the queue is run again after completion of the - * request and zones being unlocked. */ static void dd_finish_request(struct request *rq) { @@ -928,21 +750,8 @@ static void dd_finish_request(struct request *rq) * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). 
*/ - if (!rq->elv.priv[0]) - return; - - atomic_inc(&per_prio->stats.completed); - - if (blk_queue_is_zoned(q)) { - unsigned long flags; - - spin_lock_irqsave(&dd->zone_lock, flags); - blk_req_zone_write_unlock(rq); - spin_unlock_irqrestore(&dd->zone_lock, flags); - - if (dd_has_write_work(rq->mq_hctx)) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); - } + if (rq->elv.priv[0]) + atomic_inc(&per_prio->stats.completed); } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) @@ -1266,7 +1075,6 @@ static struct elevator_type mq_deadline = { .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", - .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index c03bc105e575..152c85df92b2 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -70,8 +70,8 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) } if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); + partdef++; + char *next = strsep(&partdef, ")"); if (!next) { pr_warn("cmdline partition format is invalid."); @@ -79,11 +79,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) goto fail; } - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strscpy(new_subpart->name, partdef, length); - - partdef = ++next; + strscpy(new_subpart->name, next, sizeof(new_subpart->name)); } else new_subpart->name[0] = '\0'; @@ -117,14 +113,12 @@ static void free_subpart(struct cmdline_parts *parts) } } -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +static int parse_parts(struct cmdline_parts **parts, char *bdevdef) { int ret = -EINVAL; char *next; - int length; struct cmdline_subpart **next_subpart; struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; *parts = NULL; @@ -132,28 +126,19 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) if (!newparts) return -ENOMEM; - next = strchr(bdevdef, ':'); + next = strsep(&bdevdef, ":"); if (!next) { pr_warn("cmdline partition has no block device."); goto fail; } - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strscpy(newparts->name, bdevdef, length); + strscpy(newparts->name, next, sizeof(newparts->name)); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? 
(sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strscpy(buf, bdevdef, length); - - ret = parse_subpart(next_subpart, buf); + while ((next = strsep(&bdevdef, ","))) { + ret = parse_subpart(next_subpart, next); if (ret) goto fail; @@ -199,24 +184,17 @@ static int cmdline_parts_parse(struct cmdline_parts **parts, *parts = NULL; - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + pbuf = buf = kstrdup(cmdline, GFP_KERNEL); if (!buf) return -ENOMEM; next_parts = parts; - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); + while ((next = strsep(&pbuf, ";"))) { + ret = parse_parts(next_parts, next); if (ret) goto fail; - if (next) - pbuf = ++next; - next_parts = &(*next_parts)->next_parts; } @@ -250,7 +228,6 @@ static struct cmdline_parts *bdev_parts; static int add_part(int slot, struct cmdline_subpart *subpart, struct parsed_partitions *state) { - int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; @@ -262,9 +239,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, info = &state->parts[slot].info; - label_min = min_t(int, sizeof(info->volname) - 1, - sizeof(subpart->name)); - strscpy(info->volname, subpart->name, label_min); + strscpy(info->volname, subpart->name, sizeof(info->volname)); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); diff --git a/block/partitions/core.c b/block/partitions/core.c index b11e88c82c8c..37b5f92d07fe 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -573,10 +573,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (disk->flags & GENHD_FL_NO_PART) - return 0; - - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index e322cef6596b..b900fe9e0030 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -29,10 +29,7 @@ /* * Each block ramdisk device has a xarray brd_pages of pages that stores - * the pages containing the block device's contents. A brd page's ->index is - * its offset in PAGE_SIZE units. This is similar to, but in no way connected - * with, the kernel's pagecache or buffer cache (which sit above our block - * device). + * the pages containing the block device's contents. 
*/ struct brd_device { int brd_number; @@ -51,15 +48,7 @@ struct brd_device { */ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { - pgoff_t idx; - struct page *page; - - idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ - page = xa_load(&brd->brd_pages, idx); - - BUG_ON(page && page->index != idx); - - return page; + return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); } /* @@ -67,8 +56,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) */ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) { - pgoff_t idx; - struct page *page, *cur; + pgoff_t idx = sector >> PAGE_SECTORS_SHIFT; + struct page *page; int ret = 0; page = brd_lookup_page(brd, sector); @@ -80,23 +69,16 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) return -ENOMEM; xa_lock(&brd->brd_pages); - - idx = sector >> PAGE_SECTORS_SHIFT; - page->index = idx; - - cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp); - - if (unlikely(cur)) { - __free_page(page); - ret = xa_err(cur); - if (!ret && (cur->index != idx)) - ret = -EIO; - } else { + ret = __xa_insert(&brd->brd_pages, idx, page, gfp); + if (!ret) brd->brd_nr_pages++; - } - xa_unlock(&brd->brd_pages); + if (ret < 0) { + __free_page(page); + if (ret == -EBUSY) + ret = 0; + } return ret; } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index ed33cf7192d2..4005a8b685e8 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -225,6 +225,10 @@ static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); +static bool g_fua = true; +module_param_named(fua, g_fua, bool, 0444); +MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); + static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); @@ -253,6 +257,11 @@ static unsigned int g_zone_max_active; module_param_named(zone_max_active, g_zone_max_active, uint, 0444); MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); +static int g_zone_append_max_sectors = INT_MAX; +module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); +MODULE_PARM_DESC(zone_append_max_sectors, + "Maximum size of a zone append command (in 512B sectors). 
Specify 0 for zone append emulation"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -436,10 +445,12 @@ NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); +NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); +NULLB_DEVICE_ATTR(fua, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -580,12 +591,14 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, + &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_offline, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, &nullb_device_attr_shared_tags, &nullb_device_attr_shared_tag_bitmap, + &nullb_device_attr_fua, NULL, }; @@ -664,14 +677,14 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "badblocks,blocking,blocksize,cache_size," + "badblocks,blocking,blocksize,cache_size,fua," "completion_nsec,discard,home_node,hw_queue_depth," "irqmode,max_sectors,mbps,memory_backed,no_sched," "poll_queues,power,queue_mode,shared_tag_bitmap," "shared_tags,size,submit_queues,use_per_node_hctx," "virt_boundary,zoned,zone_capacity,zone_max_active," "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size\n"); + "zone_size,zone_append_max_sectors\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -751,10 +764,13 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_nr_conv = g_zone_nr_conv; dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; + dev->zone_append_max_sectors = g_zone_append_max_sectors; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; + dev->fua = g_fua; + return dev; } @@ -1151,7 +1167,7 @@ blk_status_t null_handle_discard(struct nullb_device *dev, return BLK_STS_OK; } -static int null_handle_flush(struct nullb *nullb) +static blk_status_t null_handle_flush(struct nullb *nullb) { int err; @@ -1168,7 +1184,7 @@ static int null_handle_flush(struct nullb *nullb) WARN_ON(!radix_tree_empty(&nullb->dev->cache)); spin_unlock_irq(&nullb->lock); - return err; + return errno_to_blk_status(err); } static int null_transfer(struct nullb *nullb, struct page *page, @@ -1206,7 +1222,7 @@ static int null_handle_rq(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; - int err; + int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); struct req_iterator iter; @@ -1218,15 +1234,13 @@ static int null_handle_rq(struct nullb_cmd *cmd) err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } + if (err) + break; sector += len >> SECTOR_SHIFT; } spin_unlock_irq(&nullb->lock); - return 0; + return errno_to_blk_status(err); } static inline blk_status_t 
null_handle_throttled(struct nullb_cmd *cmd) @@ -1273,8 +1287,8 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); - return errno_to_blk_status(null_handle_rq(cmd)); + return null_handle_rq(cmd); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) @@ -1343,7 +1357,7 @@ static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, blk_status_t sts; if (op == REQ_OP_FLUSH) { - cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + cmd->error = null_handle_flush(nullb); goto out; } @@ -1912,7 +1926,7 @@ static int null_add_dev(struct nullb_device *dev) if (dev->cache_size > 0) { set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, true); + blk_queue_write_cache(nullb->q, true, dev->fua); } nullb->q->queuedata = nullb; @@ -2113,10 +2127,13 @@ static void __exit null_exit(void) if (tag_set.ops) blk_mq_free_tag_set(&tag_set); + + mutex_destroy(&lock); } module_init(null_init); module_exit(null_exit); MODULE_AUTHOR("Jens Axboe "); +MODULE_DESCRIPTION("multi queue aware block test driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 477b97746823..3234e6c85eed 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -82,6 +82,7 @@ struct nullb_device { unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int zone_max_open; /* max number of open zones */ unsigned int zone_max_active; /* max number of active zones */ + unsigned int zone_append_max_sectors; /* Max sectors per zone append command */ unsigned int submit_queues; /* number of submission queues */ unsigned int prev_submit_queues; /* number of submission queues before change */ unsigned int poll_queues; /* number of IOPOLL submission queues */ @@ -104,6 +105,7 @@ struct nullb_device { bool no_sched; /* no IO scheduler for the device */ bool shared_tags; /* share tag set between devices for blk-mq */ bool shared_tag_bitmap; /* use hostwide shared tags */ + bool fua; /* Support FUA */ }; struct nullb { diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 1689e2584104..5b5a63adacc1 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -9,6 +9,8 @@ #undef pr_fmt #define pr_fmt(fmt) "null_blk: " fmt +#define NULL_ZONE_INVALID_WP ((sector_t)-1) + static inline sector_t mb_to_sects(unsigned long mb) { return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT; @@ -19,18 +21,6 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) return sect >> ilog2(dev->zone_size_sects); } -static inline void null_lock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_lock_irq(&dev->zone_res_lock); -} - -static inline void null_unlock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_unlock_irq(&dev->zone_res_lock); -} - static inline void null_init_zone_lock(struct nullb_device *dev, struct nullb_zone *zone) { @@ -103,6 +93,11 @@ int null_init_zoned_dev(struct nullb_device *dev, dev->zone_nr_conv); } + dev->zone_append_max_sectors = + min(ALIGN_DOWN(dev->zone_append_max_sectors, + dev->blocksize >> SECTOR_SHIFT), + zone_capacity_sects); + /* Max active zones has to be < nbr of seq zones in order to be enforceable */ if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_active = 0; @@ -154,7 +149,7 @@ int 
null_init_zoned_dev(struct nullb_device *dev, lim->zoned = true; lim->chunk_sectors = dev->zone_size_sects; - lim->max_zone_append_sectors = dev->zone_size_sects; + lim->max_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; lim->max_active_zones = dev->zone_max_active; return 0; @@ -163,11 +158,16 @@ int null_init_zoned_dev(struct nullb_device *dev, int null_register_zoned_dev(struct nullb *nullb) { struct request_queue *q = nullb->q; + struct gendisk *disk = nullb->disk; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); - return blk_revalidate_disk_zones(nullb->disk, NULL); + disk->nr_zones = bdev_nr_zones(disk->part0); + + pr_info("%s: using %s zone append\n", + disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); + + return blk_revalidate_disk_zones(disk); } void null_free_zoned_dev(struct nullb_device *dev) @@ -241,35 +241,6 @@ size_t null_zone_valid_read_len(struct nullb *nullb, return (zone->wp - sector) << SECTOR_SHIFT; } -static blk_status_t __null_close_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_CLOSED: - /* close operation on closed is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_FULL: - default: - return BLK_STS_IOERR; - } - - if (zone->wp == zone->start) { - zone->cond = BLK_ZONE_COND_EMPTY; - } else { - zone->cond = BLK_ZONE_COND_CLOSED; - dev->nr_zones_closed++; - } - - return BLK_STS_OK; -} - static void null_close_imp_open_zone(struct nullb_device *dev) { struct nullb_zone *zone; @@ -286,7 +257,13 @@ static void null_close_imp_open_zone(struct nullb_device *dev) zno = dev->zone_nr_conv; if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, zone); + dev->nr_zones_imp_open--; + if (zone->wp == zone->start) { + zone->cond = BLK_ZONE_COND_EMPTY; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + dev->nr_zones_closed++; + } dev->imp_close_zone_no = zno; return; } @@ -374,73 +351,73 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, null_lock_zone(dev, zone); - if (zone->cond == BLK_ZONE_COND_FULL || - zone->cond == BLK_ZONE_COND_READONLY || - zone->cond == BLK_ZONE_COND_OFFLINE) { - /* Cannot write to the zone */ - ret = BLK_STS_IOERR; - goto unlock; - } - /* - * Regular writes must be at the write pointer position. - * Zone append writes are automatically issued at the write - * pointer and the position returned using the request or BIO - * sector. + * Regular writes must be at the write pointer position. Zone append + * writes are automatically issued at the write pointer and the position + * returned using the request sector. Note that we do not check the zone + * condition because for FULL, READONLY and OFFLINE zones, the sector + * check against the zone write pointer will always result in failing + * the command. 
*/ if (append) { + if (WARN_ON_ONCE(!dev->zone_append_max_sectors) || + zone->wp == NULL_ZONE_INVALID_WP) { + ret = BLK_STS_IOERR; + goto unlock_zone; + } sector = zone->wp; blk_mq_rq_from_pdu(cmd)->__sector = sector; - } else if (sector != zone->wp) { - ret = BLK_STS_IOERR; - goto unlock; } - if (zone->wp + nr_sectors > zone->start + zone->capacity) { + if (sector != zone->wp || + zone->wp + nr_sectors > zone->start + zone->capacity) { ret = BLK_STS_IOERR; - goto unlock; + goto unlock_zone; } if (zone->cond == BLK_ZONE_COND_CLOSED || zone->cond == BLK_ZONE_COND_EMPTY) { - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) { - null_unlock_zone_res(dev); - goto unlock; - } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; - } + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + goto unlock_zone; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; + spin_unlock(&dev->zone_res_lock); + } - null_unlock_zone_res(dev); + zone->cond = BLK_ZONE_COND_IMP_OPEN; } ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); if (ret != BLK_STS_OK) - goto unlock; + goto unlock_zone; zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { - null_lock_zone_res(dev); - if (zone->cond == BLK_ZONE_COND_EXP_OPEN) - dev->nr_zones_exp_open--; - else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) - dev->nr_zones_imp_open--; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) + dev->nr_zones_exp_open--; + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) + dev->nr_zones_imp_open--; + spin_unlock(&dev->zone_res_lock); + } zone->cond = BLK_ZONE_COND_FULL; - null_unlock_zone_res(dev); } ret = BLK_STS_OK; -unlock: +unlock_zone: null_unlock_zone(dev, zone); return ret; @@ -454,54 +431,100 @@ static blk_status_t null_open_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); - switch (zone->cond) { case BLK_ZONE_COND_EXP_OPEN: - /* open operation on exp open is not an error */ - goto unlock; + /* Open operation on exp open is not an error */ + return BLK_STS_OK; case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; break; case BLK_ZONE_COND_FULL: default: - ret = BLK_STS_IOERR; - goto unlock; + return BLK_STS_IOERR; } - zone->cond = BLK_ZONE_COND_EXP_OPEN; - dev->nr_zones_exp_open++; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); -unlock: - null_unlock_zone_res(dev); + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); 
+ if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + dev->nr_zones_closed--; + break; + default: + break; + } - return ret; + dev->nr_zones_exp_open++; + + spin_unlock(&dev->zone_res_lock); + } + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + + return BLK_STS_OK; } static blk_status_t null_close_zone(struct nullb_device *dev, struct nullb_zone *zone) { - blk_status_t ret; - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); - ret = __null_close_zone(dev, zone); - null_unlock_zone_res(dev); + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + /* close operation on closed is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - return ret; + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + default: + break; + } + + if (zone->wp > zone->start) + dev->nr_zones_closed++; + + spin_unlock(&dev->zone_res_lock); + } + + if (zone->wp == zone->start) + zone->cond = BLK_ZONE_COND_EMPTY; + else + zone->cond = BLK_ZONE_COND_CLOSED; + + return BLK_STS_OK; } static blk_status_t null_finish_zone(struct nullb_device *dev, @@ -512,41 +535,47 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - switch (zone->cond) { - case BLK_ZONE_COND_FULL: - /* finish operation on full is not an error */ - goto unlock; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; - break; - default: - ret = BLK_STS_IOERR; - goto unlock; + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* Finish operation on full is not an error */ + spin_unlock(&dev->zone_res_lock); + return BLK_STS_OK; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + dev->nr_zones_closed--; + break; + default: + spin_unlock(&dev->zone_res_lock); + return BLK_STS_IOERR; + } + + spin_unlock(&dev->zone_res_lock); } zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zone->len; -unlock: - null_unlock_zone_res(dev); - - return ret; + return BLK_STS_OK; } static blk_status_t null_reset_zone(struct nullb_device *dev, @@ -555,34 +584,33 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - /* reset operation on empty is not an error 
*/ - null_unlock_zone_res(dev); - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - break; - default: - null_unlock_zone_res(dev); - return BLK_STS_IOERR; + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + break; + default: + spin_unlock(&dev->zone_res_lock); + return BLK_STS_IOERR; + } + + spin_unlock(&dev->zone_res_lock); } zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; - null_unlock_zone_res(dev); - if (dev->memory_backed) return null_handle_discard(dev, zone->start, zone->len); @@ -711,7 +739,7 @@ static void null_set_zone_cond(struct nullb_device *dev, zone->cond != BLK_ZONE_COND_OFFLINE) null_finish_zone(dev, zone); zone->cond = cond; - zone->wp = (sector_t)-1; + zone->wp = NULL_ZONE_INVALID_WP; } null_unlock_zone(dev, zone); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 374e4efa8759..176657dce3e3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -221,7 +221,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub) static int ublk_revalidate_disk_zones(struct ublk_device *ub) { - return blk_revalidate_disk_zones(ub->ub_disk, NULL); + return blk_revalidate_disk_zones(ub->ub_disk); } static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) @@ -249,8 +249,7 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); - blk_queue_required_elevator_features(ub->ub_disk->queue, - ELEVATOR_F_ZBD_SEQ_WRITE); + ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 42dea7601d87..c1af0a7d56c8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1543,7 +1543,7 @@ static int virtblk_probe(struct virtio_device *vdev) */ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); - err = blk_revalidate_disk_zones(vblk->disk, NULL); + err = blk_revalidate_disk_zones(vblk->disk); if (err) goto out_cleanup_disk; } diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 2bba4d6aaaa2..463eb13bd0b2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,7 +54,7 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k; if (b->ops->is_extents) @@ -67,7 +67,7 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) 
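/*
 * A minimal userspace sketch of the pattern the bcache hunks switch to: a
 * struct ending in a flexible array member cannot live on the stack, so a
 * fixed-size wrapper embeds the storage right after it, while heap users
 * size the trailing array explicitly in the allocation. Names here are
 * illustrative only; embedding a struct that ends in a flexible array
 * member is a GNU C extension that the kernel relies on.
 */
#include <stdio.h>
#include <stdlib.h>

struct iter_set { int k; };

struct iter {
	unsigned int size, used;
	struct iter_set data[];		/* flexible array member */
};

struct iter_stack {			/* fixed-size, stack-allocatable */
	struct iter iter;
	struct iter_set stack_data[4];	/* plays the role of MAX_BSETS */
};

static void iter_init(struct iter *it, unsigned int size)
{
	it->size = size;
	it->used = 0;
}

int main(void)
{
	struct iter_stack st;
	struct iter *heap;

	iter_init(&st.iter, sizeof(st.stack_data) / sizeof(st.stack_data[0]));

	/* heap users allocate the header plus N trailing entries in one go */
	heap = malloc(sizeof(*heap) + 16 * sizeof(struct iter_set));
	if (!heap)
		return 1;
	iter_init(heap, 16);

	printf("stack iter size=%u, heap iter size=%u\n", st.iter.size, heap->size);
	free(heap);
	return 0;
}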
{ va_list args; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; const char *err; for_each_key(b, k, &iter) { @@ -879,7 +879,7 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; @@ -895,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_init(b, &iter, preceding_key_p); + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1100,33 +1100,33 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->size = ARRAY_SIZE(iter->data); - iter->used = 0; + iter->iter.size = ARRAY_SIZE(iter->stack_data); + iter->iter.used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->iter.b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->set); + return __bch_btree_iter_stack_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, @@ -1293,10 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter iter; + struct btree_iter_stack iter; int oldsize = bch_count_data(b); - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1307,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(b, &iter.iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1323,11 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_stack_init(b, &iter, NULL); - btree_mergesort(b, new->set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter.iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index d795c84246b0..011f6062c4c0 100644 --- a/drivers/md/bcache/bset.h +++ 
b/drivers/md/bcache/bset.h @@ -321,7 +321,14 @@ struct btree_iter { #endif struct btree_iter_set { struct bkey *k, *end; - } data[MAX_BSETS]; + } data[]; +}; + +/* Fixed-size btree_iter that can be allocated on the stack */ + +struct btree_iter_stack { + struct btree_iter iter; + struct btree_iter_set stack_data[MAX_BSETS]; }; typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); @@ -333,9 +340,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search); +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -350,13 +357,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? __bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) +#define for_each_key_filter(b, k, stack_iter, filter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ + filter));) -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) +#define for_each_key(b, k, stack_iter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 196cdacce38f..d011a7154d33 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1309,7 +1309,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct bset_tree *t; gc->nodes++; @@ -1570,7 +1570,7 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; unsigned int ret = 0; for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) @@ -1611,17 +1611,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1911,7 +1912,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1919,10 +1920,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, 
b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_stack_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, &b->keys, + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1950,7 +1951,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1959,8 +1960,8 @@ static int bch_btree_check_thread(void *arg) ret = 0; /* root node keys are checked before thread created */ - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1978,7 +1979,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -2051,7 +2052,7 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct btree_check_state check_state; /* check and mark root node keys */ @@ -2547,11 +2548,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2580,11 +2581,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad))) { ret = !b->level ? 
fn(op, b, k) : bcache_btree(map_keys_recurse, k, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 330bcd9ea4a9..cba09660148a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -881,8 +881,8 @@ static void bcache_device_free(struct bcache_device *d) bcache_device_detach(d); if (disk) { - ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(disk->first_minor)); + ida_free(&bcache_device_idx, + first_minor_to_idx(disk->first_minor)); put_disk(disk); } @@ -940,8 +940,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, if (!d->full_dirty_stripes) goto out_free_stripe_sectors_dirty; - idx = ida_simple_get(&bcache_device_idx, 0, - BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + idx = ida_alloc_max(&bcache_device_idx, BCACHE_DEVICE_IDX_MAX - 1, + GFP_KERNEL); if (idx < 0) goto out_free_full_dirty_stripes; @@ -986,7 +986,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, out_bioset_exit: bioset_exit(&d->bio_split); out_ida_remove: - ida_simple_remove(&bcache_device_idx, idx); + ida_free(&bcache_device_idx, idx); out_free_full_dirty_stripes: kvfree(d->full_dirty_stripes); out_free_stripe_sectors_dirty: @@ -1914,8 +1914,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * - sizeof(struct btree_iter_set); + iter_size = sizeof(struct btree_iter) + + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); if (!c->devices) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 6956beb55326..826b14cae4e5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -660,7 +660,7 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter iter; + struct btree_iter_stack iter; goto lock_root; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8827a6f130ad..792e070ccf38 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -908,15 +908,15 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; prev_idx = 0; - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -930,7 +930,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -979,7 +979,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) int i; struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index fd852981ef9c..cf433b0cf742 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ 
b/drivers/md/dm-bio-prison-v2.c @@ -321,8 +321,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison, { BUG_ON(!cell->exclusive_lock); - bio_list_merge(bios, &cell->bios); - bio_list_init(&cell->bios); + bio_list_merge_init(bios, &cell->bios); if (cell->shared_count) { cell->exclusive_lock = false; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 911f73f7ebba..0fcbf8603846 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -115,8 +115,7 @@ static void __commit(struct work_struct *_ws) */ spin_lock_irq(&b->lock); list_splice_init(&b->work_items, &work_items); - bio_list_merge(&bios, &b->bios); - bio_list_init(&b->bios); + bio_list_merge_init(&bios, &b->bios); b->commit_scheduled = false; spin_unlock_irq(&b->lock); @@ -565,8 +564,7 @@ static void defer_bio(struct cache *cache, struct bio *bio) static void defer_bios(struct cache *cache, struct bio_list *bios) { spin_lock_irq(&cache->lock); - bio_list_merge(&cache->deferred_bios, bios); - bio_list_init(bios); + bio_list_merge_init(&cache->deferred_bios, bios); spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); @@ -1816,8 +1814,7 @@ static void process_deferred_bios(struct work_struct *ws) bio_list_init(&bios); spin_lock_irq(&cache->lock); - bio_list_merge(&bios, &cache->deferred_bios); - bio_list_init(&cache->deferred_bios); + bio_list_merge_init(&bios, &cache->deferred_bios); spin_unlock_irq(&cache->lock); while ((bio = bio_list_pop(&bios))) { @@ -1847,8 +1844,7 @@ static void requeue_deferred_bios(struct cache *cache) struct bio_list bios; bio_list_init(&bios); - bio_list_merge(&bios, &cache->deferred_bios); - bio_list_init(&cache->deferred_bios); + bio_list_merge_init(&bios, &cache->deferred_bios); while ((bio = bio_list_pop(&bios))) { bio->bi_status = BLK_STS_DM_REQUEUE; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 94b2fc33f64b..3f68672ab7c9 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -1181,8 +1181,7 @@ static void process_deferred_discards(struct clone *clone) struct bio_list discards = BIO_EMPTY_LIST; spin_lock_irq(&clone->lock); - bio_list_merge(&discards, &clone->deferred_discard_bios); - bio_list_init(&clone->deferred_discard_bios); + bio_list_merge_init(&discards, &clone->deferred_discard_bios); spin_unlock_irq(&clone->lock); if (bio_list_empty(&discards)) @@ -1215,8 +1214,7 @@ static void process_deferred_bios(struct clone *clone) struct bio_list bios = BIO_EMPTY_LIST; spin_lock_irq(&clone->lock); - bio_list_merge(&bios, &clone->deferred_bios); - bio_list_init(&clone->deferred_bios); + bio_list_merge_init(&bios, &clone->deferred_bios); spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios)) @@ -1237,11 +1235,9 @@ static void process_deferred_flush_bios(struct clone *clone) * before issuing them or signaling their completion. 
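/*
 * A minimal userspace sketch of what the repeated two-step pattern being
 * replaced in the device-mapper hunks above collapses into: splice one list
 * onto the tail of another, then leave the source list empty. This is a
 * simplified singly-linked list, not the kernel's struct bio_list.
 */
#include <stdio.h>
#include <stddef.h>

struct node { struct node *next; int id; };
struct list { struct node *head, *tail; };

static void list_init(struct list *l) { l->head = l->tail = NULL; }

static void list_add(struct list *l, struct node *n)
{
	n->next = NULL;
	if (l->tail)
		l->tail->next = n;
	else
		l->head = n;
	l->tail = n;
}

/* merge @src onto the tail of @dst, then reinitialize @src */
static void list_merge_init(struct list *dst, struct list *src)
{
	if (!src->head)
		return;
	if (dst->tail)
		dst->tail->next = src->head;
	else
		dst->head = src->head;
	dst->tail = src->tail;
	list_init(src);
}

int main(void)
{
	struct list a, b;
	struct node n1 = { .id = 1 }, n2 = { .id = 2 };

	list_init(&a);
	list_init(&b);
	list_add(&a, &n1);
	list_add(&b, &n2);

	list_merge_init(&a, &b);
	printf("a: %d -> %d, b empty: %d\n",
	       a.head->id, a.tail->id, b.head == NULL);
	return 0;
}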
*/ spin_lock_irq(&clone->lock); - bio_list_merge(&bios, &clone->deferred_flush_bios); - bio_list_init(&clone->deferred_flush_bios); - - bio_list_merge(&bio_completions, &clone->deferred_flush_completions); - bio_list_init(&clone->deferred_flush_completions); + bio_list_merge_init(&bios, &clone->deferred_flush_bios); + bio_list_merge_init(&bio_completions, + &clone->deferred_flush_completions); spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index e6757a30dcca..08700bfc3e23 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -140,7 +140,7 @@ struct mapped_device { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_zones; - unsigned int *zwp_offset; + void *zone_revalidate_map; #endif #ifdef CONFIG_IMA diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 6acfa5bf97a4..8f81e597858d 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1272,8 +1272,7 @@ static void process_deferred_bios(struct era *era) bio_list_init(&marked_bios); spin_lock(&era->deferred_lock); - bio_list_merge(&deferred_bios, &era->deferred_bios); - bio_list_init(&era->deferred_bios); + bio_list_merge_init(&deferred_bios, &era->deferred_bios); spin_unlock(&era->deferred_lock); if (bio_list_empty(&deferred_bios)) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 05d1328d1811..15b681b90153 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -704,8 +704,7 @@ static void process_queued_bios(struct work_struct *work) return; } - bio_list_merge(&bios, &m->queued_bios); - bio_list_init(&m->queued_bios); + bio_list_merge_init(&bios, &m->queued_bios); spin_unlock_irqrestore(&m->lock, flags); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 41f1d731ae5a..2c6fbd87363f 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2042,7 +2042,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; - if (!static_key_enabled(&zoned_enabled.key)) + if (blk_queue_is_zoned(q) && + !static_key_enabled(&zoned_enabled.key)) static_branch_enable(&zoned_enabled); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4793ad2aa1f7..f359984c8ef2 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -592,12 +592,6 @@ struct dm_thin_endio_hook { struct dm_bio_prison_cell *cell; }; -static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) -{ - bio_list_merge(bios, master); - bio_list_init(master); -} - static void error_bio_list(struct bio_list *bios, blk_status_t error) { struct bio *bio; @@ -616,7 +610,7 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, bio_list_init(&bios); spin_lock_irq(&tc->lock); - __merge_bio_list(&bios, master); + bio_list_merge_init(&bios, master); spin_unlock_irq(&tc->lock); error_bio_list(&bios, error); @@ -645,8 +639,8 @@ static void requeue_io(struct thin_c *tc) bio_list_init(&bios); spin_lock_irq(&tc->lock); - __merge_bio_list(&bios, &tc->deferred_bio_list); - __merge_bio_list(&bios, &tc->retry_on_resume_list); + bio_list_merge_init(&bios, &tc->deferred_bio_list); + bio_list_merge_init(&bios, &tc->retry_on_resume_list); spin_unlock_irq(&tc->lock); error_bio_list(&bios, BLK_STS_DM_REQUEUE); diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 94f6f1ccfb7d..ab3ea8337809 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ 
-604,8 +604,7 @@ static void assign_discard_permit(struct limiter *limiter) static void get_waiters(struct limiter *limiter) { - bio_list_merge(&limiter->waiters, &limiter->new_waiters); - bio_list_init(&limiter->new_waiters); + bio_list_merge_init(&limiter->waiters, &limiter->new_waiters); } static inline struct data_vio *get_available_data_vio(struct data_vio_pool *pool) diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 57e87f0d7069..dd4fdee2ca0c 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -369,8 +369,7 @@ void vdo_dump_flusher(const struct flusher *flusher) static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo) { bio_list_init(&flush->bios); - bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios); - bio_list_init(&vdo->flusher->waiting_flush_bios); + bio_list_merge_init(&flush->bios, &vdo->flusher->waiting_flush_bios); } static void launch_flush(struct vdo_flush *flush) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index eb9832b22b14..8e6bcb0d786a 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -60,16 +60,23 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, struct dm_table *map; int srcu_idx, ret; - if (dm_suspended_md(md)) - return -EAGAIN; + if (!md->zone_revalidate_map) { + /* Regular user context */ + if (dm_suspended_md(md)) + return -EAGAIN; - map = dm_get_live_table(md, &srcu_idx); - if (!map) - return -EIO; + map = dm_get_live_table(md, &srcu_idx); + if (!map) + return -EIO; + } else { + /* Zone revalidation during __bind() */ + map = md->zone_revalidate_map; + } ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); - dm_put_live_table(md, srcu_idx); + if (!md->zone_revalidate_map) + dm_put_live_table(md, srcu_idx); return ret; } @@ -138,80 +145,47 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) } } -void dm_cleanup_zoned_dev(struct mapped_device *md) +/* + * Count conventional zones of a mapped zoned device. If the device + * only has conventional zones, do not expose it as zoned. + */ +static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, + void *data) { - if (md->disk) { - bitmap_free(md->disk->conv_zones_bitmap); - md->disk->conv_zones_bitmap = NULL; - bitmap_free(md->disk->seq_zones_wlock); - md->disk->seq_zones_wlock = NULL; - } + unsigned int *nr_conv_zones = data; - kvfree(md->zwp_offset); - md->zwp_offset = NULL; - md->nr_zones = 0; -} + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + (*nr_conv_zones)++; -static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_NOT_WP: - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - default: - /* - * Conventional, offline and read-only zones do not have a valid - * write pointer. Use 0 as for an empty zone. 
- */ - return 0; - } + return 0; } -static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, - void *data) +static int dm_check_zoned(struct mapped_device *md, struct dm_table *t) { - struct mapped_device *md = data; struct gendisk *disk = md->disk; + unsigned int nr_conv_zones = 0; + int ret; - switch (zone->type) { - case BLK_ZONE_TYPE_CONVENTIONAL: - if (!disk->conv_zones_bitmap) { - disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones, - GFP_NOIO); - if (!disk->conv_zones_bitmap) - return -ENOMEM; - } - set_bit(idx, disk->conv_zones_bitmap); - break; - case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: - if (!disk->seq_zones_wlock) { - disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones, - GFP_NOIO); - if (!disk->seq_zones_wlock) - return -ENOMEM; - } - if (!md->zwp_offset) { - md->zwp_offset = - kvcalloc(disk->nr_zones, sizeof(unsigned int), - GFP_KERNEL); - if (!md->zwp_offset) - return -ENOMEM; - } - md->zwp_offset[idx] = dm_get_zone_wp_offset(zone); - - break; - default: - DMERR("Invalid zone type 0x%x at sectors %llu", - (int)zone->type, zone->start); - return -ENODEV; + /* Count conventional zones */ + md->zone_revalidate_map = t; + ret = dm_blk_report_zones(disk, 0, UINT_MAX, + dm_check_zoned_cb, &nr_conv_zones); + md->zone_revalidate_map = NULL; + if (ret < 0) { + DMERR("Check zoned failed %d", ret); + return ret; + } + + /* + * If we only have conventional zones, expose the mapped device as + * a regular device. + */ + if (nr_conv_zones >= ret) { + disk->queue->limits.max_open_zones = 0; + disk->queue->limits.max_active_zones = 0; + disk->queue->limits.zoned = false; + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + disk->nr_zones = 0; } return 0; @@ -226,41 +200,32 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) { struct gendisk *disk = md->disk; - unsigned int noio_flag; int ret; - /* - * Check if something changed. If yes, cleanup the current resources - * and reallocate everything. - */ + /* Revalidate only if something changed. */ if (!disk->nr_zones || disk->nr_zones != md->nr_zones) - dm_cleanup_zoned_dev(md); + md->nr_zones = 0; + if (md->nr_zones) return 0; /* - * Scan all zones to initialize everything. Ensure that all vmalloc - * operations in this context are done as if GFP_NOIO was specified. + * Our table is not live yet. So the call to dm_get_live_table() + * in dm_blk_report_zones() will fail. Set a temporary pointer to + * our table for dm_blk_report_zones() to use directly. 
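/*
 * A minimal userspace sketch of the callback-driven zone walk used by the
 * dm_check_zoned() hunk above: an iterator reports each zone to a callback,
 * the caller counts the conventional ones, and if every zone is conventional
 * the device is exposed as a regular (non-zoned) device. The zone array and
 * types here are stand-ins, not the block layer's report-zones interface.
 */
#include <stdio.h>

enum ztype { ZONE_CONV, ZONE_SEQ };
struct zone { enum ztype type; };

typedef int (*report_cb)(const struct zone *z, unsigned int idx, void *data);

static int report_zones(const struct zone *zones, unsigned int nr,
			report_cb cb, void *data)
{
	unsigned int i;
	int ret;

	for (i = 0; i < nr; i++) {
		ret = cb(&zones[i], i, data);
		if (ret)
			return ret;
	}
	return (int)nr;			/* number of zones reported */
}

static int count_conv_cb(const struct zone *z, unsigned int idx, void *data)
{
	unsigned int *nr_conv = data;

	if (z->type == ZONE_CONV)
		(*nr_conv)++;
	return 0;
}

int main(void)
{
	struct zone zones[] = { { ZONE_CONV }, { ZONE_SEQ }, { ZONE_SEQ } };
	unsigned int nr_conv = 0;
	int nr = report_zones(zones, 3, count_conv_cb, &nr_conv);

	printf("zones=%d conventional=%u -> %s\n", nr, nr_conv,
	       nr_conv >= (unsigned int)nr ? "expose as regular" : "keep zoned");
	return 0;
}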
*/ - noio_flag = memalloc_noio_save(); - ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones, - dm_zone_revalidate_cb, md); - memalloc_noio_restore(noio_flag); - if (ret < 0) - goto err; - if (ret != disk->nr_zones) { - ret = -EIO; - goto err; + md->zone_revalidate_map = t; + ret = blk_revalidate_disk_zones(disk); + md->zone_revalidate_map = NULL; + + if (ret) { + DMERR("Revalidate zones failed %d", ret); + return ret; } md->nr_zones = disk->nr_zones; return 0; - -err: - DMERR("Revalidate zones failed %d", ret); - dm_cleanup_zoned_dev(md); - return ret; } static int device_not_zone_append_capable(struct dm_target *ti, @@ -289,294 +254,40 @@ static bool dm_table_supports_zone_append(struct dm_table *t) int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; + int ret; /* - * For a zoned target, the number of zones should be updated for the - * correct value to be exposed in sysfs queue/nr_zones. + * Check if zone append is natively supported, and if not, set the + * mapped device queue as needing zone append emulation. */ WARN_ON_ONCE(queue_is_mq(q)); - md->disk->nr_zones = bdev_nr_zones(md->disk->part0); - - /* Check if zone append is natively supported */ if (dm_table_supports_zone_append(t)) { clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - dm_cleanup_zoned_dev(md); - return 0; + } else { + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + blk_queue_max_zone_append_sectors(q, 0); } - /* - * Mark the mapped device as needing zone append emulation and - * initialize the emulation resources once the capacity is set. - */ - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); if (!get_capacity(md->disk)) return 0; - return dm_revalidate_zones(md, t); -} - -static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - unsigned int *wp_offset = data; - - *wp_offset = dm_get_zone_wp_offset(zone); - - return 0; -} - -static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, - unsigned int *wp_ofst) -{ - sector_t sector = zno * bdev_zone_sectors(md->disk->part0); - unsigned int noio_flag; - struct dm_table *t; - int srcu_idx, ret; - - t = dm_get_live_table(md, &srcu_idx); - if (!t) - return -EIO; - - /* - * Ensure that all memory allocations in this context are done as if - * GFP_NOIO was specified. - */ - noio_flag = memalloc_noio_save(); - ret = dm_blk_do_report_zones(md, t, sector, 1, - dm_update_zone_wp_offset_cb, wp_ofst); - memalloc_noio_restore(noio_flag); - - dm_put_live_table(md, srcu_idx); - - if (ret != 1) - return -EIO; - - return 0; -} - -struct orig_bio_details { - enum req_op op; - unsigned int nr_sectors; -}; - -/* - * First phase of BIO mapping for targets with zone append emulation: - * check all BIO that change a zone writer pointer and change zone - * append operations into regular write operations. - */ -static bool dm_zone_map_bio_begin(struct mapped_device *md, - unsigned int zno, struct bio *clone) -{ - sector_t zsectors = bdev_zone_sectors(md->disk->part0); - unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); - - /* - * If the target zone is in an error state, recover by inspecting the - * zone to get its current write pointer position. Note that since the - * target zone is already locked, a BIO issuing context should never - * see the zone write in the DM_ZONE_UPDATING_WP_OFST state. 
- */ - if (zwp_offset == DM_ZONE_INVALID_WP_OFST) { - if (dm_update_zone_wp_offset(md, zno, &zwp_offset)) - return false; - WRITE_ONCE(md->zwp_offset[zno], zwp_offset); - } - - switch (bio_op(clone)) { - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_FINISH: - return true; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - /* Writes must be aligned to the zone write pointer */ - if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset) - return false; - break; - case REQ_OP_ZONE_APPEND: - /* - * Change zone append operations into a non-mergeable regular - * writes directed at the current write pointer position of the - * target zone. - */ - clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | - (clone->bi_opf & (~REQ_OP_MASK)); - clone->bi_iter.bi_sector += zwp_offset; - break; - default: - DMWARN_LIMIT("Invalid BIO operation"); - return false; - } - - /* Cannot write to a full zone */ - if (zwp_offset >= zsectors) - return false; - - return true; -} - -/* - * Second phase of BIO mapping for targets with zone append emulation: - * update the zone write pointer offset array to account for the additional - * data written to a zone. Note that at this point, the remapped clone BIO - * may already have completed, so we do not touch it. - */ -static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno, - struct orig_bio_details *orig_bio_details, - unsigned int nr_sectors) -{ - unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); - - /* The clone BIO may already have been completed and failed */ - if (zwp_offset == DM_ZONE_INVALID_WP_OFST) - return BLK_STS_IOERR; - - /* Update the zone wp offset */ - switch (orig_bio_details->op) { - case REQ_OP_ZONE_RESET: - WRITE_ONCE(md->zwp_offset[zno], 0); - return BLK_STS_OK; - case REQ_OP_ZONE_FINISH: - WRITE_ONCE(md->zwp_offset[zno], - bdev_zone_sectors(md->disk->part0)); - return BLK_STS_OK; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); - return BLK_STS_OK; - case REQ_OP_ZONE_APPEND: - /* - * Check that the target did not truncate the write operation - * emulating a zone append. - */ - if (nr_sectors != orig_bio_details->nr_sectors) { - DMWARN_LIMIT("Truncated write for zone append"); - return BLK_STS_IOERR; - } - WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); - return BLK_STS_OK; - default: - DMWARN_LIMIT("Invalid BIO operation"); - return BLK_STS_IOERR; - } -} - -static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno, - struct bio *clone) -{ - if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))) - return; - - wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); - bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED); -} - -static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno, - struct bio *clone) -{ - if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) - return; - - WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock)); - clear_bit_unlock(zno, disk->seq_zones_wlock); - smp_mb__after_atomic(); - wake_up_bit(disk->seq_zones_wlock, zno); - - bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); -} - -static bool dm_need_zone_wp_tracking(struct bio *bio) -{ /* - * Special processing is not needed for operations that do not need the - * zone write lock, that is, all operations that target conventional - * zones and all operations that do not modify directly a sequential - * zone write pointer. + * Check that the mapped device will indeed be zoned, that is, that it + * has sequential write required zones. 
*/ - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) - return false; - switch (bio_op(bio)) { - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_FINISH: - case REQ_OP_ZONE_APPEND: - return bio_zone_is_seq(bio); - default: - return false; - } -} - -/* - * Special IO mapping for targets needing zone append emulation. - */ -int dm_zone_map_bio(struct dm_target_io *tio) -{ - struct dm_io *io = tio->io; - struct dm_target *ti = tio->ti; - struct mapped_device *md = io->md; - struct bio *clone = &tio->clone; - struct orig_bio_details orig_bio_details; - unsigned int zno; - blk_status_t sts; - int r; - - /* - * IOs that do not change a zone write pointer do not need - * any additional special processing. - */ - if (!dm_need_zone_wp_tracking(clone)) - return ti->type->map(ti, clone); - - /* Lock the target zone */ - zno = bio_zone_no(clone); - dm_zone_lock(md->disk, zno, clone); - - orig_bio_details.nr_sectors = bio_sectors(clone); - orig_bio_details.op = bio_op(clone); + ret = dm_check_zoned(md, t); + if (ret) + return ret; + if (!blk_queue_is_zoned(q)) + return 0; - /* - * Check that the bio and the target zone write pointer offset are - * both valid, and if the bio is a zone append, remap it to a write. - */ - if (!dm_zone_map_bio_begin(md, zno, clone)) { - dm_zone_unlock(md->disk, zno, clone); - return DM_MAPIO_KILL; + if (!md->disk->nr_zones) { + DMINFO("%s using %s zone append", + md->disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); } - /* Let the target do its work */ - r = ti->type->map(ti, clone); - switch (r) { - case DM_MAPIO_SUBMITTED: - /* - * The target submitted the clone BIO. The target zone will - * be unlocked on completion of the clone. - */ - sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, - *tio->len_ptr); - break; - case DM_MAPIO_REMAPPED: - /* - * The target only remapped the clone BIO. In case of error, - * unlock the target zone here as the clone will not be - * submitted. - */ - sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, - *tio->len_ptr); - if (sts != BLK_STS_OK) - dm_zone_unlock(md->disk, zno, clone); - break; - case DM_MAPIO_REQUEUE: - case DM_MAPIO_KILL: - default: - dm_zone_unlock(md->disk, zno, clone); - sts = BLK_STS_IOERR; - break; - } - - if (sts != BLK_STS_OK) - return DM_MAPIO_KILL; - - return r; + return dm_revalidate_zones(md, t); } /* @@ -587,61 +298,17 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) struct mapped_device *md = io->md; struct gendisk *disk = md->disk; struct bio *orig_bio = io->orig_bio; - unsigned int zwp_offset; - unsigned int zno; /* - * For targets that do not emulate zone append, we only need to - * handle native zone-append bios. + * Get the offset within the zone of the written sector + * and add that to the original bio sector position. */ - if (!dm_emulate_zone_append(md)) { - /* - * Get the offset within the zone of the written sector - * and add that to the original bio sector position. - */ - if (clone->bi_status == BLK_STS_OK && - bio_op(clone) == REQ_OP_ZONE_APPEND) { - sector_t mask = - (sector_t)bdev_zone_sectors(disk->part0) - 1; - - orig_bio->bi_iter.bi_sector += - clone->bi_iter.bi_sector & mask; - } - - return; - } + if (clone->bi_status == BLK_STS_OK && + bio_op(clone) == REQ_OP_ZONE_APPEND) { + sector_t mask = bdev_zone_sectors(disk->part0) - 1; - /* - * For targets that do emulate zone append, if the clone BIO does not - * own the target zone write lock, we have nothing to do. 
- */ - if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) - return; - - zno = bio_zone_no(orig_bio); - - if (clone->bi_status != BLK_STS_OK) { - /* - * BIOs that modify a zone write pointer may leave the zone - * in an unknown state in case of failure (e.g. the write - * pointer was only partially advanced). In this case, set - * the target zone write pointer as invalid unless it is - * already being updated. - */ - WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST); - } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { - /* - * Get the written sector for zone append operation that were - * emulated using regular write operations. - */ - zwp_offset = READ_ONCE(md->zwp_offset[zno]); - if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio))) - WRITE_ONCE(md->zwp_offset[zno], - DM_ZONE_INVALID_WP_OFST); - else - orig_bio->bi_iter.bi_sector += - zwp_offset - bio_sectors(orig_bio); + orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; } - dm_zone_unlock(disk, zno, clone); + return; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7d0746b37c8e..597dd7a25823 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1428,25 +1428,12 @@ static void __map_bio(struct bio *clone) down(&md->swap_bios_semaphore); } - if (static_branch_unlikely(&zoned_enabled)) { - /* - * Check if the IO needs a special mapping due to zone append - * emulation on zoned target. In this case, dm_zone_map_bio() - * calls the target map operation. - */ - if (unlikely(dm_emulate_zone_append(md))) - r = dm_zone_map_bio(tio); - else - goto do_map; - } else { -do_map: - if (likely(ti->type->map == linear_map)) - r = linear_map(ti, clone); - else if (ti->type->map == stripe_map) - r = stripe_map(ti, clone); - else - r = ti->type->map(ti, clone); - } + if (likely(ti->type->map == linear_map)) + r = linear_map(ti, clone); + else if (ti->type->map == stripe_map) + r = stripe_map(ti, clone); + else + r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: @@ -1774,6 +1761,33 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io, ci->sector_count = 0; } +#ifdef CONFIG_BLK_DEV_ZONED +static inline bool dm_zone_bio_needs_split(struct mapped_device *md, + struct bio *bio) +{ + /* + * For mapped device that need zone append emulation, we must + * split any large BIO that straddles zone boundaries. + */ + return dm_emulate_zone_append(md) && bio_straddles_zones(bio) && + !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); +} +static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) +{ + return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); +} +#else +static inline bool dm_zone_bio_needs_split(struct mapped_device *md, + struct bio *bio) +{ + return false; +} +static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) +{ + return false; +} +#endif + /* * Entry point to split a bio into clones and submit them to the targets. */ @@ -1783,19 +1797,32 @@ static void dm_split_and_process_bio(struct mapped_device *md, struct clone_info ci; struct dm_io *io; blk_status_t error = BLK_STS_OK; - bool is_abnormal; + bool is_abnormal, need_split; + + need_split = is_abnormal = is_abnormal_io(bio); + if (static_branch_unlikely(&zoned_enabled)) + need_split = is_abnormal || dm_zone_bio_needs_split(md, bio); - is_abnormal = is_abnormal_io(bio); - if (unlikely(is_abnormal)) { + if (unlikely(need_split)) { /* * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) * otherwise associated queue_limits won't be imposed. 
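/*
 * A minimal userspace sketch of the "does this I/O straddle a zone boundary"
 * test that motivates the extra split added in dm_split_and_process_bio()
 * above. Zone sizes are assumed to be a power of two, so the zone number is
 * just a shift of the start sector; the names are illustrative, not the
 * block layer's helpers.
 */
#include <stdio.h>
#include <stdbool.h>

static bool straddles_zones(unsigned long long sector, unsigned int nr_sectors,
			    unsigned int zone_sectors_shift)
{
	unsigned long long first = sector >> zone_sectors_shift;
	unsigned long long last = (sector + nr_sectors - 1) >> zone_sectors_shift;

	return nr_sectors && first != last;
}

int main(void)
{
	/* 256 MiB zones of 512 B sectors -> 2^19 sectors per zone */
	unsigned int shift = 19;

	printf("%d\n", straddles_zones((1ULL << 19) - 8, 16, shift)); /* 1 */
	printf("%d\n", straddles_zones(0, 16, shift));                /* 0 */
	return 0;
}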
+ * Also split the BIO for mapped devices needing zone append + * emulation to ensure that the BIO does not cross zone + * boundaries. */ bio = bio_split_to_limits(bio); if (!bio) return; } + /* + * Use the block layer zone write plugging for mapped devices that + * need zone append emulation (e.g. dm-crypt). + */ + if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio)) + return; + /* Only support nowait for normal IO */ if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { io = alloc_io(md, bio, GFP_NOWAIT); @@ -2016,7 +2043,6 @@ static void cleanup_mapped_device(struct mapped_device *md) md->dax_dev = NULL; } - dm_cleanup_zoned_dev(md); if (md->disk) { spin_lock(&_minor_lock); md->disk->private_data = NULL; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 7f1acbf6bd9e..e0c57f19839b 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -104,13 +104,11 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED -void dm_cleanup_zoned_dev(struct mapped_device *md); int dm_blk_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_map_bio(struct dm_target_io *io); #else -static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {} #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) { diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 059afc24c08b..0a2d37eb38ef 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1424,7 +1424,7 @@ __acquires(bitmap->lock) sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; - sector_t csize; + sector_t csize = ((sector_t)1) << bitmap->chunkshift; int err; if (page >= bitmap->pages) { @@ -1433,6 +1433,7 @@ __acquires(bitmap->lock) * End-of-device while looking for a whole page or * user set a huge number to sysfs bitmap_set_bits. 
*/ + *blocks = csize - (offset & (csize - 1)); return NULL; } err = md_bitmap_checkpage(bitmap, page, create, 0); @@ -1441,8 +1442,7 @@ __acquires(bitmap->lock) bitmap->bp[page].map == NULL) csize = ((sector_t)1) << (bitmap->chunkshift + PAGE_COUNTER_SHIFT); - else - csize = ((sector_t)1) << bitmap->chunkshift; + *blocks = csize - (offset & (csize - 1)); if (err < 0) diff --git a/drivers/md/md.c b/drivers/md/md.c index e575e74aabf5..aff9118ff697 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8087,7 +8087,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread) if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); set_bit(THREAD_WAKEUP, &t->flags); - wake_up(&t->wqueue); + if (wq_has_sleeper(&t->wqueue)) + wake_up(&t->wqueue); } rcu_read_unlock(); } @@ -8582,6 +8583,10 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; + + if (!init && !blk_queue_io_stat(disk->queue)) + continue; + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats diff --git a/drivers/md/md.h b/drivers/md/md.h index 097d9dbd69b8..ca085ecad504 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -621,7 +621,8 @@ extern void mddev_unlock(struct mddev *mddev); static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { - atomic_add(nr_sectors, &bdev->bd_disk->sync_io); + if (blk_queue_io_stat(bdev->bd_disk->queue)) + atomic_add(nr_sectors, &bdev->bd_disk->sync_io); } static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d874abfc1836..2bd1ce9b3922 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,7 +36,6 @@ */ #include -#include #include #include #include @@ -6734,6 +6733,9 @@ static void raid5d(struct md_thread *thread) int batch_size, released; unsigned int offset; + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + break; + released = release_stripe_list(conf, conf->temp_inactive_list); if (released) clear_bit(R5_DID_ALLOC, &conf->cache_state); @@ -6770,18 +6772,7 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); - - /* - * Waiting on MD_SB_CHANGE_PENDING below may deadlock - * seeing md_check_recovery() is needed to clear - * the flag when using mdmon. - */ - continue; } - - wait_event_lock_irq(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), - conf->device_lock); } pr_debug("%d stripes handled\n", handled); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 095f59e7aa93..bf7615cb36ee 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2132,7 +2132,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { - ret = blk_revalidate_disk_zones(ns->disk, NULL); + ret = blk_revalidate_disk_zones(ns->disk); if (ret && !nvme_first_scan(ns->disk)) goto out; } diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 3148d9f1bde6..0021d06041c1 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -52,14 +52,10 @@ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1)) return false; /* - * ZNS does not define a conventional zone type. 
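/*
 * A minimal sketch of the chunk arithmetic in the md-bitmap hunk above: with
 * a power-of-two chunk size (1 << chunkshift), the number of blocks from
 * @offset to the end of its chunk is csize - (offset & (csize - 1)), and the
 * fix reports that count even when no counter page is available. Plain
 * userspace arithmetic, not the md-bitmap API.
 */
#include <stdio.h>

static unsigned long long blocks_left_in_chunk(unsigned long long offset,
					       unsigned int chunkshift)
{
	unsigned long long csize = 1ULL << chunkshift;

	return csize - (offset & (csize - 1));
}

int main(void)
{
	/* 64-sector chunks: offset 70 is 6 sectors into its chunk */
	printf("%llu\n", blocks_left_in_chunk(70, 6));   /* 58 */
	printf("%llu\n", blocks_left_in_chunk(128, 6));  /* 64 */
	return 0;
}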
If the underlying - * device has a bitmap set indicating the existence of conventional - * zones, reject the device. Otherwise, use report zones to detect if - * the device has conventional zones. + * ZNS does not define a conventional zone type. Use report zones + * to detect if the device has conventional zones and reject it if + * it does. */ - if (ns->bdev->bd_disk->conv_zones_bitmap) - return false; - ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev), validate_conv_zones_cb, NULL); if (ret < 0) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5b3230ef51fe..967b6d62bb37 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1869,7 +1869,6 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, case BLK_STS_OK: break; case BLK_STS_RESOURCE: - case BLK_STS_ZONE_RESOURCE: if (scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; break; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 65cdc8b77e35..64c5129044b3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1260,12 +1260,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) } } - if (req_op(rq) == REQ_OP_ZONE_APPEND) { - ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks); - if (ret) - goto fail; - } - fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); @@ -1348,7 +1342,6 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: - case REQ_OP_ZONE_APPEND: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, @@ -3981,7 +3974,6 @@ static void scsi_disk_release(struct device *dev) struct scsi_disk *sdkp = to_scsi_disk(dev); ida_free(&sd_index_ida, sdkp->index); - sd_zbc_free_zone_info(sdkp); put_device(&sdkp->device->sdev_gendev); free_opal_dev(sdkp->opal_dev); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 5c4285a582b2..49dd600bfa48 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -104,12 +104,6 @@ struct scsi_disk { * between zone starting LBAs is constant. 
*/ u32 zone_starting_lba_gran; - u32 *zones_wp_offset; - spinlock_t zones_wp_offset_lock; - u32 *rev_wp_offset; - struct mutex rev_mutex; - struct work_struct zone_wp_offset_work; - char *zone_wp_update_buf; #endif atomic_t openers; sector_t capacity; /* size in logical blocks */ @@ -245,7 +239,6 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED -void sd_zbc_free_zone_info(struct scsi_disk *sdkp); int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, @@ -255,13 +248,8 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, - unsigned int nr_blocks); - #else /* CONFIG_BLK_DEV_ZONED */ -static inline void sd_zbc_free_zone_info(struct scsi_disk *sdkp) {} - static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) { return 0; @@ -285,13 +273,6 @@ static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, return good_bytes; } -static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, - sector_t *lba, - unsigned int nr_blocks) -{ - return BLK_STS_TARGET; -} - #define sd_zbc_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 26af5ab7d7c1..806036e48abe 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -23,36 +23,6 @@ #define CREATE_TRACE_POINTS #include "sd_trace.h" -/** - * sd_zbc_get_zone_wp_offset - Get zone write pointer offset. - * @zone: Zone for which to return the write pointer offset. - * - * Return: offset of the write pointer from the start of the zone. - */ -static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) -{ - if (zone->type == ZBC_ZONE_TYPE_CONV) - return 0; - - switch (zone->cond) { - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - default: - /* - * Offline and read-only zones do not have a valid - * write pointer. Use 0 as for an empty zone. - */ - return 0; - } -} - /* Whether or not a SCSI zone descriptor describes a gap zone. */ static bool sd_zbc_is_gap_zone(const u8 buf[64]) { @@ -121,9 +91,6 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], if (ret) return ret; - if (sdkp->rev_wp_offset) - sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone); - return 0; } @@ -347,123 +314,6 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) return BLK_STS_OK; } -#define SD_ZBC_INVALID_WP_OFST (~0u) -#define SD_ZBC_UPDATING_WP_OFST (SD_ZBC_INVALID_WP_OFST - 1) - -static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - struct scsi_disk *sdkp = data; - - lockdep_assert_held(&sdkp->zones_wp_offset_lock); - - sdkp->zones_wp_offset[idx] = sd_zbc_get_zone_wp_offset(zone); - - return 0; -} - -/* - * An attempt to append a zone triggered an invalid write pointer error. - * Reread the write pointer of the zone(s) in which the append failed. 
- */ -static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) -{ - struct scsi_disk *sdkp; - unsigned long flags; - sector_t zno; - int ret; - - sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work); - - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - for (zno = 0; zno < sdkp->zone_info.nr_zones; zno++) { - if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) - continue; - - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf, - SD_BUF_SIZE, - zno * sdkp->zone_info.zone_blocks, true); - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - if (!ret) - sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64, - zno, sd_zbc_update_wp_offset_cb, - sdkp); - } - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - - scsi_device_put(sdkp->device); -} - -/** - * sd_zbc_prepare_zone_append() - Prepare an emulated ZONE_APPEND command. - * @cmd: the command to setup - * @lba: the LBA to patch - * @nr_blocks: the number of LBAs to be written - * - * Called from sd_setup_read_write_cmnd() for REQ_OP_ZONE_APPEND. - * @sd_zbc_prepare_zone_append() handles the necessary zone wrote locking and - * patching of the lba for an emulated ZONE_APPEND command. - * - * In case the cached write pointer offset is %SD_ZBC_INVALID_WP_OFST it will - * schedule a REPORT ZONES command and return BLK_STS_IOERR. - */ -blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, - unsigned int nr_blocks) -{ - struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->q->disk); - unsigned int wp_offset, zno = blk_rq_zone_no(rq); - unsigned long flags; - blk_status_t ret; - - ret = sd_zbc_cmnd_checks(cmd); - if (ret != BLK_STS_OK) - return ret; - - if (!blk_rq_zone_is_seq(rq)) - return BLK_STS_IOERR; - - /* Unlock of the write lock will happen in sd_zbc_complete() */ - if (!blk_req_zone_write_trylock(rq)) - return BLK_STS_ZONE_RESOURCE; - - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - wp_offset = sdkp->zones_wp_offset[zno]; - switch (wp_offset) { - case SD_ZBC_INVALID_WP_OFST: - /* - * We are about to schedule work to update a zone write pointer - * offset, which will cause the zone append command to be - * requeued. So make sure that the scsi device does not go away - * while the work is being processed. - */ - if (scsi_device_get(sdkp->device)) { - ret = BLK_STS_IOERR; - break; - } - sdkp->zones_wp_offset[zno] = SD_ZBC_UPDATING_WP_OFST; - schedule_work(&sdkp->zone_wp_offset_work); - fallthrough; - case SD_ZBC_UPDATING_WP_OFST: - ret = BLK_STS_DEV_RESOURCE; - break; - default: - wp_offset = sectors_to_logical(sdkp->device, wp_offset); - if (wp_offset + nr_blocks > sdkp->zone_info.zone_blocks) { - ret = BLK_STS_IOERR; - break; - } - - trace_scsi_prepare_zone_append(cmd, *lba, wp_offset); - *lba += wp_offset; - } - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - if (ret) - blk_req_zone_write_unlock(rq); - return ret; -} - /** * sd_zbc_setup_zone_mgmt_cmnd - Prepare a zone ZBC_OUT command. The operations * can be RESET WRITE POINTER, OPEN, CLOSE or FINISH. 
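/*
 * A minimal userspace sketch of the emulation the removed sd_zbc code above
 * performed, and which this series moves into the block layer: a zone append
 * is turned into a plain write at the zone start plus a cached write-pointer
 * offset, and the offset advances by the number of blocks written. All names
 * here are illustrative; this is not the SCSI or block layer API.
 */
#include <stdio.h>

struct zone_state {
	unsigned long long start;	/* first LBA of the zone */
	unsigned long long capacity;	/* writable blocks in the zone */
	unsigned long long wp_ofst;	/* cached write pointer offset */
};

/* Patch @lba for an emulated zone append; return 0 on success, -1 if full. */
static int emulate_zone_append(struct zone_state *z, unsigned long long *lba,
			       unsigned int nr_blocks)
{
	if (z->wp_ofst + nr_blocks > z->capacity)
		return -1;		/* write would exceed the zone */

	*lba = z->start + z->wp_ofst;	/* where the append actually lands */
	z->wp_ofst += nr_blocks;	/* advance on successful completion */
	return 0;
}

int main(void)
{
	struct zone_state z = { .start = 1024, .capacity = 256, .wp_ofst = 16 };
	unsigned long long lba = 0;

	if (!emulate_zone_append(&z, &lba, 8))
		printf("append written at LBA %llu, new wp offset %llu\n",
		       lba, z.wp_ofst);
	return 0;
}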
@@ -504,96 +354,6 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, return BLK_STS_OK; } -static bool sd_zbc_need_zone_wp_update(struct request *rq) -{ - switch (req_op(rq)) { - case REQ_OP_ZONE_APPEND: - case REQ_OP_ZONE_FINISH: - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_RESET_ALL: - return true; - case REQ_OP_WRITE: - case REQ_OP_WRITE_ZEROES: - return blk_rq_zone_is_seq(rq); - default: - return false; - } -} - -/** - * sd_zbc_zone_wp_update - Update cached zone write pointer upon cmd completion - * @cmd: Completed command - * @good_bytes: Command reply bytes - * - * Called from sd_zbc_complete() to handle the update of the cached zone write - * pointer value in case an update is needed. - */ -static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, - unsigned int good_bytes) -{ - int result = cmd->result; - struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->q->disk); - unsigned int zno = blk_rq_zone_no(rq); - enum req_op op = req_op(rq); - unsigned long flags; - - /* - * If we got an error for a command that needs updating the write - * pointer offset cache, we must mark the zone wp offset entry as - * invalid to force an update from disk the next time a zone append - * command is issued. - */ - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - - if (result && op != REQ_OP_ZONE_RESET_ALL) { - if (op == REQ_OP_ZONE_APPEND) { - /* Force complete completion (no retry) */ - good_bytes = 0; - scsi_set_resid(cmd, blk_rq_bytes(rq)); - } - - /* - * Force an update of the zone write pointer offset on - * the next zone append access. - */ - if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) - sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST; - goto unlock_wp_offset; - } - - switch (op) { - case REQ_OP_ZONE_APPEND: - trace_scsi_zone_wp_update(cmd, rq->__sector, - sdkp->zones_wp_offset[zno], good_bytes); - rq->__sector += sdkp->zones_wp_offset[zno]; - fallthrough; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp)) - sdkp->zones_wp_offset[zno] += - good_bytes >> SECTOR_SHIFT; - break; - case REQ_OP_ZONE_RESET: - sdkp->zones_wp_offset[zno] = 0; - break; - case REQ_OP_ZONE_FINISH: - sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp); - break; - case REQ_OP_ZONE_RESET_ALL: - memset(sdkp->zones_wp_offset, 0, - sdkp->zone_info.nr_zones * sizeof(unsigned int)); - break; - default: - break; - } - -unlock_wp_offset: - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - - return good_bytes; -} - /** * sd_zbc_complete - ZBC command post processing. * @cmd: Completed command @@ -619,11 +379,7 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, * so be quiet about the error. 
*/ rq->rq_flags |= RQF_QUIET; - } else if (sd_zbc_need_zone_wp_update(rq)) - good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes); - - if (req_op(rq) == REQ_OP_ZONE_APPEND) - blk_req_zone_write_unlock(rq); + } return good_bytes; } @@ -780,46 +536,6 @@ static void sd_zbc_print_zones(struct scsi_disk *sdkp) sdkp->zone_info.zone_blocks); } -static int sd_zbc_init_disk(struct scsi_disk *sdkp) -{ - sdkp->zones_wp_offset = NULL; - spin_lock_init(&sdkp->zones_wp_offset_lock); - sdkp->rev_wp_offset = NULL; - mutex_init(&sdkp->rev_mutex); - INIT_WORK(&sdkp->zone_wp_offset_work, sd_zbc_update_wp_offset_workfn); - sdkp->zone_wp_update_buf = kzalloc(SD_BUF_SIZE, GFP_KERNEL); - if (!sdkp->zone_wp_update_buf) - return -ENOMEM; - - return 0; -} - -void sd_zbc_free_zone_info(struct scsi_disk *sdkp) -{ - if (!sdkp->zone_wp_update_buf) - return; - - /* Serialize against revalidate zones */ - mutex_lock(&sdkp->rev_mutex); - - kvfree(sdkp->zones_wp_offset); - sdkp->zones_wp_offset = NULL; - kfree(sdkp->zone_wp_update_buf); - sdkp->zone_wp_update_buf = NULL; - - sdkp->early_zone_info = (struct zoned_disk_info){ }; - sdkp->zone_info = (struct zoned_disk_info){ }; - - mutex_unlock(&sdkp->rev_mutex); -} - -static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) -{ - struct scsi_disk *sdkp = scsi_disk(disk); - - swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset); -} - /* * Call blk_revalidate_disk_zones() if any of the zoned disk properties have * changed that make it necessary to call that function. Called by @@ -831,18 +547,8 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) struct request_queue *q = disk->queue; u32 zone_blocks = sdkp->early_zone_info.zone_blocks; unsigned int nr_zones = sdkp->early_zone_info.nr_zones; - int ret = 0; unsigned int flags; - - /* - * For all zoned disks, initialize zone append emulation data if not - * already done. - */ - if (sd_is_zoned(sdkp) && !sdkp->zone_wp_update_buf) { - ret = sd_zbc_init_disk(sdkp); - if (ret) - return ret; - } + int ret; /* * There is nothing to do for regular disks, including host-aware disks @@ -851,50 +557,32 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) if (!blk_queue_is_zoned(q)) return 0; - /* - * Make sure revalidate zones are serialized to ensure exclusive - * updates of the scsi disk data. 
- */ - mutex_lock(&sdkp->rev_mutex); - if (sdkp->zone_info.zone_blocks == zone_blocks && sdkp->zone_info.nr_zones == nr_zones && disk->nr_zones == nr_zones) - goto unlock; + return 0; - flags = memalloc_noio_save(); sdkp->zone_info.zone_blocks = zone_blocks; sdkp->zone_info.nr_zones = nr_zones; - sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL); - if (!sdkp->rev_wp_offset) { - ret = -ENOMEM; - memalloc_noio_restore(flags); - goto unlock; - } blk_queue_chunk_sectors(q, logical_to_sectors(sdkp->device, zone_blocks)); - blk_queue_max_zone_append_sectors(q, - q->limits.max_segments << PAGE_SECTORS_SHIFT); - ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb); + /* Enable block layer zone append emulation */ + blk_queue_max_zone_append_sectors(q, 0); + flags = memalloc_noio_save(); + ret = blk_revalidate_disk_zones(disk); memalloc_noio_restore(flags); - kvfree(sdkp->rev_wp_offset); - sdkp->rev_wp_offset = NULL; - if (ret) { sdkp->zone_info = (struct zoned_disk_info){ }; sdkp->capacity = 0; - goto unlock; + return ret; } sd_zbc_print_zones(sdkp); -unlock: - mutex_unlock(&sdkp->rev_mutex); - - return ret; + return 0; } /** @@ -917,10 +605,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) if (!sd_is_zoned(sdkp)) { /* * Device managed or normal SCSI disk, no special handling - * required. Nevertheless, free the disk zone information in - * case the device type changed. + * required. */ - sd_zbc_free_zone_info(sdkp); return 0; } @@ -941,7 +627,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); if (sdkp->zones_max_open == U32_MAX) disk_set_max_open_zones(disk, 0); else diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6f4a9cfeea44..831fac45e70f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -331,12 +331,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) static void merge_rbio(struct btrfs_raid_bio *dest, struct btrfs_raid_bio *victim) { - bio_list_merge(&dest->bio_list, &victim->bio_list); + bio_list_merge_init(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; /* Also inherit the bitmaps from @victim. 
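The reworked sd_zbc_revalidate_zones() above is also the template for any driver that wants the block layer, rather than the driver itself, to emulate zone append after this series: advertise a zero max_zone_append_sectors limit and call the new single-argument blk_revalidate_disk_zones(). A sketch under that assumption; my_zoned_revalidate() and its arguments are illustrative, the calls mirror the hunk above.

#include <linux/blkdev.h>
#include <linux/sched/mm.h>

static int my_zoned_revalidate(struct gendisk *disk, unsigned int zone_sectors)
{
	struct request_queue *q = disk->queue;
	unsigned int noio_flags;
	int ret;

	blk_queue_chunk_sectors(q, zone_sectors);

	/*
	 * A zero limit means "no native zone append": the block layer
	 * emulates REQ_OP_ZONE_APPEND using regular writes.
	 */
	blk_queue_max_zone_append_sectors(q, 0);

	/* Zone revalidation allocates memory; avoid I/O recursion. */
	noio_flags = memalloc_noio_save();
	ret = blk_revalidate_disk_zones(disk);	/* new single-argument form */
	memalloc_noio_restore(noio_flags);

	return ret;
}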
*/ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); - bio_list_init(&victim->bio_list); } /* diff --git a/include/linux/bio.h b/include/linux/bio.h index 875d792bffff..d5379548d684 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -615,6 +615,13 @@ static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) bl->tail = bl2->tail; } +static inline void bio_list_merge_init(struct bio_list *bl, + struct bio_list *bl2) +{ + bio_list_merge(bl, bl2); + bio_list_init(bl2); +} + static inline void bio_list_merge_head(struct bio_list *bl, struct bio_list *bl2) { @@ -824,5 +831,9 @@ static inline void bio_clear_polled(struct bio *bio) struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new); + +struct bio *blk_alloc_discard_bio(struct block_device *bdev, + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask); #endif /* __LINUX_BIO_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d3d8fd8e229b..89ba6b16fe8b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -54,8 +54,8 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -/* The per-zone write lock is held for this request */ -#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* The request completion needs to be signaled to zone write pluging. */ +#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20)) /* ->timeout has been called, don't expire again */ #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) #define RQF_RESV ((__force req_flags_t)(1 << 23)) @@ -1150,85 +1150,4 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, } void blk_dump_rq_flags(struct request *, char *); -#ifdef CONFIG_BLK_DEV_ZONED -static inline unsigned int blk_rq_zone_no(struct request *rq) -{ - return disk_zone_no(rq->q->disk, blk_rq_pos(rq)); -} - -static inline unsigned int blk_rq_zone_is_seq(struct request *rq) -{ - return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq)); -} - -/** - * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization. - * @rq: Request to examine. - * - * Note: REQ_OP_ZONE_APPEND requests do not require serialization. 
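The new bio_list_merge_init() helper added to bio.h above simply pairs bio_list_merge() with re-initializing the source list, which is what call sites such as the raid56 hunk open coded. A small sketch of the usual pattern it is meant for, splicing a shared pending list into a local one under a lock; flush_pending_bios() and the lock are illustrative.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/spinlock.h>

static void flush_pending_bios(struct bio_list *pending, spinlock_t *lock)
{
	struct bio_list local;
	struct bio *bio;

	bio_list_init(&local);

	spin_lock_irq(lock);
	/* Was: bio_list_merge(&local, pending); bio_list_init(pending); */
	bio_list_merge_init(&local, pending);
	spin_unlock_irq(lock);

	/* Resubmit the spliced bios outside the lock. */
	while ((bio = bio_list_pop(&local)))
		submit_bio_noacct(bio);
}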
- */ -static inline bool blk_rq_is_seq_zoned_write(struct request *rq) -{ - return op_needs_zoned_write_locking(req_op(rq)) && - blk_rq_zone_is_seq(rq); -} - -bool blk_req_needs_zone_write_lock(struct request *rq); -bool blk_req_zone_write_trylock(struct request *rq); -void __blk_req_zone_write_lock(struct request *rq); -void __blk_req_zone_write_unlock(struct request *rq); - -static inline void blk_req_zone_write_lock(struct request *rq) -{ - if (blk_req_needs_zone_write_lock(rq)) - __blk_req_zone_write_lock(rq); -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ - if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) - __blk_req_zone_write_unlock(rq); -} - -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return rq->q->disk->seq_zones_wlock && - test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock); -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - if (!blk_req_needs_zone_write_lock(rq)) - return true; - return !blk_req_zone_is_write_locked(rq); -} -#else /* CONFIG_BLK_DEV_ZONED */ -static inline bool blk_rq_is_seq_zoned_write(struct request *rq) -{ - return false; -} - -static inline bool blk_req_needs_zone_write_lock(struct request *rq) -{ - return false; -} - -static inline void blk_req_zone_write_lock(struct request *rq) -{ -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ -} -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return false; -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - return true; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - #endif /* BLK_MQ_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c3e098b21c16..25dbf1097085 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -130,18 +130,6 @@ typedef u16 blk_short_t; */ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) -/* - * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone - * related resources are unavailable, but the driver can guarantee the queue - * will be rerun in the future once the resources become available again. - * - * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references - * a zone specific resource and IO to a different zone on the same device could - * still be served. Examples of that are zones that are write-locked, but a read - * to the same zone could be served. - */ -#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) - /* * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources @@ -149,7 +137,7 @@ typedef u16 blk_short_t; * after the number of open zones decreases below the device's limits, which is * reported in the request_queue's max_open_zones. */ -#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15) +#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14) /* * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion @@ -158,20 +146,20 @@ typedef u16 blk_short_t; * after the number of active zones decreases below the device's limits, which * is reported in the request_queue's max_active_zones. */ -#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15) /* * BLK_STS_OFFLINE is returned from the driver when the target device is offline * or is being taken offline. 
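With the per-zone write lock helpers and BLK_STS_ZONE_RESOURCE removed above, a blk-mq driver's ->queue_rq() has no zone ordering work left to do: write ordering is now enforced by zone write plugging before dispatch. A before/after sketch from the driver's point of view; my_queue_rq() is illustrative and the hardware handoff is elided.

#include <linux/blk-mq.h>

static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
				const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	/*
	 * Pre-series dispatch hook for zoned devices:
	 *
	 *	if (!blk_req_can_dispatch_to_zone(rq))
	 *		return BLK_STS_ZONE_RESOURCE;
	 *	blk_req_zone_write_lock(rq);
	 *
	 * Nothing zone specific is needed here anymore.
	 */
	blk_mq_start_request(rq);
	/* ... hand the request to the hardware ... */
	return BLK_STS_OK;
}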
This could help differentiate the case where a * device is intentionally being shut down from a real I/O error. */ -#define BLK_STS_OFFLINE ((__force blk_status_t)17) +#define BLK_STS_OFFLINE ((__force blk_status_t)16) /* * BLK_STS_DURATION_LIMIT is returned from the driver when the target device * aborted the command because it exceeded one of its Command Duration Limits. */ -#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)18) +#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) /** * blk_path_error - returns true if error may be path related @@ -228,7 +216,12 @@ struct bio { struct bvec_iter bi_iter; - blk_qc_t bi_cookie; + union { + /* for polled bios: */ + blk_qc_t bi_cookie; + /* for plugged zoned writes only: */ + unsigned int __bi_nr_segments; + }; bio_end_io_t *bi_end_io; void *bi_private; #ifdef CONFIG_BLK_CGROUP @@ -298,7 +291,8 @@ enum { BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, - BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ + BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ + BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */ BIO_FLAG_LAST }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 69e7da33ca49..69c4f113db42 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -179,22 +179,21 @@ struct gendisk { #ifdef CONFIG_BLK_DEV_ZONED /* - * Zoned block device information for request dispatch control. - * nr_zones is the total number of zones of the device. This is always - * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones - * bits which indicates if a zone is conventional (bit set) or - * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones - * bits which indicates if a zone is write locked, that is, if a write - * request targeting the zone was dispatched. - * - * Reads of this information must be protected with blk_queue_enter() / - * blk_queue_exit(). Modifying this information is only allowed while - * no requests are being processed. See also blk_mq_freeze_queue() and - * blk_mq_unfreeze_queue(). + * Zoned block device information. Reads of this information must be + * protected with blk_queue_enter() / blk_queue_exit(). Modifying this + * information is only allowed while no requests are being processed. + * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). */ unsigned int nr_zones; + unsigned int zone_capacity; unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; + unsigned int zone_wplugs_hash_bits; + spinlock_t zone_wplugs_lock; + struct mempool_s *zone_wplugs_pool; + struct hlist_head *zone_wplugs_hash; + struct list_head zone_wplugs_err_list; + struct work_struct zone_wplugs_work; + struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -233,6 +232,19 @@ static inline unsigned int disk_openers(struct gendisk *disk) return atomic_read(&disk->part0->bd_openers); } +/** + * disk_has_partscan - return %true if partition scanning is enabled on a disk + * @disk: disk to check + * + * Returns %true if partitions scanning is enabled for @disk, or %false if + * partition scanning is disabled either permanently or temporarily. 
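The new gendisk fields above describe the zone write plug machinery: a hash table keyed by zone number whose entries come from zone_wplugs_pool, protected by zone_wplugs_lock, with an error list and work item for recovery. The actual plug structure and lookup code are not part of this hunk; the sketch below only illustrates how such a hash is typically walked, with a hypothetical my_zone_wplug entry, a made-up lookup helper, and locking elided.

#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/list.h>

struct my_zone_wplug {			/* hypothetical plug entry */
	struct hlist_node node;
	unsigned int zone_no;
};

/* Find the plug for a zone, if any; take zone_wplugs_lock (or RCU) in real code. */
static struct my_zone_wplug *my_get_zone_wplug(struct gendisk *disk,
					       unsigned int zone_no)
{
	unsigned int idx = hash_32(zone_no, disk->zone_wplugs_hash_bits);
	struct my_zone_wplug *zwplug;

	hlist_for_each_entry(zwplug, &disk->zone_wplugs_hash[idx], node)
		if (zwplug->zone_no == zone_no)
			return zwplug;

	return NULL;
}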
+ */ +static inline bool disk_has_partscan(struct gendisk *disk) +{ + return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && + !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. @@ -331,8 +343,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); -int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)); +int blk_revalidate_disk_zones(struct gendisk *disk); /* * Independent access ranges: struct blk_independent_access_range describes @@ -449,8 +460,6 @@ struct request_queue { atomic_t nr_active_requests_shared_tags; - unsigned int required_elevator_features; - struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; @@ -633,15 +642,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - if (!blk_queue_is_zoned(disk->queue)) - return false; - if (!disk->conv_zones_bitmap) - return true; - return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); -} - static inline void disk_set_max_open_zones(struct gendisk *disk, unsigned int max_open_zones) { @@ -664,6 +664,7 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return bdev->bd_disk->queue->limits.max_active_zones; } +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int bdev_nr_zones(struct block_device *bdev) { @@ -674,10 +675,6 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - return false; -} static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { return 0; @@ -691,6 +688,10 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { return 0; } +static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + return false; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_depth(struct request_queue *q) @@ -855,9 +856,11 @@ static inline unsigned int bio_zone_no(struct bio *bio) return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); } -static inline unsigned int bio_zone_is_seq(struct bio *bio) +static inline bool bio_straddles_zones(struct bio *bio) { - return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); + return bio_sectors(bio) && + bio_zone_no(bio) != + disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); } /* @@ -942,14 +945,6 @@ disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars); -/* - * Elevator features for blk_queue_required_elevator_features: - */ -/* Supports zoned block devices sequential write constraint */ -#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) - -extern void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features); extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, struct device *dev); @@ -1156,12 +1151,29 @@ static inline unsigned int 
queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) +static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) { + unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); - const struct queue_limits *l = &q->limits; + return min_not_zero(l->max_zone_append_sectors, max_sectors); +} + +static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) +{ + if (!blk_queue_is_zoned(q)) + return 0; + + return queue_limits_max_zone_append_sectors(&q->limits); +} + +static inline bool queue_emulates_zone_append(struct request_queue *q) +{ + return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; +} - return min(l->max_zone_append_sectors, l->max_sectors); +static inline bool bdev_emulates_zone_append(struct block_device *bdev) +{ + return queue_emulates_zone_append(bdev_get_queue(bdev)); } static inline unsigned int @@ -1303,18 +1315,6 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) return disk_zone_no(bdev->bd_disk, sec); } -/* Whether write serialization is required for @op on zoned devices. */ -static inline bool op_needs_zoned_write_locking(enum req_op op) -{ - return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; -} - -static inline bool bdev_op_is_zoned_write(struct block_device *bdev, - enum req_op op) -{ - return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op); -} - static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -1330,6 +1330,12 @@ static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, return sector & (bdev_zone_sectors(bdev) - 1); } +static inline sector_t bio_offset_from_zone_start(struct bio *bio) +{ + return bdev_offset_from_zone_start(bio->bi_bdev, + bio->bi_iter.bi_sector); +} + static inline bool bdev_is_zone_start(struct block_device *bdev, sector_t sector) { diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 92c6b1fd8989..1e453f825c05 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -494,18 +494,18 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, struct sbitmap_word *map = &sb->map[index]; unsigned long get_mask; unsigned int map_depth = __map_depth(sb, index); + unsigned long val; sbitmap_deferred_clear(map); - if (map->word == (1UL << (map_depth - 1)) - 1) + val = READ_ONCE(map->word); + if (val == (1UL << (map_depth - 1)) - 1) goto next; - nr = find_first_zero_bit(&map->word, map_depth); + nr = find_first_zero_bit(&val, map_depth); if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; - unsigned long val; get_mask = ((1UL << nr_tags) - 1) << nr; - val = READ_ONCE(map->word); while (!atomic_long_try_cmpxchg(ptr, &val, get_mask | val)) ;
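With the reworked zone append helpers in the blkdev.h hunk above, a zoned queue that advertises no native zone append limit is reported as emulated, while queue_max_zone_append_sectors() still returns a usable value derived from chunk_sectors and max_hw_sectors via queue_limits_max_zone_append_sectors(). A small sketch that prints what a given block device reports; report_zone_append_caps() is an illustrative name, not part of this series.

#include <linux/blkdev.h>
#include <linux/printk.h>

static void report_zone_append_caps(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (!bdev_is_zoned(bdev)) {
		pr_info("%pg: not zoned\n", bdev);
		return;
	}

	/*
	 * Emulated or native, callers get a non-zero per-append limit
	 * derived from chunk_sectors and max_hw_sectors.
	 */
	pr_info("%pg: zone append %s, up to %u sectors per append\n",
		bdev, bdev_emulates_zone_append(bdev) ? "emulated" : "native",
		queue_max_zone_append_sectors(q));
}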