diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 1fe9a553c37b..831f19a32e08 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: devices that support receiving integrity metadata. +What: /sys/block/<disk>/partscan +Date: May 2024 +Contact: Christoph Hellwig +Description: + The /sys/block/<disk>/partscan file reports if partition + scanning is enabled for the disk. It returns "1" if partition + scanning is enabled, or "0" if not. The value type is a 32-bit + unsigned integer, but only "0" and "1" are valid values. + + What: /sys/block/<disk>/<partition>/alignment_offset Date: April 2009 Contact: Martin K. Petersen @@ -584,18 +594,6 @@ Description: the data. If no such restriction exists, this file will contain '0'. This file is writable for testing purposes. - -What: /sys/block/<disk>/queue/throttle_sample_time -Date: March 2017 -Contact: linux-block@vger.kernel.org -Description: - [RW] This is the time window that blk-throttle samples data, in - millisecond. blk-throttle makes decision based on the - samplings. Lower time means cgroups have more smooth throughput, - but higher CPU overhead. This exists only when - CONFIG_BLK_DEV_THROTTLING_LOW is enabled. - - What: /sys/block/<disk>/queue/virt_boundary_mask Date: April 2021 Contact: linux-block@vger.kernel.org diff --git a/arch/loongarch/configs/loongson3_defconfig b/arch/loongarch/configs/loongson3_defconfig index f18c2ba871ef..fc0d89d4c1c5 100644 --- a/arch/loongarch/configs/loongson3_defconfig +++ b/arch/loongarch/configs/loongson3_defconfig @@ -76,7 +76,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_MODVERSIONS=y CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y CONFIG_BLK_WBT=y CONFIG_BLK_CGROUP_IOLATENCY=y CONFIG_BLK_CGROUP_FC_APPID=y diff --git a/block/Kconfig b/block/Kconfig index 1de4682d48cc..dc12af58dbae 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -100,7 +100,6 @@ config BLK_DEV_WRITE_MOUNTED config BLK_DEV_ZONED bool "Zoned block device support" - select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables support for ZAC/ZBC/ZNS host-managed and host-aware zoned block @@ -120,17 +119,6 @@ config BLK_DEV_THROTTLING See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. -config BLK_DEV_THROTTLING_LOW - bool "Block throttling .low limit interface support (EXPERIMENTAL)" - depends on BLK_DEV_THROTTLING - help - Add .low limit interface for block throttling. The low limit is a best - effort limit to prioritize cgroups. Depending on the setting, the limit - can be used to protect cgroups in terms of bandwidth/iops and better - utilize disk resource. - - Note, this is an experimental interface and could be changed someday. - config BLK_WBT bool "Enable support for block device writeback throttling" help @@ -198,10 +186,6 @@ config BLK_DEBUG_FS Unless you are building a kernel for a tiny system, you should say Y here.
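For the new partscan attribute documented above, a minimal userspace reader could look like the sketch below. This is illustrative only and not part of the patch; the device name "sda" is an arbitrary placeholder and error handling is kept to a bare minimum.

#include <stdio.h>

int main(void)
{
	unsigned int enabled;
	FILE *f = fopen("/sys/block/sda/partscan", "r");	/* "sda" is a placeholder */

	if (!f)
		return 1;
	if (fscanf(f, "%u", &enabled) == 1)
		printf("partition scanning is %s\n", enabled ? "enabled" : "disabled");
	fclose(f);
	return 0;
}
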
-config BLK_DEBUG_FS_ZONED - bool - default BLK_DEBUG_FS && BLK_DEV_ZONED - config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" depends on KEYS diff --git a/block/Makefile b/block/Makefile index 46ada9dc8bbf..168150b9c510 100644 --- a/block/Makefile +++ b/block/Makefile @@ -33,7 +33,6 @@ obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o -obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ diff --git a/block/bio.c b/block/bio.c index d24420ed1c4c..53f608028c78 100644 --- a/block/bio.c +++ b/block/bio.c @@ -345,18 +345,29 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); -struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, - unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +/** + * bio_chain_and_submit - submit a bio after chaining it to another one + * @prev: bio to chain and submit + * @new: bio to chain to + * + * If @prev is non-NULL, chain it to @new and submit it. + * + * Return: @new. + */ +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new) { - struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); - - if (bio) { - bio_chain(bio, new); - submit_bio(bio); + if (prev) { + bio_chain(prev, new); + submit_bio(prev); } - return new; } + +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +{ + return bio_chain_and_submit(bio, bio_alloc(bdev, nr_pages, opf, gfp)); +} EXPORT_SYMBOL_GPL(blk_next_bio); static void bio_alloc_rescue(struct work_struct *work) @@ -1384,6 +1395,26 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); +static void bio_wait_end_io(struct bio *bio) +{ + complete(bio->bi_private); + bio_put(bio); +} + +/* + * bio_await_chain - ends @bio and waits for every chained bio to complete + */ +void bio_await_chain(struct bio *bio) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = bio_wait_end_io; + bio_endio(bio); + blk_wait_io(&done); +} + void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) @@ -1576,6 +1607,8 @@ void bio_endio(struct bio *bio) if (!bio_integrity_endio(bio)) return; + blk_zone_bio_endio(bio); + rq_qos_done_bio(bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { @@ -1596,7 +1629,6 @@ void bio_endio(struct bio *bio) goto again; } - blk_throtl_bio_endio(bio); /* release cgroup info */ bio_uninit(bio); if (bio->bi_end_io) diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index 3304e841df7c..a55fb0c53558 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -9,25 +9,19 @@ int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) { int i, ret; - for (i = 0; i < BLKG_RWSTAT_NR; i++) { - ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); - if (ret) { - while (--i >= 0) - percpu_counter_destroy(&rwstat->cpu_cnt[i]); - return ret; - } + ret = percpu_counter_init_many(rwstat->cpu_cnt, 0, gfp, BLKG_RWSTAT_NR); + if (ret) + return ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) atomic64_set(&rwstat->aux_cnt[i], 0); - } return 0; } EXPORT_SYMBOL_GPL(blkg_rwstat_init); void blkg_rwstat_exit(struct blkg_rwstat *rwstat) { - int i; - - for (i = 0; i < BLKG_RWSTAT_NR; i++) - 
percpu_counter_destroy(&rwstat->cpu_cnt[i]); + percpu_counter_destroy_many(rwstat->cpu_cnt, BLKG_RWSTAT_NR); } EXPORT_SYMBOL_GPL(blkg_rwstat_exit); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 059467086b13..4b1a35ab0ea4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -218,8 +218,7 @@ static void blkg_async_bio_workfn(struct work_struct *work) /* as long as there are pending bios, @blkg can't go away */ spin_lock(&blkg->async_bio_lock); - bio_list_merge(&bios, &blkg->async_bios); - bio_list_init(&blkg->async_bios); + bio_list_merge_init(&bios, &blkg->async_bios); spin_unlock(&blkg->async_bio_lock); /* start plug only when bio_list contains at least 2 bios */ @@ -1444,14 +1443,8 @@ int blkcg_init_disk(struct gendisk *disk) if (ret) goto err_destroy_all; - ret = blk_throtl_init(disk); - if (ret) - goto err_ioprio_exit; - return 0; -err_ioprio_exit: - blk_ioprio_exit(disk); err_destroy_all: blkg_destroy_all(disk); return ret; diff --git a/block/blk-core.c b/block/blk-core.c index b795ac177281..01186333c88e 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -591,8 +591,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ - if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) || - !bio_zone_is_seq(bio)) + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector)) return BLK_STS_IOERR; /* @@ -604,7 +603,7 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ - if (nr_sectors > q->limits.max_zone_append_sectors) + if (nr_sectors > queue_max_zone_append_sectors(q)) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; @@ -649,11 +648,13 @@ static void __submit_bio(struct bio *bio) static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; + struct blk_plug plug; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; + blk_start_plug(&plug); do { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -687,19 +688,23 @@ static void __submit_bio_noacct(struct bio *bio) bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); + blk_finish_plug(&plug); current->bio_list = NULL; } static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; + struct blk_plug plug; current->bio_list = bio_list; + blk_start_plug(&plug); do { __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); + blk_finish_plug(&plug); current->bio_list = NULL; } @@ -910,12 +915,6 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; - /* - * As the requests that require a zone lock are not plugged in the - * first place, directly accessing the plug instead of using - * blk_mq_plug() should not have any consequences during flushing for - * zoned devices. - */ blk_flush_plug(current->plug, false); /* @@ -987,10 +986,11 @@ void update_io_ticks(struct block_device *part, unsigned long now, bool end) unsigned long stamp; again: stamp = READ_ONCE(part->bd_stamp); - if (unlikely(time_after(now, stamp))) { - if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) - __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); - } + if (unlikely(time_after(now, stamp)) && + likely(try_cmpxchg(&part->bd_stamp, &stamp, now)) && + (end || part_in_flight(part))) + __part_stat_add(part, io_ticks, now - stamp); + if (part->bd_partno) { part = bdev_whole(part); goto again; diff --git a/block/blk-flush.c b/block/blk-flush.c index b0f314f4bc14..c17cf8ed8113 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -130,6 +130,8 @@ static void blk_flush_restore_request(struct request *rq) * original @rq->bio. Restore it. */ rq->bio = rq->biotail; + if (rq->bio) + rq->__sector = rq->bio->bi_iter.bi_sector; /* make @rq a normal request */ rq->rq_flags &= ~RQF_FLUSH_SEQ; diff --git a/block/blk-lib.c b/block/blk-lib.c index a6954eafb8c8..442da9dad042 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -35,51 +35,39 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } -int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) +struct bio *blk_alloc_discard_bio(struct block_device *bdev, + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask) { - struct bio *bio = *biop; - sector_t bs_mask; - - if (bdev_read_only(bdev)) - return -EPERM; - if (!bdev_max_discard_sectors(bdev)) - return -EOPNOTSUPP; - - /* In case the discard granularity isn't set by buggy device driver */ - if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { - pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n", - bdev); - return -EOPNOTSUPP; - } - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; + sector_t bio_sects = min(*nr_sects, bio_discard_limit(bdev, *sector)); + struct bio *bio; - if (!nr_sects) - return -EINVAL; + if (!bio_sects) + return NULL; - while (nr_sects) { - sector_t req_sects = - min(nr_sects, bio_discard_limit(bdev, sector)); + bio = bio_alloc(bdev, 0, REQ_OP_DISCARD, gfp_mask); + if (!bio) + return NULL; + bio->bi_iter.bi_sector = *sector; + bio->bi_iter.bi_size = bio_sects << SECTOR_SHIFT; + *sector += bio_sects; + *nr_sects -= bio_sects; + /* + * We can loop for a long time in here if someone does full device + * discards (like mkfs). Be nice and allow us to schedule out to avoid + * softlocking if preempt is disabled. + */ + cond_resched(); + return bio; +} - bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); - bio->bi_iter.bi_sector = sector; - bio->bi_iter.bi_size = req_sects << 9; - sector += req_sects; - nr_sects -= req_sects; - - /* - * We can loop for a long time in here, if someone does - * full device discards (like mkfs). Be nice and allow - * us to schedule out to avoid softlocking if preempt - * is disabled. 
- */ - cond_resched(); - } +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) +{ + struct bio *bio; - *biop = bio; + while ((bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, + gfp_mask))) + *biop = bio_chain_and_submit(*biop, bio); return 0; } EXPORT_SYMBOL(__blkdev_issue_discard); diff --git a/block/blk-merge.c b/block/blk-merge.c index 4e3483a16b75..8534c35e0497 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -377,6 +377,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, blkcg_bio_issue_init(split); bio_chain(split, bio); trace_block_split(split, bio->bi_iter.bi_sector); + WARN_ON_ONCE(bio_zone_write_plugging(bio)); submit_bio_noacct(bio); return split; } @@ -779,6 +780,8 @@ static void blk_account_io_merge_request(struct request *req) if (blk_do_io_stat(req)) { part_stat_lock(); part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } @@ -972,13 +975,7 @@ static void blk_account_io_merge_bio(struct request *req) part_stat_unlock(); } -enum bio_merge_status { - BIO_MERGE_OK, - BIO_MERGE_NONE, - BIO_MERGE_FAILED, -}; - -static enum bio_merge_status bio_attempt_back_merge(struct request *req, +enum bio_merge_status bio_attempt_back_merge(struct request *req, struct bio *bio, unsigned int nr_segs) { const blk_opf_t ff = bio_failfast(bio); @@ -994,6 +991,9 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, blk_update_mixed_merge(req, bio, false); + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) + blk_zone_write_plug_bio_merged(bio); + req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; @@ -1009,6 +1009,14 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, { const blk_opf_t ff = bio_failfast(bio); + /* + * A front merge for writes to sequential zones of a zoned block device + * can happen only if the user submitted writes out of order. Do not + * merge such write to let it fail. + */ + if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING) + return BIO_MERGE_FAILED; + if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; @@ -1107,10 +1115,9 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct blk_plug *plug; + struct blk_plug *plug = current->plug; struct request *rq; - plug = blk_mq_plug(bio); if (!plug || rq_list_empty(plug->mq_list)) return false; diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c deleted file mode 100644 index a77b099c34b7..000000000000 --- a/block/blk-mq-debugfs-zoned.c +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2017 Western Digital Corporation or its affiliates.
- */ - -#include -#include "blk-mq-debugfs.h" - -int queue_zone_wlock_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - unsigned int i; - - if (!q->disk->seq_zones_wlock) - return 0; - - for (i = 0; i < q->disk->nr_zones; i++) - if (test_bit(i, q->disk->seq_zones_wlock)) - seq_printf(m, "%u\n", i); - - return 0; -} diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 94668e72ab09..770c0c2b72fa 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -160,7 +160,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, - { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, + { "zone_wplugs", 0400, queue_zone_wplugs_show, NULL }, { }, }; @@ -256,7 +256,6 @@ static const char *const rqf_name[] = { RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), - RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(TIMED_OUT), RQF_NAME(RESV), }; diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index 9c7d4b6117d4..c80e453e3014 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -83,10 +83,10 @@ static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) } #endif -#ifdef CONFIG_BLK_DEBUG_FS_ZONED -int queue_zone_wlock_show(void *data, struct seq_file *m); +#if defined(CONFIG_BLK_DEV_ZONED) && defined(CONFIG_BLK_DEBUG_FS) +int queue_zone_wplugs_show(void *data, struct seq_file *m); #else -static inline int queue_zone_wlock_show(void *data, struct seq_file *m) +static inline int queue_zone_wplugs_show(void *data, struct seq_file *m) { return 0; } diff --git a/block/blk-mq.c b/block/blk-mq.c index 32afb87efbd0..8e01e4b32e10 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -690,6 +691,8 @@ static void blk_mq_finish_request(struct request *rq) { struct request_queue *q = rq->q; + blk_zone_finish_request(rq); + if (rq->rq_flags & RQF_USE_SCHED) { q->elevator->type->ops.finish_request(rq); /* @@ -761,31 +764,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, blk_status_t error) -{ - if (unlikely(error)) { - bio->bi_status = error; - } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { - /* - * Partial zone append completions cannot be supported as the - * BIO fragments may end up not being written sequentially. 
- */ - if (bio->bi_iter.bi_size != nbytes) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_iter.bi_sector = rq->__sector; - } - - bio_advance(bio, nbytes); - - if (unlikely(rq->rq_flags & RQF_QUIET)) - bio_set_flag(bio, BIO_QUIET); - /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) - bio_endio(bio); -} - static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { @@ -845,8 +823,7 @@ static void blk_complete_request(struct request *req) /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - if (req_op(req) == REQ_OP_ZONE_APPEND) - bio->bi_iter.bi_sector = req->__sector; + blk_zone_update_request_bio(req, bio); if (!is_flush) bio_endio(bio); @@ -889,6 +866,8 @@ static void blk_complete_request(struct request *req) bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { + bool is_flush = req->rq_flags & RQF_FLUSH_SEQ; + bool quiet = req->rq_flags & RQF_QUIET; int total_bytes; trace_block_rq_complete(req, error, nr_bytes); @@ -909,9 +888,8 @@ bool blk_update_request(struct request *req, blk_status_t error, if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) __blk_crypto_rq_put_keyslot(req); - if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET)) && - !test_bit(GD_DEAD, &req->q->disk->state)) { + if (unlikely(error && !blk_rq_is_passthrough(req) && !quiet) && + !test_bit(GD_DEAD, &req->q->disk->state)) { blk_print_req_error(req, error); trace_block_rq_error(req, error, nr_bytes); } @@ -923,12 +901,33 @@ bool blk_update_request(struct request *req, blk_status_t error, struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - if (bio_bytes == bio->bi_iter.bi_size) + if (unlikely(error)) + bio->bi_status = error; + + if (bio_bytes == bio->bi_iter.bi_size) { req->bio = bio->bi_next; + } else if (bio_is_zone_append(bio) && error == BLK_STS_OK) { + /* + * Partial zone append completions cannot be supported + * as the BIO fragments may end up not being written + * sequentially. + */ + bio->bi_status = BLK_STS_IOERR; + } /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); - req_bio_endio(req, bio, bio_bytes, error); + if (unlikely(quiet)) + bio_set_flag(bio, BIO_QUIET); + + bio_advance(bio, bio_bytes); + + /* Don't actually finish bio if it's part of flush sequence */ + if (!bio->bi_iter.bi_size) { + blk_zone_update_request_bio(req, bio); + if (!is_flush) + bio_endio(bio); + } total_bytes += bio_bytes; nr_bytes -= bio_bytes; @@ -997,6 +996,8 @@ static inline void blk_account_io_done(struct request *req, u64 now) update_io_ticks(req->part, jiffies, true); part_stat_inc(req->part, ios[sgrp]); part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_local_dec(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } @@ -1019,6 +1020,8 @@ static inline void blk_account_io_start(struct request *req) part_stat_lock(); update_io_ticks(req->part, jiffies, false); + part_stat_local_inc(req->part, + in_flight[op_is_write(req_op(req))]); part_stat_unlock(); } } @@ -1330,11 +1333,6 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head) blk_account_io_start(rq); - /* - * As plugging can be enabled for passthrough requests on a zoned - * device, directly accessing the plug instead of using blk_mq_plug() - * should not have any consequences. 
- */ if (current->plug && !at_head) { blk_add_rq_to_plug(current->plug, rq); return; @@ -1921,19 +1919,6 @@ static void blk_mq_handle_dev_resource(struct request *rq, __blk_mq_requeue_request(rq); } -static void blk_mq_handle_zone_resource(struct request *rq, - struct list_head *zone_list) -{ - /* - * If we end up here it is because we cannot dispatch a request to a - * specific zone due to LLD level zone-write locking or other zone - * related resource not being available. In this case, set the request - * aside in zone_list for retrying it later. - */ - list_add(&rq->queuelist, zone_list); - __blk_mq_requeue_request(rq); -} - enum prep_dispatch { PREP_DISPATCH_OK, PREP_DISPATCH_NO_TAG, @@ -2019,7 +2004,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, struct request *rq; int queued; blk_status_t ret = BLK_STS_OK; - LIST_HEAD(zone_list); bool needs_resource = false; if (list_empty(list)) @@ -2061,23 +2045,11 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, case BLK_STS_DEV_RESOURCE: blk_mq_handle_dev_resource(rq, list); goto out; - case BLK_STS_ZONE_RESOURCE: - /* - * Move the request to zone_list and keep going through - * the dispatch list to find more requests the drive can - * accept. - */ - blk_mq_handle_zone_resource(rq, &zone_list); - needs_resource = true; - break; default: blk_mq_end_request(rq, ret); } } while (!list_empty(list)); out: - if (!list_empty(&zone_list)) - list_splice_tail_init(&zone_list, list); - /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ @@ -2163,6 +2135,15 @@ static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) return cpu; } +/* + * ->next_cpu is always calculated from hctx->cpumask, so simply use + * it for speeding up the check + */ +static bool blk_mq_hctx_empty_cpumask(struct blk_mq_hw_ctx *hctx) +{ + return hctx->next_cpu >= nr_cpu_ids; +} + /* * It'd be great if the workqueue API had a way to pass * in a mask and had some smarts for more clever placement. @@ -2174,7 +2155,8 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) bool tried = false; int next_cpu = hctx->next_cpu; - if (hctx->queue->nr_hw_queues == 1) + /* Switch to unbound if no allowable CPUs in this hctx */ + if (hctx->queue->nr_hw_queues == 1 || blk_mq_hctx_empty_cpumask(hctx)) return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { @@ -2948,22 +2930,37 @@ static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, void blk_mq_submit_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - struct blk_plug *plug = blk_mq_plug(bio); + struct blk_plug *plug = current->plug; const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; unsigned int nr_segs = 1; struct request *rq; blk_status_t ret; + /* + * If the plug has a cached request for this queue, try to use it. + */ + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); + + /* + * A BIO that was released from a zone write plug has already been + * through the preparation in this function, already holds a reference + * on the queue usage counter, and is the only write BIO in-flight for + * the target zone. Go straight to preparing a request for it. + */ + if (bio_zone_write_plugging(bio)) { + nr_segs = bio->__bi_nr_segments; + if (rq) + blk_queue_exit(q); + goto new_request; + } + bio = blk_queue_bounce(bio, q); /* - * If the plug has a cached request for this queue, try use it. 
- * * The cached request already holds a q_usage_counter reference and we * don't have to acquire a new one if we use it. */ - rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); if (!rq) { if (unlikely(bio_queue_enter(bio))) return; @@ -2980,6 +2977,10 @@ void blk_mq_submit_bio(struct bio *bio) if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) goto queue_exit; + if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs)) + goto queue_exit; + +new_request: if (!rq) { rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) @@ -3002,6 +3003,9 @@ void blk_mq_submit_bio(struct bio *bio) return; } + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_init_request(rq); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; @@ -3483,14 +3487,30 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) return data.has_rq; } -static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, - struct blk_mq_hw_ctx *hctx) +static bool blk_mq_hctx_has_online_cpu(struct blk_mq_hw_ctx *hctx, + unsigned int this_cpu) { - if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) - return false; - if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) - return false; - return true; + enum hctx_type type = hctx->type; + int cpu; + + /* + * hctx->cpumask has to rule out isolated CPUs, but userspace still + * might submit IOs on these isolated CPUs, so use the queue map to + * check if all CPUs mapped to this hctx are offline + */ + for_each_online_cpu(cpu) { + struct blk_mq_hw_ctx *h = blk_mq_map_queue_type(hctx->queue, + type, cpu); + + if (h != hctx) + continue; + + /* this hctx has at least one online CPU */ + if (this_cpu != cpu) + return true; + } + + return false; } static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) @@ -3498,8 +3518,7 @@ static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_online); - if (!cpumask_test_cpu(cpu, hctx->cpumask) || - !blk_mq_last_cpu_in_hctx(cpu, hctx)) + if (blk_mq_hctx_has_online_cpu(hctx, cpu)) return 0; /* @@ -3907,6 +3926,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) } queue_for_each_hw_ctx(q, hctx, i) { + int cpu; + /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -3933,6 +3954,15 @@ static void blk_mq_map_swqueue(struct request_queue *q) */ sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); + /* + * Rule out isolated CPUs from hctx->cpumask to avoid + * running block kworker on isolated CPUs + */ + for_each_cpu(cpu, hctx->cpumask) { + if (cpu_is_isolated(cpu)) + cpumask_clear_cpu(cpu, hctx->cpumask); + } + /* * Initialize batch roundrobin counts */ diff --git a/block/blk-mq.h b/block/blk-mq.h index f75a9ecfebde..260beea8e332 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -365,37 +365,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) qmap->mq_map[cpu] = 0; } -/* - * blk_mq_plug() - Get caller context plug - * @bio : the bio being submitted by the caller context - * - * Plugging, by design, may delay the insertion of BIOs into the elevator in - * order to increase BIO merging opportunities. This however can cause BIO - * insertion order to change from the order in which submit_bio() is being - * executed in the case of multiple contexts concurrently issuing BIOs to a - * device, even if these context are synchronized to tightly control BIO issuing - * order. 
While this is not a problem with regular block devices, this ordering - * change can cause write BIO failures with zoned block devices as these - * require sequential write patterns to zones. Prevent this from happening by - * ignoring the plug state of a BIO issuing context if it is for a zoned block - * device and the BIO to plug is a write operation. - * - * Return current->plug if the bio can be plugged and NULL otherwise - */ -static inline struct blk_plug *blk_mq_plug( struct bio *bio) -{ - /* Zoned block device write operation case: do not plug the BIO */ - if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && - bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) - return NULL; - - /* - * For regular block devices or read operations, use the context plug - * which may be NULL if blk_start_plug() was not executed. - */ - return current->plug; -} - /* Free all requests on the list */ static inline void blk_mq_free_requests(struct list_head *list) { diff --git a/block/blk-settings.c b/block/blk-settings.c index 9d6033e01f2e..ebba05a2bc7f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -411,24 +411,32 @@ EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); * blk_queue_max_zone_append_sectors - set max sectors for a single zone append * @q: the request queue for the device * @max_zone_append_sectors: maximum number of sectors to write per command + * + * Sets the maximum number of sectors allowed for zone append commands. If + * Specifying 0 for @max_zone_append_sectors indicates that the queue does + * not natively support zone append operations and that the block layer must + * emulate these operations using regular writes. **/ void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors) { - unsigned int max_sectors; + unsigned int max_sectors = 0; if (WARN_ON(!blk_queue_is_zoned(q))) return; - max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); - max_sectors = min(q->limits.chunk_sectors, max_sectors); + if (max_zone_append_sectors) { + max_sectors = min(q->limits.max_hw_sectors, + max_zone_append_sectors); + max_sectors = min(q->limits.chunk_sectors, max_sectors); - /* - * Signal eventual driver bugs resulting in the max_zone_append sectors limit - * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, - * or the max_hw_sectors limit not set. - */ - WARN_ON(!max_sectors); + /* + * Signal eventual driver bugs resulting in the max_zone_append + * sectors limit being 0 due to the chunk_sectors limit (zone + * size) not set or the max_hw_sectors limit not set. 
+ */ + WARN_ON_ONCE(!max_sectors); + } q->limits.max_zone_append_sectors = max_sectors; } @@ -755,8 +763,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); - t->max_zone_append_sectors = min(t->max_zone_append_sectors, - b->max_zone_append_sectors); + t->max_zone_append_sectors = min(queue_limits_max_zone_append_sectors(t), + queue_limits_max_zone_append_sectors(b)); t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, @@ -1043,22 +1051,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) } EXPORT_SYMBOL_GPL(blk_queue_write_cache); -/** - * blk_queue_required_elevator_features - Set a queue required elevator features - * @q: the request queue for the target device - * @features: Required elevator features OR'ed together - * - * Tell the block layer that for the device controlled through @q, only the - * only elevators that can be used are those that implement at least the set of - * features specified by @features. - */ -void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features) -{ - q->required_elevator_features = features; -} -EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); - /** * blk_queue_can_use_dma_map_merging - configure queue for merging segments. * @q: the request queue for the device diff --git a/block/blk-stat.c b/block/blk-stat.c index e42c263e53fb..eaf60097bbe1 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -57,9 +57,6 @@ void blk_stat_add(struct request *rq, u64 now) value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; - if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE) - blk_throtl_stat_add(rq, value); - rcu_read_lock(); cpu = get_cpu(); list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8c8f69d8ba48..f0f9314ab65c 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -224,7 +224,7 @@ static ssize_t queue_zone_write_granularity_show(struct request_queue *q, static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { - unsigned long long max_sectors = q->limits.max_zone_append_sectors; + unsigned long long max_sectors = queue_max_zone_append_sectors(q); return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); } @@ -516,10 +516,6 @@ QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); -#endif - /* legacy alias for logical_block_size: */ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = 0444 }, @@ -640,9 +636,6 @@ static struct attribute *queue_attrs[] = { &queue_fua_entry.attr, &queue_dax_entry.attr, &queue_poll_delay_entry.attr, -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - &blk_throtl_sample_time_entry.attr, -#endif &queue_virt_boundary_mask_entry.attr, &queue_dma_alignment_entry.attr, NULL, @@ -814,7 +807,6 @@ int blk_register_queue(struct gendisk *disk) blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); wbt_enable_default(disk); - blk_throtl_register(disk); /* Now everything is ready and send out KOBJ_ADD uevent */ kobject_uevent(&disk->queue_kobj, KOBJ_ADD); diff --git a/block/blk-throttle.c 
b/block/blk-throttle.c index f4850a6f860b..80aaca18bfb0 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -25,18 +25,6 @@ #define DFL_THROTL_SLICE_HD (HZ / 10) #define DFL_THROTL_SLICE_SSD (HZ / 50) #define MAX_THROTL_SLICE (HZ) -#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ -#define MIN_THROTL_BPS (320 * 1024) -#define MIN_THROTL_IOPS (10) -#define DFL_LATENCY_TARGET (-1L) -#define DFL_IDLE_THRESHOLD (0) -#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */ -#define LATENCY_FILTERED_SSD (0) -/* - * For HD, very small latency comes from sequential IO. Such IO is helpless to - * help determine if its IO is impacted by others, hence we ignore the IO - */ -#define LATENCY_FILTERED_HD (1000L) /* 1ms */ /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; @@ -70,19 +58,6 @@ struct throtl_data /* Work for dispatching throttled bios */ struct work_struct dispatch_work; - unsigned int limit_index; - bool limit_valid[LIMIT_CNT]; - - unsigned long low_upgrade_time; - unsigned long low_downgrade_time; - - unsigned int scale; - - struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; - struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; - struct latency_bucket __percpu *latency_buckets[2]; - unsigned long last_calculate_time; - unsigned long filtered_latency; bool track_bio_latency; }; @@ -126,84 +101,24 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) return container_of(sq, struct throtl_data, service_queue); } -/* - * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to - * make the IO dispatch more smooth. - * Scale up: linearly scale up according to elapsed time since upgrade. For - * every throtl_slice, the limit scales up 1/2 .low limit till the - * limit hits .max limit - * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit - */ -static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) -{ - /* arbitrary value to avoid too big scale */ - if (td->scale < 4096 && time_after_eq(jiffies, - td->low_upgrade_time + td->scale * td->throtl_slice)) - td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; - - return low + (low >> 1) * td->scale; -} - static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td; - uint64_t ret; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return U64_MAX; - td = tg->td; - ret = tg->bps[rw][td->limit_index]; - if (ret == 0 && td->limit_index == LIMIT_LOW) { - /* intermediate node or iops isn't 0 */ - if (!list_empty(&blkg->blkcg->css.children) || - tg->iops[rw][td->limit_index]) - return U64_MAX; - else - return MIN_THROTL_BPS; - } - - if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && - tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { - uint64_t adjusted; - - adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); - ret = min(tg->bps[rw][LIMIT_MAX], adjusted); - } - return ret; + return tg->bps[rw]; } static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) { struct blkcg_gq *blkg = tg_to_blkg(tg); - struct throtl_data *td; - unsigned int ret; if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) return UINT_MAX; - td = tg->td; - ret = tg->iops[rw][td->limit_index]; - if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { - /* intermediate node or bps isn't 0 */ - if (!list_empty(&blkg->blkcg->css.children) || - tg->bps[rw][td->limit_index]) - return UINT_MAX; - else - return 
MIN_THROTL_IOPS; - } - - if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && - tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { - uint64_t adjusted; - - adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); - if (adjusted > UINT_MAX) - adjusted = UINT_MAX; - ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); - } - return ret; + return tg->iops[rw]; } #define request_bucket_index(sectors) \ @@ -359,20 +274,10 @@ static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, } RB_CLEAR_NODE(&tg->rb_node); - tg->bps[READ][LIMIT_MAX] = U64_MAX; - tg->bps[WRITE][LIMIT_MAX] = U64_MAX; - tg->iops[READ][LIMIT_MAX] = UINT_MAX; - tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; - tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; - tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; - tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; - tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; - /* LIMIT_LOW will have default value 0 */ - - tg->latency_target = DFL_LATENCY_TARGET; - tg->latency_target_conf = DFL_LATENCY_TARGET; - tg->idletime_threshold = DFL_IDLE_THRESHOLD; - tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; + tg->bps[READ] = U64_MAX; + tg->bps[WRITE] = U64_MAX; + tg->iops[READ] = UINT_MAX; + tg->iops[WRITE] = UINT_MAX; return &tg->pd; @@ -418,18 +323,15 @@ static void throtl_pd_init(struct blkg_policy_data *pd) static void tg_update_has_rules(struct throtl_grp *tg) { struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); - struct throtl_data *td = tg->td; int rw; for (rw = READ; rw <= WRITE; rw++) { tg->has_rules_iops[rw] = (parent_tg && parent_tg->has_rules_iops[rw]) || - (td->limit_valid[td->limit_index] && - tg_iops_limit(tg, rw) != UINT_MAX); + tg_iops_limit(tg, rw) != UINT_MAX; tg->has_rules_bps[rw] = (parent_tg && parent_tg->has_rules_bps[rw]) || - (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX)); + tg_bps_limit(tg, rw) != U64_MAX; } } @@ -443,49 +345,6 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static void blk_throtl_update_limit_valid(struct throtl_data *td) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - bool low_valid = false; - - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || - tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { - low_valid = true; - break; - } - } - rcu_read_unlock(); - - td->limit_valid[LIMIT_LOW] = low_valid; -} -#else -static inline void blk_throtl_update_limit_valid(struct throtl_data *td) -{ -} -#endif - -static void throtl_upgrade_state(struct throtl_data *td); -static void throtl_pd_offline(struct blkg_policy_data *pd) -{ - struct throtl_grp *tg = pd_to_tg(pd); - - tg->bps[READ][LIMIT_LOW] = 0; - tg->bps[WRITE][LIMIT_LOW] = 0; - tg->iops[READ][LIMIT_LOW] = 0; - tg->iops[WRITE][LIMIT_LOW] = 0; - - blk_throtl_update_limit_valid(tg->td); - - if (!tg->td->limit_valid[tg->td->limit_index]) - throtl_upgrade_state(tg->td); -} - static void throtl_pd_free(struct blkg_policy_data *pd) { struct throtl_grp *tg = pd_to_tg(pd); @@ -1151,8 +1010,6 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) return nr_disp; } -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg); /** * throtl_pending_timer_fn - timer function for service_queue->pending_timer * @t: the pending_timer member of the throtl_service_queue being 
serviced @@ -1189,9 +1046,6 @@ static void throtl_pending_timer_fn(struct timer_list *t) if (!q->root_blkg) goto out_unlock; - if (throtl_can_upgrade(td, NULL)) - throtl_upgrade_state(td); - again: parent_sq = sq->parent_sq; dispatched = false; @@ -1331,22 +1185,12 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) blkg_for_each_descendant_pre(blkg, pos_css, global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { struct throtl_grp *this_tg = blkg_to_tg(blkg); - struct throtl_grp *parent_tg; tg_update_has_rules(this_tg); /* ignore root/second level */ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || !blkg->parent->parent) continue; - parent_tg = blkg_to_tg(blkg->parent); - /* - * make sure all children has lower idle time threshold and - * higher latency target - */ - this_tg->idletime_threshold = min(this_tg->idletime_threshold, - parent_tg->idletime_threshold); - this_tg->latency_target = max(this_tg->latency_target, - parent_tg->latency_target); } rcu_read_unlock(); @@ -1367,6 +1211,53 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) } } +static int blk_throtl_init(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct throtl_data *td; + int ret; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); + throtl_service_queue_init(&td->service_queue); + + /* + * Freeze queue before activating policy, to synchronize with IO path, + * which is protected by 'q_usage_counter'. + */ + blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + + q->td = td; + td->queue = q; + + /* activate policy */ + ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); + if (ret) { + q->td = NULL; + kfree(td); + goto out; + } + + if (blk_queue_nonrot(q)) + td->throtl_slice = DFL_THROTL_SLICE_SSD; + else + td->throtl_slice = DFL_THROTL_SLICE_HD; + td->track_bio_latency = !queue_is_mq(q); + if (!td->track_bio_latency) + blk_stat_enable_accounting(q); + +out: + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + + return ret; +} + + static ssize_t tg_set_conf(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off, bool is_u64) { @@ -1378,6 +1269,16 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, blkg_conf_init(&ctx, buf); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out_finish; + + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { + ret = blk_throtl_init(ctx.bdev->bd_disk); + if (ret) + goto out_finish; + } + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; @@ -1444,25 +1345,25 @@ static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) static struct cftype throtl_legacy_files[] = { { .name = "throttle.read_bps_device", - .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, bps[READ]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.write_bps_device", - .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, bps[WRITE]), .seq_show = tg_print_conf_u64, .write = tg_set_conf_u64, }, { .name = "throttle.read_iops_device", - .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), + .private = offsetof(struct throtl_grp, iops[READ]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, { .name = "throttle.write_iops_device", - .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), + .private = 
offsetof(struct throtl_grp, iops[WRITE]), .seq_show = tg_print_conf_uint, .write = tg_set_conf_uint, }, @@ -1494,61 +1395,43 @@ static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, { struct throtl_grp *tg = pd_to_tg(pd); const char *dname = blkg_dev_name(pd->blkg); - char bufs[4][21] = { "max", "max", "max", "max" }; u64 bps_dft; unsigned int iops_dft; - char idle_time[26] = ""; - char latency_time[26] = ""; if (!dname) return 0; - if (off == LIMIT_LOW) { - bps_dft = 0; - iops_dft = 0; - } else { - bps_dft = U64_MAX; - iops_dft = UINT_MAX; - } + bps_dft = U64_MAX; + iops_dft = UINT_MAX; - if (tg->bps_conf[READ][off] == bps_dft && - tg->bps_conf[WRITE][off] == bps_dft && - tg->iops_conf[READ][off] == iops_dft && - tg->iops_conf[WRITE][off] == iops_dft && - (off != LIMIT_LOW || - (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && - tg->latency_target_conf == DFL_LATENCY_TARGET))) + if (tg->bps_conf[READ] == bps_dft && + tg->bps_conf[WRITE] == bps_dft && + tg->iops_conf[READ] == iops_dft && + tg->iops_conf[WRITE] == iops_dft) return 0; - if (tg->bps_conf[READ][off] != U64_MAX) - snprintf(bufs[0], sizeof(bufs[0]), "%llu", - tg->bps_conf[READ][off]); - if (tg->bps_conf[WRITE][off] != U64_MAX) - snprintf(bufs[1], sizeof(bufs[1]), "%llu", - tg->bps_conf[WRITE][off]); - if (tg->iops_conf[READ][off] != UINT_MAX) - snprintf(bufs[2], sizeof(bufs[2]), "%u", - tg->iops_conf[READ][off]); - if (tg->iops_conf[WRITE][off] != UINT_MAX) - snprintf(bufs[3], sizeof(bufs[3]), "%u", - tg->iops_conf[WRITE][off]); - if (off == LIMIT_LOW) { - if (tg->idletime_threshold_conf == ULONG_MAX) - strcpy(idle_time, " idle=max"); - else - snprintf(idle_time, sizeof(idle_time), " idle=%lu", - tg->idletime_threshold_conf); + seq_printf(sf, "%s", dname); + if (tg->bps_conf[READ] == U64_MAX) + seq_printf(sf, " rbps=max"); + else + seq_printf(sf, " rbps=%llu", tg->bps_conf[READ]); - if (tg->latency_target_conf == ULONG_MAX) - strcpy(latency_time, " latency=max"); - else - snprintf(latency_time, sizeof(latency_time), - " latency=%lu", tg->latency_target_conf); - } + if (tg->bps_conf[WRITE] == U64_MAX) + seq_printf(sf, " wbps=max"); + else + seq_printf(sf, " wbps=%llu", tg->bps_conf[WRITE]); + + if (tg->iops_conf[READ] == UINT_MAX) + seq_printf(sf, " riops=max"); + else + seq_printf(sf, " riops=%u", tg->iops_conf[READ]); - seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", - dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, - latency_time); + if (tg->iops_conf[WRITE] == UINT_MAX) + seq_printf(sf, " wiops=max"); + else + seq_printf(sf, " wiops=%u", tg->iops_conf[WRITE]); + + seq_printf(sf, "\n"); return 0; } @@ -1566,13 +1449,20 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, struct blkg_conf_ctx ctx; struct throtl_grp *tg; u64 v[4]; - unsigned long idle_time; - unsigned long latency_time; int ret; - int index = of_cft(of)->private; blkg_conf_init(&ctx, buf); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out_finish; + + if (!blk_throtl_activated(ctx.bdev->bd_queue)) { + ret = blk_throtl_init(ctx.bdev->bd_disk); + if (ret) + goto out_finish; + } + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) goto out_finish; @@ -1580,13 +1470,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg = blkg_to_tg(ctx.blkg); tg_update_carryover(tg); - v[0] = tg->bps_conf[READ][index]; - v[1] = tg->bps_conf[WRITE][index]; - v[2] = tg->iops_conf[READ][index]; - v[3] = tg->iops_conf[WRITE][index]; + v[0] = tg->bps[READ]; + v[1] = tg->bps[WRITE]; + v[2] = 
tg->iops[READ]; + v[3] = tg->iops[WRITE]; - idle_time = tg->idletime_threshold_conf; - latency_time = tg->latency_target_conf; while (true) { char tok[27]; /* wiops=18446744073709551616 */ char *p; @@ -1618,60 +1506,16 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, v[2] = min_t(u64, val, UINT_MAX); else if (!strcmp(tok, "wiops") && val > 1) v[3] = min_t(u64, val, UINT_MAX); - else if (off == LIMIT_LOW && !strcmp(tok, "idle")) - idle_time = val; - else if (off == LIMIT_LOW && !strcmp(tok, "latency")) - latency_time = val; else goto out_finish; } - tg->bps_conf[READ][index] = v[0]; - tg->bps_conf[WRITE][index] = v[1]; - tg->iops_conf[READ][index] = v[2]; - tg->iops_conf[WRITE][index] = v[3]; + tg->bps[READ] = v[0]; + tg->bps[WRITE] = v[1]; + tg->iops[READ] = v[2]; + tg->iops[WRITE] = v[3]; - if (index == LIMIT_MAX) { - tg->bps[READ][index] = v[0]; - tg->bps[WRITE][index] = v[1]; - tg->iops[READ][index] = v[2]; - tg->iops[WRITE][index] = v[3]; - } - tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], - tg->bps_conf[READ][LIMIT_MAX]); - tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], - tg->bps_conf[WRITE][LIMIT_MAX]); - tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], - tg->iops_conf[READ][LIMIT_MAX]); - tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], - tg->iops_conf[WRITE][LIMIT_MAX]); - tg->idletime_threshold_conf = idle_time; - tg->latency_target_conf = latency_time; - - /* force user to configure all settings for low limit */ - if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || - tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || - tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || - tg->latency_target_conf == DFL_LATENCY_TARGET) { - tg->bps[READ][LIMIT_LOW] = 0; - tg->bps[WRITE][LIMIT_LOW] = 0; - tg->iops[READ][LIMIT_LOW] = 0; - tg->iops[WRITE][LIMIT_LOW] = 0; - tg->idletime_threshold = DFL_IDLE_THRESHOLD; - tg->latency_target = DFL_LATENCY_TARGET; - } else if (index == LIMIT_LOW) { - tg->idletime_threshold = tg->idletime_threshold_conf; - tg->latency_target = tg->latency_target_conf; - } - - blk_throtl_update_limit_valid(tg->td); - if (tg->td->limit_valid[LIMIT_LOW]) { - if (index == LIMIT_LOW) - tg->td->limit_index = LIMIT_LOW; - } else - tg->td->limit_index = LIMIT_MAX; - tg_conf_updated(tg, index == LIMIT_LOW && - tg->td->limit_valid[LIMIT_LOW]); + tg_conf_updated(tg, false); ret = 0; out_finish: blkg_conf_exit(&ctx); @@ -1679,21 +1523,11 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, } static struct cftype throtl_files[] = { -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - { - .name = "low", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = tg_print_limit, - .write = tg_set_limit, - .private = LIMIT_LOW, - }, -#endif { .name = "max", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = tg_print_limit, .write = tg_set_limit, - .private = LIMIT_MAX, }, { } /* terminate */ }; @@ -1712,7 +1546,6 @@ struct blkcg_policy blkcg_policy_throtl = { .pd_alloc_fn = throtl_pd_alloc, .pd_init_fn = throtl_pd_init, .pd_online_fn = throtl_pd_online, - .pd_offline_fn = throtl_pd_offline, .pd_free_fn = throtl_pd_free, }; @@ -1722,6 +1555,9 @@ void blk_throtl_cancel_bios(struct gendisk *disk) struct cgroup_subsys_state *pos_css; struct blkcg_gq *blkg; + if (!blk_throtl_activated(q)) + return; + spin_lock_irq(&q->queue_lock); /* * queue_lock is held, rcu lock is not needed here technically. 
@@ -1761,418 +1597,6 @@ void blk_throtl_cancel_bios(struct gendisk *disk) spin_unlock_irq(&q->queue_lock); } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) -{ - unsigned long rtime = jiffies, wtime = jiffies; - - if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) - rtime = tg->last_low_overflow_time[READ]; - if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) - wtime = tg->last_low_overflow_time[WRITE]; - return min(rtime, wtime); -} - -static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) -{ - struct throtl_service_queue *parent_sq; - struct throtl_grp *parent = tg; - unsigned long ret = __tg_last_low_overflow_time(tg); - - while (true) { - parent_sq = parent->service_queue.parent_sq; - parent = sq_to_tg(parent_sq); - if (!parent) - break; - - /* - * The parent doesn't have low limit, it always reaches low - * limit. Its overflow time is useless for children - */ - if (!parent->bps[READ][LIMIT_LOW] && - !parent->iops[READ][LIMIT_LOW] && - !parent->bps[WRITE][LIMIT_LOW] && - !parent->iops[WRITE][LIMIT_LOW]) - continue; - if (time_after(__tg_last_low_overflow_time(parent), ret)) - ret = __tg_last_low_overflow_time(parent); - } - return ret; -} - -static bool throtl_tg_is_idle(struct throtl_grp *tg) -{ - /* - * cgroup is idle if: - * - single idle is too long, longer than a fixed value (in case user - * configure a too big threshold) or 4 times of idletime threshold - * - average think time is more than threshold - * - IO latency is largely below threshold - */ - unsigned long time; - bool ret; - - time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); - ret = tg->latency_target == DFL_LATENCY_TARGET || - tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (blk_time_get_ns() >> 10) - tg->last_finish_time > time || - tg->avg_idletime > tg->idletime_threshold || - (tg->latency_target && tg->bio_cnt && - tg->bad_bio_cnt * 5 < tg->bio_cnt); - throtl_log(&tg->service_queue, - "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", - tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, - tg->bio_cnt, ret, tg->td->scale); - return ret; -} - -static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw) -{ - struct throtl_service_queue *sq = &tg->service_queue; - bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW]; - - /* - * if low limit is zero, low limit is always reached. - * if low limit is non-zero, we can check if there is any request - * is queued to determine if low limit is reached as we throttle - * request according to limit. 
- */ - return !limit || sq->nr_queued[rw]; -} - -static bool throtl_tg_can_upgrade(struct throtl_grp *tg) -{ - /* - * cgroup reaches low limit when low limit of READ and WRITE are - * both reached, it's ok to upgrade to next limit if cgroup reaches - * low limit - */ - if (throtl_low_limit_reached(tg, READ) && - throtl_low_limit_reached(tg, WRITE)) - return true; - - if (time_after_eq(jiffies, - tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && - throtl_tg_is_idle(tg)) - return true; - return false; -} - -static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) -{ - while (true) { - if (throtl_tg_can_upgrade(tg)) - return true; - tg = sq_to_tg(tg->service_queue.parent_sq); - if (!tg || !tg_to_blkg(tg)->parent) - return false; - } - return false; -} - -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - if (td->limit_index != LIMIT_LOW) - return false; - - if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) - return false; - - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - - if (tg == this_tg) - continue; - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) - continue; - if (!throtl_hierarchy_can_upgrade(tg)) { - rcu_read_unlock(); - return false; - } - } - rcu_read_unlock(); - return true; -} - -static void throtl_upgrade_check(struct throtl_grp *tg) -{ - unsigned long now = jiffies; - - if (tg->td->limit_index != LIMIT_LOW) - return; - - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) - return; - - tg->last_check_time = now; - - if (!time_after_eq(now, - __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) - return; - - if (throtl_can_upgrade(tg->td, NULL)) - throtl_upgrade_state(tg->td); -} - -static void throtl_upgrade_state(struct throtl_data *td) -{ - struct cgroup_subsys_state *pos_css; - struct blkcg_gq *blkg; - - throtl_log(&td->service_queue, "upgrade to max"); - td->limit_index = LIMIT_MAX; - td->low_upgrade_time = jiffies; - td->scale = 0; - rcu_read_lock(); - blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { - struct throtl_grp *tg = blkg_to_tg(blkg); - struct throtl_service_queue *sq = &tg->service_queue; - - tg->disptime = jiffies - 1; - throtl_select_dispatch(sq); - throtl_schedule_next_dispatch(sq, true); - } - rcu_read_unlock(); - throtl_select_dispatch(&td->service_queue); - throtl_schedule_next_dispatch(&td->service_queue, true); - queue_work(kthrotld_workqueue, &td->dispatch_work); -} - -static void throtl_downgrade_state(struct throtl_data *td) -{ - td->scale /= 2; - - throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); - if (td->scale) { - td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; - return; - } - - td->limit_index = LIMIT_LOW; - td->low_downgrade_time = jiffies; -} - -static bool throtl_tg_can_downgrade(struct throtl_grp *tg) -{ - struct throtl_data *td = tg->td; - unsigned long now = jiffies; - - /* - * If cgroup is below low limit, consider downgrade and throttle other - * cgroups - */ - if (time_after_eq(now, tg_last_low_overflow_time(tg) + - td->throtl_slice) && - (!throtl_tg_is_idle(tg) || - !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) - return true; - return false; -} - -static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) -{ - struct throtl_data *td = tg->td; - - if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice)) - return false; - 
- while (true) { - if (!throtl_tg_can_downgrade(tg)) - return false; - tg = sq_to_tg(tg->service_queue.parent_sq); - if (!tg || !tg_to_blkg(tg)->parent) - break; - } - return true; -} - -static void throtl_downgrade_check(struct throtl_grp *tg) -{ - uint64_t bps; - unsigned int iops; - unsigned long elapsed_time; - unsigned long now = jiffies; - - if (tg->td->limit_index != LIMIT_MAX || - !tg->td->limit_valid[LIMIT_LOW]) - return; - if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) - return; - if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) - return; - - elapsed_time = now - tg->last_check_time; - tg->last_check_time = now; - - if (time_before(now, tg_last_low_overflow_time(tg) + - tg->td->throtl_slice)) - return; - - if (tg->bps[READ][LIMIT_LOW]) { - bps = tg->last_bytes_disp[READ] * HZ; - do_div(bps, elapsed_time); - if (bps >= tg->bps[READ][LIMIT_LOW]) - tg->last_low_overflow_time[READ] = now; - } - - if (tg->bps[WRITE][LIMIT_LOW]) { - bps = tg->last_bytes_disp[WRITE] * HZ; - do_div(bps, elapsed_time); - if (bps >= tg->bps[WRITE][LIMIT_LOW]) - tg->last_low_overflow_time[WRITE] = now; - } - - if (tg->iops[READ][LIMIT_LOW]) { - iops = tg->last_io_disp[READ] * HZ / elapsed_time; - if (iops >= tg->iops[READ][LIMIT_LOW]) - tg->last_low_overflow_time[READ] = now; - } - - if (tg->iops[WRITE][LIMIT_LOW]) { - iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; - if (iops >= tg->iops[WRITE][LIMIT_LOW]) - tg->last_low_overflow_time[WRITE] = now; - } - - /* - * If cgroup is below low limit, consider downgrade and throttle other - * cgroups - */ - if (throtl_hierarchy_can_downgrade(tg)) - throtl_downgrade_state(tg->td); - - tg->last_bytes_disp[READ] = 0; - tg->last_bytes_disp[WRITE] = 0; - tg->last_io_disp[READ] = 0; - tg->last_io_disp[WRITE] = 0; -} - -static void blk_throtl_update_idletime(struct throtl_grp *tg) -{ - unsigned long now; - unsigned long last_finish_time = tg->last_finish_time; - - if (last_finish_time == 0) - return; - - now = blk_time_get_ns() >> 10; - if (now <= last_finish_time || - last_finish_time == tg->checked_last_finish_time) - return; - - tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; - tg->checked_last_finish_time = last_finish_time; -} - -static void throtl_update_latency_buckets(struct throtl_data *td) -{ - struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; - int i, cpu, rw; - unsigned long last_latency[2] = { 0 }; - unsigned long latency[2]; - - if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW]) - return; - if (time_before(jiffies, td->last_calculate_time + HZ)) - return; - td->last_calculate_time = jiffies; - - memset(avg_latency, 0, sizeof(avg_latency)); - for (rw = READ; rw <= WRITE; rw++) { - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; - - for_each_possible_cpu(cpu) { - struct latency_bucket *bucket; - - /* this isn't race free, but ok in practice */ - bucket = per_cpu_ptr(td->latency_buckets[rw], - cpu); - tmp->total_latency += bucket[i].total_latency; - tmp->samples += bucket[i].samples; - bucket[i].total_latency = 0; - bucket[i].samples = 0; - } - - if (tmp->samples >= 32) { - int samples = tmp->samples; - - latency[rw] = tmp->total_latency; - - tmp->total_latency = 0; - tmp->samples = 0; - latency[rw] /= samples; - if (latency[rw] == 0) - continue; - avg_latency[rw][i].latency = latency[rw]; - } - } - } - - for (rw = READ; rw <= WRITE; rw++) { - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - if (!avg_latency[rw][i].latency) { - if 
(td->avg_buckets[rw][i].latency < last_latency[rw]) - td->avg_buckets[rw][i].latency = - last_latency[rw]; - continue; - } - - if (!td->avg_buckets[rw][i].valid) - latency[rw] = avg_latency[rw][i].latency; - else - latency[rw] = (td->avg_buckets[rw][i].latency * 7 + - avg_latency[rw][i].latency) >> 3; - - td->avg_buckets[rw][i].latency = max(latency[rw], - last_latency[rw]); - td->avg_buckets[rw][i].valid = true; - last_latency[rw] = td->avg_buckets[rw][i].latency; - } - } - - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) - throtl_log(&td->service_queue, - "Latency bucket %d: read latency=%ld, read valid=%d, " - "write latency=%ld, write valid=%d", i, - td->avg_buckets[READ][i].latency, - td->avg_buckets[READ][i].valid, - td->avg_buckets[WRITE][i].latency, - td->avg_buckets[WRITE][i].valid); -} -#else -static inline void throtl_update_latency_buckets(struct throtl_data *td) -{ -} - -static void blk_throtl_update_idletime(struct throtl_grp *tg) -{ -} - -static void throtl_downgrade_check(struct throtl_grp *tg) -{ -} - -static void throtl_upgrade_check(struct throtl_grp *tg) -{ -} - -static bool throtl_can_upgrade(struct throtl_data *td, - struct throtl_grp *this_tg) -{ - return false; -} - -static void throtl_upgrade_state(struct throtl_data *td) -{ -} -#endif - bool __blk_throtl_bio(struct bio *bio) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); @@ -2185,21 +1609,12 @@ bool __blk_throtl_bio(struct bio *bio) struct throtl_data *td = tg->td; rcu_read_lock(); - spin_lock_irq(&q->queue_lock); - - throtl_update_latency_buckets(td); - - blk_throtl_update_idletime(tg); - sq = &tg->service_queue; -again: while (true) { if (tg->last_low_overflow_time[rw] == 0) tg->last_low_overflow_time[rw] = jiffies; - throtl_downgrade_check(tg); - throtl_upgrade_check(tg); /* throtl is FIFO - if bios are already queued, should queue */ if (sq->nr_queued[rw]) break; @@ -2207,10 +1622,6 @@ bool __blk_throtl_bio(struct bio *bio) /* if above limits, break to queue */ if (!tg_may_dispatch(tg, bio, NULL)) { tg->last_low_overflow_time[rw] = jiffies; - if (throtl_can_upgrade(td, tg)) { - throtl_upgrade_state(td); - goto again; - } break; } @@ -2270,215 +1681,25 @@ bool __blk_throtl_bio(struct bio *bio) } out_unlock: -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - if (throttled || !td->track_bio_latency) - bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; -#endif spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); return throttled; } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static void throtl_track_latency(struct throtl_data *td, sector_t size, - enum req_op op, unsigned long time) -{ - const bool rw = op_is_write(op); - struct latency_bucket *latency; - int index; - - if (!td || td->limit_index != LIMIT_LOW || - !(op == REQ_OP_READ || op == REQ_OP_WRITE) || - !blk_queue_nonrot(td->queue)) - return; - - index = request_bucket_index(size); - - latency = get_cpu_ptr(td->latency_buckets[rw]); - latency[index].total_latency += time; - latency[index].samples++; - put_cpu_ptr(td->latency_buckets[rw]); -} - -void blk_throtl_stat_add(struct request *rq, u64 time_ns) -{ - struct request_queue *q = rq->q; - struct throtl_data *td = q->td; - - throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), - time_ns >> 10); -} - -void blk_throtl_bio_endio(struct bio *bio) -{ - struct blkcg_gq *blkg; - struct throtl_grp *tg; - u64 finish_time_ns; - unsigned long finish_time; - unsigned long start_time; - unsigned long lat; - int rw = bio_data_dir(bio); - - blkg = bio->bi_blkg; - if (!blkg) - return; - tg = blkg_to_tg(blkg); - if 
(!tg->td->limit_valid[LIMIT_LOW]) - return; - - finish_time_ns = blk_time_get_ns(); - tg->last_finish_time = finish_time_ns >> 10; - - start_time = bio_issue_time(&bio->bi_issue) >> 10; - finish_time = __bio_issue_time(finish_time_ns) >> 10; - if (!start_time || finish_time <= start_time) - return; - - lat = finish_time - start_time; - /* this is only for bio based driver */ - if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY)) - throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue), - bio_op(bio), lat); - - if (tg->latency_target && lat >= tg->td->filtered_latency) { - int bucket; - unsigned int threshold; - - bucket = request_bucket_index(bio_issue_size(&bio->bi_issue)); - threshold = tg->td->avg_buckets[rw][bucket].latency + - tg->latency_target; - if (lat > threshold) - tg->bad_bio_cnt++; - /* - * Not race free, could get wrong count, which means cgroups - * will be throttled - */ - tg->bio_cnt++; - } - - if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { - tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; - tg->bio_cnt /= 2; - tg->bad_bio_cnt /= 2; - } -} -#endif - -int blk_throtl_init(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - struct throtl_data *td; - int ret; - - td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); - if (!td) - return -ENOMEM; - td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * - LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets[READ]) { - kfree(td); - return -ENOMEM; - } - td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * - LATENCY_BUCKET_SIZE, __alignof__(u64)); - if (!td->latency_buckets[WRITE]) { - free_percpu(td->latency_buckets[READ]); - kfree(td); - return -ENOMEM; - } - - INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); - throtl_service_queue_init(&td->service_queue); - - q->td = td; - td->queue = q; - - td->limit_valid[LIMIT_MAX] = true; - td->limit_index = LIMIT_MAX; - td->low_upgrade_time = jiffies; - td->low_downgrade_time = jiffies; - - /* activate policy */ - ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); - if (ret) { - free_percpu(td->latency_buckets[READ]); - free_percpu(td->latency_buckets[WRITE]); - kfree(td); - } - return ret; -} - void blk_throtl_exit(struct gendisk *disk) { struct request_queue *q = disk->queue; - BUG_ON(!q->td); + if (!blk_throtl_activated(q)) + return; + del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); blkcg_deactivate_policy(disk, &blkcg_policy_throtl); - free_percpu(q->td->latency_buckets[READ]); - free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } -void blk_throtl_register(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - struct throtl_data *td; - int i; - - td = q->td; - BUG_ON(!td); - - if (blk_queue_nonrot(q)) { - td->throtl_slice = DFL_THROTL_SLICE_SSD; - td->filtered_latency = LATENCY_FILTERED_SSD; - } else { - td->throtl_slice = DFL_THROTL_SLICE_HD; - td->filtered_latency = LATENCY_FILTERED_HD; - for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { - td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; - td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; - } - } -#ifndef CONFIG_BLK_DEV_THROTTLING_LOW - /* if no low limit, use previous default */ - td->throtl_slice = DFL_THROTL_SLICE_HD; - -#else - td->track_bio_latency = !queue_is_mq(q); - if (!td->track_bio_latency) - blk_stat_enable_accounting(q); -#endif -} - -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -ssize_t blk_throtl_sample_time_show(struct 
request_queue *q, char *page) -{ - if (!q->td) - return -EINVAL; - return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); -} - -ssize_t blk_throtl_sample_time_store(struct request_queue *q, - const char *page, size_t count) -{ - unsigned long v; - unsigned long t; - - if (!q->td) - return -EINVAL; - if (kstrtoul(page, 10, &v)) - return -EINVAL; - t = msecs_to_jiffies(v); - if (t == 0 || t > MAX_THROTL_SLICE) - return -EINVAL; - q->td->throtl_slice = t; - return count; -} -#endif - static int __init throtl_init(void) { kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index bffbc9cfc8ab..393c3d134b96 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -58,12 +58,6 @@ enum tg_state_flags { THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ }; -enum { - LIMIT_LOW, - LIMIT_MAX, - LIMIT_CNT, -}; - struct throtl_grp { /* must be the first member */ struct blkg_policy_data pd; @@ -102,14 +96,14 @@ struct throtl_grp { bool has_rules_iops[2]; /* internally used bytes per second rate limits */ - uint64_t bps[2][LIMIT_CNT]; + uint64_t bps[2]; /* user configured bps limits */ - uint64_t bps_conf[2][LIMIT_CNT]; + uint64_t bps_conf[2]; /* internally used IOPS limits */ - unsigned int iops[2][LIMIT_CNT]; + unsigned int iops[2]; /* user configured IOPS limits */ - unsigned int iops_conf[2][LIMIT_CNT]; + unsigned int iops_conf[2]; /* Number of bytes dispatched in current slice */ uint64_t bytes_disp[2]; @@ -132,22 +126,10 @@ struct throtl_grp { unsigned long last_check_time; - unsigned long latency_target; /* us */ - unsigned long latency_target_conf; /* us */ /* When did we start a new slice */ unsigned long slice_start[2]; unsigned long slice_end[2]; - unsigned long last_finish_time; /* ns / 1024 */ - unsigned long checked_last_finish_time; /* ns / 1024 */ - unsigned long avg_idletime; /* ns / 1024 */ - unsigned long idletime_threshold; /* us */ - unsigned long idletime_threshold_conf; /* us */ - - unsigned int bio_cnt; /* total bios */ - unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ - unsigned long bio_cnt_reset_time; - struct blkg_rwstat stat_bytes; struct blkg_rwstat stat_ios; }; @@ -168,23 +150,33 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) * Internal throttling interface */ #ifndef CONFIG_BLK_DEV_THROTTLING -static inline int blk_throtl_init(struct gendisk *disk) { return 0; } static inline void blk_throtl_exit(struct gendisk *disk) { } -static inline void blk_throtl_register(struct gendisk *disk) { } static inline bool blk_throtl_bio(struct bio *bio) { return false; } static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } #else /* CONFIG_BLK_DEV_THROTTLING */ -int blk_throtl_init(struct gendisk *disk); void blk_throtl_exit(struct gendisk *disk); -void blk_throtl_register(struct gendisk *disk); bool __blk_throtl_bio(struct bio *bio); void blk_throtl_cancel_bios(struct gendisk *disk); +static inline bool blk_throtl_activated(struct request_queue *q) +{ + return q->td != NULL; +} + static inline bool blk_should_throtl(struct bio *bio) { - struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + struct throtl_grp *tg; int rw = bio_data_dir(bio); + /* + * This is called under bio_queue_enter(), and it's synchronized with + * the activation of blk-throtl, which is protected by + * blk_mq_freeze_queue(). 
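/*
 * Illustrative sketch, not part of this patch: throttle data (q->td) is no
 * longer guaranteed to exist for every queue, so any code dereferencing it
 * is expected to be guarded by blk_throtl_activated(), as blk_throtl_exit()
 * above and blk_should_throtl() here now do. A hypothetical helper following
 * the same pattern:
 */
static inline void sketch_touch_throtl_data(struct request_queue *q)
{
	if (!blk_throtl_activated(q))
		return;		/* q->td was never allocated for this queue */
	/* ... q->td may be dereferenced safely here ... */
}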
+ */ + if (!blk_throtl_activated(bio->bi_bdev->bd_queue)) + return false; + + tg = blkg_to_tg(bio->bi_blkg); if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { bio_set_flag(bio, BIO_CGROUP_ACCT); diff --git a/block/blk-zoned.c b/block/blk-zoned.c index da0f4b2a8fa0..48e5e3bbb89c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -7,6 +7,7 @@ * * Copyright (c) 2016, Damien Le Moal * Copyright (c) 2016, Western Digital + * Copyright (c) 2024, Western Digital Corporation or its affiliates. */ #include @@ -16,8 +17,13 @@ #include #include #include +#include +#include +#include #include "blk.h" +#include "blk-mq-sched.h" +#include "blk-mq-debugfs.h" #define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name static const char *const zone_cond_name[] = { @@ -32,6 +38,64 @@ static const char *const zone_cond_name[] = { }; #undef ZONE_COND_NAME +/* + * Per-zone write plug. + * @node: hlist_node structure for managing the plug using a hash table. + * @link: To list the plug in the zone write plug error list of the disk. + * @ref: Zone write plug reference counter. A zone write plug reference is + * always at least 1 when the plug is hashed in the disk plug hash table. + * The reference is incremented whenever a new BIO needing plugging is + * submitted and when a function needs to manipulate a plug. The + * reference count is decremented whenever a plugged BIO completes and + * when a function that referenced the plug returns. The initial + * reference is dropped whenever the zone of the zone write plug is reset, + * finished and when the zone becomes full (last write BIO to the zone + * completes). + * @lock: Spinlock to atomically manipulate the plug. + * @flags: Flags indicating the plug state. + * @zone_no: The number of the zone the plug is managing. + * @wp_offset: The zone write pointer location relative to the start of the zone + * as a number of 512B sectors. + * @bio_list: The list of BIOs that are currently plugged. + * @bio_work: Work struct to handle issuing of plugged BIOs + * @rcu_head: RCU head to free zone write plugs with an RCU grace period. + * @disk: The gendisk the plug belongs to. + */ +struct blk_zone_wplug { + struct hlist_node node; + struct list_head link; + atomic_t ref; + spinlock_t lock; + unsigned int flags; + unsigned int zone_no; + unsigned int wp_offset; + struct bio_list bio_list; + struct work_struct bio_work; + struct rcu_head rcu_head; + struct gendisk *disk; +}; + +/* + * Zone write plug flags bits: + * - BLK_ZONE_WPLUG_PLUGGED: Indicates that the zone write plug is plugged, + * that is, that write BIOs are being throttled due to a write BIO already + * being executed or the zone write plug bio list is not empty. + * - BLK_ZONE_WPLUG_ERROR: Indicates that a write error happened which will be + * recovered with a report zone to update the zone write pointer offset. + * - BLK_ZONE_WPLUG_UNHASHED: Indicates that the zone write plug was removed + * from the disk hash table and that the initial reference to the zone + * write plug set when the plug was first added to the hash table has been + * dropped. This flag is set when a zone is reset, finished or become full, + * to prevent new references to the zone write plug to be taken for + * newly incoming BIOs. A zone write plug flagged with this flag will be + * freed once all remaining references from BIOs or functions are dropped. 
+ */ +#define BLK_ZONE_WPLUG_PLUGGED (1U << 0) +#define BLK_ZONE_WPLUG_ERROR (1U << 1) +#define BLK_ZONE_WPLUG_UNHASHED (1U << 2) + +#define BLK_ZONE_WPLUG_BUSY (BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR) + /** * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. * @zone_cond: BLK_ZONE_COND_XXX. @@ -51,52 +115,6 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -/* - * Return true if a request is a write requests that needs zone write locking. - */ -bool blk_req_needs_zone_write_lock(struct request *rq) -{ - if (!rq->q->disk->seq_zones_wlock) - return false; - - return blk_rq_is_seq_zoned_write(rq); -} -EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); - -bool blk_req_zone_write_trylock(struct request *rq) -{ - unsigned int zno = blk_rq_zone_no(rq); - - if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock)) - return false; - - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; - - return true; -} -EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); - -void __blk_req_zone_write_lock(struct request *rq) -{ - if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), - rq->q->disk->seq_zones_wlock))) - return; - - WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); - rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; -} -EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); - -void __blk_req_zone_write_unlock(struct request *rq) -{ - rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; - if (rq->q->disk->seq_zones_wlock) - WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), - rq->q->disk->seq_zones_wlock)); -} -EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); - /** * bdev_nr_zones - Get number of zones * @bdev: Target device @@ -425,189 +443,1483 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, return ret; } -void disk_free_zone_bitmaps(struct gendisk *disk) +static inline bool disk_zone_is_conv(struct gendisk *disk, sector_t sector) { - kfree(disk->conv_zones_bitmap); - disk->conv_zones_bitmap = NULL; - kfree(disk->seq_zones_wlock); - disk->seq_zones_wlock = NULL; + if (!disk->conv_zones_bitmap) + return false; + return test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); } -struct blk_revalidate_zone_args { - struct gendisk *disk; - unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; - unsigned int nr_zones; - sector_t sector; -}; +static bool disk_insert_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + struct blk_zone_wplug *zwplg; + unsigned long flags; + unsigned int idx = + hash_32(zwplug->zone_no, disk->zone_wplugs_hash_bits); -/* - * Helper function to check the validity of zones of a zoned block device. - */ -static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) + /* + * Add the new zone write plug to the hash table, but carefully as we + * are racing with other submission context, so we may already have a + * zone write plug for the same zone. 
+ */ + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + hlist_for_each_entry_rcu(zwplg, &disk->zone_wplugs_hash[idx], node) { + if (zwplg->zone_no == zwplug->zone_no) { + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + return false; + } + } + hlist_add_head_rcu(&zwplug->node, &disk->zone_wplugs_hash[idx]); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + return true; +} + +static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk, + sector_t sector) { - struct blk_revalidate_zone_args *args = data; - struct gendisk *disk = args->disk; - struct request_queue *q = disk->queue; - sector_t capacity = get_capacity(disk); - sector_t zone_sectors = q->limits.chunk_sectors; + unsigned int zno = disk_zone_no(disk, sector); + unsigned int idx = hash_32(zno, disk->zone_wplugs_hash_bits); + struct blk_zone_wplug *zwplug; - /* Check for bad zones and holes in the zone report */ - if (zone->start != args->sector) { - pr_warn("%s: Zone gap at sectors %llu..%llu\n", - disk->disk_name, args->sector, zone->start); - return -ENODEV; + rcu_read_lock(); + + hlist_for_each_entry_rcu(zwplug, &disk->zone_wplugs_hash[idx], node) { + if (zwplug->zone_no == zno && + atomic_inc_not_zero(&zwplug->ref)) { + rcu_read_unlock(); + return zwplug; + } } - if (zone->start >= capacity || !zone->len) { - pr_warn("%s: Invalid zone start %llu, length %llu\n", - disk->disk_name, zone->start, zone->len); - return -ENODEV; + rcu_read_unlock(); + + return NULL; +} + +static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head) +{ + struct blk_zone_wplug *zwplug = + container_of(rcu_head, struct blk_zone_wplug, rcu_head); + + mempool_free(zwplug, zwplug->disk->zone_wplugs_pool); +} + +static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug) +{ + if (atomic_dec_and_test(&zwplug->ref)) { + WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list)); + WARN_ON_ONCE(!list_empty(&zwplug->link)); + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_UNHASHED)); + + call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu); } +} + +static inline bool disk_should_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* If the zone write plug was already removed, we are done. */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) + return false; + + /* If the zone write plug is still busy, it cannot be removed. */ + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + return false; /* - * All zones must have the same size, with the exception on an eventual - * smaller last zone. + * Completions of BIOs with blk_zone_write_plug_bio_endio() may + * happen after handling a request completion with + * blk_zone_write_plug_finish_request() (e.g. with split BIOs + * that are chained). In such case, disk_zone_wplug_unplug_bio() + * should not attempt to remove the zone write plug until all BIO + * completions are seen. Check by looking at the zone write plug + * reference count, which is 2 when the plug is unused (one reference + * taken when the plug was allocated and another reference taken by the + * caller context). */ - if (zone->start + zone->len < capacity) { - if (zone->len != zone_sectors) { - pr_warn("%s: Invalid zoned device with non constant zone size\n", - disk->disk_name); - return -ENODEV; + if (atomic_read(&zwplug->ref) > 2) + return false; + + /* We can remove zone write plugs for zones that are empty or full. 
*/ + return !zwplug->wp_offset || zwplug->wp_offset >= disk->zone_capacity; +} + +static void disk_remove_zone_wplug(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + /* If the zone write plug was already removed, we have nothing to do. */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) + return; + + /* + * Mark the zone write plug as unhashed and drop the extra reference we + * took when the plug was inserted in the hash table. + */ + zwplug->flags |= BLK_ZONE_WPLUG_UNHASHED; + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + hlist_del_init_rcu(&zwplug->node); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + disk_put_zone_wplug(zwplug); +} + +static void blk_zone_wplug_bio_work(struct work_struct *work); + +/* + * Get a reference on the write plug for the zone containing @sector. + * If the plug does not exist, it is allocated and hashed. + * Return a pointer to the zone write plug with the plug spinlock held. + */ +static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk, + sector_t sector, gfp_t gfp_mask, + unsigned long *flags) +{ + unsigned int zno = disk_zone_no(disk, sector); + struct blk_zone_wplug *zwplug; + +again: + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + /* + * Check that a BIO completion or a zone reset or finish + * operation has not already removed the zone write plug from + * the hash table and dropped its reference count. In such case, + * we need to get a new plug so start over from the beginning. + */ + spin_lock_irqsave(&zwplug->lock, *flags); + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { + spin_unlock_irqrestore(&zwplug->lock, *flags); + disk_put_zone_wplug(zwplug); + goto again; } - } else if (zone->len > zone_sectors) { - pr_warn("%s: Invalid zoned device with larger last zone size\n", - disk->disk_name); - return -ENODEV; + return zwplug; } - /* Check zone type */ - switch (zone->type) { - case BLK_ZONE_TYPE_CONVENTIONAL: - if (!args->conv_zones_bitmap) { - args->conv_zones_bitmap = - blk_alloc_zone_bitmap(q->node, args->nr_zones); - if (!args->conv_zones_bitmap) - return -ENOMEM; - } - set_bit(idx, args->conv_zones_bitmap); - break; - case BLK_ZONE_TYPE_SEQWRITE_REQ: - if (!args->seq_zones_wlock) { - args->seq_zones_wlock = - blk_alloc_zone_bitmap(q->node, args->nr_zones); - if (!args->seq_zones_wlock) - return -ENOMEM; - } - break; - case BLK_ZONE_TYPE_SEQWRITE_PREF: - default: - pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", - disk->disk_name, (int)zone->type, zone->start); - return -ENODEV; + /* + * Allocate and initialize a zone write plug with an extra reference + * so that it is not freed when the zone write plug becomes idle without + * the zone being full. + */ + zwplug = mempool_alloc(disk->zone_wplugs_pool, gfp_mask); + if (!zwplug) + return NULL; + + INIT_HLIST_NODE(&zwplug->node); + INIT_LIST_HEAD(&zwplug->link); + atomic_set(&zwplug->ref, 2); + spin_lock_init(&zwplug->lock); + zwplug->flags = 0; + zwplug->zone_no = zno; + zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1); + bio_list_init(&zwplug->bio_list); + INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work); + zwplug->disk = disk; + + spin_lock_irqsave(&zwplug->lock, *flags); + + /* + * Insert the new zone write plug in the hash table. This can fail only + * if another context already inserted a plug. Retry from the beginning + * in such case. 
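/*
 * Illustrative sketch, not part of this patch: because the zone size
 * (chunk_sectors) must be a power of two (enforced in
 * blk_revalidate_disk_zones() below), the zone number and the initial
 * wp_offset computed in disk_get_and_lock_zone_wplug() above reduce to a
 * shift and a mask of the BIO sector. For example, with 524288-sector
 * (256 MiB) zones, sector 1573888 maps to zone 3 at offset 1024.
 * The helper names below are hypothetical.
 */
static inline unsigned int sketch_zone_no(sector_t sector, sector_t zone_sectors)
{
	return sector >> ilog2(zone_sectors);
}

static inline unsigned int sketch_zone_offset(sector_t sector, sector_t zone_sectors)
{
	return sector & (zone_sectors - 1);
}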
+ */ + if (!disk_insert_zone_wplug(disk, zwplug)) { + spin_unlock_irqrestore(&zwplug->lock, *flags); + mempool_free(zwplug, disk->zone_wplugs_pool); + goto again; } - args->sector += zone->len; - return 0; + return zwplug; } -/** - * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps - * @disk: Target disk - * @update_driver_data: Callback to update driver data on the frozen disk - * - * Helper function for low-level device drivers to check and (re) allocate and - * initialize a disk request queue zone bitmaps. This functions should normally - * be called within the disk ->revalidate method for blk-mq based drivers. - * Before calling this function, the device driver must already have set the - * device zone size (chunk_sector limit) and the max zone append limit. - * For BIO based drivers, this function cannot be used. BIO based device drivers - * only need to set disk->nr_zones so that the sysfs exposed value is correct. - * If the @update_driver_data callback function is not NULL, the callback is - * executed with the device request queue frozen after all zones have been - * checked. +static inline void blk_zone_wplug_bio_io_error(struct blk_zone_wplug *zwplug, + struct bio *bio) +{ + struct request_queue *q = zwplug->disk->queue; + + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + bio_io_error(bio); + disk_put_zone_wplug(zwplug); + blk_queue_exit(q); +} + +/* + * Abort (fail) all plugged BIOs of a zone write plug. */ -int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)) +static void disk_zone_wplug_abort(struct blk_zone_wplug *zwplug) { - struct request_queue *q = disk->queue; - sector_t zone_sectors = q->limits.chunk_sectors; - sector_t capacity = get_capacity(disk); - struct blk_revalidate_zone_args args = { }; - unsigned int noio_flag; - int ret; + struct bio *bio; - if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) - return -EIO; - if (WARN_ON_ONCE(!queue_is_mq(q))) - return -EIO; + while ((bio = bio_list_pop(&zwplug->bio_list))) + blk_zone_wplug_bio_io_error(zwplug, bio); +} - if (!capacity) - return -ENODEV; +/* + * Abort (fail) all plugged BIOs of a zone write plug that are not aligned + * with the assumed write pointer location of the zone when the BIO will + * be unplugged. + */ +static void disk_zone_wplug_abort_unaligned(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned int zone_capacity = disk->zone_capacity; + unsigned int wp_offset = zwplug->wp_offset; + struct bio_list bl = BIO_EMPTY_LIST; + struct bio *bio; + + while ((bio = bio_list_pop(&zwplug->bio_list))) { + if (wp_offset >= zone_capacity || + (bio_op(bio) != REQ_OP_ZONE_APPEND && + bio_offset_from_zone_start(bio) != wp_offset)) { + blk_zone_wplug_bio_io_error(zwplug, bio); + continue; + } + + wp_offset += bio_sectors(bio); + bio_list_add(&bl, bio); + } + + bio_list_merge(&zwplug->bio_list, &bl); +} + +static inline void disk_zone_wplug_set_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) + return; /* - * Checks that the device driver indicated a valid zone size and that - * the max zone append limit is set. + * At this point, we already have a reference on the zone write plug. + * However, since we are going to add the plug to the disk zone write + * plugs work list, increase its reference count. 
This reference will + * be dropped in disk_zone_wplugs_work() once the error state is + * handled, or in disk_zone_wplug_clear_error() if the zone is reset or + * finished. */ - if (!zone_sectors || !is_power_of_2(zone_sectors)) { - pr_warn("%s: Invalid non power of two zone size (%llu)\n", - disk->disk_name, zone_sectors); - return -ENODEV; + zwplug->flags |= BLK_ZONE_WPLUG_ERROR; + atomic_inc(&zwplug->ref); + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + list_add_tail(&zwplug->link, &disk->zone_wplugs_err_list); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} + +static inline void disk_zone_wplug_clear_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) + return; + + /* + * We are racing with the error handling work which drops the reference + * on the zone write plug after handling the error state. So remove the + * plug from the error list and drop its reference count only if the + * error handling has not yet started, that is, if the zone write plug + * is still listed. + */ + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + if (!list_empty(&zwplug->link)) { + list_del_init(&zwplug->link); + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; + disk_put_zone_wplug(zwplug); } + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} - if (!q->limits.max_zone_append_sectors) { - pr_warn("%s: Invalid 0 maximum zone append limit\n", - disk->disk_name); - return -ENODEV; +/* + * Set a zone write plug write pointer offset to either 0 (zone reset case) + * or to the zone size (zone finish case). This aborts all plugged BIOs, which + * is fine to do as doing a zone reset or zone finish while writes are in-flight + * is a mistake from the user which will most likely cause all plugged BIOs to + * fail anyway. + */ +static void disk_zone_wplug_set_wp_offset(struct gendisk *disk, + struct blk_zone_wplug *zwplug, + unsigned int wp_offset) +{ + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * Make sure that a BIO completion or another zone reset or finish + * operation has not already removed the plug from the hash table. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_UNHASHED) { + spin_unlock_irqrestore(&zwplug->lock, flags); + return; } + /* Update the zone write pointer and abort all plugged BIOs. */ + zwplug->wp_offset = wp_offset; + disk_zone_wplug_abort(zwplug); + /* - * Ensure that all memory allocations in this context are done as if - * GFP_NOIO was specified. + * Updating the write pointer offset puts back the zone + * in a good state. So clear the error flag and decrement the + * error count if we were in error state. */ - args.disk = disk; - args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); - noio_flag = memalloc_noio_save(); - ret = disk->fops->report_zones(disk, 0, UINT_MAX, - blk_revalidate_zone_cb, &args); - if (!ret) { - pr_warn("%s: No zones reported\n", disk->disk_name); - ret = -ENODEV; + disk_zone_wplug_clear_error(disk, zwplug); + + /* + * The zone write plug now has no BIO plugged: remove it from the + * hash table so that it cannot be seen. The plug will be freed + * when the last reference is dropped. 
+ */ + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +static bool blk_zone_wplug_handle_reset_or_finish(struct bio *bio, + unsigned int wp_offset) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + + /* Conventional zones cannot be reset nor finished. */ + if (disk_zone_is_conv(disk, sector)) { + bio_io_error(bio); + return true; } - memalloc_noio_restore(noio_flag); /* - * If zones where reported, make sure that the entire disk capacity - * has been checked. + * If we have a zone write plug, set its write pointer offset to 0 + * (reset case) or to the zone size (finish case). This will abort all + * BIOs plugged for the target zone. It is fine as resetting or + * finishing zones while writes are still in-flight will result in the + * writes failing anyway. */ - if (ret > 0 && args.sector != capacity) { - pr_warn("%s: Missing zones from sector %llu\n", - disk->disk_name, args.sector); - ret = -ENODEV; + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + disk_zone_wplug_set_wp_offset(disk, zwplug, wp_offset); + disk_put_zone_wplug(zwplug); } + return false; +} + +static bool blk_zone_wplug_handle_reset_all(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug; + sector_t sector; + /* - * Install the new bitmaps and update nr_zones only once the queue is - * stopped and all I/Os are completed (i.e. a scheduler is not - * referencing the bitmaps). + * Set the write pointer offset of all zone write plugs to 0. This will + * abort all plugged BIOs. It is fine as resetting zones while writes + * are still in-flight will result in the writes failing anyway. */ - blk_mq_freeze_queue(q); - if (ret > 0) { - disk->nr_zones = args.nr_zones; - swap(disk->seq_zones_wlock, args.seq_zones_wlock); - swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); - if (update_driver_data) - update_driver_data(disk); - ret = 0; - } else { - pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - disk_free_zone_bitmaps(disk); + for (sector = 0; sector < get_capacity(disk); + sector += disk->queue->limits.chunk_sectors) { + zwplug = disk_get_zone_wplug(disk, sector); + if (zwplug) { + disk_zone_wplug_set_wp_offset(disk, zwplug, 0); + disk_put_zone_wplug(zwplug); + } } - blk_mq_unfreeze_queue(q); - kfree(args.seq_zones_wlock); - kfree(args.conv_zones_bitmap); - return ret; + return false; } -EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); + +static inline void blk_zone_wplug_add_bio(struct blk_zone_wplug *zwplug, + struct bio *bio, unsigned int nr_segs) +{ + /* + * Grab an extra reference on the BIO request queue usage counter. + * This reference will be reused to submit a request for the BIO for + * blk-mq devices and dropped when the BIO is failed and after + * it is issued in the case of BIO-based devices. + */ + percpu_ref_get(&bio->bi_bdev->bd_disk->queue->q_usage_counter); + + /* + * The BIO is being plugged and thus will have to wait for the on-going + * write and for all other writes already plugged. So polling makes + * no sense. + */ + bio_clear_polled(bio); + + /* + * Reuse the poll cookie field to store the number of segments when + * split to the hardware limits. + */ + bio->__bi_nr_segments = nr_segs; + + /* + * We always receive BIOs after they are split and ready to be issued. 
+ * The block layer passes the parts of a split BIO in order, and the + * user must also issue write sequentially. So simply add the new BIO + * at the tail of the list to preserve the sequential write order. + */ + bio_list_add(&zwplug->bio_list, bio); +} + +/* + * Called from bio_attempt_back_merge() when a BIO was merged with a request. + */ +void blk_zone_write_plug_bio_merged(struct bio *bio) +{ + struct blk_zone_wplug *zwplug; + unsigned long flags; + + /* + * If the BIO was already plugged, then we were called through + * blk_zone_write_plug_init_request() -> blk_attempt_bio_merge(). + * For this case, we already hold a reference on the zone write plug for + * the BIO and blk_zone_write_plug_init_request() will handle the + * zone write pointer offset update. + */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return; + + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * Get a reference on the zone write plug of the target zone and advance + * the zone write pointer offset. Given that this is a merge, we already + * have at least one request and one BIO referencing the zone write + * plug. So this should not fail. + */ + zwplug = disk_get_zone_wplug(bio->bi_bdev->bd_disk, + bio->bi_iter.bi_sector); + if (WARN_ON_ONCE(!zwplug)) + return; + + spin_lock_irqsave(&zwplug->lock, flags); + zwplug->wp_offset += bio_sectors(bio); + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +/* + * Attempt to merge plugged BIOs with a newly prepared request for a BIO that + * already went through zone write plugging (either a new BIO or one that was + * unplugged). + */ +void blk_zone_write_plug_init_request(struct request *req) +{ + sector_t req_back_sector = blk_rq_pos(req) + blk_rq_sectors(req); + struct request_queue *q = req->q; + struct gendisk *disk = q->disk; + unsigned int zone_capacity = disk->zone_capacity; + struct blk_zone_wplug *zwplug = + disk_get_zone_wplug(disk, blk_rq_pos(req)); + unsigned long flags; + struct bio *bio; + + if (WARN_ON_ONCE(!zwplug)) + return; + + /* + * Indicate that completion of this request needs to be handled with + * blk_zone_write_plug_finish_request(), which will drop the reference + * on the zone write plug we took above on entry to this function. + */ + req->rq_flags |= RQF_ZONE_WRITE_PLUGGING; + + if (blk_queue_nomerges(q)) + return; + + /* + * Walk through the list of plugged BIOs to check if they can be merged + * into the back of the request. + */ + spin_lock_irqsave(&zwplug->lock, flags); + while (zwplug->wp_offset < zone_capacity) { + bio = bio_list_peek(&zwplug->bio_list); + if (!bio) + break; + + if (bio->bi_iter.bi_sector != req_back_sector || + !blk_rq_merge_ok(req, bio)) + break; + + WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE_ZEROES && + !bio->__bi_nr_segments); + + bio_list_pop(&zwplug->bio_list); + if (bio_attempt_back_merge(req, bio, bio->__bi_nr_segments) != + BIO_MERGE_OK) { + bio_list_add_head(&zwplug->bio_list, bio); + break; + } + + /* + * Drop the extra reference on the queue usage we got when + * plugging the BIO and advance the write pointer offset. + */ + blk_queue_exit(q); + zwplug->wp_offset += bio_sectors(bio); + + req_back_sector += bio_sectors(bio); + } + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +/* + * Check and prepare a BIO for submission by incrementing the write pointer + * offset of its zone write plug and changing zone append operations into + * regular write when zone append emulation is needed. 
+ */ +static bool blk_zone_wplug_prepare_bio(struct blk_zone_wplug *zwplug, + struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + + /* + * Check that the user is not attempting to write to a full zone. + * We know such BIO will fail, and that would potentially overflow our + * write pointer offset beyond the end of the zone. + */ + if (zwplug->wp_offset >= disk->zone_capacity) + goto err; + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + /* + * Use a regular write starting at the current write pointer. + * Similarly to native zone append operations, do not allow + * merging. + */ + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= REQ_OP_WRITE | REQ_NOMERGE; + bio->bi_iter.bi_sector += zwplug->wp_offset; + + /* + * Remember that this BIO is in fact a zone append operation + * so that we can restore its operation code on completion. + */ + bio_set_flag(bio, BIO_EMULATES_ZONE_APPEND); + } else { + /* + * Check for non-sequential writes early because we avoid a + * whole lot of error handling trouble if we don't send it off + * to the driver. + */ + if (bio_offset_from_zone_start(bio) != zwplug->wp_offset) + goto err; + } + + /* Advance the zone write pointer offset. */ + zwplug->wp_offset += bio_sectors(bio); + + return true; + +err: + /* We detected an invalid write BIO: schedule error recovery. */ + disk_zone_wplug_set_error(disk, zwplug); + kblockd_schedule_work(&disk->zone_wplugs_work); + return false; +} + +static bool blk_zone_wplug_handle_write(struct bio *bio, unsigned int nr_segs) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + sector_t sector = bio->bi_iter.bi_sector; + struct blk_zone_wplug *zwplug; + gfp_t gfp_mask = GFP_NOIO; + unsigned long flags; + + /* + * BIOs must be fully contained within a zone so that we use the correct + * zone write plug for the entire BIO. For blk-mq devices, the block + * layer should already have done any splitting required to ensure this + * and this BIO should thus not be straddling zone boundaries. For + * BIO-based devices, it is the responsibility of the driver to split + * the bio before submitting it. + */ + if (WARN_ON_ONCE(bio_straddles_zones(bio))) { + bio_io_error(bio); + return true; + } + + /* Conventional zones do not need write plugging. */ + if (disk_zone_is_conv(disk, sector)) { + /* Zone append to conventional zones is not allowed. */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + bio_io_error(bio); + return true; + } + return false; + } + + if (bio->bi_opf & REQ_NOWAIT) + gfp_mask = GFP_NOWAIT; + + zwplug = disk_get_and_lock_zone_wplug(disk, sector, gfp_mask, &flags); + if (!zwplug) { + bio_io_error(bio); + return true; + } + + /* Indicate that this BIO is being handled using zone write plugging. */ + bio_set_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * If the zone is already plugged or has a pending error, add the BIO + * to the plug BIO list. Otherwise, plug and let the BIO execute. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_BUSY) + goto plug; + + /* + * If an error is detected when preparing the BIO, add it to the BIO + * list so that error recovery can deal with it. 
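/*
 * Worked example, not part of this patch, of the emulation performed by
 * blk_zone_wplug_prepare_bio() above: a zone append BIO is submitted with
 * its sector set to the start of the target zone (Z); while the plug tracks
 * write pointer offset W, the BIO is re-issued as
 *
 *	bio->bi_opf:            REQ_OP_ZONE_APPEND  ->  REQ_OP_WRITE | REQ_NOMERGE
 *	bio->bi_iter.bi_sector: Z                   ->  Z + W
 *
 * with BIO_EMULATES_ZONE_APPEND set so that completion handling (see
 * blk_zone_write_plug_bio_endio() below) can restore the original operation
 * code once the write finishes.
 */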
+ */ + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) + goto plug; + + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + + spin_unlock_irqrestore(&zwplug->lock, flags); + + return false; + +plug: + zwplug->flags |= BLK_ZONE_WPLUG_PLUGGED; + blk_zone_wplug_add_bio(zwplug, bio, nr_segs); + + spin_unlock_irqrestore(&zwplug->lock, flags); + + return true; +} + +/** + * blk_zone_plug_bio - Handle a zone write BIO with zone write plugging + * @bio: The BIO being submitted + * @nr_segs: The number of physical segments of @bio + * + * Handle write, write zeroes and zone append operations requiring emulation + * using zone write plugging. + * + * Return true whenever @bio execution needs to be delayed through the zone + * write plug. Otherwise, return false to let the submission path process + * @bio normally. + */ +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + struct block_device *bdev = bio->bi_bdev; + + if (!bdev->bd_disk->zone_wplugs_hash) + return false; + + /* + * If the BIO already has the plugging flag set, then it was already + * handled through this path and this is a submission from the zone + * plug bio submit work. + */ + if (bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING)) + return false; + + /* + * We do not need to do anything special for empty flush BIOs, e.g + * BIOs such as issued by blkdev_issue_flush(). The is because it is + * the responsibility of the user to first wait for the completion of + * write operations for flush to have any effect on the persistence of + * the written data. + */ + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + + /* + * Regular writes and write zeroes need to be handled through the target + * zone write plug. This includes writes with REQ_FUA | REQ_PREFLUSH + * which may need to go through the flush machinery depending on the + * target device capabilities. Plugging such writes is fine as the flush + * machinery operates at the request level, below the plug, and + * completion of the flush sequence will go through the regular BIO + * completion, which will handle zone write plugging. + * Zone append operations for devices that requested emulation must + * also be plugged so that these BIOs can be changed into regular + * write BIOs. + * Zone reset, reset all and finish commands need special treatment + * to correctly track the write pointer offset of zones. These commands + * are not plugged as we do not need serialization with write + * operations. It is the responsibility of the user to not issue reset + * and finish commands when write operations are in flight. + */ + switch (bio_op(bio)) { + case REQ_OP_ZONE_APPEND: + if (!bdev_emulates_zone_append(bdev)) + return false; + fallthrough; + case REQ_OP_WRITE: + case REQ_OP_WRITE_ZEROES: + return blk_zone_wplug_handle_write(bio, nr_segs); + case REQ_OP_ZONE_RESET: + return blk_zone_wplug_handle_reset_or_finish(bio, 0); + case REQ_OP_ZONE_FINISH: + return blk_zone_wplug_handle_reset_or_finish(bio, + bdev_zone_sectors(bdev)); + case REQ_OP_ZONE_RESET_ALL: + return blk_zone_wplug_handle_reset_all(bio); + default: + return false; + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_zone_plug_bio); + +static void disk_zone_wplug_schedule_bio_work(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + /* + * Take a reference on the zone write plug and schedule the submission + * of the next plugged BIO. blk_zone_wplug_bio_work() will release the + * reference we take here. 
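/*
 * Hypothetical caller sketch, not part of this patch: how a BIO-based zoned
 * driver's submit path would be expected to use the blk_zone_plug_bio()
 * entry point above. The segment count is only consumed by request merging
 * on blk-mq devices, so a BIO-based caller is assumed here to pass 0.
 */
static void sketch_driver_submit_bio(struct bio *bio)
{
	if (blk_zone_plug_bio(bio, 0))
		return;	/* plugged: resubmitted later by the zone wplug work */

	/* ... hand @bio to the underlying device ... */
}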
+ */ + WARN_ON_ONCE(!(zwplug->flags & BLK_ZONE_WPLUG_PLUGGED)); + atomic_inc(&zwplug->ref); + queue_work(disk->zone_wplugs_wq, &zwplug->bio_work); +} + +static void disk_zone_wplug_unplug_bio(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + unsigned long flags; + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * If we had an error, schedule error recovery. The recovery work + * will restart submission of plugged BIOs. + */ + if (zwplug->flags & BLK_ZONE_WPLUG_ERROR) { + spin_unlock_irqrestore(&zwplug->lock, flags); + kblockd_schedule_work(&disk->zone_wplugs_work); + return; + } + + /* Schedule submission of the next plugged BIO if we have one. */ + if (!bio_list_empty(&zwplug->bio_list)) { + disk_zone_wplug_schedule_bio_work(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + return; + } + + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + + /* + * If the zone is full (it was fully written or finished, or empty + * (it was reset), remove its zone write plug from the hash table. + */ + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +void blk_zone_write_plug_bio_endio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_bdev->bd_disk; + struct blk_zone_wplug *zwplug = + disk_get_zone_wplug(disk, bio->bi_iter.bi_sector); + unsigned long flags; + + if (WARN_ON_ONCE(!zwplug)) + return; + + /* Make sure we do not see this BIO again by clearing the plug flag. */ + bio_clear_flag(bio, BIO_ZONE_WRITE_PLUGGING); + + /* + * If this is a regular write emulating a zone append operation, + * restore the original operation code. + */ + if (bio_flagged(bio, BIO_EMULATES_ZONE_APPEND)) { + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= REQ_OP_ZONE_APPEND; + } + + /* + * If the BIO failed, mark the plug as having an error to trigger + * recovery. + */ + if (bio->bi_status != BLK_STS_OK) { + spin_lock_irqsave(&zwplug->lock, flags); + disk_zone_wplug_set_error(disk, zwplug); + spin_unlock_irqrestore(&zwplug->lock, flags); + } + + /* Drop the reference we took when the BIO was issued. */ + disk_put_zone_wplug(zwplug); + + /* + * For BIO-based devices, blk_zone_write_plug_finish_request() + * is not called. So we need to schedule execution of the next + * plugged BIO here. + */ + if (bio->bi_bdev->bd_has_submit_bio) + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* Drop the reference we took when entering this function. */ + disk_put_zone_wplug(zwplug); +} + +void blk_zone_write_plug_finish_request(struct request *req) +{ + struct gendisk *disk = req->q->disk; + struct blk_zone_wplug *zwplug; + + zwplug = disk_get_zone_wplug(disk, req->__sector); + if (WARN_ON_ONCE(!zwplug)) + return; + + req->rq_flags &= ~RQF_ZONE_WRITE_PLUGGING; + + /* + * Drop the reference we took when the request was initialized in + * blk_zone_write_plug_init_request(). + */ + disk_put_zone_wplug(zwplug); + + disk_zone_wplug_unplug_bio(disk, zwplug); + + /* Drop the reference we took when entering this function. */ + disk_put_zone_wplug(zwplug); +} + +static void blk_zone_wplug_bio_work(struct work_struct *work) +{ + struct blk_zone_wplug *zwplug = + container_of(work, struct blk_zone_wplug, bio_work); + struct block_device *bdev; + unsigned long flags; + struct bio *bio; + + /* + * Submit the next plugged BIO. If we do not have any, clear + * the plugged flag. 
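/*
 * Editorial summary, not part of this patch: which completion path kicks
 * the next plugged BIO.
 *  - blk-mq devices: blk_zone_write_plug_finish_request() above runs at
 *    request completion, drops the request reference and calls
 *    disk_zone_wplug_unplug_bio().
 *  - BIO-based devices: no request exists, so
 *    blk_zone_write_plug_bio_endio() calls disk_zone_wplug_unplug_bio()
 *    itself (the bd_has_submit_bio check above).
 */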
+ */ + spin_lock_irqsave(&zwplug->lock, flags); + + bio = bio_list_pop(&zwplug->bio_list); + if (!bio) { + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + spin_unlock_irqrestore(&zwplug->lock, flags); + goto put_zwplug; + } + + if (!blk_zone_wplug_prepare_bio(zwplug, bio)) { + /* Error recovery will decide what to do with the BIO. */ + bio_list_add_head(&zwplug->bio_list, bio); + spin_unlock_irqrestore(&zwplug->lock, flags); + goto put_zwplug; + } + + spin_unlock_irqrestore(&zwplug->lock, flags); + + bdev = bio->bi_bdev; + submit_bio_noacct_nocheck(bio); + + /* + * blk-mq devices will reuse the extra reference on the request queue + * usage counter we took when the BIO was plugged, but the submission + * path for BIO-based devices will not do that. So drop this extra + * reference here. + */ + if (bdev->bd_has_submit_bio) + blk_queue_exit(bdev->bd_disk->queue); + +put_zwplug: + /* Drop the reference we took in disk_zone_wplug_schedule_bio_work(). */ + disk_put_zone_wplug(zwplug); +} + +static unsigned int blk_zone_wp_offset(struct blk_zone *zone) +{ + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return zone->wp - zone->start; + case BLK_ZONE_COND_FULL: + return zone->len; + case BLK_ZONE_COND_EMPTY: + return 0; + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + default: + /* + * Conventional, offline and read-only zones do not have a valid + * write pointer. + */ + return UINT_MAX; + } +} + +static int blk_zone_wplug_report_zone_cb(struct blk_zone *zone, + unsigned int idx, void *data) +{ + struct blk_zone *zonep = data; + + *zonep = *zone; + return 0; +} + +static void disk_zone_wplug_handle_error(struct gendisk *disk, + struct blk_zone_wplug *zwplug) +{ + sector_t zone_start_sector = + bdev_zone_sectors(disk->part0) * zwplug->zone_no; + unsigned int noio_flag; + struct blk_zone zone; + unsigned long flags; + int ret; + + /* Get the current zone information from the device. */ + noio_flag = memalloc_noio_save(); + ret = disk->fops->report_zones(disk, zone_start_sector, 1, + blk_zone_wplug_report_zone_cb, &zone); + memalloc_noio_restore(noio_flag); + + spin_lock_irqsave(&zwplug->lock, flags); + + /* + * A zone reset or finish may have cleared the error already. In such + * case, do nothing as the report zones may have seen the "old" write + * pointer value before the reset/finish operation completed. + */ + if (!(zwplug->flags & BLK_ZONE_WPLUG_ERROR)) + goto unlock; + + zwplug->flags &= ~BLK_ZONE_WPLUG_ERROR; + + if (ret != 1) { + /* + * We failed to get the zone information, meaning that something + * is likely really wrong with the device. Abort all remaining + * plugged BIOs as otherwise we could endup waiting forever on + * plugged BIOs to complete if there is a queue freeze on-going. + */ + disk_zone_wplug_abort(zwplug); + goto unplug; + } + + /* Update the zone write pointer offset. */ + zwplug->wp_offset = blk_zone_wp_offset(&zone); + disk_zone_wplug_abort_unaligned(disk, zwplug); + + /* Restart BIO submission if we still have any BIO left. 
*/ + if (!bio_list_empty(&zwplug->bio_list)) { + disk_zone_wplug_schedule_bio_work(disk, zwplug); + goto unlock; + } + +unplug: + zwplug->flags &= ~BLK_ZONE_WPLUG_PLUGGED; + if (disk_should_remove_zone_wplug(disk, zwplug)) + disk_remove_zone_wplug(disk, zwplug); + +unlock: + spin_unlock_irqrestore(&zwplug->lock, flags); +} + +static void disk_zone_wplugs_work(struct work_struct *work) +{ + struct gendisk *disk = + container_of(work, struct gendisk, zone_wplugs_work); + struct blk_zone_wplug *zwplug; + unsigned long flags; + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + + while (!list_empty(&disk->zone_wplugs_err_list)) { + zwplug = list_first_entry(&disk->zone_wplugs_err_list, + struct blk_zone_wplug, link); + list_del_init(&zwplug->link); + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); + + disk_zone_wplug_handle_error(disk, zwplug); + disk_put_zone_wplug(zwplug); + + spin_lock_irqsave(&disk->zone_wplugs_lock, flags); + } + + spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags); +} + +static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk) +{ + return 1U << disk->zone_wplugs_hash_bits; +} + +void disk_init_zone_resources(struct gendisk *disk) +{ + spin_lock_init(&disk->zone_wplugs_lock); + INIT_LIST_HEAD(&disk->zone_wplugs_err_list); + INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work); +} + +/* + * For the size of a disk zone write plug hash table, use the size of the + * zone write plug mempool, which is the maximum of the disk open zones and + * active zones limits. But do not exceed 4KB (512 hlist head entries), that is, + * 9 bits. For a disk that has no limits, mempool size defaults to 128. + */ +#define BLK_ZONE_WPLUG_MAX_HASH_BITS 9 +#define BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE 128 + +static int disk_alloc_zone_resources(struct gendisk *disk, + unsigned int pool_size) +{ + unsigned int i; + + disk->zone_wplugs_hash_bits = + min(ilog2(pool_size) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS); + + disk->zone_wplugs_hash = + kcalloc(disk_zone_wplugs_hash_size(disk), + sizeof(struct hlist_head), GFP_KERNEL); + if (!disk->zone_wplugs_hash) + return -ENOMEM; + + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) + INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]); + + disk->zone_wplugs_pool = mempool_create_kmalloc_pool(pool_size, + sizeof(struct blk_zone_wplug)); + if (!disk->zone_wplugs_pool) + goto free_hash; + + disk->zone_wplugs_wq = + alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI, + pool_size, disk->disk_name); + if (!disk->zone_wplugs_wq) + goto destroy_pool; + + return 0; + +destroy_pool: + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; +free_hash: + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; + return -ENOMEM; +} + +static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk) +{ + struct blk_zone_wplug *zwplug; + unsigned int i; + + if (!disk->zone_wplugs_hash) + return; + + /* Free all the zone write plugs we have. 
*/ + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + while (!hlist_empty(&disk->zone_wplugs_hash[i])) { + zwplug = hlist_entry(disk->zone_wplugs_hash[i].first, + struct blk_zone_wplug, node); + atomic_inc(&zwplug->ref); + disk_remove_zone_wplug(disk, zwplug); + disk_put_zone_wplug(zwplug); + } + } + + kfree(disk->zone_wplugs_hash); + disk->zone_wplugs_hash = NULL; + disk->zone_wplugs_hash_bits = 0; +} + +void disk_free_zone_resources(struct gendisk *disk) +{ + cancel_work_sync(&disk->zone_wplugs_work); + + if (disk->zone_wplugs_wq) { + destroy_workqueue(disk->zone_wplugs_wq); + disk->zone_wplugs_wq = NULL; + } + + disk_destroy_zone_wplugs_hash_table(disk); + + /* + * Wait for the zone write plugs to be RCU-freed before + * destorying the mempool. + */ + rcu_barrier(); + + mempool_destroy(disk->zone_wplugs_pool); + disk->zone_wplugs_pool = NULL; + + kfree(disk->conv_zones_bitmap); + disk->conv_zones_bitmap = NULL; + disk->zone_capacity = 0; + disk->nr_zones = 0; +} + +static inline bool disk_need_zone_resources(struct gendisk *disk) +{ + /* + * All mq zoned devices need zone resources so that the block layer + * can automatically handle write BIO plugging. BIO-based device drivers + * (e.g. DM devices) are normally responsible for handling zone write + * ordering and do not need zone resources, unless the driver requires + * zone append emulation. + */ + return queue_is_mq(disk->queue) || + queue_emulates_zone_append(disk->queue); +} + +static int disk_revalidate_zone_resources(struct gendisk *disk, + unsigned int nr_zones) +{ + struct queue_limits *lim = &disk->queue->limits; + unsigned int pool_size; + + if (!disk_need_zone_resources(disk)) + return 0; + + /* + * If the device has no limit on the maximum number of open and active + * zones, use BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE. + */ + pool_size = max(lim->max_open_zones, lim->max_active_zones); + if (!pool_size) + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones); + + if (!disk->zone_wplugs_hash) + return disk_alloc_zone_resources(disk, pool_size); + + return 0; +} + +struct blk_revalidate_zone_args { + struct gendisk *disk; + unsigned long *conv_zones_bitmap; + unsigned int nr_zones; + unsigned int zone_capacity; + sector_t sector; +}; + +/* + * Update the disk zone resources information and device queue limits. + * The disk queue is frozen when this is executed. + */ +static int disk_update_zone_resources(struct gendisk *disk, + struct blk_revalidate_zone_args *args) +{ + struct request_queue *q = disk->queue; + unsigned int nr_seq_zones, nr_conv_zones = 0; + unsigned int pool_size; + struct queue_limits lim; + + disk->nr_zones = args->nr_zones; + disk->zone_capacity = args->zone_capacity; + swap(disk->conv_zones_bitmap, args->conv_zones_bitmap); + if (disk->conv_zones_bitmap) + nr_conv_zones = bitmap_weight(disk->conv_zones_bitmap, + disk->nr_zones); + if (nr_conv_zones >= disk->nr_zones) { + pr_warn("%s: Invalid number of conventional zones %u / %u\n", + disk->disk_name, nr_conv_zones, disk->nr_zones); + return -ENODEV; + } + + if (!disk->zone_wplugs_pool) + return 0; + + /* + * If the device has no limit on the maximum number of open and active + * zones, set its max open zone limit to the mempool size to indicate + * to the user that there is a potential performance impact due to + * dynamic zone write plug allocation when simultaneously writing to + * more zones than the size of the mempool. 
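/*
 * Worked example, not part of this patch, for the sizing rules described in
 * the comment above and implemented just below, assuming a disk with 1000
 * sequential zones, no conventional zones and no open/active zone limits:
 *
 *	initial pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, 1000) = 128
 *	hash_bits         = min(ilog2(128) + 1, BLK_ZONE_WPLUG_MAX_HASH_BITS)
 *	                  = min(8, 9) = 8, i.e. 256 hash buckets
 *	resized pool_size = min(128, nr_seq_zones = 1000) = 128
 *	reported max_open_zones = 128 (since 128 < 1000 and no limits were
 *	set), signalling that writing to more than 128 zones simultaneously
 *	falls back to dynamic zone write plug allocation.
 */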
+ */ + lim = queue_limits_start_update(q); + + nr_seq_zones = disk->nr_zones - nr_conv_zones; + pool_size = max(lim.max_open_zones, lim.max_active_zones); + if (!pool_size) + pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_seq_zones); + + mempool_resize(disk->zone_wplugs_pool, pool_size); + + if (!lim.max_open_zones && !lim.max_active_zones) { + if (pool_size < nr_seq_zones) + lim.max_open_zones = pool_size; + else + lim.max_open_zones = 0; + } + + return queue_limits_commit_update(q, &lim); +} + +static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + struct gendisk *disk = args->disk; + struct request_queue *q = disk->queue; + + if (zone->capacity != zone->len) { + pr_warn("%s: Invalid conventional zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + if (!disk_need_zone_resources(disk)) + return 0; + + if (!args->conv_zones_bitmap) { + args->conv_zones_bitmap = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->conv_zones_bitmap) + return -ENOMEM; + } + + set_bit(idx, args->conv_zones_bitmap); + + return 0; +} + +static int blk_revalidate_seq_zone(struct blk_zone *zone, unsigned int idx, + struct blk_revalidate_zone_args *args) +{ + struct gendisk *disk = args->disk; + struct blk_zone_wplug *zwplug; + unsigned int wp_offset; + unsigned long flags; + + /* + * Remember the capacity of the first sequential zone and check + * if it is constant for all zones. + */ + if (!args->zone_capacity) + args->zone_capacity = zone->capacity; + if (zone->capacity != args->zone_capacity) { + pr_warn("%s: Invalid variable zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + /* + * We need to track the write pointer of all zones that are not + * empty nor full. So make sure we have a zone write plug for + * such zone if the device has a zone write plug hash table. + */ + if (!disk->zone_wplugs_hash) + return 0; + + wp_offset = blk_zone_wp_offset(zone); + if (!wp_offset || wp_offset >= zone->capacity) + return 0; + + zwplug = disk_get_and_lock_zone_wplug(disk, zone->wp, GFP_NOIO, &flags); + if (!zwplug) + return -ENOMEM; + spin_unlock_irqrestore(&zwplug->lock, flags); + disk_put_zone_wplug(zwplug); + + return 0; +} + +/* + * Helper function to check the validity of zones of a zoned block device. + */ +static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct blk_revalidate_zone_args *args = data; + struct gendisk *disk = args->disk; + sector_t capacity = get_capacity(disk); + sector_t zone_sectors = disk->queue->limits.chunk_sectors; + int ret; + + /* Check for bad zones and holes in the zone report */ + if (zone->start != args->sector) { + pr_warn("%s: Zone gap at sectors %llu..%llu\n", + disk->disk_name, args->sector, zone->start); + return -ENODEV; + } + + if (zone->start >= capacity || !zone->len) { + pr_warn("%s: Invalid zone start %llu, length %llu\n", + disk->disk_name, zone->start, zone->len); + return -ENODEV; + } + + /* + * All zones must have the same size, with the exception on an eventual + * smaller last zone. 
+ */ + if (zone->start + zone->len < capacity) { + if (zone->len != zone_sectors) { + pr_warn("%s: Invalid zoned device with non constant zone size\n", + disk->disk_name); + return -ENODEV; + } + } else if (zone->len > zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); + return -ENODEV; + } + + if (!zone->capacity || zone->capacity > zone->len) { + pr_warn("%s: Invalid zone capacity\n", + disk->disk_name); + return -ENODEV; + } + + /* Check zone type */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + ret = blk_revalidate_conv_zone(zone, idx, args); + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + ret = blk_revalidate_seq_zone(zone, idx, args); + break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: + default: + pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", + disk->disk_name, (int)zone->type, zone->start); + ret = -ENODEV; + } + + if (!ret) + args->sector += zone->len; + + return ret; +} + +/** + * blk_revalidate_disk_zones - (re)allocate and initialize zone write plugs + * @disk: Target disk + * + * Helper function for low-level device drivers to check, (re) allocate and + * initialize resources used for managing zoned disks. This function should + * normally be called by blk-mq based drivers when a zoned gendisk is probed + * and when the zone configuration of the gendisk changes (e.g. after a format). + * Before calling this function, the device driver must already have set the + * device zone size (chunk_sector limit) and the max zone append limit. + * BIO based drivers can also use this function as long as the device queue + * can be safely frozen. + */ +int blk_revalidate_disk_zones(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + sector_t zone_sectors = q->limits.chunk_sectors; + sector_t capacity = get_capacity(disk); + struct blk_revalidate_zone_args args = { }; + unsigned int noio_flag; + int ret = -ENOMEM; + + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return -EIO; + + if (!capacity) + return -ENODEV; + + /* + * Checks that the device driver indicated a valid zone size and that + * the max zone append limit is set. + */ + if (!zone_sectors || !is_power_of_2(zone_sectors)) { + pr_warn("%s: Invalid non power of two zone size (%llu)\n", + disk->disk_name, zone_sectors); + return -ENODEV; + } + + if (!queue_max_zone_append_sectors(q)) { + pr_warn("%s: Invalid 0 maximum zone append limit\n", + disk->disk_name); + return -ENODEV; + } + + /* + * Ensure that all memory allocations in this context are done as if + * GFP_NOIO was specified. + */ + args.disk = disk; + args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); + noio_flag = memalloc_noio_save(); + ret = disk_revalidate_zone_resources(disk, args.nr_zones); + if (ret) { + memalloc_noio_restore(noio_flag); + return ret; + } + ret = disk->fops->report_zones(disk, 0, UINT_MAX, + blk_revalidate_zone_cb, &args); + if (!ret) { + pr_warn("%s: No zones reported\n", disk->disk_name); + ret = -ENODEV; + } + memalloc_noio_restore(noio_flag); + + /* + * If zones were reported, make sure that the entire disk capacity + * has been checked. + */ + if (ret > 0 && args.sector != capacity) { + pr_warn("%s: Missing zones from sector %llu\n", + disk->disk_name, args.sector); + ret = -ENODEV; + } + + /* + * Set the new disk zone parameters only once the queue is frozen and + * all I/Os are completed. 
+ */ + blk_mq_freeze_queue(q); + if (ret > 0) + ret = disk_update_zone_resources(disk, &args); + else + pr_warn("%s: failed to revalidate zones\n", disk->disk_name); + if (ret) + disk_free_zone_resources(disk); + blk_mq_unfreeze_queue(q); + + kfree(args.conv_zones_bitmap); + + return ret; +} +EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); + +#ifdef CONFIG_BLK_DEBUG_FS + +int queue_zone_wplugs_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct gendisk *disk = q->disk; + struct blk_zone_wplug *zwplug; + unsigned int zwp_wp_offset, zwp_flags; + unsigned int zwp_zone_no, zwp_ref; + unsigned int zwp_bio_list_size, i; + unsigned long flags; + + if (!disk->zone_wplugs_hash) + return 0; + + rcu_read_lock(); + for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++) { + hlist_for_each_entry_rcu(zwplug, + &disk->zone_wplugs_hash[i], node) { + spin_lock_irqsave(&zwplug->lock, flags); + zwp_zone_no = zwplug->zone_no; + zwp_flags = zwplug->flags; + zwp_ref = atomic_read(&zwplug->ref); + zwp_wp_offset = zwplug->wp_offset; + zwp_bio_list_size = bio_list_size(&zwplug->bio_list); + spin_unlock_irqrestore(&zwplug->lock, flags); + + seq_printf(m, "%u 0x%x %u %u %u\n", + zwp_zone_no, zwp_flags, zwp_ref, + zwp_wp_offset, zwp_bio_list_size); + } + } + rcu_read_unlock(); + + return 0; +} + +#endif diff --git a/block/blk.h b/block/blk.h index d9f584984bc4..6e94c10af798 100644 --- a/block/blk.h +++ b/block/blk.h @@ -38,6 +38,7 @@ void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); int __bio_queue_enter(struct request_queue *q, struct bio *bio); void submit_bio_noacct_nocheck(struct bio *bio); +void bio_await_chain(struct bio *bio); static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) { @@ -269,6 +270,14 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); +enum bio_merge_status { + BIO_MERGE_OK, + BIO_MERGE_NONE, + BIO_MERGE_FAILED, +}; + +enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, unsigned int nr_segs); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, @@ -357,6 +366,7 @@ static inline bool blk_do_io_stat(struct request *rq) } void update_io_ticks(struct block_device *part, unsigned long now, bool end); +unsigned int part_in_flight(struct block_device *part); static inline void req_set_nomerge(struct request_queue *q, struct request *req) { @@ -378,17 +388,6 @@ static inline void ioc_clear_queue(struct request_queue *q) } #endif /* CONFIG_BLK_ICQ */ -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); -extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, - const char *page, size_t count); -extern void blk_throtl_bio_endio(struct bio *bio); -extern void blk_throtl_stat_add(struct request *rq, u64 time); -#else -static inline void blk_throtl_bio_endio(struct bio *bio) { } -static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } -#endif - struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q); static inline bool blk_queue_may_bounce(struct request_queue *q) @@ -407,13 +406,85 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, } #ifdef CONFIG_BLK_DEV_ZONED -void disk_free_zone_bitmaps(struct gendisk *disk); +void 
disk_init_zone_resources(struct gendisk *disk); +void disk_free_zone_resources(struct gendisk *disk); +static inline bool bio_zone_write_plugging(struct bio *bio) +{ + return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); +} +static inline bool bio_is_zone_append(struct bio *bio) +{ + return bio_op(bio) == REQ_OP_ZONE_APPEND || + bio_flagged(bio, BIO_EMULATES_ZONE_APPEND); +} +void blk_zone_write_plug_bio_merged(struct bio *bio); +void blk_zone_write_plug_init_request(struct request *rq); +static inline void blk_zone_update_request_bio(struct request *rq, + struct bio *bio) +{ + /* + * For zone append requests, the request sector indicates the location + * at which the BIO data was written. Return this value to the BIO + * issuer through the BIO iter sector. + * For plugged zone writes, which include emulated zone append, we need + * the original BIO sector so that blk_zone_write_plug_bio_endio() can + * lookup the zone write plug. + */ + if (req_op(rq) == REQ_OP_ZONE_APPEND || bio_zone_write_plugging(bio)) + bio->bi_iter.bi_sector = rq->__sector; +} +void blk_zone_write_plug_bio_endio(struct bio *bio); +static inline void blk_zone_bio_endio(struct bio *bio) +{ + /* + * For write BIOs to zoned devices, signal the completion of the BIO so + * that the next write BIO can be submitted by zone write plugging. + */ + if (bio_zone_write_plugging(bio)) + blk_zone_write_plug_bio_endio(bio); +} + +void blk_zone_write_plug_finish_request(struct request *rq); +static inline void blk_zone_finish_request(struct request *rq) +{ + if (rq->rq_flags & RQF_ZONE_WRITE_PLUGGING) + blk_zone_write_plug_finish_request(rq); +} int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ -static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} +static inline void disk_init_zone_resources(struct gendisk *disk) +{ +} +static inline void disk_free_zone_resources(struct gendisk *disk) +{ +} +static inline bool bio_zone_write_plugging(struct bio *bio) +{ + return false; +} +static inline bool bio_is_zone_append(struct bio *bio) +{ + return false; +} +static inline void blk_zone_write_plug_bio_merged(struct bio *bio) +{ +} +static inline void blk_zone_write_plug_init_request(struct request *rq) +{ +} +static inline void blk_zone_update_request_bio(struct request *rq, + struct bio *bio) +{ +} +static inline void blk_zone_bio_endio(struct bio *bio) +{ +} +static inline void blk_zone_finish_request(struct request *rq) +{ +} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { diff --git a/block/elevator.c b/block/elevator.c index 5ff093cb3cf8..f64ebd726e58 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -83,13 +83,6 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static inline bool elv_support_features(struct request_queue *q, - const struct elevator_type *e) -{ - return (q->required_elevator_features & e->elevator_features) == - q->required_elevator_features; -} - /** * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test @@ -120,7 +113,7 @@ static struct elevator_type *elevator_find_get(struct request_queue *q, spin_lock(&elv_list_lock); e = __elevator_find(name); - if (e && (!elv_support_features(q, e) || !elevator_tryget(e))) + if (e && (!elevator_tryget(e))) e = NULL; 
spin_unlock(&elv_list_lock); return e; @@ -580,34 +573,8 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) } /* - * Get the first elevator providing the features required by the request queue. - * Default to "none" if no matching elevator is found. - */ -static struct elevator_type *elevator_get_by_features(struct request_queue *q) -{ - struct elevator_type *e, *found = NULL; - - spin_lock(&elv_list_lock); - - list_for_each_entry(e, &elv_list, list) { - if (elv_support_features(q, e)) { - found = e; - break; - } - } - - if (found && !elevator_tryget(found)) - found = NULL; - - spin_unlock(&elv_list_lock); - return found; -} - -/* - * For a device queue that has no required features, use the default elevator - * settings. Otherwise, use the first elevator available matching the required - * features. If no suitable elevator is find or if the chosen elevator - * initialization fails, fall back to the "none" elevator (no elevator). + * Use the default elevator settings. If the chosen elevator initialization + * fails, fall back to the "none" elevator (no elevator). */ void elevator_init_mq(struct request_queue *q) { @@ -622,10 +589,7 @@ void elevator_init_mq(struct request_queue *q) if (unlikely(q->elevator)) return; - if (!q->required_elevator_features) - e = elevator_get_default(q); - else - e = elevator_get_by_features(q); + e = elevator_get_default(q); if (!e) return; @@ -781,7 +745,7 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) list_for_each_entry(e, &elv_list, list) { if (e == cur) len += sprintf(name+len, "[%s] ", e->elevator_name); - else if (elv_support_features(q, e)) + else len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); diff --git a/block/elevator.h b/block/elevator.h index 7ca3d7b6ed82..e9a050a96e53 100644 --- a/block/elevator.h +++ b/block/elevator.h @@ -74,7 +74,6 @@ struct elevator_type struct elv_fs_entry *elevator_attrs; const char *elevator_name; const char *elevator_alias; - const unsigned int elevator_features; struct module *elevator_owner; #ifdef CONFIG_BLK_DEBUG_FS const struct blk_mq_debugfs_attr *queue_debugfs_attrs; diff --git a/block/fops.c b/block/fops.c index af6c244314af..7a163f7fe2d8 100644 --- a/block/fops.c +++ b/block/fops.c @@ -44,18 +44,15 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, #define DIO_INLINE_BIO_VECS 4 static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, - struct iov_iter *iter, unsigned int nr_pages) + struct iov_iter *iter, struct block_device *bdev, + unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { @@ -161,9 +158,8 @@ static void blkdev_bio_end_io(struct bio *bio) } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - unsigned int nr_pages) + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; @@ -172,9 +168,6 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = 
bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -302,9 +295,9 @@ static void blkdev_bio_end_io_async(struct bio *bio) static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, + struct block_device *bdev, unsigned int nr_pages) { - struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; @@ -312,9 +305,6 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, loff_t pos = iocb->ki_pos; int ret = 0; - if (blkdev_dio_unaligned(bdev, pos, iter)) - return -EINVAL; - if (iocb->ki_flags & IOCB_ALLOC_CACHE) opf |= REQ_ALLOC_CACHE; bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, @@ -368,18 +358,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); unsigned int nr_pages; if (!iov_iter_count(iter)) return 0; + if (blkdev_dio_unaligned(bdev, iocb->ki_pos, iter)) + return -EINVAL; + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); if (likely(nr_pages <= BIO_MAX_VECS)) { if (is_sync_kiocb(iocb)) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - return __blkdev_direct_IO_async(iocb, iter, nr_pages); + return __blkdev_direct_IO_simple(iocb, iter, bdev, + nr_pages); + return __blkdev_direct_IO_async(iocb, iter, bdev, nr_pages); } - return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); + return __blkdev_direct_IO(iocb, iter, bdev, bio_max_segs(nr_pages)); } static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, @@ -390,7 +385,7 @@ static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, iomap->bdev = bdev; iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); - if (iomap->offset >= isize) + if (offset >= isize) return -EIO; iomap->type = IOMAP_MAPPED; iomap->addr = iomap->offset; diff --git a/block/genhd.c b/block/genhd.c index bb29a68e1d67..7f39fbe60753 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -118,7 +118,7 @@ static void part_stat_read_all(struct block_device *part, } } -static unsigned int part_in_flight(struct block_device *part) +unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -345,9 +345,7 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) struct file *file; int ret = 0; - if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) - return -EINVAL; - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return -EINVAL; if (disk->open_partitions) return -EBUSY; @@ -503,8 +501,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, goto out_unregister_bdi; /* Make sure the first partition scan will be proceed */ - if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) && - !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (get_capacity(disk) && disk_has_partscan(disk)) set_bit(GD_NEED_PART_SCAN, &disk->state); bdev_add(disk->part0, ddev->devt); @@ -954,15 +951,10 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); - struct request_queue *q = bdev_get_queue(bdev); struct disk_stats stat; unsigned int inflight; - if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, bdev); - else - inflight = part_in_flight(bdev); - + inflight = part_in_flight(bdev); if (inflight) 
{ part_stat_lock(); update_io_ticks(bdev, jiffies, true); @@ -1047,6 +1039,12 @@ static ssize_t diskseq_show(struct device *dev, return sprintf(buf, "%llu\n", disk->diskseq); } +static ssize_t partscan_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", disk_has_partscan(dev_to_disk(dev))); +} + static DEVICE_ATTR(range, 0444, disk_range_show, NULL); static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); @@ -1060,6 +1058,7 @@ static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); +static DEVICE_ATTR(partscan, 0444, partscan_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, @@ -1106,6 +1105,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_events_async.attr, &dev_attr_events_poll_msecs.attr, &dev_attr_diskseq.attr, + &dev_attr_partscan.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1182,7 +1182,7 @@ static void disk_release(struct device *dev) disk_release_events(disk); kfree(disk->random); - disk_free_zone_bitmaps(disk); + disk_free_zone_resources(disk); xa_destroy(&disk->part_tbl); disk->queue->disk = NULL; @@ -1251,11 +1251,8 @@ static int diskstats_show(struct seq_file *seqf, void *v) xa_for_each(&gp->part_tbl, idx, hd) { if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) continue; - if (queue_is_mq(gp->queue)) - inflight = blk_mq_in_flight(gp->queue, hd); - else - inflight = part_in_flight(hd); + inflight = part_in_flight(hd); if (inflight) { part_stat_lock(); update_io_ticks(hd, jiffies, true); @@ -1364,6 +1361,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (blkcg_init_disk(disk)) goto out_erase_part0; + disk_init_zone_resources(disk); rand_initialize_disk(disk); disk_to_dev(disk)->class = &block_class; disk_to_dev(disk)->type = &disk_type; diff --git a/block/ioctl.c b/block/ioctl.c index f505f9c341eb..c7db3bd2d653 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -33,7 +33,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, if (op == BLKPG_DEL_PARTITION) return bdev_del_partition(disk, p.pno); - if (p.start < 0 || p.length <= 0 || p.start + p.length < 0) + if (p.start < 0 || p.length <= 0 || LLONG_MAX - p.length < p.start) return -EINVAL; /* Check that the partition is aligned to the block size */ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) @@ -95,9 +95,12 @@ static int compat_blkpg_ioctl(struct block_device *bdev, static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { - uint64_t range[2]; - uint64_t start, len, end; + unsigned int bs_mask = bdev_logical_block_size(bdev) - 1; struct inode *inode = bdev->bd_inode; + uint64_t range[2], start, len, end; + struct bio *prev = NULL, *bio; + sector_t sector, nr_sects; + struct blk_plug plug; int err; if (!(mode & BLK_OPEN_WRITE)) @@ -105,6 +108,8 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; + if (bdev_read_only(bdev)) + return -EPERM; if (copy_from_user(range, (void __user *)arg, sizeof(range))) return -EFAULT; @@ -112,9 +117,9 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, start = range[0]; len = range[1]; - if (start & 511) + if (!len) return 
-EINVAL; - if (len & 511) + if ((start | len) & bs_mask) return -EINVAL; if (check_add_overflow(start, len, &end) || @@ -125,7 +130,32 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) goto fail; - err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + + sector = start >> SECTOR_SHIFT; + nr_sects = len >> SECTOR_SHIFT; + + blk_start_plug(&plug); + while (1) { + if (fatal_signal_pending(current)) { + if (prev) + bio_await_chain(prev); + err = -EINTR; + goto out_unplug; + } + bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, + GFP_KERNEL); + if (!bio) + break; + prev = bio_chain_and_submit(prev, bio); + } + if (prev) { + err = submit_bio_wait(prev); + if (err == -EOPNOTSUPP) + err = 0; + bio_put(prev); + } +out_unplug: + blk_finish_plug(&plug); fail: filemap_invalidate_unlock(inode->i_mapping); return err; } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 02a916ba62ee..94eede4fb9eb 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -102,7 +102,6 @@ struct deadline_data { int prio_aging_expire; spinlock_t lock; - spinlock_t zone_lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ @@ -129,36 +128,7 @@ static u8 dd_rq_ioclass(struct request *rq) } /* - * get the request before `rq' in sector-sorted order - */ -static inline struct request * -deadline_earlier_request(struct request *rq) -{ - struct rb_node *node = rb_prev(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -/* - * get the request after `rq' in sector-sorted order - */ -static inline struct request * -deadline_latter_request(struct request *rq) -{ - struct rb_node *node = rb_next(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -/* - * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, - * return the first request after the start of the zone containing @pos. + * Return the first request for which blk_rq_pos() >= @pos. */ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, enum dd_data_dir data_dir, sector_t pos) @@ -170,14 +140,6 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, return NULL; rq = rb_entry_rq(node); - /* - * A zoned write may have been requeued with a starting position that - * is below that of the most recently dispatched request. Hence, for - * zoned writes, start searching from the start of a zone. - */ - if (blk_rq_is_seq_zoned_write(rq)) - pos = round_down(pos, rq->q->limits.chunk_sectors); - while (node) { rq = rb_entry_rq(node); if (blk_rq_pos(rq) >= pos) { @@ -308,36 +270,6 @@ static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } -/* - * Check if rq has a sequential request preceding it. - */ -static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) -{ - struct request *prev = deadline_earlier_request(rq); - - if (!prev) - return false; - - return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); -} - -/* - * Skip all write requests that are sequential from @rq, even if we cross - * a zone boundary. 
- */ -static struct request *deadline_skip_seq_writes(struct deadline_data *dd, - struct request *rq) -{ - sector_t pos = blk_rq_pos(rq); - - do { - pos += blk_rq_sectors(rq); - rq = deadline_latter_request(rq); - } while (rq && blk_rq_pos(rq) == pos); - - return rq; -} - /* * For the specified data direction, return the next request to * dispatch using arrival ordered lists. @@ -346,40 +278,10 @@ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq, *rb_rq, *next; - unsigned long flags; - if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; - rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. For some HDDs, breaking a sequential - * write stream can lead to lower throughput, so make sure to preserve - * sequential write streams, even if that stream crosses into the next - * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], - queuelist) { - /* Check whether a prior request exists for the same zone. */ - rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); - if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) - rq = rb_rq; - if (blk_req_can_dispatch_to_zone(rq) && - (blk_queue_nonrot(rq->q) || - !deadline_is_seq_write(dd, rq))) - goto out; - } - rq = NULL; -out: - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; + return rq_entry_fifo(per_prio->fifo_list[data_dir].next); } /* @@ -390,36 +292,8 @@ static struct request * deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq; - unsigned long flags; - - rq = deadline_from_pos(per_prio, data_dir, - per_prio->latest_pos[data_dir]); - if (!rq) - return NULL; - - if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. For some HDDs, breaking a sequential - * write stream can lead to lower throughput, so make sure to preserve - * sequential write streams, even if that stream crosses into the next - * zones and these zones are unlocked. - */ - spin_lock_irqsave(&dd->zone_lock, flags); - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - break; - if (blk_queue_nonrot(rq->q)) - rq = deadline_latter_request(rq); - else - rq = deadline_skip_seq_writes(dd, rq); - } - spin_unlock_irqrestore(&dd->zone_lock, flags); - - return rq; + return deadline_from_pos(per_prio, data_dir, + per_prio->latest_pos[data_dir]); } /* @@ -525,10 +399,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, rq = next_rq; } - /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. - */ if (!rq) return NULL; @@ -549,10 +419,6 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, prio = ioprio_class_to_prio[ioprio_class]; dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; - /* - * If the request needs its target zone locked, do it. 
- */ - blk_req_zone_write_lock(rq); rq->rq_flags |= RQF_STARTED; return rq; } @@ -722,7 +588,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); - spin_lock_init(&dd->zone_lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); @@ -804,12 +669,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, lockdep_assert_held(&dd->lock); - /* - * This may be a requeue of a write request that has locked its - * target zone. If it is the case, this releases the zone lock. - */ - blk_req_zone_write_unlock(rq); - prio = ioprio_class_to_prio[ioprio_class]; per_prio = &dd->per_prio[prio]; if (!rq->elv.priv[0]) { @@ -841,18 +700,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; insert_before = &per_prio->fifo_list[data_dir]; -#ifdef CONFIG_BLK_DEV_ZONED - /* - * Insert zoned writes such that requests are sorted by - * position per zone. - */ - if (blk_rq_is_seq_zoned_write(rq)) { - struct request *rq2 = deadline_latter_request(rq); - - if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) - insert_before = &rq2->queuelist; - } -#endif list_add_tail(&rq->queuelist, insert_before); } } @@ -887,33 +734,8 @@ static void dd_prepare_request(struct request *rq) rq->elv.priv[0] = NULL; } -static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) -{ - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - enum dd_prio p; - - for (p = 0; p <= DD_PRIO_MAX; p++) - if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) - return true; - - return false; -} - /* * Callback from inside blk_mq_free_request(). - * - * For zoned block devices, write unlock the target zone of - * completed write requests. Do this while holding the zone lock - * spinlock so that the zone is never unlocked while deadline_fifo_request() - * or deadline_next_request() are executing. This function is called for - * all requests, whether or not these requests complete successfully. - * - * For a zoned block device, __dd_dispatch_request() may have stopped - * dispatching requests if all the queued requests are write requests directed - * at zones that are already locked due to on-going write requests. To ensure - * write request dispatch progress in this case, mark the queue as needing a - * restart to ensure that the queue is run again after completion of the - * request and zones being unlocked. */ static void dd_finish_request(struct request *rq) { @@ -928,21 +750,8 @@ static void dd_finish_request(struct request *rq) * called dd_insert_requests(). Skip requests that bypassed I/O * scheduling. See also blk_mq_request_bypass_insert(). 
*/ - if (!rq->elv.priv[0]) - return; - - atomic_inc(&per_prio->stats.completed); - - if (blk_queue_is_zoned(q)) { - unsigned long flags; - - spin_lock_irqsave(&dd->zone_lock, flags); - blk_req_zone_write_unlock(rq); - spin_unlock_irqrestore(&dd->zone_lock, flags); - - if (dd_has_write_work(rq->mq_hctx)) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); - } + if (rq->elv.priv[0]) + atomic_inc(&per_prio->stats.completed); } static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) @@ -1266,7 +1075,6 @@ static struct elevator_type mq_deadline = { .elevator_attrs = deadline_attrs, .elevator_name = "mq-deadline", .elevator_alias = "deadline", - .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, .elevator_owner = THIS_MODULE, }; MODULE_ALIAS("mq-deadline-iosched"); diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index c03bc105e575..152c85df92b2 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -70,8 +70,8 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) } if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); + partdef++; + char *next = strsep(&partdef, ")"); if (!next) { pr_warn("cmdline partition format is invalid."); @@ -79,11 +79,7 @@ static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) goto fail; } - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strscpy(new_subpart->name, partdef, length); - - partdef = ++next; + strscpy(new_subpart->name, next, sizeof(new_subpart->name)); } else new_subpart->name[0] = '\0'; @@ -117,14 +113,12 @@ static void free_subpart(struct cmdline_parts *parts) } } -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +static int parse_parts(struct cmdline_parts **parts, char *bdevdef) { int ret = -EINVAL; char *next; - int length; struct cmdline_subpart **next_subpart; struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; *parts = NULL; @@ -132,28 +126,19 @@ static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) if (!newparts) return -ENOMEM; - next = strchr(bdevdef, ':'); + next = strsep(&bdevdef, ":"); if (!next) { pr_warn("cmdline partition has no block device."); goto fail; } - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strscpy(newparts->name, bdevdef, length); + strscpy(newparts->name, next, sizeof(newparts->name)); newparts->nr_subparts = 0; next_subpart = &newparts->subpart; - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? 
(sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strscpy(buf, bdevdef, length); - - ret = parse_subpart(next_subpart, buf); + while ((next = strsep(&bdevdef, ","))) { + ret = parse_subpart(next_subpart, next); if (ret) goto fail; @@ -199,24 +184,17 @@ static int cmdline_parts_parse(struct cmdline_parts **parts, *parts = NULL; - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + pbuf = buf = kstrdup(cmdline, GFP_KERNEL); if (!buf) return -ENOMEM; next_parts = parts; - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); + while ((next = strsep(&pbuf, ";"))) { + ret = parse_parts(next_parts, next); if (ret) goto fail; - if (next) - pbuf = ++next; - next_parts = &(*next_parts)->next_parts; } @@ -250,7 +228,6 @@ static struct cmdline_parts *bdev_parts; static int add_part(int slot, struct cmdline_subpart *subpart, struct parsed_partitions *state) { - int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 4]; @@ -262,9 +239,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, info = &state->parts[slot].info; - label_min = min_t(int, sizeof(info->volname) - 1, - sizeof(subpart->name)); - strscpy(info->volname, subpart->name, label_min); + strscpy(info->volname, subpart->name, sizeof(info->volname)); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); diff --git a/block/partitions/core.c b/block/partitions/core.c index b11e88c82c8c..37b5f92d07fe 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -573,10 +573,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (disk->flags & GENHD_FL_NO_PART) - return 0; - - if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + if (!disk_has_partscan(disk)) return 0; state = check_partition(disk); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index e322cef6596b..b900fe9e0030 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -29,10 +29,7 @@ /* * Each block ramdisk device has a xarray brd_pages of pages that stores - * the pages containing the block device's contents. A brd page's ->index is - * its offset in PAGE_SIZE units. This is similar to, but in no way connected - * with, the kernel's pagecache or buffer cache (which sit above our block - * device). + * the pages containing the block device's contents. 
*/ struct brd_device { int brd_number; @@ -51,15 +48,7 @@ struct brd_device { */ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) { - pgoff_t idx; - struct page *page; - - idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ - page = xa_load(&brd->brd_pages, idx); - - BUG_ON(page && page->index != idx); - - return page; + return xa_load(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT); } /* @@ -67,8 +56,8 @@ static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) */ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) { - pgoff_t idx; - struct page *page, *cur; + pgoff_t idx = sector >> PAGE_SECTORS_SHIFT; + struct page *page; int ret = 0; page = brd_lookup_page(brd, sector); @@ -80,23 +69,16 @@ static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp) return -ENOMEM; xa_lock(&brd->brd_pages); - - idx = sector >> PAGE_SECTORS_SHIFT; - page->index = idx; - - cur = __xa_cmpxchg(&brd->brd_pages, idx, NULL, page, gfp); - - if (unlikely(cur)) { - __free_page(page); - ret = xa_err(cur); - if (!ret && (cur->index != idx)) - ret = -EIO; - } else { + ret = __xa_insert(&brd->brd_pages, idx, page, gfp); + if (!ret) brd->brd_nr_pages++; - } - xa_unlock(&brd->brd_pages); + if (ret < 0) { + __free_page(page); + if (ret == -EBUSY) + ret = 0; + } return ret; } diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index ed33cf7192d2..4005a8b685e8 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -225,6 +225,10 @@ static unsigned long g_cache_size; module_param_named(cache_size, g_cache_size, ulong, 0444); MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)"); +static bool g_fua = true; +module_param_named(fua, g_fua, bool, 0444); +MODULE_PARM_DESC(fua, "Enable/disable FUA support when cache_size is used. Default: true"); + static unsigned int g_mbps; module_param_named(mbps, g_mbps, uint, 0444); MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)"); @@ -253,6 +257,11 @@ static unsigned int g_zone_max_active; module_param_named(zone_max_active, g_zone_max_active, uint, 0444); MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)"); +static int g_zone_append_max_sectors = INT_MAX; +module_param_named(zone_append_max_sectors, g_zone_append_max_sectors, int, 0444); +MODULE_PARM_DESC(zone_append_max_sectors, + "Maximum size of a zone append command (in 512B sectors). 
Specify 0 for zone append emulation"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -436,10 +445,12 @@ NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL); NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL); NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); +NULLB_DEVICE_ATTR(zone_append_max_sectors, uint, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); +NULLB_DEVICE_ATTR(fua, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -580,12 +591,14 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_nr_conv, &nullb_device_attr_zone_max_open, &nullb_device_attr_zone_max_active, + &nullb_device_attr_zone_append_max_sectors, &nullb_device_attr_zone_readonly, &nullb_device_attr_zone_offline, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, &nullb_device_attr_shared_tags, &nullb_device_attr_shared_tag_bitmap, + &nullb_device_attr_fua, NULL, }; @@ -664,14 +677,14 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { return snprintf(page, PAGE_SIZE, - "badblocks,blocking,blocksize,cache_size," + "badblocks,blocking,blocksize,cache_size,fua," "completion_nsec,discard,home_node,hw_queue_depth," "irqmode,max_sectors,mbps,memory_backed,no_sched," "poll_queues,power,queue_mode,shared_tag_bitmap," "shared_tags,size,submit_queues,use_per_node_hctx," "virt_boundary,zoned,zone_capacity,zone_max_active," "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," - "zone_size\n"); + "zone_size,zone_append_max_sectors\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -751,10 +764,13 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_nr_conv = g_zone_nr_conv; dev->zone_max_open = g_zone_max_open; dev->zone_max_active = g_zone_max_active; + dev->zone_append_max_sectors = g_zone_append_max_sectors; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; + dev->fua = g_fua; + return dev; } @@ -1151,7 +1167,7 @@ blk_status_t null_handle_discard(struct nullb_device *dev, return BLK_STS_OK; } -static int null_handle_flush(struct nullb *nullb) +static blk_status_t null_handle_flush(struct nullb *nullb) { int err; @@ -1168,7 +1184,7 @@ static int null_handle_flush(struct nullb *nullb) WARN_ON(!radix_tree_empty(&nullb->dev->cache)); spin_unlock_irq(&nullb->lock); - return err; + return errno_to_blk_status(err); } static int null_transfer(struct nullb *nullb, struct page *page, @@ -1206,7 +1222,7 @@ static int null_handle_rq(struct nullb_cmd *cmd) { struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; - int err; + int err = 0; unsigned int len; sector_t sector = blk_rq_pos(rq); struct req_iterator iter; @@ -1218,15 +1234,13 @@ static int null_handle_rq(struct nullb_cmd *cmd) err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, op_is_write(req_op(rq)), sector, rq->cmd_flags & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } + if (err) + break; sector += len >> SECTOR_SHIFT; } spin_unlock_irq(&nullb->lock); - return 0; + return errno_to_blk_status(err); } static inline blk_status_t 
null_handle_throttled(struct nullb_cmd *cmd) @@ -1273,8 +1287,8 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); - return errno_to_blk_status(null_handle_rq(cmd)); + return null_handle_rq(cmd); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) @@ -1343,7 +1357,7 @@ static void null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, blk_status_t sts; if (op == REQ_OP_FLUSH) { - cmd->error = errno_to_blk_status(null_handle_flush(nullb)); + cmd->error = null_handle_flush(nullb); goto out; } @@ -1912,7 +1926,7 @@ static int null_add_dev(struct nullb_device *dev) if (dev->cache_size > 0) { set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags); - blk_queue_write_cache(nullb->q, true, true); + blk_queue_write_cache(nullb->q, true, dev->fua); } nullb->q->queuedata = nullb; @@ -2113,10 +2127,13 @@ static void __exit null_exit(void) if (tag_set.ops) blk_mq_free_tag_set(&tag_set); + + mutex_destroy(&lock); } module_init(null_init); module_exit(null_exit); MODULE_AUTHOR("Jens Axboe "); +MODULE_DESCRIPTION("multi queue aware block test driver"); MODULE_LICENSE("GPL"); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 477b97746823..3234e6c85eed 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -82,6 +82,7 @@ struct nullb_device { unsigned int zone_nr_conv; /* number of conventional zones */ unsigned int zone_max_open; /* max number of open zones */ unsigned int zone_max_active; /* max number of active zones */ + unsigned int zone_append_max_sectors; /* Max sectors per zone append command */ unsigned int submit_queues; /* number of submission queues */ unsigned int prev_submit_queues; /* number of submission queues before change */ unsigned int poll_queues; /* number of IOPOLL submission queues */ @@ -104,6 +105,7 @@ struct nullb_device { bool no_sched; /* no IO scheduler for the device */ bool shared_tags; /* share tag set between devices for blk-mq */ bool shared_tag_bitmap; /* use hostwide shared tags */ + bool fua; /* Support FUA */ }; struct nullb { diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 1689e2584104..5b5a63adacc1 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -9,6 +9,8 @@ #undef pr_fmt #define pr_fmt(fmt) "null_blk: " fmt +#define NULL_ZONE_INVALID_WP ((sector_t)-1) + static inline sector_t mb_to_sects(unsigned long mb) { return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT; @@ -19,18 +21,6 @@ static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) return sect >> ilog2(dev->zone_size_sects); } -static inline void null_lock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_lock_irq(&dev->zone_res_lock); -} - -static inline void null_unlock_zone_res(struct nullb_device *dev) -{ - if (dev->need_zone_res_mgmt) - spin_unlock_irq(&dev->zone_res_lock); -} - static inline void null_init_zone_lock(struct nullb_device *dev, struct nullb_zone *zone) { @@ -103,6 +93,11 @@ int null_init_zoned_dev(struct nullb_device *dev, dev->zone_nr_conv); } + dev->zone_append_max_sectors = + min(ALIGN_DOWN(dev->zone_append_max_sectors, + dev->blocksize >> SECTOR_SHIFT), + zone_capacity_sects); + /* Max active zones has to be < nbr of seq zones in order to be enforceable */ if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) { dev->zone_max_active = 0; @@ -154,7 +149,7 @@ int 
null_init_zoned_dev(struct nullb_device *dev, lim->zoned = true; lim->chunk_sectors = dev->zone_size_sects; - lim->max_zone_append_sectors = dev->zone_size_sects; + lim->max_zone_append_sectors = dev->zone_append_max_sectors; lim->max_open_zones = dev->zone_max_open; lim->max_active_zones = dev->zone_max_active; return 0; @@ -163,11 +158,16 @@ int null_init_zoned_dev(struct nullb_device *dev, int null_register_zoned_dev(struct nullb *nullb) { struct request_queue *q = nullb->q; + struct gendisk *disk = nullb->disk; blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); - return blk_revalidate_disk_zones(nullb->disk, NULL); + disk->nr_zones = bdev_nr_zones(disk->part0); + + pr_info("%s: using %s zone append\n", + disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); + + return blk_revalidate_disk_zones(disk); } void null_free_zoned_dev(struct nullb_device *dev) @@ -241,35 +241,6 @@ size_t null_zone_valid_read_len(struct nullb *nullb, return (zone->wp - sector) << SECTOR_SHIFT; } -static blk_status_t __null_close_zone(struct nullb_device *dev, - struct nullb_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_CLOSED: - /* close operation on closed is not an error */ - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_FULL: - default: - return BLK_STS_IOERR; - } - - if (zone->wp == zone->start) { - zone->cond = BLK_ZONE_COND_EMPTY; - } else { - zone->cond = BLK_ZONE_COND_CLOSED; - dev->nr_zones_closed++; - } - - return BLK_STS_OK; -} - static void null_close_imp_open_zone(struct nullb_device *dev) { struct nullb_zone *zone; @@ -286,7 +257,13 @@ static void null_close_imp_open_zone(struct nullb_device *dev) zno = dev->zone_nr_conv; if (zone->cond == BLK_ZONE_COND_IMP_OPEN) { - __null_close_zone(dev, zone); + dev->nr_zones_imp_open--; + if (zone->wp == zone->start) { + zone->cond = BLK_ZONE_COND_EMPTY; + } else { + zone->cond = BLK_ZONE_COND_CLOSED; + dev->nr_zones_closed++; + } dev->imp_close_zone_no = zno; return; } @@ -374,73 +351,73 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, null_lock_zone(dev, zone); - if (zone->cond == BLK_ZONE_COND_FULL || - zone->cond == BLK_ZONE_COND_READONLY || - zone->cond == BLK_ZONE_COND_OFFLINE) { - /* Cannot write to the zone */ - ret = BLK_STS_IOERR; - goto unlock; - } - /* - * Regular writes must be at the write pointer position. - * Zone append writes are automatically issued at the write - * pointer and the position returned using the request or BIO - * sector. + * Regular writes must be at the write pointer position. Zone append + * writes are automatically issued at the write pointer and the position + * returned using the request sector. Note that we do not check the zone + * condition because for FULL, READONLY and OFFLINE zones, the sector + * check against the zone write pointer will always result in failing + * the command. 
*/ if (append) { + if (WARN_ON_ONCE(!dev->zone_append_max_sectors) || + zone->wp == NULL_ZONE_INVALID_WP) { + ret = BLK_STS_IOERR; + goto unlock_zone; + } sector = zone->wp; blk_mq_rq_from_pdu(cmd)->__sector = sector; - } else if (sector != zone->wp) { - ret = BLK_STS_IOERR; - goto unlock; } - if (zone->wp + nr_sectors > zone->start + zone->capacity) { + if (sector != zone->wp || + zone->wp + nr_sectors > zone->start + zone->capacity) { ret = BLK_STS_IOERR; - goto unlock; + goto unlock_zone; } if (zone->cond == BLK_ZONE_COND_CLOSED || zone->cond == BLK_ZONE_COND_EMPTY) { - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) { - null_unlock_zone_res(dev); - goto unlock; - } - if (zone->cond == BLK_ZONE_COND_CLOSED) { - dev->nr_zones_closed--; - dev->nr_zones_imp_open++; - } else if (zone->cond == BLK_ZONE_COND_EMPTY) { - dev->nr_zones_imp_open++; - } + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + goto unlock_zone; + } + if (zone->cond == BLK_ZONE_COND_CLOSED) { + dev->nr_zones_closed--; + dev->nr_zones_imp_open++; + } else if (zone->cond == BLK_ZONE_COND_EMPTY) { + dev->nr_zones_imp_open++; + } - if (zone->cond != BLK_ZONE_COND_EXP_OPEN) - zone->cond = BLK_ZONE_COND_IMP_OPEN; + spin_unlock(&dev->zone_res_lock); + } - null_unlock_zone_res(dev); + zone->cond = BLK_ZONE_COND_IMP_OPEN; } ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors); if (ret != BLK_STS_OK) - goto unlock; + goto unlock_zone; zone->wp += nr_sectors; if (zone->wp == zone->start + zone->capacity) { - null_lock_zone_res(dev); - if (zone->cond == BLK_ZONE_COND_EXP_OPEN) - dev->nr_zones_exp_open--; - else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) - dev->nr_zones_imp_open--; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); + if (zone->cond == BLK_ZONE_COND_EXP_OPEN) + dev->nr_zones_exp_open--; + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN) + dev->nr_zones_imp_open--; + spin_unlock(&dev->zone_res_lock); + } zone->cond = BLK_ZONE_COND_FULL; - null_unlock_zone_res(dev); } ret = BLK_STS_OK; -unlock: +unlock_zone: null_unlock_zone(dev, zone); return ret; @@ -454,54 +431,100 @@ static blk_status_t null_open_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); - switch (zone->cond) { case BLK_ZONE_COND_EXP_OPEN: - /* open operation on exp open is not an error */ - goto unlock; + /* Open operation on exp open is not an error */ + return BLK_STS_OK; case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; break; case BLK_ZONE_COND_FULL: default: - ret = BLK_STS_IOERR; - goto unlock; + return BLK_STS_IOERR; } - zone->cond = BLK_ZONE_COND_EXP_OPEN; - dev->nr_zones_exp_open++; + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); -unlock: - null_unlock_zone_res(dev); + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); 
+ if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + dev->nr_zones_closed--; + break; + default: + break; + } - return ret; + dev->nr_zones_exp_open++; + + spin_unlock(&dev->zone_res_lock); + } + + zone->cond = BLK_ZONE_COND_EXP_OPEN; + + return BLK_STS_OK; } static blk_status_t null_close_zone(struct nullb_device *dev, struct nullb_zone *zone) { - blk_status_t ret; - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); - ret = __null_close_zone(dev, zone); - null_unlock_zone_res(dev); + switch (zone->cond) { + case BLK_ZONE_COND_CLOSED: + /* close operation on closed is not an error */ + return BLK_STS_OK; + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + default: + return BLK_STS_IOERR; + } + + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - return ret; + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + default: + break; + } + + if (zone->wp > zone->start) + dev->nr_zones_closed++; + + spin_unlock(&dev->zone_res_lock); + } + + if (zone->wp == zone->start) + zone->cond = BLK_ZONE_COND_EMPTY; + else + zone->cond = BLK_ZONE_COND_CLOSED; + + return BLK_STS_OK; } static blk_status_t null_finish_zone(struct nullb_device *dev, @@ -512,41 +535,47 @@ static blk_status_t null_finish_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - switch (zone->cond) { - case BLK_ZONE_COND_FULL: - /* finish operation on full is not an error */ - goto unlock; - case BLK_ZONE_COND_EMPTY: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - break; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - ret = null_check_zone_resources(dev, zone); - if (ret != BLK_STS_OK) - goto unlock; - dev->nr_zones_closed--; - break; - default: - ret = BLK_STS_IOERR; - goto unlock; + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* Finish operation on full is not an error */ + spin_unlock(&dev->zone_res_lock); + return BLK_STS_OK; + case BLK_ZONE_COND_EMPTY: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + break; + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + ret = null_check_zone_resources(dev, zone); + if (ret != BLK_STS_OK) { + spin_unlock(&dev->zone_res_lock); + return ret; + } + dev->nr_zones_closed--; + break; + default: + spin_unlock(&dev->zone_res_lock); + return BLK_STS_IOERR; + } + + spin_unlock(&dev->zone_res_lock); } zone->cond = BLK_ZONE_COND_FULL; zone->wp = zone->start + zone->len; -unlock: - null_unlock_zone_res(dev); - - return ret; + return BLK_STS_OK; } static blk_status_t null_reset_zone(struct nullb_device *dev, @@ -555,34 +584,33 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) return BLK_STS_IOERR; - null_lock_zone_res(dev); + if (dev->need_zone_res_mgmt) { + spin_lock(&dev->zone_res_lock); - switch (zone->cond) { - case BLK_ZONE_COND_EMPTY: - /* reset operation on empty is not an error 
*/ - null_unlock_zone_res(dev); - return BLK_STS_OK; - case BLK_ZONE_COND_IMP_OPEN: - dev->nr_zones_imp_open--; - break; - case BLK_ZONE_COND_EXP_OPEN: - dev->nr_zones_exp_open--; - break; - case BLK_ZONE_COND_CLOSED: - dev->nr_zones_closed--; - break; - case BLK_ZONE_COND_FULL: - break; - default: - null_unlock_zone_res(dev); - return BLK_STS_IOERR; + switch (zone->cond) { + case BLK_ZONE_COND_IMP_OPEN: + dev->nr_zones_imp_open--; + break; + case BLK_ZONE_COND_EXP_OPEN: + dev->nr_zones_exp_open--; + break; + case BLK_ZONE_COND_CLOSED: + dev->nr_zones_closed--; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_FULL: + break; + default: + spin_unlock(&dev->zone_res_lock); + return BLK_STS_IOERR; + } + + spin_unlock(&dev->zone_res_lock); } zone->cond = BLK_ZONE_COND_EMPTY; zone->wp = zone->start; - null_unlock_zone_res(dev); - if (dev->memory_backed) return null_handle_discard(dev, zone->start, zone->len); @@ -711,7 +739,7 @@ static void null_set_zone_cond(struct nullb_device *dev, zone->cond != BLK_ZONE_COND_OFFLINE) null_finish_zone(dev, zone); zone->cond = cond; - zone->wp = (sector_t)-1; + zone->wp = NULL_ZONE_INVALID_WP; } null_unlock_zone(dev, zone); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 374e4efa8759..176657dce3e3 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -221,7 +221,7 @@ static int ublk_get_nr_zones(const struct ublk_device *ub) static int ublk_revalidate_disk_zones(struct ublk_device *ub) { - return blk_revalidate_disk_zones(ub->ub_disk, NULL); + return blk_revalidate_disk_zones(ub->ub_disk); } static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) @@ -249,8 +249,7 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); - blk_queue_required_elevator_features(ub->ub_disk->queue, - ELEVATOR_F_ZBD_SEQ_WRITE); + ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); } diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 42dea7601d87..c1af0a7d56c8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1543,7 +1543,7 @@ static int virtblk_probe(struct virtio_device *vdev) */ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); - err = blk_revalidate_disk_zones(vblk->disk, NULL); + err = blk_revalidate_disk_zones(vblk->disk); if (err) goto out_cleanup_disk; } diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 2bba4d6aaaa2..463eb13bd0b2 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -54,7 +54,7 @@ void bch_dump_bucket(struct btree_keys *b) int __bch_count_data(struct btree_keys *b) { unsigned int ret = 0; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k; if (b->ops->is_extents) @@ -67,7 +67,7 @@ void __bch_check_keys(struct btree_keys *b, const char *fmt, ...) 
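/*
 * A minimal userspace sketch of the pattern the bcache hunks switch to: a
 * struct ending in a flexible array member cannot live on the stack, so a
 * fixed-size wrapper embeds the storage right after it, while heap users
 * size the trailing array explicitly in the allocation. Names here are
 * illustrative only; embedding a struct that ends in a flexible array
 * member is a GNU C extension that the kernel relies on.
 */
#include <stdio.h>
#include <stdlib.h>

struct iter_set { int k; };

struct iter {
	unsigned int size, used;
	struct iter_set data[];		/* flexible array member */
};

struct iter_stack {			/* fixed-size, stack-allocatable */
	struct iter iter;
	struct iter_set stack_data[4];	/* plays the role of MAX_BSETS */
};

static void iter_init(struct iter *it, unsigned int size)
{
	it->size = size;
	it->used = 0;
}

int main(void)
{
	struct iter_stack st;
	struct iter *heap;

	iter_init(&st.iter, sizeof(st.stack_data) / sizeof(st.stack_data[0]));

	/* heap users allocate the header plus N trailing entries in one go */
	heap = malloc(sizeof(*heap) + 16 * sizeof(struct iter_set));
	if (!heap)
		return 1;
	iter_init(heap, 16);

	printf("stack iter size=%u, heap iter size=%u\n", st.iter.size, heap->size);
	free(heap);
	return 0;
}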
{ va_list args; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; const char *err; for_each_key(b, k, &iter) { @@ -879,7 +879,7 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, unsigned int status = BTREE_INSERT_STATUS_NO_INSERT; struct bset *i = bset_tree_last(b)->data; struct bkey *m, *prev = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey preceding_key_on_stack = ZERO_KEY; struct bkey *preceding_key_p = &preceding_key_on_stack; @@ -895,9 +895,9 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k, else preceding_key(k, &preceding_key_p); - m = bch_btree_iter_init(b, &iter, preceding_key_p); + m = bch_btree_iter_stack_init(b, &iter, preceding_key_p); - if (b->ops->insert_fixup(b, k, &iter, replace_key)) + if (b->ops->insert_fixup(b, k, &iter.iter, replace_key)) return status; status = BTREE_INSERT_STATUS_INSERT; @@ -1100,33 +1100,33 @@ void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, btree_iter_cmp)); } -static struct bkey *__bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search, - struct bset_tree *start) +static struct bkey *__bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search, + struct bset_tree *start) { struct bkey *ret = NULL; - iter->size = ARRAY_SIZE(iter->data); - iter->used = 0; + iter->iter.size = ARRAY_SIZE(iter->stack_data); + iter->iter.used = 0; #ifdef CONFIG_BCACHE_DEBUG - iter->b = b; + iter->iter.b = b; #endif for (; start <= bset_tree_last(b); start++) { ret = bch_bset_search(b, start, search); - bch_btree_iter_push(iter, ret, bset_bkey_last(start->data)); + bch_btree_iter_push(&iter->iter, ret, bset_bkey_last(start->data)); } return ret; } -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, struct bkey *search) { - return __bch_btree_iter_init(b, iter, search, b->set); + return __bch_btree_iter_stack_init(b, iter, search, b->set); } static inline struct bkey *__bch_btree_iter_next(struct btree_iter *iter, @@ -1293,10 +1293,10 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, struct bset_sort_state *state) { size_t order = b->page_order, keys = 0; - struct btree_iter iter; + struct btree_iter_stack iter; int oldsize = bch_count_data(b); - __bch_btree_iter_init(b, &iter, NULL, &b->set[start]); + __bch_btree_iter_stack_init(b, &iter, NULL, &b->set[start]); if (start) { unsigned int i; @@ -1307,7 +1307,7 @@ void bch_btree_sort_partial(struct btree_keys *b, unsigned int start, order = get_order(__set_bytes(b->set->data, keys)); } - __btree_sort(b, &iter, start, order, false, state); + __btree_sort(b, &iter.iter, start, order, false, state); EBUG_ON(oldsize >= 0 && bch_count_data(b) != oldsize); } @@ -1323,11 +1323,11 @@ void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new, struct bset_sort_state *state) { uint64_t start_time = local_clock(); - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(b, &iter, NULL); + bch_btree_iter_stack_init(b, &iter, NULL); - btree_mergesort(b, new->set->data, &iter, false, true); + btree_mergesort(b, new->set->data, &iter.iter, false, true); bch_time_stats_update(&state->time, start_time); diff --git a/drivers/md/bcache/bset.h b/drivers/md/bcache/bset.h index d795c84246b0..011f6062c4c0 100644 --- a/drivers/md/bcache/bset.h +++ 
b/drivers/md/bcache/bset.h @@ -321,7 +321,14 @@ struct btree_iter { #endif struct btree_iter_set { struct bkey *k, *end; - } data[MAX_BSETS]; + } data[]; +}; + +/* Fixed-size btree_iter that can be allocated on the stack */ + +struct btree_iter_stack { + struct btree_iter iter; + struct btree_iter_set stack_data[MAX_BSETS]; }; typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k); @@ -333,9 +340,9 @@ struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter, void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k, struct bkey *end); -struct bkey *bch_btree_iter_init(struct btree_keys *b, - struct btree_iter *iter, - struct bkey *search); +struct bkey *bch_btree_iter_stack_init(struct btree_keys *b, + struct btree_iter_stack *iter, + struct bkey *search); struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t, const struct bkey *search); @@ -350,13 +357,14 @@ static inline struct bkey *bch_bset_search(struct btree_keys *b, return search ? __bch_bset_search(b, t, search) : t->data->start; } -#define for_each_key_filter(b, k, iter, filter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next_filter((iter), (b), filter));) +#define for_each_key_filter(b, k, stack_iter, filter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next_filter(&((stack_iter)->iter), (b), \ + filter));) -#define for_each_key(b, k, iter) \ - for (bch_btree_iter_init((b), (iter), NULL); \ - ((k) = bch_btree_iter_next(iter));) +#define for_each_key(b, k, stack_iter) \ + for (bch_btree_iter_stack_init((b), (stack_iter), NULL); \ + ((k) = bch_btree_iter_next(&((stack_iter)->iter)));) /* Sorting */ diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 196cdacce38f..d011a7154d33 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1309,7 +1309,7 @@ static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct bset_tree *t; gc->nodes++; @@ -1570,7 +1570,7 @@ static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; unsigned int ret = 0; for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) @@ -1611,17 +1611,18 @@ static int btree_gc_recurse(struct btree *b, struct btree_op *op, int ret = 0; bool should_rewrite; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; - bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); + bch_btree_iter_stack_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { - k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); @@ -1911,7 +1912,7 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); @@ -1919,10 +1920,10 @@ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) bch_initial_mark_key(b->c, 
b->level + 1, &b->key); if (b->level) { - bch_btree_iter_init(&b->keys, &iter, NULL); + bch_btree_iter_stack_init(&b->keys, &iter, NULL); do { - k = bch_btree_iter_next_filter(&iter, &b->keys, + k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); @@ -1950,7 +1951,7 @@ static int bch_btree_check_thread(void *arg) struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; @@ -1959,8 +1960,8 @@ static int bch_btree_check_thread(void *arg) ret = 0; /* root node keys are checked before thread created */ - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -1978,7 +1979,7 @@ static int bch_btree_check_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -2051,7 +2052,7 @@ int bch_btree_check(struct cache_set *c) int ret = 0; int i; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct btree_check_state check_state; /* check and mark root node keys */ @@ -2547,11 +2548,11 @@ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, if (b->level) { struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); @@ -2580,11 +2581,12 @@ int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, { int ret = MAP_CONTINUE; struct bkey *k; - struct btree_iter iter; + struct btree_iter_stack iter; - bch_btree_iter_init(&b->keys, &iter, from); + bch_btree_iter_stack_init(&b->keys, &iter, from); - while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { + while ((k = bch_btree_iter_next_filter(&iter.iter, &b->keys, + bch_ptr_bad))) { ret = !b->level ? 
fn(op, b, k) : bcache_btree(map_keys_recurse, k, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 330bcd9ea4a9..cba09660148a 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -881,8 +881,8 @@ static void bcache_device_free(struct bcache_device *d) bcache_device_detach(d); if (disk) { - ida_simple_remove(&bcache_device_idx, - first_minor_to_idx(disk->first_minor)); + ida_free(&bcache_device_idx, + first_minor_to_idx(disk->first_minor)); put_disk(disk); } @@ -940,8 +940,8 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, if (!d->full_dirty_stripes) goto out_free_stripe_sectors_dirty; - idx = ida_simple_get(&bcache_device_idx, 0, - BCACHE_DEVICE_IDX_MAX, GFP_KERNEL); + idx = ida_alloc_max(&bcache_device_idx, BCACHE_DEVICE_IDX_MAX - 1, + GFP_KERNEL); if (idx < 0) goto out_free_full_dirty_stripes; @@ -986,7 +986,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, out_bioset_exit: bioset_exit(&d->bio_split); out_ida_remove: - ida_simple_remove(&bcache_device_idx, idx); + ida_free(&bcache_device_idx, idx); out_free_full_dirty_stripes: kvfree(d->full_dirty_stripes); out_free_stripe_sectors_dirty: @@ -1914,8 +1914,9 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb) INIT_LIST_HEAD(&c->btree_cache_freed); INIT_LIST_HEAD(&c->data_buckets); - iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) * - sizeof(struct btree_iter_set); + iter_size = sizeof(struct btree_iter) + + ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size) * + sizeof(struct btree_iter_set); c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL); if (!c->devices) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index 6956beb55326..826b14cae4e5 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -660,7 +660,7 @@ static unsigned int bch_root_usage(struct cache_set *c) unsigned int bytes = 0; struct bkey *k; struct btree *b; - struct btree_iter iter; + struct btree_iter_stack iter; goto lock_root; diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8827a6f130ad..792e070ccf38 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -908,15 +908,15 @@ static int bch_dirty_init_thread(void *arg) struct dirty_init_thrd_info *info = arg; struct bch_dirty_init_state *state = info->state; struct cache_set *c = state->c; - struct btree_iter iter; + struct btree_iter_stack iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; prev_idx = 0; - bch_btree_iter_init(&c->root->keys, &iter, NULL); - k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); + bch_btree_iter_stack_init(&c->root->keys, &iter, NULL); + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; @@ -930,7 +930,7 @@ static int bch_dirty_init_thread(void *arg) skip_nr = cur_idx - prev_idx; while (skip_nr) { - k = bch_btree_iter_next_filter(&iter, + k = bch_btree_iter_next_filter(&iter.iter, &c->root->keys, bch_ptr_bad); if (k) @@ -979,7 +979,7 @@ void bch_sectors_dirty_init(struct bcache_device *d) int i; struct btree *b = NULL; struct bkey *k = NULL; - struct btree_iter iter; + struct btree_iter_stack iter; struct sectors_dirty_init op; struct cache_set *c = d->c; struct bch_dirty_init_state state; diff --git a/drivers/md/dm-bio-prison-v2.c b/drivers/md/dm-bio-prison-v2.c index fd852981ef9c..cf433b0cf742 100644 --- a/drivers/md/dm-bio-prison-v2.c +++ 
b/drivers/md/dm-bio-prison-v2.c @@ -321,8 +321,7 @@ static bool __unlock(struct dm_bio_prison_v2 *prison, { BUG_ON(!cell->exclusive_lock); - bio_list_merge(bios, &cell->bios); - bio_list_init(&cell->bios); + bio_list_merge_init(bios, &cell->bios); if (cell->shared_count) { cell->exclusive_lock = false; diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 911f73f7ebba..0fcbf8603846 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -115,8 +115,7 @@ static void __commit(struct work_struct *_ws) */ spin_lock_irq(&b->lock); list_splice_init(&b->work_items, &work_items); - bio_list_merge(&bios, &b->bios); - bio_list_init(&b->bios); + bio_list_merge_init(&bios, &b->bios); b->commit_scheduled = false; spin_unlock_irq(&b->lock); @@ -565,8 +564,7 @@ static void defer_bio(struct cache *cache, struct bio *bio) static void defer_bios(struct cache *cache, struct bio_list *bios) { spin_lock_irq(&cache->lock); - bio_list_merge(&cache->deferred_bios, bios); - bio_list_init(bios); + bio_list_merge_init(&cache->deferred_bios, bios); spin_unlock_irq(&cache->lock); wake_deferred_bio_worker(cache); @@ -1816,8 +1814,7 @@ static void process_deferred_bios(struct work_struct *ws) bio_list_init(&bios); spin_lock_irq(&cache->lock); - bio_list_merge(&bios, &cache->deferred_bios); - bio_list_init(&cache->deferred_bios); + bio_list_merge_init(&bios, &cache->deferred_bios); spin_unlock_irq(&cache->lock); while ((bio = bio_list_pop(&bios))) { @@ -1847,8 +1844,7 @@ static void requeue_deferred_bios(struct cache *cache) struct bio_list bios; bio_list_init(&bios); - bio_list_merge(&bios, &cache->deferred_bios); - bio_list_init(&cache->deferred_bios); + bio_list_merge_init(&bios, &cache->deferred_bios); while ((bio = bio_list_pop(&bios))) { bio->bi_status = BLK_STS_DM_REQUEUE; diff --git a/drivers/md/dm-clone-target.c b/drivers/md/dm-clone-target.c index 94b2fc33f64b..3f68672ab7c9 100644 --- a/drivers/md/dm-clone-target.c +++ b/drivers/md/dm-clone-target.c @@ -1181,8 +1181,7 @@ static void process_deferred_discards(struct clone *clone) struct bio_list discards = BIO_EMPTY_LIST; spin_lock_irq(&clone->lock); - bio_list_merge(&discards, &clone->deferred_discard_bios); - bio_list_init(&clone->deferred_discard_bios); + bio_list_merge_init(&discards, &clone->deferred_discard_bios); spin_unlock_irq(&clone->lock); if (bio_list_empty(&discards)) @@ -1215,8 +1214,7 @@ static void process_deferred_bios(struct clone *clone) struct bio_list bios = BIO_EMPTY_LIST; spin_lock_irq(&clone->lock); - bio_list_merge(&bios, &clone->deferred_bios); - bio_list_init(&clone->deferred_bios); + bio_list_merge_init(&bios, &clone->deferred_bios); spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios)) @@ -1237,11 +1235,9 @@ static void process_deferred_flush_bios(struct clone *clone) * before issuing them or signaling their completion. 
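/*
 * A minimal userspace sketch of what the repeated two-step pattern being
 * replaced in the device-mapper hunks above collapses into: splice one list
 * onto the tail of another, then leave the source list empty. This is a
 * simplified singly-linked list, not the kernel's struct bio_list.
 */
#include <stdio.h>
#include <stddef.h>

struct node { struct node *next; int id; };
struct list { struct node *head, *tail; };

static void list_init(struct list *l) { l->head = l->tail = NULL; }

static void list_add(struct list *l, struct node *n)
{
	n->next = NULL;
	if (l->tail)
		l->tail->next = n;
	else
		l->head = n;
	l->tail = n;
}

/* merge @src onto the tail of @dst, then reinitialize @src */
static void list_merge_init(struct list *dst, struct list *src)
{
	if (!src->head)
		return;
	if (dst->tail)
		dst->tail->next = src->head;
	else
		dst->head = src->head;
	dst->tail = src->tail;
	list_init(src);
}

int main(void)
{
	struct list a, b;
	struct node n1 = { .id = 1 }, n2 = { .id = 2 };

	list_init(&a);
	list_init(&b);
	list_add(&a, &n1);
	list_add(&b, &n2);

	list_merge_init(&a, &b);
	printf("a: %d -> %d, b empty: %d\n",
	       a.head->id, a.tail->id, b.head == NULL);
	return 0;
}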
*/ spin_lock_irq(&clone->lock); - bio_list_merge(&bios, &clone->deferred_flush_bios); - bio_list_init(&clone->deferred_flush_bios); - - bio_list_merge(&bio_completions, &clone->deferred_flush_completions); - bio_list_init(&clone->deferred_flush_completions); + bio_list_merge_init(&bios, &clone->deferred_flush_bios); + bio_list_merge_init(&bio_completions, + &clone->deferred_flush_completions); spin_unlock_irq(&clone->lock); if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) && diff --git a/drivers/md/dm-core.h b/drivers/md/dm-core.h index e6757a30dcca..08700bfc3e23 100644 --- a/drivers/md/dm-core.h +++ b/drivers/md/dm-core.h @@ -140,7 +140,7 @@ struct mapped_device { #ifdef CONFIG_BLK_DEV_ZONED unsigned int nr_zones; - unsigned int *zwp_offset; + void *zone_revalidate_map; #endif #ifdef CONFIG_IMA diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 6acfa5bf97a4..8f81e597858d 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1272,8 +1272,7 @@ static void process_deferred_bios(struct era *era) bio_list_init(&marked_bios); spin_lock(&era->deferred_lock); - bio_list_merge(&deferred_bios, &era->deferred_bios); - bio_list_init(&era->deferred_bios); + bio_list_merge_init(&deferred_bios, &era->deferred_bios); spin_unlock(&era->deferred_lock); if (bio_list_empty(&deferred_bios)) diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 05d1328d1811..15b681b90153 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -704,8 +704,7 @@ static void process_queued_bios(struct work_struct *work) return; } - bio_list_merge(&bios, &m->queued_bios); - bio_list_init(&m->queued_bios); + bio_list_merge_init(&bios, &m->queued_bios); spin_unlock_irqrestore(&m->lock, flags); diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 41f1d731ae5a..2c6fbd87363f 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -2042,7 +2042,8 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, r = dm_set_zones_restrictions(t, q); if (r) return r; - if (!static_key_enabled(&zoned_enabled.key)) + if (blk_queue_is_zoned(q) && + !static_key_enabled(&zoned_enabled.key)) static_branch_enable(&zoned_enabled); } diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index 4793ad2aa1f7..f359984c8ef2 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -592,12 +592,6 @@ struct dm_thin_endio_hook { struct dm_bio_prison_cell *cell; }; -static void __merge_bio_list(struct bio_list *bios, struct bio_list *master) -{ - bio_list_merge(bios, master); - bio_list_init(master); -} - static void error_bio_list(struct bio_list *bios, blk_status_t error) { struct bio *bio; @@ -616,7 +610,7 @@ static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, bio_list_init(&bios); spin_lock_irq(&tc->lock); - __merge_bio_list(&bios, master); + bio_list_merge_init(&bios, master); spin_unlock_irq(&tc->lock); error_bio_list(&bios, error); @@ -645,8 +639,8 @@ static void requeue_io(struct thin_c *tc) bio_list_init(&bios); spin_lock_irq(&tc->lock); - __merge_bio_list(&bios, &tc->deferred_bio_list); - __merge_bio_list(&bios, &tc->retry_on_resume_list); + bio_list_merge_init(&bios, &tc->deferred_bio_list); + bio_list_merge_init(&bios, &tc->retry_on_resume_list); spin_unlock_irq(&tc->lock); error_bio_list(&bios, BLK_STS_DM_REQUEUE); diff --git a/drivers/md/dm-vdo/data-vio.c b/drivers/md/dm-vdo/data-vio.c index 94f6f1ccfb7d..ab3ea8337809 100644 --- a/drivers/md/dm-vdo/data-vio.c +++ b/drivers/md/dm-vdo/data-vio.c @@ 
-604,8 +604,7 @@ static void assign_discard_permit(struct limiter *limiter) static void get_waiters(struct limiter *limiter) { - bio_list_merge(&limiter->waiters, &limiter->new_waiters); - bio_list_init(&limiter->new_waiters); + bio_list_merge_init(&limiter->waiters, &limiter->new_waiters); } static inline struct data_vio *get_available_data_vio(struct data_vio_pool *pool) diff --git a/drivers/md/dm-vdo/flush.c b/drivers/md/dm-vdo/flush.c index 57e87f0d7069..dd4fdee2ca0c 100644 --- a/drivers/md/dm-vdo/flush.c +++ b/drivers/md/dm-vdo/flush.c @@ -369,8 +369,7 @@ void vdo_dump_flusher(const struct flusher *flusher) static void initialize_flush(struct vdo_flush *flush, struct vdo *vdo) { bio_list_init(&flush->bios); - bio_list_merge(&flush->bios, &vdo->flusher->waiting_flush_bios); - bio_list_init(&vdo->flusher->waiting_flush_bios); + bio_list_merge_init(&flush->bios, &vdo->flusher->waiting_flush_bios); } static void launch_flush(struct vdo_flush *flush) diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c index eb9832b22b14..8e6bcb0d786a 100644 --- a/drivers/md/dm-zone.c +++ b/drivers/md/dm-zone.c @@ -60,16 +60,23 @@ int dm_blk_report_zones(struct gendisk *disk, sector_t sector, struct dm_table *map; int srcu_idx, ret; - if (dm_suspended_md(md)) - return -EAGAIN; + if (!md->zone_revalidate_map) { + /* Regular user context */ + if (dm_suspended_md(md)) + return -EAGAIN; - map = dm_get_live_table(md, &srcu_idx); - if (!map) - return -EIO; + map = dm_get_live_table(md, &srcu_idx); + if (!map) + return -EIO; + } else { + /* Zone revalidation during __bind() */ + map = md->zone_revalidate_map; + } ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data); - dm_put_live_table(md, srcu_idx); + if (!md->zone_revalidate_map) + dm_put_live_table(md, srcu_idx); return ret; } @@ -138,80 +145,47 @@ bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) } } -void dm_cleanup_zoned_dev(struct mapped_device *md) +/* + * Count conventional zones of a mapped zoned device. If the device + * only has conventional zones, do not expose it as zoned. + */ +static int dm_check_zoned_cb(struct blk_zone *zone, unsigned int idx, + void *data) { - if (md->disk) { - bitmap_free(md->disk->conv_zones_bitmap); - md->disk->conv_zones_bitmap = NULL; - bitmap_free(md->disk->seq_zones_wlock); - md->disk->seq_zones_wlock = NULL; - } + unsigned int *nr_conv_zones = data; - kvfree(md->zwp_offset); - md->zwp_offset = NULL; - md->nr_zones = 0; -} + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) + (*nr_conv_zones)++; -static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone) -{ - switch (zone->cond) { - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_NOT_WP: - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - default: - /* - * Conventional, offline and read-only zones do not have a valid - * write pointer. Use 0 as for an empty zone. 
- */ - return 0; - } + return 0; } -static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, - void *data) +static int dm_check_zoned(struct mapped_device *md, struct dm_table *t) { - struct mapped_device *md = data; struct gendisk *disk = md->disk; + unsigned int nr_conv_zones = 0; + int ret; - switch (zone->type) { - case BLK_ZONE_TYPE_CONVENTIONAL: - if (!disk->conv_zones_bitmap) { - disk->conv_zones_bitmap = bitmap_zalloc(disk->nr_zones, - GFP_NOIO); - if (!disk->conv_zones_bitmap) - return -ENOMEM; - } - set_bit(idx, disk->conv_zones_bitmap); - break; - case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: - if (!disk->seq_zones_wlock) { - disk->seq_zones_wlock = bitmap_zalloc(disk->nr_zones, - GFP_NOIO); - if (!disk->seq_zones_wlock) - return -ENOMEM; - } - if (!md->zwp_offset) { - md->zwp_offset = - kvcalloc(disk->nr_zones, sizeof(unsigned int), - GFP_KERNEL); - if (!md->zwp_offset) - return -ENOMEM; - } - md->zwp_offset[idx] = dm_get_zone_wp_offset(zone); - - break; - default: - DMERR("Invalid zone type 0x%x at sectors %llu", - (int)zone->type, zone->start); - return -ENODEV; + /* Count conventional zones */ + md->zone_revalidate_map = t; + ret = dm_blk_report_zones(disk, 0, UINT_MAX, + dm_check_zoned_cb, &nr_conv_zones); + md->zone_revalidate_map = NULL; + if (ret < 0) { + DMERR("Check zoned failed %d", ret); + return ret; + } + + /* + * If we only have conventional zones, expose the mapped device as + * a regular device. + */ + if (nr_conv_zones >= ret) { + disk->queue->limits.max_open_zones = 0; + disk->queue->limits.max_active_zones = 0; + disk->queue->limits.zoned = false; + clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + disk->nr_zones = 0; } return 0; @@ -226,41 +200,32 @@ static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx, static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t) { struct gendisk *disk = md->disk; - unsigned int noio_flag; int ret; - /* - * Check if something changed. If yes, cleanup the current resources - * and reallocate everything. - */ + /* Revalidate only if something changed. */ if (!disk->nr_zones || disk->nr_zones != md->nr_zones) - dm_cleanup_zoned_dev(md); + md->nr_zones = 0; + if (md->nr_zones) return 0; /* - * Scan all zones to initialize everything. Ensure that all vmalloc - * operations in this context are done as if GFP_NOIO was specified. + * Our table is not live yet. So the call to dm_get_live_table() + * in dm_blk_report_zones() will fail. Set a temporary pointer to + * our table for dm_blk_report_zones() to use directly. 
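/*
 * A minimal userspace sketch of the callback-driven zone walk used by the
 * dm_check_zoned() hunk above: an iterator reports each zone to a callback,
 * the caller counts the conventional ones, and if every zone is conventional
 * the device is exposed as a regular (non-zoned) device. The zone array and
 * types here are stand-ins, not the block layer's report-zones interface.
 */
#include <stdio.h>

enum ztype { ZONE_CONV, ZONE_SEQ };
struct zone { enum ztype type; };

typedef int (*report_cb)(const struct zone *z, unsigned int idx, void *data);

static int report_zones(const struct zone *zones, unsigned int nr,
			report_cb cb, void *data)
{
	unsigned int i;
	int ret;

	for (i = 0; i < nr; i++) {
		ret = cb(&zones[i], i, data);
		if (ret)
			return ret;
	}
	return (int)nr;			/* number of zones reported */
}

static int count_conv_cb(const struct zone *z, unsigned int idx, void *data)
{
	unsigned int *nr_conv = data;

	if (z->type == ZONE_CONV)
		(*nr_conv)++;
	return 0;
}

int main(void)
{
	struct zone zones[] = { { ZONE_CONV }, { ZONE_SEQ }, { ZONE_SEQ } };
	unsigned int nr_conv = 0;
	int nr = report_zones(zones, 3, count_conv_cb, &nr_conv);

	printf("zones=%d conventional=%u -> %s\n", nr, nr_conv,
	       nr_conv >= (unsigned int)nr ? "expose as regular" : "keep zoned");
	return 0;
}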
*/ - noio_flag = memalloc_noio_save(); - ret = dm_blk_do_report_zones(md, t, 0, disk->nr_zones, - dm_zone_revalidate_cb, md); - memalloc_noio_restore(noio_flag); - if (ret < 0) - goto err; - if (ret != disk->nr_zones) { - ret = -EIO; - goto err; + md->zone_revalidate_map = t; + ret = blk_revalidate_disk_zones(disk); + md->zone_revalidate_map = NULL; + + if (ret) { + DMERR("Revalidate zones failed %d", ret); + return ret; } md->nr_zones = disk->nr_zones; return 0; - -err: - DMERR("Revalidate zones failed %d", ret); - dm_cleanup_zoned_dev(md); - return ret; } static int device_not_zone_append_capable(struct dm_target *ti, @@ -289,294 +254,40 @@ static bool dm_table_supports_zone_append(struct dm_table *t) int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q) { struct mapped_device *md = t->md; + int ret; /* - * For a zoned target, the number of zones should be updated for the - * correct value to be exposed in sysfs queue/nr_zones. + * Check if zone append is natively supported, and if not, set the + * mapped device queue as needing zone append emulation. */ WARN_ON_ONCE(queue_is_mq(q)); - md->disk->nr_zones = bdev_nr_zones(md->disk->part0); - - /* Check if zone append is natively supported */ if (dm_table_supports_zone_append(t)) { clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); - dm_cleanup_zoned_dev(md); - return 0; + } else { + set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); + blk_queue_max_zone_append_sectors(q, 0); } - /* - * Mark the mapped device as needing zone append emulation and - * initialize the emulation resources once the capacity is set. - */ - set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags); if (!get_capacity(md->disk)) return 0; - return dm_revalidate_zones(md, t); -} - -static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - unsigned int *wp_offset = data; - - *wp_offset = dm_get_zone_wp_offset(zone); - - return 0; -} - -static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno, - unsigned int *wp_ofst) -{ - sector_t sector = zno * bdev_zone_sectors(md->disk->part0); - unsigned int noio_flag; - struct dm_table *t; - int srcu_idx, ret; - - t = dm_get_live_table(md, &srcu_idx); - if (!t) - return -EIO; - - /* - * Ensure that all memory allocations in this context are done as if - * GFP_NOIO was specified. - */ - noio_flag = memalloc_noio_save(); - ret = dm_blk_do_report_zones(md, t, sector, 1, - dm_update_zone_wp_offset_cb, wp_ofst); - memalloc_noio_restore(noio_flag); - - dm_put_live_table(md, srcu_idx); - - if (ret != 1) - return -EIO; - - return 0; -} - -struct orig_bio_details { - enum req_op op; - unsigned int nr_sectors; -}; - -/* - * First phase of BIO mapping for targets with zone append emulation: - * check all BIO that change a zone writer pointer and change zone - * append operations into regular write operations. - */ -static bool dm_zone_map_bio_begin(struct mapped_device *md, - unsigned int zno, struct bio *clone) -{ - sector_t zsectors = bdev_zone_sectors(md->disk->part0); - unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); - - /* - * If the target zone is in an error state, recover by inspecting the - * zone to get its current write pointer position. Note that since the - * target zone is already locked, a BIO issuing context should never - * see the zone write in the DM_ZONE_UPDATING_WP_OFST state. 
- */ - if (zwp_offset == DM_ZONE_INVALID_WP_OFST) { - if (dm_update_zone_wp_offset(md, zno, &zwp_offset)) - return false; - WRITE_ONCE(md->zwp_offset[zno], zwp_offset); - } - - switch (bio_op(clone)) { - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_FINISH: - return true; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - /* Writes must be aligned to the zone write pointer */ - if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset) - return false; - break; - case REQ_OP_ZONE_APPEND: - /* - * Change zone append operations into a non-mergeable regular - * writes directed at the current write pointer position of the - * target zone. - */ - clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE | - (clone->bi_opf & (~REQ_OP_MASK)); - clone->bi_iter.bi_sector += zwp_offset; - break; - default: - DMWARN_LIMIT("Invalid BIO operation"); - return false; - } - - /* Cannot write to a full zone */ - if (zwp_offset >= zsectors) - return false; - - return true; -} - -/* - * Second phase of BIO mapping for targets with zone append emulation: - * update the zone write pointer offset array to account for the additional - * data written to a zone. Note that at this point, the remapped clone BIO - * may already have completed, so we do not touch it. - */ -static blk_status_t dm_zone_map_bio_end(struct mapped_device *md, unsigned int zno, - struct orig_bio_details *orig_bio_details, - unsigned int nr_sectors) -{ - unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]); - - /* The clone BIO may already have been completed and failed */ - if (zwp_offset == DM_ZONE_INVALID_WP_OFST) - return BLK_STS_IOERR; - - /* Update the zone wp offset */ - switch (orig_bio_details->op) { - case REQ_OP_ZONE_RESET: - WRITE_ONCE(md->zwp_offset[zno], 0); - return BLK_STS_OK; - case REQ_OP_ZONE_FINISH: - WRITE_ONCE(md->zwp_offset[zno], - bdev_zone_sectors(md->disk->part0)); - return BLK_STS_OK; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); - return BLK_STS_OK; - case REQ_OP_ZONE_APPEND: - /* - * Check that the target did not truncate the write operation - * emulating a zone append. - */ - if (nr_sectors != orig_bio_details->nr_sectors) { - DMWARN_LIMIT("Truncated write for zone append"); - return BLK_STS_IOERR; - } - WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors); - return BLK_STS_OK; - default: - DMWARN_LIMIT("Invalid BIO operation"); - return BLK_STS_IOERR; - } -} - -static inline void dm_zone_lock(struct gendisk *disk, unsigned int zno, - struct bio *clone) -{ - if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))) - return; - - wait_on_bit_lock_io(disk->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE); - bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED); -} - -static inline void dm_zone_unlock(struct gendisk *disk, unsigned int zno, - struct bio *clone) -{ - if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) - return; - - WARN_ON_ONCE(!test_bit(zno, disk->seq_zones_wlock)); - clear_bit_unlock(zno, disk->seq_zones_wlock); - smp_mb__after_atomic(); - wake_up_bit(disk->seq_zones_wlock, zno); - - bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED); -} - -static bool dm_need_zone_wp_tracking(struct bio *bio) -{ /* - * Special processing is not needed for operations that do not need the - * zone write lock, that is, all operations that target conventional - * zones and all operations that do not modify directly a sequential - * zone write pointer. + * Check that the mapped device will indeed be zoned, that is, that it + * has sequential write required zones. 
*/ - if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) - return false; - switch (bio_op(bio)) { - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_FINISH: - case REQ_OP_ZONE_APPEND: - return bio_zone_is_seq(bio); - default: - return false; - } -} - -/* - * Special IO mapping for targets needing zone append emulation. - */ -int dm_zone_map_bio(struct dm_target_io *tio) -{ - struct dm_io *io = tio->io; - struct dm_target *ti = tio->ti; - struct mapped_device *md = io->md; - struct bio *clone = &tio->clone; - struct orig_bio_details orig_bio_details; - unsigned int zno; - blk_status_t sts; - int r; - - /* - * IOs that do not change a zone write pointer do not need - * any additional special processing. - */ - if (!dm_need_zone_wp_tracking(clone)) - return ti->type->map(ti, clone); - - /* Lock the target zone */ - zno = bio_zone_no(clone); - dm_zone_lock(md->disk, zno, clone); - - orig_bio_details.nr_sectors = bio_sectors(clone); - orig_bio_details.op = bio_op(clone); + ret = dm_check_zoned(md, t); + if (ret) + return ret; + if (!blk_queue_is_zoned(q)) + return 0; - /* - * Check that the bio and the target zone write pointer offset are - * both valid, and if the bio is a zone append, remap it to a write. - */ - if (!dm_zone_map_bio_begin(md, zno, clone)) { - dm_zone_unlock(md->disk, zno, clone); - return DM_MAPIO_KILL; + if (!md->disk->nr_zones) { + DMINFO("%s using %s zone append", + md->disk->disk_name, + queue_emulates_zone_append(q) ? "emulated" : "native"); } - /* Let the target do its work */ - r = ti->type->map(ti, clone); - switch (r) { - case DM_MAPIO_SUBMITTED: - /* - * The target submitted the clone BIO. The target zone will - * be unlocked on completion of the clone. - */ - sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, - *tio->len_ptr); - break; - case DM_MAPIO_REMAPPED: - /* - * The target only remapped the clone BIO. In case of error, - * unlock the target zone here as the clone will not be - * submitted. - */ - sts = dm_zone_map_bio_end(md, zno, &orig_bio_details, - *tio->len_ptr); - if (sts != BLK_STS_OK) - dm_zone_unlock(md->disk, zno, clone); - break; - case DM_MAPIO_REQUEUE: - case DM_MAPIO_KILL: - default: - dm_zone_unlock(md->disk, zno, clone); - sts = BLK_STS_IOERR; - break; - } - - if (sts != BLK_STS_OK) - return DM_MAPIO_KILL; - - return r; + return dm_revalidate_zones(md, t); } /* @@ -587,61 +298,17 @@ void dm_zone_endio(struct dm_io *io, struct bio *clone) struct mapped_device *md = io->md; struct gendisk *disk = md->disk; struct bio *orig_bio = io->orig_bio; - unsigned int zwp_offset; - unsigned int zno; /* - * For targets that do not emulate zone append, we only need to - * handle native zone-append bios. + * Get the offset within the zone of the written sector + * and add that to the original bio sector position. */ - if (!dm_emulate_zone_append(md)) { - /* - * Get the offset within the zone of the written sector - * and add that to the original bio sector position. - */ - if (clone->bi_status == BLK_STS_OK && - bio_op(clone) == REQ_OP_ZONE_APPEND) { - sector_t mask = - (sector_t)bdev_zone_sectors(disk->part0) - 1; - - orig_bio->bi_iter.bi_sector += - clone->bi_iter.bi_sector & mask; - } - - return; - } + if (clone->bi_status == BLK_STS_OK && + bio_op(clone) == REQ_OP_ZONE_APPEND) { + sector_t mask = bdev_zone_sectors(disk->part0) - 1; - /* - * For targets that do emulate zone append, if the clone BIO does not - * own the target zone write lock, we have nothing to do. 
- */ - if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)) - return; - - zno = bio_zone_no(orig_bio); - - if (clone->bi_status != BLK_STS_OK) { - /* - * BIOs that modify a zone write pointer may leave the zone - * in an unknown state in case of failure (e.g. the write - * pointer was only partially advanced). In this case, set - * the target zone write pointer as invalid unless it is - * already being updated. - */ - WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST); - } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { - /* - * Get the written sector for zone append operation that were - * emulated using regular write operations. - */ - zwp_offset = READ_ONCE(md->zwp_offset[zno]); - if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio))) - WRITE_ONCE(md->zwp_offset[zno], - DM_ZONE_INVALID_WP_OFST); - else - orig_bio->bi_iter.bi_sector += - zwp_offset - bio_sectors(orig_bio); + orig_bio->bi_iter.bi_sector += clone->bi_iter.bi_sector & mask; } - dm_zone_unlock(disk, zno, clone); + return; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7d0746b37c8e..597dd7a25823 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1428,25 +1428,12 @@ static void __map_bio(struct bio *clone) down(&md->swap_bios_semaphore); } - if (static_branch_unlikely(&zoned_enabled)) { - /* - * Check if the IO needs a special mapping due to zone append - * emulation on zoned target. In this case, dm_zone_map_bio() - * calls the target map operation. - */ - if (unlikely(dm_emulate_zone_append(md))) - r = dm_zone_map_bio(tio); - else - goto do_map; - } else { -do_map: - if (likely(ti->type->map == linear_map)) - r = linear_map(ti, clone); - else if (ti->type->map == stripe_map) - r = stripe_map(ti, clone); - else - r = ti->type->map(ti, clone); - } + if (likely(ti->type->map == linear_map)) + r = linear_map(ti, clone); + else if (ti->type->map == stripe_map) + r = stripe_map(ti, clone); + else + r = ti->type->map(ti, clone); switch (r) { case DM_MAPIO_SUBMITTED: @@ -1774,6 +1761,33 @@ static void init_clone_info(struct clone_info *ci, struct dm_io *io, ci->sector_count = 0; } +#ifdef CONFIG_BLK_DEV_ZONED +static inline bool dm_zone_bio_needs_split(struct mapped_device *md, + struct bio *bio) +{ + /* + * For mapped device that need zone append emulation, we must + * split any large BIO that straddles zone boundaries. + */ + return dm_emulate_zone_append(md) && bio_straddles_zones(bio) && + !bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING); +} +static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) +{ + return dm_emulate_zone_append(md) && blk_zone_plug_bio(bio, 0); +} +#else +static inline bool dm_zone_bio_needs_split(struct mapped_device *md, + struct bio *bio) +{ + return false; +} +static inline bool dm_zone_plug_bio(struct mapped_device *md, struct bio *bio) +{ + return false; +} +#endif + /* * Entry point to split a bio into clones and submit them to the targets. */ @@ -1783,19 +1797,32 @@ static void dm_split_and_process_bio(struct mapped_device *md, struct clone_info ci; struct dm_io *io; blk_status_t error = BLK_STS_OK; - bool is_abnormal; + bool is_abnormal, need_split; + + need_split = is_abnormal = is_abnormal_io(bio); + if (static_branch_unlikely(&zoned_enabled)) + need_split = is_abnormal || dm_zone_bio_needs_split(md, bio); - is_abnormal = is_abnormal_io(bio); - if (unlikely(is_abnormal)) { + if (unlikely(need_split)) { /* * Use bio_split_to_limits() for abnormal IO (e.g. discard, etc) * otherwise associated queue_limits won't be imposed. 
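/*
 * A minimal userspace sketch of the "does this I/O straddle a zone boundary"
 * test that motivates the extra split added in dm_split_and_process_bio()
 * above. Zone sizes are assumed to be a power of two, so the zone number is
 * just a shift of the start sector; the names are illustrative, not the
 * block layer's helpers.
 */
#include <stdio.h>
#include <stdbool.h>

static bool straddles_zones(unsigned long long sector, unsigned int nr_sectors,
			    unsigned int zone_sectors_shift)
{
	unsigned long long first = sector >> zone_sectors_shift;
	unsigned long long last = (sector + nr_sectors - 1) >> zone_sectors_shift;

	return nr_sectors && first != last;
}

int main(void)
{
	/* 256 MiB zones of 512 B sectors -> 2^19 sectors per zone */
	unsigned int shift = 19;

	printf("%d\n", straddles_zones((1ULL << 19) - 8, 16, shift)); /* 1 */
	printf("%d\n", straddles_zones(0, 16, shift));                /* 0 */
	return 0;
}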
+ * Also split the BIO for mapped devices needing zone append + * emulation to ensure that the BIO does not cross zone + * boundaries. */ bio = bio_split_to_limits(bio); if (!bio) return; } + /* + * Use the block layer zone write plugging for mapped devices that + * need zone append emulation (e.g. dm-crypt). + */ + if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio)) + return; + /* Only support nowait for normal IO */ if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) { io = alloc_io(md, bio, GFP_NOWAIT); @@ -2016,7 +2043,6 @@ static void cleanup_mapped_device(struct mapped_device *md) md->dax_dev = NULL; } - dm_cleanup_zoned_dev(md); if (md->disk) { spin_lock(&_minor_lock); md->disk->private_data = NULL; diff --git a/drivers/md/dm.h b/drivers/md/dm.h index 7f1acbf6bd9e..e0c57f19839b 100644 --- a/drivers/md/dm.h +++ b/drivers/md/dm.h @@ -104,13 +104,11 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t); int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q); void dm_zone_endio(struct dm_io *io, struct bio *clone); #ifdef CONFIG_BLK_DEV_ZONED -void dm_cleanup_zoned_dev(struct mapped_device *md); int dm_blk_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); bool dm_is_zone_write(struct mapped_device *md, struct bio *bio); int dm_zone_map_bio(struct dm_target_io *io); #else -static inline void dm_cleanup_zoned_dev(struct mapped_device *md) {} #define dm_blk_report_zones NULL static inline bool dm_is_zone_write(struct mapped_device *md, struct bio *bio) { diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 059afc24c08b..0a2d37eb38ef 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1424,7 +1424,7 @@ __acquires(bitmap->lock) sector_t chunk = offset >> bitmap->chunkshift; unsigned long page = chunk >> PAGE_COUNTER_SHIFT; unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; - sector_t csize; + sector_t csize = ((sector_t)1) << bitmap->chunkshift; int err; if (page >= bitmap->pages) { @@ -1433,6 +1433,7 @@ __acquires(bitmap->lock) * End-of-device while looking for a whole page or * user set a huge number to sysfs bitmap_set_bits. 
*/ + *blocks = csize - (offset & (csize - 1)); return NULL; } err = md_bitmap_checkpage(bitmap, page, create, 0); @@ -1441,8 +1442,7 @@ __acquires(bitmap->lock) bitmap->bp[page].map == NULL) csize = ((sector_t)1) << (bitmap->chunkshift + PAGE_COUNTER_SHIFT); - else - csize = ((sector_t)1) << bitmap->chunkshift; + *blocks = csize - (offset & (csize - 1)); if (err < 0) diff --git a/drivers/md/md.c b/drivers/md/md.c index e575e74aabf5..aff9118ff697 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8087,7 +8087,8 @@ void md_wakeup_thread(struct md_thread __rcu *thread) if (t) { pr_debug("md: waking up MD thread %s.\n", t->tsk->comm); set_bit(THREAD_WAKEUP, &t->flags); - wake_up(&t->wqueue); + if (wq_has_sleeper(&t->wqueue)) + wake_up(&t->wqueue); } rcu_read_unlock(); } @@ -8582,6 +8583,10 @@ static int is_mddev_idle(struct mddev *mddev, int init) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_disk; + + if (!init && !blk_queue_io_stat(disk->queue)) + continue; + curr_events = (int)part_stat_read_accum(disk->part0, sectors) - atomic_read(&disk->sync_io); /* sync IO will cause sync_io to increase before the disk_stats diff --git a/drivers/md/md.h b/drivers/md/md.h index 097d9dbd69b8..ca085ecad504 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -621,7 +621,8 @@ extern void mddev_unlock(struct mddev *mddev); static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) { - atomic_add(nr_sectors, &bdev->bd_disk->sync_io); + if (blk_queue_io_stat(bdev->bd_disk->queue)) + atomic_add(nr_sectors, &bdev->bd_disk->sync_io); } static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index d874abfc1836..2bd1ce9b3922 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,7 +36,6 @@ */ #include -#include #include #include #include @@ -6734,6 +6733,9 @@ static void raid5d(struct md_thread *thread) int batch_size, released; unsigned int offset; + if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) + break; + released = release_stripe_list(conf, conf->temp_inactive_list); if (released) clear_bit(R5_DID_ALLOC, &conf->cache_state); @@ -6770,18 +6772,7 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); - - /* - * Waiting on MD_SB_CHANGE_PENDING below may deadlock - * seeing md_check_recovery() is needed to clear - * the flag when using mdmon. - */ - continue; } - - wait_event_lock_irq(mddev->sb_wait, - !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), - conf->device_lock); } pr_debug("%d stripes handled\n", handled); diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 095f59e7aa93..bf7615cb36ee 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2132,7 +2132,7 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { - ret = blk_revalidate_disk_zones(ns->disk, NULL); + ret = blk_revalidate_disk_zones(ns->disk); if (ret && !nvme_first_scan(ns->disk)) goto out; } diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 3148d9f1bde6..0021d06041c1 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -52,14 +52,10 @@ bool nvmet_bdev_zns_enable(struct nvmet_ns *ns) if (get_capacity(bd_disk) & (bdev_zone_sectors(ns->bdev) - 1)) return false; /* - * ZNS does not define a conventional zone type. 
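/*
 * A minimal sketch of the chunk arithmetic in the md-bitmap hunk above: with
 * a power-of-two chunk size (1 << chunkshift), the number of blocks from
 * @offset to the end of its chunk is csize - (offset & (csize - 1)), and the
 * fix reports that count even when no counter page is available. Plain
 * userspace arithmetic, not the md-bitmap API.
 */
#include <stdio.h>

static unsigned long long blocks_left_in_chunk(unsigned long long offset,
					       unsigned int chunkshift)
{
	unsigned long long csize = 1ULL << chunkshift;

	return csize - (offset & (csize - 1));
}

int main(void)
{
	/* 64-sector chunks: offset 70 is 6 sectors into its chunk */
	printf("%llu\n", blocks_left_in_chunk(70, 6));   /* 58 */
	printf("%llu\n", blocks_left_in_chunk(128, 6));  /* 64 */
	return 0;
}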
If the underlying - * device has a bitmap set indicating the existence of conventional - * zones, reject the device. Otherwise, use report zones to detect if - * the device has conventional zones. + * ZNS does not define a conventional zone type. Use report zones + * to detect if the device has conventional zones and reject it if + * it does. */ - if (ns->bdev->bd_disk->conv_zones_bitmap) - return false; - ret = blkdev_report_zones(ns->bdev, 0, bdev_nr_zones(ns->bdev), validate_conv_zones_cb, NULL); if (ret < 0) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 5b3230ef51fe..967b6d62bb37 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1869,7 +1869,6 @@ static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, case BLK_STS_OK: break; case BLK_STS_RESOURCE: - case BLK_STS_ZONE_RESOURCE: if (scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; break; diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 65cdc8b77e35..64c5129044b3 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1260,12 +1260,6 @@ static blk_status_t sd_setup_read_write_cmnd(struct scsi_cmnd *cmd) } } - if (req_op(rq) == REQ_OP_ZONE_APPEND) { - ret = sd_zbc_prepare_zone_append(cmd, &lba, nr_blocks); - if (ret) - goto fail; - } - fua = rq->cmd_flags & REQ_FUA ? 0x8 : 0; dix = scsi_prot_sg_count(cmd); dif = scsi_host_dif_capable(cmd->device->host, sdkp->protection_type); @@ -1348,7 +1342,6 @@ static blk_status_t sd_init_command(struct scsi_cmnd *cmd) return sd_setup_flush_cmnd(cmd); case REQ_OP_READ: case REQ_OP_WRITE: - case REQ_OP_ZONE_APPEND: return sd_setup_read_write_cmnd(cmd); case REQ_OP_ZONE_RESET: return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER, @@ -3981,7 +3974,6 @@ static void scsi_disk_release(struct device *dev) struct scsi_disk *sdkp = to_scsi_disk(dev); ida_free(&sd_index_ida, sdkp->index); - sd_zbc_free_zone_info(sdkp); put_device(&sdkp->device->sdev_gendev); free_opal_dev(sdkp->opal_dev); diff --git a/drivers/scsi/sd.h b/drivers/scsi/sd.h index 5c4285a582b2..49dd600bfa48 100644 --- a/drivers/scsi/sd.h +++ b/drivers/scsi/sd.h @@ -104,12 +104,6 @@ struct scsi_disk { * between zone starting LBAs is constant. 
*/ u32 zone_starting_lba_gran; - u32 *zones_wp_offset; - spinlock_t zones_wp_offset_lock; - u32 *rev_wp_offset; - struct mutex rev_mutex; - struct work_struct zone_wp_offset_work; - char *zone_wp_update_buf; #endif atomic_t openers; sector_t capacity; /* size in logical blocks */ @@ -245,7 +239,6 @@ static inline int sd_is_zoned(struct scsi_disk *sdkp) #ifdef CONFIG_BLK_DEV_ZONED -void sd_zbc_free_zone_info(struct scsi_disk *sdkp); int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, @@ -255,13 +248,8 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, - unsigned int nr_blocks); - #else /* CONFIG_BLK_DEV_ZONED */ -static inline void sd_zbc_free_zone_info(struct scsi_disk *sdkp) {} - static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) { return 0; @@ -285,13 +273,6 @@ static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, return good_bytes; } -static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, - sector_t *lba, - unsigned int nr_blocks) -{ - return BLK_STS_TARGET; -} - #define sd_zbc_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 26af5ab7d7c1..806036e48abe 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -23,36 +23,6 @@ #define CREATE_TRACE_POINTS #include "sd_trace.h" -/** - * sd_zbc_get_zone_wp_offset - Get zone write pointer offset. - * @zone: Zone for which to return the write pointer offset. - * - * Return: offset of the write pointer from the start of the zone. - */ -static unsigned int sd_zbc_get_zone_wp_offset(struct blk_zone *zone) -{ - if (zone->type == ZBC_ZONE_TYPE_CONV) - return 0; - - switch (zone->cond) { - case BLK_ZONE_COND_IMP_OPEN: - case BLK_ZONE_COND_EXP_OPEN: - case BLK_ZONE_COND_CLOSED: - return zone->wp - zone->start; - case BLK_ZONE_COND_FULL: - return zone->len; - case BLK_ZONE_COND_EMPTY: - case BLK_ZONE_COND_OFFLINE: - case BLK_ZONE_COND_READONLY: - default: - /* - * Offline and read-only zones do not have a valid - * write pointer. Use 0 as for an empty zone. - */ - return 0; - } -} - /* Whether or not a SCSI zone descriptor describes a gap zone. */ static bool sd_zbc_is_gap_zone(const u8 buf[64]) { @@ -121,9 +91,6 @@ static int sd_zbc_parse_report(struct scsi_disk *sdkp, const u8 buf[64], if (ret) return ret; - if (sdkp->rev_wp_offset) - sdkp->rev_wp_offset[idx] = sd_zbc_get_zone_wp_offset(&zone); - return 0; } @@ -347,123 +314,6 @@ static blk_status_t sd_zbc_cmnd_checks(struct scsi_cmnd *cmd) return BLK_STS_OK; } -#define SD_ZBC_INVALID_WP_OFST (~0u) -#define SD_ZBC_UPDATING_WP_OFST (SD_ZBC_INVALID_WP_OFST - 1) - -static int sd_zbc_update_wp_offset_cb(struct blk_zone *zone, unsigned int idx, - void *data) -{ - struct scsi_disk *sdkp = data; - - lockdep_assert_held(&sdkp->zones_wp_offset_lock); - - sdkp->zones_wp_offset[idx] = sd_zbc_get_zone_wp_offset(zone); - - return 0; -} - -/* - * An attempt to append a zone triggered an invalid write pointer error. - * Reread the write pointer of the zone(s) in which the append failed. 
- */ -static void sd_zbc_update_wp_offset_workfn(struct work_struct *work) -{ - struct scsi_disk *sdkp; - unsigned long flags; - sector_t zno; - int ret; - - sdkp = container_of(work, struct scsi_disk, zone_wp_offset_work); - - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - for (zno = 0; zno < sdkp->zone_info.nr_zones; zno++) { - if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) - continue; - - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - ret = sd_zbc_do_report_zones(sdkp, sdkp->zone_wp_update_buf, - SD_BUF_SIZE, - zno * sdkp->zone_info.zone_blocks, true); - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - if (!ret) - sd_zbc_parse_report(sdkp, sdkp->zone_wp_update_buf + 64, - zno, sd_zbc_update_wp_offset_cb, - sdkp); - } - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - - scsi_device_put(sdkp->device); -} - -/** - * sd_zbc_prepare_zone_append() - Prepare an emulated ZONE_APPEND command. - * @cmd: the command to setup - * @lba: the LBA to patch - * @nr_blocks: the number of LBAs to be written - * - * Called from sd_setup_read_write_cmnd() for REQ_OP_ZONE_APPEND. - * @sd_zbc_prepare_zone_append() handles the necessary zone wrote locking and - * patching of the lba for an emulated ZONE_APPEND command. - * - * In case the cached write pointer offset is %SD_ZBC_INVALID_WP_OFST it will - * schedule a REPORT ZONES command and return BLK_STS_IOERR. - */ -blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, - unsigned int nr_blocks) -{ - struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->q->disk); - unsigned int wp_offset, zno = blk_rq_zone_no(rq); - unsigned long flags; - blk_status_t ret; - - ret = sd_zbc_cmnd_checks(cmd); - if (ret != BLK_STS_OK) - return ret; - - if (!blk_rq_zone_is_seq(rq)) - return BLK_STS_IOERR; - - /* Unlock of the write lock will happen in sd_zbc_complete() */ - if (!blk_req_zone_write_trylock(rq)) - return BLK_STS_ZONE_RESOURCE; - - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - wp_offset = sdkp->zones_wp_offset[zno]; - switch (wp_offset) { - case SD_ZBC_INVALID_WP_OFST: - /* - * We are about to schedule work to update a zone write pointer - * offset, which will cause the zone append command to be - * requeued. So make sure that the scsi device does not go away - * while the work is being processed. - */ - if (scsi_device_get(sdkp->device)) { - ret = BLK_STS_IOERR; - break; - } - sdkp->zones_wp_offset[zno] = SD_ZBC_UPDATING_WP_OFST; - schedule_work(&sdkp->zone_wp_offset_work); - fallthrough; - case SD_ZBC_UPDATING_WP_OFST: - ret = BLK_STS_DEV_RESOURCE; - break; - default: - wp_offset = sectors_to_logical(sdkp->device, wp_offset); - if (wp_offset + nr_blocks > sdkp->zone_info.zone_blocks) { - ret = BLK_STS_IOERR; - break; - } - - trace_scsi_prepare_zone_append(cmd, *lba, wp_offset); - *lba += wp_offset; - } - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - if (ret) - blk_req_zone_write_unlock(rq); - return ret; -} - /** * sd_zbc_setup_zone_mgmt_cmnd - Prepare a zone ZBC_OUT command. The operations * can be RESET WRITE POINTER, OPEN, CLOSE or FINISH. 
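/*
 * A minimal userspace sketch of the emulation the removed sd_zbc code above
 * performed, and which this series moves into the block layer: a zone append
 * is turned into a plain write at the zone start plus a cached write-pointer
 * offset, and the offset advances by the number of blocks written. All names
 * here are illustrative; this is not the SCSI or block layer API.
 */
#include <stdio.h>

struct zone_state {
	unsigned long long start;	/* first LBA of the zone */
	unsigned long long capacity;	/* writable blocks in the zone */
	unsigned long long wp_ofst;	/* cached write pointer offset */
};

/* Patch @lba for an emulated zone append; return 0 on success, -1 if full. */
static int emulate_zone_append(struct zone_state *z, unsigned long long *lba,
			       unsigned int nr_blocks)
{
	if (z->wp_ofst + nr_blocks > z->capacity)
		return -1;		/* write would exceed the zone */

	*lba = z->start + z->wp_ofst;	/* where the append actually lands */
	z->wp_ofst += nr_blocks;	/* advance on successful completion */
	return 0;
}

int main(void)
{
	struct zone_state z = { .start = 1024, .capacity = 256, .wp_ofst = 16 };
	unsigned long long lba = 0;

	if (!emulate_zone_append(&z, &lba, 8))
		printf("append written at LBA %llu, new wp offset %llu\n",
		       lba, z.wp_ofst);
	return 0;
}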
@@ -504,96 +354,6 @@ blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, return BLK_STS_OK; } -static bool sd_zbc_need_zone_wp_update(struct request *rq) -{ - switch (req_op(rq)) { - case REQ_OP_ZONE_APPEND: - case REQ_OP_ZONE_FINISH: - case REQ_OP_ZONE_RESET: - case REQ_OP_ZONE_RESET_ALL: - return true; - case REQ_OP_WRITE: - case REQ_OP_WRITE_ZEROES: - return blk_rq_zone_is_seq(rq); - default: - return false; - } -} - -/** - * sd_zbc_zone_wp_update - Update cached zone write pointer upon cmd completion - * @cmd: Completed command - * @good_bytes: Command reply bytes - * - * Called from sd_zbc_complete() to handle the update of the cached zone write - * pointer value in case an update is needed. - */ -static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd, - unsigned int good_bytes) -{ - int result = cmd->result; - struct request *rq = scsi_cmd_to_rq(cmd); - struct scsi_disk *sdkp = scsi_disk(rq->q->disk); - unsigned int zno = blk_rq_zone_no(rq); - enum req_op op = req_op(rq); - unsigned long flags; - - /* - * If we got an error for a command that needs updating the write - * pointer offset cache, we must mark the zone wp offset entry as - * invalid to force an update from disk the next time a zone append - * command is issued. - */ - spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags); - - if (result && op != REQ_OP_ZONE_RESET_ALL) { - if (op == REQ_OP_ZONE_APPEND) { - /* Force complete completion (no retry) */ - good_bytes = 0; - scsi_set_resid(cmd, blk_rq_bytes(rq)); - } - - /* - * Force an update of the zone write pointer offset on - * the next zone append access. - */ - if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST) - sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST; - goto unlock_wp_offset; - } - - switch (op) { - case REQ_OP_ZONE_APPEND: - trace_scsi_zone_wp_update(cmd, rq->__sector, - sdkp->zones_wp_offset[zno], good_bytes); - rq->__sector += sdkp->zones_wp_offset[zno]; - fallthrough; - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE: - if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp)) - sdkp->zones_wp_offset[zno] += - good_bytes >> SECTOR_SHIFT; - break; - case REQ_OP_ZONE_RESET: - sdkp->zones_wp_offset[zno] = 0; - break; - case REQ_OP_ZONE_FINISH: - sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp); - break; - case REQ_OP_ZONE_RESET_ALL: - memset(sdkp->zones_wp_offset, 0, - sdkp->zone_info.nr_zones * sizeof(unsigned int)); - break; - default: - break; - } - -unlock_wp_offset: - spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags); - - return good_bytes; -} - /** * sd_zbc_complete - ZBC command post processing. * @cmd: Completed command @@ -619,11 +379,7 @@ unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, * so be quiet about the error. 
*/ rq->rq_flags |= RQF_QUIET; - } else if (sd_zbc_need_zone_wp_update(rq)) - good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes); - - if (req_op(rq) == REQ_OP_ZONE_APPEND) - blk_req_zone_write_unlock(rq); + } return good_bytes; } @@ -780,46 +536,6 @@ static void sd_zbc_print_zones(struct scsi_disk *sdkp) sdkp->zone_info.zone_blocks); } -static int sd_zbc_init_disk(struct scsi_disk *sdkp) -{ - sdkp->zones_wp_offset = NULL; - spin_lock_init(&sdkp->zones_wp_offset_lock); - sdkp->rev_wp_offset = NULL; - mutex_init(&sdkp->rev_mutex); - INIT_WORK(&sdkp->zone_wp_offset_work, sd_zbc_update_wp_offset_workfn); - sdkp->zone_wp_update_buf = kzalloc(SD_BUF_SIZE, GFP_KERNEL); - if (!sdkp->zone_wp_update_buf) - return -ENOMEM; - - return 0; -} - -void sd_zbc_free_zone_info(struct scsi_disk *sdkp) -{ - if (!sdkp->zone_wp_update_buf) - return; - - /* Serialize against revalidate zones */ - mutex_lock(&sdkp->rev_mutex); - - kvfree(sdkp->zones_wp_offset); - sdkp->zones_wp_offset = NULL; - kfree(sdkp->zone_wp_update_buf); - sdkp->zone_wp_update_buf = NULL; - - sdkp->early_zone_info = (struct zoned_disk_info){ }; - sdkp->zone_info = (struct zoned_disk_info){ }; - - mutex_unlock(&sdkp->rev_mutex); -} - -static void sd_zbc_revalidate_zones_cb(struct gendisk *disk) -{ - struct scsi_disk *sdkp = scsi_disk(disk); - - swap(sdkp->zones_wp_offset, sdkp->rev_wp_offset); -} - /* * Call blk_revalidate_disk_zones() if any of the zoned disk properties have * changed that make it necessary to call that function. Called by @@ -831,18 +547,8 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) struct request_queue *q = disk->queue; u32 zone_blocks = sdkp->early_zone_info.zone_blocks; unsigned int nr_zones = sdkp->early_zone_info.nr_zones; - int ret = 0; unsigned int flags; - - /* - * For all zoned disks, initialize zone append emulation data if not - * already done. - */ - if (sd_is_zoned(sdkp) && !sdkp->zone_wp_update_buf) { - ret = sd_zbc_init_disk(sdkp); - if (ret) - return ret; - } + int ret; /* * There is nothing to do for regular disks, including host-aware disks @@ -851,50 +557,32 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) if (!blk_queue_is_zoned(q)) return 0; - /* - * Make sure revalidate zones are serialized to ensure exclusive - * updates of the scsi disk data. 
- */ - mutex_lock(&sdkp->rev_mutex); - if (sdkp->zone_info.zone_blocks == zone_blocks && sdkp->zone_info.nr_zones == nr_zones && disk->nr_zones == nr_zones) - goto unlock; + return 0; - flags = memalloc_noio_save(); sdkp->zone_info.zone_blocks = zone_blocks; sdkp->zone_info.nr_zones = nr_zones; - sdkp->rev_wp_offset = kvcalloc(nr_zones, sizeof(u32), GFP_KERNEL); - if (!sdkp->rev_wp_offset) { - ret = -ENOMEM; - memalloc_noio_restore(flags); - goto unlock; - } blk_queue_chunk_sectors(q, logical_to_sectors(sdkp->device, zone_blocks)); - blk_queue_max_zone_append_sectors(q, - q->limits.max_segments << PAGE_SECTORS_SHIFT); - ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb); + /* Enable block layer zone append emulation */ + blk_queue_max_zone_append_sectors(q, 0); + flags = memalloc_noio_save(); + ret = blk_revalidate_disk_zones(disk); memalloc_noio_restore(flags); - kvfree(sdkp->rev_wp_offset); - sdkp->rev_wp_offset = NULL; - if (ret) { sdkp->zone_info = (struct zoned_disk_info){ }; sdkp->capacity = 0; - goto unlock; + return ret; } sd_zbc_print_zones(sdkp); -unlock: - mutex_unlock(&sdkp->rev_mutex); - - return ret; + return 0; } /** @@ -917,10 +605,8 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) if (!sd_is_zoned(sdkp)) { /* * Device managed or normal SCSI disk, no special handling - * required. Nevertheless, free the disk zone information in - * case the device type changed. + * required. */ - sd_zbc_free_zone_info(sdkp); return 0; } @@ -941,7 +627,6 @@ int sd_zbc_read_zones(struct scsi_disk *sdkp, u8 buf[SD_BUF_SIZE]) /* The drive satisfies the kernel restrictions: set it up */ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); if (sdkp->zones_max_open == U32_MAX) disk_set_max_open_zones(disk, 0); else diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6f4a9cfeea44..831fac45e70f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -331,12 +331,11 @@ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) static void merge_rbio(struct btrfs_raid_bio *dest, struct btrfs_raid_bio *victim) { - bio_list_merge(&dest->bio_list, &victim->bio_list); + bio_list_merge_init(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; /* Also inherit the bitmaps from @victim. 
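The reworked sd_zbc_revalidate_zones() above is also the template for any driver that wants the block layer, rather than the driver itself, to emulate zone append after this series: advertise a zero max_zone_append_sectors limit and call the new single-argument blk_revalidate_disk_zones(). A sketch under that assumption; my_zoned_revalidate() and its arguments are illustrative, the calls mirror the hunk above.

#include <linux/blkdev.h>
#include <linux/sched/mm.h>

static int my_zoned_revalidate(struct gendisk *disk, unsigned int zone_sectors)
{
	struct request_queue *q = disk->queue;
	unsigned int noio_flags;
	int ret;

	blk_queue_chunk_sectors(q, zone_sectors);

	/*
	 * A zero limit means "no native zone append": the block layer
	 * emulates REQ_OP_ZONE_APPEND using regular writes.
	 */
	blk_queue_max_zone_append_sectors(q, 0);

	/* Zone revalidation allocates memory; avoid I/O recursion. */
	noio_flags = memalloc_noio_save();
	ret = blk_revalidate_disk_zones(disk);	/* new single-argument form */
	memalloc_noio_restore(noio_flags);

	return ret;
}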
*/ bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, dest->stripe_nsectors); - bio_list_init(&victim->bio_list); } /* diff --git a/include/linux/bio.h b/include/linux/bio.h index 875d792bffff..d5379548d684 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -615,6 +615,13 @@ static inline void bio_list_merge(struct bio_list *bl, struct bio_list *bl2) bl->tail = bl2->tail; } +static inline void bio_list_merge_init(struct bio_list *bl, + struct bio_list *bl2) +{ + bio_list_merge(bl, bl2); + bio_list_init(bl2); +} + static inline void bio_list_merge_head(struct bio_list *bl, struct bio_list *bl2) { @@ -824,5 +831,9 @@ static inline void bio_clear_polled(struct bio *bio) struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, unsigned int nr_pages, blk_opf_t opf, gfp_t gfp); +struct bio *bio_chain_and_submit(struct bio *prev, struct bio *new); + +struct bio *blk_alloc_discard_bio(struct block_device *bdev, + sector_t *sector, sector_t *nr_sects, gfp_t gfp_mask); #endif /* __LINUX_BIO_H */ diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d3d8fd8e229b..89ba6b16fe8b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -54,8 +54,8 @@ typedef __u32 __bitwise req_flags_t; /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) -/* The per-zone write lock is held for this request */ -#define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) +/* The request completion needs to be signaled to zone write pluging. */ +#define RQF_ZONE_WRITE_PLUGGING ((__force req_flags_t)(1 << 20)) /* ->timeout has been called, don't expire again */ #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) #define RQF_RESV ((__force req_flags_t)(1 << 23)) @@ -1150,85 +1150,4 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, } void blk_dump_rq_flags(struct request *, char *); -#ifdef CONFIG_BLK_DEV_ZONED -static inline unsigned int blk_rq_zone_no(struct request *rq) -{ - return disk_zone_no(rq->q->disk, blk_rq_pos(rq)); -} - -static inline unsigned int blk_rq_zone_is_seq(struct request *rq) -{ - return disk_zone_is_seq(rq->q->disk, blk_rq_pos(rq)); -} - -/** - * blk_rq_is_seq_zoned_write() - Check if @rq requires write serialization. - * @rq: Request to examine. - * - * Note: REQ_OP_ZONE_APPEND requests do not require serialization. 
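The new bio_list_merge_init() helper added to bio.h above simply pairs bio_list_merge() with re-initializing the source list, which is what call sites such as the raid56 hunk open coded. A small sketch of the usual pattern it is meant for, splicing a shared pending list into a local one under a lock; flush_pending_bios() and the lock are illustrative.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/spinlock.h>

static void flush_pending_bios(struct bio_list *pending, spinlock_t *lock)
{
	struct bio_list local;
	struct bio *bio;

	bio_list_init(&local);

	spin_lock_irq(lock);
	/* Was: bio_list_merge(&local, pending); bio_list_init(pending); */
	bio_list_merge_init(&local, pending);
	spin_unlock_irq(lock);

	/* Resubmit the spliced bios outside the lock. */
	while ((bio = bio_list_pop(&local)))
		submit_bio_noacct(bio);
}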
- */ -static inline bool blk_rq_is_seq_zoned_write(struct request *rq) -{ - return op_needs_zoned_write_locking(req_op(rq)) && - blk_rq_zone_is_seq(rq); -} - -bool blk_req_needs_zone_write_lock(struct request *rq); -bool blk_req_zone_write_trylock(struct request *rq); -void __blk_req_zone_write_lock(struct request *rq); -void __blk_req_zone_write_unlock(struct request *rq); - -static inline void blk_req_zone_write_lock(struct request *rq) -{ - if (blk_req_needs_zone_write_lock(rq)) - __blk_req_zone_write_lock(rq); -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ - if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) - __blk_req_zone_write_unlock(rq); -} - -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return rq->q->disk->seq_zones_wlock && - test_bit(blk_rq_zone_no(rq), rq->q->disk->seq_zones_wlock); -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - if (!blk_req_needs_zone_write_lock(rq)) - return true; - return !blk_req_zone_is_write_locked(rq); -} -#else /* CONFIG_BLK_DEV_ZONED */ -static inline bool blk_rq_is_seq_zoned_write(struct request *rq) -{ - return false; -} - -static inline bool blk_req_needs_zone_write_lock(struct request *rq) -{ - return false; -} - -static inline void blk_req_zone_write_lock(struct request *rq) -{ -} - -static inline void blk_req_zone_write_unlock(struct request *rq) -{ -} -static inline bool blk_req_zone_is_write_locked(struct request *rq) -{ - return false; -} - -static inline bool blk_req_can_dispatch_to_zone(struct request *rq) -{ - return true; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - #endif /* BLK_MQ_H */ diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index c3e098b21c16..25dbf1097085 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -130,18 +130,6 @@ typedef u16 blk_short_t; */ #define BLK_STS_DEV_RESOURCE ((__force blk_status_t)13) -/* - * BLK_STS_ZONE_RESOURCE is returned from the driver to the block layer if zone - * related resources are unavailable, but the driver can guarantee the queue - * will be rerun in the future once the resources become available again. - * - * This is different from BLK_STS_DEV_RESOURCE in that it explicitly references - * a zone specific resource and IO to a different zone on the same device could - * still be served. Examples of that are zones that are write-locked, but a read - * to the same zone could be served. - */ -#define BLK_STS_ZONE_RESOURCE ((__force blk_status_t)14) - /* * BLK_STS_ZONE_OPEN_RESOURCE is returned from the driver in the completion * path if the device returns a status indicating that too many zone resources @@ -149,7 +137,7 @@ typedef u16 blk_short_t; * after the number of open zones decreases below the device's limits, which is * reported in the request_queue's max_open_zones. */ -#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)15) +#define BLK_STS_ZONE_OPEN_RESOURCE ((__force blk_status_t)14) /* * BLK_STS_ZONE_ACTIVE_RESOURCE is returned from the driver in the completion @@ -158,20 +146,20 @@ typedef u16 blk_short_t; * after the number of active zones decreases below the device's limits, which * is reported in the request_queue's max_active_zones. */ -#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)16) +#define BLK_STS_ZONE_ACTIVE_RESOURCE ((__force blk_status_t)15) /* * BLK_STS_OFFLINE is returned from the driver when the target device is offline * or is being taken offline. 
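With the per-zone write lock helpers and BLK_STS_ZONE_RESOURCE removed above, a blk-mq driver's ->queue_rq() has no zone ordering work left to do: write ordering is now enforced by zone write plugging before dispatch. A before/after sketch from the driver's point of view; my_queue_rq() is illustrative and the hardware handoff is elided.

#include <linux/blk-mq.h>

static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
				const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	/*
	 * Pre-series dispatch hook for zoned devices:
	 *
	 *	if (!blk_req_can_dispatch_to_zone(rq))
	 *		return BLK_STS_ZONE_RESOURCE;
	 *	blk_req_zone_write_lock(rq);
	 *
	 * Nothing zone specific is needed here anymore.
	 */
	blk_mq_start_request(rq);
	/* ... hand the request to the hardware ... */
	return BLK_STS_OK;
}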
This could help differentiate the case where a * device is intentionally being shut down from a real I/O error. */ -#define BLK_STS_OFFLINE ((__force blk_status_t)17) +#define BLK_STS_OFFLINE ((__force blk_status_t)16) /* * BLK_STS_DURATION_LIMIT is returned from the driver when the target device * aborted the command because it exceeded one of its Command Duration Limits. */ -#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)18) +#define BLK_STS_DURATION_LIMIT ((__force blk_status_t)17) /** * blk_path_error - returns true if error may be path related @@ -228,7 +216,12 @@ struct bio { struct bvec_iter bi_iter; - blk_qc_t bi_cookie; + union { + /* for polled bios: */ + blk_qc_t bi_cookie; + /* for plugged zoned writes only: */ + unsigned int __bi_nr_segments; + }; bio_end_io_t *bi_end_io; void *bi_private; #ifdef CONFIG_BLK_CGROUP @@ -298,7 +291,8 @@ enum { BIO_QOS_THROTTLED, /* bio went through rq_qos throttle path */ BIO_QOS_MERGED, /* but went through rq_qos merge path */ BIO_REMAPPED, - BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */ + BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */ + BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */ BIO_FLAG_LAST }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 69e7da33ca49..69c4f113db42 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -179,22 +179,21 @@ struct gendisk { #ifdef CONFIG_BLK_DEV_ZONED /* - * Zoned block device information for request dispatch control. - * nr_zones is the total number of zones of the device. This is always - * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones - * bits which indicates if a zone is conventional (bit set) or - * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones - * bits which indicates if a zone is write locked, that is, if a write - * request targeting the zone was dispatched. - * - * Reads of this information must be protected with blk_queue_enter() / - * blk_queue_exit(). Modifying this information is only allowed while - * no requests are being processed. See also blk_mq_freeze_queue() and - * blk_mq_unfreeze_queue(). + * Zoned block device information. Reads of this information must be + * protected with blk_queue_enter() / blk_queue_exit(). Modifying this + * information is only allowed while no requests are being processed. + * See also blk_mq_freeze_queue() and blk_mq_unfreeze_queue(). */ unsigned int nr_zones; + unsigned int zone_capacity; unsigned long *conv_zones_bitmap; - unsigned long *seq_zones_wlock; + unsigned int zone_wplugs_hash_bits; + spinlock_t zone_wplugs_lock; + struct mempool_s *zone_wplugs_pool; + struct hlist_head *zone_wplugs_hash; + struct list_head zone_wplugs_err_list; + struct work_struct zone_wplugs_work; + struct workqueue_struct *zone_wplugs_wq; #endif /* CONFIG_BLK_DEV_ZONED */ #if IS_ENABLED(CONFIG_CDROM) @@ -233,6 +232,19 @@ static inline unsigned int disk_openers(struct gendisk *disk) return atomic_read(&disk->part0->bd_openers); } +/** + * disk_has_partscan - return %true if partition scanning is enabled on a disk + * @disk: disk to check + * + * Returns %true if partitions scanning is enabled for @disk, or %false if + * partition scanning is disabled either permanently or temporarily. 
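The new gendisk fields above describe the zone write plug machinery: a hash table keyed by zone number whose entries come from zone_wplugs_pool, protected by zone_wplugs_lock, with an error list and work item for recovery. The actual plug structure and lookup code are not part of this hunk; the sketch below only illustrates how such a hash is typically walked, with a hypothetical my_zone_wplug entry, a made-up lookup helper, and locking elided.

#include <linux/blkdev.h>
#include <linux/hash.h>
#include <linux/list.h>

struct my_zone_wplug {			/* hypothetical plug entry */
	struct hlist_node node;
	unsigned int zone_no;
};

/* Find the plug for a zone, if any; take zone_wplugs_lock (or RCU) in real code. */
static struct my_zone_wplug *my_get_zone_wplug(struct gendisk *disk,
					       unsigned int zone_no)
{
	unsigned int idx = hash_32(zone_no, disk->zone_wplugs_hash_bits);
	struct my_zone_wplug *zwplug;

	hlist_for_each_entry(zwplug, &disk->zone_wplugs_hash[idx], node)
		if (zwplug->zone_no == zone_no)
			return zwplug;

	return NULL;
}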
+ */ +static inline bool disk_has_partscan(struct gendisk *disk) +{ + return !(disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) && + !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state); +} + /* * The gendisk is refcounted by the part0 block_device, and the bd_device * therein is also used for device model presentation in sysfs. @@ -331,8 +343,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, sector_t sectors, sector_t nr_sectors); -int blk_revalidate_disk_zones(struct gendisk *disk, - void (*update_driver_data)(struct gendisk *disk)); +int blk_revalidate_disk_zones(struct gendisk *disk); /* * Independent access ranges: struct blk_independent_access_range describes @@ -449,8 +460,6 @@ struct request_queue { atomic_t nr_active_requests_shared_tags; - unsigned int required_elevator_features; - struct blk_mq_tags *sched_shared_tags; struct list_head icq_list; @@ -633,15 +642,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - if (!blk_queue_is_zoned(disk->queue)) - return false; - if (!disk->conv_zones_bitmap) - return true; - return !test_bit(disk_zone_no(disk, sector), disk->conv_zones_bitmap); -} - static inline void disk_set_max_open_zones(struct gendisk *disk, unsigned int max_open_zones) { @@ -664,6 +664,7 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) return bdev->bd_disk->queue->limits.max_active_zones; } +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int bdev_nr_zones(struct block_device *bdev) { @@ -674,10 +675,6 @@ static inline unsigned int disk_nr_zones(struct gendisk *disk) { return 0; } -static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) -{ - return false; -} static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { return 0; @@ -691,6 +688,10 @@ static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { return 0; } +static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + return false; +} #endif /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_depth(struct request_queue *q) @@ -855,9 +856,11 @@ static inline unsigned int bio_zone_no(struct bio *bio) return disk_zone_no(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); } -static inline unsigned int bio_zone_is_seq(struct bio *bio) +static inline bool bio_straddles_zones(struct bio *bio) { - return disk_zone_is_seq(bio->bi_bdev->bd_disk, bio->bi_iter.bi_sector); + return bio_sectors(bio) && + bio_zone_no(bio) != + disk_zone_no(bio->bi_bdev->bd_disk, bio_end_sector(bio) - 1); } /* @@ -942,14 +945,6 @@ disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges); void disk_set_independent_access_ranges(struct gendisk *disk, struct blk_independent_access_ranges *iars); -/* - * Elevator features for blk_queue_required_elevator_features: - */ -/* Supports zoned block devices sequential write constraint */ -#define ELEVATOR_F_ZBD_SEQ_WRITE (1U << 0) - -extern void blk_queue_required_elevator_features(struct request_queue *q, - unsigned int features); extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, struct device *dev); @@ -1156,12 +1151,29 @@ static inline unsigned int 
queue_max_segment_size(const struct request_queue *q) return q->limits.max_segment_size; } -static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) +static inline unsigned int queue_limits_max_zone_append_sectors(struct queue_limits *l) { + unsigned int max_sectors = min(l->chunk_sectors, l->max_hw_sectors); - const struct queue_limits *l = &q->limits; + return min_not_zero(l->max_zone_append_sectors, max_sectors); +} + +static inline unsigned int queue_max_zone_append_sectors(struct request_queue *q) +{ + if (!blk_queue_is_zoned(q)) + return 0; + + return queue_limits_max_zone_append_sectors(&q->limits); +} + +static inline bool queue_emulates_zone_append(struct request_queue *q) +{ + return blk_queue_is_zoned(q) && !q->limits.max_zone_append_sectors; +} - return min(l->max_zone_append_sectors, l->max_sectors); +static inline bool bdev_emulates_zone_append(struct block_device *bdev) +{ + return queue_emulates_zone_append(bdev_get_queue(bdev)); } static inline unsigned int @@ -1303,18 +1315,6 @@ static inline unsigned int bdev_zone_no(struct block_device *bdev, sector_t sec) return disk_zone_no(bdev->bd_disk, sec); } -/* Whether write serialization is required for @op on zoned devices. */ -static inline bool op_needs_zoned_write_locking(enum req_op op) -{ - return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES; -} - -static inline bool bdev_op_is_zoned_write(struct block_device *bdev, - enum req_op op) -{ - return bdev_is_zoned(bdev) && op_needs_zoned_write_locking(op); -} - static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -1330,6 +1330,12 @@ static inline sector_t bdev_offset_from_zone_start(struct block_device *bdev, return sector & (bdev_zone_sectors(bdev) - 1); } +static inline sector_t bio_offset_from_zone_start(struct bio *bio) +{ + return bdev_offset_from_zone_start(bio->bi_bdev, + bio->bi_iter.bi_sector); +} + static inline bool bdev_is_zone_start(struct block_device *bdev, sector_t sector) { diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 92c6b1fd8989..1e453f825c05 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -494,18 +494,18 @@ unsigned long __sbitmap_queue_get_batch(struct sbitmap_queue *sbq, int nr_tags, struct sbitmap_word *map = &sb->map[index]; unsigned long get_mask; unsigned int map_depth = __map_depth(sb, index); + unsigned long val; sbitmap_deferred_clear(map); - if (map->word == (1UL << (map_depth - 1)) - 1) + val = READ_ONCE(map->word); + if (val == (1UL << (map_depth - 1)) - 1) goto next; - nr = find_first_zero_bit(&map->word, map_depth); + nr = find_first_zero_bit(&val, map_depth); if (nr + nr_tags <= map_depth) { atomic_long_t *ptr = (atomic_long_t *) &map->word; - unsigned long val; get_mask = ((1UL << nr_tags) - 1) << nr; - val = READ_ONCE(map->word); while (!atomic_long_try_cmpxchg(ptr, &val, get_mask | val)) ;
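With the reworked zone append helpers in the blkdev.h hunk above, a zoned queue that advertises no native zone append limit is reported as emulated, while queue_max_zone_append_sectors() still returns a usable value derived from chunk_sectors and max_hw_sectors via queue_limits_max_zone_append_sectors(). A small sketch that prints what a given block device reports; report_zone_append_caps() is an illustrative name, not part of this series.

#include <linux/blkdev.h>
#include <linux/printk.h>

static void report_zone_append_caps(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (!bdev_is_zoned(bdev)) {
		pr_info("%pg: not zoned\n", bdev);
		return;
	}

	/*
	 * Emulated or native, callers get a non-zero per-append limit
	 * derived from chunk_sectors and max_hw_sectors.
	 */
	pr_info("%pg: zone append %s, up to %u sectors per append\n",
		bdev, bdev_emulates_zone_append(bdev) ? "emulated" : "native",
		queue_max_zone_append_sectors(q));
}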