for-5.5/block-20191121

-----BEGIN PGP SIGNATURE----- iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAl3WxrEQHGF4Ym9lQGtl cm5lbC5kawAKCRD301j7KXHgpuH5D/9qQKfIIuQDUNO4Xx+dIHimTDCrfiEOeO9e CRaMuSj+yMxLDMwfX8RnDmR17H3ZVoiIY1CT24U9ZkA5iDjeAH4xmzkH30US7LR7 /64YVZTxB0OrWppRK8RiIhaJJZDQ6+HPUQsn6PRaLVuFHi2unMoTQnj/ZQKz03QA Pl8Xx7qBtH1JwYCzQ21f/uryAcNg9eWabRLN2f1uiOXLmvRxOfh6Z/iaezlaZlmL qeJdcdLjjvOgOPwEOfNjfS6pd+XBz3gdEhn0l+11nHITxWZmVBwsWTKyUQlCmKnl yuCWDVyx5d6zCnlrLYG0l2Fn2lr9SwAkdkq3YAKV03hA/6s6P9q9bm31VvOf828x 7gmr4YVz68y7H9bM0QAHCvDpjll0aIEUw6XFzSOCDtZ9B6/pppYQWzMU71J05eyF 8DOKv2M2EVNLUjf6u0RDyolnWGU0kIjt5ryWE3OsGcezAVa2wYstgUJTKbrn1YgT j+4KTpaI+sg8GKDFauvxcSa6gwoRp6jweFNW+7vC090/shXmrGmVLOnQZKRuHho/ O4W8y/1/deM8CCIAETpiNxA8RV5U/EZygrFGDFc7yzTtVDGHY356M/B4Bmm2qkVu K3WgeZp8Fc0lH0QF6Pp9ZlBkZEpGNCAPVsPkXIsxQXbctftkn3KY//uIubfpFEB1 PpHSicvkww== =HYYq -----END PGP SIGNATURE----- Merge tag 'for-5.5/block-20191121' of git://git.kernel.dk/linux-block Pull core block updates from Jens Axboe: "Due to more granular branches, this one is small and will be followed with other core branches that add specific features. I meant to just have a core and drivers branch, but external dependencies we ended up adding a few more that are also core. The changes are: - Fixes and improvements for the zoned device support (Ajay, Damien) - sed-opal table writing and datastore UID (Revanth) - blk-cgroup (and bfq) blk-cgroup stat fixes (Tejun) - Improvements to the block stats tracking (Pavel) - Fix for overruning sysfs buffer for large number of CPUs (Ming) - Optimization for small IO (Ming, Christoph) - Fix typo in RWH lifetime hint (Eugene) - Dead code removal and documentation (Bart) - Reduction in memory usage for queue and tag set (Bart) - Kerneldoc header documentation (André) - Device/partition revalidation fixes (Jan) - Stats tracking for flush requests (Konstantin) - Various other little fixes here and there (et al)" * tag 'for-5.5/block-20191121' of git://git.kernel.dk/linux-block: (48 commits) Revert "block: split bio if the only bvec's length is > SZ_4K" block: add iostat counters for flush requests block,bfq: Skip tracing hooks if possible block: sed-opal: Introduce SUM_SET_LIST parameter and append it using 'add_token_u64' blk-cgroup: cgroup_rstat_updated() shouldn't be called on cgroup1 block: Don't disable interrupts in trigger_softirq() sbitmap: Delete sbitmap_any_bit_clear() blk-mq: Delete blk_mq_has_free_tags() and blk_mq_can_queue() block: split bio if the only bvec's length is > SZ_4K block: still try to split bio if the bvec crosses pages blk-cgroup: separate out blkg_rwstat under CONFIG_BLK_CGROUP_RWSTAT blk-cgroup: reimplement basic IO stats using cgroup rstat blk-cgroup: remove now unused blkg_print_stat_{bytes|ios}_recursive() blk-throtl: stop using blkg->stat_bytes and ->stat_ios bfq-iosched: stop using blkg->stat_bytes and ->stat_ios bfq-iosched: relocate bfqg_*rwstat*() helpers block: add zone open, close and finish ioctl support block: add zone open, close and finish operations block: Simplify REQ_OP_ZONE_RESET_ALL handling block: Remove REQ_OP_ZONE_RESET plugging ...
2019-11-25 10:59:41 -08:00 · 2019-11-25 10:59:41 -08:00 · ff6814b078
commit ff6814b078
parent 6e7b06a4c8 1e279153df
51 changed files with 1404 additions and 806 deletions
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@ -15,7 +15,9 @@
 */

 #include <linux/cgroup.h>
+#include <linux/percpu.h>
 #include <linux/percpu_counter.h>
+#include <linux/u64_stats_sync.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
@ -31,15 +33,12 @@

 #ifdef CONFIG_BLK_CGROUP

-enum blkg_rwstat_type {
-	BLKG_RWSTAT_READ,
-	BLKG_RWSTAT_WRITE,
-	BLKG_RWSTAT_SYNC,
-	BLKG_RWSTAT_ASYNC,
-	BLKG_RWSTAT_DISCARD,
+enum blkg_iostat_type {
+	BLKG_IOSTAT_READ,
+	BLKG_IOSTAT_WRITE,
+	BLKG_IOSTAT_DISCARD,

-	BLKG_RWSTAT_NR,
-	BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+	BLKG_IOSTAT_NR,
 };

 struct blkcg_gq;
@ -61,17 +60,15 @@ struct blkcg {
 #endif
 };

-/*
- * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
- * recursive.  Used to carry stats of dead children.
- */
-struct blkg_rwstat {
-	struct percpu_counter		cpu_cnt[BLKG_RWSTAT_NR];
-	atomic64_t			aux_cnt[BLKG_RWSTAT_NR];
+struct blkg_iostat {
+	u64				bytes[BLKG_IOSTAT_NR];
+	u64				ios[BLKG_IOSTAT_NR];
 };

-struct blkg_rwstat_sample {
-	u64				cnt[BLKG_RWSTAT_NR];
+struct blkg_iostat_set {
+	struct u64_stats_sync		sync;
+	struct blkg_iostat		cur;
+	struct blkg_iostat		last;
 };

 /*
@ -127,8 +124,8 @@ struct blkcg_gq {
 	/* is this blkg online? protected by both blkcg and q locks */
 	bool				online;

-	struct blkg_rwstat		stat_bytes;
-	struct blkg_rwstat		stat_ios;
+	struct blkg_iostat_set __percpu	*iostat_cpu;
+	struct blkg_iostat_set		iostat;

 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];

@ -202,13 +199,6 @@ int blkcg_activate_policy(struct request_queue *q,
 void blkcg_deactivate_policy(struct request_queue *q,
 			     const struct blkcg_policy *pol);

-static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat,
-		unsigned int idx)
-{
-	return atomic64_read(&rwstat->aux_cnt[idx]) +
-		percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]);
-}
-
 const char *blkg_dev_name(struct blkcg_gq *blkg);
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       u64 (*prfill)(struct seq_file *,
@ -216,17 +206,6 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       const struct blkcg_policy *pol, int data,
 		       bool show_total);
 u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-			 const struct blkg_rwstat_sample *rwstat);
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-		       int off);
-int blkg_print_stat_bytes(struct seq_file *sf, void *v);
-int blkg_print_stat_ios(struct seq_file *sf, void *v);
-int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
-int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
-
-void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
-		int off, struct blkg_rwstat_sample *sum);

 struct blkg_conf_ctx {
 	struct gendisk			*disk;
@ -578,128 +557,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),	\
 					      (p_blkg)->q, false)))

-static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
-{
-	int i, ret;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++) {
-		ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
-		if (ret) {
-			while (--i >= 0)
-				percpu_counter_destroy(&rwstat->cpu_cnt[i]);
-			return ret;
-		}
-		atomic64_set(&rwstat->aux_cnt[i], 0);
-	}
-	return 0;
-}
-
-static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
-{
-	int i;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		percpu_counter_destroy(&rwstat->cpu_cnt[i]);
-}
-
-/**
- * blkg_rwstat_add - add a value to a blkg_rwstat
- * @rwstat: target blkg_rwstat
- * @op: REQ_OP and flags
- * @val: value to add
- *
- * Add @val to @rwstat.  The counters are chosen according to @rw.  The
- * caller is responsible for synchronizing calls to this function.
- */
-static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
-				   unsigned int op, uint64_t val)
-{
-	struct percpu_counter *cnt;
-
-	if (op_is_discard(op))
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
-	else if (op_is_write(op))
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
-	else
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
-
-	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
-
-	if (op_is_sync(op))
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
-	else
-		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
-
-	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
-}
-
-/**
- * blkg_rwstat_read - read the current values of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Read the current snapshot of @rwstat and return it in the aux counts.
- */
-static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
-		struct blkg_rwstat_sample *result)
-{
-	int i;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		result->cnt[i] =
-			percpu_counter_sum_positive(&rwstat->cpu_cnt[i]);
-}
-
-/**
- * blkg_rwstat_total - read the total count of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Return the total count of @rwstat regardless of the IO direction.  This
- * function can be called without synchronization and takes care of u64
- * atomicity.
- */
-static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
-{
-	struct blkg_rwstat_sample tmp = { };
-
-	blkg_rwstat_read(rwstat, &tmp);
-	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
-}
-
-/**
- * blkg_rwstat_reset - reset a blkg_rwstat
- * @rwstat: blkg_rwstat to reset
- */
-static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
-{
-	int i;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++) {
-		percpu_counter_set(&rwstat->cpu_cnt[i], 0);
-		atomic64_set(&rwstat->aux_cnt[i], 0);
-	}
-}
-
-/**
- * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
- * @to: the destination blkg_rwstat
- * @from: the source
- *
- * Add @from's count including the aux one to @to's aux count.
- */
-static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
-				       struct blkg_rwstat *from)
-{
-	u64 sum[BLKG_RWSTAT_NR];
-	int i;
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
-
-	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
-			     &to->aux_cnt[i]);
-}
-
 #ifdef CONFIG_BLK_DEV_THROTTLING
 extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
 			   struct bio *bio);
@ -745,15 +602,33 @@ static inline bool blkcg_bio_issue_check(struct request_queue *q,
 	throtl = blk_throtl_bio(q, blkg, bio);

 	if (!throtl) {
+		struct blkg_iostat_set *bis;
+		int rwd, cpu;
+
+		if (op_is_discard(bio->bi_opf))
+			rwd = BLKG_IOSTAT_DISCARD;
+		else if (op_is_write(bio->bi_opf))
+			rwd = BLKG_IOSTAT_WRITE;
+		else
+			rwd = BLKG_IOSTAT_READ;
+
+		cpu = get_cpu();
+		bis = per_cpu_ptr(blkg->iostat_cpu, cpu);
+		u64_stats_update_begin(&bis->sync);
+
 		/*
 		 * If the bio is flagged with BIO_QUEUE_ENTERED it means this
 		 * is a split bio and we would have already accounted for the
 		 * size of the bio.
 		 */
 		if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
-			blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
-					bio->bi_iter.bi_size);
-		blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
+			bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+		bis->cur.ios[rwd]++;
+
+		u64_stats_update_end(&bis->sync);
+		if (cgroup_subsys_on_dfl(io_cgrp_subsys))
+			cgroup_rstat_updated(blkg->blkcg->css.cgroup, cpu);
+		put_cpu();
 	}

 	blkcg_bio_issue_init(bio);
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@ -10,103 +10,239 @@ struct blk_mq_tags;
 struct blk_flush_queue;

 /**
- * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware block device
+ * struct blk_mq_hw_ctx - State for a hardware queue facing the hardware
+ * block device
 */
 struct blk_mq_hw_ctx {
 	struct {
+		/** @lock: Protects the dispatch list. */
 		spinlock_t		lock;
+		/**
+		 * @dispatch: Used for requests that are ready to be
+		 * dispatched to the hardware but for some reason (e.g. lack of
+		 * resources) could not be sent to the hardware. As soon as the
+		 * driver can send new requests, requests at this list will
+		 * be sent first for a fairer dispatch.
+		 */
 		struct list_head	dispatch;
-		unsigned long		state;		/* BLK_MQ_S_* flags */
+		 /**
+		  * @state: BLK_MQ_S_* flags. Defines the state of the hw
+		  * queue (active, scheduled to restart, stopped).
+		  */
+		unsigned long		state;
 	} ____cacheline_aligned_in_smp;

+	/**
+	 * @run_work: Used for scheduling a hardware queue run at a later time.
+	 */
 	struct delayed_work	run_work;
+	/** @cpumask: Map of available CPUs where this hctx can run. */
 	cpumask_var_t		cpumask;
+	/**
+	 * @next_cpu: Used by blk_mq_hctx_next_cpu() for round-robin CPU
+	 * selection from @cpumask.
+	 */
 	int			next_cpu;
+	/**
+	 * @next_cpu_batch: Counter of how many works left in the batch before
+	 * changing to the next CPU.
+	 */
 	int			next_cpu_batch;

-	unsigned long		flags;		/* BLK_MQ_F_* flags */
+	/** @flags: BLK_MQ_F_* flags. Defines the behaviour of the queue. */
+	unsigned long		flags;

+	/**
+	 * @sched_data: Pointer owned by the IO scheduler attached to a request
+	 * queue. It's up to the IO scheduler how to use this pointer.
+	 */
 	void			*sched_data;
+	/**
+	 * @queue: Pointer to the request queue that owns this hardware context.
+	 */
 	struct request_queue	*queue;
+	/** @fq: Queue of requests that need to perform a flush operation. */
 	struct blk_flush_queue	*fq;

+	/**
+	 * @driver_data: Pointer to data owned by the block driver that created
+	 * this hctx
+	 */
 	void			*driver_data;

+	/**
+	 * @ctx_map: Bitmap for each software queue. If bit is on, there is a
+	 * pending request in that software queue.
+	 */
 	struct sbitmap		ctx_map;

+	/**
+	 * @dispatch_from: Software queue to be used when no scheduler was
+	 * selected.
+	 */
 	struct blk_mq_ctx	*dispatch_from;
+	/**
+	 * @dispatch_busy: Number used by blk_mq_update_dispatch_busy() to
+	 * decide if the hw_queue is busy using Exponential Weighted Moving
+	 * Average algorithm.
+	 */
 	unsigned int		dispatch_busy;

+	/** @type: HCTX_TYPE_* flags. Type of hardware queue. */
 	unsigned short		type;
+	/** @nr_ctx: Number of software queues. */
 	unsigned short		nr_ctx;
+	/** @ctxs: Array of software queues. */
 	struct blk_mq_ctx	**ctxs;

+	/** @dispatch_wait_lock: Lock for dispatch_wait queue. */
 	spinlock_t		dispatch_wait_lock;
+	/**
+	 * @dispatch_wait: Waitqueue to put requests when there is no tag
+	 * available at the moment, to wait for another try in the future.
+	 */
 	wait_queue_entry_t	dispatch_wait;
+
+	/**
+	 * @wait_index: Index of next available dispatch_wait queue to insert
+	 * requests.
+	 */
 	atomic_t		wait_index;

+	/**
+	 * @tags: Tags owned by the block driver. A tag at this set is only
+	 * assigned when a request is dispatched from a hardware queue.
+	 */
 	struct blk_mq_tags	*tags;
+	/**
+	 * @sched_tags: Tags owned by I/O scheduler. If there is an I/O
+	 * scheduler associated with a request queue, a tag is assigned when
+	 * that request is allocated. Else, this member is not used.
+	 */
 	struct blk_mq_tags	*sched_tags;

+	/** @queued: Number of queued requests. */
 	unsigned long		queued;
+	/** @run: Number of dispatched requests. */
 	unsigned long		run;
 #define BLK_MQ_MAX_DISPATCH_ORDER	7
+	/** @dispatched: Number of dispatch requests by queue. */
 	unsigned long		dispatched[BLK_MQ_MAX_DISPATCH_ORDER];

+	/** @numa_node: NUMA node the storage adapter has been connected to. */
 	unsigned int		numa_node;
+	/** @queue_num: Index of this hardware queue. */
 	unsigned int		queue_num;

+	/**
+	 * @nr_active: Number of active requests. Only used when a tag set is
+	 * shared across request queues.
+	 */
 	atomic_t		nr_active;

+	/** @cpuhp_dead: List to store request if some CPU die. */
 	struct hlist_node	cpuhp_dead;
+	/** @kobj: Kernel object for sysfs. */
 	struct kobject		kobj;

+	/** @poll_considered: Count times blk_poll() was called. */
 	unsigned long		poll_considered;
+	/** @poll_invoked: Count how many requests blk_poll() polled. */
 	unsigned long		poll_invoked;
+	/** @poll_success: Count how many polled requests were completed. */
 	unsigned long		poll_success;

 #ifdef CONFIG_BLK_DEBUG_FS
+	/**
+	 * @debugfs_dir: debugfs directory for this hardware queue. Named
+	 * as cpu<cpu_number>.
+	 */
 	struct dentry		*debugfs_dir;
+	/** @sched_debugfs_dir:	debugfs directory for the scheduler. */
 	struct dentry		*sched_debugfs_dir;
 #endif

+	/** @hctx_list:	List of all hardware queues. */
 	struct list_head	hctx_list;

-	/* Must be the last member - see also blk_mq_hw_ctx_size(). */
+	/**
+	 * @srcu: Sleepable RCU. Use as lock when type of the hardware queue is
+	 * blocking (BLK_MQ_F_BLOCKING). Must be the last member - see also
+	 * blk_mq_hw_ctx_size().
+	 */
 	struct srcu_struct	srcu[0];
 };

+/**
+ * struct blk_mq_queue_map - Map software queues to hardware queues
+ * @mq_map:       CPU ID to hardware queue index map. This is an array
+ *	with nr_cpu_ids elements. Each element has a value in the range
+ *	[@queue_offset, @queue_offset + @nr_queues).
+ * @nr_queues:    Number of hardware queues to map CPU IDs onto.
+ * @queue_offset: First hardware queue to map onto. Used by the PCIe NVMe
+ *	driver to map each hardware queue type (enum hctx_type) onto a distinct
+ *	set of hardware queues.
+ */
 struct blk_mq_queue_map {
 	unsigned int *mq_map;
 	unsigned int nr_queues;
 	unsigned int queue_offset;
 };

+/**
+ * enum hctx_type - Type of hardware queue
+ * @HCTX_TYPE_DEFAULT:	All I/O not otherwise accounted for.
+ * @HCTX_TYPE_READ:	Just for READ I/O.
+ * @HCTX_TYPE_POLL:	Polled I/O of any kind.
+ * @HCTX_MAX_TYPES:	Number of types of hctx.
+ */
 enum hctx_type {
-	HCTX_TYPE_DEFAULT,	/* all I/O not otherwise accounted for */
-	HCTX_TYPE_READ,		/* just for READ I/O */
-	HCTX_TYPE_POLL,		/* polled I/O of any kind */
+	HCTX_TYPE_DEFAULT,
+	HCTX_TYPE_READ,
+	HCTX_TYPE_POLL,

 	HCTX_MAX_TYPES,
 };

+/**
+ * struct blk_mq_tag_set - tag set that can be shared between request queues
+ * @map:	   One or more ctx -> hctx mappings. One map exists for each
+ *		   hardware queue type (enum hctx_type) that the driver wishes
+ *		   to support. There are no restrictions on maps being of the
+ *		   same size, and it's perfectly legal to share maps between
+ *		   types.
+ * @nr_maps:	   Number of elements in the @map array. A number in the range
+ *		   [1, HCTX_MAX_TYPES].
+ * @ops:	   Pointers to functions that implement block driver behavior.
+ * @nr_hw_queues:  Number of hardware queues supported by the block driver that
+ *		   owns this data structure.
+ * @queue_depth:   Number of tags per hardware queue, reserved tags included.
+ * @reserved_tags: Number of tags to set aside for BLK_MQ_REQ_RESERVED tag
+ *		   allocations.
+ * @cmd_size:	   Number of additional bytes to allocate per request. The block
+ *		   driver owns these additional bytes.
+ * @numa_node:	   NUMA node the storage adapter has been connected to.
+ * @timeout:	   Request processing timeout in jiffies.
+ * @flags:	   Zero or more BLK_MQ_F_* flags.
+ * @driver_data:   Pointer to data owned by the block driver that created this
+ *		   tag set.
+ * @tags:	   Tag sets. One tag set per hardware queue. Has @nr_hw_queues
+ *		   elements.
+ * @tag_list_lock: Serializes tag_list accesses.
+ * @tag_list:	   List of the request queues that use this tag set. See also
+ *		   request_queue.tag_set_list.
+ */
 struct blk_mq_tag_set {
-	/*
-	 * map[] holds ctx -> hctx mappings, one map exists for each type
-	 * that the driver wishes to support. There are no restrictions
-	 * on maps being of the same size, and it's perfectly legal to
-	 * share maps between types.
-	 */
 	struct blk_mq_queue_map	map[HCTX_MAX_TYPES];
-	unsigned int		nr_maps;	/* nr entries in map[] */
+	unsigned int		nr_maps;
 	const struct blk_mq_ops	*ops;
-	unsigned int		nr_hw_queues;	/* nr hw queues across maps */
-	unsigned int		queue_depth;	/* max hw supported */
+	unsigned int		nr_hw_queues;
+	unsigned int		queue_depth;
 	unsigned int		reserved_tags;
-	unsigned int		cmd_size;	/* per-request extra data */
+	unsigned int		cmd_size;
 	int			numa_node;
 	unsigned int		timeout;
-	unsigned int		flags;		/* BLK_MQ_F_* */
+	unsigned int		flags;
 	void			*driver_data;

 	struct blk_mq_tags	**tags;
@ -115,6 +251,12 @@ struct blk_mq_tag_set {
 	struct list_head	tag_list;
 };

+/**
+ * struct blk_mq_queue_data - Data about a request inserted in a queue
+ *
+ * @rq:   Request pointer.
+ * @last: If it is the last request in the queue.
+ */
 struct blk_mq_queue_data {
 	struct request *rq;
 	bool last;
@ -142,81 +284,101 @@ typedef bool (busy_fn)(struct request_queue *);
 typedef void (complete_fn)(struct request *);
 typedef void (cleanup_rq_fn)(struct request *);

-
+/**
+ * struct blk_mq_ops - Callback functions that implements block driver
+ * behaviour.
+ */
 struct blk_mq_ops {
-	/*
-	 * Queue request
+	/**
+	 * @queue_rq: Queue a new request from block IO.
 	 */
 	queue_rq_fn		*queue_rq;

-	/*
-	 * If a driver uses bd->last to judge when to submit requests to
-	 * hardware, it must define this function. In case of errors that
-	 * make us stop issuing further requests, this hook serves the
+	/**
+	 * @commit_rqs: If a driver uses bd->last to judge when to submit
+	 * requests to hardware, it must define this function. In case of errors
+	 * that make us stop issuing further requests, this hook serves the
 	 * purpose of kicking the hardware (which the last request otherwise
 	 * would have done).
 	 */
 	commit_rqs_fn		*commit_rqs;

-	/*
-	 * Reserve budget before queue request, once .queue_rq is
+	/**
+	 * @get_budget: Reserve budget before queue request, once .queue_rq is
 	 * run, it is driver's responsibility to release the
 	 * reserved budget. Also we have to handle failure case
 	 * of .get_budget for avoiding I/O deadlock.
 	 */
 	get_budget_fn		*get_budget;
+	/**
+	 * @put_budget: Release the reserved budget.
+	 */
 	put_budget_fn		*put_budget;

-	/*
-	 * Called on request timeout
+	/**
+	 * @timeout: Called on request timeout.
 	 */
 	timeout_fn		*timeout;

-	/*
-	 * Called to poll for completion of a specific tag.
+	/**
+	 * @poll: Called to poll for completion of a specific tag.
 	 */
 	poll_fn			*poll;

+	/**
+	 * @complete: Mark the request as complete.
+	 */
 	complete_fn		*complete;

-	/*
-	 * Called when the block layer side of a hardware queue has been
-	 * set up, allowing the driver to allocate/init matching structures.
-	 * Ditto for exit/teardown.
+	/**
+	 * @init_hctx: Called when the block layer side of a hardware queue has
+	 * been set up, allowing the driver to allocate/init matching
+	 * structures.
 	 */
 	init_hctx_fn		*init_hctx;
+	/**
+	 * @exit_hctx: Ditto for exit/teardown.
+	 */
 	exit_hctx_fn		*exit_hctx;

-	/*
-	 * Called for every command allocated by the block layer to allow
-	 * the driver to set up driver specific data.
+	/**
+	 * @init_request: Called for every command allocated by the block layer
+	 * to allow the driver to set up driver specific data.
 	 *
 	 * Tag greater than or equal to queue_depth is for setting up
 	 * flush request.
-	 *
-	 * Ditto for exit/teardown.
 	 */
 	init_request_fn		*init_request;
+	/**
+	 * @exit_request: Ditto for exit/teardown.
+	 */
 	exit_request_fn		*exit_request;
-	/* Called from inside blk_get_request() */
+
+	/**
+	 * @initialize_rq_fn: Called from inside blk_get_request().
+	 */
 	void (*initialize_rq_fn)(struct request *rq);

-	/*
-	 * Called before freeing one request which isn't completed yet,
-	 * and usually for freeing the driver private data
+	/**
+	 * @cleanup_rq: Called before freeing one request which isn't completed
+	 * yet, and usually for freeing the driver private data.
 	 */
 	cleanup_rq_fn		*cleanup_rq;

-	/*
-	 * If set, returns whether or not this queue currently is busy
+	/**
+	 * @busy: If set, returns whether or not this queue currently is busy.
 	 */
 	busy_fn			*busy;

+	/**
+	 * @map_queues: This allows drivers specify their own queue mapping by
+	 * overriding the setup-time function that builds the mq_map.
+	 */
 	map_queues_fn		*map_queues;

 #ifdef CONFIG_BLK_DEBUG_FS
-	/*
-	 * Used by the debugfs implementation to show driver-specific
+	/**
+	 * @show_rq: Used by the debugfs implementation to show driver-specific
 	 * information about a request.
 	 */
 	void (*show_rq)(struct seq_file *m, struct request *rq);
@ -262,7 +424,6 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set);
 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);

 void blk_mq_free_request(struct request *rq);
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *);

 bool blk_mq_queue_inflight(struct request_queue *q);

@ -301,9 +462,25 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag)
 	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
 }

+/**
+ * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
+ * @rq: target request.
+ */
+static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
+{
+	return READ_ONCE(rq->state);
+}
+
+static inline int blk_mq_request_started(struct request *rq)
+{
+	return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
+}
+
+static inline int blk_mq_request_completed(struct request *rq)
+{
+	return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
+}

-int blk_mq_request_started(struct request *rq);
-int blk_mq_request_completed(struct request *rq);
 void blk_mq_start_request(struct request *rq);
 void blk_mq_end_request(struct request *rq, blk_status_t error);
 void __blk_mq_end_request(struct request *rq, blk_status_t error);
@ -324,7 +501,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
 void blk_mq_quiesce_queue(struct request_queue *q);
 void blk_mq_unquiesce_queue(struct request_queue *q);
 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
 void blk_mq_run_hw_queues(struct request_queue *q, bool async);
 void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
 		busy_tag_iter_fn *fn, void *priv);
@ -343,14 +520,29 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q);

 unsigned int blk_mq_rq_cpu(struct request *rq);

-/*
+/**
+ * blk_mq_rq_from_pdu - cast a PDU to a request
+ * @pdu: the PDU (Protocol Data Unit) to be casted
+ *
+ * Return: request
+ *
 * Driver command data is immediately after the request. So subtract request
- * size to get back to the original request, add request size to get the PDU.
+ * size to get back to the original request.
 */
 static inline struct request *blk_mq_rq_from_pdu(void *pdu)
 {
 	return pdu - sizeof(struct request);
 }
+
+/**
+ * blk_mq_rq_to_pdu - cast a request to a PDU
+ * @rq: the request to be casted
+ *
+ * Return: pointer to the PDU
+ *
+ * Driver command data is immediately after the request. So add request to get
+ * the PDU.
+ */
 static inline void *blk_mq_rq_to_pdu(struct request *rq)
 {
 	return rq + 1;
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@ -153,10 +153,10 @@ struct bio {
 	unsigned short		bi_write_hint;
 	blk_status_t		bi_status;
 	u8			bi_partno;
+	atomic_t		__bi_remaining;

 	struct bvec_iter	bi_iter;

-	atomic_t		__bi_remaining;
 	bio_end_io_t		*bi_end_io;

 	void			*bi_private;
@ -290,6 +290,12 @@ enum req_opf {
 	REQ_OP_ZONE_RESET_ALL	= 8,
 	/* write the zero filled sector many times */
 	REQ_OP_WRITE_ZEROES	= 9,
+	/* Open a zone */
+	REQ_OP_ZONE_OPEN	= 10,
+	/* Close a zone */
+	REQ_OP_ZONE_CLOSE	= 11,
+	/* Transition a zone to full */
+	REQ_OP_ZONE_FINISH	= 12,

 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
@ -371,6 +377,7 @@ enum stat_group {
 	STAT_READ,
 	STAT_WRITE,
 	STAT_DISCARD,
+	STAT_FLUSH,

 	NR_STAT_GROUPS
 };
@ -417,6 +424,25 @@ static inline bool op_is_discard(unsigned int op)
 	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
 }

+/*
+ * Check if a bio or request operation is a zone management operation, with
+ * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
+ * due to its different handling in the block layer and device response in
+ * case of command failure.
+ */
+static inline bool op_is_zone_mgmt(enum req_opf op)
+{
+	switch (op & REQ_OP_MASK) {
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_OPEN:
+	case REQ_OP_ZONE_CLOSE:
+	case REQ_OP_ZONE_FINISH:
+		return true;
+	default:
+		return false;
+	}
+}
+
 static inline int op_stat_group(unsigned int op)
 {
 	if (op_is_discard(op))
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@ -360,14 +360,15 @@ extern unsigned int blkdev_nr_zones(struct block_device *bdev);
 extern int blkdev_report_zones(struct block_device *bdev,
 			       sector_t sector, struct blk_zone *zones,
 			       unsigned int *nr_zones);
-extern int blkdev_reset_zones(struct block_device *bdev, sector_t sectors,
-			      sector_t nr_sectors, gfp_t gfp_mask);
+extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
+			    sector_t sectors, sector_t nr_sectors,
+			    gfp_t gfp_mask);
 extern int blk_revalidate_disk_zones(struct gendisk *disk);

 extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 				     unsigned int cmd, unsigned long arg);
-extern int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
-				    unsigned int cmd, unsigned long arg);
+extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
+				  unsigned int cmd, unsigned long arg);

 #else /* CONFIG_BLK_DEV_ZONED */

@ -388,9 +389,9 @@ static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
 	return -ENOTTY;
 }

-static inline int blkdev_reset_zones_ioctl(struct block_device *bdev,
-					   fmode_t mode, unsigned int cmd,
-					   unsigned long arg)
+static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
+					 fmode_t mode, unsigned int cmd,
+					 unsigned long arg)
 {
 	return -ENOTTY;
 }
@ -411,7 +412,6 @@ struct request_queue {

 	/* sw queues */
 	struct blk_mq_ctx __percpu	*queue_ctx;
-	unsigned int		nr_queues;

 	unsigned int		queue_depth;

--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@ -216,15 +216,6 @@ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint,
 */
 bool sbitmap_any_bit_set(const struct sbitmap *sb);

-/**
- * sbitmap_any_bit_clear() - Check for an unset bit in a &struct
- * sbitmap.
- * @sb: Bitmap to check.
- *
- * Return: true if any bit in the bitmap is clear, false otherwise.
- */
-bool sbitmap_any_bit_clear(const struct sbitmap *sb);
-
 #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift)
 #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U))

--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@ -42,6 +42,7 @@ static inline bool is_sed_ioctl(unsigned int cmd)
 	case IOC_OPAL_PSID_REVERT_TPR:
 	case IOC_OPAL_MBR_DONE:
 	case IOC_OPAL_WRITE_SHADOW_MBR:
+	case IOC_OPAL_GENERIC_TABLE_RW:
 		return true;
 	}
 	return false;