Merge branch 'for-6.1/block' into for-6.1/passthrough

* for-6.1/block: (162 commits)
  sbitmap: fix lockup while swapping
  block: add rationale for not using blk_mq_plug() when applicable
  block: adapt blk_mq_plug() to not plug for writes that require a zone lock
  s390/dasd: use blk_mq_alloc_disk
  blk-cgroup: don't update the blkg lookup hint in blkg_conf_prep
  nvmet: don't look at the request_queue in nvmet_bdev_set_limits
  nvmet: don't look at the request_queue in nvmet_bdev_zone_mgmt_emulate_all
  blk-mq: use quiesced elevator switch when reinitializing queues
  block: replace blk_queue_nowait with bdev_nowait
  nvme: remove nvme_ctrl_init_connect_q
  nvme-loop: use the tagset alloc/free helpers
  nvme-loop: store the generic nvme_ctrl in set->driver_data
  nvme-loop: initialize sqsize later
  nvme-fc: use the tagset alloc/free helpers
  nvme-fc: store the generic nvme_ctrl in set->driver_data
  nvme-fc: keep ctrl->sqsize in sync with opts->queue_size
  nvme-rdma: use the tagset alloc/free helpers
  nvme-rdma: store the generic nvme_ctrl in set->driver_data
  nvme-tcp: use the tagset alloc/free helpers
  nvme-tcp: store the generic nvme_ctrl in set->driver_data
  ...

Signed-off-by: Jens Axboe <axboe@kernel.dk>
commit 736feaa3a0
Author: Jens Axboe <axboe@kernel.dk>
Date:   2022-09-30 07:47:38 -06:00
135 changed files with 3209 additions and 1649 deletions


@ -254,17 +254,12 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
#else /* CONFIG_BFQ_CGROUP_DEBUG */
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
blk_opf_t opf) { }
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { }
void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { }
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
u64 io_start_time_ns, blk_opf_t opf) { }
void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
#endif /* CONFIG_BFQ_CGROUP_DEBUG */


@ -1925,7 +1925,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfqq->service_from_backlogged = 0;
bfq_clear_bfqq_softrt_update(bfqq);
bfq_add_bfqq_busy(bfqd, bfqq);
bfq_add_bfqq_busy(bfqq);
/*
* Expire in-service queue if preemption may be needed for
@ -2419,7 +2419,7 @@ static void bfq_remove_request(struct request_queue *q,
bfqq->next_rq = NULL;
if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
bfq_del_bfqq_busy(bfqd, bfqq, false);
bfq_del_bfqq_busy(bfqq, false);
/*
* bfqq emptied. In normal operation, when
* bfqq is empty, bfqq->entity.service and
@ -3098,7 +3098,7 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
*/
if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
bfqq != bfqd->in_service_queue)
bfq_del_bfqq_busy(bfqd, bfqq, false);
bfq_del_bfqq_busy(bfqq, false);
bfq_reassign_last_bfqq(bfqq, NULL);
@ -3908,7 +3908,7 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
bfqq->budget_timeout = jiffies;
bfq_del_bfqq_busy(bfqd, bfqq, true);
bfq_del_bfqq_busy(bfqq, true);
} else {
bfq_requeue_bfqq(bfqd, bfqq, true);
/*
@ -5255,9 +5255,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
struct hlist_node *n;
struct bfq_group *bfqg = bfqq_group(bfqq);
if (bfqq->bfqd)
bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
bfqq, bfqq->ref);
bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
bfqq->ref--;
if (bfqq->ref)
@ -5321,7 +5319,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
hlist_del_init(&item->woken_list_node);
}
if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
if (bfqq->bfqd->last_completed_rq_bfqq == bfqq)
bfqq->bfqd->last_completed_rq_bfqq = NULL;
kmem_cache_free(bfq_pool, bfqq);


@ -993,20 +993,23 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
/* ---------------- cgroups-support interface ---------------- */
void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
blk_opf_t opf);
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
u64 io_start_time_ns, blk_opf_t opf);
void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg);
#ifdef CONFIG_BFQ_CGROUP_DEBUG
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
blk_opf_t opf);
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
#endif
void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
void bfq_end_wr_async(struct bfq_data *bfqd);
@ -1077,9 +1080,8 @@ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool expiration);
void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool expiration);
void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */


@ -1651,9 +1651,10 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
* the service tree. As a special case, it can be invoked during an
* expiration.
*/
void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool expiration)
void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration)
{
struct bfq_data *bfqd = bfqq->bfqd;
bfq_log_bfqq(bfqd, bfqq, "del from busy");
bfq_clear_bfqq_busy(bfqq);
@ -1674,8 +1675,10 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/*
* Called when an inactive queue receives a new request.
*/
void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
void bfq_add_bfqq_busy(struct bfq_queue *bfqq)
{
struct bfq_data *bfqd = bfqq->bfqd;
bfq_log_bfqq(bfqd, bfqq, "add to busy");
bfq_activate_bfqq(bfqd, bfqq);
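Note on the hunks above: bfq_del_bfqq_busy() and bfq_add_bfqq_busy() lose their struct bfq_data argument; the helpers now derive it from bfqq->bfqd themselves. For callers the conversion is purely mechanical, e.g. (condensed from the call sites shown above):

	/* before */
	bfq_del_bfqq_busy(bfqd, bfqq, false);
	bfq_add_bfqq_busy(bfqd, bfqq);

	/* after */
	bfq_del_bfqq_busy(bfqq, false);
	bfq_add_bfqq_busy(bfqq);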


@ -760,8 +760,6 @@ EXPORT_SYMBOL(bio_put);
static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
bio_set_flag(bio, BIO_CLONED);
if (bio_flagged(bio_src, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
bio->bi_ioprio = bio_src->bi_ioprio;
bio->bi_iter = bio_src->bi_iter;
@ -1065,9 +1063,6 @@ void __bio_add_page(struct bio *bio, struct page *page,
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);
@ -1276,9 +1271,6 @@ out:
* fit into the bio, or are requested in @iter, whatever is smaller. If
* MM encounters an error pinning the requested pages, it stops. Error
* is returned only if 0 pages could be pinned.
*
* It's intended for direct IO, so doesn't do PSI tracking, the caller is
* responsible for setting BIO_WORKINGSET if necessary.
*/
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
@ -1294,8 +1286,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
/* don't account direct I/O as memory stall */
bio_clear_flag(bio, BIO_WORKINGSET);
return bio->bi_vcnt ? 0 : ret;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
@ -1754,7 +1744,8 @@ static int __init init_bio(void)
cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
bio_cpu_dead);
if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0,
BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
panic("bio: can't allocate bios\n");
if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
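The init_bio() change above adds BIOSET_PERCPU_CACHE to fs_bio_set, so bios allocated with REQ_ALLOC_CACHE can be served from (and recycled into) a per-cpu cache; the blk-map.c hunks further down rely on this for polled passthrough requests. A minimal usage sketch, assuming a context where allocation may sleep (nr_vecs and opf are illustrative caller-side variables, not from this diff):

	struct bio *bio;

	/* REQ_ALLOC_CACHE asks bio_alloc_bioset() to use the per-cpu bio cache. */
	bio = bio_alloc_bioset(NULL, nr_vecs, opf | REQ_ALLOC_CACHE,
			       GFP_KERNEL, &fs_bio_set);
	if (!bio)
		return -ENOMEM;
	/* ... fill and submit ... */

	/* bio_put() recognizes cache-backed bios and returns them to the cache. */
	bio_put(bio);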


@ -202,19 +202,19 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
* @q: request_queue the new blkg is associated with
* @disk: gendisk the new blkg is associated with
* @gfp_mask: allocation mask to use
*
* Allocate a new blkg assocating @blkcg and @q.
*/
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
gfp_t gfp_mask)
{
struct blkcg_gq *blkg;
int i, cpu;
/* alloc and init base part */
blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node);
if (!blkg)
return NULL;
@ -225,10 +225,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (!blkg->iostat_cpu)
goto err_free;
if (!blk_get_queue(q))
if (!blk_get_queue(disk->queue))
goto err_free;
blkg->q = q;
blkg->q = disk->queue;
INIT_LIST_HEAD(&blkg->q_node);
spin_lock_init(&blkg->async_bio_lock);
bio_list_init(&blkg->async_bios);
@ -243,11 +243,11 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd;
if (!blkcg_policy_enabled(q, pol))
if (!blkcg_policy_enabled(disk->queue, pol))
continue;
/* alloc per-policy data and attach it to blkg */
pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
pd = pol->pd_alloc_fn(gfp_mask, disk->queue, blkcg);
if (!pd)
goto err_free;
@ -263,45 +263,20 @@ err_free:
return NULL;
}
struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
struct request_queue *q, bool update_hint)
{
struct blkcg_gq *blkg;
/*
* Hint didn't match. Look up from the radix tree. Note that the
* hint can only be updated under queue_lock as otherwise @blkg
* could have already been removed from blkg_tree. The caller is
* responsible for grabbing queue_lock if @update_hint.
*/
blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
if (blkg && blkg->q == q) {
if (update_hint) {
lockdep_assert_held(&q->queue_lock);
rcu_assign_pointer(blkcg->blkg_hint, blkg);
}
return blkg;
}
return NULL;
}
EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
/*
* If @new_blkg is %NULL, this function tries to allocate a new one as
* necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
*/
static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
struct request_queue *q,
static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
int i, ret;
lockdep_assert_held(&q->queue_lock);
lockdep_assert_held(&disk->queue->queue_lock);
/* request_queue is dying, do not create/recreate a blkg */
if (blk_queue_dying(q)) {
if (blk_queue_dying(disk->queue)) {
ret = -ENODEV;
goto err_free_blkg;
}
@ -314,7 +289,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
goto err_put_css;
@ -324,7 +299,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* link parent */
if (blkcg_parent(blkcg)) {
blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
if (WARN_ON_ONCE(!blkg->parent)) {
ret = -ENODEV;
goto err_put_css;
@ -342,10 +317,10 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* insert */
spin_lock(&blkcg->lock);
ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
if (likely(!ret)) {
hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
list_add(&blkg->q_node, &q->blkg_list);
list_add(&blkg->q_node, &disk->queue->blkg_list);
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@ -374,19 +349,20 @@ err_free_blkg:
/**
* blkg_lookup_create - lookup blkg, try to create one if not there
* @blkcg: blkcg of interest
* @q: request_queue of interest
* @disk: gendisk of interest
*
* Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
* Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to
* create one. blkg creation is performed recursively from blkcg_root such
* that all non-root blkg's have access to the parent blkg. This function
* should be called under RCU read lock and takes @q->queue_lock.
* should be called under RCU read lock and takes @disk->queue->queue_lock.
*
* Returns the blkg or the closest blkg if blkg_create() fails as it walks
* down from root.
*/
static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct request_queue *q)
struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blkcg_gq *blkg;
unsigned long flags;
@ -397,9 +373,13 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
return blkg;
spin_lock_irqsave(&q->queue_lock, flags);
blkg = __blkg_lookup(blkcg, q, true);
if (blkg)
blkg = blkg_lookup(blkcg, q);
if (blkg) {
if (blkcg != &blkcg_root &&
blkg != rcu_dereference(blkcg->blkg_hint))
rcu_assign_pointer(blkcg->blkg_hint, blkg);
goto found;
}
/*
* Create blkgs walking down from blkcg_root to @blkcg, so that all
@ -412,7 +392,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
struct blkcg_gq *ret_blkg = q->root_blkg;
while (parent) {
blkg = __blkg_lookup(parent, q, false);
blkg = blkg_lookup(parent, q);
if (blkg) {
/* remember closest blkg */
ret_blkg = blkg;
@ -422,7 +402,7 @@ static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
parent = blkcg_parent(parent);
}
blkg = blkg_create(pos, q, NULL);
blkg = blkg_create(pos, disk, NULL);
if (IS_ERR(blkg)) {
blkg = ret_blkg;
break;
@ -476,14 +456,9 @@ static void blkg_destroy(struct blkcg_gq *blkg)
percpu_ref_kill(&blkg->refcnt);
}
/**
* blkg_destroy_all - destroy all blkgs associated with a request_queue
* @q: request_queue of interest
*
* Destroy all blkgs associated with @q.
*/
static void blkg_destroy_all(struct request_queue *q)
static void blkg_destroy_all(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blkcg_gq *blkg, *n;
int count = BLKG_DESTROY_BATCH_SIZE;
@ -616,19 +591,6 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
/* Performs queue bypass and policy enabled checks then looks up blkg. */
static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
const struct blkcg_policy *pol,
struct request_queue *q)
{
WARN_ON_ONCE(!rcu_read_lock_held());
lockdep_assert_held(&q->queue_lock);
if (!blkcg_policy_enabled(q, pol))
return ERR_PTR(-EOPNOTSUPP);
return __blkg_lookup(blkcg, q, true /* update_hint */);
}
/**
* blkcg_conf_open_bdev - parse and open bdev for per-blkg config update
* @inputp: input string pointer
@ -684,6 +646,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
__acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
{
struct block_device *bdev;
struct gendisk *disk;
struct request_queue *q;
struct blkcg_gq *blkg;
int ret;
@ -691,8 +654,8 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
bdev = blkcg_conf_open_bdev(&input);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
q = bdev_get_queue(bdev);
disk = bdev->bd_disk;
q = disk->queue;
/*
* blkcg_deactivate_policy() requires queue to be frozen, we can grab
@ -705,12 +668,12 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
rcu_read_lock();
spin_lock_irq(&q->queue_lock);
blkg = blkg_lookup_check(blkcg, pol, q);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
if (!blkcg_policy_enabled(q, pol)) {
ret = -EOPNOTSUPP;
goto fail_unlock;
}
blkg = blkg_lookup(blkcg, q);
if (blkg)
goto success;
@ -724,7 +687,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkcg_gq *new_blkg;
parent = blkcg_parent(blkcg);
while (parent && !__blkg_lookup(parent, q, false)) {
while (parent && !blkg_lookup(parent, q)) {
pos = parent;
parent = blkcg_parent(parent);
}
@ -733,7 +696,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
spin_unlock_irq(&q->queue_lock);
rcu_read_unlock();
new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
goto fail_exit_queue;
@ -748,17 +711,17 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
rcu_read_lock();
spin_lock_irq(&q->queue_lock);
blkg = blkg_lookup_check(pos, pol, q);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
if (!blkcg_policy_enabled(q, pol)) {
blkg_free(new_blkg);
ret = -EOPNOTSUPP;
goto fail_preloaded;
}
blkg = blkg_lookup(pos, q);
if (blkg) {
blkg_free(new_blkg);
} else {
blkg = blkg_create(pos, q, new_blkg);
blkg = blkg_create(pos, disk, new_blkg);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
goto fail_preloaded;
@ -915,8 +878,7 @@ static void blkcg_fill_root_iostats(void)
class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
while ((dev = class_dev_iter_next(&iter))) {
struct block_device *bdev = dev_to_bdev(dev);
struct blkcg_gq *blkg =
blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
struct blkg_iostat tmp;
int cpu;
unsigned long flags;
@ -1255,25 +1217,16 @@ static int blkcg_css_online(struct cgroup_subsys_state *css)
return 0;
}
/**
* blkcg_init_queue - initialize blkcg part of request queue
* @q: request_queue to initialize
*
* Called from blk_alloc_queue(). Responsible for initializing blkcg
* part of new request_queue @q.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int blkcg_init_queue(struct request_queue *q)
int blkcg_init_disk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blkcg_gq *new_blkg, *blkg;
bool preloaded;
int ret;
INIT_LIST_HEAD(&q->blkg_list);
new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
@ -1282,7 +1235,7 @@ int blkcg_init_queue(struct request_queue *q)
/* Make sure the root blkg exists. */
/* spin_lock_irq can serve as RCU read-side critical section. */
spin_lock_irq(&q->queue_lock);
blkg = blkg_create(&blkcg_root, q, new_blkg);
blkg = blkg_create(&blkcg_root, disk, new_blkg);
if (IS_ERR(blkg))
goto err_unlock;
q->root_blkg = blkg;
@ -1291,25 +1244,26 @@ int blkcg_init_queue(struct request_queue *q)
if (preloaded)
radix_tree_preload_end();
ret = blk_ioprio_init(q);
ret = blk_ioprio_init(disk);
if (ret)
goto err_destroy_all;
ret = blk_throtl_init(q);
ret = blk_throtl_init(disk);
if (ret)
goto err_destroy_all;
goto err_ioprio_exit;
ret = blk_iolatency_init(q);
if (ret) {
blk_throtl_exit(q);
blk_ioprio_exit(q);
goto err_destroy_all;
}
ret = blk_iolatency_init(disk);
if (ret)
goto err_throtl_exit;
return 0;
err_throtl_exit:
blk_throtl_exit(disk);
err_ioprio_exit:
blk_ioprio_exit(disk);
err_destroy_all:
blkg_destroy_all(q);
blkg_destroy_all(disk);
return ret;
err_unlock:
spin_unlock_irq(&q->queue_lock);
@ -1318,16 +1272,10 @@ err_unlock:
return PTR_ERR(blkg);
}
/**
* blkcg_exit_queue - exit and release blkcg part of request_queue
* @q: request_queue being released
*
* Called from blk_exit_queue(). Responsible for exiting blkcg part.
*/
void blkcg_exit_queue(struct request_queue *q)
void blkcg_exit_disk(struct gendisk *disk)
{
blkg_destroy_all(q);
blk_throtl_exit(q);
blkg_destroy_all(disk);
blk_throtl_exit(disk);
}
static void blkcg_bind(struct cgroup_subsys_state *root_css)
@ -1836,13 +1784,13 @@ out:
/**
* blkcg_schedule_throttle - this task needs to check for throttling
* @q: the request queue IO was submitted on
* @gendisk: disk to throttle
* @use_memdelay: do we charge this to memory delay for PSI
*
* This is called by the IO controller when we know there's delay accumulated
* for the blkg for this task. We do not pass the blkg because there are places
* we call this that may not have that information, the swapping code for
* instance will only have a request_queue at that point. This set's the
* instance will only have a block_device at that point. This set's the
* notify_resume for the task to check and see if it requires throttling before
* returning to user space.
*
@ -1851,8 +1799,10 @@ out:
* throttle once. If the task needs to be throttled again it'll need to be
* re-set at the next time we see the task.
*/
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{
struct request_queue *q = disk->queue;
if (unlikely(current->flags & PF_KTHREAD))
return;
@ -1902,8 +1852,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
struct blkcg_gq *blkg, *ret_blkg = NULL;
rcu_read_lock();
blkg = blkg_lookup_create(css_to_blkcg(css),
bdev_get_queue(bio->bi_bdev));
blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk);
while (blkg) {
if (blkg_tryget(blkg)) {
ret_blkg = blkg;


@ -178,10 +178,8 @@ struct blkcg_policy {
extern struct blkcg blkcg_root;
extern bool blkcg_debug_stats;
struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
struct request_queue *q, bool update_hint);
int blkcg_init_queue(struct request_queue *q);
void blkcg_exit_queue(struct request_queue *q);
int blkcg_init_disk(struct gendisk *disk);
void blkcg_exit_disk(struct gendisk *disk);
/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
@ -227,22 +225,21 @@ static inline bool bio_issue_as_root_blkg(struct bio *bio)
}
/**
* __blkg_lookup - internal version of blkg_lookup()
* blkg_lookup - lookup blkg for the specified blkcg - q pair
* @blkcg: blkcg of interest
* @q: request_queue of interest
* @update_hint: whether to update lookup hint with the result or not
*
* This is internal version and shouldn't be used by policy
* implementations. Looks up blkgs for the @blkcg - @q pair regardless of
* @q's bypass state. If @update_hint is %true, the caller should be
* holding @q->queue_lock and lookup hint is updated on success.
* Lookup blkg for the @blkcg - @q pair.
* Must be called in a RCU critical section.
*/
static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
struct request_queue *q,
bool update_hint)
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
struct request_queue *q)
{
struct blkcg_gq *blkg;
WARN_ON_ONCE(!rcu_read_lock_held());
if (blkcg == &blkcg_root)
return q->root_blkg;
@ -250,33 +247,10 @@ static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
if (blkg && blkg->q == q)
return blkg;
return blkg_lookup_slowpath(blkcg, q, update_hint);
}
/**
* blkg_lookup - lookup blkg for the specified blkcg - q pair
* @blkcg: blkcg of interest
* @q: request_queue of interest
*
* Lookup blkg for the @blkcg - @q pair. This function should be called
* under RCU read lock.
*/
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
struct request_queue *q)
{
WARN_ON_ONCE(!rcu_read_lock_held());
return __blkg_lookup(blkcg, q, false);
}
/**
* blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
* @q: request_queue of interest
*
* Lookup blkg for @q at the root level. See also blkg_lookup().
*/
static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
{
return q->root_blkg;
blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
if (blkg && blkg->q != q)
blkg = NULL;
return blkg;
}
/**
@ -373,8 +347,8 @@ static inline void blkg_put(struct blkcg_gq *blkg)
*/
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false)))
if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q)))
/**
* blkg_for_each_descendant_post - post-order walk of a blkg's descendants
@ -388,8 +362,8 @@ static inline void blkg_put(struct blkcg_gq *blkg)
*/
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q, false)))
if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \
(p_blkg)->q)))
bool __blkcg_punt_bio_submit(struct bio *bio);
@ -507,10 +481,8 @@ struct blkcg {
};
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
{ return NULL; }
static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
static inline void blkcg_exit_queue(struct request_queue *q) { }
static inline int blkcg_init_disk(struct gendisk *disk) { return 0; }
static inline void blkcg_exit_disk(struct gendisk *disk) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
static inline int blkcg_activate_policy(struct request_queue *q,


@ -37,7 +37,6 @@
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
@ -487,18 +486,15 @@ static int __init fail_make_request_debugfs(void)
late_initcall(fail_make_request_debugfs);
#endif /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool bio_check_ro(struct bio *bio)
static inline void bio_check_ro(struct bio *bio)
{
if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
return false;
return;
pr_warn("Trying to write to read-only block-device %pg\n",
bio->bi_bdev);
/* Older lvm-tools actually trigger this */
return false;
}
return false;
}
static noinline int should_fail_bio(struct bio *bio)
@ -717,13 +713,12 @@ void submit_bio_noacct(struct bio *bio)
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
* if queue does not support NOWAIT.
*/
if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q))
if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
goto not_supported;
if (should_fail_bio(bio))
goto end_io;
if (unlikely(bio_check_ro(bio)))
goto end_io;
bio_check_ro(bio);
if (!bio_flagged(bio, BIO_REMAPPED)) {
if (unlikely(bio_check_eod(bio)))
goto end_io;
@ -814,7 +809,7 @@ EXPORT_SYMBOL(submit_bio_noacct);
*
* The success/failure status of the request, along with notification of
* completion, is delivered asynchronously through the ->bi_end_io() callback
* in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has
* in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
* been called.
*/
void submit_bio(struct bio *bio)
@ -829,22 +824,6 @@ void submit_bio(struct bio *bio)
count_vm_events(PGPGOUT, bio_sectors(bio));
}
/*
* If we're reading data that is part of the userspace workingset, count
* submission time as memory stall. When the device is congested, or
* the submitting cgroup IO-throttled, submission can be a significant
* part of overall IO time.
*/
if (unlikely(bio_op(bio) == REQ_OP_READ &&
bio_flagged(bio, BIO_WORKINGSET))) {
unsigned long pflags;
psi_memstall_enter(&pflags);
submit_bio_noacct(bio);
psi_memstall_leave(&pflags);
return;
}
submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
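The submit_bio_noacct() hunk above replaces the blk_queue_nowait(q) check with bdev_nowait(bdev), matching the "block: replace blk_queue_nowait with bdev_nowait" entry in the shortlog. A sketch of what such a block_device-level helper amounts to, assuming it simply forwards to the queue flag (the real definition lives in include/linux/blkdev.h and may differ in detail):

	/* sketch: does this block device accept REQ_NOWAIT submissions? */
	static inline bool bdev_nowait(struct block_device *bdev)
	{
		return test_bit(QUEUE_FLAG_NOWAIT, &bdev_get_queue(bdev)->queue_flags);
	}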
@ -871,6 +850,12 @@ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return 0;
/*
* As the requests that require a zone lock are not plugged in the
* first place, directly accessing the plug instead of using
* blk_mq_plug() should not have any consequences during flushing for
* zoned devices.
*/
blk_flush_plug(current->plug, false);
if (bio_queue_enter(bio))


@ -664,17 +664,13 @@ static struct ioc *q_to_ioc(struct request_queue *q)
return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
}
static const char *q_name(struct request_queue *q)
{
if (blk_queue_registered(q))
return kobject_name(q->kobj.parent);
else
return "<unknown>";
}
static const char __maybe_unused *ioc_name(struct ioc *ioc)
{
return q_name(ioc->rqos.q);
struct gendisk *disk = ioc->rqos.q->disk;
if (!disk)
return "<unknown>";
return disk->disk_name;
}
static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
@ -1430,7 +1426,7 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
int flags, void *key)
{
struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
struct iocg_wake_ctx *ctx = key;
u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
ctx->vbudget -= cost;
@ -2640,7 +2636,7 @@ retry_lock:
if (use_debt) {
iocg_incur_debt(iocg, abs_cost, &now);
if (iocg_kick_delay(iocg, &now))
blkcg_schedule_throttle(rqos->q,
blkcg_schedule_throttle(rqos->q->disk,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
iocg_unlock(iocg, ioc_locked, &flags);
return;
@ -2741,7 +2737,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
if (likely(!list_empty(&iocg->active_list))) {
iocg_incur_debt(iocg, abs_cost, &now);
if (iocg_kick_delay(iocg, &now))
blkcg_schedule_throttle(rqos->q,
blkcg_schedule_throttle(rqos->q->disk,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
} else {
iocg_commit_bio(iocg, bio, abs_cost, cost);
@ -2832,8 +2828,9 @@ static struct rq_qos_ops ioc_rqos_ops = {
.exit = ioc_rqos_exit,
};
static int blk_iocost_init(struct request_queue *q)
static int blk_iocost_init(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct ioc *ioc;
struct rq_qos *rqos;
int i, cpu, ret;
@ -3170,6 +3167,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
struct block_device *bdev;
struct gendisk *disk;
struct ioc *ioc;
u32 qos[NR_QOS_PARAMS];
bool enable, user;
@ -3180,12 +3178,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
ioc = q_to_ioc(bdev_get_queue(bdev));
disk = bdev->bd_disk;
ioc = q_to_ioc(disk->queue);
if (!ioc) {
ret = blk_iocost_init(bdev_get_queue(bdev));
ret = blk_iocost_init(disk);
if (ret)
goto err;
ioc = q_to_ioc(bdev_get_queue(bdev));
ioc = q_to_ioc(disk->queue);
}
spin_lock_irq(&ioc->lock);
@ -3262,11 +3261,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
spin_lock_irq(&ioc->lock);
if (enable) {
blk_stat_enable_accounting(ioc->rqos.q);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
blk_stat_enable_accounting(disk->queue);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = true;
} else {
blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = false;
}
@ -3349,7 +3348,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(bdev_get_queue(bdev));
if (!ioc) {
ret = blk_iocost_init(bdev_get_queue(bdev));
ret = blk_iocost_init(bdev->bd_disk);
if (ret)
goto err;
ioc = q_to_ioc(bdev_get_queue(bdev));


@ -292,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
if (use_delay)
blkcg_schedule_throttle(rqos->q, use_memdelay);
blkcg_schedule_throttle(rqos->q->disk, use_memdelay);
/*
* To avoid priority inversions we want to just take a slot if we are
@ -756,8 +756,9 @@ static void blkiolatency_enable_work_fn(struct work_struct *work)
}
}
int blk_iolatency_init(struct request_queue *q)
int blk_iolatency_init(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct blk_iolatency *blkiolat;
struct rq_qos *rqos;
int ret;


@ -202,14 +202,14 @@ void blkcg_set_ioprio(struct bio *bio)
bio->bi_ioprio = prio;
}
void blk_ioprio_exit(struct request_queue *q)
void blk_ioprio_exit(struct gendisk *disk)
{
blkcg_deactivate_policy(q, &ioprio_policy);
blkcg_deactivate_policy(disk->queue, &ioprio_policy);
}
int blk_ioprio_init(struct request_queue *q)
int blk_ioprio_init(struct gendisk *disk)
{
return blkcg_activate_policy(q, &ioprio_policy);
return blkcg_activate_policy(disk->queue, &ioprio_policy);
}
static int __init ioprio_init(void)


@ -9,15 +9,15 @@ struct request_queue;
struct bio;
#ifdef CONFIG_BLK_CGROUP_IOPRIO
int blk_ioprio_init(struct request_queue *q);
void blk_ioprio_exit(struct request_queue *q);
int blk_ioprio_init(struct gendisk *disk);
void blk_ioprio_exit(struct gendisk *disk);
void blkcg_set_ioprio(struct bio *bio);
#else
static inline int blk_ioprio_init(struct request_queue *q)
static inline int blk_ioprio_init(struct gendisk *disk)
{
return 0;
}
static inline void blk_ioprio_exit(struct request_queue *q)
static inline void blk_ioprio_exit(struct gendisk *disk)
{
}
static inline void blkcg_set_ioprio(struct bio *bio)


@ -158,7 +158,7 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq));
if (map_data) {
nr_pages = 1 << map_data->page_order;
nr_pages = 1U << map_data->page_order;
i = map_data->offset / PAGE_SIZE;
}
while (len) {
@ -231,6 +231,16 @@ out_bmd:
return ret;
}
static void bio_map_put(struct bio *bio)
{
if (bio->bi_opf & REQ_ALLOC_CACHE) {
bio_put(bio);
} else {
bio_uninit(bio);
kfree(bio);
}
}
static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
gfp_t gfp_mask)
{
@ -243,18 +253,34 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (!iov_iter_count(iter))
return -EINVAL;
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return -ENOMEM;
bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
if (rq->cmd_flags & REQ_POLLED) {
blk_opf_t opf = rq->cmd_flags | REQ_ALLOC_CACHE;
bio = bio_alloc_bioset(NULL, nr_vecs, opf, gfp_mask,
&fs_bio_set);
if (!bio)
return -ENOMEM;
} else {
bio = bio_kmalloc(nr_vecs, gfp_mask);
if (!bio)
return -ENOMEM;
bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
}
while (iov_iter_count(iter)) {
struct page **pages;
struct page **pages, *stack_pages[UIO_FASTIOV];
ssize_t bytes;
size_t offs, added = 0;
size_t offs;
int npages;
bytes = iov_iter_get_pages_alloc2(iter, &pages, LONG_MAX, &offs);
if (nr_vecs <= ARRAY_SIZE(stack_pages)) {
pages = stack_pages;
bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
nr_vecs, &offs);
} else {
bytes = iov_iter_get_pages_alloc2(iter, &pages,
LONG_MAX, &offs);
}
if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT;
goto out_unmap;
@ -280,7 +306,6 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
break;
}
added += n;
bytes -= n;
offs = 0;
}
@ -290,7 +315,8 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
*/
while (j < npages)
put_page(pages[j++]);
kvfree(pages);
if (pages != stack_pages)
kvfree(pages);
/* couldn't stuff something into bio? */
if (bytes) {
iov_iter_revert(iter, bytes);
@ -305,8 +331,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
out_unmap:
bio_release_pages(bio, false);
bio_uninit(bio);
kfree(bio);
bio_map_put(bio);
return ret;
}
@ -611,8 +636,7 @@ int blk_rq_unmap_user(struct bio *bio)
next_bio = bio;
bio = bio->bi_next;
bio_uninit(next_bio);
kfree(next_bio);
bio_map_put(next_bio);
}
return ret;


@ -32,7 +32,7 @@ static int get_first_sibling(unsigned int cpu)
return cpu;
}
int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
unsigned int *map = qmap->mq_map;
unsigned int nr_queues = qmap->nr_queues;
@ -70,8 +70,6 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
map[cpu] = map[first_sibling];
}
}
return 0;
}
EXPORT_SYMBOL_GPL(blk_mq_map_queues);


@ -807,8 +807,6 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
return "latency";
case RQ_QOS_COST:
return "cost";
case RQ_QOS_IOPRIO:
return "ioprio";
}
return "unknown";
}


@ -23,8 +23,8 @@
* that maps a queue to the CPUs that have irq affinity for the corresponding
* vector.
*/
int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
int offset)
void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
int offset)
{
const struct cpumask *mask;
unsigned int queue, cpu;
@ -38,11 +38,10 @@ int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
return 0;
return;
fallback:
WARN_ON_ONCE(qmap->nr_queues > 1);
blk_mq_clear_mq_map(qmap);
return 0;
}
EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);


@ -21,7 +21,7 @@
* @set->nr_hw_queues, or @dev does not provide an affinity mask for a
* vector, we fallback to the naive mapping.
*/
int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
void blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
struct ib_device *dev, int first_vec)
{
const struct cpumask *mask;
@ -36,9 +36,9 @@ int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
map->mq_map[cpu] = map->queue_offset + queue;
}
return 0;
return;
fallback:
return blk_mq_map_queues(map);
blk_mq_map_queues(map);
}
EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);


@ -196,7 +196,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
* other allocations on previous queue won't be starved.
*/
if (bt != bt_prev)
sbitmap_queue_wake_up(bt_prev);
sbitmap_queue_wake_up(bt_prev, 1);
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
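sbitmap_queue_wake_up() now takes an explicit count (part of the "sbitmap: fix lockup while swapping" work in the shortlog): the caller states how many bits it released so the batched wakeup accounting stays in sync. A sketch of the assumed updated prototype next to the call shown above:

	/* assumed prototype, see include/linux/sbitmap.h */
	void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr);

	/* one tag was freed on the previous queue, so wake for one */
	sbitmap_queue_wake_up(bt_prev, 1);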


@ -21,7 +21,7 @@
* that maps a queue to the CPUs that have irq affinity for the corresponding
* vector.
*/
int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
struct virtio_device *vdev, int first_vec)
{
const struct cpumask *mask;
@ -39,8 +39,9 @@ int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
return 0;
return;
fallback:
return blk_mq_map_queues(qmap);
blk_mq_map_queues(qmap);
}
EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
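blk_mq_map_queues() and the PCI, RDMA and virtio wrappers above all become void: on failure they now fall back to the default mapping themselves instead of returning an error for the driver to handle. A hedged driver-side sketch of a .map_queues callback after this conversion (struct foo_dev and its fields are hypothetical, not from this diff):

	static void foo_map_queues(struct blk_mq_tag_set *set)
	{
		struct foo_dev *foo = set->driver_data;

		/* Nothing to propagate any more: the helper falls back to
		 * blk_mq_map_queues() internally if the device provides no
		 * usable affinity masks. */
		blk_mq_virtio_map_queues(&set->map[HCTX_TYPE_DEFAULT],
					 foo->vdev, 0);
	}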


@ -1093,10 +1093,12 @@ bool blk_mq_complete_request_remote(struct request *rq)
WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
* For a polled request, always complete locally, it's pointless
* to redirect the completion.
* For request which hctx has only one ctx mapping,
* or a polled request, always complete locally,
* it's pointless to redirect the completion.
*/
if (rq->cmd_flags & REQ_POLLED)
if (rq->mq_hctx->nr_ctx == 1 ||
rq->cmd_flags & REQ_POLLED)
return false;
if (blk_mq_complete_need_ipi(rq)) {
@ -1213,6 +1215,12 @@ void blk_execute_rq_nowait(struct request *rq, bool at_head)
WARN_ON(!blk_rq_is_passthrough(rq));
blk_account_io_start(rq);
/*
* As plugging can be enabled for passthrough requests on a zoned
* device, directly accessing the plug instead of using blk_mq_plug()
* should not have any consequences.
*/
if (current->plug)
blk_add_rq_to_plug(current->plug, rq);
else
@ -1992,7 +2000,7 @@ out:
if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
else if (needs_restart && needs_resource)
else if (needs_resource)
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true);
@ -4191,7 +4199,7 @@ static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
return 0;
}
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
/*
* blk_mq_map_queues() and multiple .map_queues() implementations
@ -4221,10 +4229,10 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
for (i = 0; i < set->nr_maps; i++)
blk_mq_clear_mq_map(&set->map[i]);
return set->ops->map_queues(set);
set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
}
@ -4323,9 +4331,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
}
ret = blk_mq_update_queue_map(set);
if (ret)
goto out_free_mq_map;
blk_mq_update_queue_map(set);
ret = blk_mq_alloc_set_map_and_rqs(set);
if (ret)
@ -4473,14 +4479,14 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
list_add(&qe->node, head);
/*
* After elevator_switch_mq, the previous elevator_queue will be
* After elevator_switch, the previous elevator_queue will be
* released by elevator_release. The reference of the io scheduler
* module get by elevator_get will also be put. So we need to get
* a reference of the io scheduler module here to prevent it to be
* removed.
*/
__module_get(qe->type->elevator_owner);
elevator_switch_mq(q, NULL);
elevator_switch(q, NULL);
mutex_unlock(&q->sysfs_lock);
return true;
@ -4512,7 +4518,7 @@ static void blk_mq_elv_switch_back(struct list_head *head,
kfree(qe);
mutex_lock(&q->sysfs_lock);
elevator_switch_mq(q, t);
elevator_switch(q, t);
mutex_unlock(&q->sysfs_lock);
}
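The comments added in blk_execute_rq_nowait() and bio_poll() explain why touching current->plug directly is safe there; the blk-mq.h hunk just below is the other half: blk_mq_plug() refuses to hand out a plug for writes that would need a zone write lock, since plugging could reorder them. A sketch of the helper that check relies on, assuming the usual zoned-write test (the name comes from the hunk below, the parameter type and exact definition are assumed):

	/* sketch: does this op need zone write locking on this bdev? */
	static inline bool bdev_op_is_zoned_write(struct block_device *bdev,
						  blk_opf_t op)
	{
		if (!bdev_is_zoned(bdev))
			return false;
		return op == REQ_OP_WRITE || op == REQ_OP_WRITE_ZEROES;
	}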


@ -312,7 +312,8 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
static inline struct blk_plug *blk_mq_plug( struct bio *bio)
{
/* Zoned block device write operation case: do not plug the BIO */
if (bdev_is_zoned(bio->bi_bdev) && op_is_write(bio_op(bio)))
if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
return NULL;
/*


@ -17,7 +17,6 @@ enum rq_qos_id {
RQ_QOS_WBT,
RQ_QOS_LATENCY,
RQ_QOS_COST,
RQ_QOS_IOPRIO,
};
struct rq_wait {


@ -844,7 +844,7 @@ int blk_register_queue(struct gendisk *disk)
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register_queue(q);
blk_throtl_register(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&q->kobj, KOBJ_ADD);


@ -329,8 +329,8 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
INIT_LIST_HEAD(&sq->queued[0]);
INIT_LIST_HEAD(&sq->queued[1]);
INIT_LIST_HEAD(&sq->queued[READ]);
INIT_LIST_HEAD(&sq->queued[WRITE]);
sq->pending_tree = RB_ROOT_CACHED;
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
@ -420,24 +420,17 @@ static void tg_update_has_rules(struct throtl_grp *tg)
struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
struct throtl_data *td = tg->td;
int rw;
int has_iops_limit = 0;
for (rw = READ; rw <= WRITE; rw++) {
unsigned int iops_limit = tg_iops_limit(tg, rw);
tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
tg->has_rules_iops[rw] =
(parent_tg && parent_tg->has_rules_iops[rw]) ||
(td->limit_valid[td->limit_index] &&
(tg_bps_limit(tg, rw) != U64_MAX ||
iops_limit != UINT_MAX));
if (iops_limit != UINT_MAX)
has_iops_limit = 1;
tg_iops_limit(tg, rw) != UINT_MAX);
tg->has_rules_bps[rw] =
(parent_tg && parent_tg->has_rules_bps[rw]) ||
(td->limit_valid[td->limit_index] &&
(tg_bps_limit(tg, rw) != U64_MAX));
}
if (has_iops_limit)
tg->flags |= THROTL_TG_HAS_IOPS_LIMIT;
else
tg->flags &= ~THROTL_TG_HAS_IOPS_LIMIT;
}
static void throtl_pd_online(struct blkg_policy_data *pd)
@ -520,7 +513,6 @@ static void throtl_rb_erase(struct rb_node *n,
{
rb_erase_cached(n, &parent_sq->pending_tree);
RB_CLEAR_NODE(n);
--parent_sq->nr_pending;
}
static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
@ -572,7 +564,11 @@ static void throtl_enqueue_tg(struct throtl_grp *tg)
static void throtl_dequeue_tg(struct throtl_grp *tg)
{
if (tg->flags & THROTL_TG_PENDING) {
throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
struct throtl_service_queue *parent_sq =
tg->service_queue.parent_sq;
throtl_rb_erase(&tg->rb_node, parent_sq);
--parent_sq->nr_pending;
tg->flags &= ~THROTL_TG_PENDING;
}
}
@ -639,6 +635,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
tg->carryover_bytes[rw] = 0;
tg->carryover_ios[rw] = 0;
/*
* Previous slice has expired. We must have trimmed it after last
@ -656,12 +654,17 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
tg->slice_end[rw], jiffies);
}
static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
bool clear_carryover)
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
if (clear_carryover) {
tg->carryover_bytes[rw] = 0;
tg->carryover_ios[rw] = 0;
}
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
@ -754,13 +757,76 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
tg->slice_start[rw], tg->slice_end[rw], jiffies);
}
static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
u32 iops_limit, unsigned long *wait)
static unsigned int calculate_io_allowed(u32 iops_limit,
unsigned long jiffy_elapsed)
{
unsigned int io_allowed;
u64 tmp;
/*
* jiffy_elapsed should not be a big value as minimum iops can be
* 1 then at max jiffy elapsed should be equivalent of 1 second as we
* will allow dispatch after 1 second and after that slice should
* have been trimmed.
*/
tmp = (u64)iops_limit * jiffy_elapsed;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
io_allowed = UINT_MAX;
else
io_allowed = tmp;
return io_allowed;
}
static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
{
return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
}
static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
{
unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
u64 bps_limit = tg_bps_limit(tg, rw);
u32 iops_limit = tg_iops_limit(tg, rw);
/*
* If config is updated while bios are still throttled, calculate and
* accumulate how many bytes/ios are waited across changes. And
* carryover_bytes/ios will be used to calculate new wait time under new
* configuration.
*/
if (bps_limit != U64_MAX)
tg->carryover_bytes[rw] +=
calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
tg->bytes_disp[rw];
if (iops_limit != UINT_MAX)
tg->carryover_ios[rw] +=
calculate_io_allowed(iops_limit, jiffy_elapsed) -
tg->io_disp[rw];
}
static void tg_update_carryover(struct throtl_grp *tg)
{
if (tg->service_queue.nr_queued[READ])
__tg_update_carryover(tg, READ);
if (tg->service_queue.nr_queued[WRITE])
__tg_update_carryover(tg, WRITE);
/* see comments in struct throtl_grp for meaning of these fields. */
throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__,
tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
}
static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
u32 iops_limit, unsigned long *wait)
{
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
if (iops_limit == UINT_MAX) {
if (wait)
@ -772,22 +838,8 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
/* Round up to the next throttle slice, wait time must be nonzero */
jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
* 1 then at max jiffy elapsed should be equivalent of 1 second as we
* will allow dispatch after 1 second and after that slice should
* have been trimmed.
*/
tmp = (u64)iops_limit * jiffy_elapsed_rnd;
do_div(tmp, HZ);
if (tmp > UINT_MAX)
io_allowed = UINT_MAX;
else
io_allowed = tmp;
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
tg->carryover_ios[rw];
if (tg->io_disp[rw] + 1 <= io_allowed) {
if (wait)
*wait = 0;
@ -802,16 +854,16 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
return false;
}
static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
u64 bps_limit, unsigned long *wait)
static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
u64 bps_limit, unsigned long *wait)
{
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes, tmp;
u64 bytes_allowed, extra_bytes;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
/* no need to throttle if this bio's bytes have been accounted */
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_THROTTLED)) {
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
if (wait)
*wait = 0;
return true;
@ -824,11 +876,8 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
tmp = bps_limit * jiffy_elapsed_rnd;
do_div(tmp, HZ);
bytes_allowed = tmp;
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
tg->carryover_bytes[rw];
if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
if (wait)
*wait = 0;
@ -889,7 +938,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* slice and it should be extended instead.
*/
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
throtl_start_new_slice(tg, rw);
throtl_start_new_slice(tg, rw, true);
else {
if (time_before(tg->slice_end[rw],
jiffies + tg->td->throtl_slice))
@ -897,8 +946,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
if (wait)
*wait = 0;
return true;
@ -921,22 +970,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
if (!bio_flagged(bio, BIO_THROTTLED)) {
if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
tg->bytes_disp[rw] += bio_size;
tg->last_bytes_disp[rw] += bio_size;
}
tg->io_disp[rw]++;
tg->last_io_disp[rw]++;
/*
* BIO_THROTTLED is used to prevent the same bio to be throttled
* more than once as a throttled bio will go through blk-throtl the
* second time when it eventually gets issued. Set it when a bio
* is being charged to a tg.
*/
if (!bio_flagged(bio, BIO_THROTTLED))
bio_set_flag(bio, BIO_THROTTLED);
}
/**
@ -990,9 +1030,9 @@ static void tg_update_disptime(struct throtl_grp *tg)
disptime = jiffies + min_wait;
/* Update dispatch time */
throtl_dequeue_tg(tg);
throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
tg->disptime = disptime;
throtl_enqueue_tg(tg);
tg_service_queue_add(tg);
/* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY;
@ -1026,6 +1066,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio);
bio_set_flag(bio, BIO_BPS_THROTTLED);
/*
* If our parent is another tg, we just need to transfer @bio to
@ -1101,13 +1142,13 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
if (time_before(jiffies, tg->disptime))
break;
throtl_dequeue_tg(tg);
nr_disp += throtl_dispatch_tg(tg);
sq = &tg->service_queue;
if (sq->nr_queued[0] || sq->nr_queued[1])
if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
tg_update_disptime(tg);
else
throtl_dequeue_tg(tg);
if (nr_disp >= THROTL_QUANTUM)
break;
@ -1321,8 +1362,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
* that a group's limit are dropped suddenly and we don't want to
* account recently dispatched IO with new low rate.
*/
throtl_start_new_slice(tg, READ);
throtl_start_new_slice(tg, WRITE);
throtl_start_new_slice(tg, READ, false);
throtl_start_new_slice(tg, WRITE, false);
if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg);
@ -1350,6 +1391,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
v = U64_MAX;
tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
if (is_u64)
*(u64 *)((void *)tg + of_cft(of)->private) = v;
@ -1536,6 +1578,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
return ret;
tg = blkg_to_tg(ctx.blkg);
tg_update_carryover(tg);
v[0] = tg->bps_conf[READ][index];
v[1] = tg->bps_conf[WRITE][index];
@ -1673,6 +1716,41 @@ struct blkcg_policy blkcg_policy_throtl = {
.pd_free_fn = throtl_pd_free,
};
void blk_throtl_cancel_bios(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
spin_lock_irq(&q->queue_lock);
/*
* queue_lock is held, rcu lock is not needed here technically.
* However, rcu lock is still held to emphasize that following
* path need RCU protection and to prevent warning from lockdep.
*/
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq = &tg->service_queue;
/*
* Set the flag to make sure throtl_pending_timer_fn() won't
* stop until all throttled bios are dispatched.
*/
blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
/*
* Update disptime after setting the above flag to make sure
* throtl_select_dispatch() won't exit without dispatching.
*/
tg_update_disptime(tg);
throtl_schedule_pending_timer(sq, jiffies + 1);
}
rcu_read_unlock();
spin_unlock_irq(&q->queue_lock);
}
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
{
unsigned long rtime = jiffies, wtime = jiffies;
@ -1777,39 +1855,6 @@ static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
return false;
}
void blk_throtl_cancel_bios(struct request_queue *q)
{
struct cgroup_subsys_state *pos_css;
struct blkcg_gq *blkg;
spin_lock_irq(&q->queue_lock);
/*
* queue_lock is held, rcu lock is not needed here technically.
* However, rcu lock is still held to emphasize that following
* path need RCU protection and to prevent warning from lockdep.
*/
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq = &tg->service_queue;
/*
* Set the flag to make sure throtl_pending_timer_fn() won't
* stop until all throttled bios are dispatched.
*/
blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
/*
* Update disptime after setting the above flag to make sure
* throtl_select_dispatch() won't exit without dispatching.
*/
tg_update_disptime(tg);
throtl_schedule_pending_timer(sq, jiffies + 1);
}
rcu_read_unlock();
spin_unlock_irq(&q->queue_lock);
}
static bool throtl_can_upgrade(struct throtl_data *td,
struct throtl_grp *this_tg)
{
@ -2005,7 +2050,6 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
tg->checked_last_finish_time = last_finish_time;
}
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td)
{
struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
@ -2086,6 +2130,28 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
static inline void throtl_update_latency_buckets(struct throtl_data *td)
{
}
static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
}
static void throtl_downgrade_check(struct throtl_grp *tg)
{
}
static void throtl_upgrade_check(struct throtl_grp *tg)
{
}
static bool throtl_can_upgrade(struct throtl_data *td,
struct throtl_grp *this_tg)
{
return false;
}
static void throtl_upgrade_state(struct throtl_data *td)
{
}
#endif
bool __blk_throtl_bio(struct bio *bio)
@ -2159,8 +2225,10 @@ again:
qn = &tg->qnode_on_parent[rw];
sq = sq->parent_sq;
tg = sq_to_tg(sq);
if (!tg)
if (!tg) {
bio_set_flag(bio, BIO_BPS_THROTTLED);
goto out_unlock;
}
}
/* out-of-limit, queue to @tg */
@ -2189,8 +2257,6 @@ again:
}
out_unlock:
bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency)
bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
@ -2286,8 +2352,9 @@ void blk_throtl_bio_endio(struct bio *bio)
}
#endif
int blk_throtl_init(struct request_queue *q)
int blk_throtl_init(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct throtl_data *td;
int ret;
@ -2329,8 +2396,10 @@ int blk_throtl_init(struct request_queue *q)
return ret;
}
void blk_throtl_exit(struct request_queue *q)
void blk_throtl_exit(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
BUG_ON(!q->td);
del_timer_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
@ -2340,8 +2409,9 @@ void blk_throtl_exit(struct request_queue *q)
kfree(q->td);
}
void blk_throtl_register_queue(struct request_queue *q)
void blk_throtl_register(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
struct throtl_data *td;
int i;

View file

@ -55,8 +55,7 @@ struct throtl_service_queue {
enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
THROTL_TG_HAS_IOPS_LIMIT = 1 << 2, /* tg has iops limit */
THROTL_TG_CANCELING = 1 << 3, /* starts to cancel bio */
THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
};
enum {
@ -99,7 +98,8 @@ struct throtl_grp {
unsigned int flags;
/* are there any throtl rules between this group and td? */
bool has_rules[2];
bool has_rules_bps[2];
bool has_rules_iops[2];
/* internally used bytes per second rate limits */
uint64_t bps[2][LIMIT_CNT];
@ -121,6 +121,15 @@ struct throtl_grp {
uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2];
/*
* The following two fields are updated when a new configuration is
* submitted while some bios are still throttled. They record how many
* bytes/ios were already waited for under the previous configuration,
* and are used to calculate the wait time under the new configuration.
*/
uint64_t carryover_bytes[2];
unsigned int carryover_ios[2];
unsigned long last_check_time;
unsigned long latency_target; /* us */
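
The carryover_bytes/carryover_ios fields above feed the wait-time calculation when limits are changed while bios are still queued. As a rough illustration only — this is not blk-throttle's actual tg_within_bps_limit() logic, and the helper below is hypothetical — carried-over bytes can be credited against the bytes still queued before converting the remainder into a wait time at the new rate:

#include <stdint.h>

/*
 * Hypothetical sketch: credit bytes that were already waited for under
 * the old limits, then convert whatever remains into a wait time (ns)
 * at the new bytes-per-second limit.
 */
static uint64_t sketch_bps_wait_ns(uint64_t new_bps_limit,
				   uint64_t bytes_queued,
				   uint64_t carryover_bytes)
{
	if (!new_bps_limit || bytes_queued <= carryover_bytes)
		return 0;	/* nothing left to pay for */

	return (bytes_queued - carryover_bytes) * 1000000000ULL /
	       new_bps_limit;
}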
@ -159,27 +168,37 @@ static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
* Internal throttling interface
*/
#ifndef CONFIG_BLK_DEV_THROTTLING
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
static inline int blk_throtl_init(struct gendisk *disk) { return 0; }
static inline void blk_throtl_exit(struct gendisk *disk) { }
static inline void blk_throtl_register(struct gendisk *disk) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
static inline void blk_throtl_cancel_bios(struct request_queue *q) { }
static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
#else /* CONFIG_BLK_DEV_THROTTLING */
int blk_throtl_init(struct request_queue *q);
void blk_throtl_exit(struct request_queue *q);
void blk_throtl_register_queue(struct request_queue *q);
int blk_throtl_init(struct gendisk *disk);
void blk_throtl_exit(struct gendisk *disk);
void blk_throtl_register(struct gendisk *disk);
bool __blk_throtl_bio(struct bio *bio);
void blk_throtl_cancel_bios(struct request_queue *q);
static inline bool blk_throtl_bio(struct bio *bio)
void blk_throtl_cancel_bios(struct gendisk *disk);
static inline bool blk_should_throtl(struct bio *bio)
{
struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
int rw = bio_data_dir(bio);
/* no need to throttle bps any more if the bio has been throttled */
if (bio_flagged(bio, BIO_THROTTLED) &&
!(tg->flags & THROTL_TG_HAS_IOPS_LIMIT))
return false;
/* iops limit is always counted */
if (tg->has_rules_iops[rw])
return true;
if (!tg->has_rules[bio_data_dir(bio)])
if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED))
return true;
return false;
}
static inline bool blk_throtl_bio(struct bio *bio)
{
if (!blk_should_throtl(bio))
return false;
return __blk_throtl_bio(bio);

View file

@ -843,6 +843,10 @@ int wbt_init(struct request_queue *q)
rwb->enable_state = WBT_STATE_ON_DEFAULT;
rwb->wc = 1;
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_queue_depth_changed(&rwb->rqos);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
/*
* Assign rwb and add the stats callback.
@ -853,11 +857,6 @@ int wbt_init(struct request_queue *q)
blk_stat_add_callback(q, rwb->cb);
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
wbt_queue_depth_changed(&rwb->rqos);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0;
err_free:

View file

@ -63,13 +63,10 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
if (!rq->q->disk->seq_zones_wlock)
return false;
switch (req_op(rq)) {
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE:
if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq)))
return blk_rq_zone_is_seq(rq);
default:
return false;
}
return false;
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);

View file

@ -270,8 +270,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
void blk_insert_flush(struct request *rq);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
@ -389,9 +388,9 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
}
#ifdef CONFIG_BLK_CGROUP_IOLATENCY
extern int blk_iolatency_init(struct request_queue *q);
int blk_iolatency_init(struct gendisk *disk);
#else
static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
static inline int blk_iolatency_init(struct gendisk *disk) { return 0; }
#endif
#ifdef CONFIG_BLK_DEV_ZONED

View file

@ -588,7 +588,7 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
int elevator_switch_mq(struct request_queue *q,
static int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e)
{
int ret;
@ -723,7 +723,7 @@ void elevator_init_mq(struct request_queue *q)
* need for the new one. this way we have a chance of going back to the old
* one, if the new one fails init for some reason.
*/
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
int err;

View file

@ -627,7 +627,7 @@ void del_gendisk(struct gendisk *disk)
blk_mq_freeze_queue_wait(q);
blk_throtl_cancel_bios(disk->queue);
blk_throtl_cancel_bios(disk);
blk_sync_queue(q);
blk_flush_integrity();
@ -1151,7 +1151,8 @@ static void disk_release(struct device *dev)
!test_bit(GD_ADDED, &disk->state))
blk_mq_exit_queue(disk->queue);
blkcg_exit_queue(disk->queue);
blkcg_exit_disk(disk);
bioset_exit(&disk->bio_split);
disk_release_events(disk);
@ -1364,7 +1365,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
goto out_destroy_part_tbl;
if (blkcg_init_queue(q))
if (blkcg_init_disk(disk))
goto out_erase_part0;
rand_initialize_disk(disk);

View file

@ -39,7 +39,12 @@ enum opal_response_token {
#define FIRST_TPER_SESSION_NUM 4096
#define TPER_SYNC_SUPPORTED 0x01
/* FC_LOCKING features */
#define LOCKING_SUPPORTED_MASK 0x01
#define LOCKING_ENABLED_MASK 0x02
#define LOCKED_MASK 0x04
#define MBR_ENABLED_MASK 0x10
#define MBR_DONE_MASK 0x20
#define TINY_ATOM_DATA_MASK 0x3F
#define TINY_ATOM_SIGNED 0x40

View file

@ -74,8 +74,7 @@ struct parsed_resp {
};
struct opal_dev {
bool supported;
bool mbr_enabled;
u32 flags;
void *data;
sec_send_recv *send_recv;
@ -280,6 +279,30 @@ static bool check_tper(const void *data)
return true;
}
static bool check_lcksuppt(const void *data)
{
const struct d0_locking_features *lfeat = data;
u8 sup_feat = lfeat->supported_features;
return !!(sup_feat & LOCKING_SUPPORTED_MASK);
}
static bool check_lckenabled(const void *data)
{
const struct d0_locking_features *lfeat = data;
u8 sup_feat = lfeat->supported_features;
return !!(sup_feat & LOCKING_ENABLED_MASK);
}
static bool check_locked(const void *data)
{
const struct d0_locking_features *lfeat = data;
u8 sup_feat = lfeat->supported_features;
return !!(sup_feat & LOCKED_MASK);
}
static bool check_mbrenabled(const void *data)
{
const struct d0_locking_features *lfeat = data;
@ -288,6 +311,14 @@ static bool check_mbrenabled(const void *data)
return !!(sup_feat & MBR_ENABLED_MASK);
}
static bool check_mbrdone(const void *data)
{
const struct d0_locking_features *lfeat = data;
u8 sup_feat = lfeat->supported_features;
return !!(sup_feat & MBR_DONE_MASK);
}
static bool check_sum(const void *data)
{
const struct d0_single_user_mode *sum = data;
@ -435,7 +466,7 @@ static int opal_discovery0_end(struct opal_dev *dev)
u32 hlen = be32_to_cpu(hdr->length);
print_buffer(dev->resp, hlen);
dev->mbr_enabled = false;
dev->flags &= OPAL_FL_SUPPORTED;
if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
@ -461,7 +492,16 @@ static int opal_discovery0_end(struct opal_dev *dev)
check_geometry(dev, body);
break;
case FC_LOCKING:
dev->mbr_enabled = check_mbrenabled(body->features);
if (check_lcksuppt(body->features))
dev->flags |= OPAL_FL_LOCKING_SUPPORTED;
if (check_lckenabled(body->features))
dev->flags |= OPAL_FL_LOCKING_ENABLED;
if (check_locked(body->features))
dev->flags |= OPAL_FL_LOCKED;
if (check_mbrenabled(body->features))
dev->flags |= OPAL_FL_MBR_ENABLED;
if (check_mbrdone(body->features))
dev->flags |= OPAL_FL_MBR_DONE;
break;
case FC_ENTERPRISE:
case FC_DATASTORE:
@ -2109,7 +2149,8 @@ static int check_opal_support(struct opal_dev *dev)
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = opal_discovery0_step(dev);
dev->supported = !ret;
if (!ret)
dev->flags |= OPAL_FL_SUPPORTED;
mutex_unlock(&dev->dev_lock);
return ret;
@ -2148,6 +2189,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
INIT_LIST_HEAD(&dev->unlk_lst);
mutex_init(&dev->dev_lock);
dev->flags = 0;
dev->data = data;
dev->send_recv = send_recv;
if (check_opal_support(dev) != 0) {
@ -2528,7 +2570,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
if (!dev)
return false;
if (!dev->supported)
if (!(dev->flags & OPAL_FL_SUPPORTED))
return false;
mutex_lock(&dev->dev_lock);
@ -2546,7 +2588,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
was_failure = true;
}
if (dev->mbr_enabled) {
if (dev->flags & OPAL_FL_MBR_ENABLED) {
ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
if (ret)
pr_debug("Failed to set MBR Done in S3 resume\n");
@ -2620,6 +2662,23 @@ static int opal_generic_read_write_table(struct opal_dev *dev,
return ret;
}
static int opal_get_status(struct opal_dev *dev, void __user *data)
{
struct opal_status sts = {0};
/*
* a check_opal_support() error is not fatal; a device without
* Opal support (no OPAL_FL_SUPPORTED in dev->flags) is valid
*/
if (!check_opal_support(dev))
sts.flags = dev->flags;
if (copy_to_user(data, &sts, sizeof(sts))) {
pr_debug("Error copying status to userspace\n");
return -EFAULT;
}
return 0;
}
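
Since IOC_OPAL_GET_STATUS is dispatched below like the other commands, a short userspace sketch shows how the new status flags might be consumed. This assumes the uapi <linux/sed-opal.h> from this series exposes IOC_OPAL_GET_STATUS, struct opal_status with a flags member, and the OPAL_FL_* bits; it is an illustration, not part of the patch:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

int main(int argc, char **argv)
{
	struct opal_status sts = { 0 };
	int fd = open(argc > 1 ? argv[1] : "/dev/nvme0n1", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Fails with ENOTSUPP on devices without Opal support. */
	if (ioctl(fd, IOC_OPAL_GET_STATUS, &sts) == 0)
		printf("opal flags: 0x%x locked=%d mbr_enabled=%d\n",
		       sts.flags,
		       !!(sts.flags & OPAL_FL_LOCKED),
		       !!(sts.flags & OPAL_FL_MBR_ENABLED));
	close(fd);
	return 0;
}

Note that opal_get_status() is called with the raw user pointer rather than a memdup_user() copy, which is why sed_ioctl() below only copies the argument in for _IOC_IN commands and frees it under the same condition.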
int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
{
void *p;
@ -2629,12 +2688,14 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
return -EACCES;
if (!dev)
return -ENOTSUPP;
if (!dev->supported)
if (!(dev->flags & OPAL_FL_SUPPORTED))
return -ENOTSUPP;
p = memdup_user(arg, _IOC_SIZE(cmd));
if (IS_ERR(p))
return PTR_ERR(p);
if (cmd & IOC_IN) {
p = memdup_user(arg, _IOC_SIZE(cmd));
if (IS_ERR(p))
return PTR_ERR(p);
}
switch (cmd) {
case IOC_OPAL_SAVE:
@ -2685,11 +2746,15 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_GENERIC_TABLE_RW:
ret = opal_generic_read_write_table(dev, p);
break;
case IOC_OPAL_GET_STATUS:
ret = opal_get_status(dev, arg);
break;
default:
break;
}
kfree(p);
if (cmd & IOC_IN)
kfree(p);
return ret;
}
EXPORT_SYMBOL_GPL(sed_ioctl);