[PATCH] Update cfq io scheduler to time sliced design

This updates the CFQ io scheduler to the new time sliced design (cfq
v3).  It provides full process fairness, while giving excellent
aggregate system throughput even for many competing processes.  It
supports io priorities, either inherited from the cpu nice value or set
directly with the ioprio_get/set syscalls.  The latter closely mimic
set/getpriority.

This import is based on my latest from -mm.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
This commit is contained in:
Jens Axboe 2005-06-27 10:55:12 +02:00 committed by Linus Torvalds
parent 020f46a39e
commit 22e2c507c3
26 changed files with 1732 additions and 739 deletions

View File

@ -289,3 +289,5 @@ ENTRY(sys_call_table)
.long sys_add_key
.long sys_request_key
.long sys_keyctl
.long sys_ioprio_set
.long sys_ioprio_get /* 290 */

View File

@ -1577,8 +1577,8 @@ sys_call_table:
data8 sys_add_key
data8 sys_request_key
data8 sys_keyctl
data8 sys_ni_syscall
data8 sys_ni_syscall // 1275
data8 sys_ioprio_set
data8 sys_ioprio_get // 1275
data8 sys_set_zone_reclaim
data8 sys_ni_syscall
data8 sys_ni_syscall

View File

@ -1449,3 +1449,5 @@ _GLOBAL(sys_call_table)
.long sys_request_key /* 270 */
.long sys_keyctl
.long sys_waitid
.long sys_ioprio_set
.long sys_ioprio_get

View File

@ -1806,7 +1806,8 @@ static void as_put_request(request_queue_t *q, struct request *rq)
rq->elevator_private = NULL;
}
static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
static int as_set_request(request_queue_t *q, struct request *rq,
struct bio *bio, int gfp_mask)
{
struct as_data *ad = q->elevator->elevator_data;
struct as_rq *arq = mempool_alloc(ad->arq_pool, gfp_mask);
@ -1827,7 +1828,7 @@ static int as_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
return 1;
}
static int as_may_queue(request_queue_t *q, int rw)
static int as_may_queue(request_queue_t *q, int rw, struct bio *bio)
{
int ret = ELV_MQUEUE_MAY;
struct as_data *ad = q->elevator->elevator_data;

File diff suppressed because it is too large Load Diff

View File

@ -760,7 +760,8 @@ static void deadline_put_request(request_queue_t *q, struct request *rq)
}
static int
deadline_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
deadline_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
int gfp_mask)
{
struct deadline_data *dd = q->elevator->elevator_data;
struct deadline_rq *drq;

View File

@ -486,12 +486,13 @@ struct request *elv_former_request(request_queue_t *q, struct request *rq)
return NULL;
}
int elv_set_request(request_queue_t *q, struct request *rq, int gfp_mask)
int elv_set_request(request_queue_t *q, struct request *rq, struct bio *bio,
int gfp_mask)
{
elevator_t *e = q->elevator;
if (e->ops->elevator_set_req_fn)
return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
return e->ops->elevator_set_req_fn(q, rq, bio, gfp_mask);
rq->elevator_private = NULL;
return 0;
@ -505,12 +506,12 @@ void elv_put_request(request_queue_t *q, struct request *rq)
e->ops->elevator_put_req_fn(q, rq);
}
int elv_may_queue(request_queue_t *q, int rw)
int elv_may_queue(request_queue_t *q, int rw, struct bio *bio)
{
elevator_t *e = q->elevator;
if (e->ops->elevator_may_queue_fn)
return e->ops->elevator_may_queue_fn(q, rw);
return e->ops->elevator_may_queue_fn(q, rw, bio);
return ELV_MQUEUE_MAY;
}

View File

@ -276,6 +276,7 @@ static inline void rq_init(request_queue_t *q, struct request *rq)
rq->errors = 0;
rq->rq_status = RQ_ACTIVE;
rq->bio = rq->biotail = NULL;
rq->ioprio = 0;
rq->buffer = NULL;
rq->ref_count = 1;
rq->q = q;
@ -1442,11 +1443,7 @@ void __generic_unplug_device(request_queue_t *q)
if (!blk_remove_plug(q))
return;
/*
* was plugged, fire request_fn if queue has stuff to do
*/
if (elv_next_request(q))
q->request_fn(q);
q->request_fn(q);
}
EXPORT_SYMBOL(__generic_unplug_device);
@ -1776,8 +1773,8 @@ static inline void blk_free_request(request_queue_t *q, struct request *rq)
mempool_free(rq, q->rq.rq_pool);
}
static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
int gfp_mask)
static inline struct request *
blk_alloc_request(request_queue_t *q, int rw, struct bio *bio, int gfp_mask)
{
struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
@ -1790,7 +1787,7 @@ static inline struct request *blk_alloc_request(request_queue_t *q, int rw,
*/
rq->flags = rw;
if (!elv_set_request(q, rq, gfp_mask))
if (!elv_set_request(q, rq, bio, gfp_mask))
return rq;
mempool_free(rq, q->rq.rq_pool);
@ -1872,7 +1869,8 @@ static void freed_request(request_queue_t *q, int rw)
/*
* Get a free request, queue_lock must not be held
*/
static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
static struct request *get_request(request_queue_t *q, int rw, struct bio *bio,
int gfp_mask)
{
struct request *rq = NULL;
struct request_list *rl = &q->rq;
@ -1895,7 +1893,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
}
}
switch (elv_may_queue(q, rw)) {
switch (elv_may_queue(q, rw, bio)) {
case ELV_MQUEUE_NO:
goto rq_starved;
case ELV_MQUEUE_MAY:
@ -1920,7 +1918,7 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
set_queue_congested(q, rw);
spin_unlock_irq(q->queue_lock);
rq = blk_alloc_request(q, rw, gfp_mask);
rq = blk_alloc_request(q, rw, bio, gfp_mask);
if (!rq) {
/*
* Allocation failed presumably due to memory. Undo anything
@ -1961,7 +1959,8 @@ static struct request *get_request(request_queue_t *q, int rw, int gfp_mask)
* No available requests for this queue, unplug the device and wait for some
* requests to become available.
*/
static struct request *get_request_wait(request_queue_t *q, int rw)
static struct request *get_request_wait(request_queue_t *q, int rw,
struct bio *bio)
{
DEFINE_WAIT(wait);
struct request *rq;
@ -1972,7 +1971,7 @@ static struct request *get_request_wait(request_queue_t *q, int rw)
prepare_to_wait_exclusive(&rl->wait[rw], &wait,
TASK_UNINTERRUPTIBLE);
rq = get_request(q, rw, GFP_NOIO);
rq = get_request(q, rw, bio, GFP_NOIO);
if (!rq) {
struct io_context *ioc;
@ -2003,9 +2002,9 @@ struct request *blk_get_request(request_queue_t *q, int rw, int gfp_mask)
BUG_ON(rw != READ && rw != WRITE);
if (gfp_mask & __GFP_WAIT)
rq = get_request_wait(q, rw);
rq = get_request_wait(q, rw, NULL);
else
rq = get_request(q, rw, gfp_mask);
rq = get_request(q, rw, NULL, gfp_mask);
return rq;
}
@ -2333,7 +2332,6 @@ static void __blk_put_request(request_queue_t *q, struct request *req)
return;
req->rq_status = RQ_INACTIVE;
req->q = NULL;
req->rl = NULL;
/*
@ -2462,6 +2460,8 @@ static int attempt_merge(request_queue_t *q, struct request *req,
req->rq_disk->in_flight--;
}
req->ioprio = ioprio_best(req->ioprio, next->ioprio);
__blk_put_request(q, next);
return 1;
}
@ -2514,11 +2514,13 @@ static int __make_request(request_queue_t *q, struct bio *bio)
{
struct request *req, *freereq = NULL;
int el_ret, rw, nr_sectors, cur_nr_sectors, barrier, err, sync;
unsigned short prio;
sector_t sector;
sector = bio->bi_sector;
nr_sectors = bio_sectors(bio);
cur_nr_sectors = bio_cur_sectors(bio);
prio = bio_prio(bio);
rw = bio_data_dir(bio);
sync = bio_sync(bio);
@ -2559,6 +2561,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
req->biotail->bi_next = bio;
req->biotail = bio;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
req->ioprio = ioprio_best(req->ioprio, prio);
drive_stat_acct(req, nr_sectors, 0);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req);
@ -2583,6 +2586,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
req->hard_cur_sectors = cur_nr_sectors;
req->sector = req->hard_sector = sector;
req->nr_sectors = req->hard_nr_sectors += nr_sectors;
req->ioprio = ioprio_best(req->ioprio, prio);
drive_stat_acct(req, nr_sectors, 0);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req);
@ -2610,7 +2614,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
freereq = NULL;
} else {
spin_unlock_irq(q->queue_lock);
if ((freereq = get_request(q, rw, GFP_ATOMIC)) == NULL) {
if ((freereq = get_request(q, rw, bio, GFP_ATOMIC)) == NULL) {
/*
* READA bit set
*/
@ -2618,7 +2622,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
if (bio_rw_ahead(bio))
goto end_io;
freereq = get_request_wait(q, rw);
freereq = get_request_wait(q, rw, bio);
}
goto again;
}
@ -2646,6 +2650,7 @@ static int __make_request(request_queue_t *q, struct bio *bio)
req->buffer = bio_data(bio); /* see ->buffer comment above */
req->waiting = NULL;
req->bio = req->biotail = bio;
req->ioprio = prio;
req->rq_disk = bio->bi_bdev->bd_disk;
req->start_time = jiffies;
@ -2674,7 +2679,7 @@ static inline void blk_partition_remap(struct bio *bio)
if (bdev != bdev->bd_contains) {
struct hd_struct *p = bdev->bd_part;
switch (bio->bi_rw) {
switch (bio_data_dir(bio)) {
case READ:
p->read_sectors += bio_sectors(bio);
p->reads++;
@ -2693,6 +2698,7 @@ void blk_finish_queue_drain(request_queue_t *q)
{
struct request_list *rl = &q->rq;
struct request *rq;
int requeued = 0;
spin_lock_irq(q->queue_lock);
clear_bit(QUEUE_FLAG_DRAIN, &q->queue_flags);
@ -2701,9 +2707,13 @@ void blk_finish_queue_drain(request_queue_t *q)
rq = list_entry_rq(q->drain_list.next);
list_del_init(&rq->queuelist);
__elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 1);
elv_requeue_request(q, rq);
requeued++;
}
if (requeued)
q->request_fn(q);
spin_unlock_irq(q->queue_lock);
wake_up(&rl->wait[0]);
@ -2900,7 +2910,7 @@ void submit_bio(int rw, struct bio *bio)
BIO_BUG_ON(!bio->bi_size);
BIO_BUG_ON(!bio->bi_io_vec);
bio->bi_rw = rw;
bio->bi_rw |= rw;
if (rw & WRITE)
mod_page_state(pgpgout, count);
else
@ -3257,8 +3267,11 @@ void exit_io_context(void)
struct io_context *ioc;
local_irq_save(flags);
task_lock(current);
ioc = current->io_context;
current->io_context = NULL;
ioc->task = NULL;
task_unlock(current);
local_irq_restore(flags);
if (ioc->aic && ioc->aic->exit)
@ -3293,12 +3306,12 @@ struct io_context *get_io_context(int gfp_flags)
ret = kmem_cache_alloc(iocontext_cachep, gfp_flags);
if (ret) {
atomic_set(&ret->refcount, 1);
ret->pid = tsk->pid;
ret->task = current;
ret->set_ioprio = NULL;
ret->last_waited = jiffies; /* doesn't matter... */
ret->nr_batch_requests = 0; /* because this is 0 */
ret->aic = NULL;
ret->cic = NULL;
spin_lock_init(&ret->lock);
local_irq_save(flags);

View File

@ -10,6 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \
ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
ioprio.o
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_COMPAT) += compat.o

172
fs/ioprio.c Normal file
View File

@ -0,0 +1,172 @@
/*
* fs/ioprio.c
*
* Copyright (C) 2004 Jens Axboe <axboe@suse.de>
*
* Helper functions for setting/querying io priorities of processes. The
* system calls closely mimic getpriority/setpriority, see the man page for
* those. The prio argument is a composite of prio class and prio data, where
* the data argument has meaning within that class. The standard scheduling
* classes have 8 distinct prio levels, with 0 being the highest prio and 7
* being the lowest.
*
* IOW, setting BE scheduling class with prio 2 is done ala:
*
* unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
*
* ioprio_set(PRIO_PROCESS, pid, prio);
*
* See also Documentation/block/ioprio.txt
*
*/
#include <linux/kernel.h>
#include <linux/ioprio.h>
#include <linux/blkdev.h>
/*
 * Apply @ioprio to a single task.
 *
 * The caller must either own the task (task->uid equals the caller's uid or
 * euid) or have CAP_SYS_NICE; otherwise -EPERM is returned and nothing is
 * changed. On success the new value is stored in task->ioprio and, if the
 * task has an io context with a set_ioprio hook, that hook is invoked so the
 * io context can pick up the change. Both steps run under task_lock().
 *
 * Returns 0 on success, -EPERM if permission is refused.
 */
static int set_task_ioprio(struct task_struct *task, int ioprio)
{
	struct io_context *ctx;
	int owned_by_caller;

	/* capable() is only consulted when the uid checks both fail */
	owned_by_caller = task->uid == current->euid ||
			  task->uid == current->uid;
	if (!owned_by_caller && !capable(CAP_SYS_NICE))
		return -EPERM;

	task_lock(task);

	task->ioprio = ioprio;

	ctx = task->io_context;
	if (ctx) {
		if (ctx->set_ioprio)
			ctx->set_ioprio(ctx, ioprio);
	}

	task_unlock(task);
	return 0;
}
/*
 * sys_ioprio_set - set the io priority of a process, process group or user
 * @which:  IOPRIO_WHO_PROCESS, IOPRIO_WHO_PGRP or IOPRIO_WHO_USER
 * @who:    pid, process group id or uid, interpreted according to @which;
 *          0 selects the caller's own process, process group or user
 * @ioprio: composite of prio class and prio data
 *
 * The RT class requires CAP_SYS_ADMIN. RT and BE carry a prio level that must
 * lie in [0, IOPRIO_BE_NR); IDLE takes no level.
 *
 * Returns 0 on success, -EINVAL for an unknown class, an out-of-range prio
 * level or an unknown @which, -EPERM if permission is refused, and -ESRCH if
 * no matching task was found.
 */
asmlinkage int sys_ioprio_set(int which, int who, int ioprio)
{
	int class = IOPRIO_PRIO_CLASS(ioprio);
	int data = IOPRIO_PRIO_DATA(ioprio);
	struct task_struct *p, *g;
	struct user_struct *user;
	int ret;

	switch (class) {
		case IOPRIO_CLASS_RT:
			if (!capable(CAP_SYS_ADMIN))
				return -EPERM;
			/* fall through, rt has prio field too */
		case IOPRIO_CLASS_BE:
			if (data >= IOPRIO_BE_NR || data < 0)
				return -EINVAL;

			break;
		case IOPRIO_CLASS_IDLE:
			break;
		default:
			return -EINVAL;
	}

	/* stays -ESRCH unless at least one task is matched below */
	ret = -ESRCH;
	read_lock_irq(&tasklist_lock);
	switch (which) {
		case IOPRIO_WHO_PROCESS:
			if (!who)
				p = current;
			else
				p = find_task_by_pid(who);
			if (p)
				ret = set_task_ioprio(p, ioprio);
			break;
		case IOPRIO_WHO_PGRP:
			if (!who)
				who = process_group(current);
			do_each_task_pid(who, PIDTYPE_PGID, p) {
				ret = set_task_ioprio(p, ioprio);
				if (ret)
					break;
			} while_each_task_pid(who, PIDTYPE_PGID, p);
			break;
		case IOPRIO_WHO_USER:
			if (!who)
				user = current->user;
			else
				user = find_user(who);

			if (!user)
				break;

			do_each_thread(g, p) {
				/*
				 * Compare against the resolved user, not the
				 * raw @who: with who == 0 the target is the
				 * calling user, not uid 0.
				 */
				if (p->uid != user->uid)
					continue;
				ret = set_task_ioprio(p, ioprio);
				/*
				 * NOTE(review): do_each_thread is not visible
				 * here and is presumably a nested loop, so a
				 * bare break might only leave the inner one.
				 * Jump out explicitly on the first failure.
				 */
				if (ret)
					goto out_free_uid;
			} while_each_thread(g, p);
out_free_uid:
			/* only find_user() took a reference we must drop */
			if (who)
				free_uid(user);
			break;
		default:
			ret = -EINVAL;
	}

	read_unlock_irq(&tasklist_lock);
	return ret;
}
/*
 * Fold one task's io priority into the running result: the first task seen
 * replaces the -ESRCH placeholder, later ones are combined via ioprio_best().
 */
static inline int ioprio_get_fold(int acc, unsigned short prio)
{
	if (acc == -ESRCH)
		return prio;

	return ioprio_best(acc, prio);
}

/*
 * sys_ioprio_get - query the io priority of a process, process group or user
 * @which: IOPRIO_WHO_PROCESS, IOPRIO_WHO_PGRP or IOPRIO_WHO_USER
 * @who:   pid, process group id or uid, interpreted according to @which;
 *         0 selects the caller's own process, process group or user
 *
 * For a single process the task's ioprio is returned as is. For a group or a
 * user the result is the ioprio_best() combination over all matching tasks.
 *
 * Returns the io priority, -ESRCH if nothing matched, or -EINVAL for an
 * unknown @which.
 */
asmlinkage int sys_ioprio_get(int which, int who)
{
	struct task_struct *leader, *tsk;
	struct user_struct *owner;
	int ret = -ESRCH;

	read_lock_irq(&tasklist_lock);

	if (which == IOPRIO_WHO_PROCESS) {
		tsk = who ? find_task_by_pid(who) : current;
		if (tsk)
			ret = tsk->ioprio;
	} else if (which == IOPRIO_WHO_PGRP) {
		if (!who)
			who = process_group(current);
		do_each_task_pid(who, PIDTYPE_PGID, tsk) {
			ret = ioprio_get_fold(ret, tsk->ioprio);
		} while_each_task_pid(who, PIDTYPE_PGID, tsk);
	} else if (which == IOPRIO_WHO_USER) {
		owner = who ? find_user(who) : current->user;
		if (owner) {
			do_each_thread(leader, tsk) {
				if (tsk->uid == owner->uid)
					ret = ioprio_get_fold(ret, tsk->ioprio);
			} while_each_thread(leader, tsk);

			/* only find_user() took a reference we must drop */
			if (who)
				free_uid(owner);
		}
	} else {
		ret = -EINVAL;
	}

	read_unlock_irq(&tasklist_lock);
	return ret;
}

View File

@ -645,18 +645,22 @@ struct buffer_chunk {
static void write_chunk(struct buffer_chunk *chunk) {
int i;
get_fs_excl();
for (i = 0; i < chunk->nr ; i++) {
submit_logged_buffer(chunk->bh[i]) ;
}
chunk->nr = 0;
put_fs_excl();
}
static void write_ordered_chunk(struct buffer_chunk *chunk) {
int i;
get_fs_excl();
for (i = 0; i < chunk->nr ; i++) {
submit_ordered_buffer(chunk->bh[i]) ;
}
chunk->nr = 0;
put_fs_excl();
}
static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh,
@ -918,6 +922,8 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
return 0 ;
}
get_fs_excl();
/* before we can put our commit blocks on disk, we have to make sure everyone older than
** us is on disk too
*/
@ -1055,6 +1061,7 @@ static int flush_commit_list(struct super_block *s, struct reiserfs_journal_list
if (retval)
reiserfs_abort (s, retval, "Journal write error in %s", __FUNCTION__);
put_fs_excl();
return retval;
}
@ -1251,6 +1258,8 @@ static int flush_journal_list(struct super_block *s,
return 0 ;
}
get_fs_excl();
/* if all the work is already done, get out of here */
if (atomic_read(&(jl->j_nonzerolen)) <= 0 &&
atomic_read(&(jl->j_commit_left)) <= 0) {
@ -1450,6 +1459,7 @@ static int flush_journal_list(struct super_block *s,
put_journal_list(s, jl);
if (flushall)
up(&journal->j_flush_sem);
put_fs_excl();
return err ;
}
@ -2719,6 +2729,7 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th, struct sup
th->t_trans_id = journal->j_trans_id ;
unlock_journal(p_s_sb) ;
INIT_LIST_HEAD (&th->t_list);
get_fs_excl();
return 0 ;
out_fail:
@ -3526,6 +3537,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th, struct super_b
BUG_ON (th->t_refcount > 1);
BUG_ON (!th->t_trans_id);
put_fs_excl();
current->journal_info = th->t_handle_save;
reiserfs_check_lock_depth(p_s_sb, "journal end");
if (journal->j_len == 0) {

View File

@ -294,8 +294,10 @@
#define __NR_add_key 286
#define __NR_request_key 287
#define __NR_keyctl 288
#define __NR_ioprio_set 289
#define __NR_ioprio_get 290
#define NR_syscalls 289
#define NR_syscalls 291
/*
* user-visible error numbers are in the range -1 - -128: see

View File

@ -263,6 +263,8 @@
#define __NR_add_key 1271
#define __NR_request_key 1272
#define __NR_keyctl 1273
#define __NR_ioprio_set 1274
#define __NR_ioprio_get 1275
#define __NR_set_zone_reclaim 1276
#ifdef __KERNEL__

View File

@ -277,8 +277,10 @@
#define __NR_request_key 270
#define __NR_keyctl 271
#define __NR_waitid 272
#define __NR_ioprio_set 273
#define __NR_ioprio_get 274
#define __NR_syscalls 273
#define __NR_syscalls 275
#define __NR(n) #n

View File

@ -561,8 +561,12 @@ __SYSCALL(__NR_add_key, sys_add_key)
__SYSCALL(__NR_request_key, sys_request_key)
#define __NR_keyctl 250
__SYSCALL(__NR_keyctl, sys_keyctl)
#define __NR_ioprio_set 251
__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
#define __NR_ioprio_get 252
__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
#define __NR_syscall_max __NR_keyctl
#define __NR_syscall_max __NR_ioprio_get
#ifndef __NO_STUBS
/* user-visible error numbers are in the range -1 - -4095 */

View File

@ -22,6 +22,7 @@
#include <linux/highmem.h>
#include <linux/mempool.h>
#include <linux/ioprio.h>
/* Platforms may set this to teach the BIO layer about IOMMU hardware. */
#include <asm/io.h>
@ -149,6 +150,19 @@ struct bio {
#define BIO_RW_FAILFAST 3
#define BIO_RW_SYNC 4
/*
 * The top IOPRIO_BITS bits of bi_rw hold the io priority of this bio.
 *
 * bio_prio()       - extract the priority field
 * bio_prio_valid() - true if the stored priority passes ioprio_valid()
 * bio_set_prio()   - replace the priority field, leaving the lower bits of
 *                    bi_rw untouched; warns if @prio does not fit in
 *                    IOPRIO_BITS bits. Note that @prio is evaluated twice.
 */
#define BIO_PRIO_SHIFT	(8 * sizeof(unsigned long) - IOPRIO_BITS)
#define bio_prio(bio)		((bio)->bi_rw >> BIO_PRIO_SHIFT)
#define bio_prio_valid(bio)	ioprio_valid(bio_prio(bio))

/* @prio is parenthesized so expression arguments compare as a whole */
#define bio_set_prio(bio, prio)		do {				\
	WARN_ON((prio) >= (1 << IOPRIO_BITS));				\
	(bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1);			\
	(bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT);	\
} while (0)
/*
* various member access, note that bio_data should of course not be used
* on highmem page vectors

View File

@ -54,16 +54,23 @@ struct as_io_context {
struct cfq_queue;
struct cfq_io_context {
void (*dtor)(struct cfq_io_context *);
void (*exit)(struct cfq_io_context *);
struct io_context *ioc;
/*
* circular list of cfq_io_contexts belonging to a process io context
*/
struct list_head list;
struct cfq_queue *cfqq;
void *key;
struct io_context *ioc;
unsigned long last_end_request;
unsigned long last_queue;
unsigned long ttime_total;
unsigned long ttime_samples;
unsigned long ttime_mean;
void (*dtor)(struct cfq_io_context *);
void (*exit)(struct cfq_io_context *);
};
/*
@ -73,7 +80,9 @@ struct cfq_io_context {
*/
struct io_context {
atomic_t refcount;
pid_t pid;
struct task_struct *task;
int (*set_ioprio)(struct io_context *, unsigned int);
/*
* For request batching
@ -81,8 +90,6 @@ struct io_context {
unsigned long last_waited; /* Time last woken after wait for request */
int nr_batch_requests; /* Number of requests left in the batch */
spinlock_t lock;
struct as_io_context *aic;
struct cfq_io_context *cic;
};
@ -134,6 +141,8 @@ struct request {
void *elevator_private;
unsigned short ioprio;
int rq_status; /* should split this into a few status bits */
struct gendisk *rq_disk;
int errors;

View File

@ -16,9 +16,9 @@ typedef void (elevator_remove_req_fn) (request_queue_t *, struct request *);
typedef void (elevator_requeue_req_fn) (request_queue_t *, struct request *);
typedef struct request *(elevator_request_list_fn) (request_queue_t *, struct request *);
typedef void (elevator_completed_req_fn) (request_queue_t *, struct request *);
typedef int (elevator_may_queue_fn) (request_queue_t *, int);
typedef int (elevator_may_queue_fn) (request_queue_t *, int, struct bio *);
typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, int);
typedef int (elevator_set_req_fn) (request_queue_t *, struct request *, struct bio *, int);
typedef void (elevator_put_req_fn) (request_queue_t *, struct request *);
typedef void (elevator_deactivate_req_fn) (request_queue_t *, struct request *);
@ -96,9 +96,9 @@ extern struct request *elv_former_request(request_queue_t *, struct request *);
extern struct request *elv_latter_request(request_queue_t *, struct request *);
extern int elv_register_queue(request_queue_t *q);
extern void elv_unregister_queue(request_queue_t *q);
extern int elv_may_queue(request_queue_t *, int);
extern int elv_may_queue(request_queue_t *, int, struct bio *);
extern void elv_completed_request(request_queue_t *, struct request *);
extern int elv_set_request(request_queue_t *, struct request *, int);
extern int elv_set_request(request_queue_t *, struct request *, struct bio *, int);
extern void elv_put_request(request_queue_t *, struct request *);
/*

View File

@ -213,6 +213,7 @@ extern int dir_notify_enable;
#include <linux/radix-tree.h>
#include <linux/prio_tree.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <asm/atomic.h>
#include <asm/semaphore.h>
@ -822,16 +823,34 @@ enum {
#define vfs_check_frozen(sb, level) \
wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
static inline void get_fs_excl(void)
{
atomic_inc(&current->fs_excl);
}
static inline void put_fs_excl(void)
{
atomic_dec(&current->fs_excl);
}
/*
 * Return the current task's fs_excl count; non-zero means the task has
 * called get_fs_excl() more often than put_fs_excl().
 */
static inline int has_fs_excl(void)
{
	int held = atomic_read(&current->fs_excl);

	return held;
}
/*
* Superblock locking.
*/
static inline void lock_super(struct super_block * sb)
{
/* account the fs exclusive resource on the current task before sleeping */
get_fs_excl();
/* take the superblock semaphore; may block until it is available */
down(&sb->s_lock);
}
static inline void unlock_super(struct super_block * sb)
{
/* undo the get_fs_excl() accounting done in lock_super() */
put_fs_excl();
/* release the superblock semaphore */
up(&sb->s_lock);
}

View File

@ -81,6 +81,7 @@ extern struct group_info init_groups;
.mm = NULL, \
.active_mm = &init_mm, \
.run_list = LIST_HEAD_INIT(tsk.run_list), \
.ioprio = 0, \
.time_slice = HZ, \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children), \
@ -110,6 +111,7 @@ extern struct group_info init_groups;
.proc_lock = SPIN_LOCK_UNLOCKED, \
.journal_info = NULL, \
.cpu_timers = INIT_CPU_TIMERS(tsk.cpu_timers), \
.fs_excl = ATOMIC_INIT(0), \
}

87
include/linux/ioprio.h Normal file
View File

@ -0,0 +1,87 @@
#ifndef IOPRIO_H
#define IOPRIO_H
#include <linux/sched.h>
/*
* Gives us 8 prio classes with 13-bits of data for each class
*/
#define IOPRIO_BITS (16)
#define IOPRIO_CLASS_SHIFT (13)
#define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1)
#define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK)
#define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE)
/*
* These are the io priority groups as implemented by CFQ. RT is the realtime
* class, it always gets premium service. BE is the best-effort scheduling
* class, the default for any process. IDLE is the idle scheduling class, it
* is only served when no one else is using the disk.
*/
enum {
IOPRIO_CLASS_NONE,
IOPRIO_CLASS_RT,
IOPRIO_CLASS_BE,
IOPRIO_CLASS_IDLE,
};
/*
* 8 best effort priority levels are supported
*/
#define IOPRIO_BE_NR (8)
asmlinkage int sys_ioprio_set(int, int, int);
asmlinkage int sys_ioprio_get(int, int);
enum {
IOPRIO_WHO_PROCESS = 1,
IOPRIO_WHO_PGRP,
IOPRIO_WHO_USER,
};
/*
* if process has set io priority explicitly, use that. if not, convert
* the cpu scheduler nice value to an io priority
*/
#define IOPRIO_NORM (4)
/*
 * Return the prio data part of @task's io priority. Warns if the task's
 * ioprio has no class set (IOPRIO_CLASS_NONE).
 */
static inline int task_ioprio(struct task_struct *task)
{
	unsigned short prio = task->ioprio;

	WARN_ON(!ioprio_valid(prio));

	return IOPRIO_PRIO_DATA(prio);
}
/*
 * Derive an io priority level from the cpu nice value of @task:
 * (nice + 20) / 5.
 * NOTE(review): assuming task_nice() yields -20..19 this maps onto 0..7 -
 * confirm against the scheduler.
 */
static inline int task_nice_ioprio(struct task_struct *task)
{
	int nice = task_nice(task);

	return (nice + 20) / 5;
}
/*
 * For inheritance, return the highest of the two given priorities.
 *
 * A priority without a class (IOPRIO_CLASS_NONE) always loses to the other
 * argument. Otherwise the numerically lower class wins (RT before BE before
 * IDLE), and within the same class the numerically lower value wins, since
 * lower prio data means higher priority.
 */
static inline int ioprio_best(unsigned short aprio, unsigned short bprio)
{
	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);

	if (!ioprio_valid(aprio))
		return bprio;
	if (!ioprio_valid(bprio))
		return aprio;

	/* both classes are known to be != IOPRIO_CLASS_NONE from here on */
	if (aclass == bclass)
		return min(aprio, bprio);

	return aclass > bclass ? bprio : aprio;
}
#endif

View File

@ -608,6 +608,8 @@ struct task_struct {
struct list_head run_list;
prio_array_t *array;
unsigned short ioprio;
unsigned long sleep_avg;
unsigned long long timestamp, last_ran;
unsigned long long sched_time; /* sched_clock time spent running */
@ -763,6 +765,7 @@ struct task_struct {
nodemask_t mems_allowed;
int cpuset_mems_generation;
#endif
atomic_t fs_excl; /* holding fs exclusive resources */
};
static inline pid_t process_group(struct task_struct *tsk)
@ -1112,7 +1115,8 @@ extern void unhash_process(struct task_struct *p);
/*
* Protects ->fs, ->files, ->mm, ->ptrace, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs.
* subscriptions and synchronises with wait4(). Also used in procfs. Also
* pins the final release of task.io_context.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),

View File

@ -14,11 +14,13 @@ extern struct list_head inode_unused;
* Yes, writeback.h requires sched.h
* No, sched.h is not included from here.
*/
static inline int current_is_pdflush(void)
static inline int task_is_pdflush(struct task_struct *task)
{
return current->flags & PF_FLUSHER;
return task->flags & PF_FLUSHER;
}
#define current_is_pdflush() task_is_pdflush(current)
/*
* fs/fs-writeback.c
*/

View File

@ -784,6 +784,8 @@ fastcall NORET_TYPE void do_exit(long code)
profile_task_exit(tsk);
WARN_ON(atomic_read(&tsk->fs_excl));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))

View File

@ -1090,6 +1090,11 @@ static task_t *copy_process(unsigned long clone_flags,
spin_unlock(&current->sighand->siglock);
}
/*
* inherit ioprio
*/
p->ioprio = current->ioprio;
SET_LINKS(p);
if (unlikely(p->ptrace & PT_PTRACED))
__ptrace_link(p, current->parent);

View File

@ -3448,15 +3448,7 @@ int task_nice(const task_t *p)
{
return TASK_NICE(p);
}
/*
* The only users of task_nice are binfmt_elf and binfmt_elf32.
* binfmt_elf is no longer modular, but binfmt_elf32 still is.
* Therefore, task_nice is needed if there is a compat_mode.
*/
#ifdef CONFIG_COMPAT
EXPORT_SYMBOL_GPL(task_nice);
#endif
/**
* idle_cpu - is a given cpu idle currently?