2005-04-17 05:20:36 +07:00
|
|
|
#ifndef _LINUX_GENHD_H
|
|
|
|
#define _LINUX_GENHD_H
|
|
|
|
|
|
|
|
/*
|
|
|
|
* genhd.h Copyright (C) 1992 Drew Eckhardt
|
|
|
|
* Generic hard disk header file by
|
|
|
|
* Drew Eckhardt
|
|
|
|
*
|
|
|
|
* <drew@colorado.edu>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
2007-05-22 03:08:01 +07:00
|
|
|
#include <linux/kdev_t.h>
|
2008-09-03 14:03:02 +07:00
|
|
|
#include <linux/rcupdate.h>
|
2010-09-01 03:47:05 +07:00
|
|
|
#include <linux/slab.h>
|
2005-04-17 05:20:36 +07:00
|
|
|
|
[PATCH] BLOCK: Make it possible to disable the block layer [try #6]
Make it possible to disable the block layer. Not all embedded devices require
it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require
the block layer to be present.
This patch does the following:
(*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev
support.
(*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls
an item that uses the block layer. This includes:
(*) Block I/O tracing.
(*) Disk partition code.
(*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS.
(*) The SCSI layer. As far as I can tell, even SCSI chardevs use the
block layer to do scheduling. Some drivers that use SCSI facilities -
such as USB storage - end up disabled indirectly from this.
(*) Various block-based device drivers, such as IDE and the old CDROM
drivers.
(*) MTD blockdev handling and FTL.
(*) JFFS - which uses set_bdev_super(), something it could avoid doing by
taking a leaf out of JFFS2's book.
(*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and
linux/elevator.h contingent on CONFIG_BLOCK being set. sector_div() is,
however, still used in places, and so is still available.
(*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and
parts of linux/fs.h.
(*) Makes a number of files in fs/ contingent on CONFIG_BLOCK.
(*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK.
(*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK
is not enabled.
(*) fs/no-block.c is created to hold out-of-line stubs and things that are
required when CONFIG_BLOCK is not set:
(*) Default blockdev file operations (to give error ENODEV on opening).
(*) Makes some /proc changes:
(*) /proc/devices does not list any blockdevs.
(*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK.
(*) Makes some compat ioctl handling contingent on CONFIG_BLOCK.
(*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if
given command other than Q_SYNC or if a special device is specified.
(*) In init/do_mounts.c, no reference is made to the blockdev routines if
CONFIG_BLOCK is not defined. This does not prohibit NFS roots or JFFS2.
(*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return
error ENOSYS by way of cond_syscall if so).
(*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if
CONFIG_BLOCK is not set, since they can't then happen.
Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2006-10-01 01:45:40 +07:00
|
|
|
#ifdef CONFIG_BLOCK
|
|
|
|
|
2008-08-29 14:01:47 +07:00
|
|
|
/*
 * Conversions between the embedded struct device and its container.
 *
 * A gendisk's device lives inside its part0 member and a partition's
 * device is its own __dev member; the dev_to_*() forms recover the
 * container with container_of(), the *_to_dev() forms take the address
 * of the embedded device.
 */
#define dev_to_disk(device) \
	container_of((device), struct gendisk, part0.__dev)
#define dev_to_part(device) \
	container_of((device), struct hd_struct, __dev)
#define disk_to_dev(disk)	(&(disk)->part0.__dev)
#define part_to_dev(part)	(&(part)->__dev)
|
2007-05-22 03:08:01 +07:00
|
|
|
|
|
|
|
extern struct device_type part_type;
|
|
|
|
extern struct kobject *block_depr;
|
|
|
|
extern struct class block_class;
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/* Partition type identifiers (the sys_ind byte of struct partition). */
enum {
	/*
	 * The three extended-partition ids behave identically; use the
	 * second one if DOS FDISK gets confused about extended/logical
	 * partitions starting past cylinder 1023.
	 */
	DOS_EXTENDED_PARTITION		= 0x05,
	LINUX_EXTENDED_PARTITION	= 0x85,
	WIN98_EXTENDED_PARTITION	= 0x0f,

	SUN_WHOLE_DISK			= DOS_EXTENDED_PARTITION,

	LINUX_SWAP_PARTITION		= 0x82,
	LINUX_DATA_PARTITION		= 0x83,
	LINUX_LVM_PARTITION		= 0x8e,
	LINUX_RAID_PARTITION		= 0xfd,	/* autodetect RAID partition */

	SOLARIS_X86_PARTITION		= LINUX_SWAP_PARTITION,
	NEW_SOLARIS_X86_PARTITION	= 0xbf,

	DM6_AUX1PARTITION		= 0x51,	/* no DDO: use xlated geom */
	DM6_AUX3PARTITION		= 0x53,	/* no DDO: use xlated geom */
	DM6_PARTITION			= 0x54,	/* has DDO: use xlated geom & offset */
	EZD_PARTITION			= 0x55,	/* EZ-DRIVE */

	FREEBSD_PARTITION		= 0xa5,	/* FreeBSD Partition ID */
	OPENBSD_PARTITION		= 0xa6,	/* OpenBSD Partition ID */
	NETBSD_PARTITION		= 0xa9,	/* NetBSD Partition ID */
	BSDI_PARTITION			= 0xb7,	/* BSDI Partition ID */
	MINIX_PARTITION			= 0x81,	/* Minix Partition ID */
	UNIXWARE_PARTITION		= 0x63,	/* Same as GNU_HURD and SCO Unix */
};
|
|
|
|
|
2008-08-25 17:56:16 +07:00
|
|
|
/* Partition limit reported by disk_max_parts() for GENHD_FL_EXT_DEVT disks. */
#define DISK_MAX_PARTS 256
|
2008-08-25 17:56:17 +07:00
|
|
|
/* Size in bytes of the gendisk->disk_name buffer. */
#define DISK_NAME_LEN 32
|
2008-08-25 17:56:16 +07:00
|
|
|
|
2006-04-25 20:07:57 +07:00
|
|
|
#include <linux/major.h>
|
|
|
|
#include <linux/device.h>
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/string.h>
|
|
|
|
#include <linux/fs.h>
|
2007-05-24 03:57:38 +07:00
|
|
|
#include <linux/workqueue.h>
|
2006-04-25 20:07:57 +07:00
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
struct partition {
|
|
|
|
unsigned char boot_ind; /* 0x80 - active */
|
|
|
|
unsigned char head; /* starting head */
|
|
|
|
unsigned char sector; /* starting sector */
|
|
|
|
unsigned char cyl; /* starting cylinder */
|
|
|
|
unsigned char sys_ind; /* What partition type */
|
|
|
|
unsigned char end_head; /* end head */
|
|
|
|
unsigned char end_sector; /* end sector */
|
|
|
|
unsigned char end_cyl; /* end cylinder */
|
|
|
|
__le32 start_sect; /* starting sector counting from 0 */
|
|
|
|
__le32 nr_sects; /* nr of sectors in partition */
|
|
|
|
} __attribute__((packed));
|
|
|
|
|
2008-02-08 17:04:09 +07:00
|
|
|
/*
 * I/O accounting counters kept per partition.  sectors[] holds one slot
 * for READs and one for WRITEs; the other two-element arrays are
 * presumably split the same way (verify against the accounting code).
 */
struct disk_stats {
	unsigned long sectors[2];	/* READs and WRITEs */
	unsigned long ios[2];
	unsigned long merges[2];
	unsigned long ticks[2];
	unsigned long io_ticks;
	unsigned long time_in_queue;
};
|
2010-09-01 03:47:05 +07:00
|
|
|
|
|
|
|
#define PARTITION_META_INFO_VOLNAMELTH 64
|
|
|
|
#define PARTITION_META_INFO_UUIDLTH 16
|
|
|
|
|
|
|
|
struct partition_meta_info {
|
|
|
|
u8 uuid[PARTITION_META_INFO_UUIDLTH]; /* always big endian */
|
|
|
|
u8 volname[PARTITION_META_INFO_VOLNAMELTH];
|
|
|
|
};
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
struct hd_struct {
|
|
|
|
sector_t start_sect;
|
2012-08-01 17:24:18 +07:00
|
|
|
/*
|
|
|
|
* nr_sects is protected by sequence counter. One might extend a
|
|
|
|
* partition while IO is happening to it and update of nr_sects
|
|
|
|
* can be non-atomic on 32bit machines with 64bit sector_t.
|
|
|
|
*/
|
2005-04-17 05:20:36 +07:00
|
|
|
sector_t nr_sects;
|
2012-08-01 17:24:18 +07:00
|
|
|
seqcount_t nr_sects_seq;
|
2009-05-23 04:17:53 +07:00
|
|
|
sector_t alignment_offset;
|
2011-05-30 12:42:51 +07:00
|
|
|
unsigned int discard_alignment;
|
2008-08-25 17:56:05 +07:00
|
|
|
struct device __dev;
|
2006-03-27 16:17:55 +07:00
|
|
|
struct kobject *holder_dir;
|
2005-04-17 05:20:36 +07:00
|
|
|
int policy, partno;
|
2010-09-01 03:47:05 +07:00
|
|
|
struct partition_meta_info *info;
|
2006-12-08 17:39:46 +07:00
|
|
|
#ifdef CONFIG_FAIL_MAKE_REQUEST
|
|
|
|
int make_it_fail;
|
2008-02-08 17:04:09 +07:00
|
|
|
#endif
|
|
|
|
unsigned long stamp;
|
2011-03-22 14:35:35 +07:00
|
|
|
atomic_t in_flight[2];
|
2008-02-08 17:04:09 +07:00
|
|
|
#ifdef CONFIG_SMP
|
2010-02-02 12:38:57 +07:00
|
|
|
struct disk_stats __percpu *dkstats;
|
2008-02-08 17:04:09 +07:00
|
|
|
#else
|
|
|
|
struct disk_stats dkstats;
|
2006-12-08 17:39:46 +07:00
|
|
|
#endif
|
2011-01-07 14:43:37 +07:00
|
|
|
atomic_t ref;
|
2008-09-03 14:03:02 +07:00
|
|
|
struct rcu_head rcu_head;
|
2005-04-17 05:20:36 +07:00
|
|
|
};
|
|
|
|
|
|
|
|
/* Bit flags stored in gendisk->flags. */
#define GENHD_FL_REMOVABLE			(1 << 0)
/* bit 1 (value 2) is unused */
#define GENHD_FL_MEDIA_CHANGE_NOTIFY		(1 << 2)
#define GENHD_FL_CD				(1 << 3)
#define GENHD_FL_UP				(1 << 4)
#define GENHD_FL_SUPPRESS_PARTITION_INFO	(1 << 5)
#define GENHD_FL_EXT_DEVT			(1 << 6)	/* allow extended devt */
#define GENHD_FL_NATIVE_CAPACITY		(1 << 7)
#define GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE	(1 << 8)
#define GENHD_FL_NO_PART_SCAN			(1 << 9)
|
2005-04-17 05:20:36 +07:00
|
|
|
|
implement in-kernel gendisk events handling
Currently, media presence polling for removable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supersedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called parallelly.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_changed() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 02:57:37 +07:00
|
|
|
/* Event bits used in gendisk->events and gendisk->async_events. */
enum {
	DISK_EVENT_MEDIA_CHANGE		= 1 << 0,	/* media changed */
	DISK_EVENT_EJECT_REQUEST	= 1 << 1,	/* eject requested */
};
|
|
|
|
|
2008-08-25 17:56:15 +07:00
|
|
|
#define BLK_SCSI_MAX_CMDS (256)
|
|
|
|
#define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
|
|
|
|
|
|
|
|
struct blk_scsi_cmd_filter {
|
|
|
|
unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
|
|
|
|
unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
|
|
|
|
struct kobject kobj;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct disk_part_tbl {
|
|
|
|
struct rcu_head rcu_head;
|
|
|
|
int len;
|
2010-02-25 02:01:56 +07:00
|
|
|
struct hd_struct __rcu *last_lookup;
|
|
|
|
struct hd_struct __rcu *part[];
|
2008-08-25 17:56:15 +07:00
|
|
|
};
|
|
|
|
|
implement in-kernel gendisk events handling
Currently, media presence polling for removeable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supercedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called parallelly.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_change() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 02:57:37 +07:00
|
|
|
struct disk_events;
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
struct gendisk {
|
2008-08-25 17:56:16 +07:00
|
|
|
/* major, first_minor and minors are input parameters only,
|
|
|
|
* don't use directly. Use disk_devt() and disk_max_parts().
|
2008-09-03 14:01:48 +07:00
|
|
|
*/
|
2005-04-17 05:20:36 +07:00
|
|
|
int major; /* major number of driver */
|
|
|
|
int first_minor;
|
|
|
|
int minors; /* maximum number of minors, =1 for
|
|
|
|
* disks that can't be partitioned. */
|
2008-09-03 14:01:48 +07:00
|
|
|
|
2008-08-25 17:56:17 +07:00
|
|
|
char disk_name[DISK_NAME_LEN]; /* name of major driver */
|
2011-07-24 07:24:48 +07:00
|
|
|
char *(*devnode)(struct gendisk *gd, umode_t *mode);
|
implement in-kernel gendisk events handling
Currently, media presence polling for removeable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supercedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called parallelly.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_change() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 02:57:37 +07:00
|
|
|
|
|
|
|
unsigned int events; /* supported events */
|
|
|
|
unsigned int async_events; /* async events, subset of all */
|
|
|
|
|
2008-09-03 14:06:42 +07:00
|
|
|
/* Array of pointers to partitions indexed by partno.
|
2008-09-03 14:03:02 +07:00
|
|
|
* Protected with matching bdev lock but stat and other
|
|
|
|
* non-critical accesses use RCU. Always access through
|
|
|
|
* helpers.
|
|
|
|
*/
|
2010-02-25 02:01:56 +07:00
|
|
|
struct disk_part_tbl __rcu *part_tbl;
|
2008-09-03 14:06:42 +07:00
|
|
|
struct hd_struct part0;
|
2008-09-03 14:03:02 +07:00
|
|
|
|
2009-09-22 07:01:13 +07:00
|
|
|
const struct block_device_operations *fops;
|
2005-04-17 05:20:36 +07:00
|
|
|
struct request_queue *queue;
|
|
|
|
void *private_data;
|
|
|
|
|
|
|
|
int flags;
|
2007-05-22 03:08:01 +07:00
|
|
|
struct device *driverfs_dev; // FIXME: remove
|
2006-03-27 16:17:55 +07:00
|
|
|
struct kobject *slave_dir;
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
struct timer_rand_state *random;
|
|
|
|
atomic_t sync_io; /* RAID */
|
implement in-kernel gendisk events handling
Currently, media presence polling for removeable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supercedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called parallelly.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_change() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 02:57:37 +07:00
|
|
|
struct disk_events *ev;
|
2008-07-01 01:04:41 +07:00
|
|
|
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
|
|
|
struct blk_integrity *integrity;
|
|
|
|
#endif
|
2008-08-25 17:56:15 +07:00
|
|
|
int node_id;
|
2005-04-17 05:20:36 +07:00
|
|
|
};
|
|
|
|
|
2008-08-25 17:47:17 +07:00
|
|
|
static inline struct gendisk *part_to_disk(struct hd_struct *part)
|
|
|
|
{
|
2008-08-29 14:01:47 +07:00
|
|
|
if (likely(part)) {
|
|
|
|
if (part->partno)
|
|
|
|
return dev_to_disk(part_to_dev(part)->parent);
|
|
|
|
else
|
|
|
|
return dev_to_disk(part_to_dev(part));
|
|
|
|
}
|
2008-08-25 17:47:17 +07:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2010-09-01 03:47:05 +07:00
|
|
|
/*
 * Pack a textual UUID into 16 binary bytes.
 *
 * Each output byte is built from two consecutive input characters via
 * hex_to_bin() (high nibble first).  After output bytes 3, 5, 7 and 9
 * one extra input character is skipped - presumably the '-' separator
 * of the usual 8-4-4-4-12 notation.  The input is not validated here.
 *
 * @uuid_str: source characters
 * @to:       destination buffer, 16 bytes are written
 */
static inline void part_pack_uuid(const u8 *uuid_str, u8 *to)
{
	const u8 *src = uuid_str;
	int idx = 0;

	while (idx < 16) {
		int hi = hex_to_bin(src[0]);
		int lo = hex_to_bin(src[1]);

		to[idx] = (hi << 4) | lo;
		src += 2;

		/* step over the character that follows these byte positions */
		if (idx == 3 || idx == 5 || idx == 7 || idx == 9)
			src++;

		idx++;
	}
}
|
|
|
|
|
2008-09-03 14:01:48 +07:00
|
|
|
static inline int disk_max_parts(struct gendisk *disk)
|
|
|
|
{
|
2008-08-25 17:56:16 +07:00
|
|
|
if (disk->flags & GENHD_FL_EXT_DEVT)
|
|
|
|
return DISK_MAX_PARTS;
|
|
|
|
return disk->minors;
|
2008-09-03 14:06:42 +07:00
|
|
|
}
|
|
|
|
|
2011-08-24 01:01:04 +07:00
|
|
|
static inline bool disk_part_scan_enabled(struct gendisk *disk)
|
2008-09-03 14:06:42 +07:00
|
|
|
{
|
2011-08-24 01:01:04 +07:00
|
|
|
return disk_max_parts(disk) > 1 &&
|
|
|
|
!(disk->flags & GENHD_FL_NO_PART_SCAN);
|
2008-09-03 14:01:48 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline dev_t disk_devt(struct gendisk *disk)
|
|
|
|
{
|
2008-08-25 17:56:05 +07:00
|
|
|
return disk_to_dev(disk)->devt;
|
2008-09-03 14:01:48 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline dev_t part_devt(struct hd_struct *part)
|
|
|
|
{
|
2008-08-25 17:56:05 +07:00
|
|
|
return part_to_dev(part)->devt;
|
2008-09-03 14:01:48 +07:00
|
|
|
}
|
|
|
|
|
2008-09-03 14:03:02 +07:00
|
|
|
extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno);
|
|
|
|
|
|
|
|
/*
 * Drop a reference on @part by calling put_device() on its embedded
 * device.  A NULL @part is accepted and ignored.
 */
static inline void disk_put_part(struct hd_struct *part)
{
	if (likely(part)) {
		struct device *dev = part_to_dev(part);

		put_device(dev);
	}
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Smarter partition iterator without context limits.
|
|
|
|
*/
|
|
|
|
/* Flag bits for disk_part_iter_init(). */
#define DISK_PITER_REVERSE		(1 << 0)	/* iterate in the reverse direction */
#define DISK_PITER_INCL_EMPTY		(1 << 1)	/* include 0-sized parts */
#define DISK_PITER_INCL_PART0		(1 << 2)	/* include partition 0 */
#define DISK_PITER_INCL_EMPTY_PART0	(1 << 3)	/* include empty partition 0 */

/* Iteration state; set up, advanced and released by disk_part_iter_*(). */
struct disk_part_iter {
	struct gendisk *disk;
	struct hd_struct *part;
	int idx;
	unsigned int flags;		/* DISK_PITER_* bits */
};
|
|
|
|
|
|
|
|
extern void disk_part_iter_init(struct disk_part_iter *piter,
|
|
|
|
struct gendisk *disk, unsigned int flags);
|
|
|
|
extern struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter);
|
|
|
|
extern void disk_part_iter_exit(struct disk_part_iter *piter);
|
|
|
|
|
|
|
|
extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
|
|
|
|
sector_t sector);
|
|
|
|
|
2008-08-25 17:47:21 +07:00
|
|
|
/*
|
2005-04-17 05:20:36 +07:00
|
|
|
* Macros to operate on percpu disk statistics:
|
|
|
|
*
|
2008-08-25 17:47:21 +07:00
|
|
|
* {disk|part|all}_stat_{add|sub|inc|dec}() modify the stat counters
|
|
|
|
* and should be called between disk_stat_lock() and
|
|
|
|
* disk_stat_unlock().
|
|
|
|
*
|
|
|
|
* part_stat_read() can be called at any time.
|
|
|
|
*
|
|
|
|
* part_stat_{add|set_all}() and {init|free}_part_stats are for
|
|
|
|
* internal use only.
|
2005-04-17 05:20:36 +07:00
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SMP
|
2008-08-25 17:56:14 +07:00
|
|
|
/*
 * SMP flavour: part_stat_lock() enters an RCU read-side section and
 * evaluates to the value of get_cpu(); part_stat_unlock() undoes both
 * steps in the opposite order.
 */
#define part_stat_lock()						\
	({ rcu_read_lock(); get_cpu(); })
#define part_stat_unlock()						\
	do { put_cpu(); rcu_read_unlock(); } while (0)

/* Add @addnd to @field in the per-CPU stats slot of @part for @cpu. */
#define __part_stat_add(cpu, part, field, addnd)			\
	(per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))
|
2008-02-08 17:04:09 +07:00
|
|
|
|
|
|
|
/*
 * SMP flavour: evaluates to the sum of @field over the per-CPU stats of
 * every possible CPU.  @part is evaluated more than once.
 */
#define part_stat_read(part, field)					\
({									\
	typeof((part)->dkstats->field) res = 0;				\
	unsigned int _cpu;						\
									\
	for_each_possible_cpu(_cpu)					\
		res += per_cpu_ptr((part)->dkstats, _cpu)->field;	\
	res;								\
})
|
|
|
|
|
2008-05-07 15:15:46 +07:00
|
|
|
static inline void part_stat_set_all(struct hd_struct *part, int value)
|
|
|
|
{
|
2008-02-08 17:04:09 +07:00
|
|
|
int i;
|
2008-05-07 15:15:46 +07:00
|
|
|
|
2008-02-08 17:04:09 +07:00
|
|
|
for_each_possible_cpu(i)
|
|
|
|
memset(per_cpu_ptr(part->dkstats, i), value,
|
2008-05-07 15:15:46 +07:00
|
|
|
sizeof(struct disk_stats));
|
2008-02-08 17:04:09 +07:00
|
|
|
}
|
2008-08-25 17:47:21 +07:00
|
|
|
|
2008-08-25 17:56:14 +07:00
|
|
|
static inline int init_part_stats(struct hd_struct *part)
|
2008-02-08 17:04:09 +07:00
|
|
|
{
|
2008-08-25 17:56:14 +07:00
|
|
|
part->dkstats = alloc_percpu(struct disk_stats);
|
|
|
|
if (!part->dkstats)
|
|
|
|
return 0;
|
|
|
|
return 1;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
2008-02-08 17:04:09 +07:00
|
|
|
|
2008-08-25 17:56:14 +07:00
|
|
|
static inline void free_part_stats(struct hd_struct *part)
|
2008-02-08 17:04:09 +07:00
|
|
|
{
|
2008-08-25 17:56:14 +07:00
|
|
|
free_percpu(part->dkstats);
|
2008-02-08 17:04:09 +07:00
|
|
|
}
|
|
|
|
|
2008-08-25 17:56:14 +07:00
|
|
|
#else /* !CONFIG_SMP */
|
|
|
|
/*
 * Non-SMP flavour: only the RCU read-side section is taken, and
 * part_stat_lock() always evaluates to 0.
 */
#define part_stat_lock()						\
	({ rcu_read_lock(); 0; })
#define part_stat_unlock()						\
	rcu_read_unlock()
|
2008-08-25 17:47:21 +07:00
|
|
|
|
2008-08-25 17:56:14 +07:00
|
|
|
/*
 * Non-SMP flavour: the statistics are a plain member of the partition,
 * so @cpu is accepted for interface parity but not used.
 */
#define __part_stat_add(cpu, part, field, addnd)			\
	((part)->dkstats.field += (addnd))

#define part_stat_read(part, field)					\
	((part)->dkstats.field)
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-08-25 17:56:14 +07:00
|
|
|
static inline void part_stat_set_all(struct hd_struct *part, int value)
|
2005-04-17 05:20:36 +07:00
|
|
|
{
|
2008-08-25 17:56:14 +07:00
|
|
|
memset(&part->dkstats, value, sizeof(struct disk_stats));
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
2008-02-08 17:04:09 +07:00
|
|
|
|
|
|
|
/*
 * Non-SMP flavour: the statistics are embedded in the partition, so
 * there is nothing to allocate.  Always returns 1 (success).
 */
static inline int init_part_stats(struct hd_struct *part)
{
	(void)part;

	return 1;
}
|
|
|
|
|
|
|
|
/* Non-SMP flavour: nothing was allocated, so there is nothing to free. */
static inline void free_part_stats(struct hd_struct *part)
{
	(void)part;
}
|
|
|
|
|
2008-08-25 17:56:14 +07:00
|
|
|
#endif /* CONFIG_SMP */
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-08-25 17:56:14 +07:00
|
|
|
/*
 * Add @addnd to @field of @part and, when @part is a real partition
 * (partno != 0), also to part0 of the disk it belongs to.
 *
 * @part and @addnd are evaluated more than once; avoid arguments with
 * side effects.
 */
#define part_stat_add(cpu, part, field, addnd)				\
do {									\
	__part_stat_add((cpu), (part), field, addnd);			\
	if ((part)->partno)						\
		__part_stat_add((cpu), &part_to_disk((part))->part0,	\
				field, addnd);				\
} while (0)
|
2008-02-08 17:04:09 +07:00
|
|
|
|
2008-08-25 17:56:14 +07:00
|
|
|
/*
 * Convenience wrappers around part_stat_add().
 *
 * part_stat_dec()/part_stat_inc() adjust @field by one.
 * part_stat_sub() subtracts @subnd; the argument is parenthesised before
 * negation so that a compound expression such as "a + b" is subtracted
 * as a whole rather than expanding to "-a + b".
 */
#define part_stat_dec(cpu, gendiskp, field)				\
	part_stat_add(cpu, gendiskp, field, -1)
#define part_stat_inc(cpu, gendiskp, field)				\
	part_stat_add(cpu, gendiskp, field, 1)
#define part_stat_sub(cpu, gendiskp, field, subnd)			\
	part_stat_add(cpu, gendiskp, field, -(subnd))
|
|
|
|
|
block: Separate read and write statistics of in_flight requests v2
Commit a9327cac440be4d8333bba975cbbf76045096275 added separate read
and write statistics of in_flight requests. And exported the number
of read and write requests in progress separately through sysfs.
But Corrado Zoccolo <czoccolo@gmail.com> reported getting strange
output from "iostat -kx 2". Global values for service time and
utilization were garbage. For interval values, utilization was always
100%, and service time is higher than normal.
So this was reverted by commit 0f78ab9899e9d6acb09d5465def618704255963b
The problem was in part_round_stats_single(), I missed the following:
if (now == part->stamp)
return;
- if (part->in_flight) {
+ if (part_in_flight(part)) {
__part_stat_add(cpu, part, time_in_queue,
part_in_flight(part) * (now - part->stamp));
__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
With this chunk included, the reported regression gets fixed.
Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
--
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-10-07 01:16:55 +07:00
|
|
|
static inline void part_inc_in_flight(struct hd_struct *part, int rw)
|
2008-02-08 17:04:09 +07:00
|
|
|
{
|
2011-03-22 14:35:35 +07:00
|
|
|
atomic_inc(&part->in_flight[rw]);
|
2008-08-25 17:56:14 +07:00
|
|
|
if (part->partno)
|
2011-03-22 14:35:35 +07:00
|
|
|
atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
|
2008-02-08 17:04:09 +07:00
|
|
|
}
|
|
|
|
|
block: Separate read and write statistics of in_flight requests v2
Commit a9327cac440be4d8333bba975cbbf76045096275 added separate read
and write statistics of in_flight requests, and exported the number
of read and write requests in progress separately through sysfs.
But Corrado Zoccolo <czoccolo@gmail.com> reported getting strange
output from "iostat -kx 2". Global values for service time and
utilization were garbage. For interval values, utilization was always
100%, and service time is higher than normal.
So this was reverted by commit 0f78ab9899e9d6acb09d5465def618704255963b
The problem was in part_round_stats_single(), I missed the following:
if (now == part->stamp)
return;
- if (part->in_flight) {
+ if (part_in_flight(part)) {
__part_stat_add(cpu, part, time_in_queue,
part_in_flight(part) * (now - part->stamp));
__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
With this chunk included, the reported regression gets fixed.
Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
--
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-10-07 01:16:55 +07:00
|
|
|
static inline void part_dec_in_flight(struct hd_struct *part, int rw)
|
2008-02-08 17:04:09 +07:00
|
|
|
{
|
2011-03-22 14:35:35 +07:00
|
|
|
atomic_dec(&part->in_flight[rw]);
|
2008-08-25 17:56:14 +07:00
|
|
|
if (part->partno)
|
2011-03-22 14:35:35 +07:00
|
|
|
atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
|
block: Seperate read and write statistics of in_flight requests v2
Commit a9327cac440be4d8333bba975cbbf76045096275 added seperate read
and write statistics of in_flight requests. And exported the number
of read and write requests in progress seperately through sysfs.
But Corrado Zoccolo <czoccolo@gmail.com> reported getting strange
output from "iostat -kx 2". Global values for service time and
utilization were garbage. For interval values, utilization was always
100%, and service time is higher than normal.
So this was reverted by commit 0f78ab9899e9d6acb09d5465def618704255963b
The problem was in part_round_stats_single(), I missed the following:
if (now == part->stamp)
return;
- if (part->in_flight) {
+ if (part_in_flight(part)) {
__part_stat_add(cpu, part, time_in_queue,
part_in_flight(part) * (now - part->stamp));
__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
With this chunk included, the reported regression gets fixed.
Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
--
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-10-07 01:16:55 +07:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int part_in_flight(struct hd_struct *part)
|
|
|
|
{
|
2011-03-22 14:35:35 +07:00
|
|
|
return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
|
2008-02-08 17:04:09 +07:00
|
|
|
}
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2010-09-01 03:47:05 +07:00
|
|
|
static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
|
|
|
|
{
|
|
|
|
if (disk)
|
|
|
|
return kzalloc_node(sizeof(struct partition_meta_info),
|
|
|
|
GFP_KERNEL, disk->node_id);
|
|
|
|
return kzalloc(sizeof(struct partition_meta_info), GFP_KERNEL);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void free_part_info(struct hd_struct *part)
|
|
|
|
{
|
|
|
|
kfree(part->info);
|
|
|
|
}
|
|
|
|
|
2009-03-11 16:49:35 +07:00
|
|
|
/* block/blk-core.c */

/* Declaration only; the implementation is not part of this header. */
extern void part_round_stats(int cpu, struct hd_struct *part);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2009-03-10 14:25:54 +07:00
|
|
|
/* block/genhd.c */
|
2005-04-17 05:20:36 +07:00
|
|
|
extern void add_disk(struct gendisk *disk);
|
|
|
|
extern void del_gendisk(struct gendisk *gp);
|
2008-09-03 14:01:09 +07:00
|
|
|
extern struct gendisk *get_gendisk(dev_t dev, int *partno);
|
2008-09-03 14:01:48 +07:00
|
|
|
extern struct block_device *bdget_disk(struct gendisk *disk, int partno);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
|
|
|
extern void set_device_ro(struct block_device *bdev, int flag);
|
|
|
|
extern void set_disk_ro(struct gendisk *disk, int flag);
|
|
|
|
|
2008-08-25 17:56:10 +07:00
|
|
|
static inline int get_disk_ro(struct gendisk *disk)
|
|
|
|
{
|
|
|
|
return disk->part0.policy;
|
|
|
|
}
|
|
|
|
|
implement in-kernel gendisk events handling
Currently, media presence polling for removable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supersedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called concurrently.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_change() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 02:57:37 +07:00
|
|
|
/* Disk event control (declarations only). */
extern void disk_block_events(struct gendisk *disk);
extern void disk_unblock_events(struct gendisk *disk);
extern void disk_flush_events(struct gendisk *disk, unsigned int mask);
|
implement in-kernel gendisk events handling
Currently, media presence polling for removable block devices is done
from userland. There are several issues with this.
* Polling is done by periodically opening the device. For SCSI
devices, the command sequence generated by such action involves a
few different commands including TEST_UNIT_READY. This behavior,
while perfectly legal, is different from Windows which only issues
single command, GET_EVENT_STATUS_NOTIFICATION. Unfortunately, some
ATAPI devices lock up after being periodically queried such command
sequences.
* There is no reliable and unintrusive way for a userland program to
tell whether the target device is safe for media presence polling.
For example, polling for media presence during an on-going burning
session can make it fail. The polling program can avoid this by
opening the device with O_EXCL but then it risks making a valid
exclusive user of the device fail w/ -EBUSY.
* Userland polling is unnecessarily heavy and in-kernel implementation
is lighter and better coordinated (workqueue, timer slack).
This patch implements framework for in-kernel disk event handling,
which includes media presence polling.
* bdops->check_events() is added, which supersedes ->media_changed().
It should check whether there's any pending event and return if so.
Currently, two events are defined - DISK_EVENT_MEDIA_CHANGE and
DISK_EVENT_EJECT_REQUEST. ->check_events() is guaranteed not to be
called concurrently.
* gendisk->events and ->async_events are added. These should be
initialized by block driver before passing the device to add_disk().
The former contains the mask of all supported events and the latter
the mask of all events which the device can report without polling.
/sys/block/*/events[_async] export these to userland.
* Kernel parameter block.events_dfl_poll_msecs controls the system
polling interval (default is 0 which means disable) and
/sys/block/*/events_poll_msecs control polling intervals for
individual devices (default is -1 meaning use system setting). Note
that if a device can report all supported events asynchronously and
its polling interval isn't explicitly set, the device won't be
polled regardless of the system polling interval.
* If a device is opened exclusively with write access, event checking
is automatically disabled until all write exclusive accesses are
released.
* There are event 'clearing' events. For example, both of currently
defined events are cleared after the device has been successfully
opened. This information is passed to ->check_events() callback
using @clearing argument as a hint.
* Event checking is always performed from system_nrt_wq and timer
slack is set to 25% for polling.
* Nothing changes for drivers which implement ->media_changed() but
not ->check_events(). Going forward, all drivers will be converted
to ->check_events() and ->media_change() will be dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: Jan Kara <jack@suse.cz>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
2010-12-09 02:57:37 +07:00
|
|
|
/* Declaration only; takes an event mask and returns an unsigned int. */
extern unsigned int disk_clear_events(struct gendisk *disk,
				      unsigned int mask);
|
|
|
|
|
2005-04-17 05:20:36 +07:00
|
|
|
/* drivers/char/random.c */

/* Declarations only; both take the disk they operate on. */
extern void add_disk_randomness(struct gendisk *disk);
extern void rand_initialize_disk(struct gendisk *disk);
|
|
|
|
|
|
|
|
static inline sector_t get_start_sect(struct block_device *bdev)
|
|
|
|
{
|
2008-08-25 17:56:12 +07:00
|
|
|
return bdev->bd_part->start_sect;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
static inline sector_t get_capacity(struct gendisk *disk)
|
|
|
|
{
|
2008-08-25 17:56:07 +07:00
|
|
|
return disk->part0.nr_sects;
|
2005-04-17 05:20:36 +07:00
|
|
|
}
|
|
|
|
/*
 * set_capacity - record @size (in sectors) as the capacity of @disk.
 *
 * Stores @size into nr_sects of the disk's part0 with a plain store.
 */
static inline void set_capacity(struct gendisk *disk, sector_t size)
{
	disk->part0.nr_sects = size;
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_SOLARIS_X86_PARTITION
|
|
|
|
|
2007-07-26 08:30:08 +07:00
|
|
|
/* Number of slice entries in a Solaris x86 VTOC. */
#define SOLARIS_X86_NUMSLICE	16
/* NOTE(review): presumably the expected value of v_sanity - confirm. */
#define SOLARIS_X86_VTOC_SANE	(0x600DDEEEUL)
|
|
|
|
|
|
|
|
struct solaris_x86_slice {
|
|
|
|
__le16 s_tag; /* ID tag of partition */
|
|
|
|
__le16 s_flag; /* permission flags */
|
|
|
|
__le32 s_start; /* start sector no of partition */
|
|
|
|
__le32 s_size; /* # of blocks in partition */
|
|
|
|
};
|
|
|
|
|
|
|
|
struct solaris_x86_vtoc {
|
|
|
|
unsigned int v_bootinfo[3]; /* info needed by mboot (unsupported) */
|
|
|
|
__le32 v_sanity; /* to verify vtoc sanity */
|
|
|
|
__le32 v_version; /* layout version */
|
|
|
|
char v_volume[8]; /* volume name */
|
|
|
|
__le16 v_sectorsz; /* sector size in bytes */
|
|
|
|
__le16 v_nparts; /* number of partitions */
|
|
|
|
unsigned int v_reserved[10]; /* free space */
|
|
|
|
struct solaris_x86_slice
|
|
|
|
v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */
|
|
|
|
unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp (unsupported) */
|
|
|
|
char v_asciilabel[128]; /* for compatibility */
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* CONFIG_SOLARIS_X86_PARTITION */
|
|
|
|
|
|
|
|
#ifdef CONFIG_BSD_DISKLABEL
|
|
|
|
/*
|
|
|
|
* BSD disklabel support by Yossi Gottlieb <yogo@math.tau.ac.il>
|
|
|
|
* updated by Marc Espie <Marc.Espie@openbsd.org>
|
|
|
|
*/
|
|
|
|
|
|
|
|
/* check against BSD src/sys/sys/disklabel.h for consistency */
|
|
|
|
|
|
|
|
#define BSD_DISKMAGIC		(0x82564557UL)	/* The disk magic number */
#define BSD_MAXPARTITIONS	16
#define OPENBSD_MAXPARTITIONS	16
#define BSD_FS_UNUSED		0	/* disklabel unused partition entry ID */
|
|
|
|
struct bsd_disklabel {
|
|
|
|
__le32 d_magic; /* the magic number */
|
|
|
|
__s16 d_type; /* drive type */
|
|
|
|
__s16 d_subtype; /* controller/d_type specific */
|
|
|
|
char d_typename[16]; /* type name, e.g. "eagle" */
|
|
|
|
char d_packname[16]; /* pack identifier */
|
|
|
|
__u32 d_secsize; /* # of bytes per sector */
|
|
|
|
__u32 d_nsectors; /* # of data sectors per track */
|
|
|
|
__u32 d_ntracks; /* # of tracks per cylinder */
|
|
|
|
__u32 d_ncylinders; /* # of data cylinders per unit */
|
|
|
|
__u32 d_secpercyl; /* # of data sectors per cylinder */
|
|
|
|
__u32 d_secperunit; /* # of data sectors per unit */
|
|
|
|
__u16 d_sparespertrack; /* # of spare sectors per track */
|
|
|
|
__u16 d_sparespercyl; /* # of spare sectors per cylinder */
|
|
|
|
__u32 d_acylinders; /* # of alt. cylinders per unit */
|
|
|
|
__u16 d_rpm; /* rotational speed */
|
|
|
|
__u16 d_interleave; /* hardware sector interleave */
|
|
|
|
__u16 d_trackskew; /* sector 0 skew, per track */
|
|
|
|
__u16 d_cylskew; /* sector 0 skew, per cylinder */
|
|
|
|
__u32 d_headswitch; /* head switch time, usec */
|
|
|
|
__u32 d_trkseek; /* track-to-track seek, usec */
|
|
|
|
__u32 d_flags; /* generic flags */
|
|
|
|
#define NDDATA 5
|
|
|
|
__u32 d_drivedata[NDDATA]; /* drive-type specific information */
|
|
|
|
#define NSPARE 5
|
|
|
|
__u32 d_spare[NSPARE]; /* reserved for future use */
|
|
|
|
__le32 d_magic2; /* the magic number (again) */
|
|
|
|
__le16 d_checksum; /* xor of data incl. partitions */
|
|
|
|
|
|
|
|
/* filesystem and partition information: */
|
|
|
|
__le16 d_npartitions; /* number of partitions in following */
|
|
|
|
__le32 d_bbsize; /* size of boot area at sn0, bytes */
|
|
|
|
__le32 d_sbsize; /* max size of fs superblock, bytes */
|
|
|
|
struct bsd_partition { /* the partition table */
|
|
|
|
__le32 p_size; /* number of sectors in partition */
|
|
|
|
__le32 p_offset; /* starting sector */
|
|
|
|
__le32 p_fsize; /* filesystem basic fragment size */
|
|
|
|
__u8 p_fstype; /* filesystem type, see below */
|
|
|
|
__u8 p_frag; /* filesystem fragments per block */
|
|
|
|
__le16 p_cpg; /* filesystem cylinders per group */
|
|
|
|
} d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */
|
|
|
|
};
|
|
|
|
|
|
|
|
#endif /* CONFIG_BSD_DISKLABEL */
|
|
|
|
|
|
|
|
#ifdef CONFIG_UNIXWARE_DISKLABEL
|
|
|
|
/*
|
|
|
|
* Unixware slices support by Andrzej Krzysztofowicz <ankry@mif.pg.gda.pl>
|
|
|
|
* and Krzysztof G. Baranowski <kgb@knm.org.pl>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#define UNIXWARE_DISKMAGIC	(0xCA5E600DUL)	/* The disk magic number */
#define UNIXWARE_DISKMAGIC2	(0x600DDEEEUL)	/* The slice table magic nr */
#define UNIXWARE_NUMSLICE	16
#define UNIXWARE_FS_UNUSED	0		/* Unused slice entry ID */
|
|
|
|
|
|
|
|
struct unixware_slice {
|
|
|
|
__le16 s_label; /* label */
|
|
|
|
__le16 s_flags; /* permission flags */
|
|
|
|
__le32 start_sect; /* starting sector */
|
|
|
|
__le32 nr_sects; /* number of sectors in slice */
|
|
|
|
};
|
|
|
|
|
|
|
|
struct unixware_disklabel {
|
|
|
|
__le32 d_type; /* drive type */
|
|
|
|
__le32 d_magic; /* the magic number */
|
|
|
|
__le32 d_version; /* version number */
|
|
|
|
char d_serial[12]; /* serial number of the device */
|
|
|
|
__le32 d_ncylinders; /* # of data cylinders per device */
|
|
|
|
__le32 d_ntracks; /* # of tracks per cylinder */
|
|
|
|
__le32 d_nsectors; /* # of data sectors per track */
|
|
|
|
__le32 d_secsize; /* # of bytes per sector */
|
|
|
|
__le32 d_part_start; /* # of first sector of this partition */
|
|
|
|
__le32 d_unknown1[12]; /* ? */
|
|
|
|
__le32 d_alt_tbl; /* byte offset of alternate table */
|
|
|
|
__le32 d_alt_len; /* byte length of alternate table */
|
|
|
|
__le32 d_phys_cyl; /* # of physical cylinders per device */
|
|
|
|
__le32 d_phys_trk; /* # of physical tracks per cylinder */
|
|
|
|
__le32 d_phys_sec; /* # of physical sectors per track */
|
|
|
|
__le32 d_phys_bytes; /* # of physical bytes per sector */
|
|
|
|
__le32 d_unknown2; /* ? */
|
|
|
|
__le32 d_unknown3; /* ? */
|
|
|
|
__le32 d_pad[8]; /* pad */
|
|
|
|
|
|
|
|
struct unixware_vtoc {
|
|
|
|
__le32 v_magic; /* the magic number */
|
|
|
|
__le32 v_version; /* version number */
|
|
|
|
char v_name[8]; /* volume name */
|
|
|
|
__le16 v_nslices; /* # of slices */
|
|
|
|
__le16 v_unknown1; /* ? */
|
|
|
|
__le32 v_reserved[10]; /* reserved */
|
|
|
|
struct unixware_slice
|
|
|
|
v_slice[UNIXWARE_NUMSLICE]; /* slice headers */
|
|
|
|
} vtoc;
|
|
|
|
|
|
|
|
}; /* 408 */
|
|
|
|
|
|
|
|
#endif /* CONFIG_UNIXWARE_DISKLABEL */
|
|
|
|
|
|
|
|
#ifdef CONFIG_MINIX_SUBPARTITION
|
|
|
|
/* Number of Minix subpartitions. */
#define MINIX_NR_SUBPARTITIONS	4
|
|
|
|
#endif /* CONFIG_MINIX_SUBPARTITION */
|
|
|
|
|
2007-02-11 14:50:00 +07:00
|
|
|
/* ADDPART_FLAG_* values; RAID and WHOLEDISK occupy distinct bits. */
#define ADDPART_FLAG_NONE	0
#define ADDPART_FLAG_RAID	1
#define ADDPART_FLAG_WHOLEDISK	2
|
|
|
|
|
2008-08-25 17:47:22 +07:00
|
|
|
/* Device number and disk name helpers (declarations only). */
extern int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
extern void blk_free_devt(dev_t devt);
extern dev_t blk_lookup_devt(const char *name, int partno);
extern char *disk_name(struct gendisk *hd, int partno, char *buf);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-08-25 17:56:15 +07:00
|
|
|
extern int disk_expand_part_tbl(struct gendisk *disk, int target);
|
2005-04-17 05:20:36 +07:00
|
|
|
extern int rescan_partitions(struct gendisk *disk, struct block_device *bdev);
|
2012-03-02 16:38:33 +07:00
|
|
|
extern int invalidate_partitions(struct gendisk *disk, struct block_device *bdev);
|
2008-11-10 13:29:58 +07:00
|
|
|
extern struct hd_struct * __must_check add_partition(struct gendisk *disk,
|
|
|
|
int partno, sector_t start,
|
2010-09-01 03:47:05 +07:00
|
|
|
sector_t len, int flags,
|
|
|
|
struct partition_meta_info
|
|
|
|
*info);
|
2011-01-07 14:43:37 +07:00
|
|
|
extern void __delete_partition(struct hd_struct *);
|
2005-04-17 05:20:36 +07:00
|
|
|
extern void delete_partition(struct gendisk *, int);
|
2007-05-09 16:33:24 +07:00
|
|
|
extern void printk_all_partitions(void);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2005-06-23 14:08:19 +07:00
|
|
|
extern struct gendisk *alloc_disk_node(int minors, int node_id);
|
2005-04-17 05:20:36 +07:00
|
|
|
extern struct gendisk *alloc_disk(int minors);
|
|
|
|
extern struct kobject *get_disk(struct gendisk *disk);
|
|
|
|
extern void put_disk(struct gendisk *disk);
|
2007-05-22 03:08:01 +07:00
|
|
|
extern void blk_register_region(dev_t devt, unsigned long range,
|
2005-04-17 05:20:36 +07:00
|
|
|
struct module *module,
|
|
|
|
struct kobject *(*probe)(dev_t, int *, void *),
|
|
|
|
int (*lock)(dev_t, void *),
|
|
|
|
void *data);
|
2007-05-22 03:08:01 +07:00
|
|
|
extern void blk_unregister_region(dev_t devt, unsigned long range);
|
2005-04-17 05:20:36 +07:00
|
|
|
|
2008-08-25 17:56:09 +07:00
|
|
|
/* Device attribute "show" callbacks (declarations only). */
extern ssize_t part_size_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf);
extern ssize_t part_stat_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf);
|
block: Separate read and write statistics of in_flight requests v2
Commit a9327cac440be4d8333bba975cbbf76045096275 added separate read
and write statistics of in_flight requests, and exported the number
of read and write requests in progress separately through sysfs.
But Corrado Zoccolo <czoccolo@gmail.com> reported getting strange
output from "iostat -kx 2". Global values for service time and
utilization were garbage. For interval values, utilization was always
100%, and service time is higher than normal.
So this was reverted by commit 0f78ab9899e9d6acb09d5465def618704255963b
The problem was in part_round_stats_single(), I missed the following:
if (now == part->stamp)
return;
- if (part->in_flight) {
+ if (part_in_flight(part)) {
__part_stat_add(cpu, part, time_in_queue,
part_in_flight(part) * (now - part->stamp));
__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
With this chunk included, the reported regression gets fixed.
Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
--
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2009-10-07 01:16:55 +07:00
|
|
|
/* Device attribute callback for the in-flight counters (declaration only). */
extern ssize_t part_inflight_show(struct device *dev,
				  struct device_attribute *attr,
				  char *buf);
#ifdef CONFIG_FAIL_MAKE_REQUEST
/* Only declared when CONFIG_FAIL_MAKE_REQUEST is enabled. */
extern ssize_t part_fail_show(struct device *dev,
			      struct device_attribute *attr,
			      char *buf);
extern ssize_t part_fail_store(struct device *dev,
			       struct device_attribute *attr,
			       const char *buf,
			       size_t count);
#endif /* CONFIG_FAIL_MAKE_REQUEST */
|
2008-08-25 17:56:09 +07:00
|
|
|
|
2011-01-07 14:43:37 +07:00
|
|
|
static inline void hd_ref_init(struct hd_struct *part)
|
|
|
|
{
|
|
|
|
atomic_set(&part->ref, 1);
|
|
|
|
smp_mb();
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void hd_struct_get(struct hd_struct *part)
|
|
|
|
{
|
|
|
|
atomic_inc(&part->ref);
|
|
|
|
smp_mb__after_atomic_inc();
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int hd_struct_try_get(struct hd_struct *part)
|
|
|
|
{
|
|
|
|
return atomic_inc_not_zero(&part->ref);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void hd_struct_put(struct hd_struct *part)
|
|
|
|
{
|
|
|
|
if (atomic_dec_and_test(&part->ref))
|
|
|
|
__delete_partition(part);
|
|
|
|
}
|
|
|
|
|
2012-08-01 17:24:18 +07:00
|
|
|
/*
|
|
|
|
* Any access of part->nr_sects which is not protected by partition
|
|
|
|
* bd_mutex or gendisk bdev bd_mutex, should be done using this
|
|
|
|
* accessor function.
|
|
|
|
*
|
|
|
|
* Code written along the lines of i_size_read() and i_size_write().
|
|
|
|
* CONFIG_PREEMPT case optimizes the case of UP kernel with preemption
|
|
|
|
* on.
|
|
|
|
*/
|
|
|
|
static inline sector_t part_nr_sects_read(struct hd_struct *part)
|
|
|
|
{
|
|
|
|
#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
|
|
|
|
sector_t nr_sects;
|
|
|
|
unsigned seq;
|
|
|
|
do {
|
|
|
|
seq = read_seqcount_begin(&part->nr_sects_seq);
|
|
|
|
nr_sects = part->nr_sects;
|
|
|
|
} while (read_seqcount_retry(&part->nr_sects_seq, seq));
|
|
|
|
return nr_sects;
|
|
|
|
#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
|
|
|
|
sector_t nr_sects;
|
|
|
|
|
|
|
|
preempt_disable();
|
|
|
|
nr_sects = part->nr_sects;
|
|
|
|
preempt_enable();
|
|
|
|
return nr_sects;
|
|
|
|
#else
|
|
|
|
return part->nr_sects;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Should be called with mutex lock held (typically bd_mutex) of partition
|
|
|
|
* to provide mutual exlusion among writers otherwise seqcount might be
|
|
|
|
* left in wrong state leaving the readers spinning infinitely.
|
|
|
|
*/
|
|
|
|
static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
|
|
|
|
{
|
|
|
|
#if BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_SMP)
|
|
|
|
write_seqcount_begin(&part->nr_sects_seq);
|
|
|
|
part->nr_sects = size;
|
|
|
|
write_seqcount_end(&part->nr_sects_seq);
|
|
|
|
#elif BITS_PER_LONG==32 && defined(CONFIG_LBDAF) && defined(CONFIG_PREEMPT)
|
|
|
|
preempt_disable();
|
|
|
|
part->nr_sects = size;
|
|
|
|
preempt_enable();
|
|
|
|
#else
|
|
|
|
part->nr_sects = size;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2007-05-11 18:29:54 +07:00
|
|
|
#else /* CONFIG_BLOCK */
|
|
|
|
|
|
|
|
/* !CONFIG_BLOCK stub: there are no partitions to print, so do nothing. */
static inline void printk_all_partitions(void)
{
}
|
|
|
|
|
2008-09-03 14:01:09 +07:00
|
|
|
/*
 * !CONFIG_BLOCK stub: ignores @name and @partno and always returns
 * MKDEV(0, 0).
 */
static inline dev_t blk_lookup_devt(const char *name, int partno)
{
	return MKDEV(0, 0);
}
|
|
|
|
|
2007-05-11 18:29:54 +07:00
|
|
|
#endif /* CONFIG_BLOCK */
|
[PATCH] BLOCK: Make it possible to disable the block layer [try #6]
Make it possible to disable the block layer. Not all embedded devices require
it, some can make do with just JFFS2, NFS, ramfs, etc - none of which require
the block layer to be present.
This patch does the following:
(*) Introduces CONFIG_BLOCK to disable the block layer, buffering and blockdev
support.
(*) Adds dependencies on CONFIG_BLOCK to any configuration item that controls
an item that uses the block layer. This includes:
(*) Block I/O tracing.
(*) Disk partition code.
(*) All filesystems that are block based, eg: Ext3, ReiserFS, ISOFS.
(*) The SCSI layer. As far as I can tell, even SCSI chardevs use the
block layer to do scheduling. Some drivers that use SCSI facilities -
such as USB storage - end up disabled indirectly from this.
(*) Various block-based device drivers, such as IDE and the old CDROM
drivers.
(*) MTD blockdev handling and FTL.
(*) JFFS - which uses set_bdev_super(), something it could avoid doing by
taking a leaf out of JFFS2's book.
(*) Makes most of the contents of linux/blkdev.h, linux/buffer_head.h and
linux/elevator.h contingent on CONFIG_BLOCK being set. sector_div() is,
however, still used in places, and so is still available.
(*) Also made contingent are the contents of linux/mpage.h, linux/genhd.h and
parts of linux/fs.h.
(*) Makes a number of files in fs/ contingent on CONFIG_BLOCK.
(*) Makes mm/bounce.c (bounce buffering) contingent on CONFIG_BLOCK.
(*) set_page_dirty() doesn't call __set_page_dirty_buffers() if CONFIG_BLOCK
is not enabled.
(*) fs/no-block.c is created to hold out-of-line stubs and things that are
required when CONFIG_BLOCK is not set:
(*) Default blockdev file operations (to give error ENODEV on opening).
(*) Makes some /proc changes:
(*) /proc/devices does not list any blockdevs.
(*) /proc/diskstats and /proc/partitions are contingent on CONFIG_BLOCK.
(*) Makes some compat ioctl handling contingent on CONFIG_BLOCK.
(*) If CONFIG_BLOCK is not defined, makes sys_quotactl() return -ENODEV if
given command other than Q_SYNC or if a special device is specified.
(*) In init/do_mounts.c, no reference is made to the blockdev routines if
CONFIG_BLOCK is not defined. This does not prohibit NFS roots or JFFS2.
(*) The bdflush, ioprio_set and ioprio_get syscalls can now be absent (return
error ENOSYS by way of cond_syscall if so).
(*) The seclvl_bd_claim() and seclvl_bd_release() security calls do nothing if
CONFIG_BLOCK is not set, since they can't then happen.
Signed-Off-By: David Howells <dhowells@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
2006-10-01 01:45:40 +07:00
|
|
|
|
2008-03-12 23:52:56 +07:00
|
|
|
#endif /* _LINUX_GENHD_H */
|