md updates for 4.4.

Two major components to this update.
 
 1/ the clustered-raid1 support from SUSE is nearly
   complete.  There are a few outstanding issues being
   worked on.  Maybe half a dozen patches will bring
   this to a usable state.
 
 2/ The first stage of journalled-raid5 support from
    Facebook makes an appearance.  With a journal
    device configured (typically NVRAM or SSD), the
    "RAID5 write hole" should be closed - a crash
    during degraded operations cannot result in data
    corruption.
 
    The next stage will be to use the journal as a
    write-behind cache so that latency can be reduced
    and in some cases throughput increased by
    performing more full-stripe writes.
 -----BEGIN PGP SIGNATURE-----
 Version: GnuPG v2
 
 iQIcBAABCAAGBQJWNX9RAAoJEDnsnt1WYoG5bYMP/jI0pV3wcbs7mZQAa8S/V0lU
 2l25x4MdwDvqVKMfjIc/C5J08QNgcrgSvhiVPCEOK0w18q395vep9f6gFKbMHhu/
 lWU3PLHGw8XBHp5yEnxrpQkN0pRrNjh5NqIdlVMBNyL6u+RZPS2ZuzxJ8wiNAFg1
 MypNkgoUu6s+nBp4DWWnMGYhBc+szBR+gTYAzGiZ8vqOH9uiSJ2SsGG5aRVUN/af
 oMYvJAf9aA6uj+xSzNlXIaLfWJIrshQYS1jU/W4gTm0DwK9yqbTxvubJaE0SGu/o
 73FGU8tmQ6ELYfsp3D/jmfUkE7weiNEQhdVb/4wy1A/SGc+W7Ju9pxfhm8ra57s0
 /BCkfwWZXEvx1flegXfK1mC6EMpMIcGAD2FQEhmQbW6wTdDwtNyEhIePDVGJwD/F
 rhEThFa+Dg9+xnBGnS6OUK3EpXgml2hAeAC7uA3TVSAnWd/9/Mpim6fZhqrB/v9L
 Ik0tZt+H4nxYaheZjKlKhuXUQYcUWGiMb67bGMem/YAlMa4y9C9qF+9mPXxyjVlI
 hBsd5SfZNz99DyB/bO8BumQeIWlTfzLeFzWW67eQ864LRKO6k0/VIbPZHCfn2oVG
 XvyC2fUhNOIURP3IMxcyHYxOA7Mu6EDsVVDTpuqLVbZQ5IPjDEfQ54yB/BLUvbX/
 Gh2/tKn7Xc25HuLAFEbs
 =TD5o
 -----END PGP SIGNATURE-----

Merge tag 'md/4.4' of git://neil.brown.name/md

Pull md updates from Neil Brown:
 "Two major components to this update.

   1) The clustered-raid1 support from SUSE is nearly complete.  There
      are a few outstanding issues being worked on.  Maybe half a dozen
      patches will bring this to a usable state.

   2) The first stage of journalled-raid5 support from Facebook makes an
      appearance.  With a journal device configured (typically NVRAM or
      SSD), the "RAID5 write hole" should be closed - a crash during
      degraded operations cannot result in data corruption.

      The next stage will be to use the journal as a write-behind cache
      so that latency can be reduced and in some cases throughput
      increased by performing more full-stripe writes"

* tag 'md/4.4' of git://neil.brown.name/md: (66 commits)
  MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW
  MD: set journal disk ->raid_disk
  MD: kick out journal disk if it's not fresh
  raid5-cache: start raid5 readonly if journal is missing
  MD: add new bit to indicate raid array with journal
  raid5-cache: IO error handling
  raid5: journal disk can't be removed
  raid5-cache: add trim support for log
  MD: fix info output for journal disk
  raid5-cache: use bio chaining
  raid5-cache: small log->seq cleanup
  raid5-cache: new helper: r5_reserve_log_entry
  raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta
  raid5-cache: take rdev->data_offset into account early on
  raid5-cache: refactor bio allocation
  raid5-cache: clean up r5l_get_meta
  raid5-cache: simplify state machine when caches flushes are not needed
  raid5-cache: factor out a helper to run all stripes for an I/O unit
  raid5-cache: rename flushed_ios to finished_ios
  raid5-cache: free I/O units earlier
  ...
This commit is contained in:
Linus Torvalds 2015-11-04 21:12:47 -08:00
commit ac322de6bf
14 changed files with 1991 additions and 312 deletions

View file

@ -89,6 +89,12 @@
* read requests will only be sent here in
* dire need
*/
#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */
/*
 * Disk role values.  Roles up to MD_DISK_ROLE_MAX denote regular disks;
 * the three values above it (0xfffd..0xffff) name the special roles
 * journal, faulty and spare.
 */
#define MD_DISK_ROLE_SPARE 0xffff
#define MD_DISK_ROLE_FAULTY 0xfffe
#define MD_DISK_ROLE_JOURNAL 0xfffd
#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */
typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */
@ -252,7 +258,10 @@ struct mdp_superblock_1 {
__le64 data_offset; /* sector start of data, often 0 */
__le64 data_size; /* sectors in this device that can be used for data */
__le64 super_offset; /* sector start of this superblock */
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
union {
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
__le64 journal_tail;/* journal tail of journal device (from data_offset) */
};
__le32 dev_number; /* permanent identifier of this device - not role in raid */
__le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
@ -302,6 +311,8 @@ struct mdp_superblock_1 {
#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
* is guided by bitmap.
*/
/* Both bits below are also OR-ed into MD_FEATURE_ALL. */
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* array has a journal device (write cache support) */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
@ -310,6 +321,66 @@ struct mdp_superblock_1 {
|MD_FEATURE_RESHAPE_BACKWARDS \
|MD_FEATURE_NEW_OFFSET \
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
)
/*
 * Common header placed at the start of each payload record.  It is embedded
 * as the first member of the data/parity and flush payload structs and is
 * the element type of r5l_meta_block.payloads[].
 */
struct r5l_payload_header {
	__le16 type;	/* presumably an enum r5l_payload_type value - confirm */
	__le16 flags;	/* presumably one of the per-payload flag enums - confirm */
} __attribute__ ((__packed__));
/*
 * Payload kinds.  The numeric values are spelled out explicitly and must
 * stay fixed.
 */
enum r5l_payload_type {
	R5LOG_PAYLOAD_DATA	= 0,
	R5LOG_PAYLOAD_PARITY	= 1,
	R5LOG_PAYLOAD_FLUSH	= 2,
};
/*
 * Payload record for logged data or parity.  Ends in a flexible array of
 * checksums, so the record is variable-length.
 */
struct r5l_payload_data_parity {
	struct r5l_payload_header header;
	/* Size of the data/parity, in sectors.  Each 4k has a checksum. */
	__le32 size;
	/*
	 * Location, in sectors: the raid sector for data, the stripe
	 * sector for parity.
	 */
	__le64 location;
	__le32 checksum[];
} __attribute__ ((__packed__));
/*
 * Flag values for data/parity payloads.
 *
 * RESHAPED and RESHAPING are only set while there is reshape activity, and
 * the data and parity payloads of one stripe should carry the same flag:
 *
 *   RESHAPED  - reshape is running, and this stripe has finished reshape
 *   RESHAPING - reshape is running, and this stripe is not reshaped yet
 *
 * NOTE(review): RESHAPING (3) has the same value as DISCARD | RESHAPED
 * (1 | 2); confirm whether these are meant to be distinct values or
 * combinable bits before relying on either reading.
 */
enum r5l_payload_data_parity_flag {
	R5LOG_PAYLOAD_FLAG_DISCARD	= 1,	/* payload is discard */
	R5LOG_PAYLOAD_FLAG_RESHAPED	= 2,
	R5LOG_PAYLOAD_FLAG_RESHAPING	= 3,
};
/*
 * Flush payload record.  Ends in a flexible array of 64-bit entries
 * (flush_stripes) whose total size is given by 'size'.
 */
struct r5l_payload_flush {
	struct r5l_payload_header header;
	__le32 size;		/* size of flush_stripes, in bytes */
	__le64 flush_stripes[];
} __attribute__ ((__packed__));
/* Flag values for flush payloads. */
enum r5l_payload_flush_flag {
	/* data represents whole stripe */
	R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE	= 1,
};
/*
 * Meta block of the raid5 log: fixed fields followed by a variable-length
 * run of payload records.
 *
 * NOTE(review): magic and version are presumably R5LOG_MAGIC and
 * R5LOG_VERSION - confirm against the code that writes the block.
 */
struct r5l_meta_block {
	__le32 magic;
	__le32 checksum;
	__u8 version;
	/* Padding members; the "pading" spelling is part of the field names. */
	__u8 __zero_pading_1;
	__le16 __zero_pading_2;
	__le32 meta_size;	/* whole size of the block */
	__le64 seq;
	/* Current position, in sectors, counted from rdev->data_offset. */
	__le64 position;
	struct r5l_payload_header payloads[];
} __attribute__ ((__packed__));
/*
 * raid5 log format identification.
 * NOTE(review): presumably the values stored in r5l_meta_block.version and
 * r5l_meta_block.magic - confirm against the code that fills those fields.
 */
#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509
#endif