extractedLnx/linux-2.6.17/drivers/md/raid5.c_handle_stripe.c
static void handle_stripe(struct stripe_head *sh)
{
raid5_conf_t *conf = sh->raid_conf;
int disks = sh->disks;
struct bio *return_bi= NULL;
struct bio *bi;
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
int non_overwrite = 0;
int failed_num=0;
struct r5dev *dev;
PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
(unsigned long long)sh->sector, atomic_read(&sh->count),
sh->pd_idx);
spin_lock(&sh->lock);
clear_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state);
syncing = test_bit(STRIPE_SYNCING, &sh->state);
expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
/* Now to look around and see what can be done */
rcu_read_lock();
for (i=disks; i--; ) {
mdk_rdev_t *rdev;
dev = &sh->dev[i];
clear_bit(R5_Insync, &dev->flags);
PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
i, dev->flags, dev->toread, dev->towrite, dev->written);
/* maybe we can reply to a read */
if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
struct bio *rbi, *rbi2;
PRINTK("Return read for disc %d\n", i);
spin_lock_irq(&conf->device_lock);
rbi = dev->toread;
dev->toread = NULL;
if (test_and_clear_bit(R5_Overlap, &dev->flags))
wake_up(&conf->wait_for_overlap);
spin_unlock_irq(&conf->device_lock);
while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
copy_data(0, rbi, dev->page, dev->sector);
rbi2 = r5_next_bio(rbi, dev->sector);
spin_lock_irq(&conf->device_lock);
if (--rbi->bi_phys_segments == 0) {
rbi->bi_next = return_bi;
return_bi = rbi;
}
spin_unlock_irq(&conf->device_lock);
rbi = rbi2;
}
}
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
if (dev->toread) to_read++;
if (dev->towrite) {
to_write++;
if (!test_bit(R5_OVERWRITE, &dev->flags))
non_overwrite++;
}
if (dev->written) written++;
rdev = rcu_dereference(conf->disks[i].rdev);
if (!rdev || !test_bit(In_sync, &rdev->flags)) {
/* The ReadError flag will just be confusing now */
clear_bit(R5_ReadError, &dev->flags);
clear_bit(R5_ReWrite, &dev->flags);
}
if (!rdev || !test_bit(In_sync, &rdev->flags)
|| test_bit(R5_ReadError, &dev->flags)) {
failed++;
failed_num = i;
} else
set_bit(R5_Insync, &dev->flags);
}
rcu_read_unlock();
PRINTK("locked=%d uptodate=%d to_read=%d"
" to_write=%d failed=%d failed_num=%d\n",
locked, uptodate, to_read, to_write, failed, failed_num);
/* check if the array has lost two devices and, if so, some requests might
* need to be failed
*/
if (failed > 1 && to_read+to_write+written) {
for (i=disks; i--; ) {
int bitmap_end = 0;
if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
mdk_rdev_t *rdev;
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && test_bit(In_sync, &rdev->flags))
/* multiple read failures in one stripe */
md_error(conf->mddev, rdev);
rcu_read_unlock();
}
spin_lock_irq(&conf->device_lock);
/* fail all writes first */
bi = sh->dev[i].towrite;
sh->dev[i].towrite = NULL;
if (bi) { to_write--; bitmap_end = 1; }
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
if (--bi->bi_phys_segments == 0) {
md_write_end(conf->mddev);
bi->bi_next = return_bi;
return_bi = bi;
}
bi = nextbi;
}
/* and fail all 'written' */
bi = sh->dev[i].written;
sh->dev[i].written = NULL;
if (bi) bitmap_end = 1;
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS) {
struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
if (--bi->bi_phys_segments == 0) {
md_write_end(conf->mddev);
bi->bi_next = return_bi;
return_bi = bi;
}
bi = bi2;
}
/* fail any reads if this device is non-operational */
if (!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags)) {
bi = sh->dev[i].toread;
sh->dev[i].toread = NULL;
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
if (bi) to_read--;
while (bi && bi->bi_sector < sh->dev[i].sector + STRIPE_SECTORS){
struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
clear_bit(BIO_UPTODATE, &bi->bi_flags);
if (--bi->bi_phys_segments == 0) {
bi->bi_next = return_bi;
return_bi = bi;
}
bi = nextbi;
}
}
spin_unlock_irq(&conf->device_lock);
if (bitmap_end)
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS, 0, 0);
}
}
if (failed > 1 && syncing) {
md_done_sync(conf->mddev, STRIPE_SECTORS,0);
clear_bit(STRIPE_SYNCING, &sh->state);
syncing = 0;
}
/* might be able to return some write requests if the parity block
* is safe, or on a failed drive
*/
dev = &sh->dev[sh->pd_idx];
if ( written &&
( (test_bit(R5_Insync, &dev->flags) && !test_bit(R5_LOCKED, &dev->flags) &&
test_bit(R5_UPTODATE, &dev->flags))
|| (failed == 1 && failed_num == sh->pd_idx))
) {
/* any written block on an uptodate or failed drive can be returned.
* Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
* never LOCKED, so we don't need to test 'failed' directly.
*/
for (i=disks; i--; )
if (sh->dev[i].written) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) &&
test_bit(R5_UPTODATE, &dev->flags) ) {
/* We can return any write requests */
struct bio *wbi, *wbi2;
int bitmap_end = 0;
PRINTK("Return write for disc %d\n", i);
spin_lock_irq(&conf->device_lock);
wbi = dev->written;
dev->written = NULL;
while (wbi && wbi->bi_sector < dev->sector + STRIPE_SECTORS) {
wbi2 = r5_next_bio(wbi, dev->sector);
if (--wbi->bi_phys_segments == 0) {
md_write_end(conf->mddev);
wbi->bi_next = return_bi;
return_bi = wbi;
}
wbi = wbi2;
}
if (dev->towrite == NULL)
bitmap_end = 1;
spin_unlock_irq(&conf->device_lock);
if (bitmap_end)
bitmap_endwrite(conf->mddev->bitmap, sh->sector,
STRIPE_SECTORS,
!test_bit(STRIPE_DEGRADED, &sh->state), 0);
}
}
}
/* Now we might consider reading some blocks, either to check/generate
* parity, or to satisfy requests
* or to load a block that is being partially written.
*/
if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
for (i=disks; i--;) {
dev = &sh->dev[i];
if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
(dev->toread ||
(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
syncing ||
expanding ||
(failed && (sh->dev[failed_num].toread ||
(sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags))))
)
) {
/* we would like to get this block, possibly
* by computing it, but we might not be able to
*/
if (uptodate == disks-1) {
PRINTK("Computing block %d\n", i);
compute_block(sh, i);
uptodate++;
} else if (test_bit(R5_Insync, &dev->flags)) {
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
#if 0
/* if I am just reading this block and we don't have
a failed drive, or any pending writes then sidestep the cache */
if (sh->bh_read[i] && !sh->bh_read[i]->b_reqnext &&
! syncing && !failed && !to_write) {
sh->bh_cache[i]->b_page = sh->bh_read[i]->b_page;
sh->bh_cache[i]->b_data = sh->bh_read[i]->b_data;
}
#endif
locked++;
PRINTK("Reading block %d (sync=%d)\n",
i, syncing);
}
}
}
set_bit(STRIPE_HANDLE, &sh->state);
}
/* now to consider writing and what else, if anything should be read */
if (to_write) {
int rmw=0, rcw=0;
for (i=disks ; i--;) {
/* would I have to read this buffer for read_modify_write */
dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
(!test_bit(R5_LOCKED, &dev->flags)
#if 0
|| sh->bh_page[i]!=bh->b_page
#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)
/* && !(!mddev->insync && i == sh->pd_idx) */
)
rmw++;
else rmw += 2*disks; /* cannot read it */
}
/* Would I have to read this buffer for reconstruct_write */
if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
(!test_bit(R5_LOCKED, &dev->flags)
#if 0
|| sh->bh_page[i] != bh->b_page
#endif
) &&
!test_bit(R5_UPTODATE, &dev->flags)) {
if (test_bit(R5_Insync, &dev->flags)) rcw++;
else rcw += 2*disks;
}
}
PRINTK("for sector %llu, rmw=%d rcw=%d\n",
(unsigned long long)sh->sector, rmw, rcw);
set_bit(STRIPE_HANDLE, &sh->state);
if (rmw < rcw && rmw > 0)
/* prefer read-modify-write, but need to get some data */
for (i=disks; i--;) {
dev = &sh->dev[i];
if ((dev->towrite || i == sh->pd_idx) &&
!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
test_bit(R5_Insync, &dev->flags)) {
if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
{
PRINTK("Read_old block %d for r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
locked++;
} else {
set_bit(STRIPE_DELAYED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
}
if (rcw <= rmw && rcw > 0)
/* want reconstruct write, but need to get some data */
for (i=disks; i--;) {
dev = &sh->dev[i];
if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
test_bit(R5_Insync, &dev->flags)) {
if (test_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
{
PRINTK("Read_old block %d for Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, &dev->flags);
locked++;
} else {
set_bit(STRIPE_DELAYED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
}
/* now if nothing is locked, and if we have enough data, we can start a write request */
if (locked == 0 && (rcw == 0 ||rmw == 0) &&
!test_bit(STRIPE_BIT_DELAY, &sh->state)) {
PRINTK("Computing parity...\n");
compute_parity(sh, rcw==0 ? RECONSTRUCT_WRITE : READ_MODIFY_WRITE);
/* now every locked buffer is ready to be written */
for (i=disks; i--;)
if (test_bit(R5_LOCKED, &sh->dev[i].flags)) {
PRINTK("Writing block %d\n", i);
locked++;
set_bit(R5_Wantwrite, &sh->dev[i].flags);
if (!test_bit(R5_Insync, &sh->dev[i].flags)
|| (i==sh->pd_idx && failed == 0))
set_bit(STRIPE_INSYNC, &sh->state);
}
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
md_wakeup_thread(conf->mddev->thread);
}
}
}
/* maybe we need to check and possibly fix the parity for this stripe
* Any reads will already have been scheduled, so we just see if enough data
* is available
*/
if (syncing && locked == 0 &&
!test_bit(STRIPE_INSYNC, &sh->state)) {
set_bit(STRIPE_HANDLE, &sh->state);
if (failed == 0) {
char *pagea;
BUG_ON(uptodate != disks);
compute_parity(sh, CHECK_PARITY);
uptodate--;
pagea = page_address(sh->dev[sh->pd_idx].page);
if ((*(u32*)pagea) == 0 &&
!memcmp(pagea, pagea+4, STRIPE_SIZE-4)) {
/* parity is correct (on disc, not in buffer any more) */
set_bit(STRIPE_INSYNC, &sh->state);
} else {
conf->mddev->resync_mismatches += STRIPE_SECTORS;
if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
/* don't try to repair!! */
set_bit(STRIPE_INSYNC, &sh->state);
else {
compute_block(sh, sh->pd_idx);
uptodate++;
}
}
}
if (!test_bit(STRIPE_INSYNC, &sh->state)) {
/* either failed parity check, or recovery is happening */
if (failed==0)
failed_num = sh->pd_idx;
dev = &sh->dev[failed_num];
BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
BUG_ON(uptodate != disks);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
clear_bit(STRIPE_DEGRADED, &sh->state);
locked++;
set_bit(STRIPE_INSYNC, &sh->state);
}
}
if (syncing && locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
md_done_sync(conf->mddev, STRIPE_SECTORS,1);
clear_bit(STRIPE_SYNCING, &sh->state);
}
/* If the failed drive is just a ReadError, then we might need to progress
* the repair/check process
*/
if (failed == 1 && ! conf->mddev->ro &&
test_bit(R5_ReadError, &sh->dev[failed_num].flags)
&& !test_bit(R5_LOCKED, &sh->dev[failed_num].flags)
&& test_bit(R5_UPTODATE, &sh->dev[failed_num].flags)
) {
dev = &sh->dev[failed_num];
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
locked++;
} else {
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
locked++;
}
}
if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
compute_parity(sh, RECONSTRUCT_WRITE);
for (i= conf->raid_disks; i--;) {
set_bit(R5_LOCKED, &sh->dev[i].flags);
locked++;
set_bit(R5_Wantwrite, &sh->dev[i].flags);
}
clear_bit(STRIPE_EXPANDING, &sh->state);
} else if (expanded) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
}
if (expanding && locked == 0) {
/* We have read all the blocks in this stripe and now we need to
* copy some of them into a target stripe for expand.
*/
clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
for (i=0; i< sh->disks; i++)
if (i != sh->pd_idx) {
int dd_idx, pd_idx, j;
struct stripe_head *sh2;
sector_t bn = compute_blocknr(sh, i);
sector_t s = raid5_compute_sector(bn, conf->raid_disks,
conf->raid_disks-1,
&dd_idx, &pd_idx, conf);
sh2 = get_active_stripe(conf, s, conf->raid_disks, pd_idx, 1);
if (sh2 == NULL)
/* so far only the early blocks of this stripe
* have been requested. When later blocks
* get requested, we will try again
*/
continue;
if(!test_bit(STRIPE_EXPANDING, &sh2->state) ||
test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
/* must have already done this block */
release_stripe(sh2);
continue;
}
memcpy(page_address(sh2->dev[dd_idx].page),
page_address(sh->dev[i].page),
STRIPE_SIZE);
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
for (j=0; j<conf->raid_disks; j++)
if (j != sh2->pd_idx &&
!test_bit(R5_Expanded, &sh2->dev[j].flags))
break;
if (j == conf->raid_disks) {
set_bit(STRIPE_EXPAND_READY, &sh2->state);
set_bit(STRIPE_HANDLE, &sh2->state);
}
release_stripe(sh2);
}
}
spin_unlock(&sh->lock);
while ((bi=return_bi)) {
int bytes = bi->bi_size;
return_bi = bi->bi_next;
bi->bi_next = NULL;
bi->bi_size = 0;
bi->bi_end_io(bi, bytes, 0);
}
for (i=disks; i-- ;) {
int rw;
struct bio *bi;
mdk_rdev_t *rdev;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
rw = 1;
else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = 0;
else
continue;
bi = &sh->dev[i].req;
bi->bi_rw = rw;
if (rw)
bi->bi_end_io = raid5_end_write_request;
else
bi->bi_end_io = raid5_end_read_request;
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev)
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
if (rdev) {
if (syncing || expanding || expanded)
md_sync_acct(rdev->bdev, STRIPE_SECTORS);
bi->bi_bdev = rdev->bdev;
PRINTK("for %llu schedule op %ld on disc %d\n",
(unsigned long long)sh->sector, bi->bi_rw, i);
atomic_inc(&sh->count);
bi->bi_sector = sh->sector + rdev->data_offset;
bi->bi_flags = 1 << BIO_UPTODATE;
bi->bi_vcnt = 1;
bi->bi_max_vecs = 1;
bi->bi_idx = 0;
bi->bi_io_vec = &sh->dev[i].vec;
bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
bi->bi_io_vec[0].bv_offset = 0;
bi->bi_size = STRIPE_SIZE;
bi->bi_next = NULL;
if (rw == WRITE &&
test_bit(R5_ReWrite, &sh->dev[i].flags))
atomic_add(STRIPE_SECTORS, &rdev->corrected_errors);
generic_make_request(bi);
} else {
if (rw == 1)
set_bit(STRIPE_DEGRADED, &sh->state);
PRINTK("skip op %ld on disc %d for sector %llu\n",
bi->bi_rw, i, (unsigned long long)sh->sector);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
}
}
}
Generated by GNU enscript 1.6.4.