
Soft RAID5 Reading Notes, Part 4: Handling Read Errors on Member Disks

2016-07-20 17:36
Errors can occur while reading from a member disk. The error is reported by the lower layer, and if it cannot be recovered, the member disk is eventually marked as faulty. This part starts from the raid5_end_read_request() function.



static void raid5_end_read_request(struct bio *bi, int error)
{
    struct stripe_head *sh = bi->bi_private;
    raid5_conf_t *conf = sh->raid_conf;
    int disks = sh->disks, i;
    /* BIO_UPTODATE is the flag with which the lower layer reports successful completion */
    int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
    char b[BDEVNAME_SIZE];
    mdk_rdev_t *rdev;

    /* find the member-disk index this request belongs to */
    for (i = 0; i < disks; i++)
        if (bi == &sh->dev[i].req)
            break;

    pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
        (unsigned long long)sh->sector, i, atomic_read(&sh->count),
        uptodate);
    if (i == disks) {
        BUG();
        return;
    }

    if (uptodate) {    /* the data was read successfully by the lower layer */
        set_bit(R5_UPTODATE, &sh->dev[i].flags);
        if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
            rdev = conf->disks[i].rdev;
            printk_rl(KERN_INFO "raid5:%s: read error corrected"
                  " (%lu sectors at %llu on %s)\n",
                  mdname(conf->mddev), STRIPE_SECTORS,
                  (unsigned long long)(sh->sector
                               + rdev->data_offset),
                  bdevname(rdev->bdev, b));
            clear_bit(R5_ReadError, &sh->dev[i].flags);
            clear_bit(R5_ReWrite, &sh->dev[i].flags);
        }
        if (atomic_read(&conf->disks[i].rdev->read_errors))
            atomic_set(&conf->disks[i].rdev->read_errors, 0);
    } else {    /* BIO_UPTODATE is not set, i.e. a read error occurred */
        const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
        int retry = 0;
        rdev = conf->disks[i].rdev;

        clear_bit(R5_UPTODATE, &sh->dev[i].flags);
        atomic_inc(&rdev->read_errors);
        if (conf->mddev->degraded)    /* the array is already degraded, so the error cannot be repaired; a spare should be considered */
            printk_rl(KERN_WARNING
                  "raid5:%s: read error not correctable "
                  "(sector %llu on %s).\n",
                  mdname(conf->mddev),
                  (unsigned long long)(sh->sector
                               + rdev->data_offset),
                  bdn);
        else if (test_bit(R5_ReWrite, &sh->dev[i].flags))    /* a rewrite was already attempted, yet the re-read still failed */
            /* Oh, no!!! */
            printk_rl(KERN_WARNING
                  "raid5:%s: read error NOT corrected!! "
                  "(sector %llu on %s).\n",
                  mdname(conf->mddev),
                  (unsigned long long)(sh->sector
                               + rdev->data_offset),
                  bdn);
        else if (atomic_read(&rdev->read_errors)
             > conf->max_nr_stripes)    /* more read errors than stripes: some stripe_head's dev must have seen two read errors, and RAID5 cannot tolerate two failed devs */
            printk(KERN_WARNING
                   "raid5:%s: Too many read errors, failing device %s.\n",
                   mdname(conf->mddev), bdn);
        else
            retry = 1;
        if (retry)    /* the read error can be handled via reconstruction and BSR */
            set_bit(R5_ReadError, &sh->dev[i].flags);
        else {    /* the read error cannot be handled; call md_error() to report it upward */
            clear_bit(R5_ReadError, &sh->dev[i].flags);
            clear_bit(R5_ReWrite, &sh->dev[i].flags);
            md_error(conf->mddev, rdev);
        }
    }
    rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
    clear_bit(R5_LOCKED, &sh->dev[i].flags);
    set_bit(STRIPE_HANDLE, &sh->state);
    /* with STRIPE_HANDLE set, release_stripe() queues the stripe for the next round of processing */
    release_stripe(sh);
}


In raid5_end_read_request(), the BIO_UPTODATE flag reported by the lower layer determines whether the data was read correctly. If the read failed but the error can be handled via BSR, the dev's R5_ReadError flag is set, indicating that this read failed and that the data can be recovered by XOR-ing the data read from the other disks; otherwise, md_error() is called to report the error upward. Finally, release_stripe() wakes the raid5d thread for further processing.
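To make the error path easier to follow, here is the retry decision condensed into a standalone sketch; should_retry_read() and its parameters are illustrative names, not kernel identifiers:

/* Condensed from the else branch of raid5_end_read_request(); illustrative only. */
static int should_retry_read(int degraded, int rewrite_tried,
                             int read_errors, int max_nr_stripes)
{
    if (degraded)                      /* array already degraded: unrecoverable */
        return 0;
    if (rewrite_tried)                 /* BSR rewrite already attempted, re-read failed */
        return 0;
    if (read_errors > max_nr_stripes)  /* too many errors: fail the device */
        return 0;
    return 1;                          /* recoverable: reconstruct from the other disks */
}

When this returns 1, the real code sets R5_ReadError; otherwise it calls md_error().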

raid5d()-->handle_stripe()-->handle_stripe5():
for (i = disks; i--; ) {
    mdk_rdev_t *rdev;

    dev = &sh->dev[i];
    clear_bit(R5_Insync, &dev->flags);
    .......
    if (test_bit(R5_Wantfill, &dev->flags))
        s.to_fill++;
    else if (dev->toread)
        s.to_read++;
    ........
}

if (s.to_read || s.non_overwrite ||
    (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
    handle_stripe_fill5(sh, &s, disks);    // step 1
if (s.ops_request)
    raid_run_ops(sh, s.ops_request);    // step 2


In handle_stripe5(), the s.to_read counter is incremented to count the member disks that need to be read. Then handle_stripe_fill5() is called to process them:
static void handle_stripe_fill5(struct stripe_head *sh,
            struct stripe_head_state *s, int disks)
{
    int i;

    /* look for blocks to read/compute, skip this if a compute
     * is already in flight, or if the stripe contents are in the
     * midst of changing due to a write
     */
    if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
        !sh->reconstruct_state)    /* the sh is not running a compute, not checking, and not reconstructing */
        for (i = disks; i--; )
            if (fetch_block5(sh, s, i, disks))
                break;
    set_bit(STRIPE_HANDLE, &sh->state);
}


This function looks for blocks whose data needs to be read from disk or obtained by computation. After verifying that the sh is not currently running a compute, a check, or a reconstruction, it calls fetch_block5() for every member disk:
static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
            int disk_idx, int disks)
{
    struct r5dev *dev = &sh->dev[disk_idx];
    struct r5dev *failed_dev = &sh->dev[s->failed_num];

    /* is the data in this block needed, and can we get it? */
    if (!test_bit(R5_LOCKED, &dev->flags) &&
        !test_bit(R5_UPTODATE, &dev->flags) &&
        (dev->toread ||
         (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
         s->syncing || s->expanding ||
         (s->failed &&
          (failed_dev->toread ||
           (failed_dev->towrite &&
            !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
        /* We would like to get this block, possibly by computing it,
         * otherwise read it if the backing disk is insync
         */
        if ((s->uptodate == disks - 1) &&
            (s->failed && disk_idx == s->failed_num)) {
            set_bit(STRIPE_COMPUTE_RUN, &sh->state);
            set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
            set_bit(R5_Wantcompute, &dev->flags);
            sh->ops.target = disk_idx;
            sh->ops.target2 = -1;
            s->req_compute = 1;
            /* Careful: from this point on 'uptodate' is in the eye
             * of raid_run_ops which services 'compute' operations
             * before writes. R5_Wantcompute flags a block that will
             * be R5_UPTODATE by the time it is needed for a
             * subsequent operation.
             */
            s->uptodate++;
            return 1; /* uptodate + compute == disks */
        } else if (test_bit(R5_Insync, &dev->flags)) {
            ......
        }


This function checks whether the number of up-to-date disks equals the total number of disks minus one (i.e. exactly one disk is not up to date), exactly one disk has failed, and that failed disk is the one we want to read. If so, it sets the stripe's STRIPE_COMPUTE_RUN state bit, sets the STRIPE_OP_COMPUTE_BLK (compute a block) operation request, sets the disk's R5_Wantcompute flag, records the target disk of the operation, and returns 1. handle_stripe_fill5() then breaks out of its loop and returns to handle_stripe5().
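The reason uptodate == disks - 1 is sufficient: in RAID5 the parity block is the XOR of all data blocks in the stripe, so any single missing block equals the XOR of the remaining disks - 1 blocks (data plus parity). A minimal userspace sketch of this principle, not kernel code:

#include <stddef.h>

/* Recompute one missing RAID5 block as the byte-wise XOR of all
 * surviving blocks (the other data blocks plus the parity block). */
static void xor_reconstruct(unsigned char *dest, unsigned char * const srcs[],
                            int count, size_t len)
{
    size_t off;
    int j;

    for (off = 0; off < len; off++) {
        unsigned char v = 0;
        for (j = 0; j < count; j++)
            v ^= srcs[j][off];
        dest[off] = v;    /* D_missing = D0 ^ D1 ^ ... ^ P */
    }
}

This is exactly the computation that ops_run_compute5() below offloads to the asynchronous XOR engine.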

Back in handle_stripe5(), raid_run_ops()-->__raid_run_ops() is then called:
if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
    if (level < 6)
        tx = ops_run_compute5(sh, percpu);
    ........
}


This code checks the stripe's requested operations and dispatches to ops_run_compute5():
static struct dma_async_tx_descriptor *
ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
{
    ........

    for (i = disks; i--; )
        if (i != target)
            xor_srcs[count++] = sh->dev[i].page;

    atomic_inc(&sh->count);

    init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
              ops_complete_compute, sh, to_addr_conv(sh, percpu));
    if (unlikely(count == 1))
        tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
    else
        tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
    return tx;
}


This function first walks all of the stripe's devs and collects the pages of every member disk except the failed target into the xor_srcs array, registers ops_complete_compute as the completion callback, and performs the XOR asynchronously via async_xor(). When the asynchronous operation completes, ops_complete_compute() is called:
static void ops_complete_compute(void *stripe_head_ref)
{
    ........
    /* mark the computed target(s) as uptodate */
    mark_target_uptodate(sh, sh->ops.target);
    mark_target_uptodate(sh, sh->ops.target2);

    clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
    if (sh->check_state == check_state_compute_run)
        sh->check_state = check_state_compute_result;
    set_bit(STRIPE_HANDLE, &sh->state);
    release_stripe(sh);
}


Here mark_target_uptodate() sets the member disk's state to R5_UPTODATE and clears R5_Wantcompute. At this point the failed disk's data has been recovered by XOR-ing the other member disks. release_stripe()-->raid5d()-->handle_stripe()-->handle_stripe5() then starts the next round of processing, which from here on proceeds exactly like a normal read and is not repeated.
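For reference, the helper is short; in kernels of this vintage it looks approximately like this (note the BUG_ON if the target was not actually flagged for compute):

static void mark_target_uptodate(struct stripe_head *sh, int target)
{
    struct r5dev *tgt;

    if (target < 0)
        return;

    tgt = &sh->dev[target];
    set_bit(R5_UPTODATE, &tgt->flags);
    BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
    clear_bit(R5_Wantcompute, &tgt->flags);
}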

The story does not end here. Although the read can now return correct data, the sector on the member disk may still be damaged, which can be handled via BSR. BSR (Bad Sector Recovery) is the bad-sector remapping feature that modern disks provide: when a bad sector is found during a read or write, the disk's internal firmware automatically allocates a spare sector to replace it and records the physical location and the replacement in its defect table. Relying on this, the Linux kernel writes the recomputed data back to the member disk.

Accordingly, handle_stripe5() contains the following handling:
  
if (s.failed == 1 && !conf->mddev->ro &&                        /* exactly one failed disk */
    test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)        /* it failed with a read error */
    && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)       /* no request is currently outstanding on it */
    && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)      /* its data is up to date, i.e. it was not read from disk but reconstructed by the XOR computation */
    ) {
    dev = &sh->dev[s.failed_num];
    if (!test_bit(R5_ReWrite, &dev->flags)) {
        set_bit(R5_Wantwrite, &dev->flags);    /* request a write of the reconstructed data */
        set_bit(R5_ReWrite, &dev->flags);      /* mark that the rewrite has been issued */
        set_bit(R5_LOCKED, &dev->flags);
        s.locked++;
    } else {
        /* let's read it back */
        set_bit(R5_Wantread, &dev->flags);
        set_bit(R5_LOCKED, &dev->flags);
        s.locked++;
    }
}


As the code shows, handle_stripe5() sets the failed dev's R5_ReWrite and R5_Wantwrite flags, so the reconstructed data is written back to the faulty sector of the member disk. On the next pass R5_ReWrite is already set, so the else branch issues a read-back (R5_Wantread) to verify the rewrite; if that read succeeds, raid5_end_read_request() clears R5_ReadError and R5_ReWrite and logs "read error corrected", and if it fails again the error is reported via md_error().

In summary, Linux soft RAID5 can handle a single read error: the missing data is recovered by XOR-ing the other member disks, and finally the disk's BSR feature is exploited to write the recovered data back to the disk that produced the read error.

Tags: raid