最近遇到一例3.10内核的crash:
[ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 4109.683440] fas_readwriter D ffff881fffd147c0 0 46745 43298 0x00000080 [ 4109.683444] ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8 [ 4109.683469] ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20 [ 4109.683487] ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28 [ 4109.683491] Call Trace: [ 4109.683503] [<ffffffff8163f959>] schedule_preempt_disabled+0x29/0x70 [ 4109.683507] [<ffffffff8163d415>] __mutex_lock_slowpath+0xc5/0x1c0 [ 4109.683512] [<ffffffff811eaff7>] ? unlazy_walk+0x87/0x140 [ 4109.683515] [<ffffffff8163c87f>] mutex_lock+0x1f/0x2f [ 4109.683522] [<ffffffff81636235>] lookup_slow+0x33/0xa7 [ 4109.683525] [<ffffffff811f0483>] path_lookupat+0x773/0x7a0 [ 4109.683531] [<ffffffff81274454>] ? SYSC_semtimedop+0x264/0xd10 [ 4109.683537] [<ffffffff810c0cbb>] ? should_numa_migrate_memory+0x5b/0x150 [ 4109.683542] [<ffffffff811c3345>] ? kmem_cache_alloc+0x35/0x1d0 [ 4109.683545] [<ffffffff811f117f>] ? getname_flags+0x4f/0x1a0 [ 4109.683548] [<ffffffff811f04db>] filename_lookup+0x2b/0xc0 [ 4109.683550] [<ffffffff811f22a7>] user_path_at_empty+0x67/0xc0 [ 4109.683556] [<ffffffff811993c7>] ? handle_mm_fault+0x607/0xf40 [ 4109.683559] [<ffffffff811f2311>] user_path_at+0x11/0x20 [ 4109.683564] [<ffffffff811e5b53>] vfs_fstatat+0x63/0xc0 [ 4109.683567] [<ffffffff811e60be>] SYSC_newstat+0x2e/0x60 [ 4109.683570] [<ffffffff81273ced>] ? do_smart_update+0x11d/0x130 [ 4109.683575] [<ffffffff8101c899>] ? read_tsc+0x9/0x10 [ 4109.683580] [<ffffffff8110d336>] ? __audit_syscall_exit+0x1e6/0x280 [ 4109.683583] [<ffffffff811e639e>] SyS_newstat+0xe/0x10 [ 4109.683586] [<ffffffff81649909>] system_call_fastpath+0x16/0x1b
看来是该进程在一个mutex上处于不可中断(uninterruptible,D状态)的等待,且等待时间超过了hung task检测的阈值。
看堆栈是stat调用访问文件的元数据,
crash> struct nameidata ffff881f320bfdc0 struct nameidata { path = { mnt = 0xffff883fcb7e2820, dentry = 0xffff881f502552c0 }, last = { { { hash = 3495247374, len = 27 }, hash_len = 119459364366 }, name = 0xffff881040c56031 "00fe_0002_011712b3_00000001"-----------访问的文件名 }, root = { mnt = 0xffff883fccff1020, dentry = 0xffff881fcc7ff980 }, inode = 0xffff881571308e78, flags = 1, seq = 2, last_type = 0, depth = 0, saved_names = {0x7f30b11923e8 <Address 0x7f30b11923e8 out of bounds>, 0xffff881f8a6a2c40 "g", 0xffff881f46cc0438 "", 0xffff881fce793e80 "\260\334\341W\037\210\377\377\310\354\242o\037\210\377\377\020E-D\037\210\377\377\200\231\001\201\377\377\377\377\020\363\031\201\377\377\377\377", 0x80 <Address 0x80 out of bounds>, 0xffff881f320bfec0 "\260\004|\a(\177", 0xffffffff811993c7 <handle_mm_fault+1543> "A\211\302\351\327\375\377\377\220I\211\370L\211\341H\211\332L\211\356L\211\367D\211M\300\350\310]\003", 0x5cd06a0 <Address 0x5cd06a0 out of bounds>, 0x0} }
files 看不到这个文件,说明该进程没有open 这个file。
要找到对应的mutex:
crash> struct nameidata.path ffff881f320bfdc0 path = { mnt = 0xffff883fcb7e2820, dentry = 0xffff881f502552c0 } crash> dentry.d_inode 0xffff881f502552c0 d_inode = 0xffff881571308e78 crash> struct -x dentry.d_inode struct dentry { [0x30] struct inode *d_inode; } crash> inode.i_mutex 0xffff881571308e78 i_mutex = { count = { counter = -1 }, wait_lock = { { rlock = { raw_lock = { { head_tail = 100009462, tickets = { head = 1526, tail = 1526 } } } } } }, wait_list = { next = 0xffff8819fc5b3c90, prev = 0xffff881f5097bc40 }, owner = 0xffff881f52764500,-------spinlock在没开启debug的情况下不会设置owner,而mutex一般看owner就知道谁持有 { osq = 0x0, __UNIQUE_ID_rh_kabi_hide1 = { spin_mlock = 0x0 }, {<No data fields>} } }
然后看一下owner为啥拿了互斥量不放:
task 0xffff881f52764500 PID: 47270 TASK: ffff881f52764500 CPU: 25 COMMAND: "nginx" struct task_struct { state = 2, stack = 0xffff881a5c19c000,
查看它的堆栈:
crash> bt -f 47270 PID: 47270 TASK: ffff881f52764500 CPU: 25 COMMAND: "nginx" #0 [ffff881a5c19f7f0] __schedule at ffffffff8163df9b ffff881a5c19f7f8: 0000000000000082 ffff881a5c19ffd8 ffff881a5c19f808: ffff881a5c19ffd8 ffff881a5c19ffd8 ffff881a5c19f818: 00000000000147c0 ffff881fd3267300 ffff881a5c19f828: ffff8813ee889a58 ffff8813ee889a60 ffff881a5c19f838: 7fffffffffffffff ffff881f52764500 ffff881a5c19f848: ffff881fd042b500 ffff881a5c19f860 ffff881a5c19f858: ffffffff8163e879 #1 [ffff881a5c19f858] schedule at ffffffff8163e879 ffff881a5c19f860: ffff881a5c19f908 ffffffff8163c329 #2 [ffff881a5c19f868] schedule_timeout at ffffffff8163c329 ffff881a5c19f870: 7fffffffffffffff ffff881a5c19f900 ffff881a5c19f880: 0000000000000000 0000000000000000 ffff881a5c19f890: 0000000000000000 ffff8813ee889980 ffff881a5c19f8a0: ffff881a5c19f8b8 ffffffff812cea44 ffff881a5c19f8b0: 0000000000000001 ffff881a5c19f968 ffff881a5c19f8c0: ffffffffa07a51c4 000010005c19f8d8 ffff881a5c19f8d0: 0000000000000000 00000000edb89915 ffff881a5c19f8e0: ffff8813ee889a58 ffff8813ee889a60 ffff881a5c19f8f0: 7fffffffffffffff ffff881f52764500 ffff881a5c19f900: ffff881fd042b500 ffff881a5c19f968 ffff881a5c19f910: ffffffff8163ec46 #3 [ffff881a5c19f910] wait_for_completion at ffffffff8163ec46 ffff881a5c19f918: 0000000000000001 ffff881f52764500 ffff881a5c19f928: ffffffff810b9930 ffff8813ee889a68 ffff881a5c19f938: ffff8813ee889a68 00000000edb89915 ffff881a5c19f948: ffff8813ee889980 0000000000000001 ffff881a5c19f958: ffffffffa07a6ce3 0000000000000000 ffff881a5c19f968: ffff881a5c19f990 ffffffffa07a6b74 #4 [ffff881a5c19f970] xfs_buf_submit_wait at ffffffffa07a6b74 [xfs]---------同步的buffer io提交函数 ffff881a5c19f978: ffff8813ee889980 0000000000000001-----------ffff8813ee889980 为xfs_buf ffff881a5c19f988: ffffffffa07d3829 ffff881a5c19f9a0 ffff881a5c19f998: ffffffffa07a6ce3 #5 [ffff881a5c19f998] _xfs_buf_read at ffffffffa07a6ce3 [xfs] ffff881a5c19f9a0: ffff881a5c19f9e0 ffffffffa07a6dda #6 [ffff881a5c19f9a8] xfs_buf_read_map at 
ffffffffa07a6dda [xfs]----------,走这个分支说明tp为NULL,根据map找xfs_buf,找不到则申请一个,扇区为1960485944,长度为8 ffff881a5c19f9b0: ffffffffa07e6cc0 0000000000000000 ffff881a5c19f9c0: 0000000000000000 ffff883f001f7800 ffff881a5c19f9d0: ffff881a5c19fa60 ffff881fd042b500 ffff881a5c19f9e0: ffff881a5c19fa20 ffffffffa07d3829 #7 [ffff881a5c19f9e8] xfs_trans_read_buf_map at ffffffffa07d3829 [xfs]----根据trans和xfs_buf_map调用xfs_buf_read_map 来获取xfs_buf ffff881a5c19f9f0: 00000000edb89915 ffff881a5c19fb10 ffff881a5c19fa00: 0000000000000000 ffff881571308cc0 ffff881a5c19fa10: ffffffffa07e6cc0 ffff881a5c19fa60 ffff881a5c19fa20: ffff881a5c19faa0 ffffffffa0786204 #8 [ffff881a5c19fa28] xfs_da_read_buf at ffffffffa0786204 [xfs] ffff881a5c19fa30: ffff881a5c19fa50 ffffffffa07e6cc0 ffff881a5c19fa40: 0000000000000000 00000001a07e6650 ffff881a5c19fa50: 0000000000000000 ffff881a5c19fa60 ffff881a5c19fa60: 0000000074daa438 ffff881a00000008 ffff881a5c19fa70: 00000000edb89915 0000000000000000 ffff881a5c19fa80: ffff881a5c19fb10 ffff8810cfbd1b00 ffff881a5c19fa90: 0000000000000016 000000000000006c ffff881a5c19faa0: ffff881a5c19fad0 ffffffffa078baa6 #9 [ffff881a5c19faa8] xfs_dir3_data_read at ffffffffa078baa6 [xfs] ffff881a5c19fab0: ffffffffa07e6cc0 ffff881f5a13ad00 ffff881a5c19fac0: ffff881f5a13ad00 ffff8814905453a0 ffff881a5c19fad0: ffff881a5c19fb58 ffffffffa078f99e #10 [ffff881a5c19fad8] xfs_dir2_leafn_lookup_for_entry at ffffffffa078f99e [xfs] ffff881a5c19fae0: 0000000172bc1800 ffff881f5a13ad58 ffff881a5c19faf0: ffff883f001f7800 ffff881571308cc0 ffff881a5c19fb00: 0000000000000000 ffff881400000016 ffff881a5c19fb10: 0000000000000000 0080001d00800010 ffff881a5c19fb20: ffff000501873dff 00000000edb89915 ffff881a5c19fb30: ffff881f5a13ad40 ffff881f5a13ad00 ffff881a5c19fb40: ffff881f5a13ad10 ffff8810cfbd1b00 ffff881a5c19fb50: ffff881f5a13ad58 ffff881a5c19fb68 ffff881a5c19fb60: ffffffffa0791177 #11 [ffff881a5c19fb60] xfs_dir2_leafn_lookup_int at ffffffffa0791177 [xfs] ffff881a5c19fb68: ffff881a5c19fbc8 ffffffffa0787726 #12 
[ffff881a5c19fb70] xfs_da3_node_lookup_int at ffffffffa0787726 [xfs] ffff881a5c19fb78: ffff881a5c19fbdc ffff881a5c19fcb8 ffff881a5c19fb88: 0000000000000000 ffff000100643ebe ffff881a5c19fb98: 00000000edb89915 ffff8810cfbd1b00 ffff881a5c19fba8: 0000000000000000 ffff8810cfbd1b00 ffff881a5c19fbb8: ffff881f5a13ad00 ffff881a5c19fcb8 ffff881a5c19fbc8: ffff881a5c19fc08 ffffffffa0791ded #13 [ffff881a5c19fbd0] xfs_dir2_node_lookup at ffffffffa0791ded [xfs] ffff881a5c19fbd8: ffff881a5c19fc08 00000000edb89915 ffff881a5c19fbe8: ffff8810cfbd1b00 0000000000000000 ffff881a5c19fbf8: 0000000000000000 0000000000000000 ffff881a5c19fc08: ffff881a5c19fc58 ffffffffa07897b5 #14 [ffff881a5c19fc10] xfs_dir_lookup at ffffffffa07897b5 [xfs] ffff881a5c19fc18: ffff881a5c19fc68 0000000000000008 ffff881a5c19fc28: 00000000edb89915 0000000000000008 ffff881a5c19fc38: ffff881571308cc0 ffff881a5c19fcb8 ffff881a5c19fc48: 0000000000000000 ffff881a5c19fcb0 ffff881a5c19fc58: ffff881a5c19fca0 ffffffffa07b7bb6 #15 [ffff881a5c19fc60] xfs_lookup at ffffffffa07b7bb6 [xfs] ffff881a5c19fc68: ffff881a5c19fc90 00000000edb89915 ffff881a5c19fc78: ffff881fcd1038c0 0000000000008000 ffff881a5c19fc88: ffff881a5c19fdf0 ffff881f502552c0 ffff881a5c19fc98: ffff881a5c19fe50 ffff881a5c19fcd8 ffff881a5c19fca8: ffffffffa07b4a0b #16 [ffff881a5c19fca8] xfs_vn_lookup at ffffffffa07b4a0b [xfs] ffff881a5c19fcb0: ffff881a5c19fe60 ffff881fcd1038f8 ffff881a5c19fcc0: 000000000000001b 00000000edb89915 ffff881a5c19fcd0: ffff881fcd1038c0 ffff881a5c19fcf8 ffff881a5c19fce0: ffffffff811eac3d #17 [ffff881a5c19fce0] lookup_real at ffffffff811eac3d ffff881a5c19fce8: ffff881f502552c0 ffff881a5c19ff28 ffff881a5c19fcf8: ffff881a5c19fda8 ffffffff811ee813 #18 [ffff881a5c19fd00] do_last at ffffffff811ee813---------------do_last-->lookup_open-->lookup_real,对应2905行 ffff881a5c19fd08: ffff881a5c19ff58 ffffffff81274abd ffff881a5c19fd18: 0000000000000000 ffff881f52764500 ffff881a5c19fd28: 0000001f52764500 0000000000000000 ffff881a5c19fd38: ffff881571308e78 
ffff88041076d000------------ ffff881571308e78对应的inode ffff881a5c19fd48: ffff881a5c19fde4 ffff881f502552c0-------------父目录的dentry ffff881a5c19fd58: ffff880aed4d5700 ffff880000000024 ffff881a5c19fd68: 01ff880aed4d5700 ffff881a5c19fd80 ffff881a5c19fd78: 00000000edb89915 ffff881a5c19fe50 ffff881a5c19fd88: ffff880aed4d5700 ffff88041076d000 ffff881a5c19fd98: ffff881a5c19ff28 ffff881f52764500 ffff881a5c19fda8: ffff881a5c19fe40 ffffffff811f0be2 #19 [ffff881a5c19fdb0] path_openat at ffffffff811f0be2 ffff881a5c19fdb8: ffffea00515463b0 0000000000000000 ffff881a5c19fdc8: 0000000000000200 ffff883fc04c7080 ffff881a5c19fdd8: 0000004100000000 00000000000000a9 ffff881a5c19fde8: 0000000000000000 00007f2c5676d000 ffff881a5c19fdf8: ffffea00d1d8c2c0 00000000edb89915 ffff881a5c19fe08: 00007f2c5676dff0 00000000edb89915 ffff881a5c19fe18: 00000000ffffff9c ffff88041076d000 ffff881a5c19fe28: ffff881a5c19ff28 0000000000000001 ffff881a5c19fe38: 00000000000036cc ffff881a5c19ff10 ffff881a5c19fe48: ffffffff811f23ab #20 [ffff881a5c19fe48] do_filp_open at ffffffff811f23ab ffff881a5c19fe50: ffff883fcb7e2820 ffff881f502552c0 ffff881a5c19fe60: 0000001bc5e44c0c ffff88041076d031 ffff881a5c19fe70: ffff883fccff1020 ffff881fcc7ff980 ffff881a5c19fe80: ffff881571308e78 0000000200000101 ffff881a5c19fe90: 0000000000000000 0000000000000001 ffff881a5c19fea0: 00007f2c64217510 0000000000000000 ffff881a5c19feb0: 0000000000000000 ffff881a5c19ff00 ffff881a5c19fec0: ffffffff811fef47 ffff883fc175c3c0 ffff881a5c19fed0: 0010000000008000 0000000000000001 ffff881a5c19fee0: 0000000000008000 00000000edb89915 ffff881a5c19fef0: 0000000000000001 00000000000000e3 ffff881a5c19ff00: 00000000ffffff9c ffff88041076d000 ffff881a5c19ff10: ffff881a5c19ff68 ffffffff811dfd53 #21 [ffff881a5c19ff18] do_sys_open at ffffffff811dfd53 ffff881a5c19ff20: ffff881f49292a48 ffff000000008000 ffff881a5c19ff30: 0000010000000024 00000000edb89915 ffff881a5c19ff40: 0000000000000000 00007f2c64217664 ffff881a5c19ff50: 0000000000000002 00007f2c64217653 
ffff881a5c19ff60: 00000000000036cc ffff881a5c19ff78 ffff881a5c19ff70: ffffffff811dfe6e #22 [ffff881a5c19ff70] sys_open at ffffffff811dfe6e ffff881a5c19ff78: 00007f2c64217510 ffffffff81649909 #23 [ffff881a5c19ff80] system_call_fastpath at ffffffff81649909 RIP: 00007f2e31a3dc3d RSP: 00007f2c64217358 RFLAGS: 00010202 RAX: 0000000000000002 RBX: ffffffff81649909 RCX: 0000000000010000 RDX: 0000000000000005 RSI: 0000000000000000 RDI: 00007f2c64217510 RBP: 00007f2c64217510 R8: 000000000000ffff R9: 000000000000001f R10: 00007f2e30df7000 R11: 0000000000000293 R12: ffffffff811dfe6e R13: ffff881a5c19ff78 R14: 00000000000036cc R15: 00007f2c64217653 ORIG_RAX: 0000000000000002 CS: 0033 SS: 002b crash>
该进程在一个完成量上等待:
等待的完成量: crash>completion ffff8813ee889a58 struct completion { done = 0,----------------至今未完成,如果完成则为1,由complete 函数修改,并唤醒等待的task wait = { lock = { { rlock = { raw_lock = { { head_tail = 131074, tickets = { head = 2, tail = 2 } } } } } }, task_list = { next = 0xffff881a5c19f930, prev = 0xffff881a5c19f930 } } }
这个完成量其实就是在等待一个io的完成,
对应的调用链:
xfs_buf_read_map--》_xfs_buf_read--》xfs_buf_submit_wait-->_xfs_buf_ioapply-->xfs_buf_ioapply_map-->submit_bio
_xfs_buf_ioapply返回后,执行wait_for_completion,然后就一直等着了
查看对应的xfs_buf:
crash> xfs_buf 0xffff8813ee889980 struct xfs_buf { b_rbnode = { __rb_parent_color = 18446612139147634816, rb_right = 0x0, rb_left = 0x0 }, b_bn = 1960485944,----------扇区号 b_length = 8, b_hold = { counter = 2 }, b_lru_ref = { counter = 1 }, b_flags = 1048577,也就是0x100001,io未完成,没有置为异步标志:#define XBF_ASYNC (1 << 4),#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */,说明需要等待的同步io b_sema = { lock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } }, count = 0, wait_list = { next = 0xffff8813ee8899b8, prev = 0xffff8813ee8899b8 } }, b_lru = { next = 0xffff8813ee8899c8, prev = 0xffff8813ee8899c8 }, b_lru_flags = 0, b_lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } }, b_io_error = 0, b_waiters = { lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } }, task_list = { next = 0xffff8813ee8899f0, prev = 0xffff8813ee8899f0 } }, b_list = { next = 0xffff8813ee889a00, prev = 0xffff8813ee889a00 }, b_pag = 0xffff881f4b98db00,-------这个xfs_buf关联的ag b_target = 0xffff881fd042b500, b_addr = 0xffff880750046000, b_ioend_work = {----------------------这个是io结束之后,会用到的work_struct data = { counter = 0 }, entry = { next = 0x0, prev = 0x0 }, func = 0x0 }, b_ioend_wq = 0xffff881f5ab7c400,-----------ioend之后,初始化对应的b_ioend_work,然后挂到这个workqueue中去 b_iodone = 0x0,-----------------------为NULL, b_iowait = {---------------这个就是完成量的地址 done = 0, wait = { lock = { { rlock = { raw_lock = { { head_tail = 131074, tickets = { head = 2,------------这把锁曾经吃过一次亏,记忆深刻, tail = 2 } } } } } }, task_list = { next = 0xffff881a5c19f930, prev = 0xffff881a5c19f930 } } }, b_fspriv = 0x0, b_transp = 0x0,----------------关联的xfs_trans,传入的tans为NULL b_pages = 0xffff8813ee889a90, b_page_array = {0xffffea001d401180, 0x0}, b_maps = 0xffff8813ee889aa8, __b_map = { bm_bn = 1960485944,----------扇区,由于一个ag是管理个扇区,所以这个扇区对应的是ag1 bm_len = 8------长度 }, b_map_count = 1, b_io_length = 8, b_pin_count = { counter = 0 }, b_io_remaining = { 
counter = 1------------依然为1 }, b_page_count = 1, b_offset = 0, b_error = 0, b_ops = 0xffffffffa07e6cc0 <xfs_dir3_data_buf_ops>---xfs_buf负责的不同对象,有不同的操作指针
在io正常完成时,回调的 xfs_buf_ioend 会处理xfs_buf的io结果,
if (bp->b_iodone)------我们的buf不满足
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)----我们的buf也不满足
xfs_buf_relse(bp);
else
complete(&bp->b_iowait);----我们的buf该走的流程
xfs_buf是由ag来管理的,xfs使用一棵radix树来管理ag,我们的扇区号除以8就是block编号,每个ag管理的block数是固定的,那么很容易获取到对应的index。
crash> xfs_perag 0xffff881f4b98db00 struct xfs_perag { pag_mount = 0xffff883f001f7800, pag_agno = 1,-------------------------和上面对得上 pag_ref = { counter = 1002 }, pagf_init = 1 '\001', pagi_init = 1 '\001', pagf_metadata = 0 '\000',----是否和元数据关联 pagi_inodeok = 1 '\001', pagf_levels = "\001\001", pagf_flcount = 4, pagf_freeblks = 112266782,------空闲block pagf_longest = 112265313,----最长的空闲块 pagf_btreeblks = 0, pagi_freecount = 39,--------空闲inode个数 pagi_count = 175488,--------已经分配的inode个数 pagl_pagino = 81, pagl_leftrec = 1855879528, pagl_rightrec = 76512, pagb_lock = { { rlock = { raw_lock = { { head_tail = 4587590, tickets = { head = 70, tail = 70 } } } } } }, pagb_tree = { rb_node = 0x0 }, pagf_fstrms = { counter = 0 }, pag_ici_lock = { { rlock = { raw_lock = { { head_tail = 65799148, tickets = { head = 1004, tail = 1004 } } } } } }, pag_ici_root = { height = 5, gfp_mask = 32, rnode = 0xffff88144b2c7909 }, pag_ici_reclaimable = 0, pag_ici_reclaim_lock = { count = { counter = 1 }, wait_lock = { { rlock = { raw_lock = { { head_tail = 0, tickets = { head = 0, tail = 0 } } } } } }, wait_list = { next = 0xffff881f4b98db70, prev = 0xffff881f4b98db70 }, owner = 0x0, { osq = 0x0, __UNIQUE_ID_rh_kabi_hide1 = { spin_mlock = 0x0 }, {<No data fields>} } }, pag_ici_reclaim_cursor = 0, pag_buf_lock = { { rlock = { raw_lock = { { head_tail = 593896294, tickets = { head = 9062, tail = 9062 } } } } } }, pag_buf_tree = { rb_node = 0xffff881576c18180-----------管理xfs_buf的红黑树 }, callback_head = { next = 0x0, func = 0x0 }, pagb_count = 0 }
ag使用红黑树来管理xfs_buf,我进一步确认下我找的xfs_buf在红黑树中:
tree -t rbtree -o xfs_buf.b_rbnode ffff881f4b98dba0 -s xfs_buf.b_bn,b_length ffff881576c18180 b_bn = 2021058616 b_length = 8 ffff881571630d80 b_bn = 1975870104 b_length = 8 ffff88144a547180 b_bn = 1959990760 b_length = 8 ffff881452e61800 b_bn = 1957446840 b_length = 8 ffff883ed4be4a80 b_bn = 1953586592 b_length = 8 ffff883e829a3000 b_bn = 1953509312 b_length = 8 ffff881577099080 b_bn = 1953509298 b_length = 1 ffff8817d2cd1200 b_bn = 1953509297 b_length = 1 ffff883e829a3300 b_bn = 1953509299 b_length = 1 ffff883e85d85680 b_bn = 1953509304 b_length = 8 ffff88144e985680 b_bn = 1953509352 b_length = 8 ffff883e829a2880 b_bn = 1953509320 b_length = 8 ffff883f3752f780 b_bn = 1953509344 b_length = 8 ffff883ed266a700 b_bn = 1953585224 b_length = 8 。。。。 ffff883ececa1b00 b_bn = 1960485840 b_length = 32 ffff8813ee889980---------------我们出问题的xfs_buf b_bn = 1960485944------------我们出问题的起始扇区和长度 b_length = 8 ffff883e71c6c000 b_bn = 1960486072 b_length = 8 。。。。
说明xfs_buf是正常的,那么xfs_buf下发的io请求去哪了呢?我在这个地方因为找错了block_device,发现该block_device中的request_queue的io数为0,陷入了迷茫,后来其他同事发现
找错了设备,我找的是sdae,其实出错的是sde。
根据文件路径,可以查找对应的挂载点对应的设备的request_queue:
block_device.bd_queue 0xffff883fc1048d00 bd_queue = 0xffff883fc1e28828 crash> request_queue 0xffff883fc1e28828 struct request_queue { queue_head = { next = 0xffff883fc1e28828,-----------prev和next都指向自己,说明为空,也就是目前该队列因为调度关系没有io下发的原因,目前没有待下发给驱动执行的io prev = 0xffff883fc1e28828 }, last_merge = 0xffff883e826fcd80, elevator = 0xffff883e86875000,------------------这个里面有cfqd nr_rqs = {0, 17},--------------积压了17个同步io nr_rqs_elvpriv = 17,-----------有17个io需要经过调度器,这个值最大就和nr_rqs的总数相同
request_fn = 0xffffffff81422f70 <scsi_request_fn>,---这个函数会将request从request queue中取出来执行
make_request_fn = 0xffffffff812ce690 <blk_queue_bio>,
prep_rq_fn = 0xffffffff814213c0 <scsi_prep_fn>,
merge_bvec_fn = 0x0,
softirq_done_fn = 0xffffffff81422e20 <scsi_softirq_done>,
rq_timed_out_fn = 0xffffffff8141ec00 <scsi_times_out>,-------当队列中的request超时之后被调用,
dma_drain_needed = 0x0,
lld_busy_fn = 0xffffffff81421ad0 <scsi_lld_busy>,
。。。
nr_sorted = 11,-----ELEVATOR_INSERT_SORT 方式加入到队列中的数量
in_flight = {0, 5},----有5个in_flight的同步io没有返回,其实有6个io request,但其中一个不是REQ_TYPE_FS,所以不会统计在 in_flight中
rq_timeout = 30000,--------1个request 30s的超时时间(单位是jiffies,HZ=1000时30000个jiffies即30秒),会在blk_add_timer中赋值给 req->timeout
timeout_list = {------------管理所有下发给驱动的request的超时
next = 0xffff883e9d3f7ed0,
prev = 0xffff883e9d24ecd0
},
我们目前知道下发io的xfs_buf,但是bio是动态申请的,怎么知道我们当前的这个io下发了多长时间呢?
在 xfs_buf_ioapply_map函数中,bio的申请如下:
static void xfs_buf_ioapply_map( struct xfs_buf *bp, int map, int *buf_offset, int *count, int rw) { 。。。 bio = bio_alloc(GFP_NOIO, nr_pages); bio->bi_bdev = bp->b_target->bt_bdev; bio->bi_sector = sector; bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; 。。。}
可以看出,bio的bi_private成员就是我们的xfs_buf,而且bi_sector成员就是我们的扇区号,既然我们知道扇区号,可以通过search 扇区的方式找到对应的bio为 0xffff881490a76000
request为 ffff8813ee88b900:
crash> struct bio.bi_sector,bi_private,bi_bdev 0xffff881490a76000 bi_sector = 1960485944 bi_private = 0xffff8813ee889980 bi_bdev = 0xffff883fc1048d00 crash> request.start_time,__sector,bio ffff8813ee88b900 start_time = 4298780289----------io生成时的jiffies __sector = 1960485944 bio = 0xffff881490a76000
crash> p jiffies
jiffies = $2 = 4298781359--------------当前的jiffies
可以看出,我们这个io的生成时间和当前的jiffies相差很小(4298781359-4298780289=1070个jiffies),也就是1秒钟左右。该进程虽然拿到了锁,但是它并不是持有了1200s,很大的可能是它被唤醒后抢到了锁,然后提交了io,目前正在正常地等待io完成。它算幸运的,因为不幸运的 46745 进程因为等锁已经1200s了,被khungtaskd 内核线程 选中而触发了hung_task_panic 。
不管怎么样,不幸的 46745进程等待mutex已经超过了阈值,有可能是某个进程一直拿锁而不放,也有可能它排在等待队列的后面,锁虽然释放但也没轮到它。mutex本不应该这么慢,
是什么导致了这么慢呢?
crash> cfq_data.active_queue 0xffff883e86877c00 active_queue = 0xffff883e9efc03a0 crash> cfq_queue.dispatch_start 0xffff883e9efc03a0 dispatch_start = 4298776243 crash> p jiffies jiffies = $2 = 4298781359 crash> p 4298781359-4298776243 $3 = 5116
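可以看出,当前active的cfq_queue 距离调度发起的时候已经过去约5秒钟(5116个jiffies,HZ=1000)。作为对比,cfq的同步时间片cfq_slice_sync只有100ms,所以这个queue已经远远超出了正常的服务时间;而一个request设置的超时rq_timeout默认为30s(30000个jiffies),还没有到触发超时处理的时候。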
查看一下request_queue中的hash成员管理的request ,在rq_mergeable 的情况下,request的hash成员是归request queue中的hash管理的。
elevator_queue.hash 0xffff883e86875000 hash = {{ 。。。。。 }, { first = 0xffff883e826fdb80 }, { first = 0xffff883e40708800 }, { first = 0x0 }, { first = 0x0 }, { first = 0xffff8812ae0c0980 }, { 。。。。。 }, { first = 0xffff8812c96efb00 }, { first = 0x0 }, { first = 0x0 }, { first = 0xffff8813ee88b980 }, { 。。。。。 }, { first = 0xffff88014a437800 }, { first = 0x0 }, { first = 0xffff880149d39280 }, { first = 0xffff883e4341e300 }, { 。。。。。 }, { first = 0xffff883e826fce00 }, { 。。。。。
通过一个个查看,确定只有9个io,而 q->nr_sorted为11个,说明有2个io是不能merge的,再加上inflight的 5个,也就是16个io,那为什么request queue中alloced了17个io呢。
那是因为,有一个io没有计入在 in_flight 中。
9个未执行的可以merge的io: crash> request.start_time 0xffff883e826fdb00 start_time = 4298776286 crash> request.start_time 0xffff883e40708780 start_time = 4298780265 crash> request.start_time 0xffff8812ae0c0900 start_time = 4298776286 crash> request.start_time 0xffff8812c96efa80 start_time = 4298780264 crash> request.start_time 0xffff8813ee88b900 start_time = 4298780289 crash> request.start_time 0xffff88014a437780 start_time = 4298780258 crash> request.start_time 0xffff880149d39200 start_time = 4298780285 crash> request.start_time 0xffff883e4341e280 start_time = 4298780285 crash> request.start_time 0xffff883e826fcd80 start_time = 4298776272 2个不能merge的io:
request.hash ffff883e40708c00
hash = {
next = 0x0,
pprev = 0x0
}
request.hash ffff8812ae0c0780
hash = {
next = 0x0,
pprev = 0x0
}
那么等待执行的6个io怎么获取呢?在request_queue的elevator中,我们看到它有10个cfq_queue等待调度,
crash> elevator_queue.elevator_data 0xffff883e86875000 elevator_data = 0xffff883e86877c00 crash> cfq_data 0xffff883e86877c00 struct cfq_data { queue = 0xffff883fc1e28828, grp_service_tree = { rb = { rb_node = 0xffff883f36a09820 }, left = 0xffff883f36a09820, count = 4294964171, min_vdisktime = 4707409920, ttime = { last_end_request = 4295195837, ttime_total = 0, ttime_samples = 0, ttime_mean = 0 } }, root_group = 0xffff883f36a09800, serving_wl_class = BE_WORKLOAD, serving_wl_type = SYNC_WORKLOAD, workload_expires = 4298776270,---------jiffies = $1 = 4298781359,cfqd->workload_expires = jiffies + slice; serving_group = 0xffff883f36a09800,----根据这个当前服务的group,可以遍历其管理的所有cfq_queue prio_trees = {{ rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0xffff881490a87250 }, { rb_node = 0x0 }, { rb_node = 0x0 }, { rb_node = 0x0 }}, busy_queues = 10,-------------有10个queue等待调度 busy_sync_queues = 10, rq_in_driver = 5,----还没完成的io数量 rq_in_flight = {0, 5}, rq_queued = 11,-------有11个io在cfq_data中,也就是各个cfqq中的rq_queued中的总和 hw_tag = 1, hw_tag_est_depth = 32, hw_tag_samples = 51, idle_slice_timer = { entry = { next = 0x0, prev = 0xdead000000200200 }, expires = 4298772352, base = 0xffff881fd2fc8000, function = 0xffffffff812ef710 <cfq_idle_slice_timer>, data = 18446612400859216896, slack = -1, start_pid = -1, start_site = 0x0, start_comm = "\000\000\000\000\000\000\000\000\000\000\000\000\000\000\000" }, unplug_work = { data = { counter = 32 }, entry = { next = 0xffff883e86877d28, prev = 0xffff883e86877d28 }, func = 0xffffffff812edef0 <cfq_kick_queue> }, active_queue = 0xffff883e9efc03a0,---------当前active的queue active_cic = 0xffff883e47904f00, async_cfqq = {{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, {0x0, 0x0, 0x0, 0x0, 0xffff883f1f7c9878, 0x0, 0x0, 0x0}},---根据io优先级的异步cfq_queue async_idle_cfqq = 0x0, last_position = 5943375904, cfq_quantum = 8,-----static const int cfq_quantum = 8; cfq_fifo_expire = {250, 125},---static const int cfq_fifo_expire[2] = { HZ / 4, 
HZ / 8 }; cfq_back_penalty = 2,-----static const int cfq_back_penalty = 2; cfq_back_max = 16384,-----static const int cfq_back_max = 16 * 1024; cfq_slice = {40, 100},---static int cfq_slice_async = HZ / 25;static const int cfq_slice_sync = HZ / 10; cfq_slice_async_rq = 2,---static const int cfq_slice_async_rq = 2; cfq_slice_idle = 8,----static int cfq_slice_idle = HZ / 125; cfq_group_idle = 8,----static int cfq_group_idle = HZ / 125; cfq_latency = 1,------ cfqd->cfq_latency = 1; cfq_target_latency = 300,---static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ oom_cfqq = { ref = 1, flags = 64, cfqd = 0xffff883e86877c00, rb_node = { __rb_parent_color = 18446612400859217440, rb_right = 0x0, rb_left = 0x0 }, rb_key = 0, p_node = { __rb_parent_color = 18446612400859217472, rb_right = 0x0, rb_left = 0x0 }, p_root = 0x0, sort_list = { rb_node = 0x0 }, next_rq = 0x0, queued = {0, 0}, allocated = {0, 0}, fifo = { next = 0xffff883e86877e80, prev = 0xffff883e86877e80 }, dispatch_start = 0, allocated_slice = 0, slice_dispatch = 0, slice_start = 0, slice_end = 0, slice_resid = 0, prio_pending = 0, dispatched = 0, ioprio = 0, org_ioprio = 0, ioprio_class = 0, pid = 1, seek_history = 0, last_request_pos = 0, service_tree = 0x0, new_cfqq = 0x0, cfqg = 0xffff883f36a09800, nr_sectors = 0 }, last_delayed_sync = 4298780320 }
先找到当前的serving_group,
serving_group = 0xffff883f36a09800,
根据这个cfq_group,
crash> struct -xo cfq_group.service_trees 0xffff883f36a09800 struct cfq_group { [ffff883f36a09878] struct cfq_rb_root service_trees[2][3]; } crash> cfq_group.dispatched 0xffff883f36a09800 dispatched = 5 ----这个dispatched和 cfqd->rq_in_flight是相同的,对的上
根据当前的io类别:
serving_wl_class = BE_WORKLOAD,
serving_wl_type = SYNC_WORKLOAD,
可以确定它的 service_trees 的数组下标:
enum wl_type_t {---列下标 ASYNC_WORKLOAD = 0, SYNC_NOIDLE_WORKLOAD = 1, SYNC_WORKLOAD = 2 }; enum wl_class_t {----行下标 BE_WORKLOAD = 0, RT_WORKLOAD = 1, IDLE_WORKLOAD = 2, CFQ_PRIO_NR, };
找到对应的下标为[0][1],[0][2]。
总共10个queue crash> tree -t rb_tree 0xffff883f36a098f8 ffff883f061a0920 ffff883e6fb8c3b0 ffff883e9efc03b0 ffff881490a87230 ffff883e71bcdee0 ffff8801c8b61df8 ffff881577ee1c28 ffff88018a4577a0 crash> tree -t rb_tree 0xffff883f36a098b8 ffff88144e961318 ffff883ea2021400
也就找到了当前group调度的cfq_queue,遍历各个cfq_queue,就知道对应的request了:
crash> cfq_queue.rb_node struct cfq_queue { [16] struct rb_node rb_node; } 由于cfq_queue通过rb_node成员嵌入到对应cfq_group的service_tree数组中对应class和类型的红黑树去,所以可以查看对应的cfq_queue为: 只有11个io,2个dispatch的 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883f061a0910 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} request=ffff88014a437780 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e6fb8c3a0------相当于有一个在request_queue中或者到driver层了,有一个还未被dispatch allocated = {2, 0} ref = 4 dispatched = 1-------------它的dispatch为1 prio_pending = 0 queued = {0, 1} request=ffff883e826fcd80 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e9efc03a0---当前active的queue,无request了 allocated = {1, 0} ref = 2 dispatched = 1-------------它的dispatch为1, prio_pending = 0 queued = {0, 0} crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff881490a87220 allocated = {2, 0} ref = 3 dispatched = 0 prio_pending = 0 queued = {0, 2} ffff8812ae0c0780----不在hash链中 request.hash ffff8812ae0c0780 hash = { next = 0x0, pprev = 0x0 } ffff8812ae0c0900 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff883e71bcded0 allocated = {2, 0} ref = 3 dispatched = 0 prio_pending = 0 queued = {0, 2} ffff883e40708780 ffff883e40708c00---这个io不在hash中 crash> request.hash ffff883e40708c00 hash = { next = 0x0, pprev = 0x0 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff8801c8b61de8 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff8812c96efa80 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff881577ee1c18 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff883e4341e280 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff88018a457790 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff8813ee88b900 crash> cfq_queue.allocated,ref,dispatched,prio_pending ffff88144e961308 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff883e826fdb00 crash> 
cfq_queue.allocated,ref,dispatched,prio_pending ffff883ea20213f0 allocated = {1, 0} ref = 2 dispatched = 0 prio_pending = 0 queued = {0, 1} ffff880149d39200
其中,所有已经dispatch的io,可以在request_queue中的timeout_list 中找到:
crash> struct -xo request_queue.timeout_list 0xffff883fc1e28828 struct request_queue { [ffff883fc1e28dc0] struct list_head timeout_list; } crash> list request.timeout_list -s request.__sector,queuelist,cmd_type,bio -H ffff883fc1e28dc0 ffff883e9d3f7d80 __sector = 4090189560 queuelist = { next = 0xffff883e9d3f7d80, prev = 0xffff883e9d3f7d80 } cmd_type = REQ_TYPE_FS bio = 0xffff883ebb2f2200 ffff8804a8440d80 __sector = 4151431904 queuelist = { next = 0xffff8804a8440d80, prev = 0xffff8804a8440d80 } cmd_type = REQ_TYPE_FS bio = 0xffff88029f7a9f00 ffff8802924f8a80-------------------------------------这个io是dfs下发的ioctl __sector = 18446744073709551615 queuelist = { next = 0xffff8802924f8a80, prev = 0xffff8802924f8a80 } cmd_type = REQ_TYPE_BLOCK_PC bio = 0x0 ffff883e73374480 __sector = 5860805840 queuelist = { next = 0xffff883e73374480, prev = 0xffff883e73374480 } cmd_type = REQ_TYPE_FS bio = 0xffff883ea412bd00 ffff883ed1c4ea00 __sector = 5977690392 queuelist = { next = 0xffff883ed1c4ea00, prev = 0xffff883ed1c4ea00 } cmd_type = REQ_TYPE_FS bio = 0xffff883f1e7b6300 ffff883e9d24eb80 __sector = 5943375896 queuelist = { next = 0xffff883e9d24eb80, prev = 0xffff883e9d24eb80 } cmd_type = REQ_TYPE_FS bio = 0xffff883f1ae6ec00
6个io,加上11个未下发到request_queue中的io,为17个,全部找到了,通过上面的遍历,对io的管理也都熟悉了,此为后话。
我们再回过头来看我们的log:
[ 3272.940387] sd 5:0:4:0: [sde] FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 3272.940392] sd 5:0:4:0: [sde] Sense Key : Medium Error [current] [descriptor] [ 3272.940395] sd 5:0:4:0: [sde] Add. Sense: Unrecovered read error [ 3272.940397] sd 5:0:4:0: [sde] CDB: Read(16) 88 00 00 00 00 00 00 00 00 08 00 00 00 08 00 00 [ 3272.940399] blk_update_request: critical medium error, dev sde, sector 8 [ 3272.942369] XFS (sde): metadata I/O error: block 0x8 ("xfs_trans_read_buf_map") error 61 numblks 8 [ 3278.783813] sd 5:0:4:0: [sde] FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_SENSE [ 3278.783823] sd 5:0:4:0: [sde] Sense Key : Medium Error [current] [descriptor] [ 3278.783826] sd 5:0:4:0: [sde] Add. Sense: Unrecovered read error [ 3278.783830] sd 5:0:4:0: [sde] CDB: Read(16) 88 00 00 00 00 00 00 00 00 08 00 00 00 08 00 00 [ 3278.783832] blk_update_request: critical medium error, dev sde, sector 8 [ 3278.785946] XFS (sde): metadata I/O error: block 0x8 ("xfs_trans_read_buf_map") error 61 numblks 8 [ 3820.434234] umount:start to umount target /run/user/989 . [ 3820.434239] umount:according to target /run/user/989 , fstype tmpfs,device tmpfs [ 3820.434287] umount:finish umount target :/run/user/989 ,retval:0 . [ 4000.258607] umount:start to umount target /run/user/989 . [ 4000.258613] umount:according to target /run/user/989 , fstype tmpfs,device tmpfs [ 4000.258661] umount:finish umount target :/run/user/989 ,retval:0 . [ 4109.680685] INFO: task fas_readwriter:46745 blocked for more than 1200 seconds. [ 4109.682163] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [ 4109.683440] fas_readwriter D ffff881fffd147c0 0 46745 43298 0x00000080 [ 4109.683444] ffff881f320bfc20 0000000000000082 ffff881f320bffd8 ffff881f320bffd8 [ 4109.683469] ffff881f320bffd8 00000000000147c0 ffff881fd321ae00 ffff881571308f20 [ 4109.683487] ffff881571308f24 ffff881f32186780 00000000ffffffff ffff881571308f28 [ 4109.683491] Call Trace:
在触发crash之前,前面一直打印 访问8号扇区出错,对应的调用链是什么呢?
scsi_finish_command-->scsi_io_completion case ACTION_FAIL: -->scsi_print_result -->scsi_print_sense -->scsi_print_command -->scsi_end_request---->blk_update_request
从现象来看,综合日志中的delay打印:
read file dee46cc8be5ab43982c75a835768773e delay 3511 ms, readsize(581795) count(581795) offset(0)>
推测是由于访问硬盘慢导致了io的积压,可能是因为坏块的重试,具体原因由于功力问题,无法确认。
结论:
1.访问某些扇区,可能会导致多次重试而io缓慢,如果后面的io继续来到,则可能积压。
2.hungtask检测挑选的是扫描到的第一个满足条件的task 来打印,但其实有可能另外一个进程等待已经超过2个hung检测的周期而没有被选中:因为khungtaskd在上一次扫描的时候,大家都还没有到达阈值;等到了下一个检测周期之后,可能多个task 同时满足条件,而被挑选的只是第一个task。甚至有可能有的task在两次检测之间到达了阈值,但因为等待的资源得到满足之后又重新开始调度了,从而不会被检测到。
原文:https://www.cnblogs.com/10087622blog/p/10882980.html