Iostat
Use these options to find I/O issues... (SRU 60)
iostat -Td -xpnCzMu 1
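
What the flags mean (from the iostat man page) and how I usually wrap the command during an incident - just a sketch: the tee into /var/tmp is only a habit, not part of the recipe, and -u is the flag I assume needs the newer SRU (hence the SRU 60 note above):

  # -T d : print a date timestamp before every report
  # -x   : extended per-device statistics (r/s, w/s, wait, asvc_t, %b, ...)
  # -p   : include per-partition/slice statistics
  # -n   : descriptive device names (cXtYdZ instead of sdN)
  # -C   : add per-controller aggregation
  # -z   : suppress lines that are all zero
  # -M   : throughput in MB/s instead of KB/s
  # -u   : newer option (see the SRU 60 note above)
  # 1    : one-second interval, run until interrupted
  iostat -Td -xpnCzMu 1 | tee /var/tmp/iostat.$(date +%Y%m%d-%H%M%S).out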
Once the whole system was unresponsive with no obvious cause... digging deeper into a core dump, we saw a dying disk that was still online:
CAT(vmcore.0/11V)> zfs sync
  "rpool" @ 0x64002d5dc000:
      synced ub_timestamp: -16s    txg: 42072594
      current ub_timestamp: -16s   txg: 42072594
      spa_sync_start: -13.474908338s   spa_sync is overrunning
      spa_sync_pass: 4
      sync thread:    0x2a101783b00 SLEEP idle:2.810509485s  sema:0xfff80104353b64c0 (from zfs:zio_wait+0x38)        sched(zfs:txg_sync_thread)
      quiesce thread: 0x2a102355b00 SLEEP idle:13.474438725s cv:   0x640024fc73ac    (from zfs:txg_thread_wait+0x64) sched(zfs:txg_quiesce_thread)

                                        open[1]      quiesced[0]      syncing[3]       synced[2]
                                   ============     ============     ============     ============
      txg                              42072597         42072596         42072595         42072594
      dp_space_towrite                102027264        352653312        244187136                0
      dp_tempreserved                         0                0                0                0
      dp_blocks_tofree                       23               67              171                0
      dp_unique_blocks_toxcopy                0                0                0                0
      dp_duplicate_blocks_toxcopy             0                0                0                0
      dp_space_toxcopy                        0                0                0                0

      dp_sync_write_time: 8.607377705s
      dp_read_overhead: 0s
      dp_sync_block_free_time: 0.000011722s
      dp_sync_scan_time: 0s
      dp_prev_txg_sync_time: 25.500s
      dp_prev_sync_write_time: 23.267171916s
      dp_written_in_sync: 243904512 (9.69M actual)
      dp_writes_inflight: 698867712 (27.7M actual)
      dp_sync_blocks_freed: 1 blocks
      dp_prev_xcopy_blocks: 0 blocks
      dp_throughput: 40265 B/ms (1.59M/s actual)
      dp_freerate: 312975 blocks/s
      dp_write_limit: 201326592 (8M actual)
      dp_write_limit_min: 201326592 (8M actual)
      dp_free_blocks_limit: 32768 blocks
      dp_sync_max_count: 44
      dp_sync_max_last: Mon Jan 16 16:02:48 UTC 2023
      dp_sync_max_limit: 201326592 (192M)
      dp_sync_max_ms: 54.800s
      dp_sync_max_txg: 42072582

The spa_sync for rpool was overrunning and the sync thread was sleeping in zio_wait(), i.e. waiting on a device. The latency histogram for sd6 showed where the time went - more than 11% of its I/Os took longer than a second:

CAT(vmcore.0/11V)> dev latency sd 6
  kstat:/disk/sd/6/io_latency
      response time      count    density    distribution
      <16us                  0      0.00%        0.00%
      16-32us                1      0.00%        0.00%
      32-64us               64      0.02%        0.02%
      64-128us             869      0.29%        0.31%
      128-256us          15994      5.34%        5.65%
      256-512us          27612      9.21%       14.86%
      512-1024us         35023     11.69%       26.55%
      1-2ms              32486     10.84%       37.38%
      2-4ms              19954      6.66%       44.04%
      4-8ms              25050      8.36%       52.40%
      8-16ms             29670      9.90%       62.30%
      16-32ms            34515     11.52%       73.81%
      32-64ms            28853      9.63%       83.44%
      64-128ms           12078      4.03%       87.47%
      128-256ms           2709      0.90%       88.37%
      256-512ms            425      0.14%       88.52%
      512-1024ms           278      0.09%       88.61%
      1-2s               23491      7.84%       96.45%
      2-4s                7923      2.64%       99.09%
      4-8s                2656      0.89%       99.98%
      8-16s                 72      0.02%      100.00%
      >16s                   0      0.00%      100.00%
      total             299723

Yet that same sd6 was still a "HEALTHY" member of the rpool mirror:

CAT(vmcore.0/11V)> zfs -l
  spa "rpool" @ 0x64002d5dc000 (ACTIVE) 49% used (276G/556G)
      scrub finished on Thu Dec 29 06:44:52 UTC 2022, repaired 0 in 7h5m12s with 0 errors
      vdev Address    State    Aux  Description
      0x64002d5cb000  HEALTHY  -    root-0
      0x64002d5c8000  HEALTHY  -      mirror-0
      0x64002d5c9000  HEALTHY  -        /dev/dsk/c0t5000CCA0565BC6D0d0s0
      0x64002d5c6000  HEALTHY  -        /dev/dsk/c0t5000CCA0565BC3CCd0s0
      0x64002d5c7000  HEALTHY  -        /dev/dsk/c0t5000CCA05659D358d0s0    <---- sd 6
      0x64002d5dd000  HEALTHY  -      spareroot-0
      0x64002d5c3000  HEALTHY  -        /dev/dsk/c0t5000CCA056577D60d0s0

and it was the only device CAT flagged as busy, holding a single write that had been pending for almost three seconds:

CAT(vmcore.0/11V)> dev busy
  Scanning for busy devices:
  sd6 @ 0x64002cf5ad00 (sd_lun), DEVICE BUSY, un_ncmds_in_driver: 1, un_ncmds_in_transport: 1
      /ORCL,SPARC-T5-4/scsi_vhci/scsiclass,00@g5000cca05659d358

  zfs vdev queues (zfs_vdev_max_pending: 10):
      pool "rpool" vdev /dev/dsk/c0t5000CCA05659D358d0s0 @ 0x64002d5c7000
          vq_deadline_tree: 0, vq_read_tree: 0, vq_write_tree: 0, vq_pending_tree: 1

  zfs pending io:
      spa "rpool" @ 0x64002d5dc000 (ACTIVE) 49% used (276G/556G)
          /dev/dsk/c0t5000CCA05659D358d0s0 sd6 (1 cmd, longest pending time 2.807017193 seconds)
          zio                 pending time   buf                 R/W  scsi_pkt        packet start
          0xfff801033dc673e8  2.807017193s   0xfff801003f358d08  W    0x640045f41340  2.807010638s

Every guest domain boots from a zvol on that same rpool (sync=standard, e.g. rpool/ldoms/ldom0901.hdd0), so a stalled rpool sync stalls every guest as well. From the explorer:

/sr/3-09999999999/Explorer/explorer.86583752.ldom0001-2023.01.16.12.54/ldom/ldm_list_-o_disk.out

  NAME
  primary

  VDS
      NAME           VOLUME          OPTIONS    MPGROUP    DEVICE
      primary-vds    ldom0910.hdd0                         /dev/zvol/dsk/rpool/ldoms/ldom0910.hdd0
                     ldom0920.hdd0                         /dev/zvol/dsk/rpool/ldoms/ldom0920.hdd0
                     ldom0901.hdd0                         /dev/zvol/dsk/rpool/ldoms/ldom0901.hdd0
                     ldom0911.hdd0                         /dev/zvol/dsk/rpool/ldoms/ldom0911.hdd0
  ------------------------------------------------------------------------------
  NAME
  ldom0901

  DISK
      NAME    VOLUME                       TOUT ID   DEVICE   SERVER    MPGROUP
      hdd0    ldom0901.hdd0@primary-vds         0    disk@0   primary
  ------------------------------------------------------------------------------
  NAME
  ldom0910

  DISK
      NAME    VOLUME                       TOUT ID   DEVICE   SERVER    MPGROUP
      hdd0    ldom0910.hdd0@primary-vds         0    disk@0   primary
  ------------------------------------------------------------------------------
  NAME
  ldom0911

  DISK
      NAME    VOLUME                       TOUT ID   DEVICE   SERVER    MPGROUP
      hdd0    ldom0911.hdd0@primary-vds         0    disk@0   primary
  ------------------------------------------------------------------------------
  NAME
  ldom0920

  DISK
      NAME    VOLUME                       TOUT ID   DEVICE   SERVER    MPGROUP
      hdd0    ldom0920.hdd0@primary-vds         0    disk@0   primary

  rpool/ldoms/ldom0901.hdd0  sync  standard  default

Meanwhile, inside the primary domain every stat collector was parked in txg_wait_open, waiting for a transaction group that the dying disk would not let finish:

CAT(vmcore.0/11V)> tlist -h -s idle call txg_wait_open
  thread          pri  pctcpu  idle            PID    wchan           command
  0x64006e968b80   60   0.017  13.474440660s   19133  0x640024fc73ae  /usr/bin/prstat -s size -can 10 1 20
  0x64006e9a2640   60   0.053  13.474560165s   19218  0x640024fc73ae  /usr/bin/netstat -in -I lo0 1 20
  0x64006e9491c0   60   0.167  13.474579815s   19163  0x640024fc73ae  /usr/bin/iostat -LY 1 20
  0x64006e997300   60   0.055  13.474615763s   19223  0x640024fc73ae  /usr/bin/netstat -in -I net18 1 20
  0x64006e9aab40   60   0.363  13.474639441s   19430  0x640024fc73ae  /tmp/guds_perl -w /var/tmp/guds_data/arcstat.pl 1 20
  0x64006e96fa00   60   0.992  13.474652565s   19123  0x640024fc73ae  /usr/bin/prstat -s cpu -can 10 1 20
  0x64006e9795c0   60   5.523  13.474660141s   19440  0x640024fc73ae  /usr/sbin/intrstat 1 20
  0x64006e98e800   60   0.138  13.474660845s   19158  0x640024fc73ae  /usr/bin/iostat -xpnCXMmz 1 20
  0x64006e994d00   60   0.053  13.474663426s   19207  0x640024fc73ae  /usr/bin/netstat -in -I aggr1 1 20
  0x64006dd22240   60   1.206  13.474679198s   19118  0x640024fc73ae  /usr/bin/prstat -Z -n 1 1 20
  0x64006e96c5c0   60   0.613  13.474680473s   19128  0x640024fc73ae  /usr/bin/prstat -s rss -can 10 1 20
  0x64006e9d0300   60   0.020  13.474691618s   19102  0x640024fc73ae  /usr/bin/vmstat 1 20
  0x64006dd95ac0   60   0.143  13.474697311s   19153  0x640024fc73ae  /usr/bin/iostat -xpnC 1 20
  0x64006e9750c0   60   0.021  13.474701331s   19113  0x640024fc73ae  /usr/bin/vmstat -p 1 20
  0x64006e9ab000   60   0.136  13.474701503s   19147  0x640024fc73ae  /usr/bin/iostat -x 1 20
  0x64006e98f640   60   0.115  13.474703491s   19483  0x640024fc73ae  /usr/bin/poolstat -r pset 1 20
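
So the culprit was a disk that was slow but never failed hard enough for FMA or ZFS to retire it, and the whole pool kept waiting for it. Without a crash dump, roughly the same picture can be pieced together on the live system; this is only a sketch (device names taken from the dump above, and replacing onto the spare is just an example, not necessarily what was done in this case):

  # per-device soft/hard/transport error counters - a dying-but-online disk
  # often shows climbing transport or media errors here
  iostat -En

  # watch service times live; a sick disk stands out with huge asvc_t and %b
  iostat -xzn 1 10

  # anything FMA already knows about?
  fmadm faulty

  # pool view - note ZFS may still report the vdev as HEALTHY, as it did here
  zpool status -v rpool

  # get the suspect disk out of the mirror, e.g. onto the configured spare
  # (double-check device names before running)
  zpool replace rpool c0t5000CCA05659D358d0s0 c0t5000CCA056577D60d0s0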