On Oracle: February 2020

Tuesday, February 25, 2020

Cache Buffer Chains Latch Contention Case Study-2: Reverse Primary Key Index

Continue with previous Blog: Cache Buffer Chains Latch Contention Case Study-1: Reverse Key Index, now we are going to make the second Case Study: Reverse Primary Key Index. It shows the diligent work of Oracle optimization.

Note: Tested in Oracle 12c, 18c, 19c.

1. Test Setup

First we drop the previous reverse index and create a new reverse primary key index, then get object_id of that index:


--drop table test_tab purge;
--create table test_tab (id number, val number);

drop index test_tab#r;

create unique index test_tab#r on test_tab(id) reverse;

alter table test_tab add constraint test_tab#r primary key (id);

select object_id, to_char(object_id, 'XXXXXXXX') id_hex, data_object_id, to_char(data_object_id, 'XXXXXXXX') did_hex 
from dba_objects where object_name = 'TEST_TAB#R';

  OBJECT_ID ID_HEX DATA_OBJECT_ID DID_HEX
  --------- ------ -------------- -------
    2460326 258AA6        2460326  258AA6

2. Leaf Block Split Point Finding

With appended Plsql procedure: leaf_split_finder, we can find one exact point of leaf block split.


SQL > exec leaf_split_finder(5000);

--- Leaf Split at row count: 4906, (id between 1 and 11) in 2 Blocks
block: 2661262, cnt: 5, min_id: 1, max_id: 5
block: 3665613, cnt: 6, min_id: 6, max_id: 11

Above output shows index split occurs at row count: 4906. In the previous Blog, that is 4708. So primary key index is more compact (4.2%) than normal index.

3. CBC Latch Contention Producing

Open a Plsql Window, truncate table, insert 4905 rows at first.


truncate table test_tab;

insert into test_tab select level, -level from dual connect by level <= 4905;

commit;

-- using above index object_id to substitute for 2460326
select blk, count(*), min(id) min_id, max(id) max_id from (
  select id 
        ,dbms_rowid.rowid_block_number(sys_op_lbid (2460326, 'L', rowid)) blk
    from test_tab)  
where id between 1 and 11
group by blk order by min_id, blk;

      BLK COUNT(*) MIN_ID MAX_ID
  ------- -------- ------ ------
  2661263       11      1     11

We can see all rows from 1 to 11 are in the same leaf block (2661263).

Now we start two groups of jobs, each group having 6 jobs in a query loop (see appended test_tab_select_job). All 6 jobs in the first group access row 1 to 5; whereas 6 jobs in the second group access row 6 to 10;


exec test_tab_select_job(6, 1e9, 1, 0);

exec test_tab_select_job(6, 1e9, 6, 0);

-- stop all jobs by:
--   exec clean_jobs;

Each job in the first group executes the query with id from 1 to 5 in a loop, whereas job in the second group executes it with id from 6 to 10. Both are with the same Xplan (INDEX UNIQUE SCAN. In previous Blog, it is INDEX RANGE SCAN).


select val from test_tab where id = :B1
 
------------------------------------------------------------------------------------------
| Id  | Operation                   | Name       | Rows  | Bytes | Cost (%CPU)| Time     |
------------------------------------------------------------------------------------------
|   0 | SELECT STATEMENT            |            |       |       |     1 (100)|          |
|   1 |  TABLE ACCESS BY INDEX ROWID| TEST_TAB   |     1 |    26 |     1   (0)| 00:00:01 |
|*  2 |   INDEX UNIQUE SCAN         | TEST_TAB#R |     1 |       |     1   (0)| 00:00:01 |
------------------------------------------------------------------------------------------

Now insert a single new row:


insert into test_tab values(4906, -4906);

select blk, count(*), min(id) min_id, max(id) max_id from (
  select id 
        ,dbms_rowid.rowid_block_number(sys_op_lbid (2460326, 'L', rowid)) blk
    from test_tab)  
where id between 1 and 11
group by blk order by min_id, blk;

      BLK COUNT(*) MIN_ID MAX_ID
  ------- -------- ------ ------
  2661263        5      1      5
  3665609        6      6     11

One leaf block split occurs. Row 6 to 11 are moved to new block (3665609). Now if we look again all 12 job sessions. Only 6 sessions in the second group have contention, similar to the case of previous Blog.


select s.client_identifier, s.sid, s.serial#, p.pid, p.spid, s.event, p1, p2, p3, latchwait, latchspin
  from v$session s, v$process p
 where s.paddr=p.addr and lower(s.program) like '%j0%' 
 order by s.client_identifier; 

  CLIENT_IDENTIFIER     SID  SERIAL#  PID  SPID  EVENT                        P1          P2       P3  LATCHWAIT         LATCHSPIN
  --------------------  ---  -------  ---  ----  ---------------------------  ----------  -------  --  ----------------  ---------
  test_tab_select_id_1  725  53859    46   5226  buffer busy waits            1548        2661259  1
  test_tab_select_id_1  549  449      51   5236  null event                   0           0        0
  test_tab_select_id_1  547  35779    45   5224  buffer busy waits            1548        2661263  1
  test_tab_select_id_1  6    60174    42   5218  buffer busy waits            1548        2661263  1
  test_tab_select_id_1  8    49157    48   5230  buffer busy waits            1548        2661263  1
  test_tab_select_id_1  187  44089    43   5220  null event                   0           0        0
  test_tab_select_id_6  727  18689    52   5238  latch: cache buffers chains  6636068072  228      0   000000018B8A5CE8
  test_tab_select_id_6  369  18373    50   5234  latch: cache buffers chains  6636068072  228      0   000000018B8A5CE8
  test_tab_select_id_6  189  58221    49   5232  latch: cache buffers chains  6636068072  228      0   000000018B8A5CE8
  test_tab_select_id_6  905  10131    53   5240  latch: cache buffers chains  6636068072  228      0   000000018B8A5CE8
  test_tab_select_id_6  367  49253    44   5222  latch: cache buffers chains  6636068072  228      0
  test_tab_select_id_6  903  64294    47   5228  latch: cache buffers chains  6636068072  228      0   000000018B8A5CE8

A further query on v$latch_children shows majority of CBC SLEEPS (140174) is on child latch: 000000018B8A5CE8 of second group. The child latch: 000000018B8DC5E8 of first group almost has no SLEEPS and WAIT_TIME.


select latch#, child#, addr, gets, misses, sleeps, spin_gets, wait_time, round(wait_time/sleeps) us_per_sleep 
  from v$latch_children 
 where addr = hextoraw('000000018B8DC5E8') or addr = hextoraw('000000018B8A5CE8'); 

  LATCH#  CHILD#  ADDR              GETS       MISSES    SLEEPS   SPIN_GETS  WAIT_TIME  US_PER_SLEEP
  ------  ------  ----------------  ---------  --------  -------  ---------  ---------  ------------
  228     31143   000000018B8DC5E8  57352278   59783     4       59779      81         20
  228     29979   000000018B8A5CE8  104090722  28359401  140174  28213248   334439191  2386

With one more query on v$latch_misses, we can display CBC latch contention locations.


select * from v$latch_misses 
 where parent_name = 'cache buffers chains' and sleep_count > 0
 order by sleep_count desc;
 
  PARENT_NAME           WHERE                     NWFAIL_COUNT  SLEEP_COUNT  WTR_SLP_COUNT  LONGHOLD_COUNT  LOCATION
  --------------------  ------------------------  ------------  -----------  -------------  --------------  -----------------------
  cache buffers chains  kcbgtcr: slow path excl    0             181709       24123          2864            kcbgtcr: slow path excl
  cache buffers chains  kcbgtcr: fast path exam    0             61093        160708         10363           kcbgtcr: fast path exam
  cache buffers chains  kcbgkcbcr                  0             8999         65758          1806            kcbgkcbcr
  cache buffers chains  kcbrls: pin release        0             7171         10946          1580            kcbrls: pin release
  cache buffers chains  kcb_trim_hash_chain        0             2191         19             30              kcb_trim_hash_chain
  cache buffers chains  kcbchg: cr pin cleanout    0             444          82             43              kcbchg: cr pin cleanout
  cache buffers chains  kcbcge                     0             3            54             1               kcbcge
  cache buffers chains  kcbchg: cr pin change      0             3            0              0               kcbchg: cr pin change
  cache buffers chains  kcbzwb                     0             1            0              0               kcbzwb
  cache buffers chains  kcbgtcr: retry after read  0             1            0              0               kcbgtcr: retry after read

The top Location are "kcbgtcr: slow path excl" and "kcbgtcr: fast path exam". "kcbgtcr: slow path excl" has higher SLEEP_COUNT, but "kcbgtcr: fast path exam" has much higher WTR_SLP_COUNT (Number of times a waiter slept) and LONGHOLD_COUNT (Number of times someone held a latch for the entire duration of someone else's sleep).

In the previous Blog for normal reverse index, the top Location is "kcbgtcr: fast path pin".

LONGHOLD_COUNT is a parmountal indicator for latch misses since it signifies the sleeping sessions can go to sleep again ("Recurrent Sleeps", see Blog: Oracle Latch Misses, Spin_gets, Sleeps and Recurrent_Misses ). But it is not included in AWR - "Latch Miss Sources" section.

If we make an index TEST_TAB#R tree dump, we can see two leaf blocks: 2661263 and 3665609:


alter session set tracefile_identifier = "TD_TEST_TAB#R_1";
alter session set events 'immediate trace name treedump level 2460326';

----- begin tree dump
branch: 0x13589b 1267867 (0: nrow: 16, level: 1)
   leaf: 0x289b8f 2661263 (-1: row:291.291 avs:3932)
   leaf: 0x37eec9 3665609 (0: row:282.282 avs:4060)
   leaf: 0x1358a1 1267873 (1: row:295.295 avs:3878)

Above dump shows that index root node block is 1267867. With following query, we can see that it is the next block of header_block: 1267866 on index TEST_TAB#R segment:


SQL > select segment_name, header_file, header_block from dba_segments where segment_name = 'TEST_TAB#R';

  SEGMENT_NAME HEADER_FILE HEADER_BLOCK
  ------------ ----------- ------------
  TEST_TAB#R            22      1267866

Even though there is heavy "latch: cache buffers chains" contention, it seems that no blocking sessions are tracked for such wait event:


SQL > select * from v$wait_chains where chain_signature like 'latch: cache buffers chains' or blocker_sid is not null;

      no rows selected

4. Latch State Transition Diagram By Dtrace

Pick one CBC latch contention session (SID: 367, Oracle PID: 44 (0x2C), spid: 5222) from above v$session and v$process query.

Run following Dtrace Script on contention latch address: 0x18B8A5CE8 (For Latch State Transition Diagram, See Blog: PL/SQL Function Result Cache Invalidation (I)).


sudo dtrace -w -n \
'
typedef unsigned int ub4;

typedef struct latchstate /* latch state */ {
    ub4 nproc_pid;  /* Nproc or pid   */
    ub4 flag;       /* blocking flag  */ 
    ub4 gets;       /* number of gets */
    ub4 latch;      /* latch number   */
    ub4 level;      /* latch level    */
    ub4 where; } latchstate;
    
BEGIN {laddr = $1; step = 1; begin_gets = 0, end_gets = 0; prev_ts = timestamp;
   self->evt = "begin"; self->loc = 0x0;}
pid$target::kslgess:entry /arg0 == laddr && (self->evt != probefunc || self->loc != arg4)/ { 
   printf("\n---------Step %d at %Y (Elapsed-ns=%d)---------", step, walltimestamp, (timestamp-prev_ts));
   ls = ((latchstate *) copyin(arg0, sizeof(latchstate)));
   begin_gets = (begin_gets == 0 ? ls->gets: begin_gets); end_gets = ls->gets;
   printf("\n***State Entry>>>(nproc_pid=>%d, flag=>%d, gets=>%d, latch=>%d, level=>%d, where=>%d)"
           ,ls->nproc_pid, ls->flag, ls->gets, ls->latch, ls->level, ls->where);
   printf("\n>>>Transition>>>%s:%s\n      (Addr=>0x%X, Where=>%d)", 
          probefunc, probename, arg0, arg4);
   self->evt = probefunc; self->loc = arg4;    
   ustack(10, 0); 
   /*stop(); exit(0);*/    /* no latch blocking, failed get, try again */
   step = step + 1; self->entry = 1; prev_ts = timestamp;} 
pid$target::kslgess:return /self->entry == 1/ {
   ls = ((latchstate *) copyin(laddr, sizeof(latchstate))); 
   printf("\n>>>State Return***(nproc_pid=>%d, flag=>%d, gets=>%d, latch=>%d, level=>%d, where=>%d)"
           ,ls->nproc_pid, ls->flag, ls->gets, ls->latch, ls->level, ls->where);
   /*stop(); exit(0);*/    /* simulate long latch holder, run by: dtrace -w -n */
   self->entry = 2;}   
pid$target::kslfre:entry /self->entry == 2 && arg0 == laddr && self->evt != probefunc/ {
   printf("\n---------Step %d at %Y (Elapsed-ns=%d)---------", step, walltimestamp, (timestamp-prev_ts));
   ls = ((latchstate *) copyin(arg0, sizeof(latchstate)));
   end_gets = ls->gets;
   printf("\n>>>Transition>>>%s:%s(Addr=>0x%-X)", probefunc, probename, arg0);
   self->evt = probefunc; self->loc = 0x0;
   /*stop(); exit(0);*/      /* simulate long latch holder, run by: dtrace -w -n */
   printf("\n===Latch freed===");
   ustack(10, 0); step = step + 1; prev_ts = timestamp;}      
END {
   printf("\n---------Step %d at %Y (Elapsed-ns=%d)---------", step, walltimestamp, (timestamp-prev_ts));
   printf("\nLatch Gets = %d (%d - %d)", (end_gets - begin_gets), (unsigned long)begin_gets, (unsigned long)end_gets); }
' -p 5222 0x18B8A5CE8

The output is some repeated occurrences of following pattern:


   kslgess:entry
---------Step 1825 at 2020 Feb 24 15:51:09 (Elapsed-ns=6088)---------
***State Entry>>>(nproc_pid=>47, flag=>536870912, gets=>461619058, latch=>228, level=>1, where=>2168)
>>>Transition>>>kslgess:entry
      (Addr=>0x18B8A5CE8, Where=>2218)
              oracle`kslgess
              a.out`kcbgtcr+0x1a323
              a.out`ktrgtc2+0x316
              a.out`qeilbk1+0x4a1
              a.out`qeilbk0+0xe
              a.out`qerixFetchUniqueIndex+0x17f
              a.out`qerstFetch+0x543
              a.out`qertbFetchByRowID+0x39e
              a.out`qerstFetch+0x4ca
              a.out`opifch2+0x188b

   kslgess:return
>>>State Return***(nproc_pid=>44, flag=>536870912, gets=>461619062, latch=>228, level=>1, where=>2218)

   kslgess:entry
---------Step 1826 at 2020 Feb 24 15:51:09 (Elapsed-ns=250117)---------
***State Entry>>>(nproc_pid=>43, flag=>536870912, gets=>461619113, latch=>228, level=>1, where=>2186)
>>>Transition>>>kslgess:entry
      (Addr=>0x18B8A5CE8, Where=>2186)
              oracle`kslgess
              oracle`kcbrls+0x170c
              a.out`ktrgtc2+0x554
              a.out`qeilbk1+0x4a1
              a.out`qeilbk0+0xe
              a.out`qerixFetchUniqueIndex+0x17f
              a.out`qerstFetch+0x543
              a.out`qertbFetchByRowID+0x39e
              a.out`qerstFetch+0x4ca
              a.out`opifch2+0x188b

It shows the CBC Latch State Transition. At first, C subroutine kslgess was called to get CBC latch in entry location (where=>2168) and next location (where=>2186). Then freed by kslfre. CallStack shows that each row is accessed by qertbFetchByRowID via qerixFetchUniqueIndex (TABLE ACCESS BY INDEX ROWID via INDEX UNIQUE SCAN).

For the where location, we can use the query below to find them.


CREATE OR REPLACE FORCE VIEW X_GV$LATCH_MISSES as
SELECT t1.indx              INDX,
       t1.inst_id           INST_ID,       
       t1.ksllasnam         PARENT_NAME,   
       t2.ksllwnam          "WHERE",       
       t1.kslnowtf          NWFAIL_COUNT,  
       t1.kslsleep          SLEEP_COUNT,   
       t1.kslwscwsl         WTR_SLP_COUNT, 
       t1.kslwsclthg        LONGHOLD_COUNT,
       t2.ksllwnam          LOCATION,      
       t1.con_id            CON_ID         
  FROM x$ksllw t2, x$kslwsc t1
 WHERE t2.indx = t1.indx;
 
 select * from sys.X_GV$LATCH_MISSES 
where parent_name in ('cache buffers chains') 
  and indx in (2168, 2186, 2218);
  
INDX  INST_ID  PARENT_NAME           WHERE                   NWFAIL_COUNT  SLEEP_COUNT  WTR_SLP_COUNT  LONGHOLD_COUNT  LOCATION
----  -------  --------------------  ----------------------  ------------  -----------  -------------  --------------  ---------------------
2168  1        cache buffers chains  kcbgtcr: fast path exam  0             106323       185902         15798           kcbgtcr: fast path exam
2186  1        cache buffers chains  kcbrls: pin release      0             14959        22770          3187            kcbrls: pin release
2218  1        cache buffers chains  kcbgtcr: slow path excl  0             189591       48044          3576            kcbgtcr: slow path excl

The above output showed that the most LONGHOLD_COUNT is in location "kcbgtcr: fast path exam" when using INDEX UNIQUE SCAN to make TABLE ACCESS. Whereas when using INDEX RANGE SCAN, the top location is "kcbgtcr: fast path pin". This tiny nuance of location change can play a critical role in CBC latch contention investigation. It also shows the different handling of reverse primary key index vs.normal reverse index, which is probably some diligent work of Oracle optimization.

In the above Dtrace, we trace kslgess function. Blog: Oracle 12 and latches, part 2 described it as:

  kslgess: kernel service latch get spin shared (latch)

Blog: Latch, mutex and beyond written:

  kslgess: shared latch wait get

About kcbrls, Blog: Buffer is pinned count said:

  The function kcbrls (Kernel Cache Buffer Release) is used to release a buffer, also it is possible from another places, 
  for example from kcb_post_apply can call kcbzfs (detach handle from x$kcbbf) directly.

5. Buffer Cache Activity Internals

If we make 10200 and 10046 trace for 100 loops for id from 6 to 10, we can see 500 "ktrgtc2 -> ktrexc" INDEX UNIQUE SCAN on the new block (3665609=0x0037eec9).
("ktrexc: for unique index and index root" and "ktrexf: for non-unique index".
see Blog: Oracle Unique Index Non-Consistency Read with RowCR)


alter session set events='10046 trace name context forever, level 12: 
                          10200 trace name context forever, level 10' 
      tracefile_identifier='10046_10200_index_rp_2';  
exec test_tab_select(6, 1000, 6, 0.00);
alter session set events='10046 trace name context off : 10200 trace name context off ';


ktrgtc2(): started for block <0x07cf : 0x0037eec9> objd: 0x00258aa6
ktrexc(): returning 9 
ktrgtc2(): completed for block <0x07cf : 0x0037eec9> objd: 0x00258aa6

6. X$TRACE Latch Operations

In this section, we try to use X$TRACE to trace latch operations (10005). Pick one CBC latch contention session (SID: 367, Oracle PID: 44 (0x2C), spid: 5222) and trace all levels (15).


$ > oerr ora 10005
10005, 00000, "trace latch operations for debugging"
// *Action: Enable tracing for various latch operations
//    level 1 - trace latch gets and frees
//    level 2 - protect/unprotect latch recovery areas
//    level 4 - trace multiple posts by processes when latch is freed
//    level 8 - signal error on bad latch waits


alter tracing enable "10005:15:44";

SQL> select * from x$trace_events where procs in (44);
ADDR              INDX  INST_ID  CON_ID  EVENT  TRCLEVEL  STATUS  PROCS  FLAGS
----------------  ----  -------  ------  -----  --------  ------  -----  -----
FFFF80FFBC1361C8  5     1        0       10005  15        1       44     1

Following query on x$trace shows the latch begin (KSL WAIT BEG) and end (KSL WAIT END) for each seq_num on contention child latch (6636068072/0x18b8a5ce8). One has a longer elapsed time of 169973 usecs, another is short as of 81 usecs.

169973 usecs is about 169 ms. For a CPU time slice of 10 ms, the process can be scheduled to run after 17 sleeps. That is also the case we have observed in some real applications.


select event, seq#, pid, file_loc,function, data 
  from x$trace 
 where pid in (44) 
 order by time;

EVENT SEQ# PID FILE_LOC    FUNCTION  DATA
----- ---- --- ----------- --------- ----------------------------------------------------------------------------------------------------------------------
10005 7186 44  ksl2.c@2966 kslwtbctx KSL WAIT BEG [latch: cache buffers chains] 6636068072/0x18b8a5ce8 228/0xe4 0/0x0 wait_id=11277 seq_num=11278 snap_id=1
10005 7200 44  ksl2.c@3109 kslwtectx KSL WAIT END [latch: cache buffers chains] 6636068072/0x18b8a5ce8 228/0xe4 0/0x0 wait_id=11277 seq_num=11278 snap_id=1
10005 7201 44  ksl2.c@3117 kslwtectx KSL WAIT END wait times (usecs) - snap=169973, exc=169973, tot=169973

10005 7280 44  ksl2.c@2966 kslwtbctx KSL WAIT BEG [latch: cache buffers chains] 6636068072/0x18b8a5ce8 228/0xe4 0/0x0 wait_id=11278 seq_num=11279 snap_id=1
10005 7301 44  ksl2.c@3109 kslwtectx KSL WAIT END [latch: cache buffers chains] 6636068072/0x18b8a5ce8 228/0xe4 0/0x0 wait_id=11278 seq_num=11279 snap_id=1
10005 7302 44  ksl2.c@3117 kslwtectx KSL WAIT END wait times (usecs) - snap=35031, exc=35031, tot=35031

10005 7312 44  ksl2.c@2966 kslwtbctx KSL WAIT BEG [latch: cache buffers chains] 6636068072/0x18b8a5ce8 228/0xe4 0/0x0 wait_id=11279 seq_num=11280 snap_id=1
10005 7317 44  ksl2.c@3109 kslwtectx KSL WAIT END [latch: cache buffers chains] 6636068072/0x18b8a5ce8 228/0xe4 0/0x0 wait_id=11279 seq_num=11280 snap_id=1
10005 7318 44  ksl2.c@3117 kslwtectx KSL WAIT END wait times (usecs) - snap=81, exc=81, tot=81

Thursday, February 20, 2020

Cache Buffer Chains Latch Contention Case Study-1: Reverse Key Index

In Oracle Database, if a reverse key index is heavily accessed by many concurrent sessions, a minor change on the index can perturb whole system with massive cache buffer chains (CBC) latch contention (maybe Butterfly Effect).

In this Blog, we will try to construct such a case of CBC latch contention that one index leaf block split occurs when inserting a single row.

Note: Tested in Oracle 12c, 18c, 19c.

1. Test Setup

First we create a table with one reverse primary key index, and get object_id of that index:


drop table test_tab purge;

create table test_tab (id number, val number);

create index test_tab#r on test_tab(id) reverse;

-- CREATE TABLE test_tab (id NUMBER PRIMARY KEY USING INDEX (CREATE INDEX test_tab#r ON test_tab(id) REVERSE));

select object_id, to_char(object_id, 'XXXXXXXX') id_hex, data_object_id, to_char(data_object_id, 'XXXXXXXX') did_hex 
from dba_objects where object_name = 'TEST_TAB#R';

  OBJECT_ID ID_HEX DATA_OBJECT_ID DID_HEX
  --------- ------ -------------- -------
    2459815 2588A7        2459815  2588A7

2. Leaf Block Split Point Finding

With appended Plsql procedure: leaf_split_finder, we can find one exact point of leaf block split.


SQL > exec leaf_split_finder(5000);

  --- Leaf Split at row count: 4708, (id between 1 and 11) in 2 Blocks
  block: 2661260, cnt: 5, min_id: 1, max_id: 5
  block: 3665613, cnt: 6, min_id: 6, max_id: 11

Above output shows that after inserting 4707 rows into table test_tab, all the index entries for row from 1 to 11 are in the same block (2661260). Now insert of one more row (4708) will cause a leaf block split. The index entries for row from 1 to 5 remain in block (2661260), but index entries for row from 6 to 11 are moved to a new block (3665613).

3. CBC Latch Contention Producing

Open a Plsql Window, truncate table, insert 4707 rows at first.


truncate table test_tab;

insert into test_tab select level, -level from dual connect by level <= 4707;

commit;

-- using above index object_id to substitute for 2458815
select blk, count(*), min(id) min_id, max(id) max_id from (
  select id 
        ,dbms_rowid.rowid_block_number(sys_op_lbid (2459815, 'L', rowid)) blk
    from test_tab)  
where id between 1 and 11
group by blk order by min_id, blk;

      BLK COUNT(*) MIN_ID MAX_ID
  ------- -------- ------ ------
  2661260       11      1     11

We can see all rows from 1 to 11 are in the same leaf block (2661260).

Following query lists all 14 index blocks and number of entries in each of them, something like index entry distribution. The first BLK (2661260) has 534 entries.


select blk, count(*), min(id) min_id, max(id) max_id from (
  select id 
        ,dbms_rowid.rowid_block_number(sys_op_lbid (2461548, 'L', rowid)) blk
    from test_tab)  
group by blk order by min_id, blk;

       BLK  COUNT(*)  MIN_ID  MAX_ID
  -------- --------- ------- -------
   2661260       534       1    4707
   3665593       527      12    4622
   2661262       274      22    4627
   3665612       269      28    4633
   3665595       284      34    4639
   3665611       279      40    4645
   3665614       314      46    4700
   3665598       311      52    4658
   3665599       327      59    4665
   3665596       319      66    4672
   2661261       312      73    4678
   3665594       311      79    4685
   3665592       327      86    4692
   3665597       319      93    4699
  
  14 rows selected.

Now we start two groups of jobs, each group having 6 jobs in a query loop (see appended test_tab_select_job). All 6 jobs in the first group access row 1 to 5; whereas 6 jobs in the second group access row 6 to 10;


exec test_tab_select_job(6, 1e9, 1, 0);

exec test_tab_select_job(6, 1e9, 6, 0);

-- stop all jobs by:
--   exec clean_jobs;

Each job in the first group executes the query with id from 1 to 5 in a loop, whereas job in the second group executes it with id from 6 to 10. Both are with the same Xplan.


select val from test_tab where id = :B1
 
--------------------------------------------------------------------------------------------------
| Id  | Operation                           | Name       | Rows  | Bytes | Cost (%CPU)| Time     |
--------------------------------------------------------------------------------------------------
|   0 | SELECT STATEMENT                    |            |       |       |     1 (100)|          |
|   1 |  TABLE ACCESS BY INDEX ROWID BATCHED| TEST_TAB   |     1 |    26 |     1   (0)| 00:00:01 |
|*  2 |   INDEX RANGE SCAN                  | TEST_TAB#R |     1 |       |     1   (0)| 00:00:01 |
--------------------------------------------------------------------------------------------------

A query on v$session shows that all 12 jobs are running without any waiting event. And repeated query on v$latch shows that 'cache buffers chains' SLEEPS is not increasing.


select s.client_identifier, s.sid, s.serial#, p.pid, p.spid, s.event, p1, p2, p3, latchwait, latchspin
  from v$session s, v$process p
 where s.paddr=p.addr and lower(s.program) like '%j0%' 
 order by s.client_identifier; 

  CLIENT_IDENTIFIER     SID  SERIAL#  PID  SPID  EVENT                    P1  P2     P3  LATCHWAIT  LATCHSPIN
  --------------------  ---  -------  ---  ----  -----------------------  --  -----  --  ---------  ---------
  test_tab_select_id_1  900  65141    23   4653  db file sequential read  1   85437  1
  test_tab_select_id_1  7    8997     24   4655  null event               0   0      0
  test_tab_select_id_1  186  50273    25   4657  null event               0   0      0
  test_tab_select_id_1  369  57072    26   4659  null event               0   0      0
  test_tab_select_id_1  547  53721    27   4661  null event               0   0      0
  test_tab_select_id_1  903  13140    35   4663  null event               0   0      0
  test_tab_select_id_6  366  17587    38   4665  null event               0   0      0
  test_tab_select_id_6  9    26667    42   4667  null event               0   0      0
  test_tab_select_id_6  189  13145    43   4669  null event               0   0      0
  test_tab_select_id_6  549  36479    45   4671  null event               0   0      0
  test_tab_select_id_6  726  40173    46   4673  null event               0   0      0
  test_tab_select_id_6  906  16667    47   4675  null event               0   0      0


select addr, gets, misses, sleeps, spin_gets, wait_time, round(wait_time/sleeps) us_per_sleep 
  from v$latch where name = 'cache buffers chains';
  
  ADDR              GETS       MISSES  SLEEPS  SPIN_GETS  WAIT_TIME  US_PER_SLEEP
  ----------------  ---------  ------  ------  ---------  ---------  ------------
  0000000060027E88  181375987  795727  1       795699     25         25

Now insert a single new row:


insert into test_tab values(4708, -4708);

select blk, count(*), min(id) min_id, max(id) max_id from (
  select id 
        ,dbms_rowid.rowid_block_number(sys_op_lbid (2459815, 'L', rowid)) blk
    from test_tab)  
where id between 1 and 11
group by blk order by min_id, blk;

      BLK COUNT(*) MIN_ID MAX_ID
  ------- -------- ------ ------
  2661260        5      1      5
  3665613        6      6     11

One leaf block split occurs. Row 6 to 11 are moved to new block (3665613).

Look again index entry distribution, now it has 15 index blocks. The previous 534 entries in BLK (2661260) are split into two parts, 272 in original block, 262 in new created BLK (3665613), which also contains the new inserted row (id: 4708). So 534 is probably the split point in such case. If we look the third BLK (3665593), which has already 527 index entries, only 7 rows away from next leaf block split. It means that in certain case, the distance of two split points can be as short of 7 rows for 4715 (4708+7) row inserts (average should be 4715/15 = 314), an uneven split point distribution in case of reverse index. In later section, we will give further look of index block split distributions.


select blk, count(*), min(id) min_id, max(id) max_id from (
  select id 
        ,dbms_rowid.rowid_block_number(sys_op_lbid (2461548, 'L', rowid)) blk
    from test_tab)  
group by blk order by min_id, blk;

       BLK  COUNT(*)  MIN_ID  MAX_ID
  -------- --------- ------- -------
   2661260       272       1    4705
   3665613       263       6    4708
   3665593       527      12    4622
   2661262       274      22    4627
   3665612       269      28    4633
   3665595       284      34    4639
   3665611       279      40    4645
   3665614       314      46    4700
   3665598       311      52    4658
   3665599       327      59    4665
   3665596       319      66    4672
   2661261       312      73    4678
   3665594       311      79    4685
   3665592       327      86    4692
   3665597       319      93    4699
  
  15 rows selected.

Now if we look again all 12 job sessions:


select s.client_identifier, s.sid, s.serial#, p.pid, p.spid, s.event, p1, p2, p3, latchwait, latchspin
  from v$session s, v$process p
 where s.paddr=p.addr and lower(s.program) like '%j0%' 
 order by s.client_identifier; 

  CLIENT_IDENTIFIER     SID  SERIAL#  PID  SPID  EVENT                        P1          P2       P3  LATCHWAIT         LATCHSPIN
  --------------------  ---  -------  ---  ----  ---------------------------  ----------  -------  --  ----------------  ---------
  test_tab_select_id_1  900  65141    23   4653  null event                   0           0        0
  test_tab_select_id_1  7    8997     24   4655  null event                   0           0        0
  test_tab_select_id_1  186  50273    25   4657  null event                   0           0        0
  test_tab_select_id_1  369  57072    26   4659  null event                   0           0        0
  test_tab_select_id_1  547  53721    27   4661  latch: cache buffers chains  6630918056  228      0
  test_tab_select_id_1  903  13140    35   4663  latch: cache buffers chains  6630918056  228      0
  test_tab_select_id_6  366  17587    38   4665  latch: cache buffers chains  6632747048  228      0
  test_tab_select_id_6  9    26667    42   4667  buffer busy waits            1548        3665613  1
  test_tab_select_id_6  189  13145    43   4669  latch: cache buffers chains  6632747048  228      0   000000018B57B028
  test_tab_select_id_6  549  36479    45   4671  latch: cache buffers chains  6632747048  228      0   000000018B57B028
  test_tab_select_id_6  726  40173    46   4673  buffer busy waits            1548        3665613  1
  test_tab_select_id_6  906  16667    47   4675  latch: cache buffers chains  6632747048  228      0   000000018B57B028

There are two wait events: "buffer busy waits" (p1text=file#, p2text=block#, p3text=class#) or "latch: cache buffers chains" (p1text=address, p2text=number, p3text=tries. From Oracle 18c, p3text renamed as "why").

For "buffer busy waits", they are waiting on new spli index block (file#=1548, block#=3665613).

For "latch: cache buffers chains", first group are waiting in address 6630918056 (0x18B3BC7A8), whereas second group in 6632747048 (0x18B57B028).

The second group also shows column v$process.LATCHWAIT (000000018B57B028), which is the child latch address the process is waiting for (probably only filled where there exists latch sleep).

Imagine that a production system was running smoothly all morning, Suddenly, a single new row insert slowed it down with overwhelming CBC latch concurrency.

Repeated query on v$latch shows the continously increasing of latch SLEEPS (1460647) and WAIT_TIME (905492637).


select addr, gets, misses, sleeps, spin_gets, wait_time, round(wait_time/sleeps) us_per_sleep 
  from v$latch where name = 'cache buffers chains';

  ADDR              GETS        MISSES    SLEEPS   SPIN_GETS  WAIT_TIME  US_PER_SLEEP
  ----------------  ----------  --------  -------  ---------  ---------  ------------
  0000000060027E88  1853839207  96374121  1460647  95066763   905492637  620

A further query on v$latch_children shows majority of CBC SLEEPS (1460317) is on child latch: 000000018B57B028 of second group. Even though the child latch: 000000018B3BC7A8 of first group has more GETS (428020322), it has much less MISSES, almost no SLEEPS and WAIT_TIME.


select latch#, child#, addr, gets, misses, sleeps, spin_gets, wait_time, round(wait_time/sleeps) us_per_sleep 
  from v$latch_children 
 where addr = hextoraw('000000018B3BC7A8') or addr = hextoraw('000000018B57B028'); 
 
  LATCH#  CHILD#  ADDR              GETS       MISSES    SLEEPS   SPIN_GETS  WAIT_TIME  US_PER_SLEEP
  ------  ------  ----------------  ---------  --------  -------  ---------  ---------  ------------
  228     3156    000000018B3BC7A8  428020322  1027200   4        1027196    108        27
  228     12682   000000018B57B028  380009579  90239103  1460317  88932209   905458824  620

With one more query on v$latch_misses, we can display CBC latch contention locations.


select * from v$latch_misses 
 where parent_name = 'cache buffers chains' and sleep_count > 0
 order by sleep_count desc;

  PARENT_NAME           WHERE                     NWFAIL_COUNT  SLEEP_COUNT  WTR_SLP_COUNT  LONGHOLD_COUNT  LOCATION
  --------------------  ------------------------  ------------  -----------  -------------  --------------  -----------------------
  cache buffers chains  kcbgtcr: fast path pin    0             1341199      2297098        197030          kcbgtcr: fast path pin
  cache buffers chains  kcbchg: cr pin cleanout   0             694146       34319          86549           kcbchg: cr pin cleanout
  cache buffers chains  kcbchg: cr pin change     0             182979       13069          23671           kcbchg: cr pin change
  cache buffers chains  kcbgtcr: slow path excl   0             124511       5269           5873            kcbgtcr: slow path excl
  cache buffers chains  kcbchg: change complete   0             30195        8539           2881            kcbchg: change complete
  cache buffers chains  kcbcge                    0             23214        30005          6855            kcbcge
  cache buffers chains  kcbzwb                    0             15599        9668           2013            kcbzwb
  cache buffers chains  kcbrls: pin release       0             10319        18689          2147            kcbrls: pin release
  cache buffers chains  kcb_trim_hash_chain       0             3527         109            712             kcb_trim_hash_chain
  cache buffers chains  kcbrls: fast pin release  0             1069         10173          503             kcbrls: fast pin release
  cache buffers chains  kcbbxsv                   0             2            0              2               kcbbxsv
  cache buffers chains  kcbgcur: slow path        0             1            0              0               kcbgcur: slow path

The top Location (SLEEP_COUNT=1341199, LONGHOLD_COUNT=197030) is "kcbgtcr: fast path pin".

With one further query on x$bh, we can link the CBC_LATCH to protected block.


select * from (
with sq as (select object_name, data_object_id 
              from dba_objects where object_name like 'TEST_TAB%')
    ,bh as (select hladdr, obj, file#, dbablk, sum(tch) tch 
              from x$bh group by hladdr, obj, file#, dbablk)
select hladdr cbc_latch_addr
      ,sum(tch) tch
      ,listagg(tch || '-' || obj || '(' || object_name || ')/' || file# || '/' ||dbablk, ';') 
         within group (order by tch desc) tch_list -- "tch-obj(name)/file/blk_list"
      ,count(*) blk_cnt
from  bh, sq
where bh.obj = sq.data_object_id
  and tch > 0
  --and (hladdr like '%18B3DB828%' or hladdr like '%18B3BACA8')   
group by hladdr
order by tch desc)
where 1=1
  and (tch_list like '%TEST_TAB%');
  
  CBC_LATCH_ADDR    TCH  TCH_LIST                             BLK_CNT
  ----------------  ---  -----------------------------------  -------
  000000018B851DA8   3  3-2459816(TEST_TAB#R)/1548/2661258    1       <== root block (kdxcolev 1)
  000000018B3BC7A8 213  213-2459816(TEST_TAB#R)/1548/2661260  1       <== leaf block for row 1 to 5  (kdxcolev 0)
  000000018B57B028   5  5-2459816(TEST_TAB#R)/1548/3665613    1       <== leaf block for row 6 to 11 (kdxcolev 0)

It shows that the contention latch (000000018B57B028) has much lower TCH than that of non contention latch (000000018B3BC7A8). This observation also raises an immediate question: can we use TCH to determine CBC latch contention or Hot Block ?

4. Latch State Transition Diagram By Dtrace

Pick one CBC latch contention session (SID: 549, Oracle PID: 45 (0x2D), spid: 4671) from above v$session and v$process query.

Run following Dtrace Script on contention latch address: 0x18B57B028 (For Latch State Transition Diagram, See Blog: PL/SQL Function Result Cache Invalidation (I) ).


sudo dtrace -n \
'
typedef unsigned int ub4;

typedef struct latchstate /* latch state */ {
    ub4 nproc_pid;  /* Nproc or pid   */
    ub4 flag;       /* blocking flag  */ 
    ub4 gets;       /* number of gets */
    ub4 latch;      /* latch number   */
    ub4 level;      /* latch level    */
    ub4 where; } latchstate;
    
BEGIN {laddr = $1; step = 1; begin_gets = 0, end_gets = 0; prev_ts = timestamp;
   self->evt = "begin"; self->loc = 0x0;}
pid$target::ksl_get_shared_latch:entry /arg0 == laddr && (self->evt != probefunc || self->loc != arg3)/ { 
   printf("\n---------Step %d at %Y (Elapsed-ns=%d)---------", step, walltimestamp, (timestamp-prev_ts));
   ls = ((latchstate *) copyin(arg0, sizeof(latchstate)));
   begin_gets = (begin_gets == 0 ? ls->gets: begin_gets); end_gets = ls->gets;
   printf("\n***State Entry>>>(nproc_pid=>%d, flag=>%d, gets=>%d, latch=>%d, level=>%d, where=>%d)"
           ,ls->nproc_pid, ls->flag, ls->gets, ls->latch, ls->level, ls->where);
   printf("\n>>>Transition>>>%s:%s\n      (Addr=>0x%X, Immed_Get=>%d, Why=>%d, Where=>%d(0x%X), Mode=>%d, PID=>0x%X)", 
          probefunc, probename, arg0, arg1, arg2, arg3, arg3, arg4, arg5);
   self->evt = probefunc; self->loc = arg3;    
   ustack(10, 0);  
   /* stop(); exit(0); */    /* no latch blocking, failed get, try again */ 
   step = step + 1; self->entry = 1; prev_ts = timestamp;} 
pid$target::ksl_get_shared_latch:return /self->entry == 1/ {
   ls = ((latchstate *) copyin(laddr, sizeof(latchstate))); 
   printf("\n>>>State Return***(nproc_pid=>%d, flag=>%d, gets=>%d, latch=>%d, level=>%d, where=>%d)"
           ,ls->nproc_pid, ls->flag, ls->gets, ls->latch, ls->level, ls->where);
   /* stop(); exit(0); */      /* simulate long latch holder, run by: dtrace -w -n */
   self->entry = 2;}   
pid$target::kslfre:entry /self->entry == 2 && arg0 == laddr && self->evt != probefunc/ {
   printf("\n---------Step %d at %Y (Elapsed-ns=%d)---------", step, walltimestamp, (timestamp-prev_ts));
   ls = ((latchstate *) copyin(arg0, sizeof(latchstate)));
   end_gets = ls->gets;
   printf("\n>>>Transition>>>%s:%s(Addr=>0x%-X)", probefunc, probename, arg0);
   self->evt = probefunc; self->loc = 0x0;
   /* stop(); exit(0); */      /* simulate long latch holder, run by: dtrace -w -n */
   printf("\n===Latch freed===");
   ustack(10, 0); step = step + 1; prev_ts = timestamp;}      
END {
   printf("\n---------Step %d at %Y (Elapsed-ns=%d)---------", step, walltimestamp, (timestamp-prev_ts));
   printf("\nLatch Gets = %d (%d - %d)", (end_gets - begin_gets), (unsigned long)begin_gets, (unsigned long)end_gets); }
' -p 4671 0x18B57B028

The output is some repeated occurrences of following pattern:


   ksl_get_shared_latch:entry
---------Step 599 at 2020 Feb 20 14:10:20 (Elapsed-ns=2112100)---------
***State Entry>>>(nproc_pid=>0, flag=>0, gets=>632751866, latch=>228, level=>1, where=>2195)
>>>Transition>>>ksl_get_shared_latch:entry
      (Addr=>0x18B57B028, Immed_Get=>1, Why=>0, Where=>2298(0x8FA), Mode=>16, PID=>0x200000000000002D)
              oracle`ksl_get_shared_latch
              a.out`kcbzwb+0xa33
              a.out`kcbgtcr+0x14a77
              a.out`ktrget2+0x292
              a.out`qeilbk1+0x3cf
              a.out`qeilsr+0xbd
              a.out`qerixFetch+0x1f9
              a.out`qerstFetch+0x543
              a.out`qertbFetchByRowID+0x39e
              a.out`qerstFetch+0x4ca

   ksl_get_shared_latch:return
>>>State Return***(nproc_pid=>45, flag=>536870912, gets=>632751872, latch=>228, level=>1, where=>2298)
   kslfre:entry
---------Step 600 at 2020 Feb 20 14:10:20 (Elapsed-ns=63625)---------
>>>Transition>>>kslfre:entry(Addr=>0x18B57B028)
===Latch freed===
              oracle`kslfre
              a.out`kcbgtcr+0x540f
              a.out`ktrget2+0x292
              a.out`qeilbk1+0x3cf
              a.out`qeilsr+0xbd
              a.out`qerixFetch+0x1f9
              a.out`qerstFetch+0x543
              a.out`qertbFetchByRowID+0x39e
              a.out`qerstFetch+0x4ca
              a.out`opifch2+0x188b

It shows the CBC Latch State Transition. At first, C subroutine ksl_get_shared_latch (shared Latch (in S Mode or X Mode)) was called to get exclusive latch (Mode=>16) in entry location (where=>2195) and next location (where=>2298). Then freed by kslfre. CallStack shows that each row is accessed by qertbFetchByRowID via qeilsr (TABLE ACCESS BY INDEX via INDEX RANGE SCAN).

For the where location, we can use the query below to find them.


CREATE OR REPLACE FORCE VIEW X_GV$LATCH_MISSES as
SELECT t1.indx              INDX,
       t1.inst_id           INST_ID,       
       t1.ksllasnam         PARENT_NAME,   
       t2.ksllwnam          "WHERE",       
       t1.kslnowtf          NWFAIL_COUNT,  
       t1.kslsleep          SLEEP_COUNT,   
       t1.kslwscwsl         WTR_SLP_COUNT, 
       t1.kslwsclthg        LONGHOLD_COUNT,
       t2.ksllwnam          LOCATION,      
       t1.con_id            CON_ID         
  FROM x$ksllw t2, x$kslwsc t1
 WHERE t2.indx = t1.indx;
 
select * from sys.X_GV$LATCH_MISSES 
where parent_name in ('cache buffers chains') 
  and indx in (2195, 2298, 2162); 
  
INDX  INST_ID  PARENT_NAME           WHERE                   NWFAIL_COUNT  SLEEP_COUNT  WTR_SLP_COUNT  LONGHOLD_COUNT  LOCATION
----  -------  --------------------  ----------------------  ------------  -----------  -------------  --------------  ---------------------
2118  1        cache buffers chains  kcbchg: cr pin cleanout 0             1946366      110291         243655          kcbchg: cr pin cleanout
2195  1        cache buffers chains  kcbgtcr: fast path pin  0             2594671      5314421        307276          kcbgtcr: fast path pin
2298  1        cache buffers chains  kcbzwb                  0             59480        48922          7353            kcbzwb

In the above Dtrace script, if we uncomment the line inside "kslfre:entry":


/* stop(); exit(0); */      /* simulate long latch holder, run by: dtrace -w -n */

the process was stopped (process STATE: stop) and latch will not be freed, hence to simulate long latch holder. (v$session.state: "WAITED SHORT TIME", v$active_session_history.session_state: "ON CPU").

Repeated query on v$latchholder will show GETS is no more changed.


select * from v$latchholder;

PID  SID  LADDR             NAME                  GETS
---  ---  ----------------  --------------------  ----------
45   549  000000018B57B028  cache buffers chains  761549981

Note that if we "stop()" inside "ksl_get_shared_latch:entry", there will be no latch blocking since latch child (000000018B57B028) Get failed and the latch was not acquired. (v$session.state: "WAITING", v$active_session_history.session_state: "WAITING").

However, if we "stop()" inside "ksl_get_shared_latch:return", there will be latch blocking since latch was already acquired, it should be released before any other sessions can get it.

By the way,


  v$active_session_history.session_state

is defined as


  DECODE (a.wait_time, 0, 'WAITING', 'ON CPU')

That means if "wait_time" is not 0, the session_state is denoted as 'ON CPU'. In this example, UNIX process STATE: "stop" is conveyed to v$active_session_history.session_state: 'ON CPU'. In this case, if v$active_session_history.session_state is aggregated (in an interval) to assess UNIX CPU utilization, the outcome from Oracle will differ from real UNIX CPU utilization.

As of 12c, v$session.WAIT_TIME has been deprecated in favor of the columns WAIT_TIME_MICRO and STATE (Strangely, derived v$active_session_history.WAIT_TIME and SESSION_STATE are not yet deprecated in Oracle 12c Documentation).

5. Buffer Cache Activity Internals

In Oracle, each CBC latch protects a set of hash buckets (32). Each hash bucket is a linked list of buffer headers. Each data (disk) block is cloned as a few buffer blocks with different status (CR, XCUR) into buffer cache (Maximum Allowed Number of CR buffers per dba is controlled by "_db_block_max_cr_dba"). All buffer blocks of same data (disk) block are hashed to the same hash bucket, hence protected by the same CBC latch. If we query the new block (3665613) protected by the CBC contention latch (000000018B57B028) twice, two example outputs are:


select hladdr, decode(state,0,'free',1,'xcur',2,'scur',3,'cr', state) status, 
       decode (bitand (flag, 1),0,'N','Y') dirty, ba, addr, nxt_hash, prv_hash, tch  --, v.* 
from  x$bh v 
where dbablk in (3665613) 
order by v.state, indx;

HLADDR            STATUS  DIRTY  BA                ADDR              NXT_HASH          PRV_HASH          TCH
----------------  ------  -----  ----------------  ----------------  ----------------  ----------------  ---
000000018B57B028  xcur    Y      00000000CA3BA000  FFFF80FFBC132E60  000000018AE0DAC8  00000000CAFD2450  0
000000018B57B028  cr      N      00000000CA8E2000  FFFF80FFBC1341E8  00000000CAFED610  000000018AE0DAC8  0
000000018B57B028  cr      N      00000000CAE24000  FFFF80FFBC134090  00000000CAF9EDD0  00000000CAFB8CD0  1
000000018B57B028  cr      N      00000000CA64A000  FFFF80FFBC133F38  00000000CAF6CA10  00000000CAFED610  1
000000018B57B028  cr      N      00000000CA144000  FFFF80FFBC133DE0  00000000CAF85290  00000000CAF9EDD0  1
000000018B57B028  cr      N      00000000CA3B8000  FFFF80FFBC133110  00000000CAFD2450  00000000CAF6CA10  1
000000018B57B028  cr      N      00000000CAB6E000  FFFF80FFBC132FB8  00000000CAF853D0  00000000CAF85290  1

HLADDR            STATUS  DIRTY  BA                ADDR              NXT_HASH          PRV_HASH          TCH
----------------  ------  -----  ----------------  ----------------  ----------------  ----------------  ---
000000018B57B028  xcur    Y      00000000BE3E6000  FFFF80FFBC18FE08  00000000BEFD76D0  00000000BEF70250  0
000000018B57B028  cr      N      00000000BEE80000  FFFF80FFBC115898  00000000BEFBC650  000000018AE0DAC8  0
000000018B57B028  cr      N      00000000BE93E000  FFFF80FFBC115740  00000000C2F68690  00000000BEFF0F90  1
000000018B57B028  cr      N      00000000C20D8000  FFFF80FFBC1155E8  00000000BEF70610  00000000BEFBC650  1
000000018B57B028  cr      N      00000000BE1A4000  FFFF80FFBC115490  00000000BEFA2D90  00000000C2F68690  1
000000018B57B028  cr      N      00000000BE6B0000  FFFF80FFBC1900B8  00000000BEF70250  00000000BEF70610  1
000000018B57B028  cr      N      00000000BE19E000  FFFF80FFBC18FF60  00000000BEF86F50  00000000BEFA2D90  1
000000018B57B028  cr      N      00000000BEBF2000  FFFF80FFBC18FCB0  00000000BEFBFE90  00000000BEF86F50  0
000000018B57B028  cr      N      00000000BE998000  FFFF80FFBC18FB58  000000018AE0DAC8  00000000BEFD76D0  0

We can see that there exists one single DIRTY buffer block ('Y') with STATUS ('xcur'). The number of rows returned is changing in each execution. All other memory address fields (BA, ADDR, NXT_HASH, PRV_HASH) are different in each execution of the query. That means this buffer chain is constantly changed. For any buffer chain modification, an exclusive latch get is requested. It results in CBC latch contention. However, if we query the original block (2661260) protected by the CBC contention latch (000000018B3BC7A8) a few times, the output are the same and the 'xcur' buffer block is not DIRTY ('N').:


select hladdr, decode(state,0,'free',1,'xcur',2,'scur',3,'cr', state) status, 
       decode (bitand (flag, 1),0,'N','Y') dirty, ba, addr, nxt_hash, prv_hash, tch  --, v.* 
from  x$bh v 
where dbablk in (2661260) 
order by v.state, indx;

HLADDR            STATUS  DIRTY  BA                ADDR              NXT_HASH          PRV_HASH          TCH
----------------  ------  -----  ----------------  ----------------  ----------------  ----------------  ---
000000018B3BC7A8  xcur    N      00000000CA14E000  FFFF80FFBC133110  00000000CAF85A10  00000000CAF9EF10  45
000000018B3BC7A8  cr      N      00000000CAB76000  FFFF80FFBC133F38  00000000CAF9EF10  000000018A7115B8  1
000000018B3BC7A8  cr      N      00000000CA64C000  FFFF80FFBC133DE0  00000000CAF6D050  00000000CAFD2950  1
000000018B3BC7A8  cr      N      00000000CA3C4000  FFFF80FFBC132FB8  00000000CAFB8E10  00000000CAF6D050  1
000000018B3BC7A8  cr      N      00000000CA8E4000  FFFF80FFBC132E60  000000018A7115B8  00000000CAF85A10  1

If we dump the contention buffer block (3665613), the buffer head section looks like:


-- alter system dump datafile 1548 block 3665613; 

BH (0xcafd2618) file#: 1548 rdba: 0x0037eecd (1024/3665613) class: 1 ba: 0xcab72000
  hash: [0xcaf9ec90,0x18ae0dac8] lru: [0xcafd2200,0x1897cc750]
  st: CR md: NULL tch: 1
  flags: only_sequential_access
BH (0xcaf9ebd8) file#: 1548 rdba: 0x0037eecd (1024/3665613) class: 1 ba: 0xca648000
  hash: [0xcaf6c8d0,0xcafd26d0] lru: [0xcaf9ea40,0x1897cb860]
  st: CR md: NULL tch: 1
  flags: only_sequential_access
BH (0xcaf6c818) file#: 1548 rdba: 0x0037eecd (1024/3665613) class: 1 ba: 0xca142000
  hash: [0xcafb8f50,0xcaf9ec90] lru: [0xcaf6ccc0,0x1897cd640]
  st: CR md: NULL tch: 1
  flags: only_sequential_access
BH (0xcafb8e98) file#: 1548 rdba: 0x0037eecd (1024/3665613) class: 1 ba: 0xca8e6000
  hash: [0xcaf84ed0,0xcaf6c8d0] lru: [0xcafb8800,0x1897cbfd8]
  st: CR md: NULL tch: 1
  flags: only_sequential_access
BH (0xcaf84e18) file#: 1548 rdba: 0x0037eecd (1024/3665613) class: 1 ba: 0xca3b2000
  hash: [0xcaf853d0,0xcafb8f50] lru: [0xcaf85400,0x1897cb0e8]
  st: CR md: NULL tch: 1
  flags: only_sequential_access
BH (0xcaf85318) file#: 1548 rdba: 0x0037eecd (1024/3665613) class: 1 ba: 0xca3ba000
  set: 13 pool: 3 bsz: 8192 bsi: 0 sflg: 2 pwc: 14,19
  dbwrid: 0 obj: 2459815 objn: 2459815 tsn: [0/1999] afn: 1548 hint: f
  hash: [0x18ae0dac8,0xcaf84ed0] lru: [0xcaf85180,0xcaf84f00]
  obj-flags: object_ckpt_list
  ckptq: [0x18be25400,0xfffab158] fileq: [0x18be31580,0x18be31580]
  objq: [0x1750dac40,0x1750dac40] objaq: [0xcaf8af78,0x1750dac20]
  use: [0x18bab8240,0x18bab8240] wait: [0x189465eb8,0x18bab9860]
  st: XCURRENT md: EXCL fpin: 'ktspbwh2: ktspfmdb' fscn: 0x896.f43f474c tch: 0
  flags: buffer_dirty mod_started block_written_once
  change state: NOT_ACTIVE_YET
  change count: 1
  LRBA: [0x1a9a.d1c5d.0] LSCN: [0x896.f4ef66c5] HSCN: [0x896.f50b9757] HSUB: [1]

There are 6 buffer blocks, one is with "st: XCURRENT md: EXCL", all other with "st: CR md: NULL". If we pick all the hash fields, we can build a linked list: (see Blog: [Oracle] Database cache internals and why one data block can use multiple cache buffers)


                          NXT_HASH   PRV_HASH         Link_List
                         ----------- -----------      ---------
1. BH (0xcafd2618) hash: [0xcaf9ec90,0x18ae0dac8]     1    -> 3
2. BH (0xcaf9ebd8) hash: [0xcaf6c8d0, 0xcafd26d0]     head -> 4  
3. BH (0xcaf6c818) hash: [0xcafb8f50, 0xcaf9ec90]     3    -> 5
4. BH (0xcafb8e98) hash: [0xcaf84ed0, 0xcaf6c8d0]     4    -> 6
5. BH (0xcaf84e18) hash: [0xcaf853d0, 0xcafb8f50]     5    -> tail
6. BH (0xcaf85318) hash: [0x18ae0dac8,0xcaf84ed0]     6    -> 1

If we look the index row section of above buffer dump, we can see the new inserted row (id: 4708) are in the same block as split id from 6 to 10 (all ID are reversed), and row (id: 4708) is locked ("lock: 2") by Itl (0x02).


 seg/obj: 0x2588a7  csc: 0x896.f4ef5c84  itc: 2  flg: E  typ: 2 - INDEX
 
 Itl           Xid                  Uba         Flag  Lck        Scn/Fsc
0x01   0x0002.004.00dbbdfb  0x00c022d5.b648.0a  C---    0  scn 0x0896.f43f4854
0x02   0x0004.004.00769215  0x00c021d1.39b9.27  ----    1  fsc 0x0000.00000000
Leaf block dump
===============
kdxcolev 0
kdxconro 263
...
row#20[4884] flag: -------, lock: 0, len=12
col 0; len 2; (2):  07 c1                    REVERSE=> c1 07    => 6
col 1; len 6; (6):  00 28 2b 85 00 05
row#69[5519] flag: -------, lock: 0, len=12
col 0; len 2; (2):  08 c1                    REVERSE=> c1 08    => 7
col 1; len 6; (6):  00 28 2b 85 00 06
row#117[4611] flag: -------, lock: 2, len=13
col 0; len 3; (3):  09 30 c2                 REVERSE=> c2 30 09 => 4708
col 1; len 6; (6):  00 37 ee c2 00 16
row#118[6154] flag: -------, lock: 0, len=12
col 0; len 2; (2):  09 c1                    REVERSE=> c1 09    => 8
col 1; len 6; (6):  00 28 2b 85 00 07
row#166[6776] flag: -------, lock: 0, len=12
col 0; len 2; (2):  0a c1                    REVERSE=> c1 0a    => 9
col 1; len 6; (6):  00 28 2b 85 00 08
row#214[7398] flag: -------, lock: 0, len=12
col 0; len 2; (2):  0b c1                    REVERSE=> c1 0b    => 10
col 1; len 6; (6):  00 28 2b 85 00 09

If we make 10200 and 10046 trace for 100 loops for id from 6 to 10, we can see 500 "ktrgcm()" calls, and a few long 'latch: cache buffers chains' WAIT (see Blog: Oracle Unique Index Non-Consistency Read with RowCR). 'xcur' (state 1) buffer block is finally returned after 5 ktrexf() showing buffer of 'cr' (state 3).


alter session set events='10046 trace name context forever, level 12: 
                          10200 trace name context forever, level 10' 
      tracefile_identifier='10046_10200_index_r_2';  
exec test_tab_select(6, 100, 6, 0.00);
alter session set events='10046 trace name context off : 10200 trace name context off ';

ktrget2(): started for block  <0x07cf : 0x0037eecd> objd: 0x002588a7     
showing buffer 0x9ffeda58 rdba 0x37eecd state 3
ktrexf(): returning 9 on:  9ffedad0  cr-scn: 0x0897.0ed85359   xid: 0x0000.000.00000000
showing buffer 0x9ffd3a18 rdba 0x37eecd state 3
ktrexf(): returning 9 on:  9ffd3a90  cr-scn: 0x0897.0ed85358   xid: 0x0004.004.00769215
showing buffer 0x9ffb5298 rdba 0x37eecd state 3
ktrexf(): returning 1 on:  9ffb5310  cr-scn: 0x0897.0ed85358   xid: 0x0004.004.00769215
showing buffer 0x9ffed558 rdba 0x37eecd state 3
ktrexf(): returning 9 on:  9ffed5d0  cr-scn: 0x0897.0ed85357   xid: 0x0004.004.00769215
showing buffer 0x9ff9bb18 rdba 0x37eecd state 3
ktrexf(): returning 1 on:  9ff9bb90  cr-scn: 0x0897.0ed85356   xid: 0x0004.004.00769215
showing buffer 0x9ff94f98 rdba 0x37eecd state 1
ktrexf(): returning 1 on:  ffff80ffbccf4e70  cr-scn: 0xffff.ffffffff   xid: 0x0000.000.00000000
ktrgcm(): completed for block  <0x07cf : 0x0037eecd> objd: 0x002588a7
ktrget2(): completed for  block <0x07cf : 0x0037eecd> objd: 0x002588a7
......

WAIT #18446604434568798512: nam='latch: cache buffers chains' ela= 151231 address=6635150120 number=228 tries=0

-- Legend
--   DATA_OBJECT_ID: 2459815 = 0x002588a7
--   new index block 3665613 = 0x0037eecd
--   tablespace ts#     1999 = 0x07cf
--   state 1 = 'xcur'
--   state 3 = 'cr'

Above 10200 trace shows that we have to traverse 5 'cr' (state 3) buffer blocks in the CBC chain before reaching 'xcur' (state 1) buffer block, and construct the required before-image buffer block. Then require CBC latch in exclusive mode to link the new buffer block into the CBC chain.

A query on v$sesstat reveals heavy CR buffer clones (by applying undo records) in the sessions of second group (for id from 6 to 10), but not the case of first group (for id from 1 to 5).


select s.client_identifier, st.sid, n.name, st.value, 
       value-lag(value) over(partition by s.sid order by n.name) delta 
from v$sesstat st, v$statname n, v$session s
where st.statistic# = n.statistic#
  and n.name in ('CR blocks created', 'data blocks consistent reads - undo records applied')
  and st.sid = s.sid
  and lower(s.program) like '%j0%'
order by s.client_identifier, s.sid, n.name;

CLIENT_IDENTIFIER     SID  NAME                                                 VALUE     DELTA
--------------------  ---  ---------------------------------------------------  --------  ------
test_tab_select_id_1  900  CR blocks created                                    1
test_tab_select_id_1  900  data blocks consistent reads - undo records applied  0         -1
test_tab_select_id_1  7    CR blocks created                                    4
test_tab_select_id_1  7    data blocks consistent reads - undo records applied  3         -1
test_tab_select_id_1  186  CR blocks created                                    1
test_tab_select_id_1  186  data blocks consistent reads - undo records applied  2         1
test_tab_select_id_1  369  CR blocks created                                    1
test_tab_select_id_1  369  data blocks consistent reads - undo records applied  1         0
test_tab_select_id_1  547  CR blocks created                                    0
test_tab_select_id_1  547  data blocks consistent reads - undo records applied  0         0
test_tab_select_id_1  903  CR blocks created                                    0
test_tab_select_id_1  903  data blocks consistent reads - undo records applied  0         0

CLIENT_IDENTIFIER     SID  NAME                                                 VALUE     DELTA
--------------------  ---  ---------------------------------------------------  --------  ------
test_tab_select_id_6  366  CR blocks created                                    62217832
test_tab_select_id_6  366  data blocks consistent reads - undo records applied  62134369  -83463
test_tab_select_id_6  9    CR blocks created                                    62673868
test_tab_select_id_6  9    data blocks consistent reads - undo records applied  62663191  -10677
test_tab_select_id_6  189  CR blocks created                                    62222626
test_tab_select_id_6  189  data blocks consistent reads - undo records applied  62255779  33153
test_tab_select_id_6  549  CR blocks created                                    62098359
test_tab_select_id_6  549  data blocks consistent reads - undo records applied  62139622  41263
test_tab_select_id_6  726  CR blocks created                                    62263906
test_tab_select_id_6  726  data blocks consistent reads - undo records applied  62293706  29800
test_tab_select_id_6  906  CR blocks created                                    62227364
test_tab_select_id_6  906  data blocks consistent reads - undo records applied  62217302  -10062

Finally, if we commit the single row insert, CBC latch contention disappears immediately.

For more discussion on reverse key index, see:
Blog: reverse key index: the remedy is worse than the symptoms
and right linked book: Oracle Database Performance Tuning (Studies . Practices . Researches) - Chapter 3: Locks, Latches and Mutexes

6. Data Access Path (Xplan) and Latch Locations

In the above test, we use INDEX RANGE SCAN to access table row:


select /*+ job 4, id 6 */ val from test_tab where id = 7
 
--------------------------------------------------------------------------------------------------
| Id  | Operation                           | Name       | Rows  | Bytes | Cost (%CPU)| Time     |
--------------------------------------------------------------------------------------------------
|   0 | SELECT STATEMENT                    |            |       |       |     1 (100)|          |
|   1 |  TABLE ACCESS BY INDEX ROWID BATCHED| TEST_TAB   |     1 |    26 |     1   (0)| 00:00:01 |
|*  2 |   INDEX RANGE SCAN                  | TEST_TAB#R |     1 |       |     1   (0)| 00:00:01 |
--------------------------------------------------------------------------------------------------

If we only use INDEX RANGE SCAN without accessing table row:
(In procedure test_tab_select, use second query statement)


select /*+ job 4, id 6 */ id from test_tab where id = 7
 
-------------------------------------------------------------------------------
| Id  | Operation        | Name       | Rows  | Bytes | Cost (%CPU)| Time     |
-------------------------------------------------------------------------------
|   0 | SELECT STATEMENT |            |       |       |     1 (100)|          |
|*  1 |  INDEX RANGE SCAN| TEST_TAB#R |     1 |    13 |     1   (0)| 00:00:01 |
-------------------------------------------------------------------------------

The top Location (most SLEEP_COUNT) is changed to "kcbchg: cr pin cleanout" (INDX=2118) instead of "kcbgtcr: fast path pin" (INDX=2195). Here the Dtrace State Transition Diagram (CallStack has no more "qertbFetchByRowID"):


   ksl_get_shared_latch:entry
---------Step 2621 at 2020 Feb 20 15:53:35 (Elapsed-ns=5633)---------
***State Entry>>>(nproc_pid=>0, flag=>0, gets=>1414234634, latch=>228, level=>1, where=>2118)
>>>Transition>>>ksl_get_shared_latch:entry
      (Addr=>0x18B57B028, Immed_Get=>1, Why=>0, Where=>2298(0x8FA), Mode=>16, PID=>0x2000000000000019)
              oracle`ksl_get_shared_latch
              a.out`kcbzwb+0xa33
              a.out`kcbgtcr+0x14a77
              a.out`ktrget2+0x292
              a.out`qeilbk1+0x3cf
              a.out`qeilsr+0xbd
              a.out`qerixFetch+0x1f9
              a.out`qerstFetch+0x4ca
              a.out`opifch2+0x188b
              a.out`opiefn0+0x1f7

   ksl_get_shared_latch:return
>>>State Return***(nproc_pid=>25, flag=>536870912, gets=>1414234638, latch=>228, level=>1, where=>2298)

7. Test Code

7.1. leaf_split_finder


create or replace procedure leaf_split_finder(p_find_cnt number) as
  type t_blk_tab is table of varchar2(100) index by pls_integer;
  l_blk_tab         t_blk_tab;
  l_object_id       number;
  l_stmt            varchar2(32767);
begin
 select object_id into l_object_id from dba_objects where object_name = 'TEST_TAB#R';
  execute immediate 'truncate table test_tab';
  
  l_stmt := q'[
      select 'block: '||blk||', cnt: '||cnt||', min_id: '||min_id||', max_id: '||max_id 
      from (
        select blk, count(*) cnt, min(id) min_id, max(id) max_id from (
         select id, dbms_rowid.rowid_block_number(sys_op_lbid (]'||l_object_id||q'[, 'L', rowid)) blk   
           from test_tab)  
       where id between 1 and 11  -- Note using "id in (1, 6)", Toad crashed due to "sys_op_lbid"
       group by blk order by min_id, blk)               
      ]';
  
  for i in 1..p_find_cnt loop
    insert into test_tab values(i, -i);
   execute immediate l_stmt bulk collect into l_blk_tab;
   if l_blk_tab.count >= 2 then
     dbms_output.put_line('--- Leaf Split at row count: '||i||
                          ', (id between 1 and 11) in '||l_blk_tab.count||' Blocks');
     for i in 1..l_blk_tab.count loop
      dbms_output.put_line(l_blk_tab(i));
    end loop;
    exit;
  end if; 
  end loop;
  rollback;  
end;
/

  -- exec leaf_split_finder(5000);
  -- 
  --   --- Leaf Spli at row count: 4707, (id between 1 and 11) in 2 Blocks
  --   block: 3665598, cnt: 5, min_id: 1, max_id: 5
  --   block: 3665619, cnt: 6, min_id: 6, max_id: 11

7.2. test_tab_select_job


create or replace procedure test_tab_select (p_job_id number, p_cnt number, p_id number, p_sleep number) as
  l_stmt varchar2(1000);
  l_id   number;
  l_ret  number;  
begin
 dbms_session.set_identifier('test_tab_select_id_'||p_id);
  for i in 1..p_cnt loop
    for r in 1..5 loop
      l_id := p_id + r - 1;
      l_stmt := 'select /*+ job '||p_job_id||', id '||p_id||' */ val from test_tab where id = '||l_id;
      -- select id from test_tab, only using reverse index test_tab#r, no table access 
      --l_stmt := 'select /*+ job '||p_job_id||', id '||p_id||' */ id from test_tab where id = '||l_id;
      -- dynamic statement makes each session has a different statement and avoid cusor pin S
      execute immediate l_stmt into l_ret;  
    end loop;
    if p_sleep > 0 then 
      dbms_lock.sleep(p_sleep);
    end if;
  end loop;
end;
/

  -- exec test_tab_select(1, 100, 1, 0.00);
  -- exec test_tab_select(6, 100, 6, 0.00);

create or replace procedure test_tab_select_job(p_job_cnt number, p_cnt number, p_id number, p_sleep number) as
  l_job_id     pls_integer;
begin
  for i in 1.. p_job_cnt loop
   dbms_job.submit(l_job_id, 
    'begin test_tab_select('||i||','||p_cnt||','||p_id||','||p_sleep||');  end;');
  end loop;
  commit;
end;    
/

  -- exec clean_jobs;
  -- exec test_tab_select_job(6, 1e9, 1, 0);
  -- exec test_tab_select_job(6, 1e9, 6, 0); 
  -- select * from dba_jobs;                  
  
create or replace procedure clean_jobs as
begin
  for c in (select job from dba_jobs) loop
    begin
       dbms_job.remove (c.job);
    exception when others then null;
    end;
    commit;
  end loop;

  for c in (select d.job, d.sid, (select serial# from v$session where sid = d.sid) ser 
              from dba_jobs_running d) loop
    begin
      execute immediate
             'alter system kill session '''|| c.sid|| ',' || c.ser|| ''' immediate';
      dbms_job.remove (c.job);
    exception when others then null;
    end;
    commit;
  end loop;
  
  -- select * from dba_jobs;
  -- select * from dba_jobs_running;
end;
/

  -- exec clean_jobs; -- stop all jobs