Wednesday, January 24, 2018

Oracle Logical Read: Current Gets Access Path and Cost

Oracle Logical Reads (Buffer Gets) fetch data from Buffer Cache (Memory) in two different Modes: Current Mode Get (DB Block Gets) and Consistent Mode Get (Consistent Gets). Blocks in Consistent Mode are the memory versions at the POINT IN TIME the query started, whereas Blocks in Current Mode are the versions at CURRENT TIME (right now). Each block can have multiversion Clones in Consistent Mode, but maximum one single version in Current Mode.

Following the discussions of Blog: Oracle Physical Read Access Path and Cost, and Oracle Logical Read: Consistent Gets Access Path and Cost, now is the time to look into second part of Logical Reads: Current Mode Gets.

We will use SQL Trace Event 10046 to display the statistics, and dtrace script: dtracelio.d published in Blog: Dynamic tracing of Oracle logical I/O to show block read details and sequence. Ideally Oracle could provide one Event to trace Current Mode Get, which is similar to Event 10200 ("consistent read buffer status") for Consistent Mode Get.

Blog: Dynamic tracing of Oracle logical I/O: part 2. Dtrace LIO v2 is released includes a detail description of Oracle internal functions:
Consistent gets:
    kcbgtcr   – Kernel Cache Buffer Get Consistent Read. This is general entry point for consistent read.
    kcbldrget – Kernel Cache Buffer Load Direct-Read Get. The function performing direct-path read. 
                Interesting detail: 10.2 the function kcbldrget is called just after kcbgtcr, 
                in 11.2 by (from) kcbgtcr.

Current gets (db block gets):
    kcbgcur – Kernel Cache Buffer Get Current Read
    kcbget  – Kernel Cache Buffer Get Buffer. This is analogue of kcbgcur function, 
              as I observed this function is called for index branch and leaf blocks
    kcbnew  – Kernel Cache Buffer New Buffer
    kcblnb (kcblnb_dscn in 11.2) – Kernel Cache Buffer Load New Buffer. 
              The function performing direct-path load. 
              Decoding of block coordinates is not supported for this function 
              in the current version of DTraceLIO. 
              The parameters such as object_id, data_object_id will be shown as 0 (zero)
Note: All tests are done in Oracle 12.1.0.2 on Solaris with 6 physical processors.


1. Test Setup



drop tablespace ksun_tts  including contents and datafiles;
drop tablespace ksun_its1 including contents and datafiles;
drop tablespace ksun_its2 including contents and datafiles;

create tablespace ksun_tts  datafile '/oratestdb/oradata/testdb/ksun_tts.dbf'  size 200m online;
create tablespace ksun_its1 datafile '/oratestdb/oradata/testdb/ksun_its1.dbf' size 200m online;
create tablespace ksun_its2 datafile '/oratestdb/oradata/testdb/ksun_its2.dbf' size 200m online;

drop table ksun_tab;

create table ksun_tab tablespace ksun_tts 
    as select trunc(level/1000) type_id, mod(level, 100) id1, trunc(mod(level, 1000)/100) id2, -1234 dummy 
  from dual connect by level <= 4099;

create unique index ksun_tab#u#1 on ksun_tab (type_id, id1, id2, dummy) tablespace ksun_its1;
create unique index ksun_tab#u#2 on ksun_tab (type_id, id2, id1)        tablespace ksun_its2;

exec dbms_stats.gather_table_stats(null, 'KSUN_TAB', cascade=>true);

select name, ts# from v$tablespace  where name like '%KSUN%' or ts# in (2, 1999);
  --UNDO         2
  --U1        1999
  --KSUN_TTS  2231
  --KSUN_ITS1 2232
  --KSUN_ITS2 2233

select rfile#, name, v.* from  v$datafile v where name like '%ksun%';
  --917 /oratestdb/oradata/testdb/ksun_tts.dbf
  --931 /oratestdb/oradata/testdb/ksun_its1.dbf
  --101 /oratestdb/oradata/testdb/ksun_its2.dbf

select object_name, object_id from dba_objects where object_name like 'KSUN_TAB%' or object_id in (1577349);
  --RMTAB$       1577349
  --KSUN_TAB     2286270
  --KSUN_TAB#U#1 2286271
  --KSUN_TAB#U#2 2286272
  
select segment_name, blocks, v.* from dba_segments v where segment_name like 'KSUN_TAB%';
  --KSUN_TAB     16
  --KSUN_TAB#U#1 24
  --KSUN_TAB#U#2 16

select 'KSUN_TAB', count(distinct dbms_rowid.rowid_relative_fno (rowid)||'-'||dbms_rowid.rowid_block_number (rowid)) used_blocks from KSUN_TAB;
  --KSUN_TAB     11

select 'KSUN_TAB#U#1', count(distinct dbms_rowid.rowid_block_number(sys_op_lbid (2286271, 'L', t.rowid))) used_blocks from KSUN_TAB t;
  --KSUN_TAB#U#1 14

select 'KSUN_TAB#U#2', count(distinct dbms_rowid.rowid_block_number(sys_op_lbid (2286272, 'L', t.rowid))) used_blocks from KSUN_TAB t;
  -- KSUN_TAB#U#2 11

-- list different functions used to perform different types of logical I/O
select indx, kcbwhdes from x$kcbwh where indx in (799, 1060, 1062, 1066, 1153, 1199);
  --799   ktewh26:  kteinpscan
  --1060  kdiwh15:  kdifxs
  --1062  kdiwh17:  kdifind
  --1066  kdiwh22:  kdifind
  --1153  kdiwh169: skipscan
  --1199  kddwh01:  kdddel


2. Tests


We will perform 3 Tests, the first one uses 2 different indexes, the second uses only one index, and the last one uses the same index twice.


2.1 Test_1



--******************************* SQL Trace *******************************--
delete /*+ test_1 use_concat index_ss(@DEL$1 T@DEL$1 ksun_tab#u#2) 
  index_ss(@DEL$1_2 T@DEL$1_2 ksun_tab#u#1) */ ksun_tab t where 99 in (id2, id1)

-- rollback; -- make test repeatable

call     count       cpu    elapsed       disk      query    current        rows
------- ------  -------- ---------- ---------- ---------- ----------  ----------
Parse        1      0.00       0.00          0          0          0           0
Execute      1      0.00       0.00          0         12        211          41
Fetch        0      0.00       0.00          0          0          0           0
------- ------  -------- ---------- ---------- ---------- ----------  ----------
total        2      0.00       0.00          0         12        211          41

Rows (avg) Row Source Operation
---------- ---------------------------------------------------
         0 DELETE  KSUN_TAB (cr=12 pr=0 pw=0 time=1569 us)
        41  CONCATENATION  (cr=12 pr=0 pw=0 time=356 us)
         0   INDEX SKIP SCAN KSUN_TAB#U#2 (cr=6 pr=0 pw=0 time=82 us cost=3 size=14 card=1)(object id 2286272)
        41   INDEX SKIP SCAN KSUN_TAB#U#1 (cr=6 pr=0 pw=0 time=228 us cost=8 size=574 card=41)(object id 2286271)


--******************************* dtrace *******************************--
SQL> delete /*+ test_1 use_concat index_ss(@DEL$1 T@DEL$1 ksun_tab#u#2) 
       index_ss(@DEL$1_2 T@DEL$1_2 ksun_tab#u#1) */ ksun_tab t where 99 in (id2, id1);
   41 rows deleted.

     -- rollback; -- make test repeatable


$ > dtracelio.d 23585

Dynamic tracing of Oracle logical I/O v2.1 by Alexander Anokhin ( http://alexanderanokhin.wordpress.com )

  108075: 108075 kcbgtcr(0xFFFFFD7FFAE17D58,0,1153,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1153 exam: 0
  124990:   4336 kcbgtcr(0xFFFFFD7FFAE17D58,0,1153,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1153 exam: 0
    --Repeated lines removed--
    
  424351:   4783 kcbgtcr(0xFFFFFD7FFA0EF830,0,1153,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1153 exam: 0
  428303:   2234 kcbgtcr(0xFFFFFD7FFA0EF830,0,1153,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1153 exam: 0
    --Repeated lines removed--
    
  511748:  21448 kcbgcur(0xFFFFFD7FFAED0EB0,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  524191:   3355 kcbgcur(0xFFFFFD7FFAED0EB0,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  658937:   5220 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  663567:   2134 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  695708:  18720 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2232 rdba: 0xe8c00087 (931/135) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  707668:   2415 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2232 rdba: 0xe8c00087 (931/135) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  745953:   5864 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  750657:   2105 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  759713:   4981 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2233 rdba: 0x19400084 (101/132) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
  764240:   1989 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2233 rdba: 0x19400084 (101/132) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
  
  784175:   3267 kcbgcur(0xFFFFFD7FFAED0EB0,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  789056:   2245 kcbgcur(0xFFFFFD7FFAED0EB0,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  806713:   2546 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  811388:   2215 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  818717:   3387 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2232 rdba: 0xe8c00087 (931/135) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  823620:   2283 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2232 rdba: 0xe8c00087 (931/135) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  833734:   2543 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  838017:   1949 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  843481:   2253 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2233 rdba: 0x19400084 (101/132) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
  847875:   1988 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2233 rdba: 0x19400084 (101/132) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
  --Repeated lines removed (40 Repeats)--    


Pick one line:  
  524191:   3355 kcbgcur(0xFFFFFD7FFAED0EB0,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
You can see here:
  timestamp  - 524191, 
  delta time - 3355 (from last timestamp to current timestamp),
  kcbgcur(0xFFFFFD7FFAED0EB0,2,1199,0) – it is function call with 3 arguments,
  tsn: 2231        – a tablespace number, ts# from v$tablespace for KSUN_TTS,
  rdba: 0xe5400083 – a relative dba (data block address)
  (917/131)        – file 917 (ksun_tts.dbf) block 131
  obj: 2286270     – dictionary object number, object_id from dba_objects for KSUN_TAB,
  dobj: 2286270    – data object number, data_object_id from dba_objects for KSUN_TAB,
  where: 1199      – location from function (kcbgcur in this case) was executed. This is INDX from x$kcbwf for "kddwh01: kdddel".

===================== Logical I/O Summary (grouped by object/function) ==============
 function    stat   object_id   data_object_id   mode_held   where     bufs     calls
--------- ------- ----------- ---------------- ----------- ------- -------- ---------
  kcbgcur      cu           0               -1           2      53        2         2
  kcbgcur      cu           0               -1           2      88        2         2
  kcbgcur      cu           0               -1           2      86        4         4
   kcbnew      cu           0               -1                  47        4         4
  kcbgtcr      cr     2286271          2286271                1153       12        12
  kcbgtcr      cr     2286272          2286272                1153       12        12
  kcbgcur      cu     2286270          2286270           2    1199       82        82
  kcbgcur      cu     2286271          2286271           1    1062       82        82
  kcbgcur      cu     2286272          2286272           1    1062       82        82
   kcbget      cu     2286271          2286271           2    1066       82        82
   kcbget      cu     2286272          2286272           2    1066       82        82
=====================================================================================

================================= Logical I/O Summary (grouped by object) ==========================================
 object_id  data_object_id       lio        cr    cr (e)    cr (d)        cu    cu (d) ispnd (Y) ispnd (N)   pin rls
---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------
         0              -1        12         0         0         0        12         0         0         0         4
   2286270         2286270        82         0         0         0        82         0         0         0        82
   2286271         2286271       176        12         0         0       164         0         0         0        12
   2286272         2286272       176        12         0         0       164         0         0         0        12
---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------
     total                       446        24         0         0       422         0         0         0       110
====================================================================================================================

Legend
  lio      : logical gets (cr + cu)
  cr       : consistent gets
  cr (e)   : consistent gets - examination
  cr (d)   : consistent gets direct
  cu       : db block gets
  cu (d)   : db block gets direct
  ispnd (Y): buffer is pinned count
  ispnd (N): buffer is not pinned count
  pin rls  : pin releases


2.2 Test_2



--******************************* SQL Trace *******************************--
delete /*+ test_2 index(t ksun_tab#U#2)  */ ksun_tab t where 99 in (id2, id1)

-- rollback; -- make test repeatable

call     count       cpu    elapsed       disk      query    current        rows
------- ------  -------- ---------- ---------- ---------- ----------  ----------
Parse        1      0.00       0.00          0          0          0           0
Execute      1      0.00       0.00          0         12         75          41
Fetch        0      0.00       0.00          0          0          0           0
------- ------  -------- ---------- ---------- ---------- ----------  ----------
total        2      0.00       0.00          0         12         75          41

Rows (avg) Row Source Operation
---------- ---------------------------------------------------
         0 DELETE  KSUN_TAB (cr=12 pr=0 pw=0 time=1025 us)
        41  INDEX FULL SCAN KSUN_TAB#U#2 (cr=12 pr=0 pw=0 time=405 us cost=6 size=574 card=41)(object id 2286272)


--******************************* dtrace *******************************--
SQL > delete /*+ test_2 index(t ksun_tab#U#2) */ ksun_tab t where 99 in (id2, id1);
   41 rows deleted.
   
      -- rollback; -- make test repeatable
   
$ > dtracelio.d 23585

Dynamic tracing of Oracle logical I/O v2.1 by Alexander Anokhin ( http://alexanderanokhin.wordpress.com )

  233462:  22641 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  245054:   2829 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  365187:   2913 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  369681:   1947 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  404103:   2826 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  408598:   2029 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  429972:   2233 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  434528:   2212 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  453201:   6922 kcbgtcr(0xFFFFFD7FFAED1730,0,1060,0) [tsn: 2233 rdba: 0x19400085 (101/133) obj: 2286272 dobj: 2286272] where: 1060 exam: 0
  458011:   2301 kcbgtcr(0xFFFFFD7FFAED1730,0,1060,0) [tsn: 2233 rdba: 0x19400085 (101/133) obj: 2286272 dobj: 2286272] where: 1060 exam: 0 
  479314:   2777 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  484221:   2266 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  506878:   2334 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  511463:   2202 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  532201:   2334 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  536500:   1965 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  556552:   2244 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  560690:   1834 kcbgcur(0xFFFFFD7FFAED9270,2,1199,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  577493:   3208 kcbgtcr(0xFFFFFD7FFAED1730,0,1060,0) [tsn: 2233 rdba: 0x19400086 (101/134) obj: 2286272 dobj: 2286272] where: 1060 exam: 0
  581798:   2276 kcbgtcr(0xFFFFFD7FFAED1730,0,1060,0) [tsn: 2233 rdba: 0x19400086 (101/134) obj: 2286272 dobj: 2286272] where: 1060 exam: 0
    --Repeated lines removed--
  
  1786824:   5448 kcbgcur(0xFFFFFD7FFFDF3200,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  1791226:   2020 kcbgcur(0xFFFFFD7FFFDF3200,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  1830860:  17280 kcbget(0xFFFFFD7FFFDF3200,2,1066,0) [tsn: 2233 rdba: 0x19400084 (101/132) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
  1841407:   2737 kcbget(0xFFFFFD7FFFDF3200,2,1066,0) [tsn: 2233 rdba: 0x19400084 (101/132) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
  1885701:   3609 kcbgcur(0xFFFFFD7FFFDF3200,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  1890635:   1952 kcbgcur(0xFFFFFD7FFFDF3200,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  1904651:   3127 kcbget(0xFFFFFD7FFFDF3200,2,1066,0) [tsn: 2233 rdba: 0x19400085 (101/133) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
  1909925:   2340 kcbget(0xFFFFFD7FFFDF3200,2,1066,0) [tsn: 2233 rdba: 0x19400085 (101/133) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
   --Repeated lines removed--   
     
     
  2202041:   5066 kcbgcur(0xFFFFFD7FFFDF3200,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  2206643:   2018 kcbgcur(0xFFFFFD7FFFDF3200,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  2220344:   5196 kcbget(0xFFFFFD7FFFDF3200,2,1066,0) [tsn: 2232 rdba: 0xe8c00087 (931/135) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  2224948:   2136 kcbget(0xFFFFFD7FFFDF3200,2,1066,0) [tsn: 2232 rdba: 0xe8c00087 (931/135) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  2242122:   2441 kcbgcur(0xFFFFFD7FFFDF3200,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  2246527:   2011 kcbgcur(0xFFFFFD7FFFDF3200,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  2255942:   2484 kcbget(0xFFFFFD7FFFDF3200,2,1066,0) [tsn: 2232 rdba: 0xe8c0008a (931/138) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  2260392:   1987 kcbget(0xFFFFFD7FFFDF3200,2,1066,0) [tsn: 2232 rdba: 0xe8c0008a (931/138) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
   --Repeated lines removed--   


===================== Logical I/O Summary (grouped by object/function) ==============
 function    stat   object_id   data_object_id   mode_held   where     bufs     calls
--------- ------- ----------- ---------------- ----------- ------- -------- ---------
  kcbgcur      cu           0               -1           2      53        2         2
  kcbgcur      cu           0               -1           2      86        2         2
  kcbgcur      cu           0               -1           2      88        2         2
  kcbgtcr      cr     2286272          2286272                1298        2         2
  kcbgtcr      cr     2286272          2286272                1299        2         2
   kcbnew      cu           0               -1                  47        2         2
  kcbgcur      cu     2286271          2286271           1    1062        8         8
   kcbget      cu     2286271          2286271           2    1066        8         8
  kcbgtcr      cr     2286272          2286272                1060       20        20
  kcbgcur      cu     2286272          2286272           1    1062       22        22
   kcbget      cu     2286272          2286272           2    1066       22        22
  kcbgcur      cu     2286270          2286270           2    1199       82        82
=====================================================================================

================================= Logical I/O Summary (grouped by object) ==========================================
 object_id  data_object_id       lio        cr    cr (e)    cr (d)        cu    cu (d) ispnd (Y) ispnd (N)   pin rls
---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------
         0              -1         8         0         0         0         8         0         0         0         2
   2286270         2286270        82         0         0         0        82         0         0         0        82         
   2286271         2286271        16         0         0         0        16         0         0         0         0
   2286272         2286272        68        24         2         0        44         0         0         2        18
---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------
     total                       174        24         2         0       150         0         0         2       102
====================================================================================================================


2.3 Test_3



--******************************* SQL Trace *******************************--
delete /*+ test_3 use_concat index_ss(@DEL$1_1 T@DEL$1 ksun_tab#u#2) 
  index_ffs(@DEL$1_2 T@DEL$1_2 ksun_tab#u#2) */ ksun_tab t where 99 in (id2, id1)


call     count       cpu    elapsed       disk      query    current        rows
------- ------  -------- ---------- ---------- ---------- ----------  ----------
Parse        1      0.00       0.00          0          0          0           0
Execute      1      0.00       0.00          0         23        211          41
Fetch        0      0.00       0.00          0          0          0           0
------- ------  -------- ---------- ---------- ---------- ----------  ----------
total        2      0.00       0.00          0         23        211          41

ows (avg)  Row Source Operation
---------  ---------------------------------------------------
        0  DELETE  KSUN_TAB (cr=23 pr=0 pw=0 time=1180 us)
       41   CONCATENATION  (cr=23 pr=0 pw=0 time=398 us)
        0    INDEX SKIP SCAN KSUN_TAB#U#2 (cr=6 pr=0 pw=0 time=72 us cost=3 size=14 card=1)(object id 2286272)
       41    INDEX FAST FULL SCAN KSUN_TAB#U#2 (cr=17 pr=0 pw=0 time=242 us cost=5 size=574 card=41)(object id 2286272)
           

--******************************* dtrace *******************************--
  131208:   3788 kcbgtcr(0xFFFFFD7FFFDE5230,0,798,0) [tsn: 0 rdba: 0x5133b8 (1/1127352) obj: 1577349 dobj: 1577349] where: 798 exam: 0
  181969:   3256 kcbgtcr(0xFFFFFD7FFFDE4E40,0,798,0) [tsn: 0 rdba: 0x5133b8 (1/1127352) obj: 1577349 dobj: 1577349] where: 798 exam: 0
    --Repeated lines removed -- 
    
  2537995:   2950 kcbgcur(0xFFFFFD7FFA0FB028,2,1199,0) [tsn: 2231 rdba: 0xe5400085 (917/133) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  2543063:   2038 kcbgcur(0xFFFFFD7FFA0FB028,2,1199,0) [tsn: 2231 rdba: 0xe5400085 (917/133) obj: 2286270 dobj: 2286270] where: 1199 mode_held: 2
  2557738:   2324 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  2562180:   2074 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2232 rdba: 0xe8c00083 (931/131) obj: 2286271 dobj: 2286271] where: 1062 mode_held: 1
  2569550:   3515 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2232 rdba: 0xe8c00087 (931/135) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  2575512:   2290 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2232 rdba: 0xe8c00087 (931/135) obj: 2286271 dobj: 2286271] where: 1066 mode_held: 2
  2584580:   2492 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  2589073:   2002 kcbgcur(0xFFFFFD7FFFDF4660,1,1062,0) [tsn: 2233 rdba: 0x19400083 (101/131) obj: 2286272 dobj: 2286272] where: 1062 mode_held: 1
  2594662:   2312 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2233 rdba: 0x19400086 (101/134) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
  2599288:   1901 kcbget(0xFFFFFD7FFFDF4660,2,1066,0) [tsn: 2233 rdba: 0x19400086 (101/134) obj: 2286272 dobj: 2286272] where: 1066 mode_held: 2
    --Repeated lines removed (40 Repeats)--   

===================== Logical I/O Summary (grouped by object/function) ==============
 function    stat   object_id   data_object_id   mode_held   where     bufs     calls
--------- ------- ----------- ---------------- ----------- ------- -------- ---------
  kcbgcur      cu           0               -1           2      53        2         2
  kcbgcur      cu           0               -1           2      88        2         2
  kcbgtcr      cr     1577349          1577349                 799        2         2
  kcbgtcr      cr     2286272          2286272                 798        2         2
  kcbgcur      cu           0               -1           2      86        4         4
  kcbgtcr      cr     1577349          1577349                 798        4         4
  kcbgtcr      cr     2286272          2286272                 799        4         4
  kcbgtcr      cr     2286272          2286272                 800        4         4
   kcbnew      cu           0               -1                  47        4         4
  kcbgtcr      cr     2286272          2286272                1153       12        12
  kcbgtcr      cr     2286272          2286272                1118       24        24
  kcbgcur      cu     2286270          2286270           2    1199       82        82
  kcbgcur      cu     2286271          2286271           1    1062       82        82
  kcbgcur      cu     2286272          2286272           1    1062       82        82
   kcbget      cu     2286271          2286271           2    1066       82        82
   kcbget      cu     2286272          2286272           2    1066       82        82
=====================================================================================

================================= Logical I/O Summary (grouped by object) ==========================================
 object_id  data_object_id       lio        cr    cr (e)    cr (d)        cu    cu (d) ispnd (Y) ispnd (N)   pin rls
---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------
   1577349         1577349         6         6         0         0         0         0         0         0         6
         0              -1        12         0         0         0        12         0         0         0         4
   2286270         2286270        82         0         0         0        82         0         0         0        82
   2286271         2286271       164         0         0         0       164         0         0         0         0
   2286272         2286272       210        46         0         0       164         0         0         0        46
---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------
     total                       474        52         0         0       422         0         0         0       138
====================================================================================================================


3. Discussions


Note that Dtrace Script statistics are double counted, which will be discussed in next Section: Dtrace Script double counted Statistics.
(a). Looking execution statistics of all 3 Tests, which delete the same 41 rows:

         Test    Consistent Gets  Current Gets
         ------  ---------------  ------------
         Test_1               12           211
         Test_1               12            75
         Test_1               23           211
         
     the majority of Logical Read is generated from Current Gets.

(b). Test_1 makes 211 Current Mode Gets, whereas Test_2 does 75 Current Mode Gets.

     In Test_1, dtrace shows that each Current Get(kcbgcur) of KSUN_TAB (obj: 2286270) 
     follows by two Current Gets (kcbgcur and kcbgcur) of KSUN_TAB#U#1 (obj: 2286271) 
     and two Current Gets (kcbgcur and kcbgcur) of KSUN_TAB#U#2 (obj: 2286272). 
     41 deleted Rows results in:
     
           41 (KSUN_TAB)  + 2*41 (KSUN_TAB#U#1) + 2*41 (KSUN_TAB#U#2) = 205 Current Gets
           
     plus 4 Current Gets (kcbgcur) and 2 Current Gets (kcbnew) of UNDO Segments (dobj: -1). 
     So in total, 205 + 4 + 2 = 211.
     
     In Test_2, at first, 41 Current Gets (kcbgcur) of KSUN_TAB (obj: 2286270) for 41 deleted Rows, 
     then 8 Current Gets (kcbgcur and kcbgcur) of KSUN_TAB#U#2 (obj: 2286272), 
     and 22 Current Gets (kcbgcur and kcbgcur) of KSUN_TAB#U#1 (obj: 2286271):
     
           41 (KSUN_TAB)  + 8 (KSUN_TAB#U#2) + 22 (KSUN_TAB#U#1) = 71 Current Gets
           
     plus 3 Current Gets (kcbgcur) and 1 Current Gets (kcbnew) of UNDO Segments (dobj: -1). 
     So in total, 71 + 3 + 1 = 75.

(c). SQL Trace xplan contains only Consistent Mode Gets ("cr=xxx"), but not Current Mode Gets.
     However, Current Mode Gets is included in xplan statistics under Column current.

(d). Test_3 makes 211 Current Mode Gets, same as Test_1 although only one single Index: 
     KSUN_TAB#U#2 is involved in xplan.
     
     Dtrace shows that it follows the same access pattern as Test_1 to read both KSUN_TAB#U#1 
     and KSUN_TAB#U#2 in Current Mode, but SQL Trace is not able to point out KSUN_TAB#U#1 Gets.

(e). Test_3 SQL Trace published 23 Consistent Mode Gets (query=23), but Dtrace shows 26 (=52/2). 
     The 3 (=26-23) is to read RMTAB$ (object_id = 1577349), which are not disclosed in SQL Trace.
     (See MOS: Bug 20853821 - HIgh number of executions for queries on rmtab$ (Doc ID 20853821.8))
     
     Therefore, SQL Trace is not able to disclose all Logical Reads.
     
     Probably they stem from recursive calls, which can be visible in Sqlplus Autotrace.


4. Dtrace Script double counted Statistics


Comparing dtracelio.d output with SQL Trace:
    Test1: SQL Trace  query = 12 current = 211
           Dtrace     cr    = 24 cu      = 422    
           
    Test2: SQL Trace  query = 12 current =  75
           Dtrace     cr    = 24 cu      = 150             
it seems that Dtrace double counts cr and cu (also verified with Sqlplus Autotrace, v$sql.buffer_gets, and sys.dbms_sqltune.report_sql_monitor). In all tests, we purposely deleted and selected odd number of rows (41), but all number in dtrace Summary are even number.

We can observe this double counting when comparing 3 different Traces: one Sql Trace, one 10200 trace, and one dtrace for the same query.

At first, a small dtrace command shows us that each kcbgtcr is called by ktrget2, so in 10200 Trace, there is almost a 1-to-1 mapping between kcbgtcr and ktrget2.

SQL Trace shows that the query made cr=15 on KSUN_TAB#U#2 (object id 2286272), and 28-15=13 on KSUN_TAB (object id 2286270). dtracelio.d shows that cr=30 on object_id=2286272, cr=26 on object_id= 2286270.

10200 Trace contains two rows for each ktrget2 (which is immediate parent of kcbgtcr) call, first is "started", second is "completed", which probably caused this double counting.

--*************************** kcbgtcr is called by ktrget2 ***************************--
SQL > select/*+ test_2 index(t ksun_tab#U#2) */ * from ksun_tab t where 99 in (id2, id1);

$ dtrace -n 'pid$target::kcbgtcr:entry {ustack(2, 0); @[pid, ustack(2, 0)] = count();}' -p 23585
    oracle`kcbgtcr
    oracle`ktrgtc2+0x332
    
    oracle`kcbgtcr
    oracle`ktrgtc2+0x332
    
    oracle`kcbgtcr
    oracle`ktrget2+0x2b7    
  --53 Repeated lines removed-- 
    
    oracle`kcbgtcr
    oracle`ktrgtc2+0x332
      2
      
    oracle`kcbgtcr
    oracle`ktrget2+0x2b7
     54


--******************************* SQL Trace *******************************--
select /*+ test_2 index(t ksun_tab#U#2) */ * from ksun_tab t where 99 in (id2, id1)

call     count       cpu    elapsed       disk      query    current        rows
------- ------  -------- ---------- ---------- ---------- ----------  ----------
Parse        1      0.00       0.00          0          0          0           0
Execute      1      0.00       0.00          0          0          0           0
Fetch        4      0.00       0.00          0         28          0          41
------- ------  -------- ---------- ---------- ---------- ----------  ----------
total        6      0.00       0.00          0         28          0          41

Rows (avg)  Row Source Operation
----------  ---------------------------------------------------
        41  TABLE ACCESS BY INDEX ROWID BATCHED KSUN_TAB (cr=28 pr=0 pw=0 time=49 us cost=7 size=574 card=41)
        41   INDEX FULL SCAN KSUN_TAB#U#2 (cr=15 pr=0 pw=0 time=636 us cost=6 size=0 card=41)(object id 2286272)


--******************************* 10200 Trace *******************************--
alter session set tracefile_identifier = 'test_2_10200_trace';
alter session set events '10200 trace name context forever,level 10';
  select/*+ test_2 index(t ksun_tab#U#2) */ * from ksun_tab t where 99 in (id2, id1);
alter session set events '10200 trace name context off'; 
 
   ktrget2(): started   for block <0x08b7 : 0xe5400083> objd: 0x0022e2be (KSUN_TAB)
   ktrget2(): completed for block <0x08b7 : 0xe5400083> objd: 0x0022e2be (KSUN_TAB)
       
   ktrget2(): started   for block <0x08b9 : 0x1940008e> objd: 0x0022e2c0 (KSUN_TAB#U#2)
   ktrget2(): completed for block <0x08b9 : 0x1940008e> objd: 0x0022e2c0 (KSUN_TAB#U#2)
 --Repeated lines removed-- 
 
 
--******************************* dtrace *******************************--
SQL > select/*+ test_2 index(t ksun_tab#U#2) */ * from ksun_tab t where 99 in (id2, id1);
   
$ > dtracelio.d 23585

  1365716:  10444 kcbgtcr(0xFFFFFD7FFAEE4798,0,1061,0) [tsn: 2233 rdba: 0x19400084 (101/132) obj: 2286272 dobj: 2286272] where: 1061 exam: 0
  1372023:   2458 kcbgtcr(0xFFFFFD7FFAEE4798,0,1061,0) [tsn: 2233 rdba: 0x19400084 (101/132) obj: 2286272 dobj: 2286272] where: 1061 exam: 0
  1399863:   2673 kcbgtcr(0xFFFFFD7FFA0FA518,0,1007,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1007 exam: 0
  1403855:   2116 kcbgtcr(0xFFFFFD7FFA0FA518,0,1007,0) [tsn: 2231 rdba: 0xe5400083 (917/131) obj: 2286270 dobj: 2286270] where: 1007 exam: 0
  1457916:   5590 kcbgtcr(0xFFFFFD7FFAEE4798,0,1060,0) [tsn: 2233 rdba: 0x19400085 (101/133) obj: 2286272 dobj: 2286272] where: 1060 exam: 0
  1462033:   2347 kcbgtcr(0xFFFFFD7FFAEE4798,0,1060,0) [tsn: 2233 rdba: 0x19400085 (101/133) obj: 2286272 dobj: 2286272] where: 1060 exam: 0
  1493378:   2493 kcbgtcr(0xFFFFFD7FFA0FA518,0,1007,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1007 exam: 0
  1497338:   2096 kcbgtcr(0xFFFFFD7FFA0FA518,0,1007,0) [tsn: 2231 rdba: 0xe5400084 (917/132) obj: 2286270 dobj: 2286270] where: 1007 exam: 0    
   --Repeated lines removed-- 
     
===================== Logical I/O Summary (grouped by object/function) ==============
 function    stat   object_id   data_object_id   mode_held   where     bufs     calls
--------- ------- ----------- ---------------- ----------- ------- -------- ---------
  kcbgtcr      cr     2286272          2286272                1298        2         2
  kcbgtcr      cr     2286272          2286272                1299        2         2
  kcbgtcr      cr     2286272          2286272                1061        6         6
  kcbgtcr      cr     2286272          2286272                1060       20        20
  kcbgtcr      cr     2286270          2286270                1007       26        26
=====================================================================================

================================= Logical I/O Summary (grouped by object) ==========================================
 object_id  data_object_id       lio        cr    cr (e)    cr (d)        cu    cu (d) ispnd (Y) ispnd (N)   pin rls
---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------
         0               0         0         0         0         0         0         0         0         2         0
   2286270         2286270        26        26         0         0         0         0       130        32        26
   2286272         2286272        30        30         2         0         0         0         0         8        28
---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------
     total                        56        56         2         0         0         0       130        42        54
====================================================================================================================


5. Performance Test


Run above 3 Tests 10000 times and watch their statistics difference.

alter system flush shared_pool; 

declare
 p_cnt number := 10000;
begin
  for i in 1..p_cnt loop
    delete /*+ test_1 use_concat index_ss(@DEL$1 T@DEL$1 ksun_tab#u#2) index_ss(@DEL$1_2 T@DEL$1_2 ksun_tab#u#1) */ 
      ksun_tab t where 99 in (id2, id1);
    rollback;
  end loop;
  
  for i in 1..p_cnt loop
    delete /*+ test_2 index(t ksun_tab#U#2)  */ ksun_tab t where 99 in (id2, id1);
    rollback;
  end loop;  
  
  for i in 1..p_cnt loop
    delete /*+ test_3 use_concat index_ss(@DEL$1_1 T@DEL$1 ksun_tab#u#2) index_ffs(@DEL$1_2 T@DEL$1_2 ksun_tab#u#2) */ 
      ksun_tab t where 99 in (id2, id1);
    rollback;
  end loop;  
end;
/

select substr(sql_text, 1, 18) sql_text, buffer_gets, round(buffer_gets/10000) buffer_gets_per_exec
      ,rows_processed, physical_read_bytes, physical_write_bytes, elapsed_time, cpu_time, sql_id  --, v.*
from v$sql v where lower(sql_text) like 'delete%test%' order by v.sql_text;

SQL_TEXT          BUFFER_GETS BUFFER_GETS_PER_EXEC ROWS_PROCESSED PHYSICAL_READ_BYTES PHYSICAL_WRITE_BYTES ELAPSED_TIME CPU_TIME SQL_ID
----------------- ----------- -------------------- -------------- ------------------- -------------------- ------------ -------- -------------
DELETE /*+ test_1     2210823                  221         410000             1146880            210026496      7534494  7400730 g2267fmk9u39r
DELETE /*+ test_2      870403                   87         410000              786432            210026496      7802519  7755552 5xqavr92xbvw7
DELETE /*+ test_3     2320261                  232         410000              401408            105013248      9445604  9320234 66dw2ur1uakhn


6. dtracelio.d


Dtrace script is appended in the Blog for a quick reference. The script is extended with time stamp and time delta similar to truss (-d, -D options) prefixed in each output line.

#!/usr/sbin/dtrace -CZs

/* Add time stamp and time delta similar to truss (-d, -D options) */

/********************************************************************************************/
/*                                                                                          */
/*   File Name:    dtracelio.d                                                              */
/*   Version:      2.1                                                                     */
/*   Authors:      Alexander Anokhin ( http://alexanderanokhin.com )                        */
/*                 Andrey Nikolaev   ( http://andreynikolaev.wordpress.com )                */
/*   Dated:        Mar 2012                                                                 */
/*   Purpose:      The script shows calls of the functions                                  */
/*                 performing logical I/O in Oracle.                                        */
/*                                                                                          */
/*   Usage:        dtracelio.d  [show_each_call] [interval]                            */
/*                 PID: unix process ID                                                     */
/*                 show_each_call - Bitmask determining how functions calls will be shown   */
/*                                  1st bit - logical I/O functions                         */
/*                                  2nd bit - buffer pinning                                */
/*                                  Examples:                                               */
/*                                    0: output of each call will be disabled               */
/*                                    1: logical I/O functions will be shown                */
/*                                    3: logical I/O and buffer pinning functions           */
/*                                       will be shown                                      */
/*                                  Default value: 1                                        */
/*                 interval - Specifies the number of seconds over which                    */
/*                            Summary form with cumulative figures will be shown.           */
/*                            Works only when show_each_call is disabled.                   */
/*                                                                                          */
/*   Other:        Some bits of info in 8103.1                                              */
/*                 Note that data structures definitions are not full                       */
/*                 in current version of the script                                         */
/*                                                                                          */
/********************************************************************************************/

#pragma D option quiet
#pragma D option defaultargs
#pragma D option switchrate=10Hz

/*      0. Several standard oratype.h declarations */

typedef unsigned long long ub8; /* unsigned int of length 8 */
typedef unsigned int       ub4;
typedef unsigned short     ub2;
typedef unsigned char      ub1;
typedef int       sb4;

/*  */
typedef sb4 kobjd;
typedef sb4 kobjn;
typedef ub4 ktsn;
typedef ub4 krdba;

/* definition from MOS note 8103.1 */

typedef struct kdbafr /* full relative dba */ {
    ktsn tsn_kdbafr;  /* a tablespace number */
    krdba dba_kdbafr; /* a relative dba */ } kdbafr;

typedef struct ktid /* relative dba + objd */ {
    struct kdbafr dbr_ktid; /* a relative dba */
    kobjd objd_ktid; /* data object number */
    kobjn objn_ktid; /* dictionary object number */ } ktid;

typedef struct kcbds
{
    struct ktid kcbdstid; /* full relative DBA plus object number */
    /* Here unknown (yet ;-)) part of the structure */ } kcbds;

BEGIN
{
    show_each_call      = ($$2 == NULL) ? 1:$2;
    tick_time_interval  = ($$3 == NULL) ? 0:$3;
    trace_logical_io    = 1;
    trace_buf_pinning   = 1;
    show_lio_calls      = show_each_call & 1;
    show_buf_pins_calls = show_each_call & 2;
    printf("\nDynamic tracing of Oracle logical I/O v2.1 by Alexander Anokhin ( http://alexanderanokhin.wordpress.com )\n\n"); 
    t0 = 0;
}

pid$1::kcbgtcr:entry,
pid$1::kcbgcur:entry,
pid$1::kcbnew:entry,
pid$1::kcbrls:entry,
pid$1::kcbispnd:entry,
pid$1::kcbldrget:entry,
pid$1::kcbget:entry
{
    blk         = ((kcbds *) copyin(arg0, sizeof(kcbds)));
    tsn         = blk->kcbdstid.dbr_ktid.tsn_kdbafr;
    rdba        = blk->kcbdstid.dbr_ktid.dba_kdbafr;
    objd        = blk->kcbdstid.objd_ktid;
    objn        = blk->kcbdstid.objn_ktid;
    rdba_file   = rdba >> 22; /* for smallfile tablespaces */
    rdba_block  = rdba & 0x3FFFFF;
    t0 = (t0 == 0) ? timestamp:t0; t1 = timestamp;
}



pid$1::kcbgtcr:entry
/trace_logical_io/
{
    stat       = "cr";
    where      = arg2&0xFFFF;
    mode_held  = "";

    @func[probefunc,stat,objn,objd,mode_held,where] = count();
    @bufs[probefunc,stat,objn,objd,mode_held,where] = sum(1);

    @obj_cr[objn,objd] = sum(1);
    @total_cr = sum(1);

    @obj_cr_exam[objn,objd] = sum(arg1);
    @total_cr_exam = sum(arg1);

    @obj_lio[objn,objd] = sum(1);
    @total_lio = sum(1);
}

pid$1::kcbgtcr:entry
/trace_logical_io && show_lio_calls/
{   t2 = timestamp; printf("%10d: %6d ", t2-t0, t2-t1); t1 = t2;    /* Elapsed nanosecond from beginning: Elapsed delta from last printf */
    printf("%s(0x%X,%d,%d,%d) [tsn: %d rdba: 0x%x (%d/%d) obj: %d dobj: %d] where: %d exam: %d\n",probefunc,arg0,arg1,arg2,arg3,tsn, rdba,rdba_file,rdba_block,objn, objd, where, arg1); }


pid$1::kcbldrget:entry
/trace_logical_io/
{
    stat       = "cr (d)";
    where      = 0;
    mode_held  = "";

    @func[probefunc,stat,objn,objd,mode_held,where] = count();
    @bufs[probefunc,stat,objn,objd,mode_held,where] = sum(1);

    @obj_cr_d[objn,objd] = sum(1);
    @total_cr_d = sum(1);

    @obj_cr[objn,objd] = sum(-1);
    @total_cr = sum(-1);

  /* commented because incremented in kcbgtcr yet
    @obj_lio[objn,objd] = sum(1);
    @total_lio = sum(1);
  */
}

pid$1::kcbldrget:entry
/trace_logical_io && show_lio_calls/
{   t2 = timestamp; printf("%10d: %6d ", t2-t0, t2-t1); t1 = t2;
    printf("%s(0x%X,%d,%d,%d)\n",probefunc,arg0,arg1,arg2,arg3);
}


pid$1::kcbgcur:entry,
pid$1::kcbget:entry
/trace_logical_io/
{
    stat       = "cu";
    where      = arg2&0xFFFF;
    mode_held  = lltostr(arg1);

    @func[probefunc,stat,objn,objd,mode_held,where] = count();
    @bufs[probefunc,stat,objn,objd,mode_held,where] = sum(1);

    @obj_cu[objn,objd] = sum(1);
    @total_cu  = sum(1);

    @obj_lio[objn,objd] = sum(1);
    @total_lio = sum(1);
}

pid$1::kcbgcur:entry,
pid$1::kcbget:entry
/trace_logical_io && show_lio_calls/
{   t2 = timestamp; printf("%10d: %6d ", t2-t0, t2-t1); t1 = t2;
    printf("%s(0x%X,%d,%d,%d) [tsn: %d rdba: 0x%x (%d/%d) obj: %d dobj: %d] where: %d mode_held: %s\n",probefunc,arg0,arg1,arg2,arg3,tsn, rdba,rdba_file,rdba_block,objn, objd, where, mode_held); }

pid$1::kcbnew:entry
/trace_logical_io/
{
    stat       = "cu";
    where      = arg2&0xFFFF;
    mode_held  = "";
    blocks     = arg1;

    @func[probefunc,stat,objn,objd,mode_held,where] = count();
    @bufs[probefunc,stat,objn,objd,mode_held,where] = sum(blocks);

    @obj_cu[objn,objd] = sum(blocks);
    @total_cu = sum(blocks);

    @obj_lio[objn,objd] = sum(blocks);
    @total_lio = sum(blocks);
}

pid$1::kcbnew:entry
/trace_logical_io && show_lio_calls/
{   t2 = timestamp; printf("%10d: %6d ", t2-t0, t2-t1); t1 = t2;
    printf("%s(0x%X,%d,%d,%d) [tsn: %d rdba: 0x%x (%d/%d) obj: %d dobj: %d] where: %d mode_held: %s blocks: %d\n",probefunc,arg0,arg1,arg2,arg3,tsn, rdba,rdba_file,rdba_block,objn, objd, where, mode_held, blocks); }

pid$1::kcblnb:entry, pid$1::kcblnb_dscn:entry /trace_logical_io/ {
    stat = "cu (d)";
    @func[probefunc,stat,0,0,"",0] = count();
    @bufs[probefunc,stat,0,0,"",0] = sum(1);

    @obj_cu[objn,objd] = sum(1);
    @total_cu = sum(1);

    @obj_cu_d[objn,objd] = sum(1);
    @total_cu_d = sum(1);

    @obj_lio[objn,objd] = sum(1);
    @total_lio = sum(1);
}



pid$1::kcblnb:entry, pid$1::kcblnb_dscn:entry /trace_logical_io && show_lio_calls/ {
    t2 = timestamp; printf("%10d: %6d ", t2-t0, t2-t1); t1 = t2;
    printf("%s(0x%X,%d,%d,%d)\n",probefunc,arg0,arg1,arg2,arg3);
}



/* buffer pinning */

pid$1::kcbispnd:entry
/trace_buf_pinning && show_buf_pins_calls/ {
    t2 = timestamp; printf("%10d: %6d ", t2-t0, t2-t1); t1 = t2;
    printf("%s(0x%X,%d,%d,%d) [tsn: %d rdba: 0x%x (%d/%d) obj: %d dobj: %d]: ",probefunc,arg0,arg1,arg2,arg3,tsn, rdba,rdba_file,rdba_block,objn,objd);
}

pid$1::kcbispnd:return
/trace_buf_pinning && arg1 == 0/
{
    @obj_is_not_pnd[objn,objd] = sum(1);
    @total_is_not_pnd = sum(1);
}

pid$1::kcbispnd:return
/trace_buf_pinning && arg1 == 1/
{
    @obj_is_pnd[objn,objd] = sum(1);;
    @total_is_pnd = sum(1);
}

pid$1::kcbispnd:return
/trace_buf_pinning && show_buf_pins_calls/ {
    t2 = timestamp; printf("%10d: %6d ", t2-t0, t2-t1); t1 = t2;
    printf("%d\n",arg1);
}

pid$1::kcbrls:entry
/trace_buf_pinning/
{
    @obj_pinrls[objn,objd] = sum(1);
    @total_pinrls = sum(1);
}

pid$1::kcbrls:entry
/trace_buf_pinning && show_buf_pins_calls/ {
    t2 = timestamp; printf("%10d: %6d ", t2-t0, t2-t1); t1 = t2;
    printf("%s(0x%X,%d,%d,%d) [tsn: %d rdba: 0x%x (%d/%d) obj: %d dobj: %d]\n",probefunc,arg0,arg1,arg2,arg3,tsn, rdba,rdba_file,rdba_block,objn,objd);
}


tick-1sec
/show_each_call == 0 && tick_time_interval > 0 && ++ticks >= tick_time_interval/ {
    printf("\n\n");
    printf("\n================================= Logical I/O Summary (grouped by object) ==========================================\n");
    printf(" object_id  data_object_id       lio        cr    cr (e)    cr (d)        cu    cu (d) ispnd (Y) ispnd (N)   pin rls\n");
    printf("---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------\n");
    printa("%10d %15d %9@d %9@d %9@d %9@d %9@d %9@d %9@d %9@d %9@d\n", 
            @obj_lio, @obj_cr, @obj_cr_exam, @obj_cr_d, @obj_cu, @obj_cu_d, @obj_is_pnd, @obj_is_not_pnd, @obj_pinrls);
    printf("---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------\n");
    printa("     total %25@d %9@d %9@d %9@d %9@d %9@d %9@d %9@d %9@d\n", @total_lio, @total_cr, @total_cr_exam, @total_cr_d, @total_cu, @total_cu_d, @total_is_pnd, @total_is_not_pnd, @total_pinrls);
    printf("====================================================================================================================\n");
    ticks = 0;
}



END
{
    printf("\n===================== Logical I/O Summary (grouped by object/function) ==============\n");
    printf(" function    stat   object_id   data_object_id   mode_held   where     bufs     calls\n");
    printf("--------- ------- ----------- ---------------- ----------- ------- -------- ---------\n");
    printa("%9s %7s %11d %16d %11s %7d %@8d %@9d\n", @bufs, @func);
    printf("=====================================================================================\n");
}

END
{
    printf("\n================================= Logical I/O Summary (grouped by object) ==========================================\n");
    printf(" object_id  data_object_id       lio        cr    cr (e)    cr (d)        cu    cu (d) ispnd (Y) ispnd (N)   pin rls\n");
    printf("---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------\n");
    printa("%10d %15d %9@d %9@d %9@d %9@d %9@d %9@d %9@d %9@d %9@d\n", 
            @obj_lio, @obj_cr, @obj_cr_exam, @obj_cr_d, @obj_cu, @obj_cu_d, @obj_is_pnd, @obj_is_not_pnd, @obj_pinrls);
    printf("---------- --------------- --------- --------- --------- --------- --------- --------- --------- --------- ---------\n");
    printa("     total %25@d %9@d %9@d %9@d %9@d %9@d %9@d %9@d %9@d\n", @total_lio, @total_cr, @total_cr_exam, @total_cr_d, @total_cu, @total_cu_d, @total_is_pnd, @total_is_not_pnd, @total_pinrls);
    printf("====================================================================================================================\n");
    printf("\n");
    printf("Legend\n");
    printf("lio      : logical gets (cr + cu)\n");
    printf("cr       : consistent gets\n");
    printf("cr (e)   : consistent gets - examination\n");
    printf("cr (d)   : consistent gets direct\n");
    printf("cu       : db block gets\n");
    printf("cu (d)   : db block gets direct\n");
    printf("ispnd (Y): buffer is pinned count\n");
    printf("ispnd (N): buffer is not pinned count\n");
    printf("pin rls  : pin releases\n");
}

Oracle Logical Read: Consistent Gets Access Path and Cost

Oracle Logical Reads (Buffer Gets) fetch data from Buffer Cache (Memory) in two different Modes: Consistent Mode Get (Consistent Gets) and Current Mode Get (DB Block Gets). Blocks in Consistent Mode are the memory versions at the POINT IN TIME the query started, whereas Blocks in Current Mode are the versions at CURRENT TIME (right now). Each block can have multiversion Clones in Consistent Mode, but maximum one single version in Current Mode.

This Blog will discuss Consistent Gets, and next Blog will talk about DB Block Gets.

At first, we test Consistent Gets in 4 different Access Paths and measure the Cost in terms of Event 10200 ("consistent read buffer status"), then demonstrate 'latch: cache buffers chains' in row-by-row slow processing.

Note: All tests are done in Oracle 12.1.0.2 on AIX, Solaris, Linux with 6 physical processors.


1. Test Setup


Pick the same Test Code from Blog: Hot Block Identification and latch: cache buffers chains.

drop table test_tab;
              
create table test_tab 
  INITRANS   26           -- prevent Segments ITL Waits and Deadlocks
as select level id, rpad('ABC', 10, 'X') val from dual connect by level <= 5;

alter table test_tab minimize records_per_block;

truncate table test_tab;

insert into test_tab select level id, rpad('ABC', 10, 'X') val from dual connect by level <= 100;

commit;

drop index test_tab#u1;

create unique index test_tab#u1 on k.test_tab (id) 
  pctfree    90
  initrans   26        -- prevent Segments ITL Waits and Deadlocks
;

exec dbms_stats.gather_table_stats(null, 'TEST_TAB', cascade => TRUE);

select block, to_char(block, 'xxxxxxxx') block_hex, count (*) cnt 
  from (select rowid rid, 
               dbms_rowid.rowid_object (rowid) object, 
               dbms_rowid.rowid_relative_fno (rowid) fil, 
               dbms_rowid.rowid_block_number (rowid) block, 
               dbms_rowid.rowid_row_number (rowid) ro, 
               t.* 
          from test_tab t 
         order by fil, block, ro) 
 group by block 
 order by block, cnt desc;

     BLOCK BLOCK_HEX        CNT
  -------- --------- ----------
   3070331    2ed97b          5
   3070332    2ed97c          5
   3070333    2ed97d          5
   3070334    2ed97e          5
   3070335    2ed97f          5
   3072640    2ee280          5
   3072641    2ee281          5
   3072642    2ee282          5
   3072643    2ee283          5
   3072644    2ee284          5
   3072645    2ee285          5
   3072646    2ee286          5
   3072647    2ee287          5
   3072649    2ee289          5
   3072650    2ee28a          5
   3072651    2ee28b          5
   3072652    2ee28c          5
   3072653    2ee28d          5
   3072654    2ee28e          5
   3072655    2ee28f          5
 20 rows selected. 
  
select object_name, object_id from dba_objects where object_name = 'TEST_TAB#U1';  

  OBJECT_NAME  OBJECT_ID
  ----------- ----------
  TEST_TAB#U1    2260907

select block#, to_char(block#, 'xxxxxxxx') block#_hex
      ,count(*) rows_per_block
from (
  select dbms_rowid.rowid_block_number(sys_op_lbid (2260907, 'L', t.rowid))  block#
        ,t.*
    from test_tab t
   where id is not null
)
group by block#
order by block#; 

    BLOCK# BLOCK#_HE ROWS_PER_BLOCK
  -------- --------- --------------
   3072660    2ee294              5
   3072661    2ee295              5
   3072662    2ee296              5
   3072663    2ee297              5
   3072664    2ee298              5
   3072665    2ee299              5
   3072666    2ee29a              5
   3072667    2ee29b              5
   3072668    2ee29c              5
   3072669    2ee29d              5
   3072670    2ee29e              5
   3072671    2ee29f              5
   3072673    2ee2a1              5
   3072674    2ee2a2              5
   3072675    2ee2a3              5
   3072676    2ee2a4              5
   3072677    2ee2a5              5
   3072678    2ee2a6              5
   3072679    2ee2a7              5
   3072680    2ee2a8              5
 20 rows selected 


2. Buffer Read Access Path Tests


Trace Event 10200 shows all consistent read Blocks and their sequence. We will demonstrate the number of consistent read Blocks with different access path.

At first, display Table and Index information:

column object_name format a12
column object_id_hex format a20
column data_object_id_hex format a20

select object_name, object_id, data_object_id, 
       to_char(object_id, 'xxxxxxxx') object_id_hex, to_char(data_object_id, 'xxxxxxxx') data_object_id_hex
 from dba_objects where object_name in ('TEST_TAB', 'TEST_TAB#U1');


 OBJECT_NAME  OBJECT_ID  DATA_OBJECT_ID  OBJECT_ID_HEX  DATA_OBJECT_ID_HEX
 ----------- ---------- --------------- -------------- -------------------
 TEST_TAB       2260905         2260906         227fa9              227faa
 TEST_TAB#U1    2260907         2260907         227fab              227fab


2.1 USER ROWID without using Index


First collect ROWID for test, then make the test:

select /*+ user_rowid_no_index */ t.*, rowid rid from test_tab t 
where id in (1, 2, 3, 4, 5, 6, 7) order by t.id;

  ID VAL        RID
 --- ---------- ------------------
   1 ABCXXXXXXX AAIn+qAAAAALtl7AAA
   2 ABCXXXXXXX AAIn+qAAAAALtl7AAB
   3 ABCXXXXXXX AAIn+qAAAAALtl7AAC
   4 ABCXXXXXXX AAIn+qAAAAALtl7AAD
   5 ABCXXXXXXX AAIn+qAAAAALtl7AAE
   6 ABCXXXXXXX AAIn+qAAAAALtl8AAA
   7 ABCXXXXXXX AAIn+qAAAAALtl8AAB

alter session set events '10200 trace name context forever,level 10';

select /*+ user_rowid_no_index */ * from test_tab t 
where id in (1, 2, 3, 4, 5, 6, 7)
  and rowid in 
      ('AAIn+qAAAAALtl7AAA'
      ,'AAIn+qAAAAALtl7AAB'
      ,'AAIn+qAAAAALtl7AAC'
      ,'AAIn+qAAAAALtl7AAD'
      ,'AAIn+qAAAAALtl7AAE'
      ,'AAIn+qAAAAALtl8AAA'
      ,'AAIn+qAAAAALtl8AAB');

alter session set events '10200 trace name context off';

Rows (1st) Rows (avg) Rows (max)  Row Source Operation (0 Branch Block, 0 Leaf Blocks, 3 Table Blocks)
---------- ---------- ----------  ---------------------------------------------------
         7          7          7  INLIST ITERATOR  (cr=3 pr=0 pw=0 time=130 us)
         7          7          7   TABLE ACCESS BY USER ROWID TEST_TAB (cr=3 pr=0 pw=0 time=261 us cost=1 size=14 card=1)

ktrget2(): started for block  
 <0x07cf : 0x002ed97b> objd: 0x00227faa       <-- Table  Block 1
 <0x07cf : 0x002ed97b> objd: 0x00227faa       <-- Table  Block 1
 <0x07cf : 0x002ed97c> objd: 0x00227faa       <-- Table  Block 2


2.2 INDEX RANGE SCAN with INDEX ROWID BATCHED



alter session set events '10200 trace name context forever,level 10';

select /*+ index_range_scan */ * from test_tab t where id between 1 and 7;

alter session set events '10200 trace name context off';

Row Source Operation (1 Branch Block, 3 Leaf Blocks, 3 Table Blocks)
---------------------------------------------------
TABLE ACCESS BY INDEX ROWID BATCHED TEST_TAB (cr=7 pr=0 pw=0 time=210 us cost=3 size=98 card=7)
 INDEX RANGE SCAN TEST_TAB#U1 (cr=4 pr=0 pw=0 time=552 us cost=2 size=0 card=7)(object id 2260907)

ktrgtc2(): started for block
  <0x07cf : 0x002ee293> objd: 0x00227fab   <-- Branch Block 
  <0x07cf : 0x002ee294> objd: 0x00227fab     <-- Leaf  Block 1
  <0x07cf : 0x002ed97b> objd: 0x00227faa       <-- Table  Block 1
  <0x07cf : 0x002ee294> objd: 0x00227fab     <-- Leaf  Block 1
  <0x07cf : 0x002ed97b> objd: 0x00227faa       <-- Table  Block 1
  <0x07cf : 0x002ee295> objd: 0x00227fab     <-- Leaf  Block 2
  <0x07cf : 0x002ed97c> objd: 0x00227faa       <-- Table  Block 2


2.3 INDEX UNIQUE SCAN



select /*+ index_uniqe_scan */ * from test_tab t where id in (1, 2, 3, 4, 5, 6, 7);

Row Source Operation (2 Branch Block, 8 Leaf Blocks, 3 Table Blocks)
---------------------------------------------------
INLIST ITERATOR  (cr=13 pr=0 pw=0 time=218 us)
 TABLE ACCESS BY INDEX ROWID TEST_TAB (cr=13 pr=0 pw=0 time=812 us cost=3 size=98 card=7)
  INDEX UNIQUE SCAN TEST_TAB#U1 (cr=10 pr=0 pw=0 time=585 us cost=2 size=0 card=7)(object id 2260907)
  
ktrgtc2(): started for block  
  <0x07cf : 0x002ee293> objd: 0x00227fab   <-- Branch Block
  <0x07cf : 0x002ee294> objd: 0x00227fab     <-- Leaf  Block 1
  <0x07cf : 0x002ed97b> objd: 0x00227faa       <-- Table  Block 1
  <0x07cf : 0x002ee293> objd: 0x00227fab   <-- Branch Block
  <0x07cf : 0x002ee294> objd: 0x00227fab     <-- Leaf  Block 1
  <0x07cf : 0x002ed97b> objd: 0x00227faa       <-- Table  Block 1
  <0x07cf : 0x002ee294> objd: 0x00227fab     <-- Leaf  Block 1
  <0x07cf : 0x002ee294> objd: 0x00227fab     <-- Leaf  Block 1
  <0x07cf : 0x002ee294> objd: 0x00227fab     <-- Leaf  Block 1
  <0x07cf : 0x002ee294> objd: 0x00227fab     <-- Leaf  Block 1
  <0x07cf : 0x002ee295> objd: 0x00227fab     <-- Leaf  Block 2
  <0x07cf : 0x002ed97c> objd: 0x00227faa       <-- Table  Block 2
  <0x07cf : 0x002ee295> objd: 0x00227fab     <-- Leaf  Block 2


2.4 INDEX ROWID GET


We can construct a pipelined functions to force INDEX ROWID GETs in a given frequency.

create type number_tab as table of number;
/

create or replace function id_pipelined(p_cnt number, p_sleep_seconds number := 0) 
return number_tab pipelined as
begin
  for i in 1..p_cnt loop
    if p_sleep_seconds > 0 then
      dbms_lock.sleep(p_sleep_seconds);  -- purposed delay, specially before producing first row
    end if;
    pipe row(i);
  end loop;
  return;
  exception
    when no_data_needed then
      raise;
    when others then
      dbms_output.put_line('others Handler');
      raise;
end;
/

select /*+ leading(c) cardinality(c 10) indx(t test_tab#u1) index_rowid_pipelined */ * 
  from table(cast (id_pipelined(7, 1.3) as number_tab)) c, test_tab t
 where c.column_value = t.id;
  
Row Source Operation (2 Branch Block, 8 Leaf Blocks, 7 Table Blocks)
---------------------------------------------------
NESTED LOOPS  (cr=17 pr=0 pw=0 time=253 us cost=35 size=160 card=10)
 NESTED LOOPS  (cr=10 pr=0 pw=0 time=7802413 us cost=35 size=160 card=10)
  COLLECTION ITERATOR PICKLER FETCH ID_PIPELINED (cr=0 pr=0 pw=0 time=7801325 us cost=30 size=20 card=10)
  INDEX UNIQUE SCAN TEST_TAB#U1 (cr=10 pr=0 pw=0 time=823 us cost=1 size=0 card=1)(object id 2260907)
 TABLE ACCESS BY INDEX ROWID TEST_TAB (cr=7 pr=0 pw=0 time=395 us cost=1 size=14 card=1)
  
ktrgtc2(): started for block 
 <0x07cf : 0x002ee293> objd: 0x00227fab  <-- Branch Block 
 <0x07cf : 0x002ee294> objd: 0x00227fab    <-- Leaf  Block 1
 <0x07cf : 0x002ed97b> objd: 0x00227faa      <-- Table  Block 1
 <0x07cf : 0x002ee293> objd: 0x00227fab  <-- Branch Block 
 <0x07cf : 0x002ee294> objd: 0x00227fab    <-- Leaf  Block 1
 <0x07cf : 0x002ed97b> objd: 0x00227faa      <-- Table  Block 1
 <0x07cf : 0x002ee294> objd: 0x00227fab    <-- Leaf  Block 1
 <0x07cf : 0x002ed97b> objd: 0x00227faa      <-- Table  Block 1
 <0x07cf : 0x002ee294> objd: 0x00227fab    <-- Leaf  Block 1
 <0x07cf : 0x002ed97b> objd: 0x00227faa      <-- Table  Block 1
 <0x07cf : 0x002ee294> objd: 0x00227fab    <-- Leaf  Block 1
 <0x07cf : 0x002ed97b> objd: 0x00227faa      <-- Table  Block 1
 <0x07cf : 0x002ee294> objd: 0x00227fab    <-- Leaf  Block 1
 <0x07cf : 0x002ee295> objd: 0x00227fab    <-- Leaf  Block 2
 <0x07cf : 0x002ed97c> objd: 0x00227faa      <-- Table  Block 2
 <0x07cf : 0x002ee295> objd: 0x00227fab    <-- Leaf  Block 2
 <0x07cf : 0x002ed97c> objd: 0x00227faa      <-- Table  Block 2


3. Discussions


3.1 Access Path


Above 4 tests showed:
  USER ROWID:         3 Blocks (0 Branch Block, 0 Leaf Blocks, 3 Table Blocks)
  INDEX RANGE SCAN:   7 Blocks (1 Branch Block, 3 Leaf Blocks, 3 Table Blocks)
  INDEX UNIQUE SCAN: 13 Blocks (2 Branch Block, 8 Leaf Blocks, 3 Table Blocks)
  INDEX ROWID GET:   17 Blocks (2 Branch Block, 8 Leaf Blocks, 7 Table Blocks)
The fourth test is the most expensive one because pipelined function is a row-by-row processing. The consumer starts processing only after producer produces it (the principle of "pipelined"). In the above example, consumer waits for "p_sleep_seconds" when receiving first row from producer. In the real applications, during "p_sleep_seconds", any modifications can occur, and in order to maintain Read Consistency from query start SCN, Oracle should use UNDO to clone Buffer, hence, 'latch: cache buffers chains' (see Demo in next Section).

In the aspect of Disk Read, row-by-row access is fulfilled by "db file sequential read" instead of more efficient multiblock (batch) "db file scattered read" or "db file parallel read" (to be discussed in next Blog: Oracle DB File Read Access Path and Cost).

Oracle 12c Database Data Cartridge Developer's Guide Section: 13 Using Pipelined and Parallel Table Functions wrote:

Rows from a collection returned by a table function can also be pipelined; this means that they are iteratively returned as they are produced, instead of being returned in a single batch after all processing of the table function's input is completed.

Pipelining enables a table function to return rows faster and can reduce the memory required to cache a table function's results.


It looks like that Pipelined function is a reflection of consuming row-by-row produced data as a memory (PGA) optimization of collection processing if intermediate result can be aggregated and consumed rows can be forgotten.

By the way, Pipelined table functions can be implemented in two ways: native PL/SQL and interface. In the above example, we use the native PL/SQL approach.

Oracle is a RDBMS based on relational model, where "relation" means "tabular", in mathematics, a set of objects (rows).

Blog Ask TOM: Slow processing on database using merge declared:
    SQL is about sets, the best way to make SQL perform is to think SETS, not procedurally

The first test with "USER ROWID" is the cheapest fetching approach. In real application, it can be implemented by first caching ROWID of previously fetched ROWs, and subsequently, make the direct table fetch, and therefore bypass INDEX access. (See: Blog: Hot Block Identification and latch: cache buffers chains Section: "5. Index Hot Block Application Fix")


3.2 'latch: cache buffers chains' on Index Blocks


Index is made of a part of table columns. Usually it is more condensed than table, i.e, each Branch/Leaf Block can contain more rows than table Block. In other words, the same index Block is more frequently touched than table Block, hence more prone to 'latch: cache buffers chains' when multi sessions concurrently access it.


4. 'latch: cache buffers chains' Demo



create or replace procedure index_rowid_pipelined_get(p_cnt number, p_sleep_seconds number := 0) as
begin
  for c in (
    select /*+ leading(c) cardinality(c 10) indx(t test_tab#u1) index_rowid_pipelined */ * 
    from table(cast (id_pipelined(p_cnt, p_sleep_seconds) as number_tab)) c, test_tab t
    where c.column_value = t.id)
  loop
    null;
  end loop;
end;
/

create or replace procedure index_rowid_pipelined_get_jobs(p_job_cnt number, p_cnt number, p_sleep_seconds number := 0) as
  l_job_id pls_integer;
begin
  for i in 1.. p_job_cnt loop
    dbms_job.submit(l_job_id, 'begin while true loop index_rowid_pipelined_get('||p_cnt||', '||p_sleep_seconds||'); end loop; end;');
  end loop;
  commit;
end;    
/
Launch 6 Jobs on a UNIX with 6 physical Processors:

exec index_rowid_pipelined_get_jobs(6, 90, 0);
All Blocks in V$BH have status 'xcur'.

When modifying Table and no commit/rollback:

update test_tab set id = -id where abs(id) between 3 and 12;   -- Without commit/rollback
We can immediately observe 'latch: cache buffers chains' or 'buffer busy waits':

select * from v$session_wait where event in ('latch: cache buffers chains') order by event, sid;

select * from  v$session_wait_history where event in ('latch: cache buffers chains') order by event, sid;
In case of 'latch: cache buffers chains', one guess is that when
  V$SESSION_WAIT.state = 'WAITED SHORT TIME' (less than one centisecond), V$LATCH.Spin_Gets increased
  V$SESSION_WAIT.state = 'WAITED KNOWN TIME' (more than one centisecond), V$LATCH.Sleeps increased
  no entries in V$SESSION_WAIT, but V$BH.status='cr' Block Read,          V$LATCH.immediate_gets increased
but it is not clear how to confirm it.

By querying v$bh, we can see Consistent Read Copy of Blocks (cloned) marked as 'cr':

select d.object_name, data_object_id, b.status, b.* 
  from v$bh b, dba_objects d
 where b.objd = d.data_object_id
   and object_name in ('TEST_TAB', 'TEST_TAB#U1')
   --and b.status not in ('free')
   and b.status in ('xcur', 'cr')
order by b.status, object_name, block#;
In this case, Event 10201 ("consistent read undo application") can be used to trace CR Reads constructed by applying UNDO Blocks.

To stop all Jobs, execute following "clean_jobs".


-------------- clean_jobs --------------

create or replace procedure clean_jobs as
begin
  for c in (select job from dba_jobs) loop
    begin
       dbms_job.remove (c.job);
    exception when others then null;
    end;
    commit;
  end loop;

  for c in (select d.job, d.sid, (select serial# from v$session where sid = d.sid) ser 
              from dba_jobs_running d) loop
    begin
      execute immediate
             'alter system kill session '''|| c.sid|| ',' || c.ser|| ''' immediate';
      dbms_job.remove (c.job);
    exception when others then null;
    end;
    commit;
  end loop;
  
  -- select * from dba_jobs;
  -- select * from dba_jobs_running;
end;
/

-- exec clean_jobs; -- stop all jobs

Oracle Physical Read Access Path and Cost

In this Blog, we will look into Physical Read (Disk Read, DB File Read) in different Access Paths and their Cost.

We will first test 3 varieties of db file read:
  db file sequential read
  db file scattered read 
  db file parallel read  
and then compare the cost (statistics) reported in 3 places for different access path:
  SQL Trace
  Dtrace
  Oracle View: V$FILESTAT and V$IOSTAT_FILE
Note: All tests are done in Oracle 12.1.0.2 on Solaris.


1. DB File Read Access Path


Run 4 variants of Access Path tests with SQl Trace (Event 10046), and at the same time dtace the process (see appended Dtrace Script).


1.1. single Read



SQL > exec db_file_read_test('single', 1, 333);

-- adjacent rowid, single block read, 'db file sequential read'
SELECT /*+ single_read */ Y FROM KSUN_TAB T WHERE ROWID = :B1 

call     count       cpu    elapsed       disk      query    current        rows
------- ------  -------- ---------- ---------- ---------- ----------  ----------
Parse        1      0.00       0.00          0          0          0           0
Execute    333      0.01       0.01          0          0          0           0
Fetch      333      0.01       0.01        641        333          0         333
------- ------  -------- ---------- ---------- ---------- ----------  ----------
total      667      0.02       0.02        641        333          0         333

Row Source Operation
---------------------------------------------------
TABLE ACCESS BY USER ROWID KSUN_TAB (cr=1 pr=8 pw=0 time=160 us cost=1 size=3513 card=1)

  Event waited on                             Times   Max. Wait  Total Waited
  ----------------------------------------   Waited  ----------  ------------
  db file scattered read                         44        0.00          0.00
  db file sequential read                       289        0.00          0.00
 
------------------------------ dtrace ------------------------------
  PROBEFUNC      FD  RETURN_SIZE     COUNT
  lseek         260            0        44
  readv         260        65536        44
  pread         260         8192       289

  PROBEFUNC      FD  MAX_READ_Blocks
  pread         260                1
  readv         260                8

TOTAL_SIZE = 5251072   , TOTAL_READ_Blocks = 641   , TOTAL_READ_CNT = 333

readv         260
           value  ------------- Distribution ------------- count
            8192 |                                         0
           16384 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  43
           32768 |@                                        1
           65536 |                                         0


pread         260
           value  ------------- Distribution ------------- count
            2048 |                                         0
            4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@          226
            8192 |@@@@@@@@                                 60
           16384 |                                         2
           32768 |                                         0
           65536 |                                         1
          131072 |                                         0


1.2. scattered Read



SQL > exec db_file_read_test('scattered', 1, 333);

-- jumped rowid, scattered read, 'db file scattered read'
SELECT /*+ scattered_read */ Y FROM KSUN_TAB T WHERE ROWID = :B1 

call     count       cpu    elapsed       disk      query    current        rows
------- ------  -------- ---------- ---------- ---------- ----------  ----------
Parse        1      0.00       0.00          0          0          0           0
Execute    333      0.00       0.00          0          0          0           0
Fetch      333      0.02       0.02       2664        333          0         333
------- ------  -------- ---------- ---------- ---------- ----------  ----------
total      667      0.02       0.02       2664        333          0         333


Row Source Operation
---------------------------------------------------
TABLE ACCESS BY USER ROWID KSUN_TAB (cr=1 pr=8 pw=0 time=156 us cost=1 size=3513 card=1)

Elapsed times include waiting on following events:
  Event waited on                             Times   Max. Wait  Total Waited
  ----------------------------------------   Waited  ----------  ------------
  db file scattered read                        333        0.00          0.00
  
------------------------------ dtrace ------------------------------
  PROBEFUNC      FD  RETURN_SIZE     COUNT
  lseek         260            0        91
  readv         260        65536       333

  PROBEFUNC      FD  MAX_READ_Blocks
  readv         260                8

TOTAL_SIZE = 21823488  , TOTAL_READ_Blocks = 2664  , TOTAL_READ_CNT = 333

readv         260
           value  ------------- Distribution ------------- count
            8192 |                                         0
           16384 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@      290
           32768 |@@@@@                                    43
           65536 |                                         0


1.3. parallel Read



SQL > exec db_file_read_test('parallel', 1, 333);

SELECT /*+ index(t ksun_tab#i1) parallel_read */ MAX(Y) FROM KSUN_TAB T WHERE X BETWEEN 1 AND :B1 

call     count       cpu    elapsed       disk      query    current        rows
------- ------  -------- ---------- ---------- ---------- ----------  ----------
Parse        1      0.00       0.00          0          0          0           0
Execute      1      0.00       0.00          0          0          0           0
Fetch        1      0.00       0.00        344        335          0           1
------- ------  -------- ---------- ---------- ---------- ----------  ----------
total        3      0.00       0.00        344        335          0           1

Row Source Operation
---------------------------------------------------
SORT AGGREGATE (cr=335 pr=344 pw=0 time=3760 us)
 FILTER  (cr=335 pr=344 pw=0 time=1698 us)
  TABLE ACCESS BY INDEX ROWID BATCHED KSUN_TAB (cr=335 pr=344 pw=0 time=1361 us cost=168 size=1167165 card=333)
   INDEX RANGE SCAN KSUN_TAB#I1 (cr=2 pr=8 pw=0 time=279 us cost=1 size=0 card=333)(object id 2260477)

  Event waited on                             Times   Max. Wait  Total Waited
  ----------------------------------------   Waited  ----------  ------------
  db file scattered read                          4        0.00          0.00
  db file parallel read                           2        0.00          0.00
  
 -- Raw Trace File --
 'db file scattered read' ela= 49  file#=917 block#=10368 blocks=8     obj#=2260477 (Index KSUN_TAB#I1)
 'db file scattered read' ela= 27  file#=917 block#=128   blocks=8     obj#=2260476 (Table KSUN_TAB) 
 'db file scattered read' ela= 21  file#=917 block#=136   blocks=8     obj#=2260476
 'db file parallel read'  ela= 422 files=1   blocks=127   requests=127 obj#=2260476
 'db file parallel read'  ela= 334 files=1   blocks=127   requests=127 obj#=2260476
 'db file scattered read' ela= 264 file#=917 block#=409   blocks=66    obj#=2260476
  
------------------------------ dtrace ------------------------------
   PROBEFUNC      FD  RETURN_SIZE     COUNT
   pread         260       540672         1
   lseek         260            0         2
   readv         260        65536         3
   pread         260         8192       254
 
   PROBEFUNC      FD  MAX_READ_Blocks
   readv         260                8
   pread         260               66
 
 TOTAL_SIZE = 2818048   , TOTAL_READ_Blocks = 344   , TOTAL_READ_CNT = 258
 
   readv         260
            value  ------------- Distribution ------------- count
             8192 |                                         0
            16384 |@@@@@@@@@@@@@@@@@@@@@@@@@@@              2
            32768 |@@@@@@@@@@@@@                            1
            65536 |                                         0
 
   pread         260
            value  ------------- Distribution ------------- count
             2048 |                                         0
             4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@          196
             8192 |@@@@@@@@                                 52
            16384 |                                         2
            32768 |                                         0
            65536 |@                                        4
           131072 |                                         1
           262144 |                                         0


1.4. full Read



SQL > exec db_file_read_test('full', 1, 333);

SELECT /*+ full_read */ MAX(Y) FROM KSUN_TAB T WHERE ROWNUM <= :B1 

call     count       cpu    elapsed       disk      query    current        rows
------- ------  -------- ---------- ---------- ---------- ----------  ----------
Parse        1      0.00       0.00          0          0          0           0
Execute      1      0.00       0.00          0          0          0           0
Fetch        1      0.00       0.00        342        342          3           1
------- ------  -------- ---------- ---------- ---------- ----------  ----------
total        3      0.00       0.00        342        342          3           1

Row Source Operation
---------------------------------------------------
SORT AGGREGATE (cr=342 pr=342 pw=0 time=3788 us)
 COUNT STOPKEY (cr=342 pr=342 pw=0 time=1371 us)
  TABLE ACCESS FULL KSUN_TAB (cr=342 pr=342 pw=0 time=925 us cost=99 size=1169334 card=334)

  Event waited on                             Times   Max. Wait  Total Waited
  ----------------------------------------   Waited  ----------  ------------
  db file sequential read                         2        0.00          0.00
  db file scattered read                         23        0.00          0.00

-- Raw Trace File --
'db file sequential read' ela= 19 file#=917 block#=130 blocks=1 obj#=2260476 tim=647299975335
'db file sequential read' ela= 14 file#=3 block#=768 blocks=1 obj#=0 tim=647299975399
     -- UNDO file#=3   /oratestdb/oradata/testdb/undo01.dbf
'db file scattered read' ela= 22 file#=917 block#=131 blocks=5 obj#=2260476 tim=647299975501
'db file scattered read' ela= 25 file#=917 block#=136 blocks=8 obj#=2260476 tim=647299975609
'db file scattered read' ela= 25 file#=917 block#=145 blocks=7 obj#=2260476 tim=647299975713
'db file scattered read' ela= 23 file#=917 block#=152 blocks=8 obj#=2260476 tim=647299975806
'db file scattered read' ela= 25 file#=917 block#=161 blocks=7 obj#=2260476 tim=647299975901
'db file scattered read' ela= 24 file#=917 block#=168 blocks=8 obj#=2260476 tim=647299975994
'db file scattered read' ela= 23 file#=917 block#=177 blocks=7 obj#=2260476 tim=647299976088
'db file scattered read' ela= 23 file#=917 block#=184 blocks=8 obj#=2260476 tim=647299976178
'db file scattered read' ela= 23 file#=917 block#=193 blocks=7 obj#=2260476 tim=647299976270
'db file scattered read' ela= 22 file#=917 block#=200 blocks=8 obj#=2260476 tim=647299976364
'db file scattered read' ela= 23 file#=917 block#=209 blocks=7 obj#=2260476 tim=647299976465
'db file scattered read' ela= 22 file#=917 block#=216 blocks=8 obj#=2260476 tim=647299976554
'db file scattered read' ela= 22 file#=917 block#=225 blocks=7 obj#=2260476 tim=647299976646
'db file scattered read' ela= 29 file#=917 block#=232 blocks=8 obj#=2260476 tim=647299976759
'db file scattered read' ela= 24 file#=917 block#=241 blocks=7 obj#=2260476 tim=647299976866
'db file scattered read' ela= 23 file#=917 block#=248 blocks=8 obj#=2260476 tim=647299976956
'db file scattered read' ela= 128 file#=917 block#=258 blocks=32 obj#=2260476 tim=647299977200
'db file scattered read' ela= 95 file#=917 block#=290 blocks=32 obj#=2260476 tim=647299977511
'db file scattered read' ela= 97 file#=917 block#=322 blocks=32 obj#=2260476 tim=647299977822
'db file scattered read' ela= 87 file#=917 block#=354 blocks=30 obj#=2260476 tim=647299978113
'db file scattered read' ela= 96 file#=917 block#=386 blocks=32 obj#=2260476 tim=647299978407
'db file scattered read' ela= 108 file#=917 block#=418 blocks=32 obj#=2260476 tim=647299978719
'db file scattered read' ela= 94 file#=917 block#=450 blocks=32 obj#=2260476 tim=647299979021

------------------------------ dtrace ------------------------------
   PROBEFUNC      FD  RETURN_SIZE     COUNT
   pread         260         8192         1
   pread         260       245760         1
   readv         260        40960         1
   pread         260       262144         6
   readv         260        57344         7
   lseek         260            0         8
   readv         260        65536         8
 
   PROBEFUNC      FD  MAX_READ_Blocks
   readv         260                8
   pread         260               32
 
 TOTAL_SIZE = 2793472   , TOTAL_READ_Blocks = 341   , TOTAL_READ_CNT = 24
 
   readv         260
            value  ------------- Distribution ------------- count
             8192 |                                         0
            16384 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 16
            32768 |                                         0
 
   pread         260
            value  ------------- Distribution ------------- count
            16384 |                                         0
            32768 |@@@@@@@@@@@@@@@@@@@@                     4
            65536 |@@@@@@@@@@@@@@@                          3
           131072 |@@@@@                                    1
           262144 |                                         0

*Note: One extra Read in SQL Trace: 25 (=23+2) Reads with 342 Blocks in SQL Trace vs. 
       dtrace 24 Reads with 341 Blocks is due to one UNDO Read (file#=3).


2. UNIX References


At first some documentations from UNIX man pages and internet.
  ssize_t pread(int fildes, void * buf, size_t nbyte, off_t offset);
  ssize_t readv(int fildes, struct iovec * iov, int iovcnt);
  off_t lseek(int fildes, off_t offset, int whence)
pread(): performs the same action as read(), except that it reads from a given position in the file 
         without changing the file pointer. The first three arguments to pread() are the same as read() with the addition of 
         a fourth argument offset for the desired position inside the file. pread() will read up to the maximum offset value 
         that can be represented in an off_t for regular files.

readv(): is equivalent to read(), but places the input data into the iovcnt buffers 
         specified by the members of the iov array: iov 0 , iov 1 , ..., iov [ iovcnt -1]. 
         The iovcnt argument is valid if greater than 0 and less than or equal to IOV_MAX .
         
         The iovec structure contains the following members:
             caddr_t   iov_base;
             int       iov_len;  
         Each iovec entry specifies the base address and length of an area in memory where data should be placed. 
         The readv() function always fills an area completely before proceeding to the next.
         
         Upon successful completion, readv() marks for update the st_atime field of the file.
         
         IOV_MAX: 1024 On Linux, 16 On Solaris, 16 On AIX and HP-UX
            
readv (Linux): Read data from file descriptor FD, and put the result in the buffers described by VECTOR, 
         which is a vector of COUNT 'struct iovec's. The buffers are filled in the order specified. 
         Operates just like 'read' (see <unistd.h>) except that data are put in VECTOR instead of a contiguous buffer.    

lseek(): sets the file pointer associated with the open file descriptor specified by fildes.


3. Oracle References


db file sequential read: P1 = file#, P2 = block#, P3 = blocks
 file#:  This is the file# of the file that Oracle is trying to read from. From Oracle8 onwards it is the ABSOLUTE file number (AFN).
 block#: This is the starting block number in the file from where Oracle starts reading the blocks. Typically only one block is being read.
 blocks: This parameter specifies the number of blocks that Oracle is trying to read from the file# starting at block#. This is
   usually "1" but if P3 > 1 then this is a multiblock read. Multiblock "db file sequential read"s may be seen in 
   earlier Oracle versions when reading from a SORT (TEMPORARY) segments. 
   
db file scattered read:  P1 = file#, P2 = block#, P3 = blocks
 file#:  This is the file# of the file that Oracle is trying to read from. In Oracle8 onwards it is the ABSOLUTE file number (AFN).
 block#: This is the starting block number in the file from where Oracle starts reading the blocks. 
 blocks: This parameter specifies the number of blocks that Oracle is trying to read from the file# starting at block#. 
   The upper limit is DB_FILE_MULTIBLOCK_READ_COUNT, which is self tuned from Oracle 10.2 onwards.
                
db file parallel read:   P1 = files, P2 = blocks, P3 = requests
 files:    This indicates the number of files to which the session is reading
 blocks:   This indicates the total number of blocks to be read
 requests: This indicates the total number of I/O requests, which will be the same as blocks
 
WAITEVENT: "db file sequential read" Reference Note (Doc ID 34559.1)
 This signifies a wait for an I/O read request to complete. This call differs from "db file scattered read" 
 in that a sequential read reads data into contiguous memory (whilst a scattered read reads multiple blocks 
 and scatters them into different buffers in the SGA). 

WAITEVENT: "db file scattered read" Reference Note (Doc ID 34558.1) 
 This wait happens when a session is waiting for a multiblock IO to complete. This typically occurs during 
 FULL TABLE SCANs or INDEX FAST FULL SCANs. Oracle reads up to DB_FILE_MULTIBLOCK_READ_COUNT consecutive blocks 
 at a time and scatters them into buffers in the buffer cache.
  
C.3.34 db file parallel read
 This happens during recovery. It can also happen during buffer prefetching, 
 as an optimization (rather than performing multiple single-block reads). 


4. Discussions


(a). pread and readv: both read contiguous file space. pread places the input data into one single contiguous buffer (memory space);
     whereas readv distributes them into multi buffers.
     
     It looks like that the difference between pread and readv is the difference of memory allocation. Their disk operations are the same.
     
     pread specifies file reading position by a third parameter: offset;
     whereas readv requires a precedent lseak (with parameter: offset) to fix the file reading position.

     Contiguous file space means logically contiguous in a file, but does not mean physically contiguous in a disk.
     Oracle 12 Logical Storage Structures Section: Overview of Extents wrote:
          An extent is a unit of database storage made up of logically contiguous data blocks. Data blocks can be physically spread out 
             on disk because of RAID striping and file system implementations.

(b). pread can fulfill all 3 kinds of db file read:
         db file sequential read (Test 1.1. single Read)
         db file scattered read  (Test 1.3. parallel Read, see next discussion)
         db file parallel read   (Test 1.3. parallel Read, see next discussion)
     whereas readv is for
         db file scattered read  (Test 1.2. scattered Read)  

(c). In all three tests of "single Read", "scattered Read" and "parallel Read", 
     SQL Trace shows "disk" is bigger than "query" (641 > 333,  2664 > 333, 344 > 335), 
     there seems some wastage since more disk Blocks are read than consumed.
     
     But the number of "disk" read Requests, which is gathered by dtrace TOTAL_READ_CNT, is no more than 
     SQL Trace "disk" and "query". From performance point of view, number of "disk" read Requests is one more cost factor
     than number of read Blocks. This is visible in dtrace quantize (frequency distribution diagram) output, 
     where the value field denotes elapsed nanoseconds. For example, in Test "single Read", elapsed time per Block read 
     for readv (8 Blocks per read Request), and pread (1 Block per read Request) can be calculated as:     
     
       readv: (16384*43 + 32768*1)/8/44 = 2094 
       pread: (4096*226 + 8192*60 + 16384*2 + 65536*1)/289 = 5244 

(d). 'db file parallel read' specifies the number of files (first parameter), and number of blocks to read (second and third parameters are equal).
     
     We only observe 'db file parallel read' in Test:
        exec db_file_read_test('parallel', 1, 333);   

Event waited on                             Times   Max. Wait  Total Waited
----------------------------------------   Waited  ----------  ------------
db file scattered read                          4        0.00          0.00
db file parallel read                           2        0.00          0.00

-- Syscalls and Raw Trace File --
readv: 'db file scattered read' ela= 49  file#=917 block#=10368 blocks=8     obj#=2260477 (Index KSUN_TAB#I1)
readv: 'db file scattered read' ela= 27  file#=917 block#=128   blocks=8     obj#=2260476 (Table KSUN_TAB) 
readv: 'db file scattered read' ela= 21  file#=917 block#=136   blocks=8     obj#=2260476
pread: 'db file parallel read'  ela= 422 files=1   blocks=127   requests=127 obj#=2260476
pread: 'db file parallel read'  ela= 334 files=1   blocks=127   requests=127 obj#=2260476
pread: 'db file scattered read' ela= 264 file#=917 block#=409   blocks=66    obj#=2260476

------------------------------ dtrace ------------------------------
  PROBEFUNC      FD  RETURN_SIZE     COUNT
  pread         260       540672         1
  lseek         260            0         2
  readv         260        65536         3
  pread         260         8192       254
  
  PROBEFUNC      FD  MAX_READ_Blocks
  readv         260                8
  pread         260               66

TOTAL_SIZE = 2818048   , TOTAL_READ_Blocks = 344   , TOTAL_READ_CNT = 258
From above SQL Raw Trace File and Dtrace, we can see: first 3 'db file scattered read' with "blocks=8", corresponding to 3 readv, each reads 65536 Bytes (8 Blocks). next 2 'db file parallel read' with "blocks=127", corresponding to 254 pread, each reads 8192 Bytes (1 Block). last 'db file scattered read' with "blocks=66", corresponding to 1 pread, which reads 540672 Bytes (66 Blocks). In other words, there exist only 3 readv, which read 3*65536 Bytes=24 Blocks. These 3 readv can neither match to 254 (=127 + 127) Blocks 'db file parallel read', nor 66 Blocks 'db file scattered read'. Therefore both 'db file parallel read' and 'db file scattered read' in this test are accomplished by pread. 'db file parallel read' is similar to 'db file sequential read', the difference is that former reads multi Blocks (probably asynchronously), but later reads one single Block. In fact, we can observe the 254 aio requests by:

SQL > exec db_file_read_test('parallel', 1, 333);

sudo dtrace -n '
syscall::pread:entry / pid == $1 && arg0 == $2 / {self->pread_fd = arg0;} 
syscall::pread:return/ pid == $1 && self->pread_fd == $2 / 
 {@STACK_CNT[probefunc, self->pread_fd, arg1, ustack(5, 0)]=count(); self->pread_fd = 0;}
' 11352 260 

  pread   260   540672
    libc.so.1`_pread+0xa
    oracle`skgfqio+0x284
    oracle`ksfd_skgfqio+0x195
    oracle`ksfd_skgfrvio+0xcb4
    oracle`ksfd_vio+0x9a3
        1
        
  pread   260   8192
    libc.so.1`_pread+0xa
    libaio.so.1`_aio_do_request+0x18e
    libc.so.1`_thr_setup+0x5b
    libc.so.1`_lwp_start
      254
'db file parallel read' has 2 "plural" parameters, P1 (files), P3 (requests) (P2=P3), each of which can denote one dimension of parallel operations. P1 (files) signifies multi files reading in parallel. P3 (requests) stands for multi parallel Disk Reading requests delegating to multi-threading LWP processes. Similar to 'log file parallel write' and 'control file parallel write', in which first parameter "files" represents number of log files (in one redo group) and number of control files. In the above example, since files=1, therefore 'db file parallel read' implies multi requests. Because multi requests are performed asynchronously, probably better named as 'db file async read'. If we set DISK_ASYNCH_IO=false (and restart DB), there is no more aio observed with dtrace as shown in following output, but SQL Trace is not able to reveal this setting change, and it still shows the same output as above.

  pread   268   540672
    libc.so.1`_pread+0xa
    oracle`skgfqio+0x284
    oracle`ksfd_skgfqio+0x195
    oracle`ksfd_skgfrvio+0xcb4
    oracle`ksfd_vio+0x9a3
        1
  pread    268   8192
    libc.so.1`_pread+0xa
    oracle`skgfqio+0x284
    oracle`ksfd_skgfqio+0x203
    oracle`ksfdgo+0x188
    oracle`ksfd_sbio+0xdd1
      254   
The last 'db file scattered read' with "blocks=66" also shows one pread can read 66 Blocks, much higher than db_file_multiblock_read_count=32 in this database. By the way, we have 3 readv, but only 2 lseek, so there are probably 2 readv share one lseek.


5. V$FILESTAT vs. V$IOSTAT_FILE Views


There exist two Oracle views recording file disk I/O statistics:
  1. V$FILESTAT and its Cumulative DBA_HIST_FILESTATXS in centisecond since Oracle 8 (or 7).
  2. V$IOSTAT_FILE and DBA_HIST_IOSTAT_* in milliseconds since Oracle 11.
For temp files, the counterpart of V$FILESTAT is V$TEMPSTAT; and in V$IOSTAT_FILE, filetype_name is marked as 'Temp File'.

According to MOS (Doc ID 2222370.1): What Does "1-bk Rds/s" and "Av 1-bk Rd(ms)" in "Tablespace IO Stats" and "File IO Stats" in AWR Report Mean? (Doc ID 2222370.1)
"Tablespace IO Stats" and "File IO Stats" in AWR report come from the columns of DBA_HIST_FILESTATXS.

But AWR basis table SYS.WRH$_IOSTAT_FILETYPE is filled by V$IOSTAT_FILE.

INSERT INTO WRH$_IOSTAT_FILETYPE (dbid,
                                  con_dbid,
                                  snap_id,
                                  instance_number,
                                  filetype_id,
                                  small_read_megabytes,
                                  small_write_megabytes,
                                  large_read_megabytes,
                                  large_write_megabytes,
                                  small_read_reqs,
                                  small_write_reqs,
                                  small_sync_read_reqs,
                                  large_read_reqs,
                                  large_write_reqs,
                                  small_read_servicetime,
                                  small_write_servicetime,
                                  small_sync_read_latency,
                                  large_read_servicetime,
                                  large_write_servicetime,
                                  retries_on_error)
    SELECT :dbid,
           :srcdbid                    con_dbid,
           :snap_id,
           :instance_number,
           filetype_id,
           SUM (small_read_megabytes)  small_read_megabytes,
           SUM (small_write_megabytes) small_write_megabytes,
           SUM (large_read_megabytes)  large_read_megabytes,
           SUM (large_write_megabytes) large_write_megabytes,
           SUM (small_read_reqs)       small_read_reqs,
           SUM (small_write_reqs)      small_write_reqs,
           SUM (small_sync_read_reqs)  small_sync_read_reqs,
           SUM (large_read_reqs)       large_read_reqs,
           SUM (large_write_reqs)      large_write_reqs,
           SUM (small_read_servicetime) small_read_servicetime,
           SUM (small_write_servicetime) small_write_servicetime,
           SUM (small_sync_read_latency) small_sync_read_latency,
           SUM (large_read_servicetime) large_read_servicetime,
           SUM (large_write_servicetime) large_write_servicetime,
           SUM (retries_on_error)      retries_on_error
      FROM v$iostat_file
  GROUP BY filetype_id;  
In AWR Report, both V$IOSTAT_FILE and V$FILESTAT data are visible in different places. For example, in Section: "IOStat by Filetype summary", the last 2 Columns are named as "Small Read" and "Large Read", which are probably from V$IOSTAT_FILE. But in Section "Tablespace IO Stats" and "File IO Stats", the Column names are prefixed by "1-bk Rd", which are from from V$FILESTAT. Therefore, potential stats inconsistence can appear even in the same AWR report.


6. DB File Read Stats Tests


Run test (see appended Test Code):

alter session set timed_statistics = true; 
alter session set statistics_level=all; 

truncate table read_stats;
exec db_file_read_test('single',    1, 333);
exec db_file_read_test('scattered', 1, 333);
exec db_file_read_test('parallel',  1, 333);
exec db_file_read_test('full',      1, 333);
and than watch statistics in both views (AIX and Linux are added for comparison. Copy HTML Table to Excel for easy reading):

select test_name,  
       phyrds, phyblkrd, singleblkrds, singleblkrdtim, 
       ceil(small_read_megabytes*1024*1024/8192) small_read_blks,       -- converted to Block for comparison 
       small_read_reqs, small_read_servicetime, small_sync_read_reqs, small_sync_read_latency,  
       ceil(large_read_megabytes*1024*1024/8192) large_read_blks,       -- converted to Block for comparison 
       large_read_reqs, large_read_servicetime 
from read_stats_delta_v where phyrds > 0 order by test_name desc, test_timestamp;

OS test_name phyrds phyblkrd singleblkrds singleblkrdtim small_read_blks small_read_reqs small_read_servicetime small_sync_read_reqs small_sync_read_latency large_read_blks large_read_reqs large_read_servicetime
Solaris single
333
641
289
10
640
333
0
289
0
0
0
0
Solaris scattered
333
2664
0
0
2560
333
0
0
0
0
0
0
Solaris parallel
258
344
254
0
384
257
0
508
0
0
1
0
Solaris full
24
341
1
0
0
17
0
1
0
256
7
0
AIX single
333
333
333
150
384
333
37
333
37
0
0
0
AIX scattered
333
333
333
1460
384
333
1310
333
1310
0
0
0
AIX parallel
260
335
259
110
256
259
17
259
17
0
1
1
AIX full
24
341
1
20
128
17
41
1
12
256
7
26
Linux single
333
641
289
0
640
333
0
289
0
0
0
0
Linux scattered
333
2664
0
0
2688
333
0
0
0
0
0
0
Linux parallel
258
344
254
0
256
257
0
508
0
0
1
0
Linux full
24
341
1
0
128
17
0
1
0
256
7
0


7. DB File Read Stats Test Discussion


The first 4 stats Columns (green) are from V$FILESTAT, the rest from V$IOSTAT_FILE. Here some observations:
(a). V$FILESTAT.phyrds matches TOTAL_READ_CNT, equal to V$IOSTAT_FILE.(small_read_reqs + large_read_reqs).
      V$FILESTAT.phyblkrd matches TOTAL_READ_Blocks, approximate to V$IOSTAT_FILE.(small_read_blks + large_read_blks).
      V$FILESTAT.singleblkrds matches V$IOSTAT_FILE.small_sync_read_reqs except "parallel".
         V$FILESTAT.singleblkrds are 1 DB Block(8192 Bytes) Read (' db file sequential read' or 'db file parallel read').
         It is not clear why small_sync_read_reqs (508) in "parallel" is much higher than dtrace TOTAL_READ_Blocks (344).
     V$IOSTAT_FILE.small_read_reqs includes small_sync_read_reqs execept "parallel".
     V$IOSTAT_FILE.small_sync_read_reqs matches 'db file sequential read' except "parallel".
     
     All time metrics are (almost) zero: singleblkrdtim, small_read_servicetime, small_sync_read_latency, large_read_servicetime.
     Time in V$FILESTAT is in centiseconds, V$IOSTAT_FILE in milliseconds. Both could have rounding errors.
     
     AIX seems able to record the time metrics, probably because they are bigger than the minimum time granule.

(b). Only 'scattered' Test:
          exec db_file_read_test('scattered', 1, 333);
     shows much higher small_read_blks: 2560 Blocks by small_read_reqs: 333 Requests (2560/333 approximately 8 Blocks per Request), 
     marked as "db file scattered read" and accomplished by "readv".

     However, in V$IOSTAT_FILE, they are entirely classified as SMALL_READ_MEGABYTES (small_read_blks) and SMALL_READ_REQS.
     But Oracle 12c V$IOSTAT_FILE Docu said that SMALL_READ_* are all about "single block read" even though dtrace evidenced 
     them as multi block read (8 Blocks per readv). 
     
     Logically it is controversial if small_read_blks is bigger than small_read_reqs because each single block read request
     can read only one single block.

(c). 'db file parallel read' in "parallel" test seems running asynchronously, and hard to collect precise statistics.

(d). V$IOSTAT_FILE has one pair of stats about single block read
        -. small_read_reqs / small_read_servicetime
        -. small_sync_read_reqs / small_sync_read_latency

(e). Oracle Time Accounting also evolves with time. MOS:
       I/O Wait Event Accounting Change in the RDBMS 12.1.0.2.0 Patch Set Release (Doc ID 1967410.1)
     said:
       Starting with the RDBMS 12.1.0.2.0 Patch Set Release, the accounting for certain I/O-related wait events has been updated 
       to be more accurate than it was before. As a result, these wait events may show an increase of wait times of around 10%.
         
       The wait events that are affected include, but are not limited to, the following:
         db file sequential read
         db file scattered read
         db file parallel read
         log file parallel write


8. Test Code



drop tablespace ksun_ts including contents and datafiles;
create tablespace ksun_ts datafile '/oratestdb/oradata/testdb/ksun_ts.dbf' size 200m online;

drop table ksun_tab;

-- DB_BLOCK_SIZE = 8192, each row occupies one BLOCK
create table ksun_tab tablespace ksun_ts as 
select level x, rpad('ABC', 3500, 'X') y, rpad('ABC', 3500, 'X') z from dual connect by level <= 1e4; 

select round(bytes/1024/1024) mb, blocks from dba_segments where segment_name = 'KSUN_TAB';       
--80 10240

create index ksun_tab#i1 on ksun_tab(x) tablespace ksun_ts;

exec dbms_stats.gather_table_stats(null, 'KSUN_TAB', cascade=>true);

drop tablespace ksun_ts_aux including contents and datafiles;
create tablespace ksun_ts_aux datafile '/oratestdb/oradata/testdb/ksun_ts_aux.dbf' size 200m online;

drop table ksun_rid;
create table ksun_rid tablespace ksun_ts_aux as select x, rowid rid from ksun_tab;
create index ksun_rid#i1 on ksun_rid(x) tablespace ksun_ts_aux;

exec dbms_stats.gather_table_stats(null, 'KSUN_RID', cascade=>true);

create or replace view read_stats_v as
select to_char(localtimestamp, 'yyyy-mm-dd hh24:mi:ss') test_timestamp
      ,phyrds, phyblkrd, singleblkrds, 10*singleblkrdtim singleblkrdtim
      ,small_read_megabytes, small_read_reqs, small_read_servicetime, small_sync_read_reqs, small_sync_read_latency
      ,large_read_megabytes, large_read_reqs, large_read_servicetime
      ,f.name
from   v$filestat v8,  v$iostat_file v11, v$datafile f
where  v8.file#    = f.file# 
  and  v11.file_no = f.file#
  and  f.name  like '%ksun_ts.dbf';
  
drop table read_stats;

create table read_stats as select 'setall_seq_readxx' test_name, v.* from read_stats_v v where 1=2;

create or replace view read_stats_delta_v as
select test_name, test_timestamp
      ,phyrds         - lag(phyrds)         over(partition by test_name order by test_timestamp)                   phyrds                 
      ,phyblkrd       - lag(phyblkrd)       over(partition by test_name order by test_timestamp)                   phyblkrd              
      ,singleblkrds   - lag(singleblkrds)   over(partition by test_name order by test_timestamp)                   singleblkrds 
      ,singleblkrdtim - lag(singleblkrdtim) over(partition by test_name order by test_timestamp)                   singleblkrdtim         
      ,small_read_megabytes    - lag(small_read_megabytes)  over(partition by test_name order by test_timestamp)   small_read_megabytes  
      ,small_read_reqs         - lag(small_read_reqs)         over(partition by test_name order by test_timestamp) small_read_reqs 
      ,small_read_servicetime  - lag(small_read_servicetime)  over(partition by test_name order by test_timestamp) small_read_servicetime 
      ,small_sync_read_reqs    - lag(small_sync_read_reqs)    over(partition by test_name order by test_timestamp) small_sync_read_reqs 
      ,small_sync_read_latency - lag(small_sync_read_latency) over(partition by test_name order by test_timestamp) small_sync_read_latency
      ,large_read_megabytes    - lag(large_read_megabytes)    over(partition by test_name order by test_timestamp) large_read_megabytes
      ,large_read_reqs         - lag(large_read_reqs)         over(partition by test_name order by test_timestamp) large_read_reqs  
      ,large_read_servicetime  - lag(large_read_servicetime)  over(partition by test_name order by test_timestamp) large_read_servicetime 
from read_stats s;

create or replace procedure db_file_read_test (p_test_name varchar2, p_loops number, p_rows number) as
 l_max_y           varchar2(3500);
 type tab_rowid is table of rowid index by pls_integer;
 l_rowid_cache     tab_rowid;
begin
  case 
   when p_test_name = 'single' then
     select rid bulk collect into l_rowid_cache from ksun_rid where x between 1 and p_rows;                
   when p_test_name = 'scattered' then
     select rid bulk collect into l_rowid_cache from ksun_rid where mod(x, 10) = 0 and rownum <= p_rows;   
   else null;
 end case;
 dbms_output.put_line('Number of Rows to read = '|| l_rowid_cache.count);
 
 insert into read_stats select p_test_name test_name, v.* from read_stats_v v;
 for i in 1..p_loops loop
  execute immediate 'alter system flush buffer_cache';
  dbms_lock.sleep(1);
  case 
    when p_test_name = 'single' then
     for r in 1..l_rowid_cache.count loop
       -- adjacent rowid, single block read, 'db file sequential read'
       select /*+ single_read */ y into l_max_y from ksun_tab t where rowid = l_rowid_cache(r);
     end loop;
    when p_test_name = 'scattered' then
     for r in 1..l_rowid_cache.count loop
       -- jumped rowid, scattered read, 'db file scattered read'
       select /*+ scattered_read */ y into l_max_y from ksun_tab t where rowid = l_rowid_cache(r);
     end loop;   
   when p_test_name = 'parallel' then
     -- table access by index rowid batched   'db file parallel read'
     select /*+ index(t ksun_tab#i1) parallel_read */ max(y) into l_max_y from ksun_tab t where x between 1 and p_rows;
   when p_test_name = 'full' then
     -- table access by FULL
     select /*+ full_read */ max(y) into l_max_y from ksun_tab t where rownum <= p_rows;
   end case;
  dbms_lock.sleep(1);
  insert into read_stats select p_test_name test_name, v.* from read_stats_v v;
  end loop;
  commit;
end;
/


9. Dtrace Script



pfiles 11352
  260: /oratestdb/oradata/testdb/ksun_ts.dbf
  
--Dtrace Script

#!/usr/sbin/dtrace -Zs

/* 
 * read_dtrace.d   pid   fd
 * chmod u+x read_dtrace.d
 * sudo ./read_dtrace.d 11352 260 
*/

BEGIN / $1 > 0 && $2 > 0 / 
 {TOTAL_SIZE = 0; TOTAL_READ_CNT = 0; } 
syscall::pread:entry / pid == $1 && arg0 == $2 / 
  {self->pread_fd = arg0; self->pread_t = timestamp;} 
syscall::pread:return/ pid == $1 && self->pread_fd == $2 / 
 {@CNT[probefunc, self->pread_fd, arg1] = count(); 
  @MAXB[probefunc, self->pread_fd] = max(arg1/8192);
  @ETIME[probefunc, self->pread_fd] = quantize(timestamp- self->pread_t); 
   TOTAL_SIZE = TOTAL_SIZE + arg1; TOTAL_READ_CNT = TOTAL_READ_CNT + 1;
   self->pread_fd = 0;} 
syscall::readv:entry / pid == $1 && arg0 == $2 / 
 {self->readv_fd = arg0; self->readv_t = timestamp; } 
syscall::readv:return/ pid == $1 && self->readv_fd == $2 / 
 {@CNT[probefunc, self->readv_fd, arg1] = count(); 
  @MAXB[probefunc, self->readv_fd] = max(arg1/8192);
  @ETIME[probefunc, self->readv_fd] = quantize(timestamp- self->readv_t); 
   TOTAL_SIZE = TOTAL_SIZE + arg1; TOTAL_READ_CNT = TOTAL_READ_CNT + 1;
   self->readv_fd = 0;} 
syscall::kaio:entry  / pid == $1 && arg1 == $2 / 
 {self->kaio = arg1;} 
syscall::kaio:return / pid == $1 && self->kaio == $2 /  
 {@CNT[probefunc,  self->kaio, arg1] = count(); self->kaio  = 0;} 
syscall::lseek:entry / pid == $1 && arg0 == $2/ 
 {@CNT[probefunc, arg0, 0] = count(); } 
END / $1 > 0 && $2 > 0 / 
    {printf("\n%11s  %6s %12s %9s \n", "PROBEFUNC", "FD", "RETURN_SIZE", "COUNT");
   printa("  %-10s %6d %12d %9@d\n", @CNT);
   printf("\n%11s  %6s %16s \n", "PROBEFUNC", "FD", "MAX_READ_Blocks");
   printa("  %-10s %6d %16@d\n", @MAXB);
   printf("\nTOTAL_SIZE = %-10d, TOTAL_READ_Blocks = %-6d, TOTAL_READ_CNT = %-6d\n", 
             TOTAL_SIZE, TOTAL_SIZE/8192, TOTAL_READ_CNT);
   printa("\n%-10s %6d %16@d\n", @ETIME);}