We will first test 3 varieties of db file read:
db file sequential read db file scattered read db file parallel readand then compare the cost (statistics) reported in 3 places for different access path:
SQL Trace Dtrace Oracle View: V$FILESTAT and V$IOSTAT_FILENote: All tests are done in Oracle 12.1.0.2 on Solaris.
1. DB File Read Access Path
Run 4 variants of Access Path tests with SQl Trace (Event 10046), and at the same time dtace the process (see appended Dtrace Script).
1.1. single Read
SQL > exec db_file_read_test('single', 1, 333);
-- adjacent rowid, single block read, 'db file sequential read'
SELECT /*+ single_read */ Y FROM KSUN_TAB T WHERE ROWID = :B1
call count cpu elapsed disk query current rows
------- ------ -------- ---------- ---------- ---------- ---------- ----------
Parse 1 0.00 0.00 0 0 0 0
Execute 333 0.01 0.01 0 0 0 0
Fetch 333 0.01 0.01 641 333 0 333
------- ------ -------- ---------- ---------- ---------- ---------- ----------
total 667 0.02 0.02 641 333 0 333
Row Source Operation
---------------------------------------------------
TABLE ACCESS BY USER ROWID KSUN_TAB (cr=1 pr=8 pw=0 time=160 us cost=1 size=3513 card=1)
Event waited on Times Max. Wait Total Waited
---------------------------------------- Waited ---------- ------------
db file scattered read 44 0.00 0.00
db file sequential read 289 0.00 0.00
------------------------------ dtrace ------------------------------
PROBEFUNC FD RETURN_SIZE COUNT
lseek 260 0 44
readv 260 65536 44
pread 260 8192 289
PROBEFUNC FD MAX_READ_Blocks
pread 260 1
readv 260 8
TOTAL_SIZE = 5251072 , TOTAL_READ_Blocks = 641 , TOTAL_READ_CNT = 333
readv 260
value ------------- Distribution ------------- count
8192 | 0
16384 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 43
32768 |@ 1
65536 | 0
pread 260
value ------------- Distribution ------------- count
2048 | 0
4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 226
8192 |@@@@@@@@ 60
16384 | 2
32768 | 0
65536 | 1
131072 | 0
1.2. scattered Read
SQL > exec db_file_read_test('scattered', 1, 333);
-- jumped rowid, scattered read, 'db file scattered read'
SELECT /*+ scattered_read */ Y FROM KSUN_TAB T WHERE ROWID = :B1
call count cpu elapsed disk query current rows
------- ------ -------- ---------- ---------- ---------- ---------- ----------
Parse 1 0.00 0.00 0 0 0 0
Execute 333 0.00 0.00 0 0 0 0
Fetch 333 0.02 0.02 2664 333 0 333
------- ------ -------- ---------- ---------- ---------- ---------- ----------
total 667 0.02 0.02 2664 333 0 333
Row Source Operation
---------------------------------------------------
TABLE ACCESS BY USER ROWID KSUN_TAB (cr=1 pr=8 pw=0 time=156 us cost=1 size=3513 card=1)
Elapsed times include waiting on following events:
Event waited on Times Max. Wait Total Waited
---------------------------------------- Waited ---------- ------------
db file scattered read 333 0.00 0.00
------------------------------ dtrace ------------------------------
PROBEFUNC FD RETURN_SIZE COUNT
lseek 260 0 91
readv 260 65536 333
PROBEFUNC FD MAX_READ_Blocks
readv 260 8
TOTAL_SIZE = 21823488 , TOTAL_READ_Blocks = 2664 , TOTAL_READ_CNT = 333
readv 260
value ------------- Distribution ------------- count
8192 | 0
16384 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 290
32768 |@@@@@ 43
65536 | 0
1.3. parallel Read
SQL > exec db_file_read_test('parallel', 1, 333);
SELECT /*+ index(t ksun_tab#i1) parallel_read */ MAX(Y) FROM KSUN_TAB T WHERE X BETWEEN 1 AND :B1
call count cpu elapsed disk query current rows
------- ------ -------- ---------- ---------- ---------- ---------- ----------
Parse 1 0.00 0.00 0 0 0 0
Execute 1 0.00 0.00 0 0 0 0
Fetch 1 0.00 0.00 344 335 0 1
------- ------ -------- ---------- ---------- ---------- ---------- ----------
total 3 0.00 0.00 344 335 0 1
Row Source Operation
---------------------------------------------------
SORT AGGREGATE (cr=335 pr=344 pw=0 time=3760 us)
FILTER (cr=335 pr=344 pw=0 time=1698 us)
TABLE ACCESS BY INDEX ROWID BATCHED KSUN_TAB (cr=335 pr=344 pw=0 time=1361 us cost=168 size=1167165 card=333)
INDEX RANGE SCAN KSUN_TAB#I1 (cr=2 pr=8 pw=0 time=279 us cost=1 size=0 card=333)(object id 2260477)
Event waited on Times Max. Wait Total Waited
---------------------------------------- Waited ---------- ------------
db file scattered read 4 0.00 0.00
db file parallel read 2 0.00 0.00
-- Raw Trace File --
'db file scattered read' ela= 49 file#=917 block#=10368 blocks=8 obj#=2260477 (Index KSUN_TAB#I1)
'db file scattered read' ela= 27 file#=917 block#=128 blocks=8 obj#=2260476 (Table KSUN_TAB)
'db file scattered read' ela= 21 file#=917 block#=136 blocks=8 obj#=2260476
'db file parallel read' ela= 422 files=1 blocks=127 requests=127 obj#=2260476
'db file parallel read' ela= 334 files=1 blocks=127 requests=127 obj#=2260476
'db file scattered read' ela= 264 file#=917 block#=409 blocks=66 obj#=2260476
------------------------------ dtrace ------------------------------
PROBEFUNC FD RETURN_SIZE COUNT
pread 260 540672 1
lseek 260 0 2
readv 260 65536 3
pread 260 8192 254
PROBEFUNC FD MAX_READ_Blocks
readv 260 8
pread 260 66
TOTAL_SIZE = 2818048 , TOTAL_READ_Blocks = 344 , TOTAL_READ_CNT = 258
readv 260
value ------------- Distribution ------------- count
8192 | 0
16384 |@@@@@@@@@@@@@@@@@@@@@@@@@@@ 2
32768 |@@@@@@@@@@@@@ 1
65536 | 0
pread 260
value ------------- Distribution ------------- count
2048 | 0
4096 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 196
8192 |@@@@@@@@ 52
16384 | 2
32768 | 0
65536 |@ 4
131072 | 1
262144 | 0
1.4. full Read
SQL > exec db_file_read_test('full', 1, 333);
SELECT /*+ full_read */ MAX(Y) FROM KSUN_TAB T WHERE ROWNUM <= :B1
call count cpu elapsed disk query current rows
------- ------ -------- ---------- ---------- ---------- ---------- ----------
Parse 1 0.00 0.00 0 0 0 0
Execute 1 0.00 0.00 0 0 0 0
Fetch 1 0.00 0.00 342 342 3 1
------- ------ -------- ---------- ---------- ---------- ---------- ----------
total 3 0.00 0.00 342 342 3 1
Row Source Operation
---------------------------------------------------
SORT AGGREGATE (cr=342 pr=342 pw=0 time=3788 us)
COUNT STOPKEY (cr=342 pr=342 pw=0 time=1371 us)
TABLE ACCESS FULL KSUN_TAB (cr=342 pr=342 pw=0 time=925 us cost=99 size=1169334 card=334)
Event waited on Times Max. Wait Total Waited
---------------------------------------- Waited ---------- ------------
db file sequential read 2 0.00 0.00
db file scattered read 23 0.00 0.00
-- Raw Trace File --
'db file sequential read' ela= 19 file#=917 block#=130 blocks=1 obj#=2260476 tim=647299975335
'db file sequential read' ela= 14 file#=3 block#=768 blocks=1 obj#=0 tim=647299975399
-- UNDO file#=3 /oratestdb/oradata/testdb/undo01.dbf
'db file scattered read' ela= 22 file#=917 block#=131 blocks=5 obj#=2260476 tim=647299975501
'db file scattered read' ela= 25 file#=917 block#=136 blocks=8 obj#=2260476 tim=647299975609
'db file scattered read' ela= 25 file#=917 block#=145 blocks=7 obj#=2260476 tim=647299975713
'db file scattered read' ela= 23 file#=917 block#=152 blocks=8 obj#=2260476 tim=647299975806
'db file scattered read' ela= 25 file#=917 block#=161 blocks=7 obj#=2260476 tim=647299975901
'db file scattered read' ela= 24 file#=917 block#=168 blocks=8 obj#=2260476 tim=647299975994
'db file scattered read' ela= 23 file#=917 block#=177 blocks=7 obj#=2260476 tim=647299976088
'db file scattered read' ela= 23 file#=917 block#=184 blocks=8 obj#=2260476 tim=647299976178
'db file scattered read' ela= 23 file#=917 block#=193 blocks=7 obj#=2260476 tim=647299976270
'db file scattered read' ela= 22 file#=917 block#=200 blocks=8 obj#=2260476 tim=647299976364
'db file scattered read' ela= 23 file#=917 block#=209 blocks=7 obj#=2260476 tim=647299976465
'db file scattered read' ela= 22 file#=917 block#=216 blocks=8 obj#=2260476 tim=647299976554
'db file scattered read' ela= 22 file#=917 block#=225 blocks=7 obj#=2260476 tim=647299976646
'db file scattered read' ela= 29 file#=917 block#=232 blocks=8 obj#=2260476 tim=647299976759
'db file scattered read' ela= 24 file#=917 block#=241 blocks=7 obj#=2260476 tim=647299976866
'db file scattered read' ela= 23 file#=917 block#=248 blocks=8 obj#=2260476 tim=647299976956
'db file scattered read' ela= 128 file#=917 block#=258 blocks=32 obj#=2260476 tim=647299977200
'db file scattered read' ela= 95 file#=917 block#=290 blocks=32 obj#=2260476 tim=647299977511
'db file scattered read' ela= 97 file#=917 block#=322 blocks=32 obj#=2260476 tim=647299977822
'db file scattered read' ela= 87 file#=917 block#=354 blocks=30 obj#=2260476 tim=647299978113
'db file scattered read' ela= 96 file#=917 block#=386 blocks=32 obj#=2260476 tim=647299978407
'db file scattered read' ela= 108 file#=917 block#=418 blocks=32 obj#=2260476 tim=647299978719
'db file scattered read' ela= 94 file#=917 block#=450 blocks=32 obj#=2260476 tim=647299979021
------------------------------ dtrace ------------------------------
PROBEFUNC FD RETURN_SIZE COUNT
pread 260 8192 1
pread 260 245760 1
readv 260 40960 1
pread 260 262144 6
readv 260 57344 7
lseek 260 0 8
readv 260 65536 8
PROBEFUNC FD MAX_READ_Blocks
readv 260 8
pread 260 32
TOTAL_SIZE = 2793472 , TOTAL_READ_Blocks = 341 , TOTAL_READ_CNT = 24
readv 260
value ------------- Distribution ------------- count
8192 | 0
16384 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 16
32768 | 0
pread 260
value ------------- Distribution ------------- count
16384 | 0
32768 |@@@@@@@@@@@@@@@@@@@@ 4
65536 |@@@@@@@@@@@@@@@ 3
131072 |@@@@@ 1
262144 | 0
*Note: One extra Read in SQL Trace: 25 (=23+2) Reads with 342 Blocks in SQL Trace vs.
dtrace 24 Reads with 341 Blocks is due to one UNDO Read (file#=3).
2. UNIX References
At first some documentations from UNIX man pages and internet.
ssize_t pread(int fildes, void * buf, size_t nbyte, off_t offset); ssize_t readv(int fildes, struct iovec * iov, int iovcnt); off_t lseek(int fildes, off_t offset, int whence)
pread(): performs the same action as read(), except that it reads from a given position in the file without changing the file pointer. The first three arguments to pread() are the same as read() with the addition of a fourth argument offset for the desired position inside the file. pread() will read up to the maximum offset value that can be represented in an off_t for regular files. readv(): is equivalent to read(), but places the input data into the iovcnt buffers specified by the members of the iov array: iov 0 , iov 1 , ..., iov [ iovcnt -1]. The iovcnt argument is valid if greater than 0 and less than or equal to IOV_MAX . The iovec structure contains the following members: caddr_t iov_base; int iov_len; Each iovec entry specifies the base address and length of an area in memory where data should be placed. The readv() function always fills an area completely before proceeding to the next. Upon successful completion, readv() marks for update the st_atime field of the file. IOV_MAX: 1024 On Linux, 16 On Solaris, 16 On AIX and HP-UX readv (Linux): Read data from file descriptor FD, and put the result in the buffers described by VECTOR, which is a vector of COUNT 'struct iovec's. The buffers are filled in the order specified. Operates just like 'read' (see <unistd.h>) except that data are put in VECTOR instead of a contiguous buffer. lseek(): sets the file pointer associated with the open file descriptor specified by fildes.
3. Oracle References
db file sequential read: P1 = file#, P2 = block#, P3 = blocks file#: This is the file# of the file that Oracle is trying to read from. From Oracle8 onwards it is the ABSOLUTE file number (AFN). block#: This is the starting block number in the file from where Oracle starts reading the blocks. Typically only one block is being read. blocks: This parameter specifies the number of blocks that Oracle is trying to read from the file# starting at block#. This is usually "1" but if P3 > 1 then this is a multiblock read. Multiblock "db file sequential read"s may be seen in earlier Oracle versions when reading from a SORT (TEMPORARY) segments. db file scattered read: P1 = file#, P2 = block#, P3 = blocks file#: This is the file# of the file that Oracle is trying to read from. In Oracle8 onwards it is the ABSOLUTE file number (AFN). block#: This is the starting block number in the file from where Oracle starts reading the blocks. blocks: This parameter specifies the number of blocks that Oracle is trying to read from the file# starting at block#. The upper limit is DB_FILE_MULTIBLOCK_READ_COUNT, which is self tuned from Oracle 10.2 onwards. db file parallel read: P1 = files, P2 = blocks, P3 = requests files: This indicates the number of files to which the session is reading blocks: This indicates the total number of blocks to be read requests: This indicates the total number of I/O requests, which will be the same as blocks WAITEVENT: "db file sequential read" Reference Note (Doc ID 34559.1) This signifies a wait for an I/O read request to complete. This call differs from "db file scattered read" in that a sequential read reads data into contiguous memory (whilst a scattered read reads multiple blocks and scatters them into different buffers in the SGA). WAITEVENT: "db file scattered read" Reference Note (Doc ID 34558.1) This wait happens when a session is waiting for a multiblock IO to complete. This typically occurs during FULL TABLE SCANs or INDEX FAST FULL SCANs. Oracle reads up to DB_FILE_MULTIBLOCK_READ_COUNT consecutive blocks at a time and scatters them into buffers in the buffer cache. C.3.34 db file parallel read This happens during recovery. It can also happen during buffer prefetching, as an optimization (rather than performing multiple single-block reads).
4. Discussions
(a). pread and readv: both read contiguous file space. pread places the input data into one single contiguous buffer (memory space); whereas readv distributes them into multi buffers. It looks like that the difference between pread and readv is the difference of memory allocation. Their disk operations are the same. pread specifies file reading position by a third parameter: offset; whereas readv requires a precedent lseak (with parameter: offset) to fix the file reading position. Contiguous file space means logically contiguous in a file, but does not mean physically contiguous in a disk. Oracle 12 Logical Storage Structures Section: Overview of Extents wrote: An extent is a unit of database storage made up of logically contiguous data blocks. Data blocks can be physically spread out on disk because of RAID striping and file system implementations. (b). pread can fulfill all 3 kinds of db file read: db file sequential read (Test 1.1. single Read) db file scattered read (Test 1.3. parallel Read, see next discussion) db file parallel read (Test 1.3. parallel Read, see next discussion) whereas readv is for db file scattered read (Test 1.2. scattered Read) (c). In all three tests of "single Read", "scattered Read" and "parallel Read", SQL Trace shows "disk" is bigger than "query" (641 > 333, 2664 > 333, 344 > 335), there seems some wastage since more disk Blocks are read than consumed. But the number of "disk" read Requests, which is gathered by dtrace TOTAL_READ_CNT, is no more than SQL Trace "disk" and "query". From performance point of view, number of "disk" read Requests is one more cost factor than number of read Blocks. This is visible in dtrace quantize (frequency distribution diagram) output, where the value field denotes elapsed nanoseconds. For example, in Test "single Read", elapsed time per Block read for readv (8 Blocks per read Request), and pread (1 Block per read Request) can be calculated as: readv: (16384*43 + 32768*1)/8/44 = 2094 pread: (4096*226 + 8192*60 + 16384*2 + 65536*1)/289 = 5244 (d). 'db file parallel read' specifies the number of files (first parameter), and number of blocks to read (second and third parameters are equal). We only observe 'db file parallel read' in Test: exec db_file_read_test('parallel', 1, 333);From above SQL Raw Trace File and Dtrace, we can see: first 3 'db file scattered read' with "blocks=8", corresponding to 3 readv, each reads 65536 Bytes (8 Blocks). next 2 'db file parallel read' with "blocks=127", corresponding to 254 pread, each reads 8192 Bytes (1 Block). last 'db file scattered read' with "blocks=66", corresponding to 1 pread, which reads 540672 Bytes (66 Blocks). In other words, there exist only 3 readv, which read 3*65536 Bytes=24 Blocks. These 3 readv can neither match to 254 (=127 + 127) Blocks 'db file parallel read', nor 66 Blocks 'db file scattered read'. Therefore both 'db file parallel read' and 'db file scattered read' in this test are accomplished by pread. 'db file parallel read' is similar to 'db file sequential read', the difference is that former reads multi Blocks (probably asynchronously), but later reads one single Block. In fact, we can observe the 254 aio requests by:Event waited on Times Max. Wait Total Waited ---------------------------------------- Waited ---------- ------------ db file scattered read 4 0.00 0.00 db file parallel read 2 0.00 0.00 -- Syscalls and Raw Trace File -- readv: 'db file scattered read' ela= 49 file#=917 block#=10368 blocks=8 obj#=2260477 (Index KSUN_TAB#I1) readv: 'db file scattered read' ela= 27 file#=917 block#=128 blocks=8 obj#=2260476 (Table KSUN_TAB) readv: 'db file scattered read' ela= 21 file#=917 block#=136 blocks=8 obj#=2260476 pread: 'db file parallel read' ela= 422 files=1 blocks=127 requests=127 obj#=2260476 pread: 'db file parallel read' ela= 334 files=1 blocks=127 requests=127 obj#=2260476 pread: 'db file scattered read' ela= 264 file#=917 block#=409 blocks=66 obj#=2260476 ------------------------------ dtrace ------------------------------ PROBEFUNC FD RETURN_SIZE COUNT pread 260 540672 1 lseek 260 0 2 readv 260 65536 3 pread 260 8192 254 PROBEFUNC FD MAX_READ_Blocks readv 260 8 pread 260 66 TOTAL_SIZE = 2818048 , TOTAL_READ_Blocks = 344 , TOTAL_READ_CNT = 258
'db file parallel read' has 2 "plural" parameters, P1 (files), P3 (requests) (P2=P3), each of which can denote one dimension of parallel operations. P1 (files) signifies multi files reading in parallel. P3 (requests) stands for multi parallel Disk Reading requests delegating to multi-threading LWP processes. Similar to 'log file parallel write' and 'control file parallel write', in which first parameter "files" represents number of log files (in one redo group) and number of control files. In the above example, since files=1, therefore 'db file parallel read' implies multi requests. Because multi requests are performed asynchronously, probably better named as 'db file async read'. If we set DISK_ASYNCH_IO=false (and restart DB), there is no more aio observed with dtrace as shown in following output, but SQL Trace is not able to reveal this setting change, and it still shows the same output as above.SQL > exec db_file_read_test('parallel', 1, 333); sudo dtrace -n ' syscall::pread:entry / pid == $1 && arg0 == $2 / {self->pread_fd = arg0;} syscall::pread:return/ pid == $1 && self->pread_fd == $2 / {@STACK_CNT[probefunc, self->pread_fd, arg1, ustack(5, 0)]=count(); self->pread_fd = 0;} ' 11352 260 pread 260 540672 libc.so.1`_pread+0xa oracle`skgfqio+0x284 oracle`ksfd_skgfqio+0x195 oracle`ksfd_skgfrvio+0xcb4 oracle`ksfd_vio+0x9a3 1 pread 260 8192 libc.so.1`_pread+0xa libaio.so.1`_aio_do_request+0x18e libc.so.1`_thr_setup+0x5b libc.so.1`_lwp_start 254
The last 'db file scattered read' with "blocks=66" also shows one pread can read 66 Blocks, much higher than db_file_multiblock_read_count=32 in this database. By the way, we have 3 readv, but only 2 lseek, so there are probably 2 readv share one lseek.pread 268 540672 libc.so.1`_pread+0xa oracle`skgfqio+0x284 oracle`ksfd_skgfqio+0x195 oracle`ksfd_skgfrvio+0xcb4 oracle`ksfd_vio+0x9a3 1 pread 268 8192 libc.so.1`_pread+0xa oracle`skgfqio+0x284 oracle`ksfd_skgfqio+0x203 oracle`ksfdgo+0x188 oracle`ksfd_sbio+0xdd1 254
5. V$FILESTAT vs. V$IOSTAT_FILE Views
There exist two Oracle views recording file disk I/O statistics:
1. V$FILESTAT and its Cumulative DBA_HIST_FILESTATXS in centisecond since Oracle 8 (or 7). 2. V$IOSTAT_FILE and DBA_HIST_IOSTAT_* in milliseconds since Oracle 11.For temp files, the counterpart of V$FILESTAT is V$TEMPSTAT; and in V$IOSTAT_FILE, filetype_name is marked as 'Temp File'.
According to MOS (Doc ID 2222370.1): What Does "1-bk Rds/s" and "Av 1-bk Rd(ms)" in "Tablespace IO Stats" and "File IO Stats" in AWR Report Mean? (Doc ID 2222370.1)
"Tablespace IO Stats" and "File IO Stats" in AWR report come from the columns of DBA_HIST_FILESTATXS.
But AWR basis table SYS.WRH$_IOSTAT_FILETYPE is filled by V$IOSTAT_FILE.
INSERT INTO WRH$_IOSTAT_FILETYPE (dbid,
con_dbid,
snap_id,
instance_number,
filetype_id,
small_read_megabytes,
small_write_megabytes,
large_read_megabytes,
large_write_megabytes,
small_read_reqs,
small_write_reqs,
small_sync_read_reqs,
large_read_reqs,
large_write_reqs,
small_read_servicetime,
small_write_servicetime,
small_sync_read_latency,
large_read_servicetime,
large_write_servicetime,
retries_on_error)
SELECT :dbid,
:srcdbid con_dbid,
:snap_id,
:instance_number,
filetype_id,
SUM (small_read_megabytes) small_read_megabytes,
SUM (small_write_megabytes) small_write_megabytes,
SUM (large_read_megabytes) large_read_megabytes,
SUM (large_write_megabytes) large_write_megabytes,
SUM (small_read_reqs) small_read_reqs,
SUM (small_write_reqs) small_write_reqs,
SUM (small_sync_read_reqs) small_sync_read_reqs,
SUM (large_read_reqs) large_read_reqs,
SUM (large_write_reqs) large_write_reqs,
SUM (small_read_servicetime) small_read_servicetime,
SUM (small_write_servicetime) small_write_servicetime,
SUM (small_sync_read_latency) small_sync_read_latency,
SUM (large_read_servicetime) large_read_servicetime,
SUM (large_write_servicetime) large_write_servicetime,
SUM (retries_on_error) retries_on_error
FROM v$iostat_file
GROUP BY filetype_id;
In AWR Report, both V$IOSTAT_FILE and V$FILESTAT data are visible in different places.
For example, in Section: "IOStat by Filetype summary", the last 2 Columns are named as "Small Read" and "Large Read",
which are probably from V$IOSTAT_FILE.
But in Section "Tablespace IO Stats" and "File IO Stats", the Column names are prefixed by "1-bk Rd", which are from from V$FILESTAT. Therefore, potential stats inconsistence can appear even in the same AWR report.
6. DB File Read Stats Tests
Run test (see appended Test Code):
alter session set timed_statistics = true;
alter session set statistics_level=all;
truncate table read_stats;
exec db_file_read_test('single', 1, 333);
exec db_file_read_test('scattered', 1, 333);
exec db_file_read_test('parallel', 1, 333);
exec db_file_read_test('full', 1, 333);
and than watch statistics in both views (AIX and Linux are added for comparison. Copy HTML Table to Excel for easy reading):
select test_name,
phyrds, phyblkrd, singleblkrds, singleblkrdtim,
ceil(small_read_megabytes*1024*1024/8192) small_read_blks, -- converted to Block for comparison
small_read_reqs, small_read_servicetime, small_sync_read_reqs, small_sync_read_latency,
ceil(large_read_megabytes*1024*1024/8192) large_read_blks, -- converted to Block for comparison
large_read_reqs, large_read_servicetime
from read_stats_delta_v where phyrds > 0 order by test_name desc, test_timestamp;
OS
test_name
phyrds
phyblkrd
singleblkrds
singleblkrdtim
small_read_blks
small_read_reqs
small_read_servicetime
small_sync_read_reqs
small_sync_read_latency
large_read_blks
large_read_reqs
large_read_servicetime
Solaris
single
333
641
289
10
640
333
0
289
0
0
0
0
Solaris
scattered
333
2664
0
0
2560
333
0
0
0
0
0
0
Solaris
parallel
258
344
254
0
384
257
0
508
0
0
1
0
Solaris
full
24
341
1
0
0
17
0
1
0
256
7
0
AIX
single
333
333
333
150
384
333
37
333
37
0
0
0
AIX
scattered
333
333
333
1460
384
333
1310
333
1310
0
0
0
AIX
parallel
260
335
259
110
256
259
17
259
17
0
1
1
AIX
full
24
341
1
20
128
17
41
1
12
256
7
26
Linux
single
333
641
289
0
640
333
0
289
0
0
0
0
Linux
scattered
333
2664
0
0
2688
333
0
0
0
0
0
0
Linux
parallel
258
344
254
0
256
257
0
508
0
0
1
0
Linux
full
24
341
1
0
128
17
0
1
0
256
7
0
7. DB File Read Stats Test Discussion
The first 4 stats Columns (green) are from V$FILESTAT, the rest from V$IOSTAT_FILE. Here some observations:
(a). V$FILESTAT.phyrds matches TOTAL_READ_CNT, equal to V$IOSTAT_FILE.(small_read_reqs + large_read_reqs). V$FILESTAT.phyblkrd matches TOTAL_READ_Blocks, approximate to V$IOSTAT_FILE.(small_read_blks + large_read_blks). V$FILESTAT.singleblkrds matches V$IOSTAT_FILE.small_sync_read_reqs except "parallel". V$FILESTAT.singleblkrds are 1 DB Block(8192 Bytes) Read (' db file sequential read' or 'db file parallel read'). It is not clear why small_sync_read_reqs (508) in "parallel" is much higher than dtrace TOTAL_READ_Blocks (344). V$IOSTAT_FILE.small_read_reqs includes small_sync_read_reqs execept "parallel". V$IOSTAT_FILE.small_sync_read_reqs matches 'db file sequential read' except "parallel". All time metrics are (almost) zero: singleblkrdtim, small_read_servicetime, small_sync_read_latency, large_read_servicetime. Time in V$FILESTAT is in centiseconds, V$IOSTAT_FILE in milliseconds. Both could have rounding errors. AIX seems able to record the time metrics, probably because they are bigger than the minimum time granule. (b). Only 'scattered' Test: exec db_file_read_test('scattered', 1, 333); shows much higher small_read_blks: 2560 Blocks by small_read_reqs: 333 Requests (2560/333 approximately 8 Blocks per Request), marked as "db file scattered read" and accomplished by "readv". However, in V$IOSTAT_FILE, they are entirely classified as SMALL_READ_MEGABYTES (small_read_blks) and SMALL_READ_REQS. But Oracle 12c V$IOSTAT_FILE Docu said that SMALL_READ_* are all about "single block read" even though dtrace evidenced them as multi block read (8 Blocks per readv). Logically it is controversial if small_read_blks is bigger than small_read_reqs because each single block read request can read only one single block. (c). 'db file parallel read' in "parallel" test seems running asynchronously, and hard to collect precise statistics. (d). V$IOSTAT_FILE has one pair of stats about single block read -. small_read_reqs / small_read_servicetime -. small_sync_read_reqs / small_sync_read_latency (e). Oracle Time Accounting also evolves with time. MOS: I/O Wait Event Accounting Change in the RDBMS 12.1.0.2.0 Patch Set Release (Doc ID 1967410.1) said: Starting with the RDBMS 12.1.0.2.0 Patch Set Release, the accounting for certain I/O-related wait events has been updated to be more accurate than it was before. As a result, these wait events may show an increase of wait times of around 10%. The wait events that are affected include, but are not limited to, the following: db file sequential read db file scattered read db file parallel read log file parallel write
8. Test Code
drop tablespace ksun_ts including contents and datafiles;
create tablespace ksun_ts datafile '/oratestdb/oradata/testdb/ksun_ts.dbf' size 200m online;
drop table ksun_tab;
-- DB_BLOCK_SIZE = 8192, each row occupies one BLOCK
create table ksun_tab tablespace ksun_ts as
select level x, rpad('ABC', 3500, 'X') y, rpad('ABC', 3500, 'X') z from dual connect by level <= 1e4;
select round(bytes/1024/1024) mb, blocks from dba_segments where segment_name = 'KSUN_TAB';
--80 10240
create index ksun_tab#i1 on ksun_tab(x) tablespace ksun_ts;
exec dbms_stats.gather_table_stats(null, 'KSUN_TAB', cascade=>true);
drop tablespace ksun_ts_aux including contents and datafiles;
create tablespace ksun_ts_aux datafile '/oratestdb/oradata/testdb/ksun_ts_aux.dbf' size 200m online;
drop table ksun_rid;
create table ksun_rid tablespace ksun_ts_aux as select x, rowid rid from ksun_tab;
create index ksun_rid#i1 on ksun_rid(x) tablespace ksun_ts_aux;
exec dbms_stats.gather_table_stats(null, 'KSUN_RID', cascade=>true);
create or replace view read_stats_v as
select to_char(localtimestamp, 'yyyy-mm-dd hh24:mi:ss') test_timestamp
,phyrds, phyblkrd, singleblkrds, 10*singleblkrdtim singleblkrdtim
,small_read_megabytes, small_read_reqs, small_read_servicetime, small_sync_read_reqs, small_sync_read_latency
,large_read_megabytes, large_read_reqs, large_read_servicetime
,f.name
from v$filestat v8, v$iostat_file v11, v$datafile f
where v8.file# = f.file#
and v11.file_no = f.file#
and f.name like '%ksun_ts.dbf';
drop table read_stats;
create table read_stats as select 'setall_seq_readxx' test_name, v.* from read_stats_v v where 1=2;
create or replace view read_stats_delta_v as
select test_name, test_timestamp
,phyrds - lag(phyrds) over(partition by test_name order by test_timestamp) phyrds
,phyblkrd - lag(phyblkrd) over(partition by test_name order by test_timestamp) phyblkrd
,singleblkrds - lag(singleblkrds) over(partition by test_name order by test_timestamp) singleblkrds
,singleblkrdtim - lag(singleblkrdtim) over(partition by test_name order by test_timestamp) singleblkrdtim
,small_read_megabytes - lag(small_read_megabytes) over(partition by test_name order by test_timestamp) small_read_megabytes
,small_read_reqs - lag(small_read_reqs) over(partition by test_name order by test_timestamp) small_read_reqs
,small_read_servicetime - lag(small_read_servicetime) over(partition by test_name order by test_timestamp) small_read_servicetime
,small_sync_read_reqs - lag(small_sync_read_reqs) over(partition by test_name order by test_timestamp) small_sync_read_reqs
,small_sync_read_latency - lag(small_sync_read_latency) over(partition by test_name order by test_timestamp) small_sync_read_latency
,large_read_megabytes - lag(large_read_megabytes) over(partition by test_name order by test_timestamp) large_read_megabytes
,large_read_reqs - lag(large_read_reqs) over(partition by test_name order by test_timestamp) large_read_reqs
,large_read_servicetime - lag(large_read_servicetime) over(partition by test_name order by test_timestamp) large_read_servicetime
from read_stats s;
create or replace procedure db_file_read_test (p_test_name varchar2, p_loops number, p_rows number) as
l_max_y varchar2(3500);
type tab_rowid is table of rowid index by pls_integer;
l_rowid_cache tab_rowid;
begin
case
when p_test_name = 'single' then
select rid bulk collect into l_rowid_cache from ksun_rid where x between 1 and p_rows;
when p_test_name = 'scattered' then
select rid bulk collect into l_rowid_cache from ksun_rid where mod(x, 10) = 0 and rownum <= p_rows;
else null;
end case;
dbms_output.put_line('Number of Rows to read = '|| l_rowid_cache.count);
insert into read_stats select p_test_name test_name, v.* from read_stats_v v;
for i in 1..p_loops loop
execute immediate 'alter system flush buffer_cache';
dbms_lock.sleep(1);
case
when p_test_name = 'single' then
for r in 1..l_rowid_cache.count loop
-- adjacent rowid, single block read, 'db file sequential read'
select /*+ single_read */ y into l_max_y from ksun_tab t where rowid = l_rowid_cache(r);
end loop;
when p_test_name = 'scattered' then
for r in 1..l_rowid_cache.count loop
-- jumped rowid, scattered read, 'db file scattered read'
select /*+ scattered_read */ y into l_max_y from ksun_tab t where rowid = l_rowid_cache(r);
end loop;
when p_test_name = 'parallel' then
-- table access by index rowid batched 'db file parallel read'
select /*+ index(t ksun_tab#i1) parallel_read */ max(y) into l_max_y from ksun_tab t where x between 1 and p_rows;
when p_test_name = 'full' then
-- table access by FULL
select /*+ full_read */ max(y) into l_max_y from ksun_tab t where rownum <= p_rows;
end case;
dbms_lock.sleep(1);
insert into read_stats select p_test_name test_name, v.* from read_stats_v v;
end loop;
commit;
end;
/
9. Dtrace Script
pfiles 11352
260: /oratestdb/oradata/testdb/ksun_ts.dbf
--Dtrace Script
#!/usr/sbin/dtrace -Zs
/*
* read_dtrace.d pid fd
* chmod u+x read_dtrace.d
* sudo ./read_dtrace.d 11352 260
*/
BEGIN / $1 > 0 && $2 > 0 /
{TOTAL_SIZE = 0; TOTAL_READ_CNT = 0; }
syscall::pread:entry / pid == $1 && arg0 == $2 /
{self->pread_fd = arg0; self->pread_t = timestamp;}
syscall::pread:return/ pid == $1 && self->pread_fd == $2 /
{@CNT[probefunc, self->pread_fd, arg1] = count();
@MAXB[probefunc, self->pread_fd] = max(arg1/8192);
@ETIME[probefunc, self->pread_fd] = quantize(timestamp- self->pread_t);
TOTAL_SIZE = TOTAL_SIZE + arg1; TOTAL_READ_CNT = TOTAL_READ_CNT + 1;
self->pread_fd = 0;}
syscall::readv:entry / pid == $1 && arg0 == $2 /
{self->readv_fd = arg0; self->readv_t = timestamp; }
syscall::readv:return/ pid == $1 && self->readv_fd == $2 /
{@CNT[probefunc, self->readv_fd, arg1] = count();
@MAXB[probefunc, self->readv_fd] = max(arg1/8192);
@ETIME[probefunc, self->readv_fd] = quantize(timestamp- self->readv_t);
TOTAL_SIZE = TOTAL_SIZE + arg1; TOTAL_READ_CNT = TOTAL_READ_CNT + 1;
self->readv_fd = 0;}
syscall::kaio:entry / pid == $1 && arg1 == $2 /
{self->kaio = arg1;}
syscall::kaio:return / pid == $1 && self->kaio == $2 /
{@CNT[probefunc, self->kaio, arg1] = count(); self->kaio = 0;}
syscall::lseek:entry / pid == $1 && arg0 == $2/
{@CNT[probefunc, arg0, 0] = count(); }
END / $1 > 0 && $2 > 0 /
{printf("\n%11s %6s %12s %9s \n", "PROBEFUNC", "FD", "RETURN_SIZE", "COUNT");
printa(" %-10s %6d %12d %9@d\n", @CNT);
printf("\n%11s %6s %16s \n", "PROBEFUNC", "FD", "MAX_READ_Blocks");
printa(" %-10s %6d %16@d\n", @MAXB);
printf("\nTOTAL_SIZE = %-10d, TOTAL_READ_Blocks = %-6d, TOTAL_READ_CNT = %-6d\n",
TOTAL_SIZE, TOTAL_SIZE/8192, TOTAL_READ_CNT);
printa("\n%-10s %6d %16@d\n", @ETIME);}