Monday, September 18, 2017

Result Cache: RC Latch Contention and PL/SQL Exception Handler (III)

If we have fixed all the causes of Result Cache: RC Latch Contention discussed in previous two Blogs, the system still suffers from such Contention. There could exist the cases, which we have not yet covered.

In this Blog, we will discuss the third case: "Result Cache: RC Latch Contention and PL/SQL Exception Handler". So in total we have found 3 Cases:
1. PL/SQL Function Result Cache Invalidation (I) (DML and Select for Update)
2. Result Cache: RC Latch Contention and RESULT_CACHE_MAX_SIZE (II))
3. Result Cache: RC Latch Contention and PL/SQL Exception Handler (III)
Note: All tests are done in Oracle 11.2.0.4, 12.1.0.2, 12.2.0.1 on AIX, Solaris, Linux with 6 physical processors.

Update (2017-Sep-19): One day after this Blog appeared, it turns out that there exists an on-going bug fix, which probably addresses Case (II) and (III).
    Bug 23738186 : HIGH RESULT CACHE LATCHES AFTER MIGRATION TO 12.1.0.2
This fix addresses the Result Cache (RC) latch contention when the RC is in the LRU regime.
The following changes are part of this fix:

1. Bypass Object (BO)
    Currently, when a RC function throws an exception or a RC SQL exceeds memory
    limit, the under-construction Result Object (RO) is marked invalid and released.
    The customer's application has some RC functions that have a high rate of
    exceptions, which leads to high rate of invalid RO creation, which in turn leads
    to noticeable x-latch contention. To reduce these wasteful RO creations, we will
    instead store a BO when the function+parameters throws an exception. A BO is a
    regular RO that is created, on client directive, by the RC, but unlike regular
    RO's a BO can never be read. A BO uses exactly one block that contains the RO
    header but no data. A client trying to read a BO always gets a (non-storable)
    miss which forces the client to handle the miss by executing the function.
    
    Note, unlike regular misses, a non-storable miss does not allow the client to
      create a new RO. We have introduced two new statistics in v$result_cache_statistics
      to track the creation and use of BO's (for an example see tkqgcbug.sql/log).
 
2. Group release of invalid objects
    Release all invalid objects together (instead of one at a time).
 
3. Improved LRU list processing
    Instead of scanning all object on the list looking for the min lru#, stop
    the scan when the first un-scanned object (i.e. scan_count is 0) is reached.


1. Test Code


Out test table (TEST_TAB) contains 4 rows (1-4). Any select not in (1, 2, 3, 4) will get NO_DATA_FOUND exception.

We create 4 result_cache functions for 4 different scenarios:
  1. rc_c1: exception handler and return 
  2. rc_c2: exception handler and no return
  3. rc_c3: exception handler, raise again
  4. rc_c4: no exception handler

--------------------------------- Test Setup ---------------------------------
drop table test_tab;
drop table rc_latch_hist;
drop table rc_stats_hist;

create table test_tab as select level x from dual connect by level <= 4;

create table rc_latch_hist as 
select 'aaaaaaaaaaaaaaaaaaaa' snap_name, systimestamp ts, v.* from v$latch v where name = 'Result Cache: RC Latch'; 

create table rc_stats_hist as 
select 'aaaaaaaaaaaaaaaaaaaa' snap_name, systimestamp ts, v.* from v$result_cache_statistics v;

-- exception handler: return
create or replace function rc_c1(p_x number) return number result_cache as
  l_ret number;
begin
  select x into l_ret from test_tab where x = p_x;
  return l_ret;
  exception when others then return null;   --NO_DATA_FOUND
end;
/

-- exception handler: no return
create or replace function rc_c2(p_x number) return number result_cache as
  l_ret number;
begin
  select x into l_ret from test_tab where x = p_x;
  return l_ret;
  exception when others then null;
end;
/

-- exception handler: raise again
create or replace function rc_c3(p_x number) return number result_cache as
  l_ret number;
  my_excep exception;
  pragma exception_init(my_excep, -20001);
begin
  select x into l_ret from test_tab where x = p_x;
  return l_ret;
  exception when others then raise_application_error(-20001, 'My Private Exception'); 
end;
/

-- no exception handler
create or replace function rc_c4(p_x number) return number result_cache as
  l_ret number;
begin
  select x into l_ret from test_tab where x = p_x;
  return l_ret;
end;
/

create or replace procedure rc_proc_case(p_case number, p_rc_limit number) as
  l_sql_bool_val boolean := true;
  l_ret varchar2(10);
begin
 dbms_application_info.set_module('RC Test', 'RC_CASE_'||p_case);
  for x in 1..p_rc_limit loop
   begin
    case p_case 
      when 1 then l_ret := rc_c1(x);
      when 2 then l_ret := rc_c2(x);
      when 3 then l_ret := rc_c3(x);
      when 4 then l_ret := rc_c4(x);
    end case;
    exception when others then null;
    end;
  end loop;
end;
/

create or replace procedure rc_case_job(p_case number, p_job_cnt number, p_rc_limit number) as
  l_job_id pls_integer;
begin
  for i in 1.. p_job_cnt loop
    dbms_job.submit(l_job_id, 'begin while true loop rc_proc_case('||p_case||', '||p_rc_limit||'); end loop; end;');
  end loop;
  commit;
end;    
/

create or replace procedure clean_jobs as
begin
  for c in (select job from dba_jobs) loop
    begin
       dbms_job.remove (c.job);
    exception when others then null;
    end;
    commit;
  end loop;

  for c in (select d.job, d.sid, (select serial# from v$session where sid = d.sid) ser 
              from dba_jobs_running d) loop
    begin
      execute immediate
             'alter system kill session '''|| c.sid|| ',' || c.ser|| ''' immediate';
      dbms_job.remove (c.job);
    exception when others then null;
    end;
    commit;
  end loop;
  
  -- select * from dba_jobs;
  -- select * from dba_jobs_running;
end;
/

create or replace procedure rc_record_rc_latch(p_snap_name varchar2) as
begin
  insert into rc_latch_hist
  select p_snap_name snap_name, systimestamp ts, v.* from v$latch v where name = 'Result Cache: RC Latch';
  insert into rc_stats_hist
  select p_snap_name snap_name, systimestamp ts, v.* from v$result_cache_statistics v;
  commit; 
end;    
/

create or replace procedure rc_case_test(p_sleep number) as
begin
  clean_jobs;
  rc_record_rc_latch('Start');             
  rc_case_job(p_case=> 1, p_job_cnt=> 2, p_rc_limit=> 4); dbms_lock.sleep(p_sleep);
  rc_record_rc_latch('No_Error');          
  rc_case_job(p_case=> 1, p_job_cnt=> 1, p_rc_limit=> 5); dbms_lock.sleep(p_sleep);
  rc_record_rc_latch('Except_Return') ;     
  rc_case_job(p_case=> 2, p_job_cnt=> 1, p_rc_limit=> 5); dbms_lock.sleep(p_sleep);
  rc_record_rc_latch('Except_No_Return');  
  rc_case_job(p_case=> 3, p_job_cnt=> 1, p_rc_limit=> 5); dbms_lock.sleep(p_sleep);
  rc_record_rc_latch('Except_Raise');      
  rc_case_job(p_case=> 4, p_job_cnt=> 1, p_rc_limit=> 5); dbms_lock.sleep(p_sleep);
  rc_record_rc_latch('No_Hanler');
  clean_jobs;
end;    
/


2. Single Session Observation Test


Run following small test code:

column name format    a20
column rc_name format a48
column value format   a10
set numformat         999,999,999,999,999
column id format      999,999
alter session set nls_date_format ='dd-mon-yyyy hh24:mi:ss';
exec dbms_result_cache.flush;

select id, type, status, name rc_name, creation_timestamp, scn, lru_number from v$result_cache_objects v order by 3 desc, name, status;

var l_x number;
exec :l_x := rc_c4(1);
exec dbms_lock.sleep(1);
select id, type, status, name rc_name, creation_timestamp, scn, lru_number from v$result_cache_objects v order by 3 desc, name, status;

exec :l_x := rc_c4(5);
exec dbms_lock.sleep(1);
select id, type, status, name rc_name, creation_timestamp, scn, lru_number from v$result_cache_objects v order by 3 desc, name, status;
Output shows that there is "Invalid" Result when hitting NO_DATA_FOUND.

SQL > select id, type, status, name rc_name, creation_timestamp, scn, lru_number from v$result_cache_objects v order by 3 desc, name, status;
  no rows selected

SQL > exec :l_x := rc_c4(1);
SQL > select id, type, status, name rc_name, creation_timestamp, scn, lru_number from v$result_cache_objects v order by 3 desc, name, status;
   ID TYPE       STATUS    RC_NAME                                     CREATION_TIMESTAMP            SCN           LRU_NUMBER
  --- ---------- --------- ------------------------------------------- -------------------- -------------------- ---------------
    1 Result     Published "K"."RC_C4"::8."RC_C4"#3048d2af80817a01 #1  2017*sep*18 06:58:33    9,237,974,126,701   1,985,444,743
    0 Dependency Published K.RC_C4                                     2017*sep*18 06:58:33    9,237,974,126,701               0
    2 Dependency Published K.TEST_TAB                                  2017*sep*18 06:58:33    9,237,974,126,701               0

SQL > exec :l_x := rc_c4(5);
  ORA-01403: no data found
SQL > select id, type, status, name rc_name, creation_timestamp, scn, lru_number from v$result_cache_objects v order by 3 desc, name, status;
   ID TYPE       STATUS    RC_NAME                                     CREATION_TIMESTAMP            SCN           LRU_NUMBER
  --- ---------- --------- ------------------------------------------- -------------------- -------------------- ---------------
    1 Result     Published "K"."RC_C4"::8."RC_C4"#3048d2af80817a01 #1  2017*sep*18 06:58:33    9,237,974,126,701   1,985,444,743
    0 Dependency Published K.RC_C4                                     2017*sep*18 06:58:33    9,237,974,126,701               0
    2 Dependency Published K.TEST_TAB                                  2017*sep*18 06:58:33    9,237,974,126,701               0
    3 Result     Invalid   "K"."RC_C4"::8."RC_C4"#3048d2af80817a01 #1  2017*sep*18 06:58:34    9,237,974,126,701               0


3. Multiple Session Concurrency Test


On a UNIX machine with 6 physical processors, we start 6 Oracle Jobs one after another with an interval of about 60 seconds before each launch. At same time, we collect RC Latch and RC Stats between each launch.
1. First 2 Jobs (base load) select row 1-4, and there is no exception. 
   All other Jobs select row 1-5, and hence get exception when select row 5.
2. Job 3 has exception handler and returns null. 
3. Job 4 has exception handler, but does not have return. 
4. Job 5 has exception handler, but raises again.
5. Job 6 does not have exception handler.

truncate table rc_latch_hist;
truncate table rc_stats_hist;
exec dbms_result_cache.flush;
exec rc_case_test(60);
During the test, if we monitor RC Objects, we can see the status: "New" and "Invalid".

select type, status, name, count(*) from v$result_cache_objects 
group by type, status, name order by type desc, status, name;

TYPE       STATUS    NAME                                         COUNT(*)
---------- --------- ------------------------------------------ ----------
Result     Invalid   "K"."RC_C2"::8."RC_C2"#3048d2af80817a01 #1          7
Result     Invalid   "K"."RC_C3"::8."RC_C3"#3048d2af80817a01 #1          1
Result     New       "K"."RC_C3"::8."RC_C3"#3048d2af80817a01 #1          1
Result     New       "K"."RC_C4"::8."RC_C4"#3048d2af80817a01 #1          1
Result     Published "K"."RC_C1"::8."RC_C1"#3048d2af80817a01 #1          5
Result     Published "K"."RC_C2"::8."RC_C2"#3048d2af80817a01 #1          4
Result     Published "K"."RC_C3"::8."RC_C3"#3048d2af80817a01 #1          4
Result     Published "K"."RC_C4"::8."RC_C4"#3048d2af80817a01 #1          4
Dependency Published K.RC_C1                                             1
Dependency Published K.RC_C2                                             1
Dependency Published K.RC_C3                                             1
Dependency Published K.RC_C4                                             1
Dependency Published K.TEST_TAB                                          1
After the test, we look the collected stats, which shows the difference of RC Latch SLEEPS in 4 different scenarios.

The test result demonstrates that except the first scenario: "rc_c1: exception handler and return", all other 3 PL/SQL Result Cache implantations can cause RC Latch Contentions.

select v.*, round(wait_time/(nullif (sleeps, 0)), 2) wait_time_us_per_sleep
from (
    select snap_name, ts
          ,gets-lag(gets) over(order by ts) gets
          ,misses-lag(misses) over(order by ts) misses
          ,sleeps-lag(sleeps) over(order by ts) sleeps
          ,spin_gets-lag(spin_gets) over(order by ts) spin_gets
          ,wait_time-lag(wait_time) over(order by ts) wait_time
    from rc_latch_hist
) v order by ts;

SNAP_NAME         TS                                  GETS      MISSES     SLEEPS    SPIN_GETS    WAIT_TIME  WAIT_TIME_US_PER_SLEEP
----------------- --------------------------- ------------ ----------- ---------- ------------ ------------ -----------------------
Start             18-SEP-2017 07:59:55 +02:00                                                                
No_Error          18-SEP-2017 08:00:55 +02:00   61,623,726     790,876     0           790,876            0  
Except_Return     18-SEP-2017 08:01:55 +02:00  127,973,369   6,743,323     1         6,743,469            3                       3
Except_No_Return  18-SEP-2017 08:02:55 +02:00  135,558,711  14,221,514  1,277,086   12,948,140    7,762,707                       6
Except_Raise      18-SEP-2017 08:03:56 +02:00  123,932,503  16,996,212  2,304,628   14,752,639   14,765,338                       6
No_Hanler         18-SEP-2017 08:04:56 +02:00  112,230,114  18,895,634  3,322,022   15,724,684   22,069,024                       7

select name, snap_name, ts
      ,value-lag(value) over(partition by name order by ts) value
from rc_stats_hist
where name in ('Create Count Success', 'Delete Count Invalid')
order by name, ts;

NAME                    SNAP_NAME            TS                             VALUE
----------------------- -------------------- ------------------------------ ----------
Create Count Success    Start                18-SEP-2017 07:59:55 +02:00
Create Count Success    No_Error             18-SEP-2017 08:00:55 +02:00        4
Create Count Success    Except_Return        18-SEP-2017 08:01:55 +02:00        1
Create Count Success    Except_No_Return     18-SEP-2017 08:02:55 +02:00       747,926
Create Count Success    Except_Raise         18-SEP-2017 08:03:56 +02:00     1,410,675
Create Count Success    No_Hanler            18-SEP-2017 08:04:56 +02:00     2,025,890
Delete Count Invalid    Start                18-SEP-2017 07:59:55 +02:00
Delete Count Invalid    No_Error             18-SEP-2017 08:00:55 +02:00        0
Delete Count Invalid    Except_Return        18-SEP-2017 08:01:55 +02:00        0
Delete Count Invalid    Except_No_Return     18-SEP-2017 08:02:55 +02:00       747,902
Delete Count Invalid    Except_Raise         18-SEP-2017 08:03:56 +02:00     1,410,676
Delete Count Invalid    No_Hanler            18-SEP-2017 08:04:56 +02:00     2,025,891


4. SQL Result Cache Difference


In the above test, we use PL/SQL Result Cache. if we replace all of them by SQL Result Cache, for example:

create or replace function rc_c4(p_x number) return number as
  l_ret number;
begin
  select /*+ result_cache */ x into l_ret from test_tab where x = p_x;   
  return l_ret;
end;
/
make a similar test:

exec clean_jobs;
exec dbms_result_cache.flush;
exec rc_case_job(p_case=> 4, p_job_cnt=> 4, p_rc_limit=> 5);
and monitor RC Objects by:

select type, status, name, count(*) from v$result_cache_objects 
group by type, status, name order by type desc, status, name;

TYPE       STATUS    NAME                                                        COUNT(*)
---------- --------- ---------------------------------------------------------- ---------
Result     Published SELECT /*+ result_cache */ X FROM TEST_TAB WHERE X = :B1           5
Dependency Published K.TEST_TAB                                                         1
There does exist any RC status "New" or "Invalid". We can hardly observe RC Latch Contentions because there are no RC Latch SLEEPS. Therefore SQL Result Cache and PL/SQL Result Cache are probably NOT equivalent.

It is not clear how Oracle internally handles differently SQL Result Cache and PL/SQL Result Cache. But it could affect real applications when using them differently.

Monday, September 11, 2017

Result Cache: RC Latch Contention and RESULT_CACHE_MAX_SIZE (II)

Result Cache memory is limited by RESULT_CACHE_MAX_SIZE. Its default value is 1% of SHARED_POOL_SIZE. When allocated space reaches RESULT_CACHE_MAX_SIZE, any new request can cause contention of "Result Cache: RC Latch". RC Latch "ksl_get_shared_latch" Gets jumps from an order of magnitude Microseconds (us) to Millisecond (ms). The single RC Latch could eventually serialize the system ("Result Cache: Serialization12").

Each RC row takes at least one RC BLOCK (1K). In case of caching SQL Collection, it can occupy more than 1 BLOCK.

In the previous Blog, we discussed the RC Latch Contention caused by RC invalidation triggered by DML statements (Blog: PL/SQL Function Result Cache Invalidation)

Note: all tests are done in Oracle 11.2.0.4, 12.1.0.2, 12.2.0.1 on AIX, Solaris, Linux.

Update (2017-Sep-18): Oracle MOS: SQL Query Result Cache. Includes: [Video] (Doc ID 1108133.1)
RESULT_CACHE_MAX_SIZE defines the memory allocated to the result cache. When set to 0, the Result cache is disabled. The Default value is either 0.25% of MEMORY_TARGET or 0.5% of SGA_TARGET or 1% of SHARED_POOL_SIZE and is chosen by the database based on total memory available to the SGA and on the memory management method currently in use.


1. Test_1: Under Memory Limit


Test DB runs on a UNIX machine with 6 physical processors, and configured with:
 
  shared_pool_size         1408M
  
  result_cache_max_size    14432K
  result_cache_max_result  5
  result_cache_mode        MANUAL
When not specially set, result_cache_max_size is derived from shared_pool_size, which is about 1% of shared_pool_size in default.

Each RC row requires at least one RC BLOCK (1K), and with result_cache_max_size = 14432K, we can have maximum about 14432 RC Rows (ignoring some overhead).

Run test with 4 parallel sessions to create 13000 RC Rows (see appended Test Code):
 
exec clean_jobs;
exec dbms_result_cache.flush;
exec dbms_result_cache.memory_report(TRUE);
exec rc_fun_jobs(4, 13000);
Run query below, we can hardly observe RC Latch Sleeps increasing:
 
select latch#, name, gets, misses, spin_gets, sleeps, wait_time 
from v$latch where name = 'Result Cache: RC Latch'; 
-- latch# in (436, 559, 665) for Oracle 11.2.0.4/12.1.0.2, 12.2.0.1
Latch Misses is comprised of Spin_gets and Sleeps (see Blog: Is latch misses statistic gathered or deduced ? ), and the long wait is exposed by Sleeps.

In AWR report - "Top 10 Foreground Events by Total Wait Time" Section, it is presented as column "Wait Avg(ms)". In case of RC Latch, we can see it in the line "latch free". AWR "Latch Sleep Breakdown" Section contains more details.

Crosscheck RC memory usage by:
 
SQL > exec dbms_result_cache.memory_report(TRUE);
R e s u l t   C a c h e   M e m o r y   R e p o r t
[Parameters]
Block Size          = 1K bytes
Maximum Cache Size  = 14432K bytes (14432 blocks)
Maximum Result Size = 721K bytes (721 blocks)
[Memory]
Total Memory = 13540016 bytes [0.917% of the Shared Pool]

... Fixed Memory = 46272 bytes [0.003% of the Shared Pool]
....... Memory Mgr = 208 bytes
....... Cache Mgr  = 256 bytes
....... Bloom Fltr = 2K bytes

....... State Objs = 43760 bytes
... Dynamic Memory = 13493744 bytes [0.914% of the Shared Pool]
....... Overhead = 157168 bytes
........... Hash Table    = 64K bytes (4K buckets)
........... Chunk Ptrs    = 24K bytes (3K slots)
........... Chunk Maps    = 12K bytes
........... Miscellaneous = 157168 bytes
....... Cache Memory = 13024K bytes (13024 blocks)
........... Unused Memory = 23 blocks
........... Used Memory = 13001 blocks
............... Dependencies = 1 blocks (1 count)
............... Results = 13000 blocks
................... PLSQL   = 13000 blocks (13000 count)


SQL > select id, name, value from v$result_cache_statistics order by name;

   ID NAME                           VALUE
----- ------------------------------ ----------
    3 Block Count Current            13024
    2 Block Count Maximum            14432
    1 Block Size (Bytes)             1024
    6 Create Count Failure           0
    5 Create Count Success           13321
    9 Delete Count Invalid           321
   10 Delete Count Valid             0
   12 Find Copy Count                34959999
    7 Find Count                     34370958
   11 Hash Chain Length              1-5
    8 Invalidation Count             0
   13 Latch (Share)                  0
    4 Result Size Maximum (Blocks)   721


SQL > select name, bytes from v$sgastat where name in ('Result Cache');

NAME               BYTES
------------- ----------
Result Cache    13493744


SQL > column name     format a50
SQL > column cache_id format a30
SQL > select name, cache_id, type, status, namespace, sum(block_count) blocks, sum(ceil(row_count*row_size_avg/1024)) est_blocks
      from v$result_cache_objects
      group by name, cache_id, type, status, namespace;

NAME                                               CACHE_ID                       TYPE       STATUS    NAMESPACE      BLOCKS EST_BLOCKS
-------------------------------------------------- ------------------------------ ---------- --------- ---------- ---------- ----------
K.RC_FUN                                           K.RC_FUN                       Dependency Published                     1          0
"K"."RC_FUN"::8."RC_FUN"#8cd0e193d8974f19 #1       cyma7sjgpkr5zbtbp804tt27r7     Result     Published PLSQL           13000      13000
RC memory is under its maximum Limit.

"Block Count Current: 13024" < "Block Count Maximum: 14432", and "Delete Count Invalid: 0".

Measuring latch Get time by dtrace:
 
sudo dtrace -n \
'BEGIN {self->start_wts = walltimestamp; self->start_ts = timestamp;}
pid$target::qesrcCM_Acquire:entry /execname == "oracle"/ { self->rc = 1; }
pid$target::ksl_get_shared_latch:entry /execname == "oracle" && self->rc == 1/ { self->ts = timestamp; }
pid$target::ksl_get_shared_latch:return /self->ts > 0/ {
        @lquant["ns"] = lquantize(timestamp - self->ts, 0, 10000, 1000);
        @avgs["AVG_ns"] = avg(timestamp - self->ts);
        @mins["MIN_ns"] = min(timestamp - self->ts);
        @maxs["MAX_ns"] = max(timestamp - self->ts);
        @sums["SUM_ms"] = sum((timestamp - self->ts)/1000000);
        @counts[ustack(10, 0)] = count();
        self->rc = 0; self->ts = 0;}
END { printf("Start: %Y, End: %Y, Elapsed_ms: %d\n", self->start_wts, walltimestamp, (timestamp - self->start_ts)/1000000);}
' -p 13945


:BEGIN dtrace started: 2017 Sep 11 07:11:57
:END dtrace ended: 2017 Sep 11 07:11:59 ,Elapsed_ms: 1978
  ns
           value  ------------- Distribution ------------- count
             < 0 |                                         0
               0 |@@@@@@@@@@@@@@                           102706
            1000 |@@@@@@@@@@@@@@@@@@@@@@@@@@               186749
            2000 |                                         2193
            3000 |                                         54
            4000 |                                         49
            5000 |                                         42
            6000 |                                         28
            7000 |                                         16
            8000 |                                         9
            9000 |                                         10
        >= 10000 |                                         107

  AVG_ns     1167
  MIN_ns      827
  MAX_ns   169083
  SUM_ms        0

           oracle`ksl_get_shared_latch+0x1b8
           oracle`qesrcCM_Acquire+0x354
           oracle`psdfrcLookup+0x12d
           oracle`pevm_ENTERX+0x143
           oracle`pfrinstr_ENTERX+0x47
           oracle`pfrrun_no_tool+0x12a
           oracle`pfrrun+0x4c0
           oracle`plsql_run+0x288
           oracle`peicnt+0x946
           oracle`kkxexe+0x2f3
  291963
In average, each Get takes 1,167 Nanoseconds:


2. Test_2: Over Memory Limit


Stop above tests, re-run the test to create 15000 RC Rows, make the same checks:
 
exec clean_jobs;
exec dbms_result_cache.flush;
exec dbms_result_cache.memory_report(TRUE);
exec rc_fun_jobs(4, 15000);

SQL > exec dbms_result_cache.memory_report(TRUE);
R e s u l t   C a c h e   M e m o r y   R e p o r t
[Parameters]
Block Size          = 1K bytes
Maximum Cache Size  = 14432K bytes (14432 blocks)
Maximum Result Size = 721K bytes (721 blocks)
[Memory]
Total Memory = 14984624 bytes [1.015% of the Shared Pool]

... Fixed Memory = 46272 bytes [0.003% of the Shared Pool]
....... Memory Mgr = 208 bytes
....... Cache Mgr  = 256 bytes
....... Bloom Fltr = 2K bytes

....... State Objs = 43760 bytes
... Dynamic Memory = 14938352 bytes [1.012% of the Shared Pool]
....... Overhead = 159984 bytes
........... Hash Table    = 64K bytes (4K buckets)
........... Chunk Ptrs    = 24K bytes (3K slots)
........... Chunk Maps    = 12K bytes
........... Miscellaneous = 159984 bytes
....... Cache Memory = 14432K bytes (14432 blocks)
........... Unused Memory = 0 blocks
........... Used Memory = 14432 blocks
............... Dependencies = 1 blocks (1 count)
............... Results = 14431 blocks
................... PLSQL   = 14431 blocks (14431 count)


SQL > select id, name, value from v$result_cache_statistics order by name;

   ID NAME                           VALUE
----- ------------------------------ ----------
    3 Block Count Current            14432
    2 Block Count Maximum            14432
    1 Block Size (Bytes)             1024
    6 Create Count Failure           0
    5 Create Count Success           143128
    9 Delete Count Invalid           169
   10 Delete Count Valid             128528
   12 Find Copy Count                80928
    7 Find Count                     86758
   11 Hash Chain Length              1-5
    8 Invalidation Count             0
   13 Latch (Share)                  0
    4 Result Size Maximum (Blocks)   721


SQL > select name, bytes from v$sgastat where name in ('Result Cache');

NAME                   BYTES
----------------- ----------
Result Cache        14938352


SQL > select name, cache_id, type, status, namespace, sum(block_count) blocks, sum(ceil(row_count*row_size_avg/1024)) est_blocks
      from v$result_cache_objects
      group by name, cache_id, type, status, namespace;

NAME                                               CACHE_ID                       TYPE       STATUS    NAMESPACE      BLOCKS EST_BLOCKS
-------------------------------------------------- ------------------------------ ---------- --------- ---------- ---------- ----------
"K"."RC_FUN"::8."RC_FUN"#8cd0e193d8974f19 #1       cyma7sjgpkr5zbtbp804tt27r7     Result     New       PLSQL               1          0
K.RC_FUN                                           K.RC_FUN                       Dependency Published                     1          0
"K"."RC_FUN"::8."RC_FUN"#8cd0e193d8974f19 #1       cyma7sjgpkr5zbtbp804tt27r7     Result     Published PLSQL           14430      14430
dbms_result_cache.memory_report shows RC Memory reaches result_cache_max_size:
 
 Total Memory = 14984624 bytes [1.015% of the Shared Pool]
"Block Count Current: 14432" = "Block Count Maximum: 14432", and "Delete Count Invalid: 128528".

Output of query on v$result_cache_objects occasionally contains one line with Status "NEW" when repeatedly running.

Perform the latch Sleeps check:
 
select latch#, name, gets, misses, spin_gets, sleeps, wait_time 
from v$latch where name = 'Result Cache: RC Latch';

 LATCH# NAME                         GETS     MISSES  SPIN_GETS     SLEEPS  WAIT_TIME
------- ---------------------- ---------- ---------- ---------- ---------- ----------
    559 Result Cache: RC Latch  696395773 3217974991 2909396116 1023307692 7.9453E+11
It shows that Sleeps goes up steadily when each time run the query.

Again measuring latch Get time by dtrace:
 
Start: 2017 Sep 11 07:40:44, End: 2017 Sep 11 07:40:46, Elapsed_ms: 2333
  ns
           value  ------------- Distribution ------------- count
               0 |                                         0
            1000 |@                                        15
            2000 |                                         1
            3000 |                                         0
            4000 |                                         0
            5000 |                                         0
            6000 |                                         0
            7000 |                                         0
            8000 |                                         0
            9000 |                                         0
        >= 10000 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  747

  AVG_ns     3029147
  MIN_ns        1031
  MAX_ns    87959260
  SUM_ms        1951

            oracle`ksl_get_shared_latch+0x1b8
            oracle`qesrcCM_Acquire+0x354
            oracle`psdfrcLookup+0x12d
            oracle`pevm_ENTERX+0x143
            oracle`pfrinstr_ENTERX+0x47
            oracle`pfrrun_no_tool+0x12a
            oracle`pfrrun+0x4c0
            oracle`plsql_run+0x288
            oracle`peicnt+0x946
            oracle`kkxexe+0x2f3
  763
In average, each get takes 3,029,147 Nanoseconds (3 Microseconds).

RC Latch "ksl_get_shared_latch" Gets jumps from an order of magnitude Microseconds (us) to Millisecond (ms).

In real applications, we can also build a histogram on RC creation_timestamp to track RC usages. (see Blog: Latch free waits, Dynamic Statistics, and the Result Cache and MOS Doc ID 2002089.1: High "Latch Free" Waits with Contention on 'Result Cache: RC Latch' when RESULT_CACHE_MODE = MANUAL on Oracle 12c)
 
select trunc(creation_timestamp, 'MI') rc_create_ts
      ,name, cache_id, type, status, namespace
      ,count(*) rc_count, round(avg(scan_count)) avg_scan_cnt, sum(block_count) blocks, sum(ceil(row_count*row_size_avg/1024)) est_blocks 
from   v$result_cache_objects t
where  lru_number > 0
group by name, cache_id, type, status, namespace, trunc(creation_timestamp, 'MI')
order by 1, rc_count desc, avg_scan_cnt desc; 
Blog: PL/SQL Function Result Cache Invalidation contains 5 views in Section: Causal Analysis of Result Cache, and provides some code for Result Cache History Recording.


3. RC Latch Contention and CPU Usage


There exists one common understanding that Latch Contention is manifested in UNIX as high CPU Usage. In both above tests, we have a machine with 6 physical processors, and started 4 parallel sessions, so there is no CPU saturation.

When creating 13000 RC Rows, we observe our 4 processes at the TOP of CPU usage:
 
load averages:  3.98,  2.85,  2.09;                    up 148+17:47:10 07:55:59
73 processes: 68 sleeping, 5 on cpu
CPU states: 32.9% idle, 66.8% user,  0.3% kernel,  0.0% iowait,  0.0% swap
Memory: 24G phys mem, 9991M free mem, 30G total swap, 30G free swap

   PID USERNAME LWP PRI NICE  SIZE   RES STATE    TIME    CPU COMMAND
 16030 oracle     1   3    0 4712M 4632M cpu/5    4:50 16.75% oracle
 16032 oracle     1   1    0 4712M 4632M cpu/4    4:50 16.69% oracle
 16026 oracle     1   1    0 4712M 4632M cpu/2    4:50 16.45% oracle
 16028 oracle     1   1    0 4712M 4632M cpu/1    4:50 16.39% oracle
When creating 15000 RC Rows:
 
load averages:  1.16,  1.69,  1.84;                    up 148+17:54:36 08:03:25
73 processes: 71 sleeping, 2 on cpu
CPU states: 81.7% idle, 17.5% user,  0.8% kernel,  0.0% iowait,  0.0% swap
Memory: 24G phys mem, 9991M free mem, 30G total swap, 30G free swap

   PID USERNAME LWP PRI NICE  SIZE   RES STATE    TIME    CPU COMMAND
 16134 oracle     1   1    0 4712M 4632M cpu/2    1:35  6.52% oracle
 16132 oracle     1   1    0 4712M 4637M sleep    1:31  4.76% oracle
 16138 oracle     1   1    0 4712M 4632M sleep    1:39  3.42% oracle
 16136 oracle     1   1    0 4712M 4632M sleep    1:36  3.23% oracle
The differences are:
 
Creating 13000 RC Rows has:
  load averages:  3.98
  5 on cpu
  66.8% user
  
Creating 15000 RC Rows has:
  load averages:  1.16
  2 on cpu
  17.5% user
Probably Latch Spins is burning CPU; whereas Latch Sleeps yields CPU. Therefore when talking about Latch contentions, it would be necessary to distinguish between Spin_gets and Sleeps. As tested, usually Spin_gets of Latch Misses are caused by frequently concurrent access; whereas Sleeps of Latch Misses are triggered by Invalidations or DML Modifications.

On an AIX machine with 6 cores and SMT=4 (see Blog: IBM AIX POWER7 CPU Usage and Throughput), run test:
 
  exec rc_fun_jobs(4, 15000);
and list call stacks of all 4 processes for 4 running Oracle sessions (common code removed):
 
$ > procstack 52166700 57278636 58982430 62521360

52166700: ora_j004_aix3
  thread_wait(0xbb800000bb8) + 0x22c
  sskgpwwait(??, ??, ??, ??, ??) + 0x54
  skgpwwait(??, ??, ??, ??, ??) + 0x12c
  ksliwat(??, ??, ??) + 0x1f08
  kslwaitctx(??, ??) + 0xe0
  ksqcmi(??, ??, ??, ??, ??, ??, ??, ??) + 0x6b4c
  ksqgtlctx(??, ??, ??, ??, ??, ??, ??, ??) + 0x10bc
  ksqgelctx(??, ??, ??, ??, ??, ??, ??, ??) + 0x21c
  qesrcCM_Wait(??, ??, ??) + 0x3c4
  qesrcCM_Acquire(??, ??, ??, ??, ??, ??) + 0x474
  psdfrcLookup(??, ??, ??, ??, ??, ??) + 0x124
  pevm_ENTERX(??, ??, ??, ??) + 0x150
  pfrinstr_ENTERX(0xfffffffffff44a0, 0x7000100e5e4e618, 0x122053f58) + 0x60
  pfrrun_no_tool(??, ??, ??) + 0x7c

57278636: ora_j003_aix3
  sskgpwpost(??, ??, ??, ??) + 0x44
  sskgpwwait(??, ??, ??, ??, ??) + 0x54
  skgpwwait(??, ??, ??, ??, ??) + 0x12c
  kslgess(0x0, 0x1600000010, 0x200000002, 0x1, 0x14fe000014fe) + 0x9c8
  ksl_get_shared_latch(??, ??, ??, ??, ??, ??) + 0x460
  qesrcCM_Release(??, ??, ??) + 0x18c
  psdfrcWriteResult(??, ??, ??, ??) + 0x360
  pevm_RET(??, ??) + 0x380
  pfrinstr_RET(0xfffffffffff4490, 0x7000100e5e4e658, 0x122053f58) + 0x4c
  pfrrun_no_tool(??, ??, ??) + 0x7c

58982430: ora_j001_aix3
  thread_wait(0x10e0000012c) + 0x22c
  sskgpwwait(??, ??, ??, ??, ??) + 0x54
  skgpwwait(??, ??, ??, ??, ??) + 0x12c
  kslgess(0x8, 0x8e00000008, 0x200000002, 0x1, 0x14ee000014ee) + 0x9c8
  ksl_get_shared_latch(??, ??, ??, ??, ??, ??) + 0x460
  qesrcPin_Get(??, ??, ??, ??, ??, ??, ??, ??) + 0x218
  qesrcCM_Acquire(??, ??, ??, ??, ??, ??) + 0x368
  psdfrcLookup(??, ??, ??, ??, ??, ??) + 0x124
  pevm_ENTERX(??, ??, ??, ??) + 0x150
  pfrinstr_ENTERX(0xfffffffffff4490, 0x7000100e5e4e618, 0x122053f58) + 0x60

62521360: ora_j000_aix3
  qesrcCM_MakeRoom(??, ??, ??) + 0xec
  qesrcMM_Alloc(??, ??, ??) + 0x16c
  qesrcRO_New(??, ??, ??, ??) + 0xa9c
  qesrcPin_Get(??, ??, ??, ??, ??, ??, ??, ??) + 0x6f4
  qesrcCM_Acquire(??, ??, ??, ??, ??, ??) + 0x368
  psdfrcLookup(??, ??, ??, ??, ??, ??) + 0x124
  pevm_ENTERX(??, ??, ??, ??) + 0x150
  pfrinstr_ENTERX(0xfffffffffff4590, 0x7000100e5e4e618, 0x122053f58) + 0x60
  pfrrun_no_tool(??, ??, ??) + 0x7c
  pfrrun(??) + 0x136c
  plsql_run(??, ??, ??) + 0x310
  peicnt(??, ??) + 0x284
  kkxexe(??) + 0x2b4
  opiexe(??, ??, ??) + 0x4594
  opiodr(??, ??, ??, ??) + 0x3d4
  rpidrus(??) + 0x21c
  skgmstack(??, ??, ??, ??, ??) + 0xc0
  rpidru(??) + 0xa8
  rpiswu2(??, ??, ??, ??, ??, ??, ??, ??) + 0x5ac
  rpidrv(??, ??, ??, ??) + 0xe14
  rpiexe(??) + 0x80
  kkjex1e(??) + 0x1824
  kkjex1e_cdb(??, ??, ??, ??, ??) + 0x1d8
  kkjsexe() + 0x4f4
  kkjrdp() + 0x384
  opirip(??, ??, ??) + 0x408
  opidrv(??, ??, ??) + 0x484
  sou2o(??, ??, ??, ??) + 0x110
  opimai_real(??, ??) + 0x144
  ssthrdmain(0x0, 0x300000003, 0xfffffffffffebf8) + 0x1e4
  main(??, ??) + 0xf0
  __start() + 0x70
We can see that 4 processes are executing respectively:
 
  thread_wait(0xbb800000bb8)
  sskgpwpost(??, ??, ??, ??)
  thread_wait(0x10e0000012c)
  qesrcCM_MakeRoom(??, ??, ??) 
Two processes are waiting with thread_wait. AIX Document said:
  int thread_wait(timeout):  Specifies the maximum length of time, in milliseconds, to wait for a posting.
and two timeout values are:
 
  0x12c =  300
  0xbb8 = 3000
One speculation is that the first timeout is 300, that is, thread_wait(300). In case of later Sleep (not got RC Latch after first 300ms sleeping), timeout is multiplied by 10 (3000).

The thread_post subroutine posts the thread the occurrence of an event, hence waking up the sleeping threads.

The last process (Oracle session) is creating a new RC Row after holding Latch (qesrcPin_Get):
 
  qesrcCM_MakeRoom(??, ??, ??) 
  qesrcMM_Alloc(??, ??, ??)
  qesrcRO_New(??, ??, ??, ??) 
  qesrcPin_Get(??, ??, ??, ??, ??, ??, ??, ??)
  qesrcCM_Acquire(??, ??, ??, ??, ??, ??)


4. Big Rows


In above tests, each RC row has a size less than 1024 Bytes. When caching SQL Collections, we can make the row size more than 1K. Here two similar tests (see appended Collection Test Code) with each row taking about 2K (ROW_SIZE_AVG = 1099 Bytes, i.e 2 RC Blocks). one is under result_cache_max_size, another is beyond that.
 
exec clean_jobs;
exec dbms_result_cache.flush;
exec dbms_result_cache.memory_report(TRUE);
exec rc_tab_jobs(4, 6500, 100);

exec clean_jobs;
exec dbms_result_cache.flush;
exec dbms_result_cache.memory_report(TRUE);
exec rc_tab_jobs(4, 7500, 100);


5. Possible Solutions


Adapt (increase) result_cache_max_size to satisfy cache requests or reduce cache usage. It can be dynamically increased by:
 
alter system set result_cache_max_size = 30M;
We can also disable Result Cache by either dynamically setting result_cache_max_size = 0, or bypass RC:
 
alter system set result_cache_max_size = 0;

exec dbms_result_cache.bypass(true);
exec dbms_result_cache.bypass(false);
The difference is that "result_cache_max_size = 0" releases RC memory, but bypass instructs code bypassing RC, but still keeps the RC memory.

Oracle 12.2 introduced RC Black List, and dbms_result_cache is enhanced with 4 APIs:
 
  exec dbms_result_cache.Black_List_Add('cyma7sjgpkr5zbtbp804tt27r7', TRUE);     -- Add cache_id to black-list
  exec dbms_result_cache.Black_List_Remove('cyma7sjgpkr5zbtbp804tt27r7', TRUE);  -- Remove cache_id from black-list
  exec dbms_result_cache.Black_List_clear(TRUE);                                 -- Remove all cache_id's from black-list
  select * from table(dbms_result_cache.Black_List);                             -- List all contents of the black-list
They allow us to regulate RC usage in granularity of each single cache_id, instead of toggling whole RC Cache.


6. Test Code


 
create or replace function rc_fun(p_var_1 varchar2, p_var_2 varchar2) return pls_integer result_cache as
  l_ret pls_integer;
begin
  l_ret := length(p_var_1) + 1e3*length(p_var_2);
  return l_ret;
end;
/

create or replace procedure rc_proc(p_rc_limit number) as
  l_ret pls_integer;
begin
  for i in 1..p_rc_limit loop
    l_ret := rc_fun('p1_'||i, 'p2_'||(i + p_rc_limit));
  end loop;
end;
/

create or replace procedure rc_fun_jobs(p_job_cnt number, p_rc_limit number) as
  l_job_id pls_integer;
begin
  for i in 1.. p_job_cnt loop
    dbms_job.submit(l_job_id, 'begin while true loop rc_proc('||p_rc_limit||'); end loop; end;');
  end loop;
  commit;
end;    
/

create or replace procedure clean_jobs as
begin
  for c in (select job from dba_jobs) loop
    begin
       dbms_job.remove (c.job);
    exception when others then null;
    end;
    commit;
  end loop;

  for c in (select d.job, d.sid, (select serial# from v$session where sid = d.sid) ser 
              from dba_jobs_running d) loop
    begin
      execute immediate
             'alter system kill session '''|| c.sid|| ',' || c.ser|| ''' immediate';
      dbms_job.remove (c.job);
    exception when others then null;
    end;
    commit;
  end loop;
  
  -- select * from dba_jobs;
  -- select * from dba_jobs_running;
end;
/


7. Collection Test Code


 
create or replace type num_tab force is table of number(10)
/

create or replace function rc_fun_tab(p_id number, p_tab_cnt number) return num_tab result_cache as
  l_ret  num_tab := new num_tab();
begin
  for i in 1..p_tab_cnt loop
    l_ret.extend;
    l_ret(i) := p_id + i;
  end loop;
  return l_ret;
end;
/

create or replace procedure rc_proc_tab(p_rc_limit number, p_tab_cnt number) as
  l_ret num_tab := new num_tab();
begin
  for i in 1..p_rc_limit loop
    l_ret := rc_fun_tab(i, p_tab_cnt);
  end loop;
end;
/

create or replace procedure rc_tab_jobs(p_job_cnt number, p_rc_limit number, p_tab_cnt number) as
  l_job_id pls_integer;
begin
  for i in 1.. p_job_cnt loop
    dbms_job.submit(l_job_id, 'begin while true loop rc_proc_tab('||p_rc_limit||', '||p_tab_cnt||'); end loop; end;');
  end loop;
  commit;
end;    
/