In this Blog, we will show that #used_slaves counts not only the currently running jobs, but also abnormally terminated jobs, which include incident jobs and killed jobs.
Note: Tested and reproducible in Oracle 18.10, 19.6 and 19.7. Partially reproducible in Oracle 12.1. Not reproducible in Oracle 19.8, which changed the behaviour tested in this Blog.
1. Test Setup
We will create two types of repeating DBMS_SCHEDULER jobs. The first type is the abnormally terminating job, which crashes with ORA-600 [4156] (see Blog: ORA-600 [4156] SAVEPOINT and PL/SQL Exception Handling); the second is a normal job with a running duration of 60 seconds per execution.
drop table t purge;
create table t(id number, label varchar2(20));
insert into t(id, label) select level, 'label_'||level from dual connect by level <= 100;
commit;
create or replace procedure test_proc_crashed(p_i number) as
begin
savepoint sp;
update t set label = label where id = p_i;
execute immediate '
begin
raise_application_error(-20000, ''error-sp'');
exception
when others then
rollback to savepoint sp;
update t set label = label where id = '||p_i||';
raise;
end;';
end;
/
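As an optional sanity check, test_proc_crashed can also be called directly in a foreground session; per the referenced Blog, it should crash with ORA-600 [4156] (a sketch; Section 2 later does the same via dbms_scheduler.run_job with use_current_session):
-- expected to raise ORA-600 [4156] and create an incident
exec test_proc_crashed(1);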
create or replace procedure start_job_crashed(p_count number) as
begin
for i in 1..p_count loop
dbms_scheduler.create_job (
job_name => 'TEST_JOB_CRASHED_'||i,
job_type => 'PLSQL_BLOCK',
job_action => 'begin test_proc_crashed('||i||'); end;',
start_date => systimestamp,
repeat_interval => 'systimestamp',
auto_drop => true,
enabled => true);
end loop;
end;
/
create or replace procedure start_job_normal(p_count number) as
l_job_name varchar2(50);
begin
for i in 1..p_count loop
l_job_name := 'TEST_JOB_NORMAL_'||i;
dbms_scheduler.create_job (
job_name => l_job_name,
job_type => 'PLSQL_BLOCK',
job_action =>
'begin
dbms_lock.sleep(13);
dbms_scheduler.set_attribute(
name => '''||l_job_name||'''
,attribute => ''start_date''
,value => systimestamp);
dbms_lock.sleep(47);
end;',
start_date => systimestamp,
repeat_interval => 'systimestamp',
auto_drop => true,
enabled => true);
end loop;
end;
/
2. Test Start
First, we set job_queue_processes to 13 and at the same time trace the DBMS_SCHEDULER coordinator CJQ0 with event 27402 level 65535:
alter system set job_queue_processes=13 scope=memory;
alter system set max_dump_file_size = unlimited scope=memory;
-- DBMS_SCHEDULER coordinator CJQ0 (sid, s.serial#) = (7, 15601)
exec sys.dbms_system.set_ev(7, 15601, 27402, 65535, '');
Then we start two test_proc_crashed jobs and one normal job:
exec start_job_crashed(2);
exec start_job_normal(1);
During the test, we continuously monitor running jobs in dba_scheduler_running_jobs,
and the job state ("RUNNING") in dba_scheduler_jobs:
select job_name,session_id, slave_process_id, slave_os_process_id, elapsed_time, log_id
from dba_scheduler_running_jobs;
JOB_NAME SESSION_ID SLAVE_PROCESS_ID SLAVE_OS_PRO ELAPSED_TIME LOG_ID
------------------- ---------- ---------------- ------------ ---------------- -------
TEST_JOB_CRASHED_2 375 62 25302 +000 00:00:06.93 8587896
TEST_JOB_NORMAL_1 731 64 25130 +000 00:00:06.95 8587892
TEST_JOB_CRASHED_1 910 65 25270 +000 00:00:06.95 8587894
select job_name, enabled, state, run_count, failure_count, retry_count, start_date, last_start_date, next_run_date
from dba_scheduler_jobs v where job_name like '%TEST_JOB%';
JOB_NAME ENABL STATE RUN_COUNT FAILURE_COUNT RETRY_COUNT START_DATE LAST_START_DATE NEXT_RUN_DATE
------------------- ----- ----------------- ------------- ----------- ------------ ---------------- -------------
TEST_JOB_CRASHED_1 TRUE RUNNING 2 0 0 10:37:05.497 10:38:12.072 10:37:45.054
TEST_JOB_CRASHED_2 TRUE RUNNING 2 0 0 10:37:06.370 10:38:12.090 10:37:45.071
TEST_JOB_NORMAL_1 TRUE RUNNING 1 0 0 10:37:24.732 10:38:12.071 10:37:24.769
After a few minutes, we can see that there are no more running jobs in dba_scheduler_running_jobs,
and the job state in dba_scheduler_jobs stays at "SCHEDULED":
select job_name,session_id, slave_process_id, slave_os_process_id, elapsed_time, log_id
from dba_scheduler_running_jobs;
no rows selected
select job_name, enabled, state, run_count, failure_count, retry_count, start_date, last_start_date, next_run_date
from dba_scheduler_jobs v where job_name like '%TEST_JOB%';
JOB_NAME ENABL STATE RUN_COUNT FAILURE_COUNT RETRY_COUNT START_DATE LAST_START_DATE NEXT_RUN_DATE
------------------- ----- ------------------- ------------- ----------- ------------- ---------------- -------------
TEST_JOB_CRASHED_1 TRUE SCHEDULED 7 0 0 10:37:05.497 10:41:12.340 10:41:12.341
TEST_JOB_CRASHED_2 TRUE SCHEDULED 6 0 0 10:37:06.370 10:39:44.948 10:39:44.950
TEST_JOB_NORMAL_1 TRUE SCHEDULED 4 0 0 10:40:25.148 10:40:12.145 10:40:25.152
Now we can turn off CJQ0 tracing (otherwise the CJQ0 trace file will grow to many MB):
exec sys.dbms_system.set_ev(7, 15601, 27402, 0, '');
Note that we use CJQ0 trace event 27402 level 65535 (0xffff) instead of level 65355 (0xff4b) as documented in
MOS: Scheduler Stuck Executing Jobs And CJQ0 Locking SYS.LAST_OBSERVED_EVENT (Doc ID 2341386.1). One can also start a DB-wide 27402 trace by:
alter system set events '27402 trace name context forever, level 65535';
alter system set events '27402 trace name context off';
As a test, we can run a "SCHEDULED" job immediately in the current session (foreground session)
instead of a job slave session (background session) as follows:
exec dbms_scheduler.run_job(job_name => 'TEST_JOB_NORMAL_1', use_current_session => true);
exec dbms_scheduler.run_job(job_name => 'TEST_JOB_CRASHED_1', use_current_session => true);
3. Used Slaves by Job Incident
From the above dba_scheduler_jobs query, we can see that TEST_JOB_CRASHED_1 has run_count=7 and TEST_JOB_CRASHED_2 has run_count=6, so the total run_count of the two test_proc_crashed jobs is 13:
select sum(run_count) "#used_slaves"
from dba_scheduler_jobs
where job_name like 'TEST_JOB_CRASHED%';
#used_slaves
------------
13
The number of ORA-00600 [4156] incidents in v$diag_alert_ext during the test interval is also 13:
select count(distinct process_id) "#used_slaves", min(originating_timestamp), max(originating_timestamp)
from v$diag_alert_ext
where (message_text like '%incident%ORA-00600%4156%' or problem_key like '%ORA 600 [4156]%')
and originating_timestamp > timestamp'2021-01-18 10:30:00';
#used_slaves MIN(ORIGINATING_TIMESTAMP) MAX(ORIGINATING_TIMESTAMP)
------------ --------------------------- --------------------------
13 10:37:06.572 10:41:21.195
Open the DB alert.log and count the distinct OS process ids in all ORA-00600 [4156] incident messages
(around the text "ORA-00600: internal error code, arguments: [4156]"); it is exactly 13. Go to the diag incident directory and count the incident files with distinct OS process ids; it is also 13.
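To locate the alert log and incident directories to inspect, we can query v$diag_info (a small helper sketch; the returned paths are environment-specific):
select name, value
from v$diag_info
where name in ('Diag Alert', 'Diag Incident', 'Diag Trace');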
Now open the CJQ0 trace file. It shows that as long as the condition "CDB slave limit" <= "CDB Used/Reserved slaves" is not true, subroutine jscr_can_pdb_run_job returns 1 and more jobs can move to state "RUNNING". The text in the trace file looks like:
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscrsq_select_queue):Considering job 3715269
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Enter
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):CDB slave limit=13, CDB Used/Reserved slaves=5, MSL=2
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):PDB 0 slaves: used=4, assigned=1, reserved=5, max=13
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Return 1
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_increase_pdb_assigned_slaves):Enter
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_increase_pdb_assigned_slaves):Return
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscrsq_select_queue):JOB FLOW TRACE: SELECT Q: ADD JOB TO TJL WORK Q: 3715269
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscrsq_select_queue):Considering job 3715270
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Enter
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):CDB slave limit=13, CDB Used/Reserved slaves=6, MSL=2
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):PDB 0 slaves: used=4, assigned=2, reserved=6, max=13
SCHED 01-18 10:38:12.061 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Return 1
...
SCHED 01-18 10:39:12.095 1 00 23038 CJQ0 0(jscrsq_select_queue):Considering job 3715271
SCHED 01-18 10:39:12.095 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Enter
SCHED 01-18 10:39:12.095 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):CDB slave limit=13, CDB Used/Reserved slaves=8, MSL=2
SCHED 01-18 10:39:12.095 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):PDB 0 slaves: used=8, assigned=0, reserved=8, max=13
SCHED 01-18 10:39:12.095 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Return 1
When "CDB slave limit" <= "CDB Used/Reserved slaves" is true, jscr_can_pdb_run_job returns 0,
slaves remain in state "SCHEDULED", not able to move to state "RUNNING".
The trace file is repeatedly filled with following text.
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscrsq_select_queue):Considering job 3715270
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Enter
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):CDB slave limit=13, CDB Used/Reserved slaves=13, MSL=2
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):PDB 0 slaves: used=13, assigned=0, reserved=13, max=13
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Return 0
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscrsq_select_queue):Considering job 3715271
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Enter
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):CDB slave limit=13, CDB Used/Reserved slaves=13, MSL=2
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):PDB 0 slaves: used=13, assigned=0, reserved=13, max=13
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Return 0
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscrsq_select_queue):Considering job 3715269
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Enter
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):CDB slave limit=13, CDB Used/Reserved slaves=13, MSL=2
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):PDB 0 slaves: used=13, assigned=0, reserved=13, max=13
SCHED 01-18 10:43:10.442 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Return 0
From the above test and CJQ0 trace file, we can see that in this test #used_slaves equals the number of incidents,
and "CDB slave limit" is specified by JOB_QUEUE_PROCESSES.
Once #used_slaves reaches JOB_QUEUE_PROCESSES, no more jobs can be in state "RUNNING".
dba_scheduler_jobs shows the job as enabled ("TRUE"), but its state remains "SCHEDULED".
Restarting the database resets #used_slaves.
In the above trace, the slave limit is prefixed with CDB as "CDB slave limit=13". Job slaves are spawned in the root container because they are a shared resource among PDBs, as documented in MOS: Alter System Kill Session Failed With ORA-00026 When Kill a Job Session From PDB (Doc ID 2491701.1). (This MOS Note seems inconsistent with the Oracle documentation of JOB_QUEUE_PROCESSES, which says it is the maximum number of job slaves per instance, not per root container.)
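Note that after the crashes, v$session no longer shows the dead slaves while CJQ0 still counts them. Comparing the configured limit with the visible job slave sessions makes this discrepancy apparent (a sketch, assuming job slaves appear with program '(J0nn)' as in the queries later in this Blog):
select p.value job_queue_processes,
       (select count(*) from v$session s where s.program like '%(J0%') visible_job_slaves
from v$parameter p
where p.name = 'job_queue_processes';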
"MSL=2" seems the number of maximum assigned jobs in each scheduling scan. jscr_can_pdb_run_job Call Stack looks like:
#0 0x00000000041bfcbb in jscr_can_pdb_run_job ()
#1 0x00000000041be8a8 in jscrsq_select_queue ()
#2 0x00000000041b3224 in jscrs_select0 ()
#3 0x00000000041b00f2 in jscrs_select ()
#4 0x00000000122ac797 in rpiswu2 ()
#5 0x0000000003726293 in kkjcjexe ()
#6 0x0000000003725d65 in kkjssrh ()
#7 0x0000000012360495 in ksb_act_run_int ()
#8 0x000000001235f162 in ksb_act_run ()
#9 0x000000001235dec5 in ksbcti ()
#10 0x0000000003d2a550 in ksbabs ()
#11 0x0000000003d48611 in ksbrdp ()
#12 0x0000000004168bf7 in opirip ()
#13 0x00000000027b8138 in opidrv ()
#14 0x00000000033be90f in sou2o ()
#15 0x0000000000d81f9a in opimai_real ()
#16 0x00000000033cb767 in ssthrdmain ()
#17 0x0000000000d81ea3 in main ()
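For reference, such a call stack can be sampled from the live CJQ0 process with oradebug (run as SYS; 23038 is the CJQ0 OS process id shown in the trace above):
oradebug setospid 23038
oradebug short_stack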
The JOB_QUEUE_PROCESSES maximum value is fixed as:
up to Oracle 12cR1: 1000
from Oracle 12cR2: 4000
It seems that CJQ0 wakes up periodically every 200 ms to check for enabled jobs. The related hidden parameters are:
Name Description Default
---------------------------------------- -------------------------------------------------------- -------
_sched_delay_sample_interval_ms scheduling delay sampling interval in ms 1000
_sched_delay_max_samples scheduling delay maximum number of samples 4
_sched_delay_sample_collection_thresh_ms scheduling delay sample collection duration threshold ms 200
_sched_delay_measurement_sleep_us scheduling delay mesurement sleep us 1000
_sched_delay_os_tick_granularity_us os tick granularity used by scheduling delay calculations 16000
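These values can be listed with the classic hidden parameter query (a sketch; x$ tables require a SYS connection and are version-dependent):
select i.ksppinm name, v.ksppstvl value, i.ksppdesc description
from x$ksppi i, x$ksppcv v
where i.indx = v.indx
and i.ksppinm like '\_sched\_delay%' escape '\';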
In the above trace file, job 3715269, 3715270 and 3715271 can be found by query:
select o.object_name, start_date, last_enabled_time, last_start_date, next_run_date, last_end_date
from dba_objects o, sys.scheduler$_job j
where o.object_id = j.obj# and j.obj# in (3715269, 3715270, 3715271);
OBJECT_NAME START_DATE LAST_ENABLED_TIME LAST_START_DATE NEXT_RUN_DATE LAST_END_DATE
------------------ ------------ ----------------- --------------- ------------- -------------
TEST_JOB_CRASHED_1 10:37:05.497 10:37:06.351 10:41:12.340 10:41:12.341 10:41:24.322
TEST_JOB_CRASHED_2 10:37:06.370 10:37:06.373 10:39:44.948 10:39:44.950 10:40:12.115
TEST_JOB_NORMAL_1 10:40:25.148 10:40:25.152 10:40:12.145 10:40:25.152 10:41:12.153
Class object 12166 is DEFAULT_JOB_CLASS:
select owner, object_name, object_type from dba_objects where object_id = 12166;
OWNER OBJECT_NAME OBJECT_TYPE
------ ------------------ -----------
SYS DEFAULT_JOB_CLASS JOB CLASS
In Oracle 12.1, the above behaviour can only be partially reproduced.
Once #used_slaves reaches JOB_QUEUE_PROCESSES, jobs can still periodically start and stop
(the job state changes from "RUNNING" to "SCHEDULED", then back to "RUNNING" after a while).
The following CJQ0 trace shows that jleft (jobs left) starts at JOB_QUEUE_PROCESSES (13); once it reaches 0,
it is reset back to 13 after a certain interval.
SCHED 11:58:35.575 1 00 19988592 CJQ0 0(jscrs_select0):position 0, job 1287236, considered_count 0, prio 6
SCHED 11:58:35.575 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):Entering jscrmrr_mark_run_resource_mgr
SCHED 11:58:35.575 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):jleft=13, sleft=1032, pleft=638
SCHED 11:58:35.575 1 00 19988592 CJQ0 0(jscrrsa_rm_slave_allocation):all cg granted total of 1
...
SCHED 11:59:36.145 1 00 19988592 CJQ0 0(jscrs_select0):position 0, job 1287236, considered_count 0, prio 6
SCHED 11:59:36.145 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):Entering jscrmrr_mark_run_resource_mgr
SCHED 11:59:36.145 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):jleft=1, sleft=1029, pleft=636
SCHED 11:59:36.145 1 00 19988592 CJQ0 0(jscrrsa_rm_slave_allocation):all cg granted total of 1
...
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrs_select0):position 0, job 1287237, considered_count 2, prio 6
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrs_select0):position 1, job 1287236, considered_count 0, prio 6
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):Entering jscrmrr_mark_run_resource_mgr
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):jleft=0, sleft=1032, pleft=637
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):Max number of jobs we can run is 0
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrs_select0):No jobs to run
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrcw_compute_wait):Entering jscrcw_compute_wait
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrcw_compute_wait):0 jscrcw_compute_wait returns value 400
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrs_select0):Running 0 jobs:
SCHED 11:59:44.941 1 00 19988592 CJQ0 0(jscrs_select0):Waiting 4000 milliseconds:
...
SCHED 12:01:19.509 1 00 19988592 CJQ0 0(jscrs_select0):position 0, job 1287236, considered_count 27, prio 5
SCHED 12:01:19.509 1 00 19988592 CJQ0 0(jscrs_select0):position 1, job 17408, considered_count 13, prio 6
SCHED 12:01:19.509 1 00 19988592 CJQ0 0(jscrs_select0):position 2, job 1287238, considered_count 12, prio 6
SCHED 12:01:19.509 1 00 19988592 CJQ0 0(jscrs_select0):position 3, job 1287237, considered_count 9, prio 6
SCHED 12:01:19.509 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):Entering jscrmrr_mark_run_resource_mgr
SCHED 12:01:19.509 1 00 19988592 CJQ0 0(jscrmrr_mark_run_resource_mgr):jleft=13, sleft=1033, pleft=661
SCHED 12:01:19.509 1 00 19988592 CJQ0 0(jscrrsa_rm_slave_allocation):all cg granted total of 4
Probably this different behaviour shows that DBMS_SCHEDULER is still evolving from release to release.
4. Used Slaves by Job Kill
First, we disable both incident jobs. Then we look at different job kill commands and their impact on #used_slaves.
exec dbms_scheduler.disable ('TEST_JOB_CRASHED_1', force => true, commit_semantics =>'ABSORB_ERRORS');
exec dbms_scheduler.disable ('TEST_JOB_CRASHED_2', force => true, commit_semantics =>'ABSORB_ERRORS');
4.1. OS Command Kill
Increase job_queue_processes from 13 to 14 to allow one more job slave to run:
alter system set job_queue_processes=14 scope=memory;
Now we can see that TEST_JOB_NORMAL_1 has changed from "SCHEDULED" to "RUNNING"
(the incident jobs are all in state "DISABLED"):
select job_name, enabled, state, run_count, failure_count, retry_count, start_date, last_start_date, next_run_date
from dba_scheduler_jobs v where job_name like '%TEST_JOB%';
JOB_NAME ENABL STATE RUN_COUNT FAILURE_COUNT RETRY_COUNT START_DATE LAST_START_DATE NEXT_RUN_DATE
------------------ ----- -------- --------- ------------- ----------- ------------ --------------- -------------
TEST_JOB_CRASHED_1 FALSE DISABLED 7 0 0 10:37:05.497 10:41:12.340 10:41:12.341
TEST_JOB_CRASHED_2 FALSE DISABLED 6 0 0 10:37:06.370 10:39:44.948 10:39:44.950
TEST_JOB_NORMAL_1 TRUE RUNNING 4 0 0 11:36:31.298 11:36:18.276 11:36:31.305
Find the OS process id of TEST_JOB_NORMAL_1:
select s.program, s.module, s.action, s.sid, s.serial#, p.pid, p.spid, s.event
from v$session s, v$process p
where s.paddr=p.addr and s.program like '%J0%';
PROGRAM MODULE ACTION SID SERIAL# PID SPID EVENT
-------------------- -------------- ----------------- --- ------- --- ----- -----------------
oracle@testdb (J000) DBMS_SCHEDULER TEST_JOB_NORMAL_1 373 27399 50 23709 PL/SQL lock timer
Then kill it by OS command:
kill -9 23709
Now we can see that TEST_JOB_NORMAL_1 is in state "SCHEDULED", the same as in the incident case above.
So a job killed by an OS command is also counted into #used_slaves
(the TEST_JOB_CRASHED_1 and TEST_JOB_CRASHED_2 rows are unchanged and removed from the output).
select job_name, enabled, state, run_count, failure_count, retry_count, start_date, last_start_date, next_run_date
from dba_scheduler_jobs v where job_name like '%TEST_JOB%';
JOB_NAME ENABL STATE RUN_COUNT FAILURE_COUNT RETRY_COUNT START_DATE LAST_START_DATE NEXT_RUN_DATE
----------------- ----- --------- --------- ------------- ----------- ------------ --------------- -------------
TEST_JOB_NORMAL_1 TRUE SCHEDULED 13 0 0 11:44:31.619 11:44:18.617 11:44:18.617
The CJQ0 trace shows "CDB slave limit=14, CDB Used/Reserved slaves=14"; no more jobs can be in state "RUNNING".
SCHED 01-18 11:49:48.662 1 00 23038 CJQ0 0(jscrsq_select_queue):Considering job 3715271
SCHED 01-18 11:49:48.662 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Enter
SCHED 01-18 11:49:48.662 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):CDB slave limit=14, CDB Used/Reserved slaves=14, MSL=2
SCHED 01-18 11:49:48.662 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):PDB 0 slaves: used=14, assigned=0, reserved=14, max=14
SCHED 01-18 11:49:48.662 1 00 23038 CJQ0 0(jscr_can_pdb_run_job):Return 0
In the DB alert log, there is no info about this OS-killed job, but the pmon trace contains the text below:
*** 2021-01-18T11:45:36.206630+01:00
Marked process 0xb75d64a0 pid=50 serial=18 ospid=23709 newly dead
User session information :
sid: 373 ser: 945
client details:
O/S info: user: oracle, term: UNKNOWN, ospid: 23709
machine: testdb program: oracle@testdb (J000)
application name: DBMS_SCHEDULER, hash value=2478762354
action name: TEST_JOB_NORMAL_1, hash value=355935408
4.2. Oracle Kill Statement With "immediate" Option (KILL HARD)
We increase job_queue_processes from 14 to 15:
alter system set job_queue_processes=15 scope=memory;
TEST_JOB_NORMAL_1 is back in state "RUNNING" again:
select job_name, enabled, state, run_count, failure_count, retry_count, start_date, last_start_date, next_run_date
from dba_scheduler_jobs v where job_name like '%TEST_JOB%';
JOB_NAME ENABL STATE RUN_COUNT FAILURE_COUNT RETRY_COUNT START_DATE LAST_START_DATE NEXT_RUN_DATE
----------------- ----- ------- --------- ------------- ----------- ------------ --------------- -------------
TEST_JOB_NORMAL_1 TRUE RUNNING 13 0 0 12:17:43.620 12:17:30.610 12:17:43.627
select s.program, s.module, s.action, s.sid, s.serial#, p.pid, p.spid, s.event
from v$session s, v$process p
where s.paddr=p.addr and s.program like '%J0%';
PROGRAM MODULE ACTION SID SERIAL# PID SPID EVENT
-------------------- -------------- ----------------- --- ------- --- ---- -----------------
oracle@testdb (J000) DBMS_SCHEDULER TEST_JOB_NORMAL_1 722 45490 58 5571 PL/SQL lock timer
Pick the sid and serial#, and kill TEST_JOB_NORMAL_1 with the Oracle kill statement using the "immediate" option:
alter system kill session '722,45490,@1' immediate;
Now we can see that TEST_JOB_NORMAL_1 is in state "SCHEDULED", same as above incident case.
So the job killed by Oracle statement with "immediate" option is counted into #used_slaves.
select job_name, enabled, state, run_count, failure_count, retry_count, start_date, last_start_date, next_run_date
from dba_scheduler_jobs v where job_name like '%TEST_JOB%';
JOB_NAME ENABL STATE RUN_COUNT FAILURE_COUNT RETRY_COUNT START_DATE LAST_START_DATE NEXT_RUN_DATE
----------------- ----- --------- --------- ------------- ----------- ------------ --------------- -------------
TEST_JOB_NORMAL_1 TRUE SCHEDULED 20 0 0 12:23:43.714 12:23:30.712 12:23:30.712
The DB alert log shows process termination by "KILL HARD SAFE":
2021-01-18T12:23:54.011134+01:00
Process termination requested for pid 5571 [source = rdbms], [info = 2] [request issued by pid: 5315, uid: 100]
2021-01-18T12:23:54.060574+01:00
KILL SESSION for sid=(722, 45490):
Reason = alter system kill session
Mode = KILL HARD SAFE -/-/-
Requestor = USER (orapid = 50, ospid = 5315, inst = 1)
Owner = Process: J000 (orapid = 58, ospid = 5571)
Result = ORA-0
pmon trace text:
*** 2021-01-18T12:23:54.061510+01:00
Marked process 0xb75df060 pid=58 serial=17 ospid=5571 newly dead
User session information :
sid: 722 ser: 45490
client details:
O/S info: user: oracle, term: UNKNOWN, ospid: 5571
machine: testdb program: oracle@testdb (J000)
application name: DBMS_SCHEDULER, hash value=2478762354
action name: TEST_JOB_NORMAL_1, hash value=355935408
The kill statement seems to be implemented by the subroutine call chain "kpoal8 () -> opiexe () -> kksExecuteCommand ()".
4.3. Oracle Kill Statement Without "immediate" Option (KILL SOFT)
We increase job_queue_processes from 15 to 16:
alter system set job_queue_processes=16 scope=memory;
TEST_JOB_NORMAL_1 is back in state "RUNNING" again:
select job_name, enabled, state, run_count, failure_count, retry_count, start_date, last_start_date, next_run_date
from dba_scheduler_jobs v where job_name like '%TEST_JOB%';
JOB_NAME ENABL STATE RUN_COUNT FAILURE_COUNT RETRY_COUNT START_DATE LAST_START_DATE NEXT_RUN_DATE
------------------ ----- -------- --------- ------------- ----------- ------------ --------------- -------------
TEST_JOB_NORMAL_1 TRUE RUNNING 20 0 0 12:23:43.714 12:26:16.292 12:23:30.712
select s.program, s.module, s.action, s.sid, s.serial#, p.pid, p.spid, s.event
from v$session s, v$process p
where s.paddr=p.addr and s.program like '%J0%';
PROGRAM MODULE ACTION SID SERIAL# PID SPID EVENT
-------------------- -------------- ----------------- --- ------- --- ---- -----------------
oracle@testdb (J000) DBMS_SCHEDULER TEST_JOB_NORMAL_1 722 58202 58 6652 PL/SQL lock timer
Pick the sid and serial#, kill TEST_JOB_NORMAL_1 with the Oracle kill statement without the "immediate" option, and check its state:
alter system kill session '722,58202,@1';
select job_name, enabled, state, run_count, failure_count, retry_count, start_date, last_start_date, next_run_date
from dba_scheduler_jobs v where job_name like '%TEST_JOB%';
JOB_NAME ENABL STATE RUN_COUNT FAILURE_COUNT RETRY_COUNT START_DATE LAST_START_DATE NEXT_RUN_DATE
----------------- ----- -------- --------- ------------- ----------- ------------ --------------- -------------
TEST_JOB_NORMAL_1 TRUE RUNNING 24 0 0 12:28:58.543 12:29:45.639 12:28:58.549
Now we can see that TEST_JOB_NORMAL_1 can start again and is in state "RUNNING".
So a job killed by the Oracle kill statement without the "immediate" option is not counted into #used_slaves. The DB alert log shows the process abort by "KILL SOFT":
2021-01-18T12:28:30.705849+01:00
opidrv aborting process J000 ospid (6652) as a result of ORA-28
2021-01-18T12:28:30.754338+01:00
KILL SESSION for sid=(722, 58202):
Reason = alter system kill session
Mode = KILL SOFT -/-/-
Requestor = USER (orapid = 50, ospid = 5315, inst = 1)
Owner = Process: J000 (orapid = 58, ospid = 6652)
Result = ORA-0
pmon trace:
*** 2021-01-18T12:28:33.714660+01:00
Marked process 0xb75df060 pid=58 serial=18 ospid=6652 newly dead
In Oracle 12.1, none of the above job kill behaviours are observed.
4.4. Alter System Kill Statement: UNIX Signals, v$session and v$process Changes
Here are some observations of the kill statement with vs. without "immediate" in Oracle 18c and 19c.
(1). Signals Sending (tracing with strace/truss)
-. alter system kill session 'sid,serial#' immediate; sends SIGTERM to the killed session (process). The killed process received:
SIGTERM {si_signo=SIGTERM, si_code=SI_QUEUE, si_pid=24984, si_uid=100, si_value={int=2, ptr=0x100000002}}
-. alter system kill session 'sid,serial#'; sends SIGTSTP to the killed session (process). The killed process received:
SIGTSTP {si_signo=SIGTSTP, si_code=SI_TKILL, si_pid=24984, si_uid=100}
Both SIGTERM and SIGTSTP can be ignored or handled (caught) differently, depending on OS (platform) and Oracle version, whereas their siblings SIGKILL and SIGSTOP are unblockable and cannot be caught or ignored.
(2). After Sending the Kill Statement
-. For the session killed with "immediate", the UNIX process is terminated. The session disappears from v$session because flag KSSPAFLG in the underlying x$ksuse is changed from 1 to 6 (filtered out by BITAND("S"."KSSPAFLG",1)<>0). Its info is still visible in x$ksuse (v$session.SID = x$ksuse.INDX). The process disappears from v$process/x$ksupr.
-. For the session killed without "immediate", the UNIX process is not terminated. It is still visible in v$session/x$ksuse and v$process/x$ksupr.
(3). After the Kill Statement, using the killed session again to execute a statement:
-. The session killed with "immediate" throws ORA-03113. The UNIX process is already terminated. It is still kept in x$ksuse and x$ksupr. The flag KSSPAFLG in the underlying x$ksuse is further changed from 6 to 2 (filtered out by BITAND("S"."KSSPAFLG",1)<>0).
-. The session killed without "immediate" throws ORA-00028. The UNIX process is still alive (not terminated). It is no longer visible in v$session because the flag KSSPAFLG in the underlying x$ksuse is changed from 1 to 2 (filtered out by BITAND("S"."KSSPAFLG",1)<>0). It is still kept in v$process/x$ksupr, and in x$ksuse with KSSPAFLG=2. We can see that the UNIX process actively reacts to actions from the session; on each further use of that killed session, its process received:
SIGSEGV {si_signo=SIGSEGV, si_code=SEGV_MAPERR, si_addr=0}
There are also session cancel and disconnect statements. Here is the observed behaviour:
-. alter system cancel sql 'sid,serial#'; sends SIGTSTP to the cancelled session (process). The cancelled process received:
SIGTSTP {si_signo=SIGTSTP, si_code=SI_TKILL, si_pid=24984, si_uid=100}
The cancelled session stays alive in v$session and v$process. It only cancels the current execution (even in the middle of processing or in a loop). If using that session again, it behaves as normal.
-. alter system disconnect session 'sid,serial#' immediate; sends SIGTERM to the disconnected session (process). The disconnected process received:
SIGTERM {si_signo=SIGTERM, si_code=SI_QUEUE, si_pid=24984, si_uid=100, si_value={int=2, ptr=0x100000002}}
The disconnected session in x$ksuse has KSSPAFLG changed from 1 to 6. If using that session again, it throws ORA-03113: end-of-file on communication channel.
-. alter system disconnect session 'sid,serial#'; sends SIGTSTP to the disconnected session (process). The disconnected process received:
SIGTSTP {si_signo=SIGTSTP, si_code=SI_TKILL, si_pid=24984, si_uid=100}
The disconnected session is kept in v$process/x$ksupr, and in x$ksuse with KSSPAFLG=2. If using that session again, it throws ORA-00028: your session has been killed.
The cancel statement behaves like kill without "immediate". For the disconnect statement, the two variants (with/without "immediate") behave like the corresponding kill variants.
We can see that the same signal SIGTSTP is handled differently. In the case of:
alter system kill session 'sid,serial#';
it makes the session unusable: if using that session again, it throws ORA-00028. Whereas in the case of:
alter system cancel sql 'sid,serial#';
it only cancels the current execution: if using that session again, it behaves normally. By the way, we can also toggle SIGSTOP and SIGCONT to suspend and resume a session's current execution:
kill -s SIGSTOP pid
kill -s SIGCONT pid
Anyway, SIGKILL is used as a last resort to terminate processes immediately.
It cannot be caught or ignored, and the receiving process cannot perform any clean-up upon receiving this signal.
4.5. v$resource_limit and x$ksupr/v$process, x$ksuse/v$session
At instance startup, Oracle allocates processes and sessions according to their initial_allocation values in v$resource_limit.
select * from v$resource_limit where resource_name in ('processes', 'sessions');
You can see that the numbers of rows in x$ksupr and x$ksuse are the same as their initial_allocation values.
But fewer processes and sessions are visible in v$process and v$session, since recursive and unused sessions are filtered out.
(Note: both x$ksupr and x$ksuse contain one identically named column: "KSSPAFLG".)
v$process rows are selected as:
select ksspaflg, bitand (ksspaflg, 1), p.* from x$ksupr p
where BITAND("KSSPAFLG",1)<>0;
v$session rows are selected as:
select ksuseflg, bitand(ksuseflg,1), ksspaflg, bitand(ksspaflg,1), s.* from x$ksuse s
where BITAND("S"."KSSPAFLG",1)<>0 AND BITAND("S"."KSUSEFLG",1)<>0;
current_utilization of sessions in v$resource_limit counts the rows selected without the BITAND("S"."KSUSEFLG",1)<>0 predicate:
select ksuseflg, bitand(ksuseflg,1), ksspaflg, bitand(ksspaflg,1), s.* from x$ksuse s
where BITAND("S"."KSSPAFLG",1)<>0;
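As a cross-check (a sketch), the count returned by this query should match current_utilization in v$resource_limit:
select resource_name, current_utilization, max_utilization, initial_allocation, limit_value
from v$resource_limit
where resource_name = 'sessions';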
We can see that session type in v$session is mapped from x$ksuse.ksuseflg as:
DECODE (BITAND (S.KSUSEFLG, 19),
        17, 'BACKGROUND',
        1, 'USER',
        2, 'RECURSIVE',
        '?'),
and 'RECURSIVE' sessions have BITAND (S.KSUSEFLG, 19)=2. All even flag values are filtered out by the predicate BITAND("S"."KSUSEFLG",1)<>0.
When the initial allocation values are exceeded, Oracle throws the corresponding error:
ORA-00018: maximum number of sessions exceeded
ORA-00020: maximum number of processes (1200) exceeded
The default value of parameter SESSIONS is computed as (1.5 * PROCESSES) + 24.
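For example, with PROCESSES=1200 (as in the ORA-00020 message above), the derived SESSIONS default would be 1.5*1200 + 24 = 1824. Both parameters can be checked with (a sketch):
select name, value from v$parameter where name in ('processes', 'sessions');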
The following MOS Notes have related descriptions:
-. KILLING INACTIVE SESSIONS DOES NOT REMOVE SESSION ROW FROM V$SESSION (Doc ID 1041427.6)
-. Troubleshooting Guide - ORA-18: Maximum Number Of Sessions (%S) Exceeded (Doc ID 1307128.1)
5. Job Datetime and Start Delay
For job TEST_JOB_NORMAL_1, its job_action contains a set_attribute of 'start_date':
-- TEST_JOB_NORMAL_1 job_action
begin
dbms_lock.sleep(13);
dbms_scheduler.set_attribute(
name => 'TEST_JOB_NORMAL_1'
,attribute => 'start_date'
,value => systimestamp);
dbms_lock.sleep(47);
end;
Removing the 'start_date' setting, we create a similar job TEST_JOB_NORMAL_B:
begin
dbms_scheduler.create_job (
job_name => 'TEST_JOB_NORMAL_B',
job_type => 'PLSQL_BLOCK',
job_action =>
'begin
dbms_lock.sleep(13);
dbms_lock.sleep(47);
end;',
start_date => systimestamp,
repeat_interval => 'systimestamp',
auto_drop => true,
enabled => true);
end;
/
We increase job_queue_processes from 16 to 17 to run TEST_JOB_NORMAL_B:
alter system set job_queue_processes=17 scope=memory;
Both jobs are running, and now we look at their datetime columns
(we query sys.scheduler$_job instead of dba_scheduler_jobs for two additional fields: last_end_date and last_enabled_time):
select o.object_name, o.created, o.last_ddl_time, start_date, last_start_date, next_run_date, last_end_date, last_enabled_time
from dba_objects o, sys.scheduler$_job j
where o.object_id = j.obj# and o.object_name like 'TEST_JOB_NORMAL_%';
OBJECT_NAME CREATED LAST_DDL START_DATE LAST_START_DATE NEXT_RUN_DATE LAST_END_DATE LAST_ENABLED_TIME
----------------- -------- -------- ------------ --------------- ------------- ------------- -----------------
TEST_JOB_NORMAL_1 10:37:11 15:49:50 15:49:50.791 15:49:37.786 15:49:50.810 15:49:37.758 15:49:50.810
TEST_JOB_NORMAL_B 14:14:55 15:49:34 14:14:55.515 15:49:34.731 15:48:34.726 15:49:34.728 15:43:34.625
In the simple case (TEST_JOB_NORMAL_B), the job start_date is (almost) the same as DBA_OBJECTS.created.
Both start_date and last_enabled_time are not changed once the job is started.
last_start_date and last_end_date are the timestamps of job action start and end
(in fact, last_start_date is the current start timestamp, last_end_date is the previous end timestamp).
next_run_date is the expected time of the next run, calculated when the current run starts
(only a scheduled time, not a real performed time).
DBA_OBJECTS.last_ddl_time is the last modification time of the job object.
In the case of TEST_JOB_NORMAL_1, job_action sets 'start_date' to 'systimestamp' 13 seconds after job start, then continues for another 47 seconds. Therefore start_date is 13 seconds after last_start_date. next_run_date and last_enabled_time are also modified close to start_date. DBA_OBJECTS.last_ddl_time records the timestamp of this modification.
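This 13-second gap can be verified directly (a sketch; subtracting the two timestamps yields an interval of about +13 seconds):
select job_name, (start_date - last_start_date) start_gap
from dba_scheduler_jobs
where job_name = 'TEST_JOB_NORMAL_1';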
Now we can also query dba_scheduler_job_run_details for historical job run timestamps:
select job_name, log_date, status, req_start_date, actual_start_date, session_id, slave_pid
,(actual_start_date - req_start_date) delay
from dba_scheduler_job_run_details v
where job_name = 'TEST_JOB_NORMAL_1'
order by v.log_date;
JOB_NAME LOG_DATE STATUS REQ_START_DATE ACTUAL_START_DA SESSION_ID SLAVE_ DELAY
----------------- ------------ --------- -------------- --------------- ---------- ------ -------------
-- TEST_JOB_NORMAL_1 RUN_COUNT=4 till state=SCHEDULED
TEST_JOB_NORMAL_1 10:38:11.853 SUCCEEDED 10:37:11.681 10:37:11.703 731,19298 25130 +00:00:00.022
TEST_JOB_NORMAL_1 10:39:12.080 SUCCEEDED 10:37:24.769 10:38:12.071 731,52059 25130 +00:00:47.302
TEST_JOB_NORMAL_1 10:40:12.112 SUCCEEDED 10:38:25.077 10:39:12.105 731,411 25130 +00:00:47.027
TEST_JOB_NORMAL_1 10:41:12.153 SUCCEEDED 10:39:25.111 10:40:12.145 731,11120 25130 +00:00:47.034
-- increase job_queue_processes to make state=RUNNING
TEST_JOB_NORMAL_1 11:37:18.437 SUCCEEDED 10:40:25.152 11:36:18.276 373,30624 23709 +00:55:53.124
TEST_JOB_NORMAL_1 11:38:18.554 SUCCEEDED 11:36:31.305 11:37:18.546 373,36987 23709 +00:00:47.240
TEST_JOB_NORMAL_1 11:39:18.564 SUCCEEDED 11:37:31.552 11:38:18.556 373,44734 23709 +00:00:47.003
TEST_JOB_NORMAL_1 11:40:18.574 SUCCEEDED 11:38:31.562 11:39:18.566 373,54579 23709 +00:00:47.004
TEST_JOB_NORMAL_1 11:41:18.584 SUCCEEDED 11:39:31.573 11:40:18.576 373,4241 23709 +00:00:47.003
TEST_JOB_NORMAL_1 11:42:18.594 SUCCEEDED 11:40:31.583 11:41:18.587 373,27399 23709 +00:00:47.003
TEST_JOB_NORMAL_1 11:43:18.604 SUCCEEDED 11:41:31.593 11:42:18.597 373,41459 23709 +00:00:47.004
TEST_JOB_NORMAL_1 11:44:18.614 SUCCEEDED 11:42:31.603 11:43:18.607 373,23208 23709 +00:00:47.003
-- kill by OS command to stop it (state=SCHEDULED)
TEST_JOB_NORMAL_1 11:45:45.239 STOPPED 11:44:31.623 11:44:18.617 373,945 -00:00:13.006
-- increase job_queue_processes to make state=RUNNING
TEST_JOB_NORMAL_1 12:18:30.629 SUCCEEDED 11:44:18.617 12:17:30.611 722,16157 5571 +00:33:11.993
TEST_JOB_NORMAL_1 12:19:30.669 SUCCEEDED 12:17:43.627 12:18:30.660 722,53334 5571 +00:00:47.033
In the normal case of a job run, actual_start_date to log_date spans from the start to the end of each run, and req_start_date is 13 seconds after the previous actual_start_date.
If we use (actual_start_date - req_start_date) to determine job start delay, we have to take the 'start_date' setting, the "SCHEDULED" state and killed jobs into account. Even when there is no real delay, (actual_start_date - req_start_date) can show certain values, even big or negative ones.
Run the same query for incident job TEST_JOB_CRASHED_1. The output lists all 7 runs (RUN_COUNT=7) with 4 computed fields (DELAY, DURATION, START_DIFF, LOG_DIFF). DELAY and START_DIFF are (almost) the same within each row, but vary from row to row. During an incident, the job slave process is still alive and the dump is performed by the job slave process itself, but SLAVE_PID is not filled.
select job_name, log_date, status, req_start_date, actual_start_date, session_id, slave_pid
,(actual_start_date - req_start_date) delay
,(log_date - actual_start_date) duration
,(actual_start_date - lag(actual_start_date) over(order by log_date)) start_diff
,(log_date - lag(log_date) over(order by log_date)) log_diff
from dba_scheduler_job_run_details v
where job_name = 'TEST_JOB_CRASHED_1'
order by v.log_date;
JOB_NAME LOG_DATE STATUS REQ_START_DATE ACTUAL_START_DA SESSION_ID SLAVE_ DELAY DURATION START_DIFF LOG_DIFF
------------------ ------------ ------- -------------- --------------- ---------- ------ ------------- ------------- ------------- -------------
TEST_JOB_CRASHED_1 10:37:44.948 STOPPED 10:37:06.351 10:37:06.416 375,57672 +00:00:00.064 +00:00:38.532
TEST_JOB_CRASHED_1 10:38:12.048 STOPPED 10:37:06.439 10:37:45.052 375,51826 +00:00:38.613 +00:00:26.995 +00:00:38.636 +00:00:27.099
TEST_JOB_CRASHED_1 10:38:44.843 STOPPED 10:37:45.054 10:38:12.072 910,52595 +00:00:27.017 +00:00:32.770 +00:00:27.020 +00:00:32.795
TEST_JOB_CRASHED_1 10:39:12.082 STOPPED 10:38:12.075 10:38:44.883 375,55842 +00:00:32.808 +00:00:27.199 +00:00:32.810 +00:00:27.238
TEST_JOB_CRASHED_1 10:39:44.892 STOPPED 10:38:44.886 10:39:12.106 910,46748 +00:00:27.220 +00:00:32.785 +00:00:27.223 +00:00:32.810
TEST_JOB_CRASHED_1 10:40:12.114 STOPPED 10:39:12.109 10:39:44.932 375,49907 +00:00:32.822 +00:00:27.182 +00:00:32.825 +00:00:27.221
TEST_JOB_CRASHED_1 10:41:24.323 STOPPED 10:39:44.934 10:41:12.340 731,55755 +00:01:27.405 +00:00:11.983 +00:01:27.408 +00:01:12.209
select job_name, log_date, operation, status,dbms_lob.substr(additional_info, 120,1) add_info
from dba_scheduler_job_log v
where job_name like 'TEST_JOB_CRASHED_1'
order by v.log_date;
JOB_NAME LOG_DATE OPERATION STATUS ADD_INFO
------------------ ------------ --------- ------- -----------------------------------------
TEST_JOB_CRASHED_1 10:37:44.790 RUN STOPPED REASON="Job slave process was terminated"
TEST_JOB_CRASHED_1 10:38:12.047 RUN STOPPED REASON="Job slave process was terminated"
TEST_JOB_CRASHED_1 10:38:44.843 RUN STOPPED REASON="Job slave process was terminated"
TEST_JOB_CRASHED_1 10:39:12.082 RUN STOPPED REASON="Job slave process was terminated"
TEST_JOB_CRASHED_1 10:39:44.892 RUN STOPPED REASON="Job slave process was terminated"
TEST_JOB_CRASHED_1 10:40:12.114 RUN STOPPED REASON="Job slave process was terminated"
TEST_JOB_CRASHED_1 10:41:24.323 RUN STOPPED REASON="Job slave process was terminated"
Actually, as we observed, sys.scheduler$_job_run_details is filled by the job session calling "jslgLogJobRunInt() -> OCIStmtExecute()"
with LOG_DATE = SYSTIMESTAMP, whereas sys.scheduler$_event_log is inserted by both the job session
and the control session (enable/disable/stop/drop). One similar query from OEM Grid Control looks like:
select j.job_name,
j.owner,
case
when j.state = 'SCHEDULED'
then to_char (j.next_run_date, 'DD-MON-YYYY HH24:MI:SS TZH:TZM')
else null
end scheduled_date,
case
when j.state = 'RUNNING'
then ( (sys_extract_utc (systimestamp) + 0) - (sys_extract_utc (j.last_start_date) + 0))* 24*60
else null
end duration_mins,
j.state
from dba_scheduler_jobs j, dba_scheduler_running_jobs r
where ( (j.state = 'RUNNING' or j.state = 'CHAIN_STALLED')
or (j.state = 'SCHEDULED'))
and j.job_subname is null
and r.job_subname is null
and j.job_name = r.job_name(+)
and j.owner = r.owner(+)
and ( ( j.state = 'SCHEDULED'
and ( (sys_extract_utc (j.next_run_date) + 0) - (sys_extract_utc (systimestamp) + 0)) * 24*60 < 24*60)
or (j.state = 'RUNNING' or j.state = 'CHAIN_STALLED'))
union
select sj.job_name,
sj.owner,
null scheduled_date,
extract (day from 24 * 60 * sj.run_duration) duration_mins,
sj.status
from dba_scheduler_job_run_details sj
where ( ( sj.status = 'STOPPED'
and ( (sys_extract_utc (systimestamp) + 0)
- ( (sys_extract_utc (sj.actual_start_date) + 0) + extract (day from sj.run_duration))) * 24*60 < 24*60)
or ( sj.status = 'FAILED'
and ( (sys_extract_utc (systimestamp) + 0)
- ( (sys_extract_utc (sj.actual_start_date) + 0) + extract (day from sj.run_duration))) * 24*60 < 24*60)
);
6. Test Cleanup
After the test, we can clean up with the following procedure:
create or replace procedure clearup_test as
begin
for c in (select * from dba_scheduler_jobs where job_name like '%TEST_JOB%') loop
begin
--set DBA_SCHEDULER_JOBS.enabled=FALSE
dbms_scheduler.disable (c.job_name, force => true, commit_semantics =>'ABSORB_ERRORS');
--set DBA_SCHEDULER_JOBS.enabled=TRUE, so that it can be scheduled to run (state='RUNNING')
-- dbms_scheduler.enable (c.job_name, commit_semantics =>'ABSORB_ERRORS');
exception when others then null;
end;
end loop;
for c in (select * from dba_scheduler_running_jobs where job_name like '%TEST_JOB%') loop
begin
--If force=FALSE, gracefully stop the job, slave process can update the status of the job in the job queue.
--If force= TRUE, the Scheduler immediately terminates the job slave.
--For a repeating job with attribute "start_date => systimestamp" and enabled=TRUE,
--it re-starts immediately (state changes from 'SCHEDULED' to 'RUNNING') and DBA_SCHEDULER_JOBS.run_count increases by 1.
dbms_scheduler.stop_job (c.job_name, force => true, commit_semantics =>'ABSORB_ERRORS');
exception when others then null;
end;
end loop;
for c in (select * from dba_scheduler_jobs where job_name like '%TEST_JOB%') loop
begin
--If force=TRUE, the Scheduler first attempts to stop the running job instances
--(by issuing the STOP_JOB call with the force flag set to false), and then drops the jobs.
dbms_scheduler.drop_job (c.job_name, force => true, commit_semantics =>'ABSORB_ERRORS');
exception when others then null;
end;
end loop;
end;
/
exec clearup_test;