diff --git a/init/Kconfig b/init/Kconfig
index 719c94612548..7f8309213c68 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -843,6 +843,40 @@ config CACULE_SCHED
 	  If unsure, say Y here.
 
+config CACULE_RDB
+	bool "RDB (Response Driven Balancer)"
+	default y
+	depends on CACULE_SCHED
+	help
+	  This is an experimental load balancer for CacULE. It is a
+	  lightweight load balancer that replaces the CFS load balancer
+	  and migrates tasks based on their interactivity scores.
+
+	  If unsure, say Y here.
+
+config RDB_INTERVAL
+	int "RDB load balancer interval"
+	default 19
+	depends on CACULE_RDB
+	help
+	  This interval controls how often the RDB load balancer runs.
+	  trigger_load_balance() runs on every tick. At high HZ values
+	  this can be overwhelming, since RDB load balancing takes rq
+	  locks, which can reduce performance. The balance interval helps
+	  to avoid running the load balancer on every tick. For example,
+	  RDB_INTERVAL=3 runs the load balancer only every 3ms. The right
+	  setting depends on HZ. If you want the load balancer to run
+	  every 2ms and HZ=500, it is better to set RDB_INTERVAL=0, since
+	  the tick already fires every 2ms (1000ms / 500 = 2ms). However,
+	  with HZ=1000, setting for example RDB_INTERVAL=4 keeps the load
+	  balancer from running every 1ms and makes it run every 4ms
+	  instead. Lower values (or 0 to disable the interval) balance
+	  tasks as soon as possible, at the cost of more locking/blocking
+	  time. Higher values relax the balancing locking, at the cost of
+	  an imbalanced workload for that period (e.g. with
+	  RDB_INTERVAL=100 there is no balancing for 100ms, except for
+	  newidle_balance, which is not affected by RDB_INTERVAL).
+
+	  If in doubt, use the default value.
 
 #
 # For architectures that want to enable the support for NUMA-affine scheduler
@@ -961,6 +995,7 @@ config CGROUP_WRITEBACK
 
 menuconfig CGROUP_SCHED
 	bool "CPU controller"
+	depends on !CACULE_RDB
 	default n
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
@@ -1238,10 +1273,10 @@ config CHECKPOINT_RESTORE
 
 config SCHED_AUTOGROUP
 	bool "Automatic process group scheduling"
+	depends on !CACULE_RDB
 	select CGROUPS
 	select CGROUP_SCHED
 	select FAIR_GROUP_SCHED
-	default y
 	help
 	  This option optimizes the scheduler for common desktop workloads by
 	  automatically creating and populating task groups.
This separation diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8505beec4e99..1a09237a3dfe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8112,8 +8112,11 @@ void __init sched_init(void) BUG_ON(&dl_sched_class + 1 != &stop_sched_class); #endif -#ifdef CONFIG_CACULE_SCHED - printk(KERN_INFO "CacULE CPU scheduler v5.13 by Hamad Al Marri."); +#ifdef CONFIG_CACULE_RDB + /* + * This patch is on top cacule-5.13.patch + */ + printk(KERN_INFO "CacULE CPU scheduler (RDB) v5.13 by Hamad Al Marri."); #endif wait_bit_init(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b7649507f511..0b663012a644 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -742,6 +742,10 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *_se) cfs_rq->head = NULL; cfs_rq->tail = NULL; +#ifdef CONFIG_CACULE_RDB + WRITE_ONCE(cfs_rq->IS_head, ~0); +#endif + } else if (se == cfs_rq->head) { // if it is the head cfs_rq->head = cfs_rq->head->next; @@ -1114,6 +1118,7 @@ static void update_curr_fair(struct rq *rq) static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACULE_RDB) u64 wait_start, prev_wait_start; if (!schedstat_enabled()) @@ -1127,11 +1132,13 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) wait_start -= prev_wait_start; __schedstat_set(se->statistics.wait_start, wait_start); +#endif } static inline void update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACULE_RDB) struct task_struct *p; u64 delta; @@ -1168,11 +1175,13 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) __schedstat_inc(se->statistics.wait_count); __schedstat_add(se->statistics.wait_sum, delta); __schedstat_set(se->statistics.wait_start, 0); +#endif } static inline void update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACULE_RDB) struct task_struct *tsk = NULL; u64 sleep_start, block_start; @@ -1236,6 +1245,7 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) account_scheduler_latency(tsk, delta >> 10, 0); } } +#endif } /* @@ -1244,6 +1254,7 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { +#if !defined(CONFIG_CACULE_RDB) if (!schedstat_enabled()) return; @@ -1256,11 +1267,13 @@ update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) update_stats_enqueue_sleeper(cfs_rq, se); +#endif } static inline void update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { +#if !defined(CONFIG_CACULE_RDB) if (!schedstat_enabled()) return; @@ -1281,6 +1294,7 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __schedstat_set(se->statistics.block_start, rq_clock(rq_of(cfs_rq))); } +#endif } /* @@ -3311,15 +3325,19 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) static inline void enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACULE_RDB) cfs_rq->avg.load_avg += se->avg.load_avg; cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; +#endif } static inline void dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { +#if !defined(CONFIG_CACULE_RDB) sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * 
se->avg.load_sum); +#endif } #else static inline void @@ -3574,6 +3592,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { +#if !defined(CONFIG_CACULE_RDB) u64 p_last_update_time; u64 n_last_update_time; @@ -3613,6 +3632,7 @@ void set_task_rq_fair(struct sched_entity *se, #endif __update_load_avg_blocked_se(p_last_update_time, se); se->avg.last_update_time = n_last_update_time; +#endif } @@ -3892,6 +3912,9 @@ static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) { +#ifdef CONFIG_CACULE_RDB + return 0; +#else unsigned long removed_load = 0, removed_util = 0, removed_runnable = 0; struct sched_avg *sa = &cfs_rq->avg; int decayed = 0; @@ -3937,8 +3960,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) #endif return decayed; +#endif } +#if !defined(CONFIG_CACULE_RDB) /** * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to @@ -4016,6 +4041,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s trace_pelt_cfs_tp(cfs_rq); } +#endif /* * Optional action to be done while updating the load average @@ -4027,6 +4053,7 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* Update task and its cfs_rq load average */ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { +#if !defined(CONFIG_CACULE_RDB) u64 now = cfs_rq_clock_pelt(cfs_rq); int decayed; @@ -4058,8 +4085,10 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s if (flags & UPDATE_TG) update_tg_load_avg(cfs_rq); } +#endif } +#if !defined(CONFIG_CACULE_RDB) #ifndef CONFIG_64BIT static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { @@ -4080,6 +4109,7 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) return cfs_rq->avg.last_update_time; } #endif +#endif /* * Synchronize entity load avg of dequeued entity without locking @@ -4087,11 +4117,13 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) */ static void sync_entity_load_avg(struct sched_entity *se) { +#if !defined(CONFIG_CACULE_RDB) struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); __update_load_avg_blocked_se(last_update_time, se); +#endif } /* @@ -4100,6 +4132,7 @@ static void sync_entity_load_avg(struct sched_entity *se) */ static void remove_entity_load_avg(struct sched_entity *se) { +#if !defined(CONFIG_CACULE_RDB) struct cfs_rq *cfs_rq = cfs_rq_of(se); unsigned long flags; @@ -4117,6 +4150,7 @@ static void remove_entity_load_avg(struct sched_entity *se) cfs_rq->removed.load_avg += se->avg.load_avg; cfs_rq->removed.runnable_avg += se->avg.runnable_avg; raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags); +#endif } static inline unsigned long cfs_rq_runnable_avg(struct cfs_rq *cfs_rq) @@ -4404,10 +4438,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) } #endif /* CONFIG_CACULE_SCHED */ +#if !defined(CONFIG_CACULE_RDB) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +#endif static inline void check_schedstat_required(void) { +#if !defined(CONFIG_CACULE_RDB) #ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) return; @@ -4424,6 +4461,7 @@ static inline void check_schedstat_required(void) "kernel.sched_schedstats=1\n"); } #endif +#endif } static 
inline bool cfs_bandwidth_used(void); @@ -4513,6 +4551,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; +#if !defined(CONFIG_CACULE_RDB) /* * When bandwidth control is enabled, cfs might have been removed * because of a parent been throttled but cfs->nr_running > 1. Try to @@ -4523,6 +4562,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (cfs_rq->nr_running == 1) check_enqueue_throttle(cfs_rq); +#endif } #if !defined(CONFIG_CACULE_SCHED) @@ -5053,6 +5093,9 @@ static int tg_throttle_down(struct task_group *tg, void *data) static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) { +#ifdef CONFIG_CACULE_RDB + return false; +#else struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; @@ -5130,10 +5173,12 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); return true; +#endif } void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) { +#if !defined(CONFIG_CACULE_RDB) struct rq *rq = rq_of(cfs_rq); struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; @@ -5215,6 +5260,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) resched_curr(rq); +#endif } static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) @@ -5667,7 +5713,11 @@ static inline bool cfs_bandwidth_used(void) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } + +#if !defined(CONFIG_CACULE_RDB) static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +#endif + static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} @@ -5798,7 +5848,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; +#if !defined(CONFIG_CACULE_RDB) int idle_h_nr_running = task_has_idle_policy(p); +#endif int task_new = !(flags & ENQUEUE_WAKEUP); /* @@ -5817,6 +5869,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (p->in_iowait) cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); +#ifdef CONFIG_CACULE_RDB + if (!se->on_rq) { + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, flags); + cfs_rq->h_nr_running++; + } +#else for_each_sched_entity(se) { if (se->on_rq) break; @@ -5854,6 +5913,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (throttled_hierarchy(cfs_rq)) list_add_leaf_cfs_rq(cfs_rq); } +#endif /* At this point se is NULL and we are at root level*/ add_nr_running(rq, 1); @@ -5875,6 +5935,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!task_new) update_overutilized_status(rq); +#if !defined(CONFIG_CACULE_RDB) enqueue_throttle: if (cfs_bandwidth_used()) { /* @@ -5890,6 +5951,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; } } +#endif assert_list_leaf_cfs_rq(rq); @@ -5910,6 +5972,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int task_sleep = flags & DEQUEUE_SLEEP; + +#ifdef CONFIG_CACULE_RDB + cfs_rq = cfs_rq_of(se); + dequeue_entity(cfs_rq, se, flags); + cfs_rq->h_nr_running--; +#else int idle_h_nr_running = task_has_idle_policy(p); 
bool was_sched_idle = sched_idle_rq(rq); @@ -5958,15 +6026,18 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) goto dequeue_throttle; } +#endif /* At this point se is NULL and we are at root level*/ sub_nr_running(rq, 1); +#if !defined(CONFIG_CACULE_RDB) /* balance early to pull high priority tasks */ if (unlikely(!was_sched_idle && sched_idle_rq(rq))) rq->next_balance = jiffies; dequeue_throttle: +#endif util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -6102,6 +6173,7 @@ static int wake_wide(struct task_struct *p) } #endif /* CONFIG_CACULE_SCHED */ +#if !defined(CONFIG_CACULE_RDB) /* * The purpose of wake_affine() is to quickly determine on which CPU we can run * soonest. For the purpose of speed we only consider the waking and previous @@ -6203,6 +6275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, schedstat_inc(p->se.statistics.nr_wakeups_affine); return target; } +#endif static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu); @@ -7120,13 +7193,15 @@ find_least_IS_cpu(struct task_struct *p) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { +#if !defined(CONFIG_CACULE_RDB) int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); - int new_cpu = prev_cpu; int want_affine = 0; /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; +#endif + int new_cpu = prev_cpu; #ifdef CONFIG_CACULE_SCHED struct sched_entity *se = &p->se; @@ -7147,7 +7222,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) new_cpu = prev_cpu; cfs_way: -#else +#else /* CONFIG_CACULE_SCHED */ if (wake_flags & WF_TTWU) { record_wakee(p); @@ -7162,6 +7237,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) } #endif /* CONFIG_CACULE_SCHED */ +#ifdef CONFIG_CACULE_RDB + return select_idle_sibling(p, prev_cpu, prev_cpu); +#else rcu_read_lock(); for_each_domain(cpu, tmp) { /* @@ -7196,9 +7274,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) rcu_read_unlock(); return new_cpu; +#endif /* CONFIG_CACULE_RDB */ } +#if !defined(CONFIG_CACULE_RDB) static void detach_entity_cfs_rq(struct sched_entity *se); +#endif /* * Called immediately before a task is migrated to a new CPU; task_cpu(p) and @@ -7235,6 +7316,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) } #endif /* CONFIG_CACULE_SCHED */ +#if !defined(CONFIG_CACULE_RDB) if (p->on_rq == TASK_ON_RQ_MIGRATING) { /* * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old' @@ -7254,6 +7336,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) */ remove_entity_load_avg(&p->se); } +#endif /* Tell new CPU we are migrated */ p->se.avg.last_update_time = 0; @@ -7553,11 +7636,23 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf if (prev) put_prev_task(rq, prev); +#ifdef CONFIG_CACULE_RDB + se = pick_next_entity(cfs_rq, NULL); + set_next_entity(cfs_rq, se); + + if (cfs_rq->head) { + unsigned int IS_head = calc_interactivity(sched_clock(), cfs_rq->head); + WRITE_ONCE(cfs_rq->IS_head, IS_head); + } else { + WRITE_ONCE(cfs_rq->IS_head, ~0); + } +#else do { se = pick_next_entity(cfs_rq, NULL); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); +#endif p = task_of(se); @@ -7579,6 +7674,10 @@ done: __maybe_unused; return p; idle: +#ifdef 
CONFIG_CACULE_RDB + WRITE_ONCE(cfs_rq->IS_head, ~0); +#endif + if (!rf) return NULL; @@ -7885,6 +7984,7 @@ struct lb_env { struct list_head tasks; }; +#if !defined(CONFIG_CACULE_RDB) /* * Is this task likely cache-hot: */ @@ -8306,6 +8406,7 @@ static void attach_tasks(struct lb_env *env) rq_unlock(env->dst_rq, &rf); } +#endif #ifdef CONFIG_NO_HZ_COMMON static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq) @@ -8355,6 +8456,7 @@ static inline void update_blocked_load_tick(struct rq *rq) {} static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) {} #endif +#if !defined(CONFIG_CACULE_RDB) static bool __update_blocked_others(struct rq *rq, bool *done) { const struct sched_class *curr_class; @@ -8380,6 +8482,7 @@ static bool __update_blocked_others(struct rq *rq, bool *done) return decayed; } +#endif #ifdef CONFIG_FAIR_GROUP_SCHED @@ -8487,6 +8590,7 @@ static unsigned long task_h_load(struct task_struct *p) cfs_rq_load_avg(cfs_rq) + 1); } #else +#if !defined(CONFIG_CACULE_RDB) static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -8498,6 +8602,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) return decayed; } +#endif static unsigned long task_h_load(struct task_struct *p) { @@ -8505,6 +8610,7 @@ static unsigned long task_h_load(struct task_struct *p) } #endif +#if !defined(CONFIG_CACULE_RDB) static void update_blocked_averages(int cpu) { bool decayed = false, done = true; @@ -8523,6 +8629,7 @@ static void update_blocked_averages(int cpu) cpufreq_update_util(rq, 0); rq_unlock_irqrestore(rq, &rf); } +#endif /********** Helpers for find_busiest_group ************************/ @@ -9626,6 +9733,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * different in groups. */ +#if !defined(CONFIG_CACULE_RDB) /** * find_busiest_group - Returns the busiest group within the sched_domain * if there is an imbalance. @@ -9894,6 +10002,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, return busiest; } +#endif /* * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but @@ -9930,6 +10039,7 @@ imbalanced_active_balance(struct lb_env *env) return 0; } +#if !defined(CONFIG_CACULE_RDB) static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; @@ -10262,6 +10372,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, out: return ld_moved; } +#endif static inline unsigned long get_sd_balance_interval(struct sched_domain *sd, int cpu_busy) @@ -10300,6 +10411,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) *next_balance = next; } +#if !defined(CONFIG_CACULE_RDB) /* * active_load_balance_cpu_stop is run by the CPU stopper. It pushes * running tasks off the busiest CPU onto idle CPUs. It requires at @@ -10385,6 +10497,7 @@ static int active_load_balance_cpu_stop(void *data) } static DEFINE_SPINLOCK(balancing); +#endif /* * Scale the max load_balance interval with the number of CPUs in the system. @@ -10395,6 +10508,7 @@ void update_max_interval(void) max_load_balance_interval = HZ*num_online_cpus()/10; } +#if !defined(CONFIG_CACULE_RDB) /* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. 
@@ -10487,6 +10601,7 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle) rq->next_balance = next_balance; } +#endif static inline int on_null_domain(struct rq *rq) { @@ -10520,6 +10635,7 @@ static inline int find_new_ilb(void) return nr_cpu_ids; } +#if !defined(CONFIG_CACULE_RDB) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any * idle CPU in the HK_FLAG_MISC housekeeping set (if there is one). @@ -10670,6 +10786,7 @@ static void nohz_balancer_kick(struct rq *rq) if (flags) kick_ilb(flags); } +#endif /* CONFIG_CACULE_RDB */ static void set_cpu_sd_state_busy(int cpu) { @@ -10777,6 +10894,7 @@ void nohz_balance_enter_idle(int cpu) WRITE_ONCE(nohz.has_blocked, 1); } +#if !defined(CONFIG_CACULE_RDB) static bool update_nohz_stats(struct rq *rq) { unsigned int cpu = rq->cpu; @@ -10955,8 +11073,10 @@ static void nohz_newidle_balance(struct rq *this_rq) */ atomic_or(NOHZ_NEWILB_KICK, nohz_flags(this_cpu)); } +#endif #else /* !CONFIG_NO_HZ_COMMON */ +#if !defined(CONFIG_CACULE_RDB) static inline void nohz_balancer_kick(struct rq *rq) { } static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) @@ -10965,8 +11085,130 @@ static inline bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle } static inline void nohz_newidle_balance(struct rq *this_rq) { } +#endif + #endif /* CONFIG_NO_HZ_COMMON */ +#ifdef CONFIG_CACULE_RDB +static int +can_migrate_task(struct task_struct *p, int dst_cpu, struct rq *src_rq) +{ + if (task_running(src_rq, p)) + return 0; + + if (!cpumask_test_cpu(dst_cpu, p->cpus_ptr)) + return 0; + + if (p->se.exec_start == 0) + return 0; + + return 1; +} + +static void push_to_unlock(struct rq *this_rq, + struct rq *dst_rq, + struct task_struct *p, + int dst_cpu) +{ + struct rq_flags rf; + + // detach task + deactivate_task(this_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); + + // unlock this rq + raw_spin_unlock(&this_rq->lock); + + /* push to */ + rq_lock_irqsave(dst_rq, &rf); + update_rq_clock(dst_rq); + + activate_task(dst_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(dst_rq, p, 0); + + // unlock src rq + rq_unlock(dst_rq, &rf); + local_irq_restore(rf.flags); +} + +static void pull_from_unlock(struct rq *this_rq, + struct rq *src_rq, + struct rq_flags *rf, + struct task_struct *p, + int dst_cpu) +{ + // detach task + deactivate_task(src_rq, p, DEQUEUE_NOCLOCK); + set_task_cpu(p, dst_cpu); + + // unlock src rq + rq_unlock(src_rq, rf); + local_irq_restore(rf->flags); + + // lock this rq + raw_spin_lock(&this_rq->lock); + update_rq_clock(this_rq); + + activate_task(this_rq, p, ENQUEUE_NOCLOCK); + check_preempt_curr(this_rq, p, 0); + + // unlock this rq + raw_spin_unlock(&this_rq->lock); +} + +static inline struct rq * +find_max_IS_rq(struct cfs_rq *cfs_rq, int dst_cpu) +{ + struct rq *tmp_rq, *max_rq = NULL; + int cpu; + u32 max_IS = cfs_rq->IS_head; + u32 local_IS; + + // find max hrrn + for_each_online_cpu(cpu) { + if (cpu == dst_cpu) + continue; + + tmp_rq = cpu_rq(cpu); + + if (tmp_rq->cfs.nr_running < 2 || !tmp_rq->cfs.head) + continue; + + local_IS = READ_ONCE(tmp_rq->cfs.IS_head); + + if (local_IS < max_IS) { + max_IS = local_IS; + max_rq = tmp_rq; + } + } + + return max_rq; +} + +static int try_pull_from(struct rq *src_rq, struct rq *this_rq) +{ + struct rq_flags rf; + int dst_cpu = cpu_of(this_rq); + struct task_struct *p; + + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.head && src_rq->cfs.nr_running > 1) { + p = task_of(se_of(src_rq->cfs.head)); + + if 
(can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &rf, p, dst_cpu); + return 1; + } + } + + rq_unlock(src_rq, &rf); + local_irq_restore(rf.flags); + + return 0; +} + /* * newidle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. @@ -10977,6 +11219,105 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * > 0 - success, new (fair) tasks present */ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) +{ + int this_cpu = this_rq->cpu; + struct task_struct *p = NULL; + struct rq *src_rq; + int src_cpu; + struct rq_flags src_rf; + int pulled_task = 0; + int cores_round = 1; + + update_misfit_status(NULL, this_rq); + /* + * We must set idle_stamp _before_ calling idle_balance(), such that we + * measure the duration of idle_balance() as idle time. + */ + this_rq->idle_stamp = rq_clock(this_rq); + + /* + * Do not pull tasks towards !active CPUs... + */ + if (!cpu_active(this_cpu)) + return 0; + + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. + */ + rq_unpin_lock(this_rq, rf); + raw_spin_unlock(&this_rq->lock); + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == this_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, this_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.nr_running < 2 || !(src_rq->cfs.head)) + goto next; + + p = task_of(se_of(src_rq->cfs.head)); + + if (can_migrate_task(p, this_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &src_rf, p, this_cpu); + + pulled_task = 1; + goto out; + } + +next: + rq_unlock(src_rq, &src_rf); + local_irq_restore(src_rf.flags); + + /* + * Stop searching for tasks to pull if there are + * now runnable tasks on this rq. + */ + if (pulled_task || this_rq->nr_running > 0) + goto out; + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } + +out: + raw_spin_lock(&this_rq->lock); + + /* + * While browsing the domains, we released the rq lock, a task could + * have been enqueued in the meantime. Since we're not going idle, + * pretend we pulled a task. + */ + if (this_rq->cfs.h_nr_running && !pulled_task) + pulled_task = 1; + + /* Is there a task of a high priority class? 
*/ + if (this_rq->nr_running != this_rq->cfs.h_nr_running) + pulled_task = -1; + + if (pulled_task) + this_rq->idle_stamp = 0; + + rq_repin_lock(this_rq, rf); + + return pulled_task; +} +#else +static int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -11130,6 +11471,214 @@ void trigger_load_balance(struct rq *rq) nohz_balancer_kick(rq); } +#endif + +#ifdef CONFIG_CACULE_RDB +static int +idle_try_pull_any(struct cfs_rq *cfs_rq) +{ + struct task_struct *p = NULL; + struct rq *this_rq = rq_of(cfs_rq), *src_rq; + int dst_cpu = cpu_of(this_rq); + int src_cpu; + struct rq_flags rf; + int pulled = 0; + int cores_round = 1; + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == dst_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, dst_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + rq_lock_irqsave(src_rq, &rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.nr_running < 2 || !(src_rq->cfs.head)) + goto next; + + p = task_of(se_of(src_rq->cfs.head)); + + if (can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &rf, p, dst_cpu); + pulled = 1; + goto out; + } + +next: + rq_unlock(src_rq, &rf); + local_irq_restore(rf.flags); + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } + +out: + return pulled; +} + + +static int +try_pull_higher_IS(struct cfs_rq *cfs_rq) +{ + struct rq *this_rq = rq_of(cfs_rq), *max_rq; + int dst_cpu = cpu_of(this_rq); + + max_rq = find_max_IS_rq(cfs_rq, dst_cpu); + + if (!max_rq) + return 0; + + if (try_pull_from(max_rq, this_rq)) + return 1; + + return 0; +} + +static void try_push_any(struct rq *this_rq) +{ + struct task_struct *p = NULL; + struct rq *dst_rq; + int dst_cpu; + int src_cpu = cpu_of(this_rq); + int cores_round = 1; + +again: + for_each_online_cpu(dst_cpu) { + + if (dst_cpu == src_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, dst_cpu)) + continue; + + dst_rq = cpu_rq(dst_cpu); + + if (dst_rq->cfs.nr_running >= this_rq->cfs.nr_running - 1) + continue; + + // lock this rq + raw_spin_lock(&this_rq->lock); + update_rq_clock(this_rq); + + if (!this_rq->cfs.head) { + // unlock this rq + raw_spin_unlock(&this_rq->lock); + return; + } + + p = task_of(se_of(this_rq->cfs.head)); + + if (can_migrate_task(p, dst_cpu, this_rq)) { + push_to_unlock(this_rq, dst_rq, p, dst_cpu); + return; + } + + // unlock this rq + raw_spin_unlock(&this_rq->lock); + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } +} + +static void try_pull_any(struct rq *this_rq) +{ + struct task_struct *p = NULL; + struct rq *src_rq; + int dst_cpu = cpu_of(this_rq); + int src_cpu; + struct rq_flags src_rf; + int cores_round = 1; + unsigned int this_head = this_rq->cfs.IS_head; + +again: + for_each_online_cpu(src_cpu) { + + if (src_cpu == dst_cpu) + continue; + + if (cores_round && !cpus_share_cache(src_cpu, dst_cpu)) + continue; + + src_rq = cpu_rq(src_cpu); + + if (src_rq->cfs.nr_running < 2 || !(src_rq->cfs.head) + || READ_ONCE(src_rq->cfs.IS_head) >= this_head) + continue; + + rq_lock_irqsave(src_rq, &src_rf); + update_rq_clock(src_rq); + + if (src_rq->cfs.nr_running < 2 || !(src_rq->cfs.head) + || src_rq->cfs.IS_head >= this_head) + goto next; + + p = task_of(se_of(src_rq->cfs.head)); + + if (can_migrate_task(p, dst_cpu, src_rq)) { + pull_from_unlock(this_rq, src_rq, &src_rf, p, dst_cpu); + return; + } + +next: + rq_unlock(src_rq, &src_rf); + 
local_irq_restore(src_rf.flags); + } + + if (cores_round) { + // now search for all cpus + cores_round = 0; + goto again; + } +} + +static inline void +active_balance(struct rq *rq) +{ + struct cfs_rq *cfs_rq = &rq->cfs; + + if (!cfs_rq->head || cfs_rq->nr_running < 2) + try_pull_higher_IS(&rq->cfs); + else { + try_push_any(rq); + try_pull_any(rq); + } +} + +void trigger_load_balance(struct rq *rq) +{ + unsigned long interval; + +#ifdef CONFIG_RDB_INTERVAL + if (time_before(jiffies, rq->next_balance)) + return; +#endif + + if (rq->idle_balance) + idle_try_pull_any(&rq->cfs); + else { + active_balance(rq); + +#ifdef CONFIG_RDB_INTERVAL + /* scale ms to jiffies */ + interval = msecs_to_jiffies(CONFIG_RDB_INTERVAL); + rq->next_balance = jiffies + interval; +#endif + } +} +#endif static void rq_online_fair(struct rq *rq) { @@ -11308,9 +11857,12 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) } } #else +#if !defined(CONFIG_CACULE_RDB) static void propagate_entity_cfs_rq(struct sched_entity *se) { } #endif +#endif +#if !defined(CONFIG_CACULE_RDB) static void detach_entity_cfs_rq(struct sched_entity *se) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -11321,9 +11873,11 @@ static void detach_entity_cfs_rq(struct sched_entity *se) update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); } +#endif static void attach_entity_cfs_rq(struct sched_entity *se) { +#if !defined(CONFIG_CACULE_RDB) struct cfs_rq *cfs_rq = cfs_rq_of(se); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -11339,10 +11893,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se) attach_entity_load_avg(cfs_rq, se); update_tg_load_avg(cfs_rq); propagate_entity_cfs_rq(se); +#endif } static void detach_task_cfs_rq(struct task_struct *p) { +#if !defined(CONFIG_CACULE_RDB) struct sched_entity *se = &p->se; #if !defined(CONFIG_CACULE_SCHED) @@ -11359,10 +11915,12 @@ static void detach_task_cfs_rq(struct task_struct *p) #endif detach_entity_cfs_rq(se); +#endif } static void attach_task_cfs_rq(struct task_struct *p) { +#if !defined(CONFIG_CACULE_RDB) struct sched_entity *se = &p->se; #if !defined(CONFIG_CACULE_SCHED) @@ -11375,6 +11933,7 @@ static void attach_task_cfs_rq(struct task_struct *p) if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; #endif +#endif } static void switched_from_fair(struct rq *rq, struct task_struct *p) @@ -11770,7 +12329,9 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m) __init void init_sched_fair_class(void) { #ifdef CONFIG_SMP +#if !defined(CONFIG_CACULE_RDB) open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); +#endif #ifdef CONFIG_NO_HZ_COMMON nohz.next_balance = jiffies; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7ca3d3d86c2a..a7422dea8a9f 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -262,10 +262,12 @@ static void do_idle(void) { int cpu = smp_processor_id(); +#if !defined(CONFIG_CACULE_RDB) /* * Check if we need to update blocked load */ nohz_run_idle_balance(cpu); +#endif /* * If the arch has a polling bit, we maintain an invariant: diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0affe3be7c21..576909c92fb5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -544,6 +544,10 @@ struct cfs_rq { struct cacule_node *head; struct cacule_node *tail; +#ifdef CONFIG_CACULE_RDB + unsigned int IS_head; +#endif + #else struct sched_entity *next; struct sched_entity *last; @@ -951,7 +955,6 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; struct dl_rq dl; - #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this 
CPU: */ struct list_head leaf_cfs_rq_list;
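
Note (illustration only, not part of the patch): the RDB_INTERVAL gating described in the Kconfig help and implemented in the RDB version of trigger_load_balance() above can be pictured with the minimal userspace sketch below. HZ, the interval value, and the jiffies helpers here are simplified stand-ins for the kernel's, chosen only to show how many ticks actually perform a balance pass.

#include <stdio.h>

#define HZ 1000			/* assumed tick rate */
#define CONFIG_RDB_INTERVAL 4	/* assumed Kconfig value, in ms */

static unsigned long jiffies;	/* stand-in for the kernel's tick counter */

/* simplified stand-in for the kernel's msecs_to_jiffies(), rounding up */
static unsigned long msecs_to_jiffies(unsigned int ms)
{
	return ((unsigned long)ms * HZ + 999) / 1000;
}

/* simplified stand-in for the kernel's time_before() macro */
static int time_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;
}

int main(void)
{
	unsigned long next_balance = 0;
	int passes = 0;

	/* simulate 20 scheduler ticks (20ms at HZ=1000) */
	for (jiffies = 0; jiffies < 20; jiffies++) {
		if (time_before(jiffies, next_balance))
			continue;	/* RDB skips balancing on this tick */

		passes++;
		next_balance = jiffies + msecs_to_jiffies(CONFIG_RDB_INTERVAL);
		printf("tick %2lu: balance pass #%d, next allowed at tick %lu\n",
		       jiffies, passes, next_balance);
	}
	return 0;
}

With these assumed values only every fourth tick runs a balance pass; newidle_balance() is not gated by the interval.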
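
Note (illustration only, not part of the patch): the pull-side comparison behind find_max_IS_rq() and the IS_head check in try_pull_any() boils down to "prefer a remote queue whose cached IS_head is lower than ours" (empty queues park IS_head at ~0, so they never qualify). The sketch below mirrors that comparison with hypothetical per-CPU snapshots; struct rq_snap, pick_src_cpu() and the sample numbers are invented for illustration.

#include <stdio.h>

#define NR_CPUS 4

/* Hypothetical snapshot of what RDB looks at per CPU: how many fair
 * tasks are queued and the cached interactivity score (IS) of the
 * queue head. Lower IS wins; ~0u marks an empty queue. */
struct rq_snap {
	unsigned int nr_running;
	unsigned int is_head;
};

/* Mirrors the selection rule in find_max_IS_rq(): skip ourselves and
 * any queue with fewer than two tasks, then take the lowest IS_head
 * that is still lower than our own. Returns -1 if nothing qualifies. */
static int pick_src_cpu(const struct rq_snap *rqs, int this_cpu)
{
	unsigned int best = rqs[this_cpu].is_head;
	int best_cpu = -1;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu == this_cpu || rqs[cpu].nr_running < 2)
			continue;
		if (rqs[cpu].is_head < best) {
			best = rqs[cpu].is_head;
			best_cpu = cpu;
		}
	}
	return best_cpu;
}

int main(void)
{
	/* Invented numbers: CPU 3 has only one task so it is never a
	 * source; CPU 1's head has the best (lowest) score. */
	struct rq_snap rqs[NR_CPUS] = {
		{ .nr_running = 1, .is_head = 900 },
		{ .nr_running = 3, .is_head = 250 },
		{ .nr_running = 2, .is_head = 400 },
		{ .nr_running = 1, .is_head = 100 },
	};

	printf("CPU 0 pulls from CPU %d\n", pick_src_cpu(rqs, 0)); /* -> CPU 1 */
	return 0;
}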