
[RFD,10/10] sched: io_latency: Tracking via buckets

Message ID 1413986273-28522-11-git-send-email-daniel.lezcano@linaro.org
State New

Commit Message

Daniel Lezcano Oct. 22, 2014, 1:57 p.m. UTC
The io latency tracking mechanism was recently added to the energy aware
scheduler kernel tree. The purpose of this framework is to provide a way to
predict IO latencies, in other words to guess how long we will be sleeping
while waiting for an IO to complete. When the cpu goes idle, the timer tells
us how long the sleep will last, but for the other wakeup sources we rely on
the statistics of the menu governor, which is part of the cpuidle framework.

The io latency tracking provides additional information about the expected
sleep time which, combined with the timer duration, should give us a more
accurate prediction.
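
How the two are combined is not shown in this patch; as a plausible sketch
(an assumption, not code from the series), the idle governor would use the
earliest of the two expected wakeups:

	/* hypothetical helper: the next wakeup is the earliest of the two */
	static s64 expected_sleep_us(s64 timer_us, s64 io_latency_us)
	{
		/* io_latency_us == 0 means no IO prediction is available */
		if (!io_latency_us)
			return timer_us;

		return min(timer_us, io_latency_us);
	}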

The first step of the io latency tracking simply used a sliding average of
the values, which is not very accurate as it is not immune to IO ping-pong
or large variations.

In order to improve that, each latency is grouped into a bucket representing
an interval of latency, and a sliding average is computed for each bucket.
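
The sliding average is the same first-order filter used elsewhere in the
patch: each new sample is weighted by 1/64 (the "diff >> 6" shift in the
code). A minimal sketch of the per-bucket update:

	/*
	 * avg[i] = avg[i-1] + (x[i] - avg[i-1]) / 64
	 * (matches "bucket->average += diff >> 6" in the patch)
	 */
	static inline void bucket_avg_update(int *avg, int latency_us)
	{
		*avg += (latency_us - *avg) >> 6;
	}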

Why? Because we don't want to keep every individual latency and compute
statistics on all of them: it does not make sense, it costs a lot of memory
and computation time, and in the end the result cannot really be resolved
mathematically. It is better to use intervals to group the small variations
of the latencies. For example, with a 100us interval, 186us, 123us and 134us
all fall into the bucket [100 - 199].
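
A minimal sketch of the mapping (the patch does the equivalent in
io_latency_bucket_index(), with BUCKET_INTERVAL set to 200us):

	#define BUCKET_INTERVAL_US	100	/* 100us here, for the example above */

	static inline int bucket_index(int latency_us)
	{
		/* 186, 123 and 134 all return 1, i.e. the [100 - 199] bucket */
		return latency_us / BUCKET_INTERVAL_US;
	}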

The size of a bucket is the bucket interval and represents the resolution
of the statistical model. E.g. with a bucket interval of 1us we end up
doing statistics on nearly every individual value, which of course gives
bad predictions because the number of distinct latencies is large. A
larger interval gives better statistics, but can lead to mispredictions
as the interval gets wider.

Choosing the bucket interval size versus the idle sleep time is the
tradeoff to find. With a 200us bucket interval, the measurements show we
still get good predictions, fewer mispredictions, and we cover the idle
state target residencies.

The buckets are dynamically created and stored in a list. A new bucket is
added at the end of the list.

The list keeps reordering itself depending on the number of successive hits
a bucket gets: a bucket hit several times in a row is promoted to the head
of the list.

The guessed next latency, which is a bucket (i.e. it will fall between
e.g. 200us and 300us with a bucket interval of 100us), is retrieved from
the list. Each bucket in the list gets a score: the more hits a bucket
has, the bigger its score. *But* the score is weighted by the position in
the list, so the first elements weigh more than the last ones. This
position changes dynamically when a bucket is hit several times in a row.
A sketch of the scoring is shown after the example below.

Example with the following latencies:
	10, 100, 100, 100, 100, 100, 10, 10

We will have two buckets: 0 and 1.

	10  => bucket0(1)
	100 => bucket0(1), bucket1(1)
	100 => bucket0(1), bucket1(2)
	100 => bucket0(1), bucket1(3)
	100 => bucket0(1), bucket1(4)
*	100 => bucket1(5), bucket0(1)
	10  => bucket1(5), bucket0(2)
	10  => bucket1(5), bucket0(3)

At (*), bucket1 reached 5 successive hits and has been moved to the
beginning of the list, while bucket0 became the second element.
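
As a rough sketch of the scoring done in io_latency_guessed_bucket(): each
bucket's hit count is divided by (2 * position + 1), position 0 being the
head of the list. With the final state above, bucket1 (5 hits, position 0)
scores 5 and bucket0 (3 hits, position 1) scores 1, so bucket1 is the
guessed next latency interval.

	/* user-space style sketch of the selection loop, not the actual code */
	struct bucket_sketch {
		int hits;
		int index;
	};

	static int guess_index(const struct bucket_sketch *list, int nr)
	{
		int pos, score, score_max = 0, winner = -1;

		for (pos = 0; pos < nr; pos++) {
			score = list[pos].hits / (2 * pos + 1);
			if (score >= score_max) {
				score_max = score;
				winner = list[pos].index;
			}
		}

		return winner; /* bucket index of the guess, -1 if the list is empty */
	}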

Signed-off-by: Daniel Lezcano <daniel.lezcano@linaro.org>
---
 include/linux/sched.h     |   8 ++
 kernel/exit.c             |   1 +
 kernel/fork.c             |   1 +
 kernel/sched/core.c       |   3 +-
 kernel/sched/io_latency.c | 309 ++++++++++++++++++++++++++++++++++++++++++----
 kernel/sched/io_latency.h |   8 +-
 6 files changed, 304 insertions(+), 26 deletions(-)

Comments

Peter Zijlstra Oct. 30, 2014, 2:46 p.m. UTC | #1
On Wed, Oct 22, 2014 at 03:57:53PM +0200, Daniel Lezcano wrote:
> The size of a bucket is the bucket interval and represents the resolution
> of the statistical model. E.g. with a bucket interval of 1us we end up
> doing statistics on nearly every individual value, which of course gives
> bad predictions because the number of distinct latencies is large. A
> larger interval gives better statistics, but can lead to mispredictions
> as the interval gets wider.
> 
> Choosing the bucket interval size versus the idle sleep time is the
> tradeoff to find. With a 200us bucket interval, the measurements show we
> still get good predictions, fewer mispredictions, and we cover the idle
> state target residencies.
> 

For the record:

The suggestion in DUS was to align the bucket sizes with the various
break even times of the actual C states.

I forgot who suggested this; it might have been Morten; but it seems to
make sense.
Nicolas Pitre Oct. 30, 2014, 3:07 p.m. UTC | #2
On Thu, 30 Oct 2014, Peter Zijlstra wrote:

> On Wed, Oct 22, 2014 at 03:57:53PM +0200, Daniel Lezcano wrote:
> > The size of a bucket is the bucket interval and represents the resolution
> > of the statistical model. E.g. with a bucket interval of 1us we end up
> > doing statistics on nearly every individual value, which of course gives
> > bad predictions because the number of distinct latencies is large. A
> > larger interval gives better statistics, but can lead to mispredictions
> > as the interval gets wider.
> > 
> > Choosing the bucket interval size versus the idle sleep time is the
> > tradeoff to find. With a 200us bucket interval, the measurements show we
> > still get good predictions, fewer mispredictions, and we cover the idle
> > state target residencies.
> > 
> 
> For the record:
> 
> The suggestion in DUS was to align the bucket sizes with the various
> break even times of the actual C states.
> 
> I forgot who suggested this; it might have been Morten; but it seems to
> make sense.

I suggested that a while ago too.  But IIRC Daniel envisioned some 
possible additional use for those buckets in the future, e.g. a measure 
of how many tasks can be packed on a CPU knowing their sleep periods.

However, if the IO latency can be predicted at the driver level with 
good accuracy (it normally should) then there is no need to do any kind 
of averaging and bucketing since the driver should be able to provide a 
relatively precise number almost every time.


Nicolas

Patch

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6af032b..9652ad6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1228,7 +1228,15 @@  struct io_latency_node {
 	unsigned int avg_latency;
 	ktime_t start_time;
 	ktime_t end_time;
+	struct list_head bucket_list;
 };
+
+void exit_io_latency(struct task_struct *tsk);
+#else
+static inline void exit_io_latency(struct task_struct *tsk)
+{
+	;
+}
 #endif
 
 struct task_struct {
diff --git a/kernel/exit.c b/kernel/exit.c
index 32c58f7..3413fbe 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -757,6 +757,7 @@  void do_exit(long code)
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
 	exit_thread();
+	exit_io_latency(tsk);
 
 	/*
 	 * Flush inherited counters to the parent - before the parent
diff --git a/kernel/fork.c b/kernel/fork.c
index 7201bc4..d4e7ecc 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -347,6 +347,7 @@  static struct task_struct *dup_task_struct(struct task_struct *orig)
 	tsk->task_frag.page = NULL;
 #ifdef CONFIG_SCHED_IO_LATENCY
 	tsk->io_latency.avg_latency = 0;
+	INIT_LIST_HEAD(&tsk->io_latency.bucket_list);
 #endif
 	account_kernel_stack(ti, 1);
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 64181f6..96403f2 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6961,6 +6961,8 @@  void __init sched_init(void)
 	autogroup_init(&init_task);
 
 #endif /* CONFIG_CGROUP_SCHED */
+	
+	io_latency_init();
 
 	for_each_possible_cpu(i) {
 		struct rq *rq;
@@ -7035,7 +7037,6 @@  void __init sched_init(void)
 #endif
 		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
-		io_latency_init(rq);
 	}
 
 	set_load_weight(&init_task);
diff --git a/kernel/sched/io_latency.c b/kernel/sched/io_latency.c
index 2d56a38..5f6bd50 100644
--- a/kernel/sched/io_latency.c
+++ b/kernel/sched/io_latency.c
@@ -23,23 +23,280 @@  struct io_latency_tree {
 	struct io_latency_node *left_most;
 };
 
+/*
+ * This represents the resolution of the statistics in usec; the latency
+ * for a bucket is BUCKET_INTERVAL * index.
+ * The higher the resolution, the less accurate the prediction will be.
+ * Some measurements:
+ *
+ * For 1ms:
+ *  SSD 6Gb/s       : 99.7%
+ *  SD card class 10: 97.7%
+ *  SD card class 4 : 54.3%
+ *  HDD on USB      : 93.6%
+ *
+ * For 500us:
+ *  SSD 6Gb/s               : 99.9%
+ *  SD card class 10        : 96.8%
+ *  SD card class 4         : 55.8%
+ *  HDD on USB              : 86.3%
+ *
+ * For 200us:
+ *  SSD 6Gb/s               : 99.7%
+ *  SD card class 10        : 95.5%
+ *  SD card class 4         : 29.5%
+ *  HDD on USB              : 66.3%
+ *
+ * For 100us:
+ *  SSD 6Gb/s               : 85.7%
+ *  SD card class 10        : 67.63%
+ *  SD card class 4         : 31.4%
+ *  HDD on USB              : 44.97%
+ *
+ * Aiming at 100% is not necessarily good because we want to hit the correct
+ * idle state. Setting a low resolution groups the different latencies into
+ * a big interval which may overlap with the cpuidle state target
+ * residencies.
+ *
+ */
+#define BUCKET_INTERVAL 200
+
+/*
+ * Number of successive hits for the same bucket. That is the threshold
+ * triggering the move of the element to the beginning of the list, making
+ * it more heavily weighted in the statistics when guessing the next
+ * latency.
+ */
+#define BUCKET_SUCCESSIVE 5
+
+/*
+ * What is a bucket ?
+ *
+ * A bucket is an interval of latency. This interval is defined with the
+ * BUCKET_INTERVAL. The bucket index tells which latency interval we are in.
+ * For example, with an index of 2 and a bucket interval of 1000usec, the
+ * bucket contains the latencies between 2000 and 2999 usec.
+ *
+ */
+struct bucket {
+	int hits;
+	int successive_hits;
+	int index;
+	int average;
+	struct list_head list;
+};
+
+static struct kmem_cache *bucket_cachep;
+
 static DEFINE_PER_CPU(struct io_latency_tree, latency_trees);
 
 /**
- * io_latency_init : initialization routine to be called for each possible cpu.
+ * io_latency_bucket_find - Find a bucket associated with the specified index
  *
- * @rq: the runqueue associated with the cpu
+ * @index: the index of the bucket to find
+ * @tsk: the task whose bucket list is searched
  *
+ * Returns the bucket associated with the index, NULL if no bucket is found
  */
-void io_latency_init(struct rq *rq)
+static struct bucket *io_latency_bucket_find(struct task_struct *tsk, int index)
 {
-	int cpu = rq->cpu;
-	struct io_latency_tree *latency_tree = &per_cpu(latency_trees, cpu);
-	struct rb_root *root = &latency_tree->tree;
+	struct list_head *list;
+	struct bucket *bucket = NULL;
+	struct list_head *bucket_list = &tsk->io_latency.bucket_list;
 
-	spin_lock_init(&latency_tree->lock);
-	latency_tree->left_most = NULL;
-	root->rb_node = NULL;
+	list_for_each(list, bucket_list) {
+
+		bucket = list_entry(list, struct bucket, list);
+
+		if (bucket->index == index)
+			return bucket;
+	}
+
+	return NULL;
+}
+
+/**
+ * io_latency_bucket_alloc - Allocate a bucket
+ *
+ * @index: index of the bucket to allocate
+ *
+ * Allocate and initialize a bucket structure
+ *
+ * Returns a pointer to a bucket, or NULL if the allocation failed
+ */
+static struct bucket *io_latency_bucket_alloc(int index)
+{
+	struct bucket *bucket;
+
+	bucket = kmem_cache_alloc(bucket_cachep, GFP_KERNEL);
+	if (bucket) {
+		bucket->hits  = 0;
+		bucket->successive_hits = 0;
+		bucket->index = index;
+		bucket->average = 0;
+		INIT_LIST_HEAD(&bucket->list);
+	}
+
+	return bucket;
+}
+
+/**
+ * io_latency_guessed_bucket - try to predict the next bucket
+ *
+ * @tsk: the task to get the bucket list
+ *
+ * The list is ordered by history. The first element is the one with
+ * the most *successive* hits. This function is called each time a new
+ * latency is inserted. The algorithm is pretty simple here: as the
+ * first element is the one most likely to occur next, it gets the
+ * biggest weight, the second one gets less weight, etc ...
+ *
+ * The bucket with the maximum score (number of hits weighted by
+ * its position in the list) is the bucket which is most likely to
+ * occur next.
+ *
+ * Returns a pointer to the bucket structure, NULL if there are no
+ * buckets in the list
+ */
+static struct bucket *io_latency_guessed_bucket(struct task_struct *tsk)
+{
+	int weight = 0;
+	int score, score_max = 0;
+	struct bucket *bucket, *winner = NULL;
+	struct list_head *list = NULL;
+	struct list_head *bucket_list = &tsk->io_latency.bucket_list;
+
+	if (list_empty(bucket_list))
+		return NULL;
+
+	list_for_each(list, bucket_list) {
+
+		bucket = list_entry(list, struct bucket, list);
+
+		/*
+		 * The list is ordered by history, the first element has
+		 * more weight than the next one
+		 */
+		score = bucket->hits / ((2 * weight) + 1);
+
+		weight++;
+
+		if (score < score_max)
+			continue;
+
+		score_max = score;
+		winner = bucket;
+	}
+
+	return winner;
+}
+
+/*
+ * io_latency_bucket_index - Returns the bucket index for the specified latency
+ *
+ * @latency: the IO latency, in usec, to map to a bucket
+ *
+ * Returns an integer for the bucket's index
+ */
+static int io_latency_bucket_index(int latency)
+{
+	return latency / BUCKET_INTERVAL;
+}
+
+/*
+ * io_latency_bucket_fill - Compute and fill the bucket list
+ *
+ * @tsk: the task completing an IO
+ * @latency: the latency of the IO
+ *
+ * The dynamics of the list are the following:
+ * - Each new element is inserted at the end of the list
+ * - Each element passing <BUCKET_SUCCESSIVE> times through this function
+ *   is promoted to the beginning of the list
+ *
+ * Returns 0 on success, -1 if a bucket allocation failed
+ */
+static int io_latency_bucket_fill(struct task_struct *tsk, int latency)
+{
+	int diff, index = io_latency_bucket_index(latency);
+	struct bucket *bucket;
+
+	/*
+	 * Find the bucket associated with the index
+	 */
+	bucket = io_latency_bucket_find(tsk, index);
+	if (!bucket) {
+		bucket = io_latency_bucket_alloc(index);
+		if (!bucket)
+			return -1;
+
+		list_add_tail(&bucket->list, &tsk->io_latency.bucket_list);
+	}
+
+	/*
+	 * Increase the number of times this bucket has been hit
+	 */
+	bucket->hits++;
+	bucket->successive_hits++;
+
+	/*
+	 * Compute a sliding average for latency in this bucket
+	 */
+	diff = latency - bucket->average;
+	bucket->average += (diff >> 6);
+
+	/*
+	 * We hit a successive number of times the same bucket, move
+	 * it at the beginning of the list
+	 */
+	if (bucket->successive_hits == BUCKET_SUCCESSIVE) {
+		list_move(&bucket->list, &tsk->io_latency.bucket_list);
+		bucket->successive_hits = 1;
+	}
+
+	return 0;
+}
+
+/*
+ * exit_io_latency - free resources when the task exits
+ *
+ * @tsk : the exiting task
+ *
+ */
+void exit_io_latency(struct task_struct *tsk)
+{
+	struct list_head *bucket_list = &tsk->io_latency.bucket_list;
+	struct list_head *tmp, *list;
+	struct bucket *bucket;
+
+	list_for_each_safe(list, tmp, bucket_list) {
+
+		list_del(list);
+		bucket = list_entry(list, struct bucket, list);
+		kmem_cache_free(bucket_cachep, bucket);
+	}
+}
+
+/**
+ * io_latency_init : initialization routine
+ *
+ * Initializes the cache pool and the io latency rb trees.
+ */
+void io_latency_init(void)
+{
+	int cpu;
+	struct io_latency_tree *latency_tree;
+	struct rb_root *root;
+
+	bucket_cachep = KMEM_CACHE(bucket, SLAB_PANIC);
+
+	for_each_possible_cpu(cpu) {
+		latency_tree = &per_cpu(latency_trees, cpu);
+		latency_tree->left_most = NULL;
+		spin_lock_init(&latency_tree->lock);
+		root = &latency_tree->tree;
+		root->rb_node = NULL;
+	}
 }
 
 /**
@@ -54,18 +311,20 @@  s64 io_latency_get_sleep_length(struct rq *rq)
 	int cpu = rq->cpu;
 	struct io_latency_tree *latency_tree = &per_cpu(latency_trees, cpu);
 	struct io_latency_node *node;
-	ktime_t now = ktime_get();
-	s64 diff;
+	s64 diff, next_event, now;
 
 	node = latency_tree->left_most;
-
 	if (!node)
 		return 0;
 
-	diff = ktime_to_us(ktime_sub(now, node->start_time));
-	diff = node->avg_latency - diff;
+	next_event = ktime_to_us(node->start_time) + node->avg_latency;
+	now = ktime_to_us(ktime_get());
+	diff = next_event - now;
 
-	/* Estimation was wrong, return 0 */
+	/* The estimation was wrong: the next io event should have
+	 * already occurred but did not, so we end up with a negative
+	 * value. Return 0 in this case, which the caller treats as
+	 * an invalid value */
 	if (diff < 0)
 		return 0;
 
@@ -78,13 +337,17 @@  s64 io_latency_get_sleep_length(struct rq *rq)
  * @node: a rb tree node belonging to a task
  *
  */
-static void io_latency_avg(struct io_latency_node *node)
+static void io_latency_avg(struct task_struct *tsk)
 {
-	/* MA*[i]= MA*[i-1] + X[i] - MA*[i-1]/N */
+	struct io_latency_node *node = &tsk->io_latency;
 	s64 latency = ktime_to_us(ktime_sub(node->end_time, node->start_time));
-	s64 diff = latency - node->avg_latency;
+	struct bucket *bucket;
+
+	io_latency_bucket_fill(tsk, latency);
 
-	node->avg_latency = node->avg_latency + (diff >> 6);
+	bucket = io_latency_guessed_bucket(tsk);
+	if (bucket)
+		node->avg_latency = bucket->average;
 }
 
 /**
@@ -118,7 +381,11 @@  int io_latency_begin(struct rq *rq, struct task_struct *tsk)
 
 		parent = *new;
 
-		if (lat->avg_latency > node->avg_latency)
+		/*
+		 * Check *when* the next event will occur
+		 */
+		if (ktime_to_us(lat->start_time) + lat->avg_latency >
+		    ktime_to_us(node->start_time) + node->avg_latency)
 			new = &parent->rb_left;
 		else {
 			new = &parent->rb_right;
@@ -170,5 +437,5 @@  void io_latency_end(struct rq *rq, struct task_struct *tsk)
 
 	spin_unlock(&latency_tree->lock);
 
-	io_latency_avg(old);
+	io_latency_avg(tsk);
 }
diff --git a/kernel/sched/io_latency.h b/kernel/sched/io_latency.h
index 62ece7c..c54de4d 100644
--- a/kernel/sched/io_latency.h
+++ b/kernel/sched/io_latency.h
@@ -11,12 +11,12 @@ 
  */
 
 #ifdef CONFIG_SCHED_IO_LATENCY
-extern void io_latency_init(struct rq *rq);
+extern void io_latency_init(void);
 extern int  io_latency_begin(struct rq *rq, struct task_struct *tsk);
 extern void io_latency_end(struct rq *rq, struct task_struct *tsk);
-extern int  io_latency_get_sleep_length(struct rq *rq);
+extern s64  io_latency_get_sleep_length(struct rq *rq);
 #else
-static inline void io_latency_init(struct rq *rq)
+static inline void io_latency_init(void)
 {
 	;
 }
@@ -31,7 +31,7 @@  static inline void io_latency_end(struct rq *rq, struct task_struct *tsk)
 	;
 }
 
-static inline int io_latency_get_sleep_length(struct rq *rq)
+static inline s64 io_latency_get_sleep_length(struct rq *rq)
 {
 	return 0;
 }