measure cpu loading by /proc/stat

常见的测量CPU loading 的工具有:

sar
top
iostat
mpstat
cat /proc/stat

参看busybox 的源码可以发现，top、iostat、mpstat 都是读取/proc/stat 或者/proc/<pid>/stat 文件，解析后呈现结果。

iostat
iostat

mpstat
mpstat

cat /proc/1/stat
proc_pid_stat

1. /proc/stat

CPU time = user + nice + system + idle + iowait + irq + softirq + steal

item	remarks
user time	普通用户进程占用时间
nice time	低优先级(nice 值为正)用户进程占用时间
system time	OS 中运行时间
idle time	CPU 空闲时间
iowait time	I/O 等待时间
irq time	硬中断处理时间
softirq time	软中断处理时间
steal time	虚拟化环境下，本应运行该虚拟CPU 但被hypervisor 调度去运行其他guest OS 而"被偷走"的时间

proc_stat

1.1. 原理

/* kernel/fs/proc/stat.c, kernel-4.9.198 */
/*
 * Open handler for the stat file: computes a buffer-size estimate and passes
 * it, together with show_stat() as the show callback, to single_open_size().
 * Returns whatever single_open_size() returns.
 */
static int stat_open(struct inode *inode, struct file *file)
{
	/* Fixed 1024-byte base plus 128 bytes for each online CPU. */
	size_t size = 1024 + 128 * num_online_cpus();

	/* minimum size to display an interrupt count : 2 bytes */
	size += 2 * nr_irqs;
	return single_open_size(file, show_stat, NULL, size);
}

/*
 * Abridged excerpt of show_stat(): accumulates each CPU-time category over
 * every possible CPU into local totals.
 *
 * NOTE(review): `boottime` and `sum` are used below but are not declared in
 * this excerpt, and no output/return statement is shown; the listing has been
 * shortened and is not compilable as-is.
 */
static int show_stat(struct seq_file *p, void *v)
{
	int i, j;
	u64 user, nice, system, idle, iowait, irq, softirq, steal;
	u64 guest, guest_nice;

	/* Start every total at zero before summing over CPUs. */
	user = nice = system = idle = iowait =
		irq = softirq = steal = 0;
	guest = guest_nice = 0;

	getboottime64(&boottime);

	for_each_possible_cpu(i) {
		/* Most categories are read straight from the per-CPU cpustat[] array. */
		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
		nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
		system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
		/* idle and iowait go through helper functions instead of cpustat[] directly. */
		idle += get_idle_time(i);
		iowait += get_iowait_time(i);
		irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
		softirq += kcpustat_cpu(i).cpustat[CPUTIME_SOFTIRQ];
		steal += kcpustat_cpu(i).cpustat[CPUTIME_STEAL];
		guest += kcpustat_cpu(i).cpustat[CPUTIME_GUEST];
		guest_nice += kcpustat_cpu(i).cpustat[CPUTIME_GUEST_NICE];
		/* Interrupt counts for this CPU are added to a separate running total. */
		sum += kstat_cpu_irqs_sum(i);
		sum += arch_irq_stat_cpu(i);
	}
}

在show_stat() 中kcpustat_cpu(i).cpustat[CPUTIME_USER] 这个变量是一个关键的全局per-CPU 变量。per_cpu 的用法大致是在kernel init 时拷贝CPU NUM 份变量到不同的内存空间，访问时加上CPU NUM(i) 的偏移量。

/*
 * Per-CPU CPU-time counters. The array holds NR_STATS entries and is indexed
 * by the CPUTIME_* constants (e.g. cpustat[CPUTIME_USER]).
 */
struct kernel_cpustat {
	u64 cpustat[NR_STATS];
};

/* Select the given CPU's instance of the per-CPU variables `kstat` and `kernel_cpustat`. */
#define kstat_cpu(cpu) per_cpu(kstat, cpu)
#define kcpustat_cpu(cpu) per_cpu(kernel_cpustat, cpu)

1.2. 何时更新

那kernel_cpustat 是在什么时候更新的呢？答案是在Timer 的中断函数中进行更新。

我们可以使用dump_stack()函数打印调用栈。在clockevents_config_and_register() 进行clock event 注册时有如下关系：

clockevents_config_and_register() ->
    tick_check_new_device() -> 
        tick_setup_device() -> 
            tick_setup_periodic() -> 
                tick_set_periodic_handler()

此后timer 每隔1/HZ 秒触发一次中断，中断处理中的调用关系如下：

tick_handle_periodic() ->
    tick_periodic() ->
        update_process_times() ->
            account_process_tick() ->
                account_system_time()

在account_process_tick() 及其调用的account_user_time() / account_system_time() / account_idle_time() 中会分类统计CPU 占用时间。

/* linux/kernel/sched/cputime.c, kernel-4.9.18 */
/*
 * Charge one tick of CPU time to exactly one category: user, system or idle.
 *
 * @p:         task the tick is charged to
 * @user_tick: non-zero when the tick should be counted as user time
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	cputime_t cputime, scaled, steal;
	struct rq *rq = this_rq();

	/* Nothing is accounted here when vtime accounting is enabled for this CPU. */
	if (vtime_accounting_cpu_enabled())
		return;

	/* With sched_clock_irqtime set, the whole tick is handed to the irqtime variant. */
	if (sched_clock_irqtime) {
		irqtime_account_process_tick(p, user_tick, rq, 1);
		return;
	}

	/* The amount to distribute is one jiffy, minus whatever is reported as steal. */
	cputime = cputime_one_jiffy;
	steal = steal_account_process_time(ULONG_MAX);

	/* Steal covered the entire tick: nothing is left to charge. */
	if (steal >= cputime)
		return;

	cputime -= steal;
	scaled = cputime_to_scaled(cputime);

	/*
	 * Classification:
	 *  - user_tick set                         -> user time
	 *  - task is not this runqueue's idle task,
	 *    or irq_count() is anything other than
	 *    exactly HARDIRQ_OFFSET                -> system time
	 *  - otherwise                             -> idle time
	 */
	if (user_tick)
		account_user_time(p, cputime, scaled);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
	else
		account_idle_time(cputime);
}

1.3. 怎么分类

接下来的问题是我们怎么知道何时是user, system, idle 等呢？

1.3.1. user or system?

ARM CPU 可以从CPSR reg 得到当前的运行态，下面函数大概也是基于此思想：

/*
tick_handle_periodic() ->
    tick_periodic()
*/
/*
 * Periodic tick work for @cpu. Only the CPU equal to tick_do_timer_cpu
 * advances the global time state; every CPU then runs the per-task
 * accounting and the profiling hook.
 */
static void tick_periodic(int cpu)
{
	if (tick_do_timer_cpu == cpu) {
		/* jiffies_lock is held for writing around the period/jiffies update. */
		write_seqlock(&jiffies_lock);

		/* Keep track of the next tick event */
		tick_next_period = ktime_add(tick_next_period, tick_period);

		do_timer(1);
		write_sequnlock(&jiffies_lock);
		update_wall_time();
	}

	/*
	 * The argument is the user_mode() test applied to the registers returned
	 * by get_irq_regs(); it becomes the user/non-user flag for accounting.
	 */
	update_process_times(user_mode(get_irq_regs()));
	profile_tick(CPU_PROFILING);
}

/*
 * Evaluates to true when the low four bits of the saved CPSR in @regs are
 * all zero.
 * NOTE(review): presumably these are the low CPSR mode bits, which are clear
 * in ARM user mode; confirm against the ARM architecture manual.
 */
#define user_mode(regs)	\
	(((regs)->ARM_cpsr & 0xf) == 0)

/* Returns the current CPU's value of the per-CPU pointer __irq_regs. */
static inline struct pt_regs *get_irq_regs(void)
{
	return __this_cpu_read(__irq_regs);
}

1.3.2. system, irq, softirq ?

判断idle or system
主要通过比较当前task 与运行队列(struct rq, runqueue) 上的idle task (rq->idle) 来判断是否是IDLE。

/*
 * Abridged repeat of account_process_tick(), reduced to the idle-vs-system
 * decision. The "..." line marks omitted code, so this listing is
 * illustrative and not compilable as-is.
 */
void account_process_tick(struct task_struct *p, int user_tick)
{
	struct rq *rq = this_rq();
    ...
	/*
	 * Idle time is charged only when the tick is not a user tick, the task
	 * is this runqueue's idle task, and irq_count() equals exactly
	 * HARDIRQ_OFFSET; any other non-user case is charged as system time.
	 */
	if (user_tick)
		account_user_time(p, cputime, scaled);
	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
		account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
	else
		account_idle_time(cputime);
}

判断system, irq, softirq

kernel 通过thread_info 中的preempt_count 进行判断。
在进入中断时，preempt_count 会被加上HARDIRQ_OFFSET。

/*
 * Hard-interrupt entry bookkeeping: calls account_irq_enter_time() for the
 * current task, adds HARDIRQ_OFFSET to the preempt count, then calls the
 * hardirq-enter trace hook.
 */
#define __irq_enter()					\
	do {						\
		account_irq_enter_time(current);	\
		preempt_count_add(HARDIRQ_OFFSET);	\
		trace_hardirq_enter();			\
	} while (0)

/*
 * Context tests built on the masked preempt-count helpers:
 *   in_irq()             - non-zero when hardirq_count() is non-zero
 *   in_softirq()         - non-zero when softirq_count() is non-zero
 *   in_interrupt()       - non-zero when irq_count() is non-zero
 *   in_serving_softirq() - non-zero when the SOFTIRQ_OFFSET bit of
 *                          softirq_count() is set
 */
#define in_irq()		        (hardirq_count())
#define in_softirq()		    (softirq_count())
#define in_interrupt()		    (irq_count())
#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)

/* Extract the hardirq / softirq bit-fields of the preempt count by masking. */
#define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
#define softirq_count()	(preempt_count() & SOFTIRQ_MASK)

/* Returns the preempt_count field of the current thread_info, read via READ_ONCE(). */
static __always_inline int preempt_count(void)
{
	return READ_ONCE(current_thread_info()->preempt_count);
}


/*
 * Charge non-user CPU time to one of: guest, hardirq, softirq or plain
 * system time.
 *
 * @p:              task being charged
 * @hardirq_offset: amount subtracted from the irq/hardirq counts before they
 *                  are tested (account_process_tick() passes HARDIRQ_OFFSET;
 *                  presumably this discounts the timer interrupt the tick
 *                  itself runs in - confirm against the caller context)
 * @cputime:        amount of time to charge
 * @cputime_scaled: scaled counterpart of @cputime
 */
void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
{
	int index;

	/* A PF_VCPU task with no irq context beyond hardirq_offset is charged as guest time. */
	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime, cputime_scaled);
		return;
	}

	/*
	 * Pick the cpustat[] slot: remaining hardirq count -> IRQ, else
	 * currently serving a softirq -> SOFTIRQ, else -> SYSTEM.
	 */
	if (hardirq_count() - hardirq_offset)
		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
		index = CPUTIME_SOFTIRQ;
	else
		index = CPUTIME_SYSTEM;

	__account_system_time(p, cputime, cputime_scaled, index);
}