msm8916平台Linux watchdog详解(2)
2017-02-05 15:02
1071 查看
介绍高通msm8916中CONFIG_MSM_WATCHDOG_V2宏打开之后,watchdog是怎么工作的。
这个watchdog跟硬件相关,主要检查当前是否有频繁的中断发生导致整个内核调度都出现问题。[LOCKUP_DETECTOR与MSM_WATCHDOG_V2的区别?]
这个宏很简单,所涉及的文件就只有一个drivers/soc/qcom/watchdog_v2.c文件。
相关的device tree定义如下
这个device tree内容在下面函数里边解析,下面内容可以看一下驱动里边是怎么操作resource地址空间的!!!
下面是probe函数
打印了pet_watchdog_work的调用时间。queue_delayed_work延迟调用的时间没有那么精确,但每次调用时间大概都是在10秒左右。
上面的log里边可以看到发生的wathcodg bark,然后打印了发生watchdog bark的时间(单位是秒)。然后也打印了上次pet的时间(Watchdog last pet at 1970.041145),已经超过了20秒。
解决这种问题,需要打印每个cpu的sched和irq的时间。而且其实只需要看1970秒之后到底发生了什么。
怀疑是卡在.app.soundalive pid = 6435这个进程,但看了task状态是disksleep状态
irq 30的handle irq是handle_fasteoi_irq()函数
这两个irq频繁触发,是否是导致pet_watchdog_work无法调度的原因呢?
这个watchdog跟硬件相关,主要检查当前是否有频繁的中断发生导致整个内核调度都出现问题。[LOCKUP_DETECTOR与MSM_WATCHDOG_V2的区别?]
这个宏很简单,所涉及的文件就只有一个drivers/soc/qcom/watchdog_v2.c文件。
相关的device tree定义如下
qcom,wdt@b017000 { compatible = "qcom,msm-watchdog"; reg = <0xb017000 0x1000>; reg-names = "wdt-base"; interrupts = <0 3 0>, <0 4 0>; qcom,bark-time = <20000>; //bark时间20秒,20秒之内没有喂狗,会触发中断 qcom,pet-time = <10000>;//pet时间10秒,表示每10秒喂一次狗 qcom,ipi-ping; };
这个device tree内容在下面函数里边解析,下面内容可以看一下驱动里边是怎么操作resource地址空间的!!!
(1) devm_request_mem_region()[iomem_resource] (2) pdata->base = devm_ioremap(&pdev->dev, pdata->phys_base, pdata->size);
static int msm_wdog_dt_to_pdata(struct platform_device *pdev, struct msm_watchdog_data *pdata) { struct device_node *node = pdev->dev.of_node; struct resource *res; int ret; res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "wdt-base"); if (!res) return -ENODEV; pdata->size = resource_size(res); pdata->phys_base = res->start; if (unlikely(!(devm_request_mem_region(&pdev->dev, pdata->phys_base, pdata->size, "msm-watchdog")))) { dev_err(&pdev->dev, "%s cannot reserve watchdog region\n", __func__); return -ENXIO; } pdata->base = devm_ioremap(&pdev->dev, pdata->phys_base, pdata->size); if (!pdata->base) { dev_err(&pdev->dev, "%s cannot map wdog register space\n", __func__); return -ENXIO; } #ifdef CONFIG_SEC_DEBUG wdog_base_addr = pdata->base; #endif res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "wdt-absent-base"); if (res) { pdata->wdog_absent_base = devm_ioremap(&pdev->dev, res->start, resource_size(res)); if (!pdata->wdog_absent_base) { dev_err(&pdev->dev, "cannot map wdog absent register space\n"); return -ENXIO; } } else { dev_info(&pdev->dev, "wdog absent resource not present\n"); } //分别取device tree里边的interrupts = <0 3 0>, <0 4 0>;的第一个第二个 pdata->bark_irq = platform_get_irq(pdev, 0); pdata->bite_irq = platform_get_irq(pdev, 1); ret = of_property_read_u32(node, "qcom,bark-time", &pdata->bark_time); if (ret) { dev_err(&pdev->dev, "reading bark time failed\n"); return -ENXIO; } ret = of_property_read_u32(node, "qcom,pet-time", &pdata->pet_time); if (ret) { dev_err(&pdev->dev, "reading pet time failed\n"); return -ENXIO; } pdata->do_ipi_ping = of_property_read_bool(node, "qcom,ipi-ping"); if (!pdata->bark_time) { dev_err(&pdev->dev, "%s watchdog bark time not setup\n", __func__); return -ENXIO; } if (!pdata->pet_time) { dev_err(&pdev->dev, "%s watchdog pet time not setup\n", __func__); return -ENXIO; } pdata->irq_ppi = irq_is_percpu(pdata->bark_irq);//irq是否是 //percpu的这个在哪里指定?? dump_pdata(pdata); return 0; }
下面是probe函数
static int msm_watchdog_probe(struct platform_device *pdev) { int ret; struct msm_watchdog_data *wdog_dd; wdog_wq = alloc_workqueue("wdog", WQ_HIGHPRI, 0);//初始化一个workqueue if (!wdog_wq) { pr_err("Failed to allocate watchdog workqueue\n"); return -EIO; } if (!pdev->dev.of_node || !enable) return -ENODEV; wdog_dd = kzalloc(sizeof(struct msm_watchdog_data), GFP_KERNEL); if (!wdog_dd) return -EIO; ret = msm_wdog_dt_to_pdata(pdev, wdog_dd); if (ret) goto err; wdog_data = wdog_dd; wdog_dd->dev = &pdev->dev; platform_set_drvdata(pdev, wdog_dd); cpumask_clear(&wdog_dd->alive_mask); INIT_WORK(&wdog_dd->init_dogwork_struct, init_watchdog_work); INIT_DELAYED_WORK(&wdog_dd->dogwork_struct, pet_watchdog_work); queue_work(wdog_wq, &wdog_dd->init_dogwork_struct); //上面初始化了两个work,然后queue进去init_dogwork_struct这个work //这里queue进入是否会马上调度执行??需要仔细看一下workqueue相关内容 return 0; err: destroy_workqueue(wdog_wq); kzfree(wdog_dd); return ret; }
初始化watchdog
static void init_watchdog_work(struct work_struct *work) { struct msm_watchdog_data *wdog_dd = container_of(work, struct msm_watchdog_data, init_dogwork_struct); unsigned long delay_time; int error; u64 timeout; int ret; /* * Disable the watchdog for cluster 1 so that cluster 0 watchdog will * be mapped to the entire sub-system. */ if (wdog_dd->wdog_absent_base) __raw_writel(2, wdog_dd->wdog_absent_base + WDOG_ABSENT); if (wdog_dd->irq_ppi) { wdog_dd->wdog_cpu_dd = alloc_percpu(struct msm_watchdog_data *); if (!wdog_dd->wdog_cpu_dd) { dev_err(wdog_dd->dev, "fail to allocate cpu data\n"); return; } *__this_cpu_ptr(wdog_dd->wdog_cpu_dd) = wdog_dd; ret = request_percpu_irq(wdog_dd->bark_irq, wdog_ppi_bark, "apps_wdog_bark", wdog_dd->wdog_cpu_dd); if (ret) { dev_err(wdog_dd->dev, "failed to request bark irq\n"); free_percpu(wdog_dd->wdog_cpu_dd); return; } } else { //给bark_irq设置回调函数。如果设定的20秒的bark时间之内没有喂狗,就会触发中断 ret = devm_request_irq(wdog_dd->dev, wdog_dd->bark_irq, wdog_bark_handler, IRQF_TRIGGER_RISING, "apps_wdog_bark", wdog_dd); if (ret) { dev_err(wdog_dd->dev, "failed to request bark irq\n"); return; } } delay_time = msecs_to_jiffies(wdog_dd->pet_time); //delay_time等于10秒对应的jiffies //用这个delay_time来设置下一次pet_watchdog_work函数调用时间 wdog_dd->min_slack_ticks = UINT_MAX; wdog_dd->min_slack_ns = ULLONG_MAX; configure_bark_dump(wdog_dd); timeout = (wdog_dd->bark_time * WDT_HZ)/1000; //timeou为20秒对应的jiffies个数! __raw_writel(timeout, wdog_dd->base + WDT0_BARK_TIME); //根据timeout时间,设置bark_time寄存器 __raw_writel(timeout + 3*WDT_HZ, wdog_dd->base + WDT0_BITE_TIME); //根据timeout+3秒时间,设置bite_time,但这个没有设置系统中断函数,好像是没有处理 wdog_dd->panic_blk.notifier_call = panic_wdog_handler; atomic_notifier_chain_register(&panic_notifier_list, &wdog_dd->panic_blk); mutex_init(&wdog_dd->disable_lock); //10秒后调用pet_watchdog_work函数 queue_delayed_work(wdog_wq, &wdog_dd->dogwork_struct, delay_time); __raw_writel(1, wdog_dd->base + WDT0_EN);//使能 __raw_writel(1, wdog_dd->base + WDT0_RST);//reset wdog_dd->last_pet = sched_clock(); wdog_dd->enabled = true; #ifdef CONFIG_SEC_DEBUG last_pet = wdog_dd->last_pet; sec_debug_save_last_pet(wdog_dd->last_pet); #endif error = device_create_file(wdog_dd->dev, &dev_attr_disable); if (error) dev_err(wdog_dd->dev, "cannot create sysfs attribute\n"); if (wdog_dd->irq_ppi) enable_percpu_irq(wdog_dd->bark_irq, 0); if (ipi_opt_en) cpu_pm_register_notifier(&wdog_cpu_pm_nb); dev_info(wdog_dd->dev, "MSM Watchdog Initialized\n"); return; }
喂狗的函数
static void pet_watchdog_work(struct work_struct *work) { unsigned long delay_time; struct delayed_work *delayed_work = to_delayed_work(work); struct msm_watchdog_data *wdog_dd = container_of(delayed_work, struct msm_watchdog_data, dogwork_struct); delay_time = msecs_to_jiffies(wdog_dd->pet_time); pr_info("pet_watchdog_work delay_time = %d enable = %d \n ",(unsigned int)delay_time,enable); if (enable) { if (wdog_dd->do_ipi_ping) ping_other_cpus(wdog_dd); pet_watchdog(wdog_dd);//喂狗 } /* Check again before scheduling * * Could have been changed on other cpu */ if (enable) //再设置10秒后喂狗 queue_delayed_work(wdog_wq, &wdog_dd->dogwork_struct, delay_time); }
打印了pet_watchdog_work的调用时间。queue_delayed_work延迟调用的时间没有那么精确,但每次调用时间大概都是在10秒左右。
出现watchdog bark的时候怎么去debug
watchdog bark的出现,说明有比较高优先级的进程一直在运行,或者有频繁的中断发生,导致喂狗的work queue在10秒之内一直没有办法运行去喂狗(bark_time是20秒减去pet_time10秒)。<4>[ 1990.647173] [1: krtccd: 143] reclaimed 1026 (swapped 1026) pages. <6>[ 1993.082536] I[0: mmcqd/0: 145] Watchdog bark! Now = 1993.082530 <0>[ 1993.082594] I[0: mmcqd/0: 145] (sec_debug_set_upload_magic) 776655ee <6>[ 1993.082649] I[0: mmcqd/0: 145] sec_debug_set_qc_dload_magic: on=1 <3>[ 1993.082712] I[0: mmcqd/0: 145] set_dload_mode <1> ( c01191fc ) <0>[ 1993.082882] I[0: mmcqd/0: 145] (sec_debug_set_upload_cause) dba <6>[ 1993.082939] I[0: mmcqd/0: 145] Watchdog last pet at 1970.041145 <6>[ 1993.082990] I[0: mmcqd/0: 145] cpu alive mask from last pet 0
上面的log里边可以看到发生的wathcodg bark,然后打印了发生watchdog bark的时间(单位是秒)。然后也打印了上次pet的时间(Watchdog last pet at 1970.041145),已经超过了20秒。
解决这种问题,需要打印每个cpu的sched和irq的时间。而且其实只需要看1970秒之后到底发生了什么。
cpu0的sched和irq的log
cpu0的sched和irq的log只到1974秒左右。而且看cpu状态,cpu0一直也是online状态。怀疑是卡在.app.soundalive pid = 6435这个进程,但看了task状态是disksleep状态
[ 1970.839923][ SECURE] SVC_ID = 12 CMD_ID = 2 [ 1970.839938][ SECURE] SVC_ID = 12 CMD_ID = 2 [ 1971.572058][ SECURE] SVC_ID = 12 CMD_ID = 2 ... [ 1974.161089][SCHEDULE] AudioOut_DD pid = 9483 [ 1974.161099][SCHEDULE] kworker/u8:34 pid = 6390 [ 1974.161104][SCHEDULE] @async_run_entr pid = -1 [ 1974.161112][SCHEDULE] AudioOut_DD pid = 9483 ... [ 1974.226214][SCHEDULE] .app.soundalive pid = 6435//1974秒之后cpu0也没有输出什么log,然后直接到1993秒时间点 [ 1993.073554][IRQ EXIT] irq = 240 start_time = 1993073539371 end_time = 1993073554527 elapsed_time = 15156 [ 1993.073570][IRQ EXIT] irq = 240 start_time = 1993073555985 en[ 1974.226784][SCHEDULE] mmcqd/0 pid = 145 d_time = 1993073570985 elapsed_time = 15000 ... //后面一堆全是irq 240和507 //对应的handler函数是msm_tlmm_v4_handle_irq和sec_jack_detect_irq函数 //而且这些中断都是watchdog bark之前的,应该就是这些中断的问题 [ 1993.082526][IRQ EXIT] irq = 240 start_time = 1993082510985 end_time = 1993082526037 elapsed_time = 15052 [ 1993.082528][ IRQ] irq = 35 fn = 0xc03fd72c en=0x80 preempt_count=0x10101 context=0xed395c8c//一直到喂狗时间的前一刻还在发生中断
cpu1的sched和irq的log
irq 20的handle irq是handle_percpu_devid_irq函数irq 30的handle irq是handle_fasteoi_irq()函数
这两个irq频繁触发,是否是导致pet_watchdog_work无法调度的原因呢?
[ 1975.086905][ SECURE] SVC_ID = 12 CMD_ID = 2 [ 1975.086922][ SECURE] SVC_ID = 12 CMD_ID = 2 [ 1986.520440][ IRQ] irq = 20 fn = 0xc06b3108 en=0x80 preempt_count=0x10000 context=0xdb52dcc4 [ 1986.520473][IRQ EXIT] irq = 20 start_time = 1986520439403 end_time = 1986520473153 elapsed_time = 33750 ... //中间一直反复出现irq 20和irq 39,可以根据fn的地址通过system.map去找到对应函数 [ 1986.590443][ IRQ] irq = 20 fn = 0xc06b3108 en=0x80 preempt_count=0x10001 context=0xee609e7c [ 1986.590456][ IRQ] irq = 39 fn = 0xc06b3238 en=0x80 preempt_count=0x10101 context=0xee609dac [ 1986.590465][IRQ EXIT] irq = 39 start_time = 1986590455393 end_time = 1986590465445 elapsed_time = 10052 [ 1986.590502][IRQ EXIT] irq = 20 start_time = 1986590440861 end_time = 1986590502007 elapsed_time = 61146 ....//一直到1993秒左右一直在反复出现irq20和irq39 [ 1993.010442][ IRQ] irq = 20 fn = 0xc06b3108 en=0x80 preempt_count=0x10001 context=0xee609e7c [ 1993.010452][ IRQ] irq = 39 fn = 0xc06b3238 en=0x80 preempt_count=0x10101 context=0xee609dac [ 1993.010460][IRQ EXIT] irq = 39 start_time = 1993010451506 end_time = 1993010460360 elapsed_time = 8854 [ 1993.010486][IRQ EXIT] irq = 20 start_time = 1993010440464 end_time = 1993010486089 elapsed_time = 45625 //在pet_watchdog_work之前irq已经结束,所以这个不是原因 [ 1993.012658][SCHEDULE] qrngd pid = 240 [ 1993.012665][SCHEDULE] swapper/1 pid = 0 ...
cpu2的sched和irq的log
[ 1966.060443][ IRQ] irq = 20 fn = 0xc06b3108 en=0x80 preempt_count=0x10001 context=0xed349c7c [ 1966.060488][IRQ EXIT] irq = 20 start_time = 1966060442205 end_time = 1966060488715 elapsed_time = 46510 [ 1966.060505][SCHEDULE] ksoftirqd/2 pid = 16 [ 1966.060526][SCHEDULE] krtccd pid = 143 [ 1966.060723][SCHEDULE] migration/2 pid = 15 [ 1966.061856][SCHEDULE] swapper/2 pid = 0 [ 1966.061879][SCHEDULE] +pc(I:0,R:1) pid = -1 //后面就没有 log出来
cpu3的sched和irq的log
[ 1966.003495][SCHEDULE] -pc(1) pid = -1 [ 1966.003533][SCHEDULE] migration/3 pid = 19 [ 1966.004594][SCHEDULE] swapper/3 pid = 0 [ 1966.004623][SCHEDULE] +pc(I:0,R:1) pid = -1 ...
相关文章推荐
- 详解Visual C++ 2005中的突破性变化
- Linux日志管理详解
- .Net开发必备十大工具详解之.NET程序集反编译器Reflector
- jar包详解和META-INF作用
- C/C++中volatile关键字详解
- android ListView详解
- jquery validate 详解一
- Spring配置文件详解
- 计算机操作系统进程切换详解
- 详解MYSQL数据库密码的加密方式及破解方法(1)
- sap算法详解与模板 [转]
- (转)【配置关系】—Entity Framework实例详解
- 使用jadclipse+jad 在Eclipse中反编译Class文件详解
- hadoop (1.0.4) IntWritable 详解
- 读书札记:Webconfig元素配置详解(ASP.NET 设置架构)
- php.ini中文版详解
- java中的io系统详解
- UML概述与详解
- LVS原理详解及部署之五:LVS+keepalived实现负载均衡&高可用
- pjsip协议栈中定时器的实现详解