Skip to main content
 首页 » 编程设计

linux-kernel之内核 Oops：ARM 嵌入式系统上出现 Oops: 80000005

2024年12月31日2452php

请帮我解决这个问题。我使用了一个周期为 1 毫秒的高分辨率定时器（hrtimer），并用 insmod 将其作为独立的内核模块加载。该定时器每 1 ms 触发一次，我必须在这个定时器中断里完成一些任务。系统中还有其他进程在进行图像传输，发送图像时我能看到以太网驱动程序的中断。这个以太网（enet）中断的优先级较高，看起来它延迟了上述 1 ms 定时器中断，但我不能确定。

在运行测试约 3 小时后，我看到了下面的 Oops。如何从根本上解决这个问题？
请帮忙。
系统为ARM omap,运行Linux 2.6.33 交叉编译。

[root@user:/]#  
Unable to handle kernel paging request at virtual address 7eb52754 
pgd = 80004000 
[7eb52754] *pgd=00000000 
Internal error: Oops: 80000005 [#1] PREEMPT 
last sysfs file: /sys/devices/virtual/spi/spi/dev 
Modules linked in: mod timermod mod2(P) mod3(P) mod4 
CPU: 0    Tainted: P            (2.6.33_appl #1) 
PC is at 0x7eb52754 
LR is at walk_stackframe+0x24/0x40 
pc : [<7eb52754>]    lr : [<8002d4dc>]    psr: a0000013 
sp : 80395f10  ip : 80395f30  fp : 80395f2c 
r10: 0000001f  r9 : 00000000  r8 : 87a25200 
r7 : 878b0380  r6 : 80395f40  r5 : 80028374  r4 : 80395f30 
r3 : 80000100  r2 : 80395f40  r1 : 80395f40  r0 : 80395f30 
Flags: NzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment kernel 
Control: 10c5387d  Table: 86fb0019  DAC: 00000017 
Process swapper (pid: 0, stack limit = 0x803942e8) 
Stack: (0x80395f10 to 0x80396000) 
5f00:                                     8002bfa4 00000001 802c678c 87a25380 
5f20: 80395f54 80395f30 8002bfe0 8002d4c4 80395f54 80395f30 8004998c 8002bfa4 
5f40: 00000002 00000002 80395f6c 80395f58 8004998c 8002bfb0 80396ea8 80394000 
5f60: 80395fa4 80395f70 802c678c 800498d0 8002b320 80023218 80398408 80021e10 
5f80: 80394000 8002321c 80023218 80398408 80021e10 413fc082 80395fbc 80395fa8 
5fa0: 8002b324 802c62fc 803f4cc8 803f5190 80395fcc 80395fc0 802c3ee4 8002b28c 
5fc0: 80395ff4 80395fd0 8000897c 802c3e6c 800084fc 00000000 00000000 8002321c 
5fe0: 10c53c7d 803c7630 00000000 80395ff8 80008034 80008754 00000000 00000000 
Backtrace: 
[<8002d4b8>] (walk_stackframe+0x0/0x40) from [<8002bfe0>] (return_address+0x3c/0x5c) 
 r6:87a25380 r5:802c678c r4:00000001 r3:8002bfa4 
[<8002bfa4>] (return_address+0x0/0x5c) from [<8004998c>] (sub_preempt_count+0xc8/0xfc) 
[<800498c4>] (sub_preempt_count+0x0/0xfc) from [<802c678c>] (schedule+0x49c/0x4d8) 
 r5:80394000 r4:80396ea8 
[<802c62f0>] (schedule+0x0/0x4d8) from [<8002b324>] (cpu_idle+0xa4/0xbc) 
 r9:413fc082 r8:80021e10 r7:80398408 r6:80023218 r5:8002321c 
r4:80394000 
[<8002b280>] (cpu_idle+0x0/0xbc) from [<802c3ee4>] (rest_init+0x84/0xa0) 
 r4:803f5190 r3:803f4cc8 
[<802c3e60>] (rest_init+0x0/0xa0) from [<8000897c>] (start_kernel+0x234/0x284) 
[<80008748>] (start_kernel+0x0/0x284) from [<80008034>] (__enable_mmu+0x0/0x2c) 
Code: bad PC value 
---[ end trace 7e26218fd59f68a5 ]--- 
Kernel panic - not syncing: Attempted to kill the idle task! 
Backtrace: 
[<8002db2c>] (dump_backtrace+0x0/0x114) from [<802c610c>] (dump_stack+0x20/0x24) 
 r6:fffffffc r5:0000000b r4:803c8518 r3:00000002 
[<802c60ec>] (dump_stack+0x0/0x24) from [<802c6168>] (panic+0x58/0x130) 
[<802c6110>] (panic+0x0/0x130) from [<80057330>] (do_exit+0x7c/0x6e0) 
 r3:80394000 r2:00000000 r1:80395d28 r0:80348e90 
[<800572b4>] (do_exit+0x0/0x6e0) from [<8002dfc0>] (die+0x290/0x2c4) 
 r7:7eb52744 
[<8002dd30>] (die+0x0/0x2c4) from [<8002f4d4>] (__do_kernel_fault+0x74/0x84) 
 r7:80395ec8 
[<8002f460>] (__do_kernel_fault+0x0/0x84) from [<8002f6bc>] (do_page_fault+0x1d8/0x1f0) 
 r7:00000000 r6:80395ec8 r5:7eb52754 r4:80396ea8 
[<8002f4e4>] (do_page_fault+0x0/0x1f0) from [<8002f794>] (do_translation_fault+0x20/0x80) 
[<8002f774>] (do_translation_fault+0x0/0x80) from [<80029250>] (do_PrefetchAbort+0x44/0xa8) 
 r6:7eb52754 r5:80398820 r4:00000005 r3:8002f774 
[<8002920c>] (do_PrefetchAbort+0x0/0xa8) from [<80029d1c>] (__pabt_svc+0x5c/0xa0) 
Exception stack(0x80395ec8 to 0x80395f10) 
5ec0:                   80395f30 80395f40 80395f40 80000100 80395f30 80028374 
5ee0: 80395f40 878b0380 87a25200 00000000 0000001f 80395f2c 80395f30 80395f10 
5f00: 8002d4dc 7eb52754 a0000013 ffffffff 
 r7:878b0380 r6:80395f40 r5:80395efc r4:ffffffff 
[<8002d4b8>] (walk_stackframe+0x0/0x40) from [<8002bfe0>] (return_address+0x3c/0x5c) 
 r6:87a25380 r5:802c678c r4:00000001 r3:8002bfa4 
[<8002bfa4>] (return_address+0x0/0x5c) from [<8004998c>] (sub_preempt_count+0xc8/0xfc) 
[<800498c4>] (sub_preempt_count+0x0/0xfc) from [<802c678c>] (schedule+0x49c/0x4d8) 
 r5:80394000 r4:80396ea8 
[<802c62f0>] (schedule+0x0/0x4d8) from [<8002b324>] (cpu_idle+0xa4/0xbc) 
 r9:413fc082 r8:80021e10 r7:80398408 r6:80023218 r5:8002321c 
r4:80394000 
[<8002b280>] (cpu_idle+0x0/0xbc) from [<802c3ee4>] (rest_init+0x84/0xa0) 
 r4:803f5190 r3:803f4cc8 
[<802c3e60>] (rest_init+0x0/0xa0) from [<8000897c>] (start_kernel+0x234/0x284) 
[<80008748>] (start_kernel+0x0/0x284) from [<80008034>] (__enable_mmu+0x0/0x2c) 

==========================================
#include <linux/hrtimer.h> 
#include <linux/module.h> 
#include <linux/ktime.h> 
#include <linux/kdev_t.h> 
#include <linux/cdev.h> 
#include <linux/device.h> 
#include <linux/fs.h> 
#include <linux/wait.h> 
#include <linux/sched.h> 
 
/* First minor number requested from alloc_chrdev_region(). */
#define FIRST_MINOR 0 
/* Number of consecutive minors reserved and registered with the cdev. */
#define MINOR_CNT   1 
 
/* Device class returned by class_create(); the device node is created under it. */
static struct class *cl; 
/* Character device object registered with hr_fops. */
static struct cdev cdev; 
/* Device number (major/minor) allocated by alloc_chrdev_region(). */
static dev_t dev; 
/* Tick flag: set to 1 by timer_callback(), cleared by hr_read() once it wakes. */
static u8 timer_expired = 0; 
/* Wait queue hr_read() sleeps on; timer_callback() wakes it on each expiry. */
static wait_queue_head_t wq_head; 
 
/* The single high-resolution timer shared by every opener of the device. */
static struct hrtimer timer; 
 
static ssize_t hr_read(struct file *f, char * __user buff, size_t cnt, loff_t *off) 
{ 
    wait_event_interruptible(wq_head, timer_expired); 
    timer_expired = 0; 
    return 0; 
} 
 
static int hr_open(struct inode *i, struct file *f) 
{ 
    ktime_t ktime; 
        ktime.tv64 = 1E6L; 
        hrtimer_start(&timer, ktime, HRTIMER_MODE_REL); 
    return 0; 
}    
 
 
/*
 * release() handler: stop the shared timer when the device file is closed.
 *
 * Logs a message when hrtimer_cancel() reports a non-zero result.
 * Always returns 0.
 */
static int hr_close(struct inode *i, struct file *f)
{
    const int cancel_result = hrtimer_cancel(&timer);

    if (cancel_result != 0) {
        printk(KERN_INFO "timercancelled\n");
    }

    return 0;
}
 
static struct file_operations hr_fops = { 
    .read = hr_read, 
    .open = hr_open, 
    .release = hr_close 
}; 
 
static enum hrtimer_restart timer_callback(struct hrtimer *timer) 
{ 
    ktime_t ktime; 
    u64 overrun; 
    ktime.tv64 = 1E6L; 
    //printk("KERN_INFO""Timer Expired"); 
 
    overrun = hrtimer_forward_now(timer, ktime); 
    timer_expired = 1; 
    wake_up_interruptible(&wq_head); 
    return HRTIMER_RESTART; 
} 
#if 1 
 
static int init_hrtimer(void) 
{    
    ktime_t ktime; 
    unsigned long delay_in_ms = 500L; 
    printk(KERN_ERR "Timer being set up\n"); 
 
    ktime = ktime_set(0,delay_in_ms*1E6L); 
    hrtimer_init(&timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 
 
    timer.function = &timer_callback; 
    printk(KERN_ERR "Timer starting to fire\n"); 
    printk(KERN_ERR "in %ldms %ld\n", delay_in_ms, jiffies); 
 
    if (alloc_chrdev_region(&dev, FIRST_MINOR, MINOR_CNT, "Hr Timer") < 0) 
    { 
        return -1; 
    } 
    printk("Major Nr: %d\n", MAJOR(dev)); 
 
    cdev_init(&cdev, &hr_fops); 
 
    if (cdev_add(&cdev, dev, MINOR_CNT) == -1) 
    { 
        unregister_chrdev_region(dev, MINOR_CNT); 
        return -1; 
    } 
 
    if ((cl = class_create(THIS_MODULE, "hrtimer")) == NULL) 
    { 
        cdev_del(&cdev); 
        unregister_chrdev_region(dev, MINOR_CNT); 
        return -1; 
    } 
    if (IS_ERR(device_create(cl, NULL, dev, NULL, "hrt%d", 0))) 
    { 
        class_destroy(cl); 
        cdev_del(&cdev); 
        unregister_chrdev_region(dev, MINOR_CNT); 
        return -1; 
    } 
 
    init_waitqueue_head(&wq_head); 
 
    return 0; 
} 
#endif 
 
 
static void clean_hrtimer(void) 
{ 
    int cancelled = hrtimer_cancel(&timer); 
 
    if (cancelled) 
        printk(KERN_ERR "Timer still running\n"); 
    else 
        printk(KERN_ERR "Timer cancelled\n"); 
 
     device_destroy(cl, dev); 
     class_destroy(cl); 
        cdev_del(&cdev); 
        unregister_chrdev_region(dev, MINOR_CNT); 
} 
 
/* Register init_hrtimer() as the module's load entry point. */
module_init(init_hrtimer); 
/* Register clean_hrtimer() as the module's unload entry point. */
module_exit(clean_hrtimer); 
 
/* Declare the module's license as GPL. */
MODULE_LICENSE("GPL"); 

=========================

我把上面的代码作为驱动模块，用 insmod 加载。我希望它每 1 毫秒触发一次，通常它工作正常，但当以太网流量过高时，有时会出现内核 Oops。请帮忙检查代码是否有问题？

我检查了 lsmod,我看到所有 5 个内核模块(我自己的)都加载在:0x7f000000 到 0x7f02xxxx 之间
mod at 0x7f020xxxx,  
timermod at 0x7f01xxx,  
mod2 at 0x7f01xxxx,  
mod3 at 0x7f00xxxx,  
mod4 at 0x7f000000.  

在 Oops 地址 0x7eb52754 处没有加载任何模块，我已通过 /proc/kallsyms 文件验证了这一点。
如何把 0x7eb5xxxx 这个地址映射回源文件？我还能从哪里获得系统上的相关信息？

请您参考如下方法:

根据错误消息，导致此内核 panic 的代码位于虚拟地址 0x7eb52754。从地址（略低于 0x80000000）来看，我猜这是内核模块的代码段——可能是您自己的内核模块之一。

要进行根本原因分析，请按照发生这种 panic 时的加载顺序加载您的（以及所有其他）内核模块，并观察 lsmod（或几乎等价的 cat /proc/modules）打印的加载地址。

使用它们的代码大小和加载地址，计算哪个模块的代码段包含虚拟地址 0x7eb52754。然后用 0x7eb52754 减去该模块的加载地址。

您将得到的是导致 panic 的指令在模块二进制中的偏移量。

现在在内核模块二进制文件上使用 objdump 查找该偏移量，并检查它属于哪个函数（如果有 addr2line，也可以用它来完成）。这应该能指出导致此 panic 的指令所在的函数，甚至行号（如果您有调试信息）。

祝你好运。