# how to analyze this kernel panic (net_rx_action)? [SOLVED]

## alex.blackbit

hi,

on one of my machines (x86) i get kernel panics with net_rx_action in EIP with kernel versions >2.6.26 under heavy network load.

i guess this is a network-related issue, but i am not completely sure if i can make this assumption.

my network interfaces are 4 adaptec starfires on a quad card, currently 2 used.

since i do not have any troubles with kernels <=2.6.26 i doubt that this is a hardware problem.

on the internet i did not find anything useful concerning panics with "net_rx_action".

nothing is written in the system log when the panic occurs.

i did a quick "diff -u" on the file starfire.c between old and new kernel, maybe this helps:

```
--- linux-2.6.26-gentoo-r2/drivers/net/starfire.c       2008-07-13 23:51:29.000000000 +0200

+++ linux-2.6.27-gentoo-r2/drivers/net/starfire.c       2008-10-10 00:13:53.000000000 +0200

@@ -27,8 +27,8 @@

 */

 

 #define DRV_NAME       "starfire"

-#define DRV_VERSION    "2.0"

-#define DRV_RELDATE    "June 27, 2006"

+#define DRV_VERSION    "2.1"

+#define DRV_RELDATE    "July  6, 2008"

 

 #include <linux/module.h>

 #include <linux/kernel.h>

@@ -69,10 +69,6 @@

 #define VLAN_SUPPORT

 #endif

 

-#ifndef CONFIG_ADAPTEC_STARFIRE_NAPI

-#undef HAVE_NETDEV_POLL

-#endif

-

 /* The user-configurable values.

    These may be modified when a driver module is loaded.*/

 

@@ -177,44 +173,6 @@

 #define skb_first_frag_len(skb)        skb_headlen(skb)

 #define skb_num_frags(skb) (skb_shinfo(skb)->nr_frags + 1)

 

-#ifdef HAVE_NETDEV_POLL

-#define init_poll(dev, np) \

-       netif_napi_add(dev, &np->napi, netdev_poll, max_interrupt_work)

-#define netdev_rx(dev, np, ioaddr) \

-do { \

-       u32 intr_enable; \

-       if (netif_rx_schedule_prep(dev, &np->napi)) { \

-               __netif_rx_schedule(dev, &np->napi); \

-               intr_enable = readl(ioaddr + IntrEnable); \

-               intr_enable &= ~(IntrRxDone | IntrRxEmpty); \

-               writel(intr_enable, ioaddr + IntrEnable); \

-               readl(ioaddr + IntrEnable); /* flush PCI posting buffers */ \

-       } else { \

-               /* Paranoia check */ \

-               intr_enable = readl(ioaddr + IntrEnable); \

-               if (intr_enable & (IntrRxDone | IntrRxEmpty)) { \

-                       printk(KERN_INFO "%s: interrupt while in polling mode!\n", dev->name); \

-                       intr_enable &= ~(IntrRxDone | IntrRxEmpty); \

-                       writel(intr_enable, ioaddr + IntrEnable); \

-               } \

-       } \

-} while (0)

-#define netdev_receive_skb(skb) netif_receive_skb(skb)

-#define vlan_netdev_receive_skb(skb, vlgrp, vlid) vlan_hwaccel_receive_skb(skb, vlgrp, vlid)

-static int     netdev_poll(struct napi_struct *napi, int budget);

-#else  /* not HAVE_NETDEV_POLL */

-#define init_poll(dev, np)

-#define netdev_receive_skb(skb) netif_rx(skb)

-#define vlan_netdev_receive_skb(skb, vlgrp, vlid) vlan_hwaccel_rx(skb, vlgrp, vlid)

-#define netdev_rx(dev, np, ioaddr) \

-do { \

-       int quota = np->dirty_rx + RX_RING_SIZE - np->cur_rx; \

-       __netdev_rx(dev, &quota);\

-} while (0)

-#endif /* not HAVE_NETDEV_POLL */

-/* end of compatibility code */

-

-

 /* These identify the driver base version and may not be removed. */

 static char version[] =

 KERN_INFO "starfire.c:v1.03 7/26/2000  Written by Donald Becker <becker@scyld.com>\n"

@@ -635,6 +593,7 @@

 static irqreturn_t intr_handler(int irq, void *dev_instance);

 static void    netdev_error(struct net_device *dev, int intr_status);

 static int     __netdev_rx(struct net_device *dev, int *quota);

+static int     netdev_poll(struct napi_struct *napi, int budget);

 static void    refill_rx_ring(struct net_device *dev);

 static void    netdev_error(struct net_device *dev, int intr_status);

 static void    set_rx_mode(struct net_device *dev);

@@ -851,7 +810,7 @@

        dev->hard_start_xmit = &start_tx;

        dev->tx_timeout = tx_timeout;

        dev->watchdog_timeo = TX_TIMEOUT;

-       init_poll(dev, np);

+       netif_napi_add(dev, &np->napi, netdev_poll, max_interrupt_work);

        dev->stop = &netdev_close;

        dev->get_stats = &get_stats;

        dev->set_multicast_list = &set_rx_mode;

@@ -1054,9 +1013,8 @@

 

        writel(np->intr_timer_ctrl, ioaddr + IntrTimerCtrl);

 

-#ifdef HAVE_NETDEV_POLL

        napi_enable(&np->napi);

-#endif

+

        netif_start_queue(dev);

 

        if (debug > 1)

@@ -1330,8 +1288,28 @@

 

                handled = 1;

 

-               if (intr_status & (IntrRxDone | IntrRxEmpty))

-                       netdev_rx(dev, np, ioaddr);

+               if (intr_status & (IntrRxDone | IntrRxEmpty)) {

+                       u32 enable;

+

+                       if (likely(netif_rx_schedule_prep(dev, &np->napi))) {

+                               __netif_rx_schedule(dev, &np->napi);

+                               enable = readl(ioaddr + IntrEnable);

+                               enable &= ~(IntrRxDone | IntrRxEmpty);

+                               writel(enable, ioaddr + IntrEnable);

+                               /* flush PCI posting buffers */

+                               readl(ioaddr + IntrEnable);

+                       } else {

+                               /* Paranoia check */

+                               enable = readl(ioaddr + IntrEnable);

+                               if (enable & (IntrRxDone | IntrRxEmpty)) {

+                                       printk(KERN_INFO

+                                              "%s: interrupt while in poll!\n",

+                                              dev->name);

+                                       enable &= ~(IntrRxDone | IntrRxEmpty);

+                                       writel(enable, ioaddr + IntrEnable);

+                               }

+                       }

+               }

 

                /* Scavenge the skbuff list based on the Tx-done queue.

                   There are redundant checks here that may be cleaned up

@@ -1411,8 +1389,10 @@

 }

 

 

-/* This routine is logically part of the interrupt/poll handler, but separated

-   for clarity, code sharing between NAPI/non-NAPI, and better register allocation. */

+/*

+ * This routine is logically part of the interrupt/poll handler, but separated

+ * for clarity and better register allocation.

+ */

 static int __netdev_rx(struct net_device *dev, int *quota)

 {

        struct netdev_private *np = netdev_priv(dev);

@@ -1507,13 +1487,20 @@

                }

 #ifdef VLAN_SUPPORT

                if (np->vlgrp && le16_to_cpu(desc->status2) & 0x0200) {

-                       if (debug > 4)

-                               printk(KERN_DEBUG "  netdev_rx() vlanid = %d\n", le16_to_cpu(desc->vlanid));

-                       /* vlan_netdev_receive_skb() expects a packet with the VLAN tag stripped out */

-                       vlan_netdev_receive_skb(skb, np->vlgrp, le16_to_cpu(desc->vlanid) & VLAN_VID_MASK);

+                       u16 vlid = le16_to_cpu(desc->vlanid);

+

+                       if (debug > 4) {

+                               printk(KERN_DEBUG "  netdev_rx() vlanid = %d\n",

+                                      vlid);

+                       }

+                       /*

+                        * vlan_hwaccel_rx expects a packet with the VLAN tag

+                        * stripped out.

+                        */

+                       vlan_hwaccel_rx(skb, np->vlgrp, vlid);

                } else

 #endif /* VLAN_SUPPORT */

-                       netdev_receive_skb(skb);

+                       netif_receive_skb(skb);

                dev->last_rx = jiffies;

                np->stats.rx_packets++;

 

@@ -1532,8 +1519,6 @@

        return retcode;

 }

 

-

-#ifdef HAVE_NETDEV_POLL

 static int netdev_poll(struct napi_struct *napi, int budget)

 {

        struct netdev_private *np = container_of(napi, struct netdev_private, napi);

@@ -1564,8 +1549,6 @@

        /* Restart Rx engine if stopped. */

        return budget - quota;

 }

-#endif /* HAVE_NETDEV_POLL */

-

 

 static void refill_rx_ring(struct net_device *dev)

 {

@@ -1906,9 +1889,8 @@

        int i;

 

        netif_stop_queue(dev);

-#ifdef HAVE_NETDEV_POLL

+

        napi_disable(&np->napi);

-#endif

 

        if (debug > 1) {

                printk(KERN_DEBUG "%s: Shutting down ethercard, Intr status %#8.8x.\n",

@@ -2044,11 +2026,8 @@

 /* when a module, this is printed whether or not devices are found in probe */

 #ifdef MODULE

        printk(version);

-#ifdef HAVE_NETDEV_POLL

+

        printk(KERN_INFO DRV_NAME ": polling (NAPI) enabled\n");

-#else

-       printk(KERN_INFO DRV_NAME ": polling (NAPI) disabled\n");

-#endif

 #endif

 

        /* we can do this test only at run-time... sigh */
```

does anybody have an idea what could cause these problems?

any pointers welcome.

thanks in advance.

p.s. if the link to the screenshot does not work, i tried a newer kernel and got a panic again.   :Rolling Eyes:

Last edited by alex.blackbit on Tue Dec 16, 2008 10:33 pm; edited 2 times in total

----------

## geki

you may post your information to a suitable mailing list on kernel.org, to reach the linux developers.

and no need to send that diff.

----------

## alex.blackbit

i posted on the LKML.

let's see if somebody can help.   :Rolling Eyes: 

----------

## geki

You may send a copy to linux-net kml.

I prefer thread view, just in case ...

http://thread.gmane.org/gmane.linux.kernel/768696

----------

## manaka

If you can reproduce the problem deterministically, you may try git bisect. It could tell the problematic commit that triggers the panic. See http://kerneltrap.org/node/11753 and http://www.kernel.org/pub/software/scm/git/docs/git-bisect.html.

----------

## alex.blackbit

thanks for the hint.

jarek poplawski answered my post on the LKML and provided a patch i am currently testing.

```
 drivers/net/starfire.c |    5 +++++

 1 files changed, 5 insertions(+), 0 deletions(-)

diff --git a/drivers/net/starfire.c b/drivers/net/starfire.c

index 0358809..f86d6bb 100644

--- a/drivers/net/starfire.c

+++ b/drivers/net/starfire.c

@@ -1503,6 +1503,11 @@ static int __netdev_rx(struct net_device *dev, int *quota)

       desc->status = 0;

       np->rx_done = (np->rx_done + 1) % DONE_Q_SIZE;

    }

+

+   if (*quota == 0) {   /* out of rx quota */

+      retcode = 1;

+      goto out;

+   }

    writew(np->rx_done, np->base + CompletionQConsumerIdx);

 

  out:
```

the machine is currently running 2.6.28-rc8 with that patch under heavy network load.

uptime is currently 2 hours.

let's see, maybe that solves the issue.

----------

## alex.blackbit

uptime currently 1 day, 2:31 hours.

the problem seems to be fixed.

the patch is submitted on the mailing list.

thanks to all who answered.

----------

