August 15, 2015

Linux ftrace: UDP receive packet kernel code path

What happens when a packet arrives at NIC ?
How is data flowing through various stacks ?
 .
 .
ftrace gives us the ability to look in depth into kernel code to figure what exactly happens. In this blog entry, I tried to look at UDP receive code.To capture required info, I used simple perl code to send/receive data on specified port using UDP protocol.On my test VM, NIC was using e1000 driver.

Using simple strace, I could see my UDP server is blocked in recvfrom syscall

[cv@mylinux1 ~]$ strace -e network -f ./us.pl 
socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP) = 3
bind(3, {sa_family=AF_INET, sin_port=htons(5000), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
recvfrom(3, 
recvfrom inturn calls 'udp_recvmsg' for UDP.  Follow below steps to enable 3 functions in ftrace

[root@mylinux1 tracing]# pwd
/sys/kernel/debug/tracing

[root@mylinux1 tracing]# grep e1000 available_filter_functions |grep -i intr
e1000_intr [e1000]
e1000_test_intr [e1000]

[root@mylinux1 tracing]# grep udp_recv available_filter_functions 
udp_recvmsg

[root@mylinux1 tracing]# grep net_rx_action available_filter_functions 
net_rx_action

[root@mylinux1 tracing]# echo "e1000_intr net_rx_action udp_recvmsg" > set_graph_function 

[root@mylinux1 tracing]# cat set_graph_function
e1000_intr [e1000]
net_rx_action
udp_recvmsg

[root@mylinux1 tracing]# echo function_graph > current_tracer 

Below is the trace captured (after some manual edits)
0)               |  udp_recvmsg() {
 0)               |    __skb_recv_datagram() {
 0)   0.843 us    |      _raw_spin_lock_irqsave();
 0)   0.379 us    |      _raw_spin_unlock_irqrestore();
 0)               |      prepare_to_wait_exclusive() {
 0)   0.815 us    |        _raw_spin_lock_irqsave();
 0)   0.295 us    |        _raw_spin_unlock_irqrestore();
 0)   2.152 us    |      }
 0)               |      schedule_timeout() {
 0)               |        schedule() {
 0)               |          __schedule() {
 0)   0.472 us    |            _raw_spin_lock_irq();
 0)               |            deactivate_task() {
 0)               |              dequeue_task() {
 0)   0.110 us    |                update_rq_clock.part.72();
 0)               |                dequeue_task_fair() {
 0)               |                  dequeue_entity() {
 0)               |                    update_curr() {
 0)   0.183 us    |                      update_min_vruntime();
 0)   1.011 us    |                      cpuacct_charge();
 0)   2.380 us    |                    }
 0)   0.142 us    |                    update_cfs_rq_blocked_load();
 0)   0.067 us    |                    clear_buddies();
 0)               |                    account_entity_dequeue() {
 0)   0.386 us    |                      __cpu_to_node();
 0)   1.156 us    |                    }
 0)   0.070 us    |                    update_min_vruntime();
 0)   0.722 us    |                    update_cfs_shares();
 0)   7.507 us    |                  }
 0)   0.070 us    |                  hrtick_update();
 0)   8.903 us    |                }
 0) + 10.218 us   |              }
 0) + 10.833 us   |            }
 0)   0.161 us    |            put_prev_task_fair();
 0)               |            pick_next_task_fair() {
 0)   0.067 us    |              clear_buddies();
 0)               |              set_next_entity() {
 0)   0.202 us    |                update_stats_wait_end();
 0)   1.251 us    |              }
 0)   2.195 us    |            }
At this point, recvfrom/udp_recvmsg call is blocked, waiting for data to arrive. As seen above the task is deactivated (or) state changed to sleep and put on wait queue

Later when client sends data and it's received by NIC, e1000_intr get's called, which inturn calls __napi_schedule to copy data and raise soft IRQ

0)               |  e1000_intr [e1000]() {
 0)               |    __napi_schedule() {
 0)   0.151 us    |      __raise_softirq_irqoff();
 0)   1.400 us    |    }
 0) + 36.024 us   |  }
softIRQ later calls net_rx_action - which does most work, as you see in below trace
0)               |  net_rx_action() {
 0)               |    e1000_clean [e1000]() {
 0)               |      e1000_clean_rx_irq [e1000]() {
 0)   1.422 us    |        _raw_spin_lock_irqsave();
 0)   1.033 us    |        _raw_spin_lock_irqsave();
 0)   0.374 us    |        _raw_spin_unlock_irqrestore();
 0)   0.339 us    |        _raw_spin_unlock_irqrestore();
 0)               |        __netdev_alloc_skb() {
 0)   0.256 us    |          __netdev_alloc_frag();
 0)               |          build_skb() {
 0)               |            kmem_cache_alloc() {
 0)   0.471 us    |              should_failslab();
 0)   0.436 us    |              kmemleak_alloc();
 0)   2.641 us    |            }
 0)   3.653 us    |          }
 0)   5.250 us    |        }
 0)   0.082 us    |        skb_put();
 0)   0.153 us    |        e1000_rx_checksum.isra.25 [e1000]();
 0)   0.376 us    |        eth_type_trans();
 0)               |        napi_gro_receive() {
 0)   0.085 us    |          skb_gro_reset_offset();
 0)               |          dev_gro_receive() {
 0)               |            inet_gro_receive() {
 0)               |              udp4_gro_receive() {
 0)   0.259 us    |                udp_gro_receive();
 0)   1.060 us    |              }
 0)   2.804 us    |            }
 0)   5.094 us    |          }
 0)               |          netif_receive_skb() {
 0)               |            ktime_get_real() {
 0)               |              getnstimeofday() {
 0)   0.161 us    |                __getnstimeofday();
 0)   0.630 us    |              }
 0)   1.202 us    |            }
 0)               |            skb_defer_rx_timestamp() {
 0)   0.132 us    |              classify();
 0)   0.660 us    |            }
 0)               |            __netif_receive_skb() {
 0)               |              __netif_receive_skb_core() {
 0)               |                ip_rcv() {
 0)               |                  nf_hook_slow() {
 0)               |                    nf_iterate() {
 0)   0.087 us    |                      ip_sabotage_in [bridge]();
 0)   1.564 us    |                    }
 0)   2.997 us    |                  }
 0)               |                  ip_rcv_finish() {
 0)               |                    ip_route_input_noref() {
 0)               |                      fib_table_lookup() {
 0)   1.021 us    |                        check_leaf.isra.7();
 0)   3.657 us    |                      }
 0)               |                      fib_validate_source() {
 0)               |                        fib_table_lookup() {
 0)   0.243 us    |                          check_leaf.isra.7();
 0)   1.595 us    |                        }
 0)               |                        fib_table_lookup() {
 0)   0.564 us    |                          check_leaf.isra.7();
 0)   1.952 us    |                        }
 0)   5.546 us    |                      }
 0)   0.150 us    |                      __skb_dst_set_noref();
 0) + 13.506 us   |                    }
 0)               |                    ip_local_deliver() {
 0)               |                      ip_local_deliver_finish() {
 0)   0.235 us    |                        raw_local_deliver();
 0)               |                        udp_rcv() {
 0)               |                          __udp4_lib_rcv() {
 0)               |                            __skb_checksum_complete() {
 0)               |                              skb_checksum() {
 0)               |                                __skb_checksum() {
 0)               |                                  csum_partial() {
 0)   0.330 us    |                                    do_csum();
 0)   0.889 us    |                                  }
 0)   1.567 us    |                                }
 0)   2.146 us    |                              } /* skb_checksum */
 0)   2.729 us    |                            }
 0)   1.352 us    |                            __udp4_lib_lookup();
 0)               |                            udp_queue_rcv_skb() {
 0)               |                              ipv4_pktinfo_prepare() {
 0)   0.457 us    |                                fib_compute_spec_dst();
 0)   1.311 us    |                              }
 0)   1.334 us    |                              _raw_spin_lock();
 0)               |                              __udp_queue_rcv_skb() {
 0)               |                                sock_queue_rcv_skb() {
 0)               |                                  sk_filter() {
 0)               |                                    security_sock_rcv_skb() {
 0)               |                                      selinux_socket_sock_rcv_skb() {
 0)   0.204 us    |                                        netlbl_enabled();
 0)   1.537 us    |                                      }
 0)   2.632 us    |                                    }
 0)   3.922 us    |                                  }
 0)   0.387 us    |                                  __sk_mem_schedule();
 0)   0.854 us    |                                  _raw_spin_lock_irqsave();
 0)   0.499 us    |                                  _raw_spin_unlock_irqrestore();
 0)               |                                  sock_def_readable() {
 0)               |                                    __wake_up_sync_key() {
 0)   0.769 us    |                                      _raw_spin_lock_irqsave();
 0)               |                                      __wake_up_common() {
 0)               |                                        receiver_wake_function() {
 0)               |                                          autoremove_wake_function() {
 0)               |                                            default_wake_function() {
 0)               |                                              try_to_wake_up() {
 0)   0.954 us    |                                                _raw_spin_lock_irqsave();
 0)   0.217 us    |                                                task_waking_fair();
 0)   0.249 us    |                                                select_task_rq_fair();
 0)   0.833 us    |                                                _raw_spin_lock();
 0)               |                                                ttwu_do_activate.constprop.83() {
 0)               |                                                  activate_task() {
 0)               |                                                    enqueue_task() {
 0)   0.201 us    |                                                      update_rq_clock.part.72();
 0)               |                                                      enqueue_task_fair() {
 0)               |                                                        enqueue_entity() {
 0)   0.148 us    |                                                          update_curr();
 0)   0.080 us    |                                                          __compute_runnable_contrib.part.45();
 0)   0.251 us    |                                                          update_cfs_rq_blocked_load();
 0)               |                                                          account_entity_enqueue() {
 0)   0.464 us    |                                                            __cpu_to_node();
 0)   1.153 us    |                                                          }
 0)   0.369 us    |                                                          update_cfs_shares();
 0)   0.150 us    |                                                          place_entity();
 0)   0.240 us    |                                                          __enqueue_entity();
 0)   7.862 us    |                                                        }
 0)   0.235 us    |                                                        hrtick_update();
 0)   9.481 us    |                                                      } /* enqueue_task_fair */
 0) + 10.909 us   |                                                    }
 0) + 11.514 us   |                                                  }
 0)               |                                                  ttwu_do_wakeup() {
 0)               |                                                    check_preempt_curr() {
 0)   0.098 us    |                                                      resched_task();
 0)   0.762 us    |                                                    }
 0)   2.776 us    |                                                  }
 0) + 15.257 us   |                                                }
 0)   0.297 us    |                                                _raw_spin_unlock();
 0)   0.360 us    |                                                ttwu_stat();
 0)   0.373 us    |                                                _raw_spin_unlock_irqrestore();
 0) + 45.922 us   |                                              }
 0) + 46.438 us   |                                            }
 0) + 47.088 us   |                                          }
 0) + 47.682 us   |                                        }
 0) + 48.499 us   |                                      }
 0)   0.522 us    |                                      _raw_spin_unlock_irqrestore();
 0) + 51.418 us   |                                    }
 0) + 53.066 us   |                                  }
 0) + 61.798 us   |                                }
 0) + 62.568 us   |                              }
 0)   0.350 us    |                              _raw_spin_unlock();
 0) + 68.290 us   |                            }
 0) + 75.644 us   |                          }
 0) + 76.260 us   |                        }
 0) + 78.225 us   |                      }
 0) + 78.743 us   |                    }
 0) + 94.300 us   |                  }
 0) + 99.118 us   |                }
 0) ! 101.755 us  |              }
 0) ! 102.300 us  |            }
 0) ! 106.790 us  |          }
 0) ! 113.511 us  |        }
 0)               |        e1000_alloc_rx_buffers [e1000]() {
 0)   0.293 us    |          skb_trim();
 0)               |          nommu_map_page() {
 0)   0.177 us    |            check_addr();
 0)   0.711 us    |          }
 0)   0.666 us    |          _raw_spin_lock_irqsave();
 0)   0.253 us    |          _raw_spin_unlock_irqrestore();
 0)   0.472 us    |          _raw_spin_lock_irqsave();
 0)   0.238 us    |          _raw_spin_unlock_irqrestore();
 0)               |          save_stack_trace() {
 0)               |            dump_trace() {
 0)   0.067 us    |              save_stack_stack();
 0)               |              print_context_stack() {
 0)               |                __kernel_text_address() {
 0)               |                  is_module_text_address() {
 0)               |                    __module_text_address() {
 0)   0.153 us    |                      __module_address();
 0)   0.684 us    |                    }
 0)   1.314 us    |                  }
 0)   1.892 us    |                }
 0)   0.084 us    |                __kernel_text_address();
 0)   0.077 us    |                save_stack_address();
 0)               |                print_ftrace_graph_addr.isra.1.part.2() {
 0)   0.067 us    |                  save_stack_address();
 0)   0.541 us    |                }
 0)   0.073 us    |                __kernel_text_address();
 0)   0.080 us    |                save_stack_address();
 0)               |                print_ftrace_graph_addr.isra.1.part.2() {
 0)   0.081 us    |                  save_stack_address();
 0)   0.534 us    |                }
     .
     .
     .
 0)   3.415 us    |          _raw_spin_lock_irqsave();
 0)   1.336 us    |          _raw_spin_unlock_irqrestore();
 0)   0.959 us    |          _raw_spin_lock_irqsave();
 0)   0.628 us    |          _raw_spin_unlock_irqrestore();
 0) ! 687.078 us  |        }
 0) ! 821.321 us  |      }
 0)   0.363 us    |      e1000_update_itr [e1000]();
 0)   0.351 us    |      e1000_update_itr [e1000]();
 0)               |      napi_complete() {
 0)   0.354 us    |        napi_gro_flush();
 0)               |        __napi_complete() {
 0)   0.251 us    |          __smp_mb__before_atomic();
 0)   2.903 us    |        }
 0)   6.895 us    |      }
 0) ! 866.912 us  |    }
 0)   0.250 us    |    net_rps_action_and_irq_enable.isra.84();
 0) ! 872.356 us  |  }

In above trace, some important functions to note are
ip_rcv()
ip_rcv_finish()
ip_local_deliver()
ip_local_deliver_finish()
udp_rcv()
and also observe function calls to wake up any tasks that are sleeping on this data. As now our task is woken up, udp_recvmsg will continue to run now and below is the trace seen


0)               |            finish_task_switch() {
 0)   0.792 us    |              _raw_spin_unlock_irq();
 0)   4.152 us    |            }
 0) ! 11280427 us |          } /* __schedule */
 0) ! 11280429 us |        } /* schedule */
 0) ! 11280430 us |      } /* schedule_timeout */
 0)   0.304 us    |      finish_wait();
 0)   2.098 us    |      _raw_spin_lock_irqsave();
 0)   0.785 us    |      _raw_spin_unlock_irqrestore();
 0) ! 11280447 us |    } /* __skb_recv_datagram */
 0)               |    skb_copy_datagram_iovec() {
 0)               |      might_fault() {
 0)   0.922 us    |        __might_sleep();
 0)   5.446 us    |      }
 0)   9.181 us    |    }
 0)               |    skb_free_datagram_locked() {
 0)               |      lock_sock_fast() {
 0)   0.489 us    |        __might_sleep();
 0)   0.261 us    |        _cond_resched();
 0)               |        _raw_spin_lock_bh() {
 0)               |          local_bh_disable() {
 0)   0.320 us    |            __local_bh_disable();
 0)   1.851 us    |          }
 0)   4.977 us    |        }
 0) + 11.082 us   |      }
 0)   0.484 us    |      sock_rfree();
 0)               |      _raw_spin_unlock_bh() {
 0)   0.651 us    |        local_bh_enable_ip();
 0)   2.742 us    |      }
 0)               |      __kfree_skb() {
 0)               |        skb_release_all() {
 0)   0.280 us    |          skb_release_head_state();
 0)               |          skb_release_data() {
 0)               |            skb_free_head() {
 0)               |              put_page() {
 0)   0.250 us    |                put_compound_page();
 0)   1.897 us    |              }
 0)   4.156 us    |            }
 0)   5.770 us    |          }
 0)   8.950 us    |        }
 0)               |        kfree_skbmem() {
 0)               |          kmem_cache_free() {
 0)   1.042 us    |            kmemleak_free();
 0)   1.360 us    |            _raw_spin_lock_irqsave();
 0)   0.852 us    |            _raw_spin_unlock_irqrestore();
 0)   9.442 us    |          }
 0) + 11.503 us   |        }
 0) + 23.744 us   |      }
 0) + 45.085 us   |    }
 0) ! 11280507 us |  } /* udp_recvmsg */

will look at UDP transmit in the next blog entry

Linux ftrace: see underlying semantics of a system call

I always wondered what's happening under the hood of a system call. strace is a simple tool to see, what syscalls are being called by an application (with lot's of performance impact on app), however to know what's really going on beyond syscall we need ftrace

ftrace is an internal tracer designed to help out developers and designers of systems to find what is going on inside the kernel. ftrace uses the debugfs file system to hold the control files as well as the files to display output

Kernel documentation on ftrace could be found here
Brenden Gregg's lwn article talks about using ftrace and his perf-tools

Below is an example trace of read(2) syscall using function_graph tracer
[root@mylinux1 tracing]# pwd
/sys/kernel/debug/tracing
[root@mylinux1 tracing]# echo SyS_read > set_graph_function
[root@mylinux1 tracing]# echo function_graph > current_tracer 
[root@mylinux1 tracing]# echo 1 > tracing_on 
Observe tracer output in trace or trace_pipe files. Below output shows kernel code path
taken for read(2) syscall on my Linux VM running 3.10.0-229 kernel
 
 0)               |  SyS_read() {
 0)   0.261 us    |    fget_light();
 0)               |    vfs_read() {
 0)               |      rw_verify_area() {
 0)               |        security_file_permission() {
 0)               |          selinux_file_permission() {
 0)   0.066 us    |            avc_policy_seqno();
 0)   0.581 us    |          }
 0)               |          __fsnotify_parent() {
 0)               |            dget_parent() {
 0)   0.580 us    |              _raw_spin_lock();
 0)   0.180 us    |              _raw_spin_unlock();
 0)   2.213 us    |            }
 0)               |            dput() {
 0)   0.265 us    |              _raw_spin_lock();
 0)   0.172 us    |              _raw_spin_unlock();
 0)   1.318 us    |            }
 0)   4.431 us    |          }
 0)   0.080 us    |          fsnotify();
 0)   6.372 us    |        }
 0)   6.803 us    |      }
 0)               |      do_sync_read() {
 0)               |        xfs_file_aio_read [xfs]() {
 0)   0.065 us    |          generic_segment_checks();
 0)               |          xfs_ilock [xfs]() {
 0)               |            down_read_nested() {
 0)   0.142 us    |              __might_sleep();
 0)   0.058 us    |              _cond_resched();
 0)   1.416 us    |            }
 0)   1.919 us    |          }
 0)               |          generic_file_aio_read() {
 0)   0.072 us    |            generic_segment_checks();
 0)   0.153 us    |            __might_sleep();
 0)   0.059 us    |            _cond_resched();
 0)   0.635 us    |            __find_get_page();
 0)   0.070 us    |            mark_page_accessed();
 0)               |            file_read_actor() {
 0)               |              do_page_fault() {
 0)               |                __do_page_fault() {
 0)   0.276 us    |                  down_read_trylock();
 0)   0.164 us    |                  __might_sleep();
 0)   0.064 us    |                  _cond_resched();
 0)   0.163 us    |                  find_vma();
 0)               |                  handle_mm_fault() {
 0)   0.499 us    |                    __mem_cgroup_count_vm_event();
 0)   0.112 us    |                    sync_mm_rss();
 0)               |                    anon_vma_prepare() {
 0)   0.203 us    |                      __might_sleep();
 0)   0.059 us    |                      _cond_resched();
 0)   1.284 us    |                    }
 0)               |                    alloc_pages_vma() {
 0)   0.095 us    |                      get_vma_policy();
 0)   0.070 us    |                      policy_nodemask();
 0)   0.081 us    |                      policy_zonelist();
 0)               |                      __alloc_pages_nodemask() {
 0)   0.136 us    |                        __might_sleep();
 0)   0.059 us    |                        _cond_resched();
 0)   0.069 us    |                        next_zones_zonelist();
 0)               |                        get_page_from_freelist() {
 0)   0.073 us    |                          next_zones_zonelist();
 0)   0.061 us    |                          __zone_watermark_ok();
 0)   0.060 us    |                          __mod_zone_page_state();
 0)               |                          zone_statistics() {
 0)   0.072 us    |                            __inc_zone_state();
 0)   0.064 us    |                            __inc_zone_state();
 0)   0.929 us    |                          }
 0)   0.060 us    |                          bad_range();
 0)               |                          mem_cgroup_bad_page_check() {
 0)               |                            lookup_page_cgroup_used() {
 0)   0.061 us    |                              lookup_page_cgroup();
 0)   0.694 us    |                            }
 0)   1.175 us    |                          }
 0)   5.841 us    |                        }
 0)   8.006 us    |                      }
 0) + 10.248 us   |                    }
 0)               |                    mem_cgroup_newpage_charge() {
 0)               |                      mem_cgroup_charge_common() {
 0)   0.549 us    |                        __mem_cgroup_try_charge();
 0)               |                        __mem_cgroup_commit_charge() {
 0)   0.095 us    |                          lookup_page_cgroup();
 0)   0.061 us    |                          mem_cgroup_charge_statistics.isra.26();
 0)   0.061 us    |                          memcg_check_events();
 0)   1.615 us    |                        }
 0)   3.113 us    |                      }
 0)   3.667 us    |                    }
 0)   0.497 us    |                    _raw_spin_lock();
 0)   0.058 us    |                    add_mm_counter_fast();
 0)               |                    page_add_new_anon_rmap() {
 0)               |                      __inc_zone_page_state() {
 0)   0.079 us    |                        __inc_zone_state();
 0)   0.586 us    |                      }
 0)               |                      lru_cache_add() {
 0)   0.062 us    |                        __lru_cache_add();
 0)   0.592 us    |                      }
 0)   2.218 us    |                    }
 0)   0.188 us    |                    _raw_spin_unlock();
 0) + 23.584 us   |                  }
 0)   0.182 us    |                  up_read();
 0) + 27.283 us   |                }
 0) + 27.703 us   |              }
 0) + 28.902 us   |            }
 0)   0.064 us    |            put_page();
 0)   0.143 us    |            __might_sleep();
 0)   0.058 us    |            _cond_resched();
 0)   0.443 us    |            __find_get_page();
 0)   0.063 us    |            put_page();
 0)               |            touch_atime() {
 0)               |              current_fs_time() {
 0)   0.071 us    |                current_kernel_time();
 0)   0.534 us    |              }
 0)   1.022 us    |            }
 0) + 36.860 us   |          }
 0)               |          xfs_iunlock [xfs]() {
 0)   0.197 us    |            up_read();
 0)   1.325 us    |          }
 0) + 41.959 us   |        }
 0) + 42.513 us   |      }
 0)               |      __fsnotify_parent() {
 0)               |        dget_parent() {
 0)   0.525 us    |          _raw_spin_lock();
 0)   0.200 us    |          _raw_spin_unlock();
 0)   2.022 us    |        }
 0)               |        dput() {
 0)   0.238 us    |          _raw_spin_lock();
 0)   0.201 us    |          _raw_spin_unlock();
 0)   1.299 us    |        }
 0)   4.228 us    |      }
 0)   0.069 us    |      fsnotify();
 0) + 55.277 us   |    }
 0) + 56.499 us   |  }