How is data flowing through the various stacks?
ftrace gives us the ability to look deep into kernel code and figure out exactly what happens. In this blog entry, I look at the UDP receive path. To generate the traffic, I used a simple Perl script to send/receive data on a specified port over UDP. On my test VM, the NIC was using the e1000 driver.
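The scripts themselves are not included in this post; a minimal sketch of what the receive side (us.pl, seen in the strace below) might look like, assuming the standard IO::Socket::INET module and the port 5000 that shows up in the bind() call:

#!/usr/bin/perl
# Sketch of a us.pl-style UDP server (reconstruction, not the original script):
# bind a UDP socket on port 5000 and block waiting for datagrams.
use strict;
use warnings;
use IO::Socket::INET;

my $sock = IO::Socket::INET->new(
    LocalPort => 5000,     # same port seen in the strace output below
    Proto     => 'udp',
) or die "socket: $!";

while (1) {
    my $buf;
    $sock->recv($buf, 1024);   # blocks here in the recvfrom() syscall
    print "received: $buf\n";
}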
Using strace, I could see that my UDP server is blocked in the recvfrom syscall:
[cv@mylinux1 ~]$ strace -e network -f ./us.pl
socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP) = 3
bind(3, {sa_family=AF_INET, sin_port=htons(5000), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
recvfrom(3,

For UDP, recvfrom in turn calls 'udp_recvmsg' in the kernel. Follow the steps below to enable graph tracing of the three functions of interest in ftrace:
[root@mylinux1 tracing]# pwd
/sys/kernel/debug/tracing
[root@mylinux1 tracing]# grep e1000 available_filter_functions | grep -i intr
e1000_intr [e1000]
e1000_test_intr [e1000]
[root@mylinux1 tracing]# grep udp_recv available_filter_functions
udp_recvmsg
[root@mylinux1 tracing]# grep net_rx_action available_filter_functions
net_rx_action
[root@mylinux1 tracing]# echo "e1000_intr net_rx_action udp_recvmsg" > set_graph_function
[root@mylinux1 tracing]# cat set_graph_function
e1000_intr [e1000]
net_rx_action
udp_recvmsg
[root@mylinux1 tracing]# echo function_graph > current_tracer
Below is the trace captured (after some manual edits):
0) | udp_recvmsg() {
0) | __skb_recv_datagram() {
0) 0.843 us | _raw_spin_lock_irqsave();
0) 0.379 us | _raw_spin_unlock_irqrestore();
0) | prepare_to_wait_exclusive() {
0) 0.815 us | _raw_spin_lock_irqsave();
0) 0.295 us | _raw_spin_unlock_irqrestore();
0) 2.152 us | }
0) | schedule_timeout() {
0) | schedule() {
0) | __schedule() {
0) 0.472 us | _raw_spin_lock_irq();
0) | deactivate_task() {
0) | dequeue_task() {
0) 0.110 us | update_rq_clock.part.72();
0) | dequeue_task_fair() {
0) | dequeue_entity() {
0) | update_curr() {
0) 0.183 us | update_min_vruntime();
0) 1.011 us | cpuacct_charge();
0) 2.380 us | }
0) 0.142 us | update_cfs_rq_blocked_load();
0) 0.067 us | clear_buddies();
0) | account_entity_dequeue() {
0) 0.386 us | __cpu_to_node();
0) 1.156 us | }
0) 0.070 us | update_min_vruntime();
0) 0.722 us | update_cfs_shares();
0) 7.507 us | }
0) 0.070 us | hrtick_update();
0) 8.903 us | }
0) + 10.218 us | }
0) + 10.833 us | }
0) 0.161 us | put_prev_task_fair();
0) | pick_next_task_fair() {
0) 0.067 us | clear_buddies();
0) | set_next_entity() {
0) 0.202 us | update_stats_wait_end();
0) 1.251 us | }
0) 2.195 us | }

At this point, the recvfrom/udp_recvmsg call is blocked, waiting for data to arrive. As seen above, the task is deactivated, its state changed to sleeping, and it is put on a wait queue.
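On the sending side, all the client has to do is send a datagram to that port to wake the server up. A hypothetical client sketch (the original client script is not shown in this post), again using IO::Socket::INET, with the host name being an assumption:

#!/usr/bin/perl
# Hypothetical UDP client sketch (not the original script): send one datagram
# to the server listening on port 5000 on the test VM.
use strict;
use warnings;
use IO::Socket::INET;

my $sock = IO::Socket::INET->new(
    PeerAddr => 'mylinux1',   # assumed hostname/IP of the test VM
    PeerPort => 5000,
    Proto    => 'udp',
) or die "socket: $!";

$sock->send("hello");         # this send triggers the receive path traced below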
Later, when the client sends data and it is received by the NIC, e1000_intr gets called, which in turn calls __napi_schedule to copy the data and raise a soft IRQ:
0) | e1000_intr [e1000]() {
0) | __napi_schedule() {
0) 0.151 us | __raise_softirq_irqoff();
0) 1.400 us | }
0) + 36.024 us | }

The softirq handler later calls net_rx_action, which does most of the work, as you can see in the trace below:
0) | net_rx_action() {
0) | e1000_clean [e1000]() {
0) | e1000_clean_rx_irq [e1000]() {
0) 1.422 us | _raw_spin_lock_irqsave();
0) 1.033 us | _raw_spin_lock_irqsave();
0) 0.374 us | _raw_spin_unlock_irqrestore();
0) 0.339 us | _raw_spin_unlock_irqrestore();
0) | __netdev_alloc_skb() {
0) 0.256 us | __netdev_alloc_frag();
0) | build_skb() {
0) | kmem_cache_alloc() {
0) 0.471 us | should_failslab();
0) 0.436 us | kmemleak_alloc();
0) 2.641 us | }
0) 3.653 us | }
0) 5.250 us | }
0) 0.082 us | skb_put();
0) 0.153 us | e1000_rx_checksum.isra.25 [e1000]();
0) 0.376 us | eth_type_trans();
0) | napi_gro_receive() {
0) 0.085 us | skb_gro_reset_offset();
0) | dev_gro_receive() {
0) | inet_gro_receive() {
0) | udp4_gro_receive() {
0) 0.259 us | udp_gro_receive();
0) 1.060 us | }
0) 2.804 us | }
0) 5.094 us | }
0) | netif_receive_skb() {
0) | ktime_get_real() {
0) | getnstimeofday() {
0) 0.161 us | __getnstimeofday();
0) 0.630 us | }
0) 1.202 us | }
0) | skb_defer_rx_timestamp() {
0) 0.132 us | classify();
0) 0.660 us | }
0) | __netif_receive_skb() {
0) | __netif_receive_skb_core() {
0) | ip_rcv() {
0) | nf_hook_slow() {
0) | nf_iterate() {
0) 0.087 us | ip_sabotage_in [bridge]();
0) 1.564 us | }
0) 2.997 us | }
0) | ip_rcv_finish() {
0) | ip_route_input_noref() {
0) | fib_table_lookup() {
0) 1.021 us | check_leaf.isra.7();
0) 3.657 us | }
0) | fib_validate_source() {
0) | fib_table_lookup() {
0) 0.243 us | check_leaf.isra.7();
0) 1.595 us | }
0) | fib_table_lookup() {
0) 0.564 us | check_leaf.isra.7();
0) 1.952 us | }
0) 5.546 us | }
0) 0.150 us | __skb_dst_set_noref();
0) + 13.506 us | }
0) | ip_local_deliver() {
0) | ip_local_deliver_finish() {
0) 0.235 us | raw_local_deliver();
0) | udp_rcv() {
0) | __udp4_lib_rcv() {
0) | __skb_checksum_complete() {
0) | skb_checksum() {
0) | __skb_checksum() {
0) | csum_partial() {
0) 0.330 us | do_csum();
0) 0.889 us | }
0) 1.567 us | }
0) 2.146 us | } /* skb_checksum */
0) 2.729 us | }
0) 1.352 us | __udp4_lib_lookup();
0) | udp_queue_rcv_skb() {
0) | ipv4_pktinfo_prepare() {
0) 0.457 us | fib_compute_spec_dst();
0) 1.311 us | }
0) 1.334 us | _raw_spin_lock();
0) | __udp_queue_rcv_skb() {
0) | sock_queue_rcv_skb() {
0) | sk_filter() {
0) | security_sock_rcv_skb() {
0) | selinux_socket_sock_rcv_skb() {
0) 0.204 us | netlbl_enabled();
0) 1.537 us | }
0) 2.632 us | }
0) 3.922 us | }
0) 0.387 us | __sk_mem_schedule();
0) 0.854 us | _raw_spin_lock_irqsave();
0) 0.499 us | _raw_spin_unlock_irqrestore();
0) | sock_def_readable() {
0) | __wake_up_sync_key() {
0) 0.769 us | _raw_spin_lock_irqsave();
0) | __wake_up_common() {
0) | receiver_wake_function() {
0) | autoremove_wake_function() {
0) | default_wake_function() {
0) | try_to_wake_up() {
0) 0.954 us | _raw_spin_lock_irqsave();
0) 0.217 us | task_waking_fair();
0) 0.249 us | select_task_rq_fair();
0) 0.833 us | _raw_spin_lock();
0) | ttwu_do_activate.constprop.83() {
0) | activate_task() {
0) | enqueue_task() {
0) 0.201 us | update_rq_clock.part.72();
0) | enqueue_task_fair() {
0) | enqueue_entity() {
0) 0.148 us | update_curr();
0) 0.080 us | __compute_runnable_contrib.part.45();
0) 0.251 us | update_cfs_rq_blocked_load();
0) | account_entity_enqueue() {
0) 0.464 us | __cpu_to_node();
0) 1.153 us | }
0) 0.369 us | update_cfs_shares();
0) 0.150 us | place_entity();
0) 0.240 us | __enqueue_entity();
0) 7.862 us | }
0) 0.235 us | hrtick_update();
0) 9.481 us | } /* enqueue_task_fair */
0) + 10.909 us | }
0) + 11.514 us | }
0) | ttwu_do_wakeup() {
0) | check_preempt_curr() {
0) 0.098 us | resched_task();
0) 0.762 us | }
0) 2.776 us | }
0) + 15.257 us | }
0) 0.297 us | _raw_spin_unlock();
0) 0.360 us | ttwu_stat();
0) 0.373 us | _raw_spin_unlock_irqrestore();
0) + 45.922 us | }
0) + 46.438 us | }
0) + 47.088 us | }
0) + 47.682 us | }
0) + 48.499 us | }
0) 0.522 us | _raw_spin_unlock_irqrestore();
0) + 51.418 us | }
0) + 53.066 us | }
0) + 61.798 us | }
0) + 62.568 us | }
0) 0.350 us | _raw_spin_unlock();
0) + 68.290 us | }
0) + 75.644 us | }
0) + 76.260 us | }
0) + 78.225 us | }
0) + 78.743 us | }
0) + 94.300 us | }
0) + 99.118 us | }
0) ! 101.755 us | }
0) ! 102.300 us | }
0) ! 106.790 us | }
0) ! 113.511 us | }
0) | e1000_alloc_rx_buffers [e1000]() {
0) 0.293 us | skb_trim();
0) | nommu_map_page() {
0) 0.177 us | check_addr();
0) 0.711 us | }
0) 0.666 us | _raw_spin_lock_irqsave();
0) 0.253 us | _raw_spin_unlock_irqrestore();
0) 0.472 us | _raw_spin_lock_irqsave();
0) 0.238 us | _raw_spin_unlock_irqrestore();
0) | save_stack_trace() {
0) | dump_trace() {
0) 0.067 us | save_stack_stack();
0) | print_context_stack() {
0) | __kernel_text_address() {
0) | is_module_text_address() {
0) | __module_text_address() {
0) 0.153 us | __module_address();
0) 0.684 us | }
0) 1.314 us | }
0) 1.892 us | }
0) 0.084 us | __kernel_text_address();
0) 0.077 us | save_stack_address();
0) | print_ftrace_graph_addr.isra.1.part.2() {
0) 0.067 us | save_stack_address();
0) 0.541 us | }
0) 0.073 us | __kernel_text_address();
0) 0.080 us | save_stack_address();
0) | print_ftrace_graph_addr.isra.1.part.2() {
0) 0.081 us | save_stack_address();
0) 0.534 us | }
. . .
0) 3.415 us | _raw_spin_lock_irqsave();
0) 1.336 us | _raw_spin_unlock_irqrestore();
0) 0.959 us | _raw_spin_lock_irqsave();
0) 0.628 us | _raw_spin_unlock_irqrestore();
0) ! 687.078 us | }
0) ! 821.321 us | }
0) 0.363 us | e1000_update_itr [e1000]();
0) 0.351 us | e1000_update_itr [e1000]();
0) | napi_complete() {
0) 0.354 us | napi_gro_flush();
0) | __napi_complete() {
0) 0.251 us | __smp_mb__before_atomic();
0) 2.903 us | }
0) 6.895 us | }
0) ! 866.912 us | }
0) 0.250 us | net_rps_action_and_irq_enable.isra.84();
0) ! 872.356 us | }
In the above trace, some important functions to note are:
ip_rcv()
ip_rcv_finish()
ip_local_deliver()
ip_local_deliver_finish()
udp_rcv()

Also observe the function calls that wake up any task sleeping on this data. Now that our task has been woken up, udp_recvmsg continues to run, and below is the trace seen:
0) | finish_task_switch() {
0) 0.792 us | _raw_spin_unlock_irq();
0) 4.152 us | }
0) ! 11280427 us | } /* __schedule */
0) ! 11280429 us | } /* schedule */
0) ! 11280430 us | } /* schedule_timeout */
0) 0.304 us | finish_wait();
0) 2.098 us | _raw_spin_lock_irqsave();
0) 0.785 us | _raw_spin_unlock_irqrestore();
0) ! 11280447 us | } /* __skb_recv_datagram */
0) | skb_copy_datagram_iovec() {
0) | might_fault() {
0) 0.922 us | __might_sleep();
0) 5.446 us | }
0) 9.181 us | }
0) | skb_free_datagram_locked() {
0) | lock_sock_fast() {
0) 0.489 us | __might_sleep();
0) 0.261 us | _cond_resched();
0) | _raw_spin_lock_bh() {
0) | local_bh_disable() {
0) 0.320 us | __local_bh_disable();
0) 1.851 us | }
0) 4.977 us | }
0) + 11.082 us | }
0) 0.484 us | sock_rfree();
0) | _raw_spin_unlock_bh() {
0) 0.651 us | local_bh_enable_ip();
0) 2.742 us | }
0) | __kfree_skb() {
0) | skb_release_all() {
0) 0.280 us | skb_release_head_state();
0) | skb_release_data() {
0) | skb_free_head() {
0) | put_page() {
0) 0.250 us | put_compound_page();
0) 1.897 us | }
0) 4.156 us | }
0) 5.770 us | }
0) 8.950 us | }
0) | kfree_skbmem() {
0) | kmem_cache_free() {
0) 1.042 us | kmemleak_free();
0) 1.360 us | _raw_spin_lock_irqsave();
0) 0.852 us | _raw_spin_unlock_irqrestore();
0) 9.442 us | }
0) + 11.503 us | }
0) + 23.744 us | }
0) + 45.085 us | }
0) ! 11280507 us | } /* udp_recvmsg */
I will look at the UDP transmit path in the next blog entry.