ICode9

精准搜索请尝试: 精确搜索
首页 > 其他分享> 文章详细

聊一聊tcp 拥塞控制 相关数据结构

2021-11-20 01:00:08  阅读:214  来源: 互联网

标签:__ struct tcp 拥塞 u32 聊一聊 icsk out


struct tcp_sock {//在 inet_connection_sock  基础上增加了 滑动窗口 拥塞控制算法等tcp 专有 属性
    /* inet_connection_sock has to be the first member of tcp_sock */
    struct inet_connection_sock    inet_conn;
    u16    tcp_header_len;    /* Bytes of tcp header to send        */
    u16    gso_segs;    /* Max number of segs per GSO packet    */

/*
 *    Header prediction flags
 *    0x5?10 << 16 + snd_wnd in net byte order
 */
    __be32    pred_flags;/*首部预测标志 在接收到 syn 跟新窗口 等时设置此标志 ,
    此标志和时间戳 序号等 用于判断执行 快速还是慢速路径*/
        
/*
 *    RFC793 variables by their proper names. This means you can
 *    read the code and the spec side by side (and laugh ...)
 *    See RFC793 and RFC1122. The RFC writes these in capitals.
 */
    u64    bytes_received;    /* RFC4898 tcpEStatsAppHCThruOctetsReceived
                 * sum(delta(rcv_nxt)), or how many bytes
                 * were acked.
                 */
    u32    segs_in;    /* RFC4898 tcpEStatsPerfSegsIn
                 * total number of segments in.
                 */
     u32    rcv_nxt;    /* What we want to receive next  等待接收的下一个序列号    */
    u32    copied_seq;    /* Head of yet unread data        */

/* rcv_nxt on last window update sent最早接收但没有确认的序号, 也就是接收窗口的左端,
        在发送ack的时候, rcv_nxt更新 因此rcv_wup 更新比rcv_nxt 滞后一些  */
    u32    rcv_wup;    

    u32    snd_nxt;    /* Next sequence we send 等待发送的下一个序列号        */
    u32    segs_out;    /* RFC4898 tcpEStatsPerfSegsOut
                 * The total number of segments sent.
                 */
    u64    bytes_acked;    /* RFC4898 tcpEStatsAppHCThruOctetsAcked
                 * sum(delta(snd_una)), or how many bytes
                 * were acked.
                 */
    struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */

     u32    snd_una;    /* First byte we want an ack for  最早一个未被确认的序号    */
     u32    snd_sml;    /* Last byte of the most recently transmitted small packet  最近发送一个小于mss的最后 一个字节序列号
    在成功发送, 如果报文小于mss,跟新这个字段 主要用来判断是否启用 nagle 算法*/
    u32    rcv_tstamp;    /* timestamp of last received ACK (for keepalives)  最近一次收到ack的时间 用于 tcp 保活*/
    u32    lsndtime;    /* timestamp of last sent data packet (for restart window) 最近一次发送 数据包时间*/
    u32    last_oow_ack_time;  /* timestamp of last out-of-window ACK */

    u32    tsoffset;    /* timestamp offset */

    struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
    unsigned long    tsq_flags;

    /* Data for direct copy to user cp 数据到用户进程的控制块 有用户缓存以及其长度 prequeue 队列 其内存*/
    struct {
        struct sk_buff_head    prequeue // tcp 段 缓冲到此队列 知道进程主动读取才真正的处理;
        struct task_struct    *task;
        struct msghdr        *msg;
        int            memory;// prequeue 当前消耗的内存
        int            len;// 用户缓存中 当前可以使用的缓存大小 
    } ucopy;

    u32    snd_wl1;    /* Sequence for window update记录跟新发送窗口的那个ack 段号 用来判断是否 需要跟新窗口
    如果后续收到ack大于snd_wll 则表示需要更新 窗口*/
    u32    snd_wnd;    /* The window we expect to receive 接收方 提供的窗口大小 也就是发送方窗口大小    */
    u32    max_window;    /* Maximal window ever seen from peer 接收方通告的最大窗口    */
    u32    mss_cache;    /* Cached effective mss, not including SACKS  发送方当前有效的mss*/

    u32    window_clamp;    /* Maximal window to advertise 滑动窗口最大值        */
    u32    rcv_ssthresh;    /* Current window clamp  当前接收窗口的阈值            */

    /* Information of the most recently (s)acked skb */
    struct tcp_rack {
        struct skb_mstamp mstamp; /* (Re)sent time of the skb */
        u8 advanced; /* mstamp advanced since last lost marking */
        u8 reord;    /* reordering detected */
    } rack;
    u16    advmss;        /* Advertised MSS本端能接收的 MSS 上限,在建立时用来通告对方            */
    u8    unused;
    u8    nonagle     : 4,/* Disable Nagle algorithm?  是否  开启 ngnagle 算法           */
        thin_lto    : 1,/* Use linear timeouts for thin streams */
        thin_dupack : 1,/* Fast retransmit on first dupack      */
        repair      : 1,
        frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
    u8    repair_queue;
    u8    do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
        syn_data:1,    /* SYN includes data */
        syn_fastopen:1,    /* SYN includes Fast Open option */
        syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
        syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
        save_syn:1,    /* Save headers of SYN packet */
        is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
    u32    tlp_high_seq;    /* snd_nxt at the time of TLP retransmit. */

/* RTT measurement */
    u32    srtt_us;    /* smoothed round trip time << 3 in usecs  平滑rtt*/
    u32    mdev_us;    /* medium deviation    rtt平均偏差        */
    u32    mdev_max_us;    /* maximal mdev for the last rtt period   rtt平均偏差最大值     */
    u32    rttvar_us;    /* smoothed mdev_max            */
    u32    rtt_seq;    /* sequence number to update rttvar  记录SND.UNA 计算rto 时比较SND.NUA是否已经给更新
    如果SND.UNA 跟新,则需要同时跟新rttval*/
    struct rtt_meas {
        u32 rtt, ts;    /* RTT in usec and sampling time in jiffies. */
    } rtt_min[3];

    u32    packets_out;    /* Packets which are "in flight"发送出去 没有被ack的数 (SND.NEXT -SND.UNA )*/
    u32    retrans_out;    /* Retransmitted packets out 重传还未得到确认的tcp数        重传并且还未得到确认的 TCP 段的数目*/
    u32    max_packets_out;  /* max packets_out in last window */
    u32    max_packets_seq;  /* right edge of max_packets_out flight */

    u16    urg_data;    /* Saved octet of OOB data and control flags 存放紧急数据以及控制标示 */
    u8    ecn_flags;    /* ECN status bits.            */
    u8    keepalive_probes; /* num of allowed keep alive probes 保活探测次数上限    */
    u32    reordering;    /* Packet reordering metric. tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;三个重复ACK报文时,触发快速重传 */
    u32    snd_up;        /* Urgent pointer 紧急数据指针 带外数据的序号        */

/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
    struct tcp_options_received rx_opt;

/*
 *    Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
     u32    snd_ssthresh;    /* Slow start size threshold 拥塞控制 满启动阈值        */
     u32    snd_cwnd;    /* Sending congestion window    当前拥塞窗口大小  ---发送的拥塞窗口    */
    u32    snd_cwnd_cnt;    /* Linear increase counter    自从上次调整拥塞窗口后 到目前位置接收到的
    总ack段数 如果该字段为0  表示调整拥塞窗口但是没有收到ack,调整拥塞窗口之后 收到ack段就回让
    snd_cwnd_cnt 加1 */
    u32    snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this  snd_cwnd  的最大值*/
    u32    snd_cwnd_used;//记录已经从队列发送而没有被ack的段数
    u32    snd_cwnd_stamp;//记录最近一次检验cwnd 的时间;     拥塞期间 每次会检验cwnd而调节拥塞窗口 ,
    //在非拥塞期间,为了防止应用层序造成拥塞窗口失效  因此在发送后 有必要检测cwnd
    u32    prior_cwnd;    /* Congestion window at start of Recovery.在进入 Recovery 状态时的拥塞窗口 */
    u32    prr_delivered;    /* Number of newly delivered packets to在恢复阶段给接收者新发送包的数量
                 * receiver in Recovery. */
    u32    prr_out;    /* Total number of pkts sent during Recovery.在恢复阶段一共发送的包的数量 */

     u32    rcv_wnd;    /* Current receiver window 当前接收窗口的大小        */
    u32    write_seq;    /* Tail(+1) of data held in tcp send buffer   已加入发送队列中的最后一个字节序号*/
    u32    notsent_lowat;    /* TCP_NOTSENT_LOWAT */
    u32    pushed_seq;    /* Last pushed seq, required to talk to windows */
    u32    lost_out;    /* Lost packets丢失的数据报            */
    u32    sacked_out;    /* SACK'd packets启用 SACK 时,通过 SACK 的 TCP 选项标识已接收到的段的数量。
                 不启用 SACK 时,标识接收到的重复确认的次数,该值在接收到确认新数据段时被清除。            */
    u32    fackets_out;    /* FACK'd packets    FACK'd packets 记录 SND.UNA 与 (SACK 选项中目前接收方收到的段中最高序号段) 之间的段数。FACK
            用 SACK 选项来计算丢失在网络中上的段数  lost_out=fackets_out-sacked_out  left_out=fackets_out  fackets_out = sack_out + lost_out
*/

    /* from STCP, retrans queue hinting */
    struct sk_buff* lost_skb_hint; /*在重传队列中, 缓存下次要标志的段*/
    struct sk_buff *retransmit_skb_hint;/* 表示将要重传的起始包*/

    /* OOO segments go in this list. Note that socket lock must be held,
     * as we do not use sk_buff_head lock.
     */
    struct sk_buff_head    out_of_order_queue;

    /* SACKs data, these 2 need to be together (see tcp_options_write) */
    struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
    struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

    struct tcp_sack_block recv_sack_cache[4];

    struct sk_buff *highest_sack;   /* skb just after the highest最大sack序列号
                     * skb with SACKed bit set
                     * (validity guaranteed only if
                     * sacked_out > 0)
                     */

    int     lost_cnt_hint;/* 已经标志了多少个段 */
    u32     retransmit_high;    /* L-bits may be on up to this seqno  表示将要重传的起始包 */

    u32    prior_ssthresh; /* ssthresh saved at recovery start表示前一个snd_ssthresh得大小    */
    u32    high_seq;    /* snd_nxt at onset of congestion拥塞开始时,snd_nxt的大----开始拥塞的时候下一个要发送的序号字节*/

    u32    retrans_stamp;    /* Timestamp of the last retransmit,
                 * also used in SYN-SENT to remember stamp of
                 * the first SYN. */
    u32    undo_marker;    /* snd_una upon a new recovery episode. 在使用 F-RTO 算法进行发送超时处理,或进入 Recovery 进行重传,
                    或进入 Loss 开始慢启动时,记录当时 SND.UNA, 标记重传起始点。它是检测是否可以进行拥塞控制撤销的条件之一,一般在完成
                    拥塞撤销操作或进入拥塞控制 Loss 状态后会清零。*/
    int    undo_retrans;    /* number of undoable retransmissions. 在恢复拥塞控制之前可进行撤销的重传段数。
                    在进入 FTRO 算法或 拥塞状态 Loss 时,清零,在重传时计数,是检测是否可以进行拥塞撤销的条件之一。*/
    u32    total_retrans;    /* Total retransmits for entire connection */

    u32    urg_seq;    /* Seq of received urgent pointer  紧急数据的序号 所在段的序号和紧急指针相加获得*/
    unsigned int        keepalive_time;      /* time before keep alive takes place */
    unsigned int        keepalive_intvl;  /* time interval between keep alive probes */

    int            linger2;

/* Receiver side RTT estimation */
    struct {
        u32    rtt;
        u32    seq;
        u32    time;
    } rcv_rtt_est;

/* Receiver queue space */
    struct {
        int    space;
        u32    seq;
        u32    time;
    } rcvq_space;

/* TCP-specific MTU probe information. */
    struct {
        u32          probe_seq_start;
        u32          probe_seq_end;
    } mtu_probe;
    u32    mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
               * while socket was owned by user.
               */

#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
    const struct tcp_sock_af_ops    *af_specific;

/* TCP MD5 Signature Option information */
    struct tcp_md5sig_info    __rcu *md5sig_info;
#endif

/* TCP fastopen related information */
    struct tcp_fastopen_request *fastopen_req;
    /* fastopen_rsk points to request_sock that resulted in this big
     * socket. Used to retransmit SYNACKs etc.
     */
    struct request_sock *fastopen_rsk;
    u32    *saved_syn;
};

发送端在检测数据包乱序是否超过乱序阈值(默认为 3,可通过 /proc/sys/net/ipv4/tcp_reordering 配置)时,会用到 fackets_out 和 sacked_out,这两个字段的含义如下:fackets_out 表示从 snd_una 到已收到的最高 SACK 序号之间的段数,sacked_out 表示已被接收方通过 SACK 确认的段数。

 

/* This is what the send packet queuing engine uses to pass
 * TCP per-packet control information to the transmission code.
 * We also store the host-order sequence numbers in here too.
 * This is 44 bytes if IPV6 is enabled.
 * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
 */
struct tcp_skb_cb {
    __u32        seq;        /* Starting sequence number    */
    __u32        end_seq;    /* SEQ + FIN + SYN + datalen    */
    union {
        /* Note : tcp_tw_isn is used in input path only
         *      (isn chosen by tcp_timewait_state_process())
         *
         *       tcp_gso_segs/size are used in write queue only,
         *      cf tcp_skb_pcount()/tcp_skb_mss()
         */
        __u32        tcp_tw_isn;
        struct {
            u16    tcp_gso_segs;
            u16    tcp_gso_size;
        };
    };
    __u8        tcp_flags;    /* TCP header flags. (tcp[13])    */

    __u8        sacked;        /* State flags for SACK/FACK.    */
#define TCPCB_SACKED_ACKED    0x01    /* SKB ACK'd by a SACK block: a SACK block has already acknowledged the segment held in this skb */
#define TCPCB_SACKED_RETRANS    0x02    /* SKB retransmitted        */
#define TCPCB_LOST        0x04    /* SKB is lost            */
#define TCPCB_TAGBITS        0x07    /* All tag bits: TCPCB_SACKED_ACKED | TCPCB_SACKED_RETRANS | TCPCB_LOST */
#define TCPCB_REPAIRED        0x10    /* SKB repaired (no skb_mstamp)    */
#define TCPCB_EVER_RETRANS    0x80    /* Ever retransmitted frame: tells whether the segment was retransmitted before */
#define TCPCB_RETRANS        (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
                TCPCB_REPAIRED)

    __u8        ip_dsfield;    /* IPv4 tos or IPv6 dsfield    */
    /* 1 byte hole */
    __u32        ack_seq;    /* Sequence number ACK'd    */
    union {
        struct inet_skb_parm    h4;
#if IS_ENABLED(CONFIG_IPV6)
        struct inet6_skb_parm    h6;
#endif
    /* NOTE(review): the listing is cut off here; the closing of this union
     * and of struct tcp_skb_cb is missing and must be restored from the
     * kernel source before this can compile.
     */

 

 2

 

/** inet_connection_sock - INET connection oriented sock
 *
 * @icsk_accept_queue:       FIFO of established children 
 * @icsk_bind_hash:       Bind node
 * @icsk_timeout:       Timeout
 * @icsk_retransmit_timer: Resend (no ack)
 * @icsk_rto:           Retransmit timeout
 * @icsk_pmtu_cookie:       Last pmtu seen by socket
 * @icsk_ca_ops:       Pluggable congestion control hook
 * @icsk_af_ops:       Operations which are AF_INET{4,6} specific
 * @icsk_ca_state:       Congestion control state
 * @icsk_retransmits:       Number of unrecovered [RTO] timeouts
 * @icsk_pending:       Scheduled timer event
 * @icsk_backoff:       Backoff
 * @icsk_syn_retries:      Number of allowed SYN (or equivalent) retries
 * @icsk_probes_out:       unanswered 0 window probes
 * @icsk_ext_hdr_len:       Network protocol overhead (IP/IPv6 options)
 * @icsk_ack:           Delayed ACK control data
 * @icsk_mtup:           MTU probing control data
 */
struct inet_connection_sock {
    /* inet_sock has to be the first member! */
    struct inet_sock      icsk_inet; /* common part shared by every inet_connection_sock */
    struct request_sock_queue icsk_accept_queue;  /* newly established child socks wait here until accept() picks them up */
    struct inet_bind_bucket      *icsk_bind_hash; /* points at the bind information for this socket */
    unsigned long          icsk_timeout; /* packet timeout used for retransmission; usually jiffies + icsk_rto */
     struct timer_list      icsk_retransmit_timer; /* icsk_pending tells whether this is acting as the retransmit timer or the persist timer */
     struct timer_list      icsk_delack_timer; /* delayed-ACK timer */
    __u32              icsk_rto; /* retransmit timeout; starts at TCP_TIMEOUT_INIT and is recomputed from network conditions */
    __u32              icsk_pmtu_cookie; /* most recently updated path MTU */
    const struct tcp_congestion_ops *icsk_ca_ops;
    const struct inet_connection_sock_af_ops *icsk_af_ops;
    unsigned int          (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
    __u8              icsk_ca_state:6, /* congestion control state */
                  icsk_ca_setsockopt:1,
                  icsk_ca_dst_locked:1;
    __u8              icsk_retransmits; /* number of timeout retransmissions */
    __u8              icsk_pending; /* which timer event is scheduled, e.g. ICSK_TIME_EARLY_RETRANS; distinguishes the retransmit timer, persist timer, etc. */
    __u8              icsk_backoff; /* exponent of the exponential backoff used to compute the next persist-timer value */
    __u8              icsk_syn_retries; /* allowed retransmissions of SYN / SYN+ACK during connection setup */
    __u8              icsk_probes_out; /* probe segments sent periodically (e.g. by the persist timer) that are still unanswered */
    __u16              icsk_ext_hdr_len;
    struct {
        __u8          pending;     /* ACK is pending: urgency and state of the ACK that needs to be sent */
        __u8          quick;     /* Scheduled number of quick acks (while in quick-ACK mode) */
        __u8          pingpong;     /* The session is interactive; toggles quick-ACK mode: 1 = delay ACKs, 0 = send ACKs quickly */
        __u8          blocked;     /* Delayed ACK was blocked by socket lock. Softirq and the user process
        cannot own the sk at the same time: if the delayed-ACK timer fires while the user owns the sk,
        the ACK must not be sent then and blocked is set to 1, meaning "send as soon as possible";
        once the received data has been copied to the user the ACK can go out immediately. */
        __u32          ato;         /* Predicted tick of soft clock: estimate of the ACK delay */
        unsigned long      timeout;     /* Currently scheduled timeout: the ACK is sent as soon as it expires */
        __u32          lrcvtime;     /* timestamp of last received data packet */
        __u16          last_seg_size; /* Size of last incoming segment; used to compute rcv_mss */
        __u16          rcv_mss;     /* MSS used for delayed ACK decisions; derived from recently received segments */ 
    } icsk_ack; /* delayed-ACK control block */
    struct {
        int          enabled; /* whether path MTU probing is enabled */

        /* Range of MTUs to search  */
        int          search_high;
        int          search_low;

        /* Information on the current probe: size of the MTU probe in
         * progress, used to tell whether probing has finished; starts at 0.
         */
        int          probe_size;

        u32          probe_timestamp;
    } icsk_mtup;
    u32              icsk_user_timeout;

    /* Private scratch area for the congestion control module:
     * 64 / sizeof(u64) u64 slots, ICSK_CA_PRIV_SIZE bytes in total.
     */
    u64              icsk_ca_priv[64 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE      (8 * sizeof(u64))
};

 

标签:__,struct,tcp,拥塞,u32,聊一聊,icsk,out
来源: https://www.cnblogs.com/codestack/p/15578849.html

本站声明: 1. iCode9 技术分享网(下文简称本站)提供的所有内容,仅供技术学习、探讨和分享;
2. 关于本站的所有留言、评论、转载及引用,纯属内容发起人的个人观点,与本站观点和立场无关;
3. 关于本站的所有言论和文字,纯属内容发起人的个人观点,与本站观点和立场无关;
4. 本站文章均是网友提供,不完全保证技术分享内容的完整性、准确性、时效性、风险性和版权归属;如您发现该文章侵犯了您的权益,可联系我们第一时间进行删除;
5. 本站为非盈利性的个人网站,所有内容不会用来进行牟利,也不会利用任何形式的广告来间接获益,纯粹是为了广大技术爱好者提供技术内容和技术思想的分享性交流网站。

专注分享技术,共同学习,共同进步。侵权联系[81616952@qq.com]

Copyright (C)ICode9.com, All Rights Reserved.

ICode9版权所有