Mention backlog and most people immediately think of the listen backlog parameter in socket programming. But is that backlog the same backlog the Linux kernel handles internally?
int listen(int sockfd, int backlog)

From man listen:
The backlog argument defines the maximum length to which the queue
of pending connections for sockfd may grow. If a connection request arrives when the queue is full, the client may receive an
error with an indication of ECONNREFUSED or, if the underlying protocol supports retransmission, the request may be ignored
so that a later reattempt at connection succeeds.
In fact, since Linux kernel 2.2 the backlog argument controls the size of the accept queue, i.e. the queue of connections that have already completed the three-way handshake and are waiting to be accept()ed.
In Linux we can roughly think of the handshake as involving two queues. Let's first look at the two structures the kernel defines:
struct request_sock_queue {
    /* Accept queue: once the 3-way handshake completes, the request_sock
     * is moved here from the syn_table. */
    struct request_sock *rskq_accept_head;
    struct request_sock *rskq_accept_tail;
    rwlock_t            syn_wait_lock;
    u8                  rskq_defer_accept;
    /* 3 bytes hole, try to pack */
    struct listen_sock  *listen_opt;
};

struct listen_sock {
    u8  max_qlen_log;       /* 2^max_qlen_log bounds qlen; max of
                             * max_qlen_log is 10 (2^10 = 1024)      */
    /* 3 bytes hole, try to use */
    int qlen;               /* current number of entries in syn_table */
    int qlen_young;
    int clock_hand;
    u32 hash_rnd;
    u32 nr_table_entries;   /* number of syn_table buckets, max 512   */
    struct request_sock *syn_table[0];
};
Receive the client's SYN -> put the request into the syn_table -> the server replies with SYN-ACK -> receive the client's ACK -> move the request into the accept queue.
We can break the whole process into these five steps; putting the request into the syn_table and moving it into the accept queue are the two steps governed by backlog, and we will go through them in detail below.
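To make the two queues concrete, here is a minimal user-space sketch (my own illustration, not from the original post) of a server that passes a small backlog to listen(); the port number 8080 and the backlog value 5 are arbitrary choices for demonstration.

/* Minimal TCP server sketch: the backlog passed to listen() caps the
 * accept queue of fully established connections (subject to somaxconn,
 * as shown later in this article). Port and backlog are arbitrary. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    if (fd < 0) { perror("socket"); return 1; }

    int one = 1;
    setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(8080);

    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind");
        return 1;
    }

    /* Ask for an accept queue of at most 5 established connections. */
    if (listen(fd, 5) < 0) { perror("listen"); return 1; }

    for (;;) {
        /* Each accept() pops one connection off the accept queue. */
        int conn = accept(fd, NULL, NULL);
        if (conn < 0) { perror("accept"); continue; }
        close(conn);
    }
}

If many clients connect while the program never calls accept(), the accept queue fills up to roughly the backlog value (capped by somaxconn), and the kernel behaviour described in the rest of this article kicks in.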
Let's first briefly go over the TCP operation functions involved; everything below is for IPv4:
const struct inet_connection_sock_af_ops ipv4_specific = {
    .queue_xmit        = ip_queue_xmit,
    .send_check        = tcp_v4_send_check,
    .rebuild_header    = inet_sk_rebuild_header,
    .conn_request      = tcp_v4_conn_request,
    .syn_recv_sock     = tcp_v4_syn_recv_sock,
    .remember_stamp    = tcp_v4_remember_stamp,
    .net_header_len    = sizeof(struct iphdr),
    .setsockopt        = ip_setsockopt,
    .getsockopt        = ip_getsockopt,
    .addr2sockaddr     = inet_csk_addr2sockaddr,
    .sockaddr_len      = sizeof(struct sockaddr_in),
    .bind_conflict     = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
    .compat_setsockopt = compat_ip_setsockopt,
    .compat_getsockopt = compat_ip_getsockopt,
#endif
};
What we mainly care about is the drop logic in tcp_v4_conn_request:
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
    /* Never answer to SYNs send to broadcast or multicast */
    if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
        goto drop;

    /* TW buckets are converted to open requests without
     * limitations, they conserve resources and peer is
     * evidently real one.
     */
    if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
#ifdef CONFIG_SYN_COOKIES
        if (sysctl_tcp_syncookies) {
            want_cookie = 1;
        } else
#endif
        goto drop;
    }

    /* Accept backlog is full. If we have already queued enough
     * of warm entries in syn queue, drop request. It is better than
     * clogging syn queue with openreqs with exponentially increasing
     * timeout.
     */
    if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
        goto drop;
    ....
}

1. inet_csk_reqsk_queue_is_full(sk)
What it evaluates is: queue->listen_opt->qlen >> queue->listen_opt->max_qlen_log;
Here qlen is the current length of listen_opt's syn_table. So what is max_qlen_log?
    nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
    nr_table_entries = max_t(u32, nr_table_entries, 8);
    nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);

    for (lopt->max_qlen_log = 3;
         (1 << lopt->max_qlen_log) < nr_table_entries;
         lopt->max_qlen_log++);
In other words, the table size is the smaller of the backlog passed to listen() and sysctl_max_syn_backlog, clamped to a minimum of 8 and rounded up to the next power of two, and max_qlen_log is the corresponding exponent. sysctl_max_syn_backlog is the familiar
/proc/sys/net/ipv4/tcp_max_syn_backlog
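The effective limit is easy to reproduce in user space. The sketch below (my own illustration, not kernel code) mirrors the arithmetic above; round_up_pow2() is a stand-in for the kernel's roundup_pow_of_two(), and both input values are just examples.

/* Illustration only: mirrors the nr_table_entries / max_qlen_log
 * arithmetic from reqsk_queue_alloc() shown above. */
#include <stdio.h>

static unsigned int round_up_pow2(unsigned int x)
{
    unsigned int p = 1;
    while (p < x)
        p <<= 1;
    return p;
}

int main(void)
{
    unsigned int backlog = 128;              /* value passed to listen()            */
    unsigned int tcp_max_syn_backlog = 512;  /* example tcp_max_syn_backlog setting */

    unsigned int nr = backlog < tcp_max_syn_backlog ? backlog : tcp_max_syn_backlog;
    if (nr < 8)
        nr = 8;
    nr = round_up_pow2(nr + 1);

    unsigned int max_qlen_log = 3;
    while ((1u << max_qlen_log) < nr)
        max_qlen_log++;

    printf("nr_table_entries = %u, max_qlen_log = %u (limit 2^%u = %u)\n",
           nr, max_qlen_log, max_qlen_log, 1u << max_qlen_log);
    return 0;
}

With backlog = 128 and the example sysctl value 512, this prints nr_table_entries = 256 and max_qlen_log = 8.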
Now let's look at how listen() is implemented in the kernel:
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
    struct socket *sock;
    int err, fput_needed;
    int somaxconn;

    sock = sockfd_lookup_light(fd, &err, &fput_needed);
    if (sock) {
        somaxconn = sock_net(sock->sk)->core.sysctl_somaxconn;
        if ((unsigned)backlog > somaxconn)
            backlog = somaxconn;

        err = security_socket_listen(sock, backlog);
        if (!err)
            err = sock->ops->listen(sock, backlog);

        fput_light(sock->file, fput_needed);
    }
    return err;
}
So the backlog is capped by somaxconn, whose value is defined in
/proc/sys/net/core/somaxconn
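As a quick check from user space, the sketch below (not part of the original article) reads /proc/sys/net/core/somaxconn and shows the effective backlog the kernel would use for a given request; the requested value 1024 is arbitrary.

/* Illustration: read somaxconn and show how a requested backlog would
 * be clamped, mirroring the check in SYSCALL_DEFINE2(listen) above. */
#include <stdio.h>

int main(void)
{
    unsigned int somaxconn = 0;
    FILE *f = fopen("/proc/sys/net/core/somaxconn", "r");
    if (!f || fscanf(f, "%u", &somaxconn) != 1) {
        perror("somaxconn");
        return 1;
    }
    fclose(f);

    unsigned int requested = 1024;  /* example backlog passed to listen() */
    unsigned int effective = requested > somaxconn ? somaxconn : requested;

    printf("somaxconn=%u requested=%u effective=%u\n",
           somaxconn, requested, effective);
    return 0;
}

You can compare the output with a plain cat of /proc/sys/net/core/somaxconn.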
2. sk_acceptq_is_full
static inline int sk_acceptq_is_full(struct sock *sk)
{
    return sk->sk_ack_backlog > sk->sk_max_ack_backlog;
}

int inet_listen(struct socket *sock, int backlog)
{
    ...
    sk->sk_max_ack_backlog = backlog;
    ...
}

sk_max_ack_backlog is exactly the (somaxconn-capped) backlog value we just saw the listen() syscall pass down.
3. inet_csk_reqsk_queue_young
Alongside the sk_acceptq_is_full check, the condition also requires inet_csk_reqsk_queue_young(sk) > 1, i.e. the qlen_young field of the listen_sock structure shown earlier.
qlen_young counts entries in the syn_table: it is incremented when a SYN arrives and decremented when the request leaves the syn_table for the accept queue. (Strictly speaking it only counts the "young" entries, i.e. requests whose SYN-ACK has not yet been retransmitted.)
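To see how these two checks interact, here is a purely illustrative user-space toy model (my own, not kernel code); struct toy_listener, would_drop_syn and the numbers used are invented stand-ins for the kernel's bookkeeping.

/* Toy model (illustration only) of the check
 *   if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
 *       goto drop;
 * The struct below is a simplified stand-in, not a kernel structure. */
#include <stdbool.h>
#include <stdio.h>

struct toy_listener {
    int acceptq_len;   /* established connections waiting for accept() */
    int acceptq_max;   /* min(listen backlog, somaxconn)               */
    int qlen_young;    /* "young" entries in the syn queue             */
};

static bool would_drop_syn(const struct toy_listener *l)
{
    bool acceptq_full = l->acceptq_len > l->acceptq_max;
    return acceptq_full && l->qlen_young > 1;
}

int main(void)
{
    struct toy_listener l = { .acceptq_len = 6, .acceptq_max = 5, .qlen_young = 0 };

    /* Three SYNs arrive while the accept queue is already over its limit. */
    for (int i = 0; i < 3; i++) {
        if (would_drop_syn(&l)) {
            printf("SYN #%d: qlen_young=%d -> drop\n", i + 1, l.qlen_young);
        } else {
            l.qlen_young++;   /* request is queued in the syn queue */
            printf("SYN #%d: qlen_young=%d -> queued\n", i + 1, l.qlen_young);
        }
    }
    return 0;
}

With the accept queue already over its limit, the first two SYNs are still queued and only the third is dropped, which matches the "> 1" in the kernel check.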
Some readers may now wonder:
If the accept queue is full, won't qlen_young just keep growing, so that every new SYN is dropped by the condition if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) and the client ends up with a connect timeout? Yet if you test this on Linux, you will find that this is not what actually happens.
The reason is that for a listening socket Linux runs tcp_keepalive_timer, which calls tcp_synack_timer, which in turn calls inet_csk_reqsk_queue_prune:
    if (sk->sk_state == TCP_LISTEN) {
        tcp_synack_timer(sk);
        goto out;
    }
static void tcp_synack_timer(struct sock *sk)
{
    inet_csk_reqsk_queue_prune(sk, TCP_SYNQ_INTERVAL,
                               TCP_TIMEOUT_INIT, TCP_RTO_MAX);
}
inet_csk_reqsk_queue_prune walks the syn table and removes the requests that have expired and exhausted their SYN-ACK retries.
struct request_sock {
    struct request_sock           *dl_next;     /* Must be first member! */
    u16                           mss;
    u8                            retrans;
    u8                            cookie_ts;    /* syncookie: encode tcpopts in timestamp */
    /* The following two fields can be easily recomputed I think -AK */
    u32                           window_clamp; /* window clamp at creation time */
    u32                           rcv_wnd;      /* rcv_wnd offered first time */
    u32                           ts_recent;
    unsigned long                 expires;      /* <-- expiry checked by the prune timer */
    const struct request_sock_ops *rsk_ops;
    struct sock                   *sk;
    u32                           secid;
    u32                           peer_secid;
};
As for the retries, the number of SYN-ACK retransmissions is controlled by
/proc/sys/net/ipv4/tcp_synack_retries
You can also set
/proc/sys/net/ipv4/tcp_abort_on_overflow
to 1, which disables the SYN-ACK retry behaviour and resets the connection instead when the accept queue overflows.
Because inet_csk_reqsk_queue_prune keeps cleaning up the syn_table, inet_csk_reqsk_queue_young(sk) > 1 essentially never holds unless there is real concurrency, so SYNs are not dropped and the client does not see a connect timeout. The Linux implementation differs considerably from the macOS one here.
From this walk through tcp_v4_conn_request we can see that Linux is designed to admit new connections whenever it reasonably can.
One might then ask: after the server has sent its SYN-ACK, what happens if the client replies with its ACK while the accept queue is full?
Going back to the steps listed earlier, the function that handles the client's ACK is:
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                                  struct request_sock *req,
                                  struct dst_entry *dst)
{
    struct inet_request_sock *ireq;
    struct inet_sock *newinet;
    struct tcp_sock *newtp;
    struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
    struct tcp_md5sig_key *key;
#endif

    if (sk_acceptq_is_full(sk))
        goto exit_overflow;

    if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
        goto exit;

    newsk = tcp_create_openreq_child(sk, req, skb);
    if (!newsk)
        goto exit;

    newsk->sk_gso_type = SKB_GSO_TCPV4;
    sk_setup_caps(newsk, dst);

    newtp                   = tcp_sk(newsk);
    newinet                 = inet_sk(newsk);
    ireq                    = inet_rsk(req);
    newinet->inet_daddr     = ireq->rmt_addr;
    newinet->inet_rcv_saddr = ireq->loc_addr;
    newinet->inet_saddr     = ireq->loc_addr;
    newinet->opt            = ireq->opt;
    ireq->opt               = NULL;
    newinet->mc_index       = inet_iif(skb);
    newinet->mc_ttl         = ip_hdr(skb)->ttl;
    inet_csk(newsk)->icsk_ext_hdr_len = 0;
    if (newinet->opt)
        inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
    newinet->inet_id = newtp->write_seq ^ jiffies;

    tcp_mtup_init(newsk);
    tcp_sync_mss(newsk, dst_mtu(dst));
    newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
    if (tcp_sk(sk)->rx_opt.user_mss &&
        tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
        newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;

    tcp_initialize_rcv_mss(newsk);

#ifdef CONFIG_TCP_MD5SIG
    /* Copy over the MD5 key from the original socket */
    key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
    if (key != NULL) {
        /*
         * We're using one, so create a matching key
         * on the newsk structure. If we fail to get
         * memory, then we end up not copying the key
         * across. Shucks.
         */
        char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
        if (newkey != NULL)
            tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
                              newkey, key->keylen);
        newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
    }
#endif

    __inet_hash_nolisten(newsk, NULL);
    __inet_inherit_port(sk, newsk);

    return newsk;

exit_overflow:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit:
    NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
    dst_release(dst);
    return NULL;
}
If the accept queue is full at this point, the function jumps to exit_overflow, incrementing LINUX_MIB_LISTENOVERFLOWS and then LINUX_MIB_LISTENDROPS; both counters can be observed with netstat -s.
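If you prefer to read the counters programmatically rather than via netstat -s, the sketch below (my own illustration) scans /proc/net/netstat, whose "TcpExt:" header/value line pair contains the ListenOverflows and ListenDrops fields.

/* Illustration: read ListenOverflows / ListenDrops from /proc/net/netstat.
 * The file contains "TcpExt:" header/value line pairs; we locate the
 * column names in the header line and pick the matching values. */
#include <stdio.h>
#include <string.h>

int main(void)
{
    FILE *f = fopen("/proc/net/netstat", "r");
    if (!f) { perror("/proc/net/netstat"); return 1; }

    char header[4096], values[4096];
    while (fgets(header, sizeof(header), f) && fgets(values, sizeof(values), f)) {
        if (strncmp(header, "TcpExt:", 7) != 0)
            continue;

        /* Walk the two lines field by field in lock-step. */
        char *hsave = NULL, *vsave = NULL;
        char *h = strtok_r(header, " \n", &hsave);
        char *v = strtok_r(values, " \n", &vsave);
        while (h && v) {
            if (strcmp(h, "ListenOverflows") == 0 || strcmp(h, "ListenDrops") == 0)
                printf("%s = %s\n", h, v);
            h = strtok_r(NULL, " \n", &hsave);
            v = strtok_r(NULL, " \n", &vsave);
        }
        break;
    }
    fclose(f);
    return 0;
}

A non-zero ListenOverflows value means the accept queue has overflowed at least once since boot.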
In tcp_v4_syn_recv_sock we see tcp_create_openreq_child; only at this point is a new socket cloned. In other words, Linux creates the new socket only after the three-way handshake has completed; the socket used throughout the handshake is the server's listening socket, which stays in the single state TCP_LISTEN. The cloned child socket starts out in TCP_SYN_RECV and is then switched to TCP_ESTABLISHED in tcp_rcv_state_process:
    case TCP_SYN_RECV:
        if (acceptable) {
            tp->copied_seq = tp->rcv_nxt;
            smp_mb();
            tcp_set_state(sk, TCP_ESTABLISHED);
So where do the SYN_RECV entries in /proc/net/tcp come from? For requests still sitting in the syn table they are printed by get_openreq4, which hard-codes the state field to TCP_SYN_RECV:
static void get_openreq4(struct sock *sk, struct request_sock *req,
                         struct seq_file *f, int i, int uid, int *len)
{
    const struct inet_request_sock *ireq = inet_rsk(req);
    int ttd = req->expires - jiffies;

    seq_printf(f, "%4d: %08X:%04X %08X:%04X"
        " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
        i,
        ireq->loc_addr,
        ntohs(inet_sk(sk)->inet_sport),
        ireq->rmt_addr,
        ntohs(ireq->rmt_port),
        TCP_SYN_RECV,          /* state is hard-coded here */
        0, 0, /* could print option size, but that is af dependent. */
        1,    /* timers active (only the expire timer) */
        jiffies_to_clock_t(ttd),
        req->retrans,
        uid,
        0,  /* non standard timer */
        0,  /* open_requests have no inode */
        atomic_read(&sk->sk_refcnt),
        req,
        len);
}
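As a small illustration (not from the original post), the sketch below scans /proc/net/tcp and prints the lines whose state column is 03, which corresponds to TCP_SYN_RECV, i.e. the half-open requests formatted by get_openreq4.

/* Illustration: list half-open (SYN_RECV) entries from /proc/net/tcp.
 * State 0x03 corresponds to TCP_SYN_RECV. */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/net/tcp", "r");
    if (!f) { perror("/proc/net/tcp"); return 1; }

    char line[512];
    if (!fgets(line, sizeof(line), f)) {    /* skip the header line */
        fclose(f);
        return 0;
    }

    while (fgets(line, sizeof(line), f)) {
        char local[64], remote[64];
        unsigned int sl, state;
        /* fields: sl local_address rem_address st ... */
        if (sscanf(line, " %u: %63s %63s %x", &sl, local, remote, &state) == 4 &&
            state == 0x03)
            printf("SYN_RECV  local=%s  remote=%s\n", local, remote);
    }
    fclose(f);
    return 0;
}

Addresses are shown in the kernel's hexadecimal form, exactly as the seq_printf format above emits them.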
Original article: http://blog.csdn.net/raintungli/article/details/37913765