标签:des style blog http color io os 使用 ar
/* Excerpt of tcp_v4_connect(): the "....." / "...." markers stand for code the article elided, including the declarations of inet, usin, tp, daddr, rt and err and the "failure" label. The visible path returns 0 once tcp_connect() has succeeded. */ 1: int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) 2: { 3: ..... 4: /* Record the destination address and destination port */ 5: inet->dport = usin->sin_port; 6: inet->daddr = daddr; 7: .... 8: /* Start with an MSS clamp of 536 */ 9: tp->rx_opt.mss_clamp = 536; 10: 11: /* Socket identity is still unknown (sport may be zero). 12: * However we set state to SYN-SENT and not releasing socket 13: * lock select source port, enter ourselves into the hash tables and 14: * complete initialization after this. 15: */ 16: tcp_set_state(sk, TCP_SYN_SENT);/* move to SYN-SENT */ 17: err = tcp_v4_hash_connect(sk);/* per the article: inserts the socket into the ehash table and picks a local port dynamically */ 18: if (err) 19: goto failure; 20: .... 21: if (!tp->write_seq)/* initial sequence number not chosen yet */ 22: /* Derive the initial sequence number from both addresses and ports */ 23: tp->write_seq = secure_tcp_sequence_number(inet->saddr, 24: inet->daddr, 25: inet->sport, 26: usin->sin_port); 27: 28: /* Seed the initial id from the sequence number XOR the current jiffies */ 29: inet->id = tp->write_seq ^ jiffies; 30: 31: /* Send the SYN segment */ 32: err = tcp_connect(sk); 33: rt = NULL; 34: if (err) 35: goto failure; 36: 37: return 0; 38: }
/* Excerpt of the accept() system call entry point: "....." / "...." mark code the article elided (including the declarations of err, address and len, and all error handling). The visible path looks up the listening socket, allocates a new socket, calls the accept op, optionally reports the peer address to user space, and maps the new socket to a file descriptor. */ 1: asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen) 2: { 3: struct socket *sock, *newsock; 4: ..... 5: sock = sockfd_lookup(fd, &err);/* look up the socket behind the listening descriptor */ 6: ..... 7: if (!(newsock = sock_alloc()))/* allocate a new socket to carry the connection with the client */ 8: ..... 9: /* Invoke the accept op of the listening socket; per the article this is inet_accept for TCP */ 10: err = sock->ops->accept(sock, newsock, sock->file->f_flags); 11: .... 12: if (upeer_sockaddr) {/* caller wants the peer's address and port */ 13: /* Ask the new socket's getname op for the peer address and port. NOTE(review): the failure branch is empty in this excerpt; its body appears to have been dropped by the article. */ 14: if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 2)<0) { 15: } 16: /* Copy the address out to user space */ 17: err = move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen); 18: } 19: ..... 20: if ((err = sock_map_fd(newsock)) < 0)/* allocate a file descriptor for the new connection; the failure handling is elided here */ 21: 22: return err; 23: }
[注]: 下面的tcp_accept()在内核2.6.32以后对应的函数为inet_csk_accept()。
/* Excerpt of tcp_accept(): removes the first request from the listening socket's accept queue and returns the sock attached to it. "...." marks code the article elided (declarations of tp, req, newsk and error, and the "out" label). */ 1: struct sock *tcp_accept(struct sock *sk, int flags, int *err) 2: { 3: .... 4: /* Find already established connection */ 5: if (!tp->accept_queue) {/* accept queue is empty: no new connection is available yet */ 6: long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);/* receive timeout to use; computed from the O_NONBLOCK flag */ 7: 8: if (!timeo)/* zero timeout: waiting is not allowed, so leave without a connection */ 9: goto out; 10: 11: /* The queue is still empty and waiting is allowed: sleep in wait_for_connect() with the timeout computed above */ 12: error = wait_for_connect(sk, timeo); 13: if (error) 14: goto out; 15: } 16: 17: /* Unlink the head request; clear the tail pointer when the queue becomes empty */ req = tp->accept_queue; 18: if ((tp->accept_queue = req->dl_next) == NULL) 19: tp->accept_queue_tail = NULL; 20: 21: /* Take the sock carried by the request, then release the request */ newsk = req->sk; 22: sk_acceptq_removed(sk); 23: tcp_openreq_fastfree(req); 24: .... 25: 26: return newsk; 27: }
1: /* 构造并发送SYN段 */ 2: int tcp_connect(struct sock *sk) 3: { 4: struct tcp_sock *tp = tcp_sk(sk); 5: struct sk_buff *buff; 6: 7: tcp_connect_init(sk);/* 初始化传输控制块中与连接相关的成员 */ 8: 9: /* 为SYN段分配报文并进行初始化 */ 10: buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); 11: if (unlikely(buff == NULL)) 12: return -ENOBUFS; 13: 14: /* Reserve space for headers. */ 15: skb_reserve(buff, MAX_TCP_HEADER); 16: 17: TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; 18: TCP_ECN_send_syn(sk, tp, buff); 19: TCP_SKB_CB(buff)->sacked = 0; 20: skb_shinfo(buff)->tso_segs = 1; 21: skb_shinfo(buff)->tso_size = 0; 22: buff->csum = 0; 23: TCP_SKB_CB(buff)->seq = tp->write_seq++; 24: TCP_SKB_CB(buff)->end_seq = tp->write_seq; 25: tp->snd_nxt = tp->write_seq; 26: tp->pushed_seq = tp->write_seq; 27: tcp_ca_init(tp); 28: 29: /* Send it off. */ 30: TCP_SKB_CB(buff)->when = tcp_time_stamp; 31: tp->retrans_stamp = TCP_SKB_CB(buff)->when; 32: 33: /* 将报文添加到发送队列上 */ 34: __skb_queue_tail(&sk->sk_write_queue, buff); 35: sk_charge_skb(sk, buff); 36: tp->packets_out += tcp_skb_pcount(buff); 37: /* 发送SYN段 */ 38: tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); 39: TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); 40: 41: /* Timer for repeating the SYN until an answer. */ 42: /* 启动重传定时器 */ 43: tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); 44: return 0; 45: } 46:
图: 服务端接收到SYN段后,发送SYN/ACK处理流程。
1: /* 向客户端发送SYN+ACK报文 */ 2: static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, 3: struct dst_entry *dst) 4: { 5: int err = -1; 6: struct sk_buff * skb; 7: 8: /* First, grab a route. */ 9: /* 查找到客户端的路由 */ 10: if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) 11: goto out; 12: 13: /* 根据路由、传输控制块、连接请求块中的构建SYN+ACK段 */ 14: skb = tcp_make_synack(sk, dst, req); 15: 16: if (skb) {/* 生成SYN+ACK段成功 */ 17: struct tcphdr *th = skb->h.th; 18: 19: /* 生成校验码 */ 20: th->check = tcp_v4_check(th, skb->len, 21: req->af.v4_req.loc_addr, 22: req->af.v4_req.rmt_addr, 23: csum_partial((char *)th, skb->len, 24: skb->csum)); 25: 26: /* 生成IP数据报并发送出去 */ 27: err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, 28: req->af.v4_req.rmt_addr, 29: req->af.v4_req.opt); 30: if (err == NET_XMIT_CN) 31: err = 0; 32: } 33: 34: out: 35: dst_release(dst); 36: return err; 37: } 38:
1: /* Process a segment received while the socket is in SYN_SENT (per the article, out-of-band data is not handled here). Return values visible in this body: 0 after the skb has been freed at the discard label, -1 after an immediate ACK was sent without freeing the skb, 1 on the reset_and_undo path. NOTE(review): how the caller interprets these values is not shown here. */ 2: static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, 3: struct tcphdr *th, unsigned len) 4: { 5: struct tcp_sock *tp = tcp_sk(sk); 6: int saved_clamp = tp->rx_opt.mss_clamp; 7: 8: /* Parse the TCP options into tp->rx_opt; saved_clamp lets the undo paths restore mss_clamp */ 9: tcp_parse_options(skb, &tp->rx_opt, 0); 10: 11: if (th->ack) {/* segment carries ACK */ 12: /* rfc793: 13: * "If the state is SYN-SENT then 14: * first check the ACK bit 15: * If the ACK bit is set 16: * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send 17: * a reset (unless the RST bit is set, if so drop 18: * the segment and return)" 19: * 20: * We do not send data with SYN, so that RFC-correct 21: * test reduces to: 22: */ 23: if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) 24: goto reset_and_undo; 25: 26: if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 27: !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, 28: tcp_time_stamp)) { 29: NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED); 30: goto reset_and_undo; 31: } 32: 33: /* Now ACK is acceptable. 34: * 35: * "If the RST bit is set 36: * If the ACK was acceptable then signal the user "error: 37: * connection reset", drop the segment, enter CLOSED state, 38: * delete TCB, and return." 39: */ 40: 41: if (th->rst) {/* ACK+RST: call tcp_reset() (per the article it sets the error code and closes the socket), then free the skb */ 42: tcp_reset(sk); 43: goto discard; 44: } 45: 46: /* rfc793: 47: * "fifth, if neither of the SYN or RST bits is set then 48: * drop the segment and return." 49: * 50: * See note below! 51: * --ANK(990513) 52: */ 53: if (!th->syn)/* a segment without SYN is not valid in SYN_SENT: undo the option parsing and drop it */ 54: goto discard_and_undo; 55: 56: /* rfc793: 57: * "If the SYN bit is on ... 58: * are acceptable then ... 59: * (our SYN has been ACKed), change the connection 60: * state to ESTABLISHED..." 
61: */ 62: 63: /* Pick up the explicit congestion notification state from the header flags */ 64: TCP_ECN_rcv_synack(tp, th); 65: if (tp->ecn_flags&TCP_ECN_OK)/* ECN is in use: set sk_no_largesend */ 66: sk->sk_no_largesend = 1; 67: 68: /* Set up the window-related members */ 69: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 70: tcp_ack(sk, skb, FLAG_SLOWPATH); 71: 72: /* Ok.. it's good. Set up sequence numbers and 73: * move to established. 74: */ 75: tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 76: tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 77: 78: /* RFC1323: The window in SYN & SYN/ACK segments is 79: * never scaled. 80: */ 81: tp->snd_wnd = ntohs(th->window); 82: tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); 83: 84: /* No window scaling negotiated: zero both scale factors and cap window_clamp at 65535 */ if (!tp->rx_opt.wscale_ok) { 85: tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; 86: tp->window_clamp = min(tp->window_clamp, 65535U); 87: } 88: 89: if (tp->rx_opt.saw_tstamp) {/* timestamp option seen: account for it in the header length and advmss */ 90: tp->rx_opt.tstamp_ok = 1; 91: tp->tcp_header_len = 92: sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 93: tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 94: tcp_store_ts_recent(tp); 95: } else { 96: tp->tcp_header_len = sizeof(struct tcphdr); 97: } 98: 99: /* When sack_ok and sysctl_tcp_fack are both set, OR 2 into sack_ok; then initialize the MSS-related members */ 100: if (tp->rx_opt.sack_ok && sysctl_tcp_fack) 101: tp->rx_opt.sack_ok |= 2; 102: 103: tcp_sync_mss(sk, tp->pmtu_cookie); 104: tcp_initialize_rcv_mss(sk); 105: 106: /* Remember, tcp_poll() does not lock socket! 107: * Change state from SYN-SENT only after copied_seq 108: * is initialized. */ 109: tp->copied_seq = tp->rcv_nxt; 110: mb(); 111: tcp_set_state(sk, TCP_ESTABLISHED); 112: 113: /* Make sure socket is routed, for correct metrics. */ 114: tp->af_specific->rebuild_header(sk); 115: 116: tcp_init_metrics(sk); 117: 118: /* Prevent spurious tcp_cwnd_restart() on first data 119: * packet. 
120: */ 121: tp->lsndtime = tcp_time_stamp; 122: 123: tcp_init_buffer_space(sk); 124: 125: /* If keepalive is enabled on the socket, start the keepalive timer */ 126: if (sock_flag(sk, SOCK_KEEPOPEN)) 127: tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); 128: 129: if (!tp->rx_opt.snd_wscale)/* header prediction: enabled only when snd_wscale is zero */ 130: __tcp_fast_path_on(tp, tp->snd_wnd); 131: else 132: tp->pred_flags = 0; 133: 134: if (!sock_flag(sk, SOCK_DEAD)) {/* socket is not SOCK_DEAD: notify whoever is waiting on it */ 135: sk->sk_state_change(sk); 136: sk_wake_async(sk, 0, POLL_OUT); 137: } 138: 139: /* Connection is established; decide between a delayed ACK and an immediate one */ 140: if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { 141: /* Save one ACK. Data will be ready after 142: * several ticks, if write_pending is set. 143: * 144: * It may be deleted, but with this feature tcpdumps 145: * look so _wonderfully_ clever, that I was not able 146: * to stand against the temptation 8) --ANK 147: */ 148: tcp_schedule_ack(tp); 149: tp->ack.lrcvtime = tcp_time_stamp; 150: tp->ack.ato = TCP_ATO_MIN; 151: tcp_incr_quickack(tp); 152: tcp_enter_quickack_mode(tp); 153: tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); 154: 155: /* Shared exit: every "goto discard" in this function lands here, frees the skb and returns 0 */ discard: 156: __kfree_skb(skb); 157: return 0; 158: } else {/* no delayed ACK needed: send the ACK right away */ 159: tcp_send_ack(sk); 160: } 161: return -1; 162: } 163: 164: /* No ACK in the segment */ 165: 166: if (th->rst) {/* RST without ACK: drop the segment after undoing the option parsing */ 167: /* rfc793: 168: * "If the RST bit is set 169: * 170: * Otherwise (no ACK) drop the segment and return." 171: */ 172: 173: goto discard_and_undo; 174: } 175: 176: /* PAWS check. */ 177: /* A segment that fails the PAWS check is dropped as well */ 178: if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0)) 179: goto discard_and_undo; 180: 181: /* SYN without ACK received in SYN_SENT: both ends are opening simultaneously */ 182: if (th->syn) { 183: /* We see SYN without ACK. It is attempt of 184: * simultaneous connect with crossed SYNs. 185: * Particularly, it can be connect to self. 
186: */ 187: tcp_set_state(sk, TCP_SYN_RECV);/* move to TCP_SYN_RECV */ 188: 189: if (tp->rx_opt.saw_tstamp) {/* set up the timestamp-related fields */ 190: tp->rx_opt.tstamp_ok = 1; 191: tcp_store_ts_recent(tp); 192: tp->tcp_header_len = 193: sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; 194: } else { 195: tp->tcp_header_len = sizeof(struct tcphdr); 196: } 197: 198: /* Initialize the receive sequence and window members */ 199: tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; 200: tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; 201: 202: /* RFC1323: The window in SYN & SYN/ACK segments is 203: * never scaled. 204: */ 205: tp->snd_wnd = ntohs(th->window); 206: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; 207: tp->max_window = tp->snd_wnd; 208: 209: TCP_ECN_rcv_syn(tp, th);/* pick up the explicit congestion notification state from the header flags */ 210: if (tp->ecn_flags&TCP_ECN_OK) 211: sk->sk_no_largesend = 1; 212: 213: /* Initialize the MSS-related members */ 214: tcp_sync_mss(sk, tp->pmtu_cookie); 215: tcp_initialize_rcv_mss(sk); 216: 217: /* Send SYN+ACK to the peer, then drop the received SYN segment */ 218: tcp_send_synack(sk); 219: #if 0 220: /* Note, we could accept data and URG from this segment. 221: * There are no obstacles to make this. 222: * 223: * However, if we ignore data in ACKless segments sometimes, 224: * we have no reasons to accept it sometimes. 225: * Also, seems the code doing it in step6 of tcp_rcv_state_process 226: * is not flawless. So, discard packet for sanity. 227: * Uncomment this return to process the data. 228: */ 229: return -1; 230: #else 231: goto discard; 232: #endif 233: } 234: /* "fifth, if neither of the SYN or RST bits is set then 235: * drop the segment and return." 236: */ 237: 238: /* Undo the option parsing, restore the saved MSS clamp, then free the skb */ discard_and_undo: 239: tcp_clear_options(&tp->rx_opt); 240: tp->rx_opt.mss_clamp = saved_clamp; 241: goto discard; 242: 243: /* Same undo, but return 1 without freeing the skb here */ reset_and_undo: 244: tcp_clear_options(&tp->rx_opt); 245: tp->rx_opt.mss_clamp = saved_clamp; 246: return 1; 247: } 248:
1: /* Per the article, this function handles received TCP segments in every state except ESTABLISHED and TIME_WAIT. This is an excerpt: "....." marks code the article elided, including the body and closing brace of the first switch and the other cases of the second one. Visible returns: 1 after a SYN-triggered reset or an unacceptable ACK in SYN_RECV, 0 otherwise. */ 2: int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, 3: struct tcphdr *th, unsigned len) 4: { 5: struct tcp_sock *tp = tcp_sk(sk); 6: int queued = 0; 7: 8: tp->rx_opt.saw_tstamp = 0; 9: 10: switch (sk->sk_state) { 11: ..... 12: /* The article labels what follows as the SYN_RECV handling */ 13: if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp &&/* parse the TCP options; continue only if a timestamp option was seen */ 14: tcp_paws_discard(tp, skb)) {/* PAWS check failed: the segment is to be dropped */ 15: if (!th->rst) {/* not a RST segment */ 16: /* Count the rejection and answer with a duplicate ACK before dropping the segment */ 17: NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); 18: tcp_send_dupack(sk, skb); 19: goto discard; 20: } 21: /* Reset is accepted even if it did not pass PAWS. */ 22: } 23: 24: /* step 1: check sequence number */ 25: if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) {/* sequence numbers are not acceptable */ 26: if (!th->rst)/* no RST flag: answer with a duplicate ACK */ 27: tcp_send_dupack(sk, skb); 28: goto discard; 29: } 30: 31: /* step 2: check RST bit */ 32: if(th->rst) {/* RST flag set: reset the connection */ 33: tcp_reset(sk); 34: goto discard; 35: } 36: 37: /* Update the stored timestamp if needed */ 38: tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); 39: 40: /* step 3: check security and precedence [ignored] */ 41: 42: /* step 4: 43: * 44: * Check for a SYN in window. 45: */ 46: if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) {/* SYN flag set and its sequence number is not before rcv_nxt */ 47: NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); 48: tcp_reset(sk);/* reset the connection */ 49: return 1; 50: } 51: 52: /* step 5: check the ACK field */ 53: if (th->ack) {/* ACK flag set */ 54: /* Let tcp_ack() judge whether this ACK is acceptable (for SYN_RECV: the third handshake segment) */ 55: int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); 56: 57: switch(sk->sk_state) { 58: case TCP_SYN_RECV: 59: if (acceptable) { 60: tp->copied_seq = tp->rcv_nxt; 61: mb(); 62: /* Valid third handshake segment: move the connection to TCP_ESTABLISHED */ 63: tcp_set_state(sk, TCP_ESTABLISHED); 64: sk->sk_state_change(sk); 65: 66: /* Note, that this wakeup is only for marginal 67: * crossed SYN case. 
Passively open sockets 68: * are not waked up, because sk->sk_sleep == 69: * NULL and sk->sk_socket == NULL. 70: */ 71: if (sk->sk_socket) {/* a socket is attached: wake up asynchronous waiters */ 72: sk_wake_async(sk,0,POLL_OUT); 73: } 74: 75: /* Initialize the send state from this ACK: snd_una, the scaled send window and the window-update bookkeeping */ 76: tp->snd_una = TCP_SKB_CB(skb)->ack_seq; 77: tp->snd_wnd = ntohs(th->window) << 78: tp->rx_opt.snd_wscale; 79: tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, 80: TCP_SKB_CB(skb)->seq); 81: 82: /* tcp_ack considers this ACK as duplicate 83: * and does not calculate rtt. 84: * Fix it at least with timestamps. 85: */ 86: if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && 87: !tp->srtt) 88: tcp_ack_saw_tstamp(tp, 0); 89: 90: if (tp->rx_opt.tstamp_ok) 91: tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; 92: 93: /* Make sure socket is routed, for 94: * correct metrics. 95: */ 96: /* Rebuild the route, then initialize the metrics (per the article: the congestion control state) */ 97: tp->af_specific->rebuild_header(sk); 98: 99: tcp_init_metrics(sk); 100: 101: /* Prevent spurious tcp_cwnd_restart() on 102: * first data packet. 103: */ 104: tp->lsndtime = tcp_time_stamp;/* record now as the time of the last send */ 105: 106: tcp_initialize_rcv_mss(sk); 107: tcp_init_buffer_space(sk); 108: tcp_fast_path_on(tp);/* set up the header prediction flags */ 109: } else { 110: return 1; 111: } 112: break; 113: ..... 114: } 115: } else 116: goto discard; 117: ..... 118: 119: /* step 6: check the URG bit */ 120: tcp_urg(sk, skb, th);/* check for urgent (out-of-band) data */ 121: 122: /* tcp_data could move socket to TIME-WAIT */ 123: if (sk->sk_state != TCP_CLOSE) {/* send any pending data and ACK from here */ 124: tcp_data_snd_check(sk); 125: tcp_ack_snd_check(sk); 126: } 127: 128: if (!queued) { /* the segment was not queued, or an earlier path jumped to discard: free it */ 129: discard: 130: __kfree_skb(skb); 131: } 132: return 0; 133: }
send() 直接调用了sendto().
1: /* 2: * Send a datagram down a socket. 3: */ 4: 5: SYSCALL_DEFINE4(send, int, fd, void __user *, buff, size_t, len, 6: unsigned, flags) 7: { 8: return sys_sendto(fd, buff, len, flags, NULL, 0); 9: }
1: /* 2: * Send a datagram to a given address. We move the address into kernel 3: * space and check the user space data area is readable before invoking 4: * the protocol. 5: */ 6: 7: SYSCALL_DEFINE6(sendto, int, fd, void __user *, buff, size_t, len, 8: unsigned, flags, struct sockaddr __user *, addr, 9: int, addr_len) 10: { 11: struct socket *sock; 12: struct sockaddr_storage address; 13: int err; 14: struct msghdr msg; 15: struct iovec iov; 16: int fput_needed; 17: 18: if (len > INT_MAX) 19: len = INT_MAX; 20: sock = sockfd_lookup_light(fd, &err, &fput_needed); 21: if (!sock) 22: goto out; 23: 24: /* 可以看出用户空间的buff直接赋给了iov.iov_base, iov.iov_len = len */ 25: iov.iov_base = buff; 26: iov.iov_len = len; 27: msg.msg_name = NULL; 28: msg.msg_iov = &iov; 29: msg.msg_iovlen = 1; 30: msg.msg_control = NULL; 31: msg.msg_controllen = 0; 32: msg.msg_namelen = 0; 33: if (addr) { 34: err = move_addr_to_kernel(addr, addr_len, (struct sockaddr *)&address); 35: if (err < 0) 36: goto out_put; 37: msg.msg_name = (struct sockaddr *)&address; 38: msg.msg_namelen = addr_len; 39: } 40: if (sock->file->f_flags & O_NONBLOCK) 41: flags |= MSG_DONTWAIT; 42: msg.msg_flags = flags; 43: err = sock_sendmsg(sock, &msg, len); 44: 45: out_put: 46: fput_light(sock->file, fput_needed); 47: out: 48: return err; 49: }
关键路径:
- 通过copy_from_user把用户的struct msghdr拷贝到内核的msg_sys。
- 再通过verify_iovec()把用户空间的iovec数组(注意不是buff中的数据本身)拷贝到内核的iovstack中,同时把目的地址拷贝到内核空间。
- 最后调用sock_sendmsg().
1: static int __sys_sendmsg(struct socket *sock, struct msghdr __user *msg, 2: struct msghdr *msg_sys, unsigned flags, 3: struct used_address *used_address) 4: { 5: struct compat_msghdr __user *msg_compat = 6: (struct compat_msghdr __user *)msg; 7: struct sockaddr_storage address; 8: struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; 9: unsigned char ctl[sizeof(struct cmsghdr) + 20] 10: __attribute__ ((aligned(sizeof(__kernel_size_t)))); 11: /* 20 is size of ipv6_pktinfo */ 12: unsigned char *ctl_buf = ctl; 13: int err, ctl_len, iov_size, total_len; 14: 15: err = -EFAULT; 16: if (MSG_CMSG_COMPAT & flags) { 17: if (get_compat_msghdr(msg_sys, msg_compat)) 18: return -EFAULT; 19: } 20: else if (copy_from_user(msg_sys, msg, sizeof(struct msghdr))) 21: return -EFAULT; 22: 23: /* do not move before msg_sys is valid */ 24: err = -EMSGSIZE; 25: if (msg_sys->msg_iovlen > UIO_MAXIOV) 26: goto out; 27: 28: /* Check whether to allocate the iovec area */ 29: err = -ENOMEM; 30: iov_size = msg_sys->msg_iovlen * sizeof(struct iovec); 31: if (msg_sys->msg_iovlen > UIO_FASTIOV) { 32: iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); 33: if (!iov) 34: goto out; 35: } 36: 37: /* This will also move the address data into kernel space */ 38: if (MSG_CMSG_COMPAT & flags) { 39: err = verify_compat_iovec(msg_sys, iov, 40: (struct sockaddr *)&address, 41: VERIFY_READ); 42: } else 43: err = verify_iovec(msg_sys, iov, 44: (struct sockaddr *)&address, 45: VERIFY_READ); 46: if (err < 0) 47: goto out_freeiov; 48: total_len = err; 49: 50: err = -ENOBUFS; 51: 52: if (msg_sys->msg_controllen > INT_MAX) 53: goto out_freeiov; 54: ctl_len = msg_sys->msg_controllen; 55: if ((MSG_CMSG_COMPAT & flags) && ctl_len) { 56: err = 57: cmsghdr_from_user_compat_to_kern(msg_sys, sock->sk, ctl, 58: sizeof(ctl)); 59: if (err) 60: goto out_freeiov; 61: ctl_buf = msg_sys->msg_control; 62: ctl_len = msg_sys->msg_controllen; 63: } else if (ctl_len) { 64: if (ctl_len > sizeof(ctl)) { 65: ctl_buf = 
sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); 66: if (ctl_buf == NULL) 67: goto out_freeiov; 68: } 69: err = -EFAULT; 70: /* 71: * Careful! Before this, msg_sys->msg_control contains a user pointer. 72: * Afterwards, it will be a kernel pointer. Thus the compiler-assisted 73: * checking falls down on this. 74: */ 75: if (copy_from_user(ctl_buf, (void __user *)msg_sys->msg_control, 76: ctl_len)) 77: goto out_freectl; 78: msg_sys->msg_control = ctl_buf; 79: } 80: msg_sys->msg_flags = flags; 81: 82: if (sock->file->f_flags & O_NONBLOCK) 83: msg_sys->msg_flags |= MSG_DONTWAIT; 84: /* 85: * If this is sendmmsg() and current destination address is same as 86: * previously succeeded address, omit asking LSM‘s decision. 87: * used_address->name_len is initialized to UINT_MAX so that the first 88: * destination address never matches. 89: */ 90: if (used_address && used_address->name_len == msg_sys->msg_namelen && 91: !memcmp(&used_address->name, msg->msg_name, 92: used_address->name_len)) { 93: err = sock_sendmsg_nosec(sock, msg_sys, total_len); 94: goto out_freectl; 95: } 96: err = sock_sendmsg(sock, msg_sys, total_len); 97: /* 98: * If this is sendmmsg() and sending to current destination address was 99: * successful, remember it. 100: */ 101: if (used_address && err >= 0) { 102: used_address->name_len = msg_sys->msg_namelen; 103: memcpy(&used_address->name, msg->msg_name, 104: used_address->name_len); 105: } 106: 107: out_freectl: 108: if (ctl_buf != ctl) 109: sock_kfree_s(sock->sk, ctl_buf, ctl_len); 110: out_freeiov: 111: if (iov != iovstack) 112: sock_kfree_s(sock->sk, iov, iov_size); 113: out: 114: return err; 115: } 116:
1: /* TCP-layer implementation of sendmsg: copies the caller's iovec data into skbs on the socket's write queue and pushes them out. Returns the number of bytes copied when any were, otherwise the value produced by sk_stream_error(). The socket lock is taken on entry and released on every return path. */ 2: int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 3: size_t size) 4: { 5: struct iovec *iov; 6: struct tcp_sock *tp = tcp_sk(sk); 7: struct sk_buff *skb; 8: int iovlen, flags; 9: int mss_now; 10: int err, copied; 11: long timeo; 12: 13: /* Take the socket lock */ 14: lock_sock(sk); 15: TCP_CHECK_TIMER(sk); 16: 17: /* Work out the blocking timeout from the message flags */ 18: flags = msg->msg_flags; 19: timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); 20: 21: /* Wait for a connection to finish. */ 22: if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))/* data may be sent only in these two states */ 23: if ((err = sk_stream_wait_connect(sk, &timeo)) != 0)/* any other state: wait for the connection to be established; bail out on failure */ 24: goto out_err; 25: 26: /* This should be in poll */ 27: clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); 28: 29: /* Get the current MSS; the second argument is zero when MSG_OOB is set (per the article, OOB sends cannot use TSO and so get the smaller MSS) */ 30: mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 31: 32: /* Ok commence sending. */ 33: /* Number of user data blocks and pointer to the first one */ 34: iovlen = msg->msg_iovlen; 35: iov = msg->msg_iov; 36: /* copied counts the bytes copied from the user blocks into skbs */ 37: copied = 0; 38: 39: err = -EPIPE; 40: /* Socket has a pending error or the send side is shut down: refuse with EPIPE */ 41: if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) 42: goto do_error; 43: 44: while (--iovlen >= 0) {/* walk every user data block */ 45: int seglen = iov->iov_len; 46: unsigned char __user *from = iov->iov_base; 47: 48: iov++; 49: 50: while (seglen > 0) {/* consume all the data of the current block */ 51: int copy; 52: 53: skb = sk->sk_write_queue.prev; 54: 55: if (!sk->sk_send_head ||/* nothing pending to send, so the skb fetched above is not usable */ 56: (copy = mss_now - skb->len) <= 0) {/* or the tail skb already holds mss_now bytes or more */ 57: 58: new_segment: 59: /* Allocate new segment. If the interface is SG, 60: * allocate skb fitting to single page. 
61: */ 62: if (!sk_stream_memory_free(sk))/* send buffer limit reached: wait for buffer space */ 63: goto wait_for_sndbuf; 64: 65: skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), 66: 0, sk->sk_allocation);/* allocate a new skb */ 67: if (!skb)/* allocation failed: wait for memory */ 68: goto wait_for_memory; 69: 70: /* 71: * Check whether we can use HW checksum. 72: */ 73: if (sk->sk_route_caps & 74: (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | 75: NETIF_F_HW_CSUM))/* the route's device capabilities decide whether checksumming is left to hardware */ 76: skb->ip_summed = CHECKSUM_HW; 77: 78: skb_entail(sk, tp, skb);/* append the skb to the tail of the write queue */ 79: copy = mss_now;/* a fresh skb can take up to one MSS */ 80: } 81: 82: /* Try to append data to the end of skb. */ 83: if (copy > seglen)/* never copy more than what is left in the current block */ 84: copy = seglen; 85: 86: /* Where to copy to? */ 87: if (skb_tailroom(skb) > 0) {/* the linear area of the skb still has tail room */ 88: /* We have some space in skb head. Superb! */ 89: if (copy > skb_tailroom(skb))/* limit this copy to the remaining tail room */ 90: copy = skb_tailroom(skb); 91: /* Copy that many bytes from user space into the skb; bail out on failure */ 92: if ((err = skb_add_data(skb, from, copy)) != 0) 93: goto do_fault; 94: } else {/* no tail room left: copy into a page fragment instead */ 95: int merge = 0;/* set when the data is appended to the last existing fragment */ 96: int i = skb_shinfo(skb)->nr_frags;/* number of page fragments already in the skb */ 97: struct page *page = TCP_PAGE(sk);/* page cached on the socket, if any */ 98: int off = TCP_OFF(sk);/* offset within that page */ 99: 100: if (skb_can_coalesce(skb, i, page, off) && 101: off != PAGE_SIZE) {/* the last fragment can still take more data */ 102: /* We can extend the last page 103: * fragment. */ 104: merge = 1; 105: } else if (i == MAX_SKB_FRAGS ||/* cannot extend the last fragment; check whether another one can be added at all */ 106: (!i && 107: !(sk->sk_route_caps & NETIF_F_SG))) {/* no fragments yet and the device lacks scatter/gather */ 108: /* Need to add new fragment and cannot 109: * do this because interface is non-SG, 110: * or because all the page slots are 111: * busy. 
*/ 112: tcp_mark_push(tp, skb);/* this skb is complete: mark it for push */ 113: goto new_segment;/* and start a new skb */ 114: } else if (page) {/* another fragment is allowed; check whether the cached page still has room */ 115: /* If page is cached, align 116: * offset to L1 cache boundary 117: */ 118: off = (off + L1_CACHE_BYTES - 1) & 119: ~(L1_CACHE_BYTES - 1); 120: if (off == PAGE_SIZE) {/* the cached page is full: drop it so a new one gets allocated */ 121: put_page(page); 122: TCP_PAGE(sk) = page = NULL; 123: } 124: } 125: 126: if (!page) {/* a new page is needed */ 127: /* Allocate new cache page. */ 128: if (!(page = sk_stream_alloc_page(sk)))/* allocate it; wait for memory if that fails */ 129: goto wait_for_memory; 130: off = 0; 131: } 132: 133: if (copy > PAGE_SIZE - off)/* never copy more than the room left in the page */ 134: copy = PAGE_SIZE - off; 135: 136: /* Time to copy data. We are close to 137: * the end! */ 138: err = skb_copy_to_page(sk, from, skb, page, 139: off, copy);/* copy from user space into the page */ 140: if (err) {/* the copy failed */ 141: /* If this page was new, give it to the 142: * socket so it does not get leaked. 143: */ 144: if (!TCP_PAGE(sk)) {/* freshly allocated page: cache it on the socket (TCP_PAGE) for later use */ 145: TCP_PAGE(sk) = page; 146: TCP_OFF(sk) = 0; 147: } 148: goto do_error; 149: } 150: 151: /* Update the skb. 
*/ 152: /* Update the fragment bookkeeping of the skb */ 153: if (merge) {/* data was appended to the last fragment */ 154: skb_shinfo(skb)->frags[i - 1].size += 155: copy;/* grow that fragment's size */ 156: } else {/* data went into a new fragment */ 157: /* Describe the new fragment in the skb, then take a page reference: always when the socket already caches a page, otherwise only when the page still has room, in which case it becomes the socket's cached page */ 158: skb_fill_page_desc(skb, i, page, off, copy); 159: if (TCP_PAGE(sk)) { 160: get_page(page); 161: } else if (off + copy < PAGE_SIZE) { 162: get_page(page); 163: TCP_PAGE(sk) = page; 164: } 165: } 166: 167: /* Advance the cached in-page offset */ 168: TCP_OFF(sk) = off + copy; 169: } 170: 171: if (!copied)/* first bytes copied by this call: clear the PSH flag on this skb */ 172: TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; 173: 174: tp->write_seq += copy;/* advance the sequence number of the end of the write queue */ 175: TCP_SKB_CB(skb)->end_seq += copy;/* advance the end sequence number of this skb */ 176: skb_shinfo(skb)->tso_segs = 0; 177: 178: /* Advance the user pointer and the running total */ 179: from += copy; 180: copied += copy; 181: /* Everything has been copied: finish */ 182: if ((seglen -= copy) == 0 && iovlen == 0) 183: goto out; 184: 185: /* The skb does not hold exactly mss_now bytes yet, or this is an MSG_OOB send: skip the transmit step and keep copying */ 186: if (skb->len != mss_now || (flags & MSG_OOB)) 187: continue; 188: 189: if (forced_push(tp)) {/* a push is required now (per the article: data queued since the last push exceeds half the advertised window) */ 190: /* Mark the skb for push and send the pending frames */ 191: tcp_mark_push(tp, skb); 192: __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); 193: } else if (skb == sk->sk_send_head)/* no forced push, but this skb is the send head: send it on its own */ 194: tcp_push_one(sk, mss_now); 195: continue; 196: 197: wait_for_sndbuf: 198: /* Waiting because the send buffer is full */ 199: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 200: wait_for_memory: 201: if (copied)/* out of memory, but this call already queued data: push it out with tcp_push() first */ 202: tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); 203: 204: /* Wait for memory to become available */ 205: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) 206: goto do_error;/* the wait failed: take the error path */ 207: 208: /* The MSS may have changed while sleeping: recompute it */ 209: mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); 210: } 211: } 212: 213: out: 214: if (copied)/* data was copied from user space: push it */ 215: tcp_push(sk, tp, flags, mss_now, tp->nonagle); 216: TCP_CHECK_TIMER(sk); 217: release_sock(sk);/* drop the lock before returning */ 218: return copied; 219: 220: do_fault: 221: if (!skb->len) {/* 
the copy failed; an skb of length 0 was newly allocated by this call, so release it */ 222: if (sk->sk_send_head == skb)/* it was the send head: clear the send head */ 223: sk->sk_send_head = NULL; 224: __skb_unlink(skb, skb->list); 225: sk_stream_free_skb(sk, skb);/* free the skb */ 226: } 227: 228: /* Partial progress is reported as success; otherwise fall through to the error return */ do_error: 229: if (copied) 230: goto out; 231: out_err: 232: err = sk_stream_error(sk, flags, err); 233: TCP_CHECK_TIMER(sk); 234: release_sock(sk); 235: return err; 236: }
#+END_SRC
书上说到设置该skb的sk宿主时TCP使用sk_stream_set_owner_r(),而到内核kernel-2.6.32中,
TCP和UDP统一使用skb_set_owner_r().
标签:des style blog http color io os 使用 ar
原文地址:http://www.cnblogs.com/mosp/p/3982630.html