-
Notifications
You must be signed in to change notification settings - Fork 54.5k
/
tcp.h
2496 lines (2103 loc) · 75.8 KB
/
tcp.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* INET An implementation of the TCP/IP protocol suite for the LINUX
* operating system. INET is implemented using the BSD Socket
* interface as the means of communication with the user level.
*
* Definitions for the TCP module.
*
* Version: @(#)tcp.h 1.0.5 05/23/93
*
* Authors: Ross Biro
* Fred N. van Kempen, <[email protected]>
*/
#ifndef _TCP_H
#define _TCP_H
#define FASTRETRANS_DEBUG 1
#include <linux/list.h>
#include <linux/tcp.h>
#include <linux/bug.h>
#include <linux/slab.h>
#include <linux/cache.h>
#include <linux/percpu.h>
#include <linux/skbuff.h>
#include <linux/kref.h>
#include <linux/ktime.h>
#include <linux/indirect_call_wrapper.h>
#include <net/inet_connection_sock.h>
#include <net/inet_timewait_sock.h>
#include <net/inet_hashtables.h>
#include <net/checksum.h>
#include <net/request_sock.h>
#include <net/sock_reuseport.h>
#include <net/sock.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/tcp_states.h>
#include <net/inet_ecn.h>
#include <net/dst.h>
#include <net/mptcp.h>
#include <linux/seq_file.h>
#include <linux/memcontrol.h>
#include <linux/bpf-cgroup.h>
#include <linux/siphash.h>
extern struct inet_hashinfo tcp_hashinfo;
DECLARE_PER_CPU(unsigned int, tcp_orphan_count);
int tcp_orphan_count_sum(void);
void tcp_time_wait(struct sock *sk, int state, int timeo);
#define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER)
#define MAX_TCP_OPTION_SPACE 40
#define TCP_MIN_SND_MSS 48
#define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE)
/*
* Never offer a window over 32767 without using window scaling. Some
* poor stacks do signed 16bit maths!
*/
#define MAX_TCP_WINDOW 32767U
/* Minimal accepted MSS. It is (60+60+8) - (20+20). */
#define TCP_MIN_MSS 88U
/* The initial MTU to use for probing */
#define TCP_BASE_MSS 1024
/* probing interval, default to 10 minutes as per RFC4821 */
#define TCP_PROBE_INTERVAL 600
/* Specify interval when tcp mtu probing will stop */
#define TCP_PROBE_THRESHOLD 8
/* After receiving this amount of duplicate ACKs fast retransmit starts. */
#define TCP_FASTRETRANS_THRESH 3
/* Maximal number of ACKs sent quickly to accelerate slow-start. */
#define TCP_MAX_QUICKACKS 16U
/* Maximal number of window scale according to RFC1323 */
#define TCP_MAX_WSCALE 14U
/* urg_data states */
#define TCP_URG_VALID 0x0100
#define TCP_URG_NOTYET 0x0200
#define TCP_URG_READ 0x0400
#define TCP_RETR1 3 /*
* This is how many retries it does before it
* tries to figure out if the gateway is
* down. Minimal RFC value is 3; it corresponds
* to ~3sec-8min depending on RTO.
*/
#define TCP_RETR2 15 /*
* This should take at least
* 90 minutes to time out.
* RFC1122 says that the limit is 100 sec.
* 15 is ~13-30min depending on RTO.
*/
#define TCP_SYN_RETRIES 6 /* This is how many retries are done
* when active opening a connection.
* RFC1122 says the minimum retry MUST
* be at least 180secs. Nevertheless
* this value is corresponding to
* 63secs of retransmission with the
* current initial RTO.
*/
#define TCP_SYNACK_RETRIES 5 /* This is how may retries are done
* when passive opening a connection.
* This is corresponding to 31secs of
* retransmission with the current
* initial RTO.
*/
#define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT
* state, about 60 seconds */
#define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN
/* BSD style FIN_WAIT2 deadlock breaker.
* It used to be 3min, new value is 60sec,
* to combine FIN-WAIT-2 timeout with
* TIME-WAIT timer.
*/
#define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */
#define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */
#if HZ >= 100
#define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */
#define TCP_ATO_MIN ((unsigned)(HZ/25))
#else
#define TCP_DELACK_MIN 4U
#define TCP_ATO_MIN 4U
#endif
#define TCP_RTO_MAX ((unsigned)(120*HZ))
#define TCP_RTO_MIN ((unsigned)(HZ/5))
#define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */
#define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */
#define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now
* used as a fallback RTO for the
* initial data transmission if no
* valid RTT sample has been acquired,
* most likely due to retrans in 3WHS.
*/
#define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes
* for local resources.
*/
#define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */
#define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */
#define TCP_KEEPALIVE_INTVL (75*HZ)
#define MAX_TCP_KEEPIDLE 32767
#define MAX_TCP_KEEPINTVL 32767
#define MAX_TCP_KEEPCNT 127
#define MAX_TCP_SYNCNT 127
#define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */
#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)
#define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated
* after this time. It should be equal
* (or greater than) TCP_TIMEWAIT_LEN
* to provide reliability equal to one
* provided by timewait state.
*/
#define TCP_PAWS_WINDOW 1 /* Replay window for per-host
* timestamps. It must be less than
* minimal timewait lifetime.
*/
/*
* TCP option
*/
#define TCPOPT_NOP 1 /* Padding */
#define TCPOPT_EOL 0 /* End of options */
#define TCPOPT_MSS 2 /* Segment size negotiating */
#define TCPOPT_WINDOW 3 /* Window scaling */
#define TCPOPT_SACK_PERM 4 /* SACK Permitted */
#define TCPOPT_SACK 5 /* SACK Block */
#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */
#define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */
#define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */
#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */
#define TCPOPT_EXP 254 /* Experimental */
/* Magic number to be after the option value for sharing TCP
* experimental options. See draft-ietf-tcpm-experimental-options-00.txt
*/
#define TCPOPT_FASTOPEN_MAGIC 0xF989
#define TCPOPT_SMC_MAGIC 0xE2D4C3D9
/*
* TCP option lengths
*/
#define TCPOLEN_MSS 4
#define TCPOLEN_WINDOW 3
#define TCPOLEN_SACK_PERM 2
#define TCPOLEN_TIMESTAMP 10
#define TCPOLEN_MD5SIG 18
#define TCPOLEN_FASTOPEN_BASE 2
#define TCPOLEN_EXP_FASTOPEN_BASE 4
#define TCPOLEN_EXP_SMC_BASE 6
/* But this is what stacks really send out. */
#define TCPOLEN_TSTAMP_ALIGNED 12
#define TCPOLEN_WSCALE_ALIGNED 4
#define TCPOLEN_SACKPERM_ALIGNED 4
#define TCPOLEN_SACK_BASE 2
#define TCPOLEN_SACK_BASE_ALIGNED 4
#define TCPOLEN_SACK_PERBLOCK 8
#define TCPOLEN_MD5SIG_ALIGNED 20
#define TCPOLEN_MSS_ALIGNED 4
#define TCPOLEN_EXP_SMC_BASE_ALIGNED 8
/* Flags in tp->nonagle */
#define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */
#define TCP_NAGLE_CORK 2 /* Socket is corked */
#define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */
/* TCP thin-stream limits */
#define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */
/* TCP initial congestion window as per rfc6928 */
#define TCP_INIT_CWND 10
/* Bit Flags for sysctl_tcp_fastopen */
#define TFO_CLIENT_ENABLE 1
#define TFO_SERVER_ENABLE 2
#define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */
/* Accept SYN data w/o any cookie option */
#define TFO_SERVER_COOKIE_NOT_REQD 0x200
/* Force enable TFO on all listeners, i.e., not requiring the
* TCP_FASTOPEN socket option.
*/
#define TFO_SERVER_WO_SOCKOPT1 0x400
/* sysctl variables for tcp */
extern int sysctl_tcp_max_orphans;
extern long sysctl_tcp_mem[3];
#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
#define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */
extern atomic_long_t tcp_memory_allocated;
DECLARE_PER_CPU(int, tcp_memory_per_cpu_fw_alloc);
extern struct percpu_counter tcp_sockets_allocated;
extern unsigned long tcp_memory_pressure;
/* optimized version of sk_under_memory_pressure() for TCP sockets */
static inline bool tcp_under_memory_pressure(const struct sock *sk)
{
if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
mem_cgroup_under_socket_pressure(sk->sk_memcg))
return true;
return READ_ONCE(tcp_memory_pressure);
}
/*
* The next routines deal with comparing 32 bit unsigned ints
* and worry about wraparound (automatic with unsigned arithmetic).
*/
static inline bool before(__u32 seq1, __u32 seq2)
{
return (__s32)(seq1-seq2) < 0;
}
#define after(seq2, seq1) before(seq1, seq2)
/* is s2<=s1<=s3 ? */
static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3)
{
return seq3 - seq2 >= seq1 - seq2;
}
static inline bool tcp_out_of_memory(struct sock *sk)
{
if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2))
return true;
return false;
}
static inline void tcp_wmem_free_skb(struct sock *sk, struct sk_buff *skb)
{
sk_wmem_queued_add(sk, -skb->truesize);
if (!skb_zcopy_pure(skb))
sk_mem_uncharge(sk, skb->truesize);
else
sk_mem_uncharge(sk, SKB_TRUESIZE(skb_end_offset(skb)));
__kfree_skb(skb);
}
void sk_forced_mem_schedule(struct sock *sk, int size);
bool tcp_check_oom(struct sock *sk, int shift);
extern struct proto tcp_prot;
#define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field)
#define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field)
#define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val)
void tcp_tasklet_init(void);
int tcp_v4_err(struct sk_buff *skb, u32);
void tcp_shutdown(struct sock *sk, int how);
int tcp_v4_early_demux(struct sk_buff *skb);
int tcp_v4_rcv(struct sk_buff *skb);
void tcp_remove_empty_skb(struct sock *sk);
int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw);
int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size);
int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
size_t size, struct ubuf_info *uarg);
int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size,
int flags);
int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
size_t size, int flags);
int tcp_send_mss(struct sock *sk, int *size_goal, int flags);
void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle,
int size_goal);
void tcp_release_cb(struct sock *sk);
void tcp_wfree(struct sk_buff *skb);
void tcp_write_timer_handler(struct sock *sk);
void tcp_delack_timer_handler(struct sock *sk);
int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg);
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
void tcp_twsk_purge(struct list_head *net_exit_list, int family);
ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags);
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp,
bool force_schedule);
void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks);
static inline void tcp_dec_quickack_mode(struct sock *sk,
const unsigned int pkts)
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (icsk->icsk_ack.quick) {
if (pkts >= icsk->icsk_ack.quick) {
icsk->icsk_ack.quick = 0;
/* Leaving quickack mode we deflate ATO. */
icsk->icsk_ack.ato = TCP_ATO_MIN;
} else
icsk->icsk_ack.quick -= pkts;
}
}
#define TCP_ECN_OK 1
#define TCP_ECN_QUEUE_CWR 2
#define TCP_ECN_DEMAND_CWR 4
#define TCP_ECN_SEEN 8
enum tcp_tw_status {
TCP_TW_SUCCESS = 0,
TCP_TW_RST = 1,
TCP_TW_ACK = 2,
TCP_TW_SYN = 3
};
enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw,
struct sk_buff *skb,
const struct tcphdr *th);
struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
struct request_sock *req, bool fastopen,
bool *lost_race);
int tcp_child_process(struct sock *parent, struct sock *child,
struct sk_buff *skb);
void tcp_enter_loss(struct sock *sk);
void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int newly_lost, int flag);
void tcp_clear_retrans(struct tcp_sock *tp);
void tcp_update_metrics(struct sock *sk);
void tcp_init_metrics(struct sock *sk);
void tcp_metrics_init(void);
bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst);
void __tcp_close(struct sock *sk, long timeout);
void tcp_close(struct sock *sk, long timeout);
void tcp_init_sock(struct sock *sk);
void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb);
__poll_t tcp_poll(struct file *file, struct socket *sock,
struct poll_table_struct *wait);
int do_tcp_getsockopt(struct sock *sk, int level,
int optname, sockptr_t optval, sockptr_t optlen);
int tcp_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen);
bool tcp_bpf_bypass_getsockopt(int level, int optname);
int do_tcp_setsockopt(struct sock *sk, int level, int optname,
sockptr_t optval, unsigned int optlen);
int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval,
unsigned int optlen);
void tcp_set_keepalive(struct sock *sk, int val);
void tcp_syn_ack_timeout(const struct request_sock *req);
int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int flags, int *addr_len);
int tcp_set_rcvlowat(struct sock *sk, int val);
int tcp_set_window_clamp(struct sock *sk, int val);
void tcp_update_recv_tstamps(struct sk_buff *skb,
struct scm_timestamping_internal *tss);
void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk,
struct scm_timestamping_internal *tss);
void tcp_data_ready(struct sock *sk);
#ifdef CONFIG_MMU
int tcp_mmap(struct file *file, struct socket *sock,
struct vm_area_struct *vma);
#endif
void tcp_parse_options(const struct net *net, const struct sk_buff *skb,
struct tcp_options_received *opt_rx,
int estab, struct tcp_fastopen_cookie *foc);
const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
/*
* BPF SKB-less helpers
*/
u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph,
struct tcphdr *th, u32 *cookie);
u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph,
struct tcphdr *th, u32 *cookie);
u16 tcp_parse_mss_option(const struct tcphdr *th, u16 user_mss);
u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct tcphdr *th);
/*
* TCP v4 functions exported for the inet6 API
*/
void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb);
void tcp_v4_mtu_reduced(struct sock *sk);
void tcp_req_err(struct sock *sk, u32 seq, bool abort);
void tcp_ld_RTO_revert(struct sock *sk, u32 seq);
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb);
struct sock *tcp_create_openreq_child(const struct sock *sk,
struct request_sock *req,
struct sk_buff *skb);
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst);
struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst,
struct request_sock *req_unhash,
bool *own_req);
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb);
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int tcp_connect(struct sock *sk);
enum tcp_synack_type {
TCP_SYNACK_NORMAL,
TCP_SYNACK_FASTOPEN,
TCP_SYNACK_COOKIE,
};
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req,
struct tcp_fastopen_cookie *foc,
enum tcp_synack_type synack_type,
struct sk_buff *syn_skb);
int tcp_disconnect(struct sock *sk, int flags);
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size);
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb);
/* From syncookies.c */
struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst, u32 tsoff);
int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th,
u32 cookie);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb);
struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
const struct tcp_request_sock_ops *af_ops,
struct sock *sk, struct sk_buff *skb);
#ifdef CONFIG_SYN_COOKIES
/* Syncookies use a monotonic timer which increments every 60 seconds.
* This counter is used both as a hash input and partially encoded into
* the cookie value. A cookie is only validated further if the delta
* between the current counter value and the encoded one is less than this,
* i.e. a sent cookie is valid only at most for 2*60 seconds (or less if
* the counter advances immediately after a cookie is generated).
*/
#define MAX_SYNCOOKIE_AGE 2
#define TCP_SYNCOOKIE_PERIOD (60 * HZ)
#define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD)
/* syncookies: remember time of last synqueue overflow
* But do not dirty this field too often (once per second is enough)
* It is racy as we do not hold a lock, but race is very minor.
*/
static inline void tcp_synq_overflow(const struct sock *sk)
{
unsigned int last_overflow;
unsigned int now = jiffies;
if (sk->sk_reuseport) {
struct sock_reuseport *reuse;
reuse = rcu_dereference(sk->sk_reuseport_cb);
if (likely(reuse)) {
last_overflow = READ_ONCE(reuse->synq_overflow_ts);
if (!time_between32(now, last_overflow,
last_overflow + HZ))
WRITE_ONCE(reuse->synq_overflow_ts, now);
return;
}
}
last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
if (!time_between32(now, last_overflow, last_overflow + HZ))
WRITE_ONCE(tcp_sk_rw(sk)->rx_opt.ts_recent_stamp, now);
}
/* syncookies: no recent synqueue overflow on this listening socket? */
static inline bool tcp_synq_no_recent_overflow(const struct sock *sk)
{
unsigned int last_overflow;
unsigned int now = jiffies;
if (sk->sk_reuseport) {
struct sock_reuseport *reuse;
reuse = rcu_dereference(sk->sk_reuseport_cb);
if (likely(reuse)) {
last_overflow = READ_ONCE(reuse->synq_overflow_ts);
return !time_between32(now, last_overflow - HZ,
last_overflow +
TCP_SYNCOOKIE_VALID);
}
}
last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp);
/* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID,
* then we're under synflood. However, we have to use
* 'last_overflow - HZ' as lower bound. That's because a concurrent
* tcp_synq_overflow() could update .ts_recent_stamp after we read
* jiffies but before we store .ts_recent_stamp into last_overflow,
* which could lead to rejecting a valid syncookie.
*/
return !time_between32(now, last_overflow - HZ,
last_overflow + TCP_SYNCOOKIE_VALID);
}
static inline u32 tcp_cookie_time(void)
{
u64 val = get_jiffies_64();
do_div(val, TCP_SYNCOOKIE_PERIOD);
return val;
}
u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
u16 *mssp);
__u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss);
u64 cookie_init_timestamp(struct request_sock *req, u64 now);
bool cookie_timestamp_decode(const struct net *net,
struct tcp_options_received *opt);
bool cookie_ecn_ok(const struct tcp_options_received *opt,
const struct net *net, const struct dst_entry *dst);
/* From net/ipv6/syncookies.c */
int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
u32 cookie);
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
const struct tcphdr *th, u16 *mssp);
__u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
#endif
/* tcp_output.c */
void tcp_skb_entail(struct sock *sk, struct sk_buff *skb);
void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb);
void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
int nonagle);
int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
void tcp_retransmit_timer(struct sock *sk);
void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
enum tcp_queue {
TCP_FRAG_IN_WRITE_QUEUE,
TCP_FRAG_IN_RTX_QUEUE,
};
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
struct sk_buff *skb, u32 len,
unsigned int mss_now, gfp_t gfp);
void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
int tcp_write_wakeup(struct sock *, int mib);
void tcp_send_fin(struct sock *sk);
void tcp_send_active_reset(struct sock *sk, gfp_t priority);
int tcp_send_synack(struct sock *);
void tcp_push_one(struct sock *, unsigned int mss_now);
void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
void tcp_send_ack(struct sock *sk);
void tcp_send_delayed_ack(struct sock *sk);
void tcp_send_loss_probe(struct sock *sk);
bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto);
void tcp_skb_collapse_tstamp(struct sk_buff *skb,
const struct sk_buff *next_skb);
/* tcp_input.c */
void tcp_rearm_rto(struct sock *sk);
void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req);
void tcp_reset(struct sock *sk, struct sk_buff *skb);
void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb);
void tcp_fin(struct sock *sk);
void tcp_check_space(struct sock *sk);
void tcp_sack_compress_send_ack(struct sock *sk);
/* tcp_timer.c */
void tcp_init_xmit_timers(struct sock *);
static inline void tcp_clear_xmit_timers(struct sock *sk)
{
if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1)
__sock_put(sk);
if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1)
__sock_put(sk);
inet_csk_clear_xmit_timers(sk);
}
unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu);
unsigned int tcp_current_mss(struct sock *sk);
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when);
/* Bound MSS / TSO packet size with the half of the window */
static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
{
int cutoff;
/* When peer uses tiny windows, there is no use in packetizing
* to sub-MSS pieces for the sake of SWS or making sure there
* are enough packets in the pipe for fast recovery.
*
* On the other hand, for extremely large MSS devices, handling
* smaller than MSS windows in this way does make sense.
*/
if (tp->max_window > TCP_MSS_DEFAULT)
cutoff = (tp->max_window >> 1);
else
cutoff = tp->max_window;
if (cutoff && pktsize > cutoff)
return max_t(int, cutoff, 68U - tp->tcp_header_len);
else
return pktsize;
}
/* tcp.c */
void tcp_get_info(struct sock *, struct tcp_info *);
/* Read 'sendfile()'-style from a TCP socket */
int tcp_read_sock(struct sock *sk, read_descriptor_t *desc,
sk_read_actor_t recv_actor);
int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off);
void tcp_read_done(struct sock *sk, size_t len);
void tcp_initialize_rcv_mss(struct sock *sk);
int tcp_mtu_to_mss(struct sock *sk, int pmtu);
int tcp_mss_to_mtu(struct sock *sk, int mss);
void tcp_mtup_init(struct sock *sk);
static inline void tcp_bound_rto(const struct sock *sk)
{
if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX)
inet_csk(sk)->icsk_rto = TCP_RTO_MAX;
}
static inline u32 __tcp_set_rto(const struct tcp_sock *tp)
{
return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us);
}
static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd)
{
/* mptcp hooks are only on the slow path */
if (sk_is_mptcp((struct sock *)tp))
return;
tp->pred_flags = htonl((tp->tcp_header_len << 26) |
ntohl(TCP_FLAG_ACK) |
snd_wnd);
}
static inline void tcp_fast_path_on(struct tcp_sock *tp)
{
__tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale);
}
static inline void tcp_fast_path_check(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if (RB_EMPTY_ROOT(&tp->out_of_order_queue) &&
tp->rcv_wnd &&
atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf &&
!tp->urg_data)
tcp_fast_path_on(tp);
}
/* Compute the actual rto_min value */
static inline u32 tcp_rto_min(struct sock *sk)
{
const struct dst_entry *dst = __sk_dst_get(sk);
u32 rto_min = inet_csk(sk)->icsk_rto_min;
if (dst && dst_metric_locked(dst, RTAX_RTO_MIN))
rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN);
return rto_min;
}
static inline u32 tcp_rto_min_us(struct sock *sk)
{
return jiffies_to_usecs(tcp_rto_min(sk));
}
static inline bool tcp_ca_dst_locked(const struct dst_entry *dst)
{
return dst_metric_locked(dst, RTAX_CC_ALGO);
}
/* Minimum RTT in usec. ~0 means not available. */
static inline u32 tcp_min_rtt(const struct tcp_sock *tp)
{
return minmax_get(&tp->rtt_min);
}
/* Compute the actual receive window we are currently advertising.
* Rcv_nxt can be after the window if our peer push more data
* than the offered window.
*/
static inline u32 tcp_receive_window(const struct tcp_sock *tp)
{
s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt;
if (win < 0)
win = 0;
return (u32) win;
}
/* Choose a new window, without checks for shrinking, and without
* scaling applied to the result. The caller does these things
* if necessary. This is a "raw" window selection.
*/
u32 __tcp_select_window(struct sock *sk);
void tcp_send_window_probe(struct sock *sk);
/* TCP uses 32bit jiffies to save some space.
* Note that this is different from tcp_time_stamp, which
* historically has been the same until linux-4.13.
*/
#define tcp_jiffies32 ((u32)jiffies)
/*
* Deliver a 32bit value for TCP timestamp option (RFC 7323)
* It is no longer tied to jiffies, but to 1 ms clock.
* Note: double check if you want to use tcp_jiffies32 instead of this.
*/
#define TCP_TS_HZ 1000
static inline u64 tcp_clock_ns(void)
{
return ktime_get_ns();
}
static inline u64 tcp_clock_us(void)
{
return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
}
/* This should only be used in contexts where tp->tcp_mstamp is up to date */
static inline u32 tcp_time_stamp(const struct tcp_sock *tp)
{
return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ);
}
/* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */
static inline u32 tcp_ns_to_ts(u64 ns)
{
return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ);
}
/* Could use tcp_clock_us() / 1000, but this version uses a single divide */
static inline u32 tcp_time_stamp_raw(void)
{
return tcp_ns_to_ts(tcp_clock_ns());
}
void tcp_mstamp_refresh(struct tcp_sock *tp);
static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0)
{
return max_t(s64, t1 - t0, 0);
}
static inline u32 tcp_skb_timestamp(const struct sk_buff *skb)
{
return tcp_ns_to_ts(skb->skb_mstamp_ns);
}
/* provide the departure time in us unit */
static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb)
{
return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC);
}
#define tcp_flag_byte(th) (((u_int8_t *)th)[13])
#define TCPHDR_FIN 0x01
#define TCPHDR_SYN 0x02
#define TCPHDR_RST 0x04
#define TCPHDR_PSH 0x08
#define TCPHDR_ACK 0x10
#define TCPHDR_URG 0x20
#define TCPHDR_ECE 0x40
#define TCPHDR_CWR 0x80
#define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR)
/* This is what the send packet queuing engine uses to pass
* TCP per-packet control information to the transmission code.
* We also store the host-order sequence numbers in here too.
* This is 44 bytes if IPV6 is enabled.
* If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately.
*/
struct tcp_skb_cb {
__u32 seq; /* Starting sequence number */
__u32 end_seq; /* SEQ + FIN + SYN + datalen */
union {
/* Note : tcp_tw_isn is used in input path only
* (isn chosen by tcp_timewait_state_process())
*
* tcp_gso_segs/size are used in write queue only,
* cf tcp_skb_pcount()/tcp_skb_mss()
*/
__u32 tcp_tw_isn;
struct {
u16 tcp_gso_segs;
u16 tcp_gso_size;
};
};
__u8 tcp_flags; /* TCP header flags. (tcp[13]) */
__u8 sacked; /* State flags for SACK. */
#define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */
#define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */
#define TCPCB_LOST 0x04 /* SKB is lost */
#define TCPCB_TAGBITS 0x07 /* All tag bits */
#define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */
#define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */
#define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \
TCPCB_REPAIRED)
__u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */
__u8 txstamp_ack:1, /* Record TX timestamp for ack? */
eor:1, /* Is skb MSG_EOR marked? */
has_rxtstamp:1, /* SKB has a RX timestamp */
unused:5;
__u32 ack_seq; /* Sequence number ACK'd */
union {
struct {
#define TCPCB_DELIVERED_CE_MASK ((1U<<20) - 1)
/* There is space for up to 24 bytes */
__u32 is_app_limited:1, /* cwnd not fully used? */
delivered_ce:20,
unused:11;
/* pkts S/ACKed so far upon tx of skb, incl retrans: */
__u32 delivered;
/* start of send pipeline phase */
u64 first_tx_mstamp;
/* when we reached the "delivered" count */
u64 delivered_mstamp;
} tx; /* only used for outgoing skbs */
union {
struct inet_skb_parm h4;
#if IS_ENABLED(CONFIG_IPV6)
struct inet6_skb_parm h6;
#endif
} header; /* For incoming skbs */
};
};
#define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0]))
extern const struct inet_connection_sock_af_ops ipv4_specific;
#if IS_ENABLED(CONFIG_IPV6)
/* This is the variant of inet6_iif() that must be used by TCP,
* as TCP moves IP6CB into a different location in skb->cb[]
*/
static inline int tcp_v6_iif(const struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->header.h6.iif;
}
static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
{
bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
}
/* TCP_SKB_CB reference means this can not be used from early demux */
static inline int tcp_v6_sdif(const struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags))
return TCP_SKB_CB(skb)->header.h6.iif;
#endif
return 0;
}
extern const struct inet_connection_sock_af_ops ipv6_specific;
INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb));
INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb));
void tcp_v6_early_demux(struct sk_buff *skb);
#endif
/* TCP_SKB_CB reference means this can not be used from early demux */
static inline int tcp_v4_sdif(struct sk_buff *skb)
{
#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags))
return TCP_SKB_CB(skb)->header.h4.iif;
#endif
return 0;
}
/* Due to TSO, an SKB can be composed of multiple actual
* packets. To keep these tracked properly, we use this.
*/
static inline int tcp_skb_pcount(const struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->tcp_gso_segs;
}
static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs)
{
TCP_SKB_CB(skb)->tcp_gso_segs = segs;
}
static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs)
{
TCP_SKB_CB(skb)->tcp_gso_segs += segs;
}
/* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */
static inline int tcp_skb_mss(const struct sk_buff *skb)
{
return TCP_SKB_CB(skb)->tcp_gso_size;
}
static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb)
{
return likely(!TCP_SKB_CB(skb)->eor);
}
static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
const struct sk_buff *from)
{
return likely(tcp_skb_can_collapse_to(to) &&
mptcp_skb_can_collapse(to, from) &&
skb_pure_zcopy_same(to, from));
}
/* Events passed to congestion control interface */