use std::{
cmp,
collections::VecDeque,
convert::TryFrom,
fmt, io, mem,
net::{IpAddr, SocketAddr},
sync::Arc,
time::{Duration, Instant},
};
use bytes::{Bytes, BytesMut};
use frame::StreamMetaVec;
use rand::{rngs::StdRng, Rng, SeedableRng};
use thiserror::Error;
use tracing::{debug, error, trace, trace_span, warn};
use crate::{
cid_generator::ConnectionIdGenerator,
cid_queue::CidQueue,
coding::BufMutExt,
config::{ServerConfig, TransportConfig},
crypto::{self, KeyPair, Keys, PacketKey},
frame,
frame::{Close, Datagram, FrameStruct},
packet::{
FixedLengthConnectionIdParser, Header, InitialHeader, InitialPacket, LongType, Packet,
PacketNumber, PartialDecode, SpaceId,
},
range_set::ArrayRangeSet,
shared::{
ConnectionEvent, ConnectionEventInner, ConnectionId, DatagramConnectionEvent, EcnCodepoint,
EndpointEvent, EndpointEventInner,
},
token::ResetToken,
transport_parameters::TransportParameters,
Dir, EndpointConfig, Frame, Side, StreamId, Transmit, TransportError, TransportErrorCode,
VarInt, MAX_STREAM_COUNT, MIN_INITIAL_SIZE, TIMER_GRANULARITY,
};
mod ack_frequency;
use ack_frequency::AckFrequencyState;
mod assembler;
pub use assembler::Chunk;
mod cid_state;
use cid_state::CidState;
mod datagrams;
use datagrams::DatagramState;
pub use datagrams::{Datagrams, SendDatagramError};
mod mtud;
mod pacing;
mod packet_builder;
use packet_builder::PacketBuilder;
mod packet_crypto;
use packet_crypto::{PrevCrypto, ZeroRttCrypto};
mod paths;
pub use paths::RttEstimator;
use paths::{PathData, PathResponses};
mod send_buffer;
mod spaces;
#[cfg(fuzzing)]
pub use spaces::Retransmits;
#[cfg(not(fuzzing))]
use spaces::Retransmits;
use spaces::{PacketNumberFilter, PacketSpace, SendableFrames, SentPacket, ThinRetransmits};
mod stats;
pub use stats::{ConnectionStats, FrameStats, PathStats, UdpStats};
mod streams;
#[cfg(fuzzing)]
pub use streams::StreamsState;
#[cfg(not(fuzzing))]
use streams::StreamsState;
pub use streams::{
BytesSource, Chunks, ClosedStream, FinishError, ReadError, ReadableError, RecvStream,
SendStream, ShouldTransmit, StreamEvent, Streams, WriteError, Written,
};
mod timer;
use crate::congestion::Controller;
use timer::{Timer, TimerTable};
/// Protocol state and logic for a single QUIC connection
///
/// Objects of this type receive [`ConnectionEvent`]s and emit [`EndpointEvent`]s and application
/// [`Event`]s to make progress. To handle timeouts, a `Connection` reports its next timeout
/// via [`poll_timeout`](Self::poll_timeout) and expects expirations to be delivered through
/// [`handle_timeout`](Self::handle_timeout). A number of simple getter methods are exposed
/// to allow callers to inspect some of the connection state.
///
/// `Connection` has roughly 4 types of methods:
///
/// - A. Simple getters, taking `&self`
/// - B. Handlers for incoming events from the network or system, named `handle_*`.
/// - C. State machine mutators, for incoming commands from the application. For convenience we
/// refer to these as "performing I/O" below; however, by design none of these functions
/// actually performs system-level I/O. Examples include [`read`](RecvStream::read) and
/// [`write`](SendStream::write), but also things like [`reset`](SendStream::reset).
/// - D. Polling functions for outgoing events or actions for the caller to
/// take, named `poll_*`.
///
/// The simplest way to use this API correctly is to call (B) and (C) whenever
/// appropriate, then after each of those calls, as soon as feasible call all
/// polling methods (D) and deal with their outputs appropriately, e.g. by
/// passing them to the application or by making a system-level I/O call. You
/// should call the polling functions in this order:
///
/// 1. [`poll_transmit`](Self::poll_transmit)
/// 2. [`poll_timeout`](Self::poll_timeout)
/// 3. [`poll_endpoint_events`](Self::poll_endpoint_events)
/// 4. [`poll`](Self::poll)
///
/// Currently the only actual dependency is from (2) to (1); however, additional
/// dependencies may be added in the future, so the above order is recommended.
///
/// (A) may be called whenever desired.
///
/// Care should be taken to ensure that the input events represent monotonically
/// increasing time. Specifically, calling [`handle_timeout`](Self::handle_timeout)
/// with events of the same [`Instant`] may be interleaved in any order with a
/// call to [`handle_event`](Self::handle_event) at that same instant; however
/// events or timeouts with different instants must not be interleaved.
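///
/// A minimal sketch of such a driving loop, assuming a caller-supplied event source and
/// UDP socket; `next_event`, `send_udp`, `arm_timer`, `forward_to_endpoint`,
/// `handle_app_event`, and `max_datagrams` are hypothetical placeholders for that plumbing:
///
/// ```ignore
/// let mut buf = Vec::new();
/// loop {
///     // (B)/(C): feed in network events and perform any application I/O
///     while let Some(event) = next_event() {
///         conn.handle_event(event);
///     }
///     // (D) 1. flush any outgoing datagrams
///     while let Some(transmit) = conn.poll_transmit(Instant::now(), max_datagrams, &mut buf) {
///         send_udp(&buf[..transmit.size], &transmit);
///         buf.clear();
///     }
///     // (D) 2. rearm the timer for the next timeout, if any
///     if let Some(deadline) = conn.poll_timeout() {
///         arm_timer(deadline);
///     }
///     // (D) 3. drain events destined for the endpoint
///     while let Some(event) = conn.poll_endpoint_events() {
///         forward_to_endpoint(event);
///     }
///     // (D) 4. drain application-facing events
///     while let Some(event) = conn.poll() {
///         handle_app_event(event);
///     }
/// }
/// ```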
pub struct Connection {
endpoint_config: Arc<EndpointConfig>,
server_config: Option<Arc<ServerConfig>>,
config: Arc<TransportConfig>,
rng: StdRng,
crypto: Box<dyn crypto::Session>,
/// The CID we initially chose, for use during the handshake
handshake_cid: ConnectionId,
/// The CID the peer initially chose, for use during the handshake
rem_handshake_cid: ConnectionId,
/// The "real" local IP address which was was used to receive the initial packet.
/// This is only populated for the server case, and if known
local_ip: Option<IpAddr>,
path: PathData,
/// Whether MTU detection is supported in this environment
allow_mtud: bool,
prev_path: Option<(ConnectionId, PathData)>,
state: State,
side: Side,
/// Whether or not 0-RTT was enabled during the handshake. Does not imply acceptance.
zero_rtt_enabled: bool,
/// Set if 0-RTT is supported, then cleared when no longer needed.
zero_rtt_crypto: Option<ZeroRttCrypto>,
key_phase: bool,
/// How many packets are in the current key phase. Used only for `Data` space.
key_phase_size: u64,
/// Transport parameters set by the peer
peer_params: TransportParameters,
/// Source ConnectionId of the first packet received from the peer
orig_rem_cid: ConnectionId,
/// Destination ConnectionId sent by the client on the first Initial
initial_dst_cid: ConnectionId,
/// The value that the server included in the Source Connection ID field of a Retry packet, if
/// one was received
retry_src_cid: Option<ConnectionId>,
/// Total number of outgoing packets that have been deemed lost
lost_packets: u64,
events: VecDeque<Event>,
endpoint_events: VecDeque<EndpointEventInner>,
/// Whether the spin bit is in use for this connection
spin_enabled: bool,
/// Outgoing spin bit state
spin: bool,
/// Packet number spaces: initial, handshake, 1-RTT
spaces: [PacketSpace; 3],
/// Highest usable packet number space
highest_space: SpaceId,
/// 1-RTT keys used prior to a key update
prev_crypto: Option<PrevCrypto>,
/// 1-RTT keys to be used for the next key update
///
/// These are generated in advance to prevent timing attacks and/or DoS by third-party attackers
/// spoofing key updates.
next_crypto: Option<KeyPair<Box<dyn PacketKey>>>,
accepted_0rtt: bool,
/// Whether the idle timer should be reset the next time an ack-eliciting packet is transmitted.
permit_idle_reset: bool,
/// Negotiated idle timeout
idle_timeout: Option<VarInt>,
timers: TimerTable,
/// Number of packets received which could not be authenticated
authentication_failures: u64,
/// Why the connection was lost, if it has been
error: Option<ConnectionError>,
/// Sent in every outgoing Initial packet. Always empty for servers and after Initial keys are
/// discarded.
retry_token: Bytes,
/// Identifies Data-space packet numbers to skip. Not used in earlier spaces.
packet_number_filter: PacketNumberFilter,
//
// Queued non-retransmittable 1-RTT data
//
/// Responses to PATH_CHALLENGE frames
path_responses: PathResponses,
close: bool,
//
// ACK frequency
//
ack_frequency: AckFrequencyState,
//
// Loss Detection
//
/// The number of times a PTO has been sent without receiving an ack.
pto_count: u32,
//
// Congestion Control
//
/// Whether the most recently received packet had an ECN codepoint set
receiving_ecn: bool,
/// Number of packets authenticated
total_authed_packets: u64,
/// Whether the last `poll_transmit` call yielded no data because there was
/// no outgoing application data.
app_limited: bool,
streams: StreamsState,
/// Surplus remote CIDs for future use on new paths
rem_cids: CidQueue,
/// Attributes of CIDs generated by the local peer
local_cid_state: CidState,
/// State of the unreliable datagram extension
datagrams: DatagramState,
/// Connection level statistics
stats: ConnectionStats,
/// QUIC version used for the connection.
version: u32,
}
impl Connection {
pub(crate) fn new(
endpoint_config: Arc<EndpointConfig>,
server_config: Option<Arc<ServerConfig>>,
config: Arc<TransportConfig>,
init_cid: ConnectionId,
loc_cid: ConnectionId,
rem_cid: ConnectionId,
pref_addr_cid: Option<ConnectionId>,
remote: SocketAddr,
local_ip: Option<IpAddr>,
crypto: Box<dyn crypto::Session>,
cid_gen: &dyn ConnectionIdGenerator,
now: Instant,
version: u32,
allow_mtud: bool,
rng_seed: [u8; 32],
path_validated: bool,
) -> Self {
let side = if server_config.is_some() {
Side::Server
} else {
Side::Client
};
let initial_space = PacketSpace {
crypto: Some(crypto.initial_keys(&init_cid, side)),
..PacketSpace::new(now)
};
let state = State::Handshake(state::Handshake {
rem_cid_set: side.is_server(),
expected_token: Bytes::new(),
client_hello: None,
});
let mut rng = StdRng::from_seed(rng_seed);
let mut this = Self {
endpoint_config,
server_config,
crypto,
handshake_cid: loc_cid,
rem_handshake_cid: rem_cid,
local_cid_state: CidState::new(
cid_gen.cid_len(),
cid_gen.cid_lifetime(),
now,
if pref_addr_cid.is_some() { 2 } else { 1 },
),
path: PathData::new(remote, allow_mtud, None, now, path_validated, &config),
allow_mtud,
local_ip,
prev_path: None,
side,
state,
zero_rtt_enabled: false,
zero_rtt_crypto: None,
key_phase: false,
// A small initial key phase size ensures peers that don't handle key updates correctly
// fail sooner rather than later. It's okay for both peers to do this, as the first one
// to perform an update will reset the other's key phase size in `update_keys`, and a
// simultaneous key update by both is just like a regular key update with a really fast
// response. Inspired by quic-go's similar behavior of performing the first key update
// at the 100th short-header packet.
key_phase_size: rng.gen_range(10..1000),
peer_params: TransportParameters::default(),
orig_rem_cid: rem_cid,
initial_dst_cid: init_cid,
retry_src_cid: None,
lost_packets: 0,
events: VecDeque::new(),
endpoint_events: VecDeque::new(),
spin_enabled: config.allow_spin && rng.gen_ratio(7, 8),
spin: false,
spaces: [initial_space, PacketSpace::new(now), PacketSpace::new(now)],
highest_space: SpaceId::Initial,
prev_crypto: None,
next_crypto: None,
accepted_0rtt: false,
permit_idle_reset: true,
idle_timeout: config.max_idle_timeout,
timers: TimerTable::default(),
authentication_failures: 0,
error: None,
retry_token: Bytes::new(),
#[cfg(test)]
packet_number_filter: match config.deterministic_packet_numbers {
false => PacketNumberFilter::new(&mut rng),
true => PacketNumberFilter::disabled(),
},
#[cfg(not(test))]
packet_number_filter: PacketNumberFilter::new(&mut rng),
path_responses: PathResponses::default(),
close: false,
ack_frequency: AckFrequencyState::new(get_max_ack_delay(
&TransportParameters::default(),
)),
pto_count: 0,
app_limited: false,
receiving_ecn: false,
total_authed_packets: 0,
streams: StreamsState::new(
side,
config.max_concurrent_uni_streams,
config.max_concurrent_bidi_streams,
config.send_window,
config.receive_window,
config.stream_receive_window,
),
datagrams: DatagramState::default(),
config,
rem_cids: CidQueue::new(rem_cid),
rng,
stats: ConnectionStats::default(),
version,
};
if side.is_client() {
// Kick off the connection
this.write_crypto();
this.init_0rtt();
}
this
}
/// Returns the next time at which `handle_timeout` should be called
///
/// The value returned may change after:
/// - the application performed some I/O on the connection
/// - a call was made to `handle_event`
/// - a call to `poll_transmit` returned `Some`
/// - a call was made to `handle_timeout`
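///
/// A sketch of acting on the returned value, where `sleep_until` stands in for whatever
/// timer facility the caller uses:
///
/// ```ignore
/// if let Some(deadline) = conn.poll_timeout() {
///     sleep_until(deadline);
///     conn.handle_timeout(Instant::now());
/// }
/// ```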
#[must_use]
pub fn poll_timeout(&mut self) -> Option<Instant> {
self.timers.next_timeout()
}
/// Returns application-facing events
///
/// Connections should be polled for events after:
/// - a call was made to `handle_event`
/// - a call was made to `handle_timeout`
#[must_use]
pub fn poll(&mut self) -> Option<Event> {
if let Some(x) = self.events.pop_front() {
return Some(x);
}
if let Some(event) = self.streams.poll() {
return Some(Event::Stream(event));
}
if let Some(err) = self.error.take() {
return Some(Event::ConnectionLost { reason: err });
}
None
}
/// Return endpoint-facing events
#[must_use]
pub fn poll_endpoint_events(&mut self) -> Option<EndpointEvent> {
self.endpoint_events.pop_front().map(EndpointEvent)
}
/// Provide control over streams
#[must_use]
pub fn streams(&mut self) -> Streams<'_> {
Streams {
state: &mut self.streams,
conn_state: &self.state,
}
}
/// Provide control over streams
#[must_use]
pub fn recv_stream(&mut self, id: StreamId) -> RecvStream<'_> {
assert!(id.dir() == Dir::Bi || id.initiator() != self.side);
RecvStream {
id,
state: &mut self.streams,
pending: &mut self.spaces[SpaceId::Data].pending,
}
}
/// Provide control over streams
#[must_use]
pub fn send_stream(&mut self, id: StreamId) -> SendStream<'_> {
assert!(id.dir() == Dir::Bi || id.initiator() == self.side);
SendStream {
id,
state: &mut self.streams,
pending: &mut self.spaces[SpaceId::Data].pending,
conn_state: &self.state,
}
}
/// Returns packets to transmit
///
/// Connections should be polled for transmit after:
/// - the application performed some I/O on the connection
/// - a call was made to `handle_event`
/// - a call was made to `handle_timeout`
///
/// `max_datagrams` specifies how many datagrams can be returned inside a
/// single Transmit using GSO. This must be at least 1.
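///
/// A sketch of consuming the result, where `send_udp` is a hypothetical helper that honors
/// `Transmit::segment_size` when the platform supports GSO:
///
/// ```ignore
/// let mut buf = Vec::new();
/// while let Some(transmit) = conn.poll_transmit(Instant::now(), max_datagrams, &mut buf) {
///     // `buf[..transmit.size]` holds one datagram, or several contiguous
///     // segments of `transmit.segment_size` bytes each when GSO is in use
///     send_udp(&buf[..transmit.size], &transmit);
///     buf.clear();
/// }
/// ```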
#[must_use]
pub fn poll_transmit(
&mut self,
now: Instant,
max_datagrams: usize,
buf: &mut Vec<u8>,
) -> Option<Transmit> {
assert!(max_datagrams != 0);
let max_datagrams = match self.config.enable_segmentation_offload {
false => 1,
true => max_datagrams.min(MAX_TRANSMIT_SEGMENTS),
};
let mut num_datagrams = 0;
// Position in `buf` of the first byte of the current UDP datagram. When coalescing QUIC
// packets, this can be earlier than the start of the current QUIC packet.
let mut datagram_start = 0;
let mut segment_size = usize::from(self.path.current_mtu());
// Send PATH_CHALLENGE for a previous path if necessary
if let Some((prev_cid, ref mut prev_path)) = self.prev_path {
if prev_path.challenge_pending {
prev_path.challenge_pending = false;
let token = prev_path
.challenge
.expect("previous path challenge pending without token");
let destination = prev_path.remote;
debug_assert_eq!(
self.highest_space,
SpaceId::Data,
"PATH_CHALLENGE queued without 1-RTT keys"
);
buf.reserve(MIN_INITIAL_SIZE as usize);
let buf_capacity = buf.capacity();
// Use the previous CID to avoid linking the new path with the previous path. We
// don't bother accounting for possible retirement of that prev_cid because this is
// sent once, immediately after migration, when the CID is known to be valid. Even
// if a post-migration packet caused the CID to be retired, it's fair to pretend
// this is sent first.
let mut builder = PacketBuilder::new(
now,
SpaceId::Data,
prev_cid,
buf,
buf_capacity,
0,
false,
self,
)?;
trace!("validating previous path with PATH_CHALLENGE {:08x}", token);
buf.write(frame::Type::PATH_CHALLENGE);
buf.write(token);
self.stats.frame_tx.path_challenge += 1;
// An endpoint MUST expand datagrams that contain a PATH_CHALLENGE frame
// to at least the smallest allowed maximum datagram size of 1200 bytes,
// unless the anti-amplification limit for the path does not permit
// sending a datagram of this size
builder.pad_to(MIN_INITIAL_SIZE);
builder.finish(self, buf);
self.stats.udp_tx.on_sent(1, buf.len());
return Some(Transmit {
destination,
size: buf.len(),
ecn: None,
segment_size: None,
src_ip: self.local_ip,
});
}
}
// If we need to send a probe, make sure we have something to send.
for space in SpaceId::iter() {
let request_immediate_ack =
space == SpaceId::Data && self.peer_supports_ack_frequency();
self.spaces[space].maybe_queue_probe(request_immediate_ack, &self.streams);
}
// Check whether we need to send a close message
let close = match self.state {
State::Drained => {
self.app_limited = true;
return None;
}
State::Draining | State::Closed(_) => {
// self.close is only reset once the associated packet has been
// encoded successfully
if !self.close {
self.app_limited = true;
return None;
}
true
}
_ => false,
};
// Check whether we need to send an ACK_FREQUENCY frame
if let Some(config) = &self.config.ack_frequency_config {
self.spaces[SpaceId::Data].pending.ack_frequency = self
.ack_frequency
.should_send_ack_frequency(self.path.rtt.get(), config, &self.peer_params)
&& self.highest_space == SpaceId::Data
&& self.peer_supports_ack_frequency();
}
// Reserving capacity can provide more capacity than we asked for. However, we are not
// allowed to write more than `segment_size`. Therefore the maximum capacity is tracked
// separately.
let mut buf_capacity = 0;
let mut coalesce = true;
let mut builder_storage: Option<PacketBuilder> = None;
let mut sent_frames = None;
let mut pad_datagram = false;
let mut congestion_blocked = false;
// Iterate over all spaces and find data to send
let mut space_idx = 0;
let spaces = [SpaceId::Initial, SpaceId::Handshake, SpaceId::Data];
// This loop will potentially spend multiple iterations in the same `SpaceId`,
// so we cannot trivially rewrite it to take advantage of `SpaceId::iter()`.
while space_idx < spaces.len() {
let space_id = spaces[space_idx];
// Number of bytes available for frames if this is a 1-RTT packet. We're guaranteed to
// be able to send an individual frame at least this large in the next 1-RTT
// packet. This could be generalized to support every space, but it's only needed to
// handle large fixed-size frames, which only exist in 1-RTT (application datagrams). We
// don't account for coalesced packets potentially occupying space because frames can
// always spill into the next datagram.
let pn = self.packet_number_filter.peek(&self.spaces[SpaceId::Data]);
let frame_space_1rtt =
segment_size.saturating_sub(self.predict_1rtt_overhead(Some(pn)));
// Is there data or a close message to send in this space?
let can_send = self.space_can_send(space_id, frame_space_1rtt);
if can_send.is_empty() && (!close || self.spaces[space_id].crypto.is_none()) {
space_idx += 1;
continue;
}
let mut ack_eliciting = !self.spaces[space_id].pending.is_empty(&self.streams)
|| self.spaces[space_id].ping_pending
|| self.spaces[space_id].immediate_ack_pending;
if space_id == SpaceId::Data {
ack_eliciting |= self.can_send_1rtt(frame_space_1rtt);
}
// Can we append more data into the current buffer?
// It is not safe to assume that `buf.len()` is the end of the data,
// since the last packet might not have been finished.
let buf_end = if let Some(builder) = &builder_storage {
buf.len().max(builder.min_size) + builder.tag_len
} else {
buf.len()
};
if !coalesce || buf_capacity - buf_end < MIN_PACKET_SPACE {
// We need to send 1 more datagram and extend the buffer for that.
// Is 1 more datagram allowed?
if buf_capacity >= segment_size * max_datagrams {
// No more datagrams allowed
break;
}
// Anti-amplification is only based on `total_sent`, which gets
// updated at the end of this method. Therefore we pass the number
// of bytes for datagrams that are already created, as well as 1 byte
// for starting another datagram. If there is any anti-amplification
// budget left, we always allow a full MTU to be sent
// (see https://github.com/quinn-rs/quinn/issues/1082)
if self
.path
.anti_amplification_blocked(segment_size as u64 * num_datagrams + 1)
{
trace!("blocked by anti-amplification");
break;
}
// Congestion control and pacing checks
// Tail loss probes must not be blocked by congestion, or a deadlock could arise
if ack_eliciting && self.spaces[space_id].loss_probes == 0 {
// Assume the current packet will get padded to fill the segment
let untracked_bytes = if let Some(builder) = &builder_storage {
buf_capacity - builder.partial_encode.start
} else {
0
} as u64;
debug_assert!(untracked_bytes <= segment_size as u64);
let bytes_to_send = segment_size as u64 + untracked_bytes;
if self.path.in_flight.bytes + bytes_to_send >= self.path.congestion.window() {
space_idx += 1;
congestion_blocked = true;
// We continue instead of breaking here in order to avoid
// blocking loss probes queued for higher spaces.
trace!("blocked by congestion control");
continue;
}
// Check whether the next datagram is blocked by pacing
let smoothed_rtt = self.path.rtt.get();
if let Some(delay) = self.path.pacing.delay(
smoothed_rtt,
bytes_to_send,
self.path.current_mtu(),
self.path.congestion.window(),
now,
) {
self.timers.set(Timer::Pacing, delay);
congestion_blocked = true;
// Loss probes should be subject to pacing, even though
// they are not congestion controlled.
trace!("blocked by pacing");
break;
}
}
// Finish current packet
if let Some(mut builder) = builder_storage.take() {
if pad_datagram {
builder.pad_to(MIN_INITIAL_SIZE);
}
if num_datagrams > 1 {
// If too many padding bytes would be required to continue the GSO batch
// after this packet, end the GSO batch here. Ensures that fixed-size frames
// with heterogeneous sizes (e.g. application datagrams) won't inadvertently
// waste large amounts of bandwidth. The exact threshold is a bit arbitrary
// and might benefit from further tuning, though there's no universally
// optimal value.
const MAX_PADDING: usize = 16;
let packet_len_unpadded = cmp::max(builder.min_size, buf.len())
- datagram_start
+ builder.tag_len;
if packet_len_unpadded + MAX_PADDING < segment_size {
trace!(
"GSO truncated by demand for {} padding bytes",
segment_size - packet_len_unpadded
);
builder_storage = Some(builder);
break;
}
// Pad the current packet to GSO segment size so it can be included in the
// GSO batch.
builder.pad_to(segment_size as u16);
}
builder.finish_and_track(now, self, sent_frames.take(), buf);
if num_datagrams == 1 {
// Set the segment size for this GSO batch to the size of the first UDP
// datagram in the batch. Larger data that cannot be fragmented
// (e.g. application datagrams) will be included in a future batch. When
// sending large enough volumes of data for GSO to be useful, we expect
// packet sizes to usually be consistent, e.g. populated by max-size STREAM
// frames or uniformly sized datagrams.
segment_size = buf.len();
// Clip the unused capacity out of the buffer so future packets don't
// overrun
buf_capacity = buf.len();
// Check whether the data we planned to send will fit in the reduced segment
// size. If not, bail out and leave it for the next GSO batch so we don't
// end up trying to send an empty packet. We can't easily compute the right
// segment size before the original call to `space_can_send`, because at
// that time we haven't determined whether we're going to coalesce with the
// first datagram or potentially pad it to `MIN_INITIAL_SIZE`.
if space_id == SpaceId::Data {
let frame_space_1rtt =
segment_size.saturating_sub(self.predict_1rtt_overhead(Some(pn)));
if self.space_can_send(space_id, frame_space_1rtt).is_empty() {
break;
}
}
}
}
// Allocate space for another datagram
buf_capacity += segment_size;
if buf.capacity() < buf_capacity {
// We reserve the maximum space for sending `max_datagrams` upfront
// to avoid any reallocations if more datagrams have to be appended later on.
// Benchmarks have shown a 5-10% throughput improvement
// compared to continuously resizing the datagram buffer.
// While this will lead to over-allocation for small transmits
// (e.g. purely containing ACKs), modern memory allocators
// (e.g. mimalloc and jemalloc) will pool certain allocation sizes
// and therefore this is still rather efficient.
buf.reserve(max_datagrams * segment_size);
}
num_datagrams += 1;
coalesce = true;
pad_datagram = false;
datagram_start = buf.len();
debug_assert_eq!(
datagram_start % segment_size,
0,
"datagrams in a GSO batch must be aligned to the segment size"
);
} else {
// We can append/coalesce the next packet into the current
// datagram.
// Finish current packet without adding extra padding
if let Some(builder) = builder_storage.take() {
builder.finish_and_track(now, self, sent_frames.take(), buf);
}
}
debug_assert!(buf_capacity - buf.len() >= MIN_PACKET_SPACE);
//
// From here on, we've determined that a packet will definitely be sent.
//
if self.spaces[SpaceId::Initial].crypto.is_some()
&& space_id == SpaceId::Handshake
&& self.side.is_client()
{
// A client stops both sending and processing Initial packets when it
// sends its first Handshake packet.
self.discard_space(now, SpaceId::Initial);
}
if let Some(ref mut prev) = self.prev_crypto {
prev.update_unacked = false;
}
debug_assert!(
builder_storage.is_none() && sent_frames.is_none(),
"Previous packet must have been finished"
);
// This should really be `builder_storage.insert()`, but `Option::insert`
// is not stable yet. Since we `debug_assert!(builder_storage.is_none())` it
// doesn't make any functional difference.
let builder = builder_storage.get_or_insert(PacketBuilder::new(
now,
space_id,
self.rem_cids.active(),
buf,
buf_capacity,
datagram_start,
ack_eliciting,
self,
)?);
coalesce = coalesce && !builder.short_header;
// https://tools.ietf.org/html/draft-ietf-quic-transport-34#section-14.1
pad_datagram |=
space_id == SpaceId::Initial && (self.side.is_client() || ack_eliciting);
if close {
trace!("sending CONNECTION_CLOSE");
// Encode ACKs before the ConnectionClose message, to give the receiver
// a better approximation of what data has been processed. This is
// especially important with ack delay, since the peer might not
// have received any other ACK for the data earlier on.
if !self.spaces[space_id].pending_acks.ranges().is_empty() {
Self::populate_acks(
now,
self.receiving_ecn,
&mut SentFrames::default(),
&mut self.spaces[space_id],
buf,
&mut self.stats,
);
}
// Since ACK frames are limited to 64 ranges, there will always be enough space
// to encode the ConnectionClose frame too. However, we still keep the
// check here to prevent crashes if something changes.
debug_assert!(
buf.len() + frame::ConnectionClose::SIZE_BOUND < builder.max_size,
"ACKs should leave space for ConnectionClose"
);
if buf.len() + frame::ConnectionClose::SIZE_BOUND < builder.max_size {
let max_frame_size = builder.max_size - buf.len();
match self.state {
State::Closed(state::Closed { ref reason }) => {
if space_id == SpaceId::Data || reason.is_transport_layer() {
reason.encode(buf, max_frame_size)
} else {
frame::ConnectionClose {
error_code: TransportErrorCode::APPLICATION_ERROR,
frame_type: None,
reason: Bytes::new(),
}
.encode(buf, max_frame_size)
}
}
State::Draining => frame::ConnectionClose {
error_code: TransportErrorCode::NO_ERROR,
frame_type: None,
reason: Bytes::new(),
}
.encode(buf, max_frame_size),
_ => unreachable!(
"tried to make a close packet when the connection wasn't closed"
),
}
}
if space_id == self.highest_space {
// Don't send another close packet
self.close = false;
// `CONNECTION_CLOSE` is the final packet
break;
} else {
// Send a close frame in every possible space for robustness, per RFC9000
// "Immediate Close during the Handshake". Don't bother trying to send anything
// else.
space_idx += 1;
continue;
}
}
// Send an off-path PATH_RESPONSE. Prioritized over on-path data to ensure that path
// validation can occur while the link is saturated.
if space_id == SpaceId::Data && num_datagrams == 1 {
if let Some((token, remote)) = self.path_responses.pop_off_path(&self.path.remote) {
// `unwrap` guaranteed to succeed because `builder_storage` was populated just
// above.
let mut builder = builder_storage.take().unwrap();
trace!("PATH_RESPONSE {:08x} (off-path)", token);
buf.write(frame::Type::PATH_RESPONSE);
buf.write(token);
self.stats.frame_tx.path_response += 1;
builder.pad_to(MIN_INITIAL_SIZE);
builder.finish_and_track(
now,
self,
Some(SentFrames {
non_retransmits: true,
..SentFrames::default()
}),
buf,
);
self.stats.udp_tx.on_sent(1, buf.len());
return Some(Transmit {
destination: remote,
size: buf.len(),
ecn: None,
segment_size: None,
src_ip: self.local_ip,
});
}
}
let sent =
self.populate_packet(now, space_id, buf, builder.max_size, builder.exact_number);
// ACK-only packets should only be sent when explicitly allowed. If we write them due to
// any other reason, there is a bug which leads to one component announcing write
// readiness while not writing any data. This degrades performance. The condition is
// only checked if the full MTU is available and when potentially large fixed-size
// frames aren't queued, so that lack of space in the datagram isn't the reason for just
// writing ACKs.
debug_assert!(
!(sent.is_ack_only(&self.streams)
&& !can_send.acks
&& can_send.other
&& (buf_capacity - builder.datagram_start) == self.path.current_mtu() as usize
&& self.datagrams.outgoing.is_empty()),
"SendableFrames was {can_send:?}, but only ACKs have been written"
);
pad_datagram |= sent.requires_padding;
if sent.largest_acked.is_some() {
self.spaces[space_id].pending_acks.acks_sent();
self.timers.stop(Timer::MaxAckDelay);
}
// Keep information about the packet around until it gets finalized
sent_frames = Some(sent);
// Don't increment space_idx.
// We stay in the current space and check if there is more data to send.
}
// Finish the last packet
if let Some(mut builder) = builder_storage {
if pad_datagram {
builder.pad_to(MIN_INITIAL_SIZE);
}
let last_packet_number = builder.exact_number;
builder.finish_and_track(now, self, sent_frames, buf);
self.path
.congestion
.on_sent(now, buf.len() as u64, last_packet_number);
}
self.app_limited = buf.is_empty() && !congestion_blocked;
// Send MTU probe if necessary
if buf.is_empty() && self.state.is_established() {
let space_id = SpaceId::Data;
let probe_size = match self
.path
.mtud
.poll_transmit(now, self.packet_number_filter.peek(&self.spaces[space_id]))
{
Some(next_probe_size) => next_probe_size,
None => return None,
};
let buf_capacity = probe_size as usize;
buf.reserve(buf_capacity);
let mut builder = PacketBuilder::new(
now,
space_id,
self.rem_cids.active(),
buf,
buf_capacity,
0,
true,
self,
)?;
// We implement MTU probes as ping packets padded up to the probe size
buf.write(frame::Type::PING);
self.stats.frame_tx.ping += 1;
// If supported by the peer, we want no delays to the probe's ACK
if self.peer_supports_ack_frequency() {
buf.write(frame::Type::IMMEDIATE_ACK);
self.stats.frame_tx.immediate_ack += 1;
}
builder.pad_to(probe_size);
let sent_frames = SentFrames {
non_retransmits: true,
..Default::default()
};
builder.finish_and_track(now, self, Some(sent_frames), buf);
self.stats.path.sent_plpmtud_probes += 1;
num_datagrams = 1;
trace!(?probe_size, "writing MTUD probe");
}
if buf.is_empty() {
return None;
}
trace!("sending {} bytes in {} datagrams", buf.len(), num_datagrams);