Compare commits

...

1 Commits

Author SHA1 Message Date
Conrad Irwin
8b713e2c58 Slow down reconnection logic
Currently, when we deploy a new version of collab, clients get kicked out
of rooms.

This is because we currently can't respond to the hello fast enough, so
the client disconnects and tries again. If this happens more than twice,
the client is now racing with the CLEANUP_TIMEOUT, which will mark any
connections to the old server as gone.

As we expect that reconnections will work fine, this just slows
everything down a bit to give the server more time.
2024-03-06 22:41:32 -07:00
2 changed files with 10 additions and 8 deletions

View File

@@ -65,7 +65,7 @@ lazy_static! {
}
pub const INITIAL_RECONNECTION_DELAY: Duration = Duration::from_millis(100);
pub const CONNECTION_TIMEOUT: Duration = Duration::from_secs(5);
pub const CONNECTION_TIMEOUT: Duration = Duration::from_secs(15);
actions!(client, [SignIn, SignOut, Reconnect]);
@@ -249,7 +249,6 @@ struct ClientState {
status: (watch::Sender<Status>, watch::Receiver<Status>),
entity_id_extractors: HashMap<TypeId, fn(&dyn AnyTypedEnvelope) -> u64>,
_reconnect_task: Option<Task<()>>,
reconnect_interval: Duration,
entities_by_type_and_remote_id: HashMap<(TypeId, u64), WeakSubscriber>,
models_by_message_type: HashMap<TypeId, AnyWeakModel>,
entity_types_by_message_type: HashMap<TypeId, TypeId>,
@@ -287,7 +286,6 @@ impl Default for ClientState {
status: watch::channel_with(Status::SignedOut),
entity_id_extractors: Default::default(),
_reconnect_task: None,
reconnect_interval: Duration::from_secs(5),
models_by_message_type: Default::default(),
entities_by_type_and_remote_id: Default::default(),
entity_types_by_message_type: Default::default(),
@@ -524,14 +522,13 @@ impl Client {
}
Status::ConnectionLost => {
let this = self.clone();
let reconnect_interval = state.reconnect_interval;
state._reconnect_task = Some(cx.spawn(move |cx| async move {
#[cfg(any(test, feature = "test-support"))]
let mut rng = StdRng::seed_from_u64(0);
#[cfg(not(any(test, feature = "test-support")))]
let mut rng = StdRng::from_entropy();
let mut delay = INITIAL_RECONNECTION_DELAY;
let mut delay = INITIAL_RECONNECTION_DELAY.mul_f32(rng.gen_range(0.1..=5.0));
while let Err(error) = this.authenticate_and_connect(true, &cx).await {
log::error!("failed to connect {}", error);
if matches!(*this.status().borrow(), Status::ConnectionError) {
@@ -544,7 +541,7 @@ impl Client {
cx.background_executor().timer(delay).await;
delay = delay
.mul_f32(rng.gen_range(1.0..=2.0))
.min(reconnect_interval);
.min(CONNECTION_TIMEOUT);
} else {
break;
}

View File

@@ -68,8 +68,13 @@ use util::SemanticVersion;
pub const RECONNECT_TIMEOUT: Duration = Duration::from_secs(30);
// Kubernetes gives terminated pods 10s to shut down gracefully. After they're gone, we can clean up old resources.
pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);
// This timeout must be longer than the kubernetes graceful shutdown period (10s) so that
// we can be sure the old server is gone.
// It also needs to be long enough that clients have a chance to reconnect before we kick
// them out of any rooms. They have a 15s timeout, so 35s should give enough time for them
// to try ~twice.
// This is all to paper over our server restarts causing a huge stampede; more work is needed.
pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(35);
const MESSAGE_COUNT_PER_PAGE: usize = 100;
const MAX_MESSAGE_LEN: usize = 1024;