Slow down reconnection logic

Currently, when we deploy a new version of collab, clients get kicked out of rooms. This is because we can't respond to the hello fast enough, so the client disconnects and tries again. If this happens more than twice, the client ends up racing with CLEANUP_TIMEOUT, after which any connections to the old server are marked as gone. Since we expect reconnections to work fine, this change just slows everything down a bit to give the server more time (the client's CONNECTION_TIMEOUT goes from 5s to 15s, the initial reconnection delay is randomized, the backoff is capped at CONNECTION_TIMEOUT, and the server's CLEANUP_TIMEOUT goes from 15s to 35s).
2024-03-06 22:41:32 -07:00
2 changed files with 10 additions and 8 deletions
--- a/crates/client/src/client.rs
+++ b/crates/client/src/client.rs
@@ -65,7 +65,7 @@ lazy_static! {
 }

 pub const INITIAL_RECONNECTION_DELAY: Duration = Duration::from_millis(100);
-pub const CONNECTION_TIMEOUT: Duration = Duration::from_secs(5);
+pub const CONNECTION_TIMEOUT: Duration = Duration::from_secs(15);

 actions!(client, [SignIn, SignOut, Reconnect]);

@@ -249,7 +249,6 @@ struct ClientState {
    status: (watch::Sender<Status>, watch::Receiver<Status>),
    entity_id_extractors: HashMap<TypeId, fn(&dyn AnyTypedEnvelope) -> u64>,
    _reconnect_task: Option<Task<()>>,
-    reconnect_interval: Duration,
    entities_by_type_and_remote_id: HashMap<(TypeId, u64), WeakSubscriber>,
    models_by_message_type: HashMap<TypeId, AnyWeakModel>,
    entity_types_by_message_type: HashMap<TypeId, TypeId>,
@@ -287,7 +286,6 @@ impl Default for ClientState {
            status: watch::channel_with(Status::SignedOut),
            entity_id_extractors: Default::default(),
            _reconnect_task: None,
-            reconnect_interval: Duration::from_secs(5),
            models_by_message_type: Default::default(),
            entities_by_type_and_remote_id: Default::default(),
            entity_types_by_message_type: Default::default(),
@@ -524,14 +522,13 @@ impl Client {
            }
            Status::ConnectionLost => {
                let this = self.clone();
-                let reconnect_interval = state.reconnect_interval;
                state._reconnect_task = Some(cx.spawn(move |cx| async move {
                    #[cfg(any(test, feature = "test-support"))]
                    let mut rng = StdRng::seed_from_u64(0);
                    #[cfg(not(any(test, feature = "test-support")))]
                    let mut rng = StdRng::from_entropy();

-                    let mut delay = INITIAL_RECONNECTION_DELAY;
+                    let mut delay = INITIAL_RECONNECTION_DELAY.mul_f32(rng.gen_range(0.1..=5.0));
                    while let Err(error) = this.authenticate_and_connect(true, &cx).await {
                        log::error!("failed to connect {}", error);
                        if matches!(*this.status().borrow(), Status::ConnectionError) {
@@ -544,7 +541,7 @@ impl Client {
                            cx.background_executor().timer(delay).await;
                            delay = delay
                                .mul_f32(rng.gen_range(1.0..=2.0))
-                                .min(reconnect_interval);
+                                .min(CONNECTION_TIMEOUT);
                        } else {
                            break;
                        }
--- a/crates/collab/src/rpc.rs
+++ b/crates/collab/src/rpc.rs
@@ -68,8 +68,13 @@ use util::SemanticVersion;

 pub const RECONNECT_TIMEOUT: Duration = Duration::from_secs(30);

-// kubernetes gives terminated pods 10s to shutdown gracefully. After they're gone, we can clean up old resources.
-pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(15);
+// This timeout must be longer than the kubernetes graceful shutdown period (10s) so that
+// we can be sure the old server is gone.
+// It also needs to be long enough that clients have a chance to reconnect before we kick
+// them out of any rooms. They have a 15s timeout, so 35s should give enough time for them
+// to try ~twice.
+// This is all to paper over our server restarts causing a huge stampede; more work is needed.
+pub const CLEANUP_TIMEOUT: Duration = Duration::from_secs(35);

 const MESSAGE_COUNT_PER_PAGE: usize = 100;
 const MAX_MESSAGE_LEN: usize = 1024;