Fix potential autoreconnect problem and add exponential backoff

This commit is contained in:
Tulir Asokan 2019-05-28 14:09:49 +03:00
parent 498c0e4130
commit 7f0c67168c
3 changed files with 25 additions and 11 deletions

@ -35,6 +35,7 @@ type BridgeConfig struct {
ConnectionTimeout int `yaml:"connection_timeout"`
MaxConnectionAttempts int `yaml:"max_connection_attempts"`
ConnectionRetryDelay int `yaml:"connection_retry_delay"`
ReportConnectionRetry bool `yaml:"report_connection_retry"`
InitialChatSync int `yaml:"initial_chat_sync_count"`
@ -56,6 +57,7 @@ type BridgeConfig struct {
func (bc *BridgeConfig) setDefaults() {
bc.ConnectionTimeout = 20
bc.MaxConnectionAttempts = 3
bc.ConnectionRetryDelay = -1
bc.ReportConnectionRetry = true
bc.InitialChatSync = 10

@ -62,6 +62,9 @@ bridge:
connection_timeout: 20
# Maximum number of times to retry connecting on connection error.
max_connection_attempts: 3
# Number of seconds to wait between connection attempts.
# Negative values enable exponential backoff; the delay in seconds is then: -connection_retry_delay + 1 + 2^attempts
connection_retry_delay: -1
# Whether or not the bridge should send a notice to the user's management room when it retries connecting.
# If false, it will only report when it stops retrying.
report_connection_retry: true

31
user.go

@ -348,23 +348,22 @@ func (user *User) HandleError(err error) {
if errors.Cause(err) != whatsapp.ErrInvalidWsData {
user.log.Errorln("WhatsApp error:", err)
}
var msg string
if closed, ok := err.(*whatsapp.ErrConnectionClosed); ok {
user.Connected = false
if closed.Code == 1000 {
// Normal closure
return
}
user.ConnectionErrors++
msg = fmt.Sprintf("Your WhatsApp connection was closed with websocket status code %d", closed.Code)
go user.tryReconnect(fmt.Sprintf("Your WhatsApp connection was closed with websocket status code %d", closed.Code))
} else if failed, ok := err.(*whatsapp.ErrConnectionFailed); ok {
user.Connected = false
user.ConnectionErrors++
msg = fmt.Sprintf("Your WhatsApp connection failed: %v", failed.Err)
} else {
// Unknown error, probably mostly harmless
return
go user.tryReconnect(fmt.Sprintf("Your WhatsApp connection failed: %v", failed.Err))
}
// Otherwise unknown error, probably mostly harmless
}
func (user *User) tryReconnect(msg string) {
if user.ConnectionErrors > user.bridge.Config.Bridge.MaxConnectionAttempts {
content := format.RenderMarkdown(fmt.Sprintf("%s. Use the `reconnect` command to reconnect.", msg))
_, _ = user.bridge.Bot.SendMessageEvent(user.ManagementRoom, mautrix.EventMessage, content)
@ -375,9 +374,16 @@ func (user *User) HandleError(err error) {
// Don't want the same error to be repeated
msg = ""
}
tries := 0
var tries uint
var exponentialBackoff bool
baseDelay := time.Duration(user.bridge.Config.Bridge.ConnectionRetryDelay)
if baseDelay < 0 {
exponentialBackoff = true
baseDelay = -baseDelay + 1
}
delay := baseDelay
for user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts {
err = user.Conn.Restore()
err := user.Conn.Restore()
if err == nil {
user.ConnectionErrors = 0
user.Connected = true
@ -389,11 +395,14 @@ func (user *User) HandleError(err error) {
tries++
user.ConnectionErrors++
if user.ConnectionErrors <= user.bridge.Config.Bridge.MaxConnectionAttempts {
if exponentialBackoff {
delay = (1 << tries) + baseDelay
}
if user.bridge.Config.Bridge.ReportConnectionRetry {
_, _ = user.bridge.Bot.SendNotice(user.ManagementRoom,
fmt.Sprintf("Reconnection attempt failed: %v. Retrying in 10 seconds...", err))
fmt.Sprintf("Reconnection attempt failed: %v. Retrying in %d seconds...", err, delay))
}
time.Sleep(10 * time.Second)
time.Sleep(delay * time.Second)
}
}