feat(BRIDGE-363): Observability metrics for IMAP connections; minor unleash service refactor;

This commit is contained in:
Atanas Janeshliev
2025-05-12 13:35:52 +02:00
parent a305ee1113
commit 89da7335b6
26 changed files with 334 additions and 64 deletions

View File

@ -36,6 +36,9 @@ type IMAPServerManager interface {
RemoveIMAPUser(ctx context.Context, deleteData bool, provider GluonIDProvider, addrID ...string) error
LogRemoteLabelIDs(ctx context.Context, provider GluonIDProvider, addrID ...string) error
GetOpenIMAPSessionCount() int
GetRollingIMAPConnectionCount() int
}
type NullIMAPServerManager struct{}
@ -67,6 +70,14 @@ func (n NullIMAPServerManager) LogRemoteLabelIDs(
return nil
}
// GetOpenIMAPSessionCount always returns 0: the null manager reports no open IMAP sessions.
func (n NullIMAPServerManager) GetOpenIMAPSessionCount() int {
return 0
}
// GetRollingIMAPConnectionCount always returns 0: the null manager reports no recently opened IMAP connections.
func (n NullIMAPServerManager) GetRollingIMAPConnectionCount() int {
return 0
}
// NewNullIMAPServerManager returns a pointer to a fresh, zero-value NullIMAPServerManager.
func NewNullIMAPServerManager() *NullIMAPServerManager {
	return new(NullIMAPServerManager)
}

View File

@ -24,6 +24,7 @@ import (
"io"
"os"
"path/filepath"
"time"
"github.com/Masterminds/semver/v3"
"github.com/ProtonMail/gluon"
@ -40,6 +41,12 @@ import (
"github.com/sirupsen/logrus"
)
// Parameters handed to gluon.WithConnectionRollingCounter when the IMAP server is built.
// NOTE(review): the exact semantics are defined by gluon; presumably 6 buckets rotated every
// 10 seconds yield a rolling window of roughly one minute — confirm against gluon.
const (
rollingCounterNewConnectionThreshold = 300
rollingCounterNumberOfBuckets = 6
rollingCounterBucketRotationInterval = time.Second * 10
)
var logIMAP = logrus.WithField("pkg", "server/imap") //nolint:gochecknoglobals
type IMAPSettingsProvider interface {
@ -126,6 +133,7 @@ func newIMAPServer(
gluon.WithUIDValidityGenerator(uidValidityGenerator),
gluon.WithPanicHandler(panicHandler),
gluon.WithObservabilitySender(observability.NewAdapter(observabilitySender), int(observability.GluonImapError), int(observability.GluonMessageError), int(observability.GluonOtherError)),
gluon.WithConnectionRollingCounter(rollingCounterNewConnectionThreshold, rollingCounterNumberOfBuckets, rollingCounterBucketRotationInterval),
}
if disableIMAPAuthenticate {

View File

@ -200,6 +200,14 @@ func (sm *Service) RemoveSMTPAccount(ctx context.Context, service *bridgesmtp.Se
return err
}
// GetOpenIMAPSessionCount returns the value reported by the IMAP server's GetOpenSessionCount.
func (sm *Service) GetOpenIMAPSessionCount() int {
return sm.imapServer.GetOpenSessionCount()
}
// GetRollingIMAPConnectionCount returns the value reported by the IMAP server's
// GetRollingIMAPConnectionCount.
func (sm *Service) GetRollingIMAPConnectionCount() int {
return sm.imapServer.GetRollingIMAPConnectionCount()
}
func (sm *Service) run(ctx context.Context, subscription events.Subscription) {
eventSub := subscription.Add()
defer subscription.Remove(eventSub)

View File

@ -44,7 +44,7 @@ type Service struct {
store *Store
getFlagValueFn unleash.GetFlagValueFn
featureFlagValueProvider unleash.FeatureFlagValueProvider
observabilitySender observability.Sender
}
@ -52,7 +52,7 @@ type Service struct {
const bitfieldRegexPattern = `^\\\d+`
func NewService(userID string, service userevents.Subscribable, eventPublisher events.EventPublisher, store *Store,
getFlagFn unleash.GetFlagValueFn, observabilitySender observability.Sender) *Service {
featureFlagValueProvider unleash.FeatureFlagValueProvider, observabilitySender observability.Sender) *Service {
return &Service{
userID: userID,
@ -68,8 +68,8 @@ func NewService(userID string, service userevents.Subscribable, eventPublisher e
store: store,
getFlagValueFn: getFlagFn,
observabilitySender: observabilitySender,
featureFlagValueProvider: featureFlagValueProvider,
observabilitySender: observabilitySender,
}
}
@ -102,7 +102,7 @@ func (s *Service) run(ctx context.Context) {
}
func (s *Service) HandleNotificationEvents(ctx context.Context, notificationEvents []proton.NotificationEvent) error {
if s.getFlagValueFn(unleash.EventLoopNotificationDisabled) {
if s.featureFlagValueProvider.GetFlagValue(unleash.EventLoopNotificationDisabled) {
s.log.Info("Received notification events. Skipping as kill switch is enabled.")
return nil
}

View File

@ -19,6 +19,7 @@ package observability
import (
"github.com/ProtonMail/go-proton-api"
"github.com/ProtonMail/proton-bridge/v3/internal/services/observability/gluonmetrics"
)
type Adapter struct {
@ -88,6 +89,15 @@ func (adapter *Adapter) AddDistinctMetrics(errType interface{}, metrics ...map[s
}
if len(typedMetrics) > 0 {
adapter.sender.AddDistinctMetrics(DistinctionErrorTypeEnum(errTypeInt), typedMetrics...)
adapter.sender.AddDistinctMetrics(DistinctionMetricTypeEnum(errTypeInt), typedMetrics...)
}
}
// AddIMAPConnectionsExceededThresholdMetric builds the "new IMAP connections exceed threshold"
// metric from the sender's email client and the bucketed connection counts, then hands it to
// the sender's time-limited path under the NewIMAPConnectionsExceedThreshold key.
//
// totalOpenIMAPConnections and newIMAPConnections are raw counts; they are converted to
// bucket labels with BucketIMAPConnections before being attached to the metric.
func (adapter *Adapter) AddIMAPConnectionsExceededThresholdMetric(totalOpenIMAPConnections, newIMAPConnections int) {
	sender := adapter.sender

	emailClient := sender.GetEmailClient()
	openBucket := BucketIMAPConnections(totalOpenIMAPConnections)
	newBucket := BucketIMAPConnections(newIMAPConnections)

	sender.AddTimeLimitedMetric(
		NewIMAPConnectionsExceedThreshold,
		gluonmetrics.GenerateNewOpenedIMAPConnectionsExceedThreshold(emailClient, openBucket, newBucket),
	)
}

View File

@ -19,21 +19,22 @@ package observability
import "time"
// DistinctionErrorTypeEnum - maps to the specific error schema for which we
// want to send a user update.
type DistinctionErrorTypeEnum int
// DistinctionMetricTypeEnum - used to distinguish specific metrics which we want to limit over some interval.
// Most enums are tied to a specific error schema for which we also send a specific distinction user update.
type DistinctionMetricTypeEnum int
const (
SyncError DistinctionErrorTypeEnum = iota
SyncError DistinctionMetricTypeEnum = iota
GluonImapError
GluonMessageError
GluonOtherError
SMTPError
EventLoopError // EventLoopError - must stay the last error-type key: createLastSentMap seeds every value from SyncError through EventLoopError.
// NewIMAPConnectionsExceedThreshold - declared after EventLoopError, so it is not seeded by createLastSentMap.
NewIMAPConnectionsExceedThreshold
)
// errorSchemaMap - maps between the DistinctionErrorTypeEnum and the relevant schema name.
var errorSchemaMap = map[DistinctionErrorTypeEnum]string{ //nolint:gochecknoglobals
// errorSchemaMap - maps between some DistinctionMetricTypeEnum and the relevant schema name.
var errorSchemaMap = map[DistinctionMetricTypeEnum]string{ //nolint:gochecknoglobals
SyncError: "bridge_sync_errors_users_total",
EventLoopError: "bridge_event_loop_events_errors_users_total",
GluonImapError: "bridge_gluon_imap_errors_users_total",
@ -43,9 +44,9 @@ var errorSchemaMap = map[DistinctionErrorTypeEnum]string{ //nolint:gochecknoglob
}
// createLastSentMap - needs to be updated whenever we make changes to the enum.
func createLastSentMap() map[DistinctionErrorTypeEnum]time.Time {
func createLastSentMap() map[DistinctionMetricTypeEnum]time.Time {
registerTime := time.Now().Add(-updateInterval)
lastSentMap := make(map[DistinctionErrorTypeEnum]time.Time)
lastSentMap := make(map[DistinctionMetricTypeEnum]time.Time)
for errType := SyncError; errType <= EventLoopError; errType++ {
lastSentMap[errType] = registerTime

View File

@ -40,7 +40,7 @@ type distinctionUtility struct {
panicHandler async.PanicHandler
lastSentMap map[DistinctionErrorTypeEnum]time.Time // Ensures we don't step over the limit of one user update every 5 mins.
lastSentMap map[DistinctionMetricTypeEnum]time.Time // Ensures we don't step over the limit of one user update every 5 mins.
observabilitySender observabilitySender
settingsGetter settingsGetter
@ -87,7 +87,7 @@ func (d *distinctionUtility) setSettingsGetter(getter settingsGetter) {
// checkAndUpdateLastSentMap - checks whether we have sent a relevant user update metric
// within the last 5 minutes.
func (d *distinctionUtility) checkAndUpdateLastSentMap(key DistinctionErrorTypeEnum) bool {
func (d *distinctionUtility) checkAndUpdateLastSentMap(key DistinctionMetricTypeEnum) bool {
curTime := time.Now()
val, ok := d.lastSentMap[key]
if !ok {
@ -107,7 +107,7 @@ func (d *distinctionUtility) checkAndUpdateLastSentMap(key DistinctionErrorTypeE
// and the relevant settings. In the future this will need to be expanded to support multiple
// versions of the metric if we ever decide to change them.
func (d *distinctionUtility) generateUserMetric(
metricType DistinctionErrorTypeEnum,
metricType DistinctionMetricTypeEnum,
) proton.ObservabilityMetric {
schemaName, ok := errorSchemaMap[metricType]
if !ok {
@ -138,7 +138,7 @@ func generateUserMetric(schemaName, plan, mailClient, dohEnabled, betaAccess str
}
}
func (d *distinctionUtility) generateDistinctMetrics(errType DistinctionErrorTypeEnum, metrics ...proton.ObservabilityMetric) []proton.ObservabilityMetric {
func (d *distinctionUtility) generateDistinctMetrics(errType DistinctionMetricTypeEnum, metrics ...proton.ObservabilityMetric) []proton.ObservabilityMetric {
d.updateHeartbeatData(errType)
if d.checkAndUpdateLastSentMap(errType) {

View File

@ -0,0 +1,45 @@
// Copyright (c) 2025 Proton AG
//
// This file is part of Proton Mail Bridge.
//
// Proton Mail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proton Mail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Proton Mail Bridge. If not, see <https://www.gnu.org/licenses/>.
package gluonmetrics
import (
"time"
"github.com/ProtonMail/go-proton-api"
)
// Schema name and version stamped on the metric produced by
// GenerateNewOpenedIMAPConnectionsExceedThreshold.
const (
newIMAPConnectionThresholdExceededSchemaName = "bridge_imap_recently_opened_connections_total"
newIMAPConnectionThresholdExceededVersion = 1
)
// GenerateNewOpenedIMAPConnectionsExceedThreshold builds the observability metric for the
// "recently opened IMAP connections" schema.
//
// The three string arguments are copied verbatim into the metric labels "mailClient",
// "numberOfOpenIMAPConnectionsBuckets" and "numberOfRecentlyOpenedIMAPConnectionsBuckets".
// The metric value is always 1 and the timestamp is the current Unix time.
func GenerateNewOpenedIMAPConnectionsExceedThreshold(emailClient, totalOpenIMAPConnectionCount, newlyOpenedIMAPConnectionCount string) proton.ObservabilityMetric {
	now := time.Now().Unix()

	labels := map[string]string{
		"mailClient":                         emailClient,
		"numberOfOpenIMAPConnectionsBuckets": totalOpenIMAPConnectionCount,
		"numberOfRecentlyOpenedIMAPConnectionsBuckets": newlyOpenedIMAPConnectionCount,
	}

	payload := map[string]interface{}{
		"Value":  1,
		"Labels": labels,
	}

	return proton.ObservabilityMetric{
		Name:      newIMAPConnectionThresholdExceededSchemaName,
		Version:   newIMAPConnectionThresholdExceededVersion,
		Timestamp: now,
		Data:      payload,
	}
}

View File

@ -42,7 +42,7 @@ func (d *distinctionUtility) resetHeartbeatData() {
d.heartbeatData.receivedGluonError = false
}
func (d *distinctionUtility) updateHeartbeatData(errType DistinctionErrorTypeEnum) {
func (d *distinctionUtility) updateHeartbeatData(errType DistinctionMetricTypeEnum) {
d.withUpdateHeartbeatDataLock(func() {
//nolint:exhaustive
switch errType {

View File

@ -45,7 +45,9 @@ type client struct {
// so we can easily pass them down to relevant components.
type Sender interface {
AddMetrics(metrics ...proton.ObservabilityMetric)
AddDistinctMetrics(errType DistinctionErrorTypeEnum, metrics ...proton.ObservabilityMetric)
AddDistinctMetrics(errType DistinctionMetricTypeEnum, metrics ...proton.ObservabilityMetric)
AddTimeLimitedMetric(metricType DistinctionMetricTypeEnum, metric proton.ObservabilityMetric)
GetEmailClient() string
}
type Service struct {
@ -325,11 +327,25 @@ func (s *Service) AddMetrics(metrics ...proton.ObservabilityMetric) {
// what number of events come from what number of users.
// As the binning interval is what allows us to do this we
// should not send these if there are no logged-in users at that moment.
func (s *Service) AddDistinctMetrics(errType DistinctionErrorTypeEnum, metrics ...proton.ObservabilityMetric) {
func (s *Service) AddDistinctMetrics(errType DistinctionMetricTypeEnum, metrics ...proton.ObservabilityMetric) {
metrics = s.distinctionUtility.generateDistinctMetrics(errType, metrics...)
s.addMetricsIfClients(metrics...)
}
// AddTimeLimitedMetric - queues the given metric only when the distinction utility's
// last-sent check for metricType passes, i.e. no metric of the same type was sent within
// the interval defined in the distinction utility. Otherwise the metric is dropped.
func (s *Service) AddTimeLimitedMetric(metricType DistinctionMetricTypeEnum, metric proton.ObservabilityMetric) {
	if allowed := s.distinctionUtility.checkAndUpdateLastSentMap(metricType); allowed {
		s.addMetricsIfClients(metric)
	}
}
// GetEmailClient returns the result of the distinction utility's getEmailClientUserAgent.
func (s *Service) GetEmailClient() string {
return s.distinctionUtility.getEmailClientUserAgent()
}
// ModifyHeartbeatInterval - should only be used for testing. Resets the heartbeat ticker.
func (s *Service) ModifyHeartbeatInterval(duration time.Duration) {
s.distinctionUtility.heartbeatTicker.Reset(duration)

View File

@ -66,3 +66,30 @@ func getEnabled(value bool) string {
}
return "enabled"
}
// BucketIMAPConnections converts a raw connection count into the label of the range it
// falls in. Each range is half-open: a value belongs to the first bucket whose exclusive
// upper bound is greater than it. Anything at or above 3000 is reported as "3000+", and
// anything below 10 (including negative values) as "<10".
func BucketIMAPConnections(val int) string {
	buckets := [...]struct {
		upperBound int // exclusive
		label      string
	}{
		{10, "<10"},
		{25, "10-24"},
		{50, "25-49"},
		{100, "50-99"},
		{200, "100-199"},
		{300, "200-299"},
		{500, "300-499"},
		{1000, "500-999"},
		{2000, "1000-1999"},
		{3000, "2000-2999"},
	}

	for _, bucket := range buckets {
		if val < bucket.upperBound {
			return bucket.label
		}
	}

	return "3000+"
}

View File

@ -21,6 +21,7 @@ import (
"time"
"github.com/ProtonMail/go-proton-api"
"github.com/ProtonMail/proton-bridge/v3/internal/services/observability"
)
const (
@ -29,6 +30,9 @@ const (
smtpSendSuccessSchemaName = "bridge_smtp_send_success_total"
smtpSendSuccessSchemaVersion = 1
smtpSubmissionRequestSchemaName = "bridge_smtp_send_request_total"
smtpSubmissionRequestSchemaVersion = 1
)
func generateSMTPErrorObservabilityMetric(errorType string) proton.ObservabilityMetric {
@ -88,3 +92,19 @@ func GenerateSMTPSendSuccess() proton.ObservabilityMetric {
},
}
}
// GenerateSMTPSubmissionRequest builds the observability metric for the SMTP send request
// schema.
//
// emailClient is stored verbatim under the "mailClient" label. The two connection counts are
// converted with observability.BucketIMAPConnections and stored under
// "numberOfOpenIMAPConnections" and "numberOfRecentlyOpenedIMAPConnections". The metric value
// is always 1 and the timestamp is the current Unix time.
func GenerateSMTPSubmissionRequest(emailClient string, numberOfOpenIMAPConnections, numberOfRecentlyOpenedIMAPConnections int) proton.ObservabilityMetric {
	now := time.Now().Unix()

	openBucket := observability.BucketIMAPConnections(numberOfOpenIMAPConnections)
	recentBucket := observability.BucketIMAPConnections(numberOfRecentlyOpenedIMAPConnections)

	labels := map[string]string{
		"numberOfOpenIMAPConnections":           openBucket,
		"numberOfRecentlyOpenedIMAPConnections": recentBucket,
		"mailClient":                            emailClient,
	}

	return proton.ObservabilityMetric{
		Name:      smtpSubmissionRequestSchemaName,
		Version:   smtpSubmissionRequestSchemaVersion,
		Timestamp: now,
		Data: map[string]interface{}{
			"Value":  1,
			"Labels": labels,
		},
	}
}

View File

@ -32,13 +32,24 @@ import (
"github.com/ProtonMail/proton-bridge/v3/internal/services/observability"
"github.com/ProtonMail/proton-bridge/v3/internal/services/orderedtasks"
"github.com/ProtonMail/proton-bridge/v3/internal/services/sendrecorder"
"github.com/ProtonMail/proton-bridge/v3/internal/services/smtp/observabilitymetrics"
"github.com/ProtonMail/proton-bridge/v3/internal/services/userevents"
"github.com/ProtonMail/proton-bridge/v3/internal/services/useridentity"
"github.com/ProtonMail/proton-bridge/v3/internal/unleash"
"github.com/ProtonMail/proton-bridge/v3/internal/usertypes"
"github.com/ProtonMail/proton-bridge/v3/pkg/cpc"
"github.com/sirupsen/logrus"
)
// newlyOpenedIMAPConnectionsThreshold is the rolling IMAP connection count at or above which
// sendMail submits a report via the reporter (unless the corresponding kill switch is enabled).
// NOTE(review): same value as rollingCounterNewConnectionThreshold used for the gluon rolling
// counter — presumably the two are meant to stay in sync; confirm.
const (
newlyOpenedIMAPConnectionsThreshold = 300
)
// imapSessionCountProvider supplies the IMAP session/connection counts that sendMail reads
// for its log fields, the SMTP submission metric and the threshold report.
type imapSessionCountProvider interface {
// GetOpenIMAPSessionCount returns the count logged by sendMail as "openIMAPConnectionsCount".
GetOpenIMAPSessionCount() int
// GetRollingIMAPConnectionCount returns the count logged by sendMail as
// "newlyOpenedIMAPConnectionsCount" and compared against newlyOpenedIMAPConnectionsThreshold.
GetRollingIMAPConnectionCount() int
}
type Service struct {
userID string
panicHandler async.PanicHandler
@ -59,6 +70,9 @@ type Service struct {
serverManager ServerManager
observabilitySender observability.Sender
imapSessionCountProvider imapSessionCountProvider
featureFlagValueProvider unleash.FeatureFlagValueProvider
}
func NewService(
@ -74,6 +88,8 @@ func NewService(
identityState *useridentity.State,
serverManager ServerManager,
observabilitySender observability.Sender,
imapSessionCountProvider imapSessionCountProvider,
featureFlagValueProvider unleash.FeatureFlagValueProvider,
) *Service {
subscriberName := fmt.Sprintf("smpt-%v", userID)
@ -99,7 +115,9 @@ func NewService(
addressMode: mode,
serverManager: serverManager,
observabilitySender: observabilitySender,
imapSessionCountProvider: imapSessionCountProvider,
observabilitySender: observabilitySender,
featureFlagValueProvider: featureFlagValueProvider,
}
}
@ -207,7 +225,6 @@ func (s *Service) run(ctx context.Context) {
switch r := request.Value().(type) {
case *sendMailReq:
s.log.Debug("Received send mail request")
err := s.sendMail(ctx, r)
request.Reply(ctx, nil, err)
@ -252,15 +269,38 @@ type sendMailReq struct {
func (s *Service) sendMail(ctx context.Context, req *sendMailReq) error {
defer async.HandlePanic(s.panicHandler)
openSessionCount := s.imapSessionCountProvider.GetOpenIMAPSessionCount()
newlyOpenedSessions := s.imapSessionCountProvider.GetRollingIMAPConnectionCount()
log := s.log.WithFields(logrus.Fields{
"newlyOpenedIMAPConnectionsCount": newlyOpenedSessions,
"openIMAPConnectionsCount": openSessionCount,
})
log.Debug("Received send mail request")
// Send SMTP send request metric to observability.
s.observabilitySender.AddMetrics(observabilitymetrics.GenerateSMTPSubmissionRequest(s.observabilitySender.GetEmailClient(), openSessionCount, newlyOpenedSessions))
// Send report to sentry if kill switch is disabled & number of newly opened IMAP connections exceed threshold.
if !s.featureFlagValueProvider.GetFlagValue(unleash.SMTPSubmissionRequestSentryReportDisabled) && newlyOpenedSessions >= newlyOpenedIMAPConnectionsThreshold {
if err := s.reporter.ReportMessageWithContext("SMTP Send Mail Request - newly opened IMAP connections exceed threshold", reporter.Context{
"newlyOpenedIMAPConnectionsCount": newlyOpenedSessions,
"openIMAPConnectionsCount": openSessionCount,
"emailClient": s.observabilitySender.GetEmailClient(),
}); err != nil {
s.log.WithError(err).Error("Failed to submit report to sentry (SMTP Send Mail Request)")
}
}
start := time.Now()
defer func() {
end := time.Now()
s.log.Debugf("Send mail request finished in %v", end.Sub(start))
log.Debugf("Send mail request finished in %v", end.Sub(start))
}()
if err := s.smtpSendMail(ctx, req.authID, req.from, req.to, req.r); err != nil {
if apiErr := new(proton.APIError); errors.As(err, &apiErr) {
s.log.WithError(apiErr).WithField("Details", apiErr.DetailsToString()).Error("failed to send message")
log.WithError(apiErr).WithField("Details", apiErr.DetailsToString()).Error("failed to send message")
}
return err