feat(BRIDGE-150): Observability service modification; user distinction utility & heartbeat; various observability metrics & relevant integration tests

This commit is contained in:
Atanas Janeshliev
2024-09-23 10:13:05 +00:00
parent 5b874657cb
commit 3ca9e625f5
30 changed files with 1348 additions and 106 deletions

View File

@ -160,10 +160,6 @@ func (s *childJob) onError(err error) {
s.job.onError(err)
}
// userID returns the userID field of the job this child job wraps.
func (s *childJob) userID() string {
return s.job.userID
}
func (s *childJob) chunkDivide(chunks [][]proton.FullMessage) []childJob {
numChunks := len(chunks)

View File

@ -0,0 +1,67 @@
// Copyright (c) 2024 Proton AG
//
// This file is part of Proton Mail Bridge.
//
// Proton Mail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proton Mail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Proton Mail Bridge. If not, see <https://www.gnu.org/licenses/>.
package observabilitymetrics
import (
"time"
"github.com/ProtonMail/go-proton-api"
)
// Schema identifiers for the message build metrics emitted by this package.
const (
	// Schema names, one per outcome of a message build.
	successCaseSchemaName = "bridge_sync_message_build_success_total"
	errorCaseSchemaName   = "bridge_sync_message_build_errors_total"

	// Schema versions paired with the names above.
	successCaseSchemaVersion = 1
	errorCaseSchemaVersion   = 1
)
// generateStageBuildFailureObservabilityMetric assembles a metric for the
// message build error schema. The metric has a "Value" of 1, a single
// "errorType" label set to the supplied errorType, and a timestamp taken from
// the current Unix time in seconds.
func generateStageBuildFailureObservabilityMetric(errorType string) proton.ObservabilityMetric {
	labels := map[string]string{"errorType": errorType}

	payload := map[string]any{
		"Value":  1,
		"Labels": labels,
	}

	return proton.ObservabilityMetric{
		Name:      errorCaseSchemaName,
		Version:   errorCaseSchemaVersion,
		Timestamp: time.Now().Unix(),
		Data:      payload,
	}
}
// GenerateNoUnlockedKeyringMetric returns a build failure metric whose
// "errorType" label is "noUnlockedKeyring".
func GenerateNoUnlockedKeyringMetric() proton.ObservabilityMetric {
	const errorType = "noUnlockedKeyring"

	return generateStageBuildFailureObservabilityMetric(errorType)
}
// GenerateFailedToBuildMetric returns a build failure metric whose
// "errorType" label is "failedToBuild".
func GenerateFailedToBuildMetric() proton.ObservabilityMetric {
	const errorType = "failedToBuild"

	return generateStageBuildFailureObservabilityMetric(errorType)
}
// GenerateMessageBuiltSuccessMetric returns a metric for the message build
// success schema with a "Value" of 1 and a timestamp taken from the current
// Unix time in seconds.
//
// The metric carries an empty "Labels" map.
// NOTE(review): the original author was not sure this is the correct way to
// encode a metric without labels and expected the integration tests to
// confirm it.
func GenerateMessageBuiltSuccessMetric() proton.ObservabilityMetric {
	noLabels := make(map[string]string)

	payload := map[string]any{
		"Value":  1,
		"Labels": noLabels,
	}

	return proton.ObservabilityMetric{
		Name:      successCaseSchemaName,
		Version:   successCaseSchemaVersion,
		Timestamp: time.Now().Unix(),
		Data:      payload,
	}
}

View File

@ -21,7 +21,7 @@ import (
"context"
"github.com/ProtonMail/gluon/async"
"github.com/ProtonMail/gluon/reporter"
"github.com/ProtonMail/proton-bridge/v3/internal/services/observability"
)
// Service which mediates IMAP syncing in Bridge.
@ -36,8 +36,9 @@ type Service struct {
group *async.Group
}
func NewService(reporter reporter.Reporter,
func NewService(
panicHandler async.PanicHandler,
observabilitySender observability.Sender,
) *Service {
limits := newSyncLimits(2 * Gigabyte)
@ -50,7 +51,7 @@ func NewService(reporter reporter.Reporter,
limits: limits,
metadataStage: NewMetadataStage(metaCh, downloadCh, limits.DownloadRequestMem, panicHandler),
downloadStage: NewDownloadStage(downloadCh, buildCh, limits.MaxParallelDownloads, panicHandler),
buildStage: NewBuildStage(buildCh, applyCh, limits.MessageBuildMem, panicHandler, reporter),
buildStage: NewBuildStage(buildCh, applyCh, limits.MessageBuildMem, panicHandler, observabilitySender),
applyStage: NewApplyStage(applyCh),
metaCh: metaCh,
group: async.NewGroup(context.Background(), panicHandler),

View File

@ -26,9 +26,10 @@ import (
"github.com/ProtonMail/gluon/async"
"github.com/ProtonMail/gluon/logging"
"github.com/ProtonMail/gluon/reporter"
"github.com/ProtonMail/go-proton-api"
"github.com/ProtonMail/gopenpgp/v2/crypto"
"github.com/ProtonMail/proton-bridge/v3/internal/services/observability"
obsMetrics "github.com/ProtonMail/proton-bridge/v3/internal/services/syncservice/observabilitymetrics"
"github.com/bradenaw/juniper/parallel"
"github.com/bradenaw/juniper/xslices"
"github.com/sirupsen/logrus"
@ -50,8 +51,10 @@ type BuildStage struct {
maxBuildMem uint64
panicHandler async.PanicHandler
reporter reporter.Reporter
log *logrus.Entry
// Observability
observabilitySender observability.Sender
}
func NewBuildStage(
@ -59,15 +62,15 @@ func NewBuildStage(
output BuildStageOutput,
maxBuildMem uint64,
panicHandler async.PanicHandler,
reporter reporter.Reporter,
observabilitySender observability.Sender,
) *BuildStage {
return &BuildStage{
input: input,
output: output,
maxBuildMem: maxBuildMem,
log: logrus.WithField("sync-stage", "build"),
panicHandler: panicHandler,
reporter: reporter,
input: input,
output: output,
maxBuildMem: maxBuildMem,
log: logrus.WithField("sync-stage", "build"),
panicHandler: panicHandler,
observabilitySender: observabilitySender,
}
}
@ -147,35 +150,24 @@ func (b *BuildStage) run(ctx context.Context) {
req.job.log.WithError(err).Error("Failed to add failed message ID")
}
if err := b.reporter.ReportMessageWithContext("Failed to build message - no unlocked keyring (sync)", reporter.Context{
"messageID": msg.ID,
"userID": req.userID(),
}); err != nil {
req.job.log.WithError(err).Error("Failed to report message build error")
}
b.observabilitySender.AddDistinctMetrics(observability.SyncError, obsMetrics.GenerateNoUnlockedKeyringMetric())
return BuildResult{}, nil
}
res, err := req.job.messageBuilder.BuildMessage(req.job.labels, msg, kr, new(bytes.Buffer))
if err != nil {
req.job.log.WithError(err).WithField("msgID", msg.ID).Error("Failed to build message (syn)")
req.job.log.WithError(err).WithField("msgID", msg.ID).Error("Failed to build message (sync)")
if err := req.job.state.AddFailedMessageID(req.getContext(), msg.ID); err != nil {
req.job.log.WithError(err).Error("Failed to add failed message ID")
}
if err := b.reporter.ReportMessageWithContext("Failed to build message (sync)", reporter.Context{
"messageID": msg.ID,
"error": err,
"userID": req.userID(),
}); err != nil {
req.job.log.WithError(err).Error("Failed to report message build error")
}
b.observabilitySender.AddDistinctMetrics(observability.SyncError, obsMetrics.GenerateFailedToBuildMetric())
// We could sync a placeholder message here, but for now we skip it entirely.
return BuildResult{}, nil
}
b.observabilitySender.AddMetrics(obsMetrics.GenerateMessageBuiltSuccessMetric())
return res, nil
})
if err != nil {

View File

@ -24,10 +24,11 @@ import (
"github.com/ProtonMail/gluon/async"
"github.com/ProtonMail/gluon/imap"
"github.com/ProtonMail/gluon/reporter"
"github.com/ProtonMail/go-proton-api"
"github.com/ProtonMail/gopenpgp/v2/crypto"
"github.com/ProtonMail/proton-bridge/v3/internal/bridge/mocks"
"github.com/ProtonMail/proton-bridge/v3/internal/services/observability"
obsMetrics "github.com/ProtonMail/proton-bridge/v3/internal/services/syncservice/observabilitymetrics"
"github.com/bradenaw/juniper/xslices"
"github.com/golang/mock/gomock"
"github.com/stretchr/testify/require"
@ -67,7 +68,6 @@ func TestBuildStage_SuccessRemovesFailedMessage(t *testing.T) {
input := NewChannelConsumerProducer[BuildRequest]()
output := NewChannelConsumerProducer[ApplyRequest]()
reporter := mocks.NewMockReporter(mockCtrl)
labels := getTestLabels()
@ -105,7 +105,10 @@ func TestBuildStage_SuccessRemovesFailedMessage(t *testing.T) {
tj.messageBuilder.EXPECT().BuildMessage(gomock.Eq(labels), gomock.Eq(msg), gomock.Any(), gomock.Any()).Return(buildResult, nil)
tj.state.EXPECT().RemFailedMessageID(gomock.Any(), gomock.Eq("MSG"))
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, reporter)
observabilityService := mocks.NewMockObservabilitySender(mockCtrl)
observabilityService.EXPECT().AddMetrics(obsMetrics.GenerateMessageBuiltSuccessMetric())
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, observabilityService)
go func() {
stage.run(ctx)
@ -125,7 +128,7 @@ func TestBuildStage_BuildFailureIsReportedButDoesNotCancelJob(t *testing.T) {
input := NewChannelConsumerProducer[BuildRequest]()
output := NewChannelConsumerProducer[ApplyRequest]()
mockReporter := mocks.NewMockReporter(mockCtrl)
mockObservabilityService := mocks.NewMockObservabilitySender(mockCtrl)
labels := getTestLabels()
@ -156,15 +159,12 @@ func TestBuildStage_BuildFailureIsReportedButDoesNotCancelJob(t *testing.T) {
tj.messageBuilder.EXPECT().BuildMessage(gomock.Eq(labels), gomock.Eq(msg), gomock.Any(), gomock.Any()).Return(BuildResult{}, buildError)
tj.state.EXPECT().AddFailedMessageID(gomock.Any(), gomock.Eq([]string{"MSG"}))
mockReporter.EXPECT().ReportMessageWithContext(gomock.Any(), gomock.Eq(reporter.Context{
"userID": "u",
"messageID": "MSG",
"error": buildError,
})).Return(nil)
tj.syncReporter.EXPECT().OnProgress(gomock.Any(), gomock.Eq(int64(10)))
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, mockReporter)
mockObservabilityService.EXPECT().AddDistinctMetrics(observability.SyncError, obsMetrics.GenerateNoUnlockedKeyringMetric())
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, mockObservabilityService)
go func() {
stage.run(ctx)
@ -183,7 +183,6 @@ func TestBuildStage_FailedToLocateKeyRingIsReportedButDoesNotFailBuild(t *testin
input := NewChannelConsumerProducer[BuildRequest]()
output := NewChannelConsumerProducer[ApplyRequest]()
mockReporter := mocks.NewMockReporter(mockCtrl)
labels := getTestLabels()
@ -209,14 +208,13 @@ func TestBuildStage_FailedToLocateKeyRingIsReportedButDoesNotFailBuild(t *testin
tj.job.end()
tj.state.EXPECT().AddFailedMessageID(gomock.Any(), gomock.Eq([]string{"MSG"}))
mockReporter.EXPECT().ReportMessageWithContext(gomock.Any(), gomock.Eq(reporter.Context{
"userID": "u",
"messageID": "MSG",
})).Return(nil)
tj.syncReporter.EXPECT().OnProgress(gomock.Any(), gomock.Eq(int64(10)))
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, mockReporter)
observabilitySender := mocks.NewMockObservabilitySender(mockCtrl)
observabilitySender.EXPECT().AddDistinctMetrics(observability.SyncError)
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, observabilitySender)
go func() {
stage.run(ctx)
@ -235,7 +233,6 @@ func TestBuildStage_OtherErrorsFailJob(t *testing.T) {
input := NewChannelConsumerProducer[BuildRequest]()
output := NewChannelConsumerProducer[ApplyRequest]()
mockReporter := mocks.NewMockReporter(mockCtrl)
labels := getTestLabels()
@ -261,7 +258,7 @@ func TestBuildStage_OtherErrorsFailJob(t *testing.T) {
childJob := tj.job.newChildJob("f", 10)
tj.job.end()
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, mockReporter)
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, mocks.NewMockObservabilitySender(mockCtrl))
go func() {
stage.run(ctx)
@ -283,7 +280,6 @@ func TestBuildStage_CancelledJobIsDiscarded(t *testing.T) {
input := NewChannelConsumerProducer[BuildRequest]()
output := NewChannelConsumerProducer[ApplyRequest]()
mockReporter := mocks.NewMockReporter(mockCtrl)
msg := proton.FullMessage{
Message: proton.Message{
@ -294,7 +290,7 @@ func TestBuildStage_CancelledJobIsDiscarded(t *testing.T) {
},
}
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, mockReporter)
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, mocks.NewMockObservabilitySender(mockCtrl))
ctx, cancel := context.WithCancel(context.Background())
@ -327,7 +323,6 @@ func TestTask_EmptyInputDoesNotCrash(t *testing.T) {
input := NewChannelConsumerProducer[BuildRequest]()
output := NewChannelConsumerProducer[ApplyRequest]()
reporter := mocks.NewMockReporter(mockCtrl)
labels := getTestLabels()
@ -340,7 +335,7 @@ func TestTask_EmptyInputDoesNotCrash(t *testing.T) {
childJob := tj.job.newChildJob("f", 10)
tj.job.end()
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, reporter)
stage := NewBuildStage(input, output, 1024, &async.NoopPanicHandler{}, mocks.NewMockObservabilitySender(mockCtrl))
go func() {
stage.run(ctx)