feat(BRIDGE-396): Observability metrics for vault issues; Extension to observability service to support caching

This commit is contained in:
Atanas Janeshliev
2025-07-11 13:20:49 +02:00
parent de3fd34998
commit 7faf32d0ff
19 changed files with 452 additions and 79 deletions

View File

@ -19,12 +19,17 @@ package observability
import (
"context"
"encoding/json"
"os"
"path/filepath"
"sync"
"time"
"github.com/ProtonMail/gluon/async"
"github.com/ProtonMail/go-proton-api"
"github.com/ProtonMail/proton-bridge/v3/internal/locations"
"github.com/ProtonMail/proton-bridge/v3/internal/services/telemetry"
"github.com/bradenaw/juniper/xslices"
"github.com/sirupsen/logrus"
)
@ -34,6 +39,7 @@ var throttleDuration = 5 * time.Second //nolint:gochecknoglobals
const (
// NOTE(review): maxStorageSize and maxBatchSize are presumably the cap on metrics held in memory
// and the cap on metrics sent per dispatch; their usage is outside this view — confirm.
maxStorageSize = 5000
maxBatchSize = 1000
// filename is the name of the on-disk metric cache file; WithObservability joins it onto the
// cache directory to build the service's cache path.
filename = "metric_cache.json"
)
type client struct {
@ -50,15 +56,23 @@ type Sender interface {
GetEmailClient() string
}
// BasicSender is a single-method interface for submitting metrics: AddMetrics accepts
// any number of observability metrics.
type BasicSender interface {
AddMetrics(metric ...proton.ObservabilityMetric)
}
type Service struct {
ctx context.Context
cancel context.CancelFunc
panicHandler async.PanicHandler
cachePath string
lastDispatch time.Time
isDispatchScheduled bool
wg sync.WaitGroup
signalDataArrived chan struct{}
signalDispatch chan struct{}
@ -73,41 +87,70 @@ type Service struct {
distinctionUtility *distinctionUtility
}
func NewService(ctx context.Context, panicHandler async.PanicHandler) *Service {
ctx, cancel := context.WithCancel(ctx)
service := &Service{
ctx: ctx,
cancel: cancel,
panicHandler: panicHandler,
lastDispatch: time.Now().Add(-throttleDuration),
signalDataArrived: make(chan struct{}, 1),
signalDispatch: make(chan struct{}, 1),
log: logrus.WithFields(logrus.Fields{"pkg": "observability"}),
metricStore: make([]proton.ObservabilityMetric, 0),
// newService builds a Service holding only the fields that need no external input:
// a background context, the package logger, an empty metric store and an empty client store.
// Everything else is left at its zero value.
func newService() *Service {
	svc := new(Service)
	svc.ctx = context.Background()
	svc.log = logrus.WithFields(logrus.Fields{"pkg": "observability"})
	svc.metricStore = []proton.ObservabilityMetric{}
	svc.userClientStore = map[string]*client{}
	return svc
}
service.distinctionUtility = newDistinctionUtility(ctx, panicHandler, service)
// NewTestService returns a basic observability service with only the required struct fields set.
// It is intended for tests only.
func NewTestService() *Service {
	svc := newService()
	return svc
}
return service
// WithObservability creates a fresh service and passes it to fn, returning fn's error.
// When the metrics cache directory can be resolved, the cache file is loaded before fn
// runs and written back once fn has returned. When it cannot, a warning is logged and
// fn runs against a service with no cache path.
func WithObservability(locations *locations.Locations, fn func(service *Service) error) error {
	svc := newService()

	dir, err := locations.ProvideObservabilityMetricsCachePath()
	if err == nil {
		svc.cachePath = filepath.Clean(filepath.Join(dir, filename))
		svc.readCacheFile()
		// Deferred calls are function-scoped, so this still runs after fn returns.
		defer svc.writeCacheFile()
	} else {
		svc.log.WithError(err).Warn("Could not obtain cache path")
	}

	return fn(svc)
}
// Initialize wires up the runtime state of the observability Service: a cancellable child
// of ctx, the panic handler, both signalling channels, the last-dispatch timestamp and the
// distinction utility.
// If not initialized, the service will remain inactive and emit no metrics.
// Should exclusively be called during bridge set-up.
func (s *Service) Initialize(ctx context.Context, panicHandler async.PanicHandler) {
	childCtx, cancelFn := context.WithCancel(ctx)

	s.ctx, s.cancel = childCtx, cancelFn
	s.panicHandler = panicHandler

	// Back-date the last dispatch by one throttle period.
	s.lastDispatch = time.Now().Add(-throttleDuration)

	s.signalDataArrived = make(chan struct{}, 1)
	s.signalDispatch = make(chan struct{}, 1)

	s.distinctionUtility = newDistinctionUtility(childCtx, panicHandler, s)
}
// Run starts the observability service goroutine.
// The function also sets some utility functions to a helper struct aimed at differentiating
// the amount of users sending metric updates, and starts that helper's heartbeat.
// The goroutine is tracked by the service's wait group.
func (s *Service) Run(settingsGetter settingsGetter) {
	// Log only through the nil guard: an unguarded call ahead of it would make the
	// guard pointless and emit the message twice.
	if s.log != nil {
		s.log.Info("Starting service")
	}

	s.distinctionUtility.setSettingsGetter(settingsGetter)
	s.distinctionUtility.runHeartbeat()

	s.wg.Add(1)
	go func() {
		defer s.wg.Done()
		s.start()
	}()
}
@ -142,6 +185,62 @@ func (s *Service) start() {
}
}
// readCacheFile loads the metrics stored in the cache file into the metric store.
// It does nothing when no cache path is configured, and logs and returns when the file
// cannot be opened. Decode failures are logged but do not abort the function.
func (s *Service) readCacheFile() {
	if len(s.cachePath) == 0 {
		return
	}

	cacheFile, openErr := os.Open(s.cachePath)
	if openErr != nil {
		s.log.WithError(openErr).Info("Unable to open cache file")
		return
	}
	defer func() {
		if closeErr := cacheFile.Close(); closeErr != nil {
			s.log.WithError(closeErr).Error("Unable to close cache file after read")
		}
	}()

	s.withMetricStoreLock(func() {
		decoder := json.NewDecoder(cacheFile)
		if decodeErr := decoder.Decode(&s.metricStore); decodeErr != nil {
			s.log.WithError(decodeErr).Error("Unable to decode cache file")
		}

		// ShouldCache is not marshalled, so it has to be restored by hand on
		// everything that came from the cache.
		for idx := range s.metricStore {
			s.metricStore[idx].ShouldCache = true
		}
	})
}
// writeCacheFile persists the cacheable metrics (those with ShouldCache set) from the
// metric store to the cache file as JSON, creating or truncating the file.
// It does nothing when no cache path is configured. Failures are logged, never returned.
func (s *Service) writeCacheFile() {
	if s.cachePath == "" {
		return
	}

	file, err := os.Create(s.cachePath)
	if err != nil {
		s.log.WithError(err).Warn("Unable to create cache file")
		// Stop here: file is nil, so encoding into it and closing it would only
		// produce further, misleading errors.
		return
	}

	defer func(file *os.File) {
		if err := file.Close(); err != nil {
			s.log.WithError(err).Error("Unable to close cache file after write")
		}
	}(file)

	s.withMetricStoreLock(func() {
		metricsToCache := xslices.Filter(s.metricStore, func(m proton.ObservabilityMetric) bool {
			return m.ShouldCache
		})

		if err := json.NewEncoder(file).Encode(metricsToCache); err != nil {
			s.log.WithError(err).Error("Unable to encode data to cache file")
		}
	})
}
func (s *Service) dispatchData() {
s.isDispatchScheduled = false // Only accessed via a single goroutine, so no mutexes.
if !s.haveMetricsAndClients() {
@ -237,6 +336,12 @@ func (s *Service) addMetrics(metric ...proton.ObservabilityMetric) {
s.sendSignal(s.signalDataArrived)
}
// flushMetricsTest replaces the metric store with a new empty slice while holding the
// metric store lock.
func (s *Service) flushMetricsTest() {
	resetStore := func() {
		s.metricStore = []proton.ObservabilityMetric{}
	}
	s.withMetricStoreLock(resetStore)
}
// addMetricsIfClients - will append a metric only if there are authenticated clients
// via which we can reach the endpoint.
func (s *Service) addMetricsIfClients(metric ...proton.ObservabilityMetric) {
@ -280,6 +385,7 @@ func (s *Service) Stop() {
s.log.Info("Stopping service")
s.cancel()
s.wg.Wait()
close(s.signalDataArrived)
close(s.signalDispatch)
}

View File

@ -0,0 +1,151 @@
// Copyright (c) 2025 Proton AG
//
// This file is part of Proton Mail Bridge.
//
// Proton Mail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// Proton Mail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with Proton Mail Bridge. If not, see <https://www.gnu.org/licenses/>.
package observability
import (
"context"
"path/filepath"
"testing"
"time"
"github.com/ProtonMail/go-proton-api"
"github.com/stretchr/testify/require"
)
// TestService_cacheFile_NoCachePath checks that reading and writing the cache with no
// cache path set leaves the metric store empty.
func TestService_cacheFile_NoCachePath(t *testing.T) {
	svc := NewTestService()

	svc.readCacheFile()
	svc.writeCacheFile()

	require.Empty(t, svc.metricStore)
}
// TestService_cacheFile_ValidCachePath checks that a read followed by a write against a
// cache path inside a fresh temporary directory leaves the metric store empty.
func TestService_cacheFile_ValidCachePath(t *testing.T) {
	cacheFile := filepath.Join(t.TempDir(), "test_cache.json")

	svc := NewTestService()
	svc.cachePath = cacheFile

	svc.readCacheFile()
	svc.writeCacheFile()

	require.Empty(t, svc.metricStore)
}
// TestService_cacheFile_AllMetricsCacheable checks the cache round trip when every metric
// is flagged ShouldCache: after write, flush and read, the store holds the original metrics.
func TestService_cacheFile_AllMetricsCacheable(t *testing.T) {
	tempDir := t.TempDir()
	cachePath := filepath.Clean(filepath.Join(tempDir, "test_cache.json"))

	s := NewTestService()
	s.cachePath = cachePath
	s.ctx = context.Background()

	testMetrics := []proton.ObservabilityMetric{
		{
			Name:        "test1",
			Version:     1,
			Timestamp:   time.Now().Unix(),
			Data:        nil,
			ShouldCache: true,
		},
		{
			Name:        "test2",
			Version:     2,
			Timestamp:   time.Now().Unix(),
			Data:        nil,
			ShouldCache: true,
		},
		{
			Name:        "test3",
			Version:     3,
			Timestamp:   time.Now().Unix(),
			Data:        nil,
			ShouldCache: true,
		},
	}

	// No cache file exists yet, so the first read must leave the store empty.
	s.readCacheFile()
	require.Empty(t, s.metricStore)

	// testify expects (expected, actual); keeping that order makes failure output accurate.
	s.addMetrics(testMetrics...)
	require.Equal(t, testMetrics, s.metricStore)

	s.writeCacheFile()
	s.flushMetricsTest()
	require.Empty(t, s.metricStore)

	s.readCacheFile()
	require.Equal(t, testMetrics, s.metricStore)
}
// TestService_cacheFile_SomeMetricsCacheable checks the cache round trip when only some
// metrics are flagged ShouldCache: after write, flush and read, the store holds only the
// flagged metrics.
func TestService_cacheFile_SomeMetricsCacheable(t *testing.T) {
	tempDir := t.TempDir()
	cachePath := filepath.Clean(filepath.Join(tempDir, "test_cache.json"))

	s := NewTestService()
	s.cachePath = cachePath
	s.ctx = context.Background()

	testMetricsCacheable := []proton.ObservabilityMetric{
		{
			Name:        "test1",
			Version:     1,
			Timestamp:   time.Now().Unix(),
			Data:        nil,
			ShouldCache: true,
		},
		{
			Name:        "test2",
			Version:     2,
			Timestamp:   time.Now().Unix(),
			Data:        nil,
			ShouldCache: true,
		},
	}

	testMetricsNonCacheable := []proton.ObservabilityMetric{
		{
			Name:      "test3",
			Version:   3,
			Timestamp: time.Now().Unix(),
		},
		{
			Name:      "test2",
			Version:   2,
			Timestamp: time.Now().Unix(),
		},
	}

	// Build the expected slice in its own backing array so the fixtures are never aliased.
	allMetrics := make([]proton.ObservabilityMetric, 0, len(testMetricsCacheable)+len(testMetricsNonCacheable))
	allMetrics = append(allMetrics, testMetricsCacheable...)
	allMetrics = append(allMetrics, testMetricsNonCacheable...)

	// No cache file exists yet, so the first read must leave the store empty.
	s.readCacheFile()
	require.Empty(t, s.metricStore)

	// testify expects (expected, actual); keeping that order makes failure output accurate.
	s.addMetrics(testMetricsCacheable...)
	s.addMetrics(testMetricsNonCacheable...)
	require.Equal(t, allMetrics, s.metricStore)

	s.writeCacheFile()
	s.flushMetricsTest()
	require.Empty(t, s.metricStore)

	s.readCacheFile()
	require.Equal(t, testMetricsCacheable, s.metricStore)
}