GODT-1777: Message de-duplication in SMTP

This commit is contained in:
James Houlahan
2022-10-20 12:10:59 +02:00
parent 8e34b51c77
commit c9808d07df
7 changed files with 732 additions and 2 deletions

274
internal/user/send.go Normal file
View File

@ -0,0 +1,274 @@
package user
import (
"bytes"
"context"
"crypto/sha256"
"encoding/base64"
"errors"
"fmt"
"sync"
"time"
"github.com/ProtonMail/gluon/rfc822"
"github.com/sirupsen/logrus"
)
const sendHashExpiry = 5 * time.Minute
type sendRecorder struct {
hasher func([]byte) (string, error)
expiry time.Duration
entries map[string]*sendEntry
entriesLock sync.Mutex
}
func newSendRecorder(expiry time.Duration) *sendRecorder {
return &sendRecorder{
hasher: getMessageHash,
expiry: expiry,
entries: make(map[string]*sendEntry),
}
}
type sendEntry struct {
msgID string
exp time.Time
waitCh chan struct{}
}
// tryInsertWait tries to insert the given message into the send recorder.
// If an entry already exists but it was not sent yet, it waits.
// It returns whether an entry could be inserted and an error if it times out while waiting.
func (h *sendRecorder) tryInsertWait(ctx context.Context, b []byte, deadline time.Time) (string, bool, error) {
hash, err := h.hasher(b)
if err != nil {
return "", false, fmt.Errorf("failed to hash message: %w", err)
}
// If we successfully inserted the hash, we can return true.
if h.tryInsert(hash) {
return hash, true, nil
}
// A message with this hash is already being sent; wait for it.
_, wasSent, err := h.wait(ctx, hash, deadline)
if err != nil {
return "", false, fmt.Errorf("failed to wait for message to be sent: %w", err)
}
// If the message failed to send, try to insert it again.
if !wasSent {
return h.tryInsertWait(ctx, b, deadline)
}
return hash, false, nil
}
// hasEntryWait returns whether the given message already exists in the send recorder.
// If it does, it waits for its ID to be known, then returns it and true.
// If no entry exists, or it times out while waiting for its ID to be known, it returns false.
func (h *sendRecorder) hasEntryWait(ctx context.Context, b []byte, deadline time.Time) (string, bool, error) {
hash, err := h.hasher(b)
if err != nil {
return "", false, fmt.Errorf("failed to hash message: %w", err)
}
if !h.hasEntry(hash) {
return "", false, nil
}
messageID, wasSent, err := h.wait(ctx, hash, deadline)
if errors.Is(err, context.DeadlineExceeded) {
return "", false, nil
} else if err != nil {
return "", false, fmt.Errorf("failed to wait for message to be sent: %w", err)
}
if wasSent {
return messageID, true, nil
}
return h.hasEntryWait(ctx, b, deadline)
}
func (h *sendRecorder) tryInsert(hash string) bool {
h.entriesLock.Lock()
defer h.entriesLock.Unlock()
for hash, entry := range h.entries {
if entry.exp.Before(time.Now()) {
delete(h.entries, hash)
}
}
if _, ok := h.entries[hash]; ok {
return false
}
h.entries[hash] = &sendEntry{
exp: time.Now().Add(h.expiry),
waitCh: make(chan struct{}),
}
return true
}
func (h *sendRecorder) hasEntry(hash string) bool {
h.entriesLock.Lock()
defer h.entriesLock.Unlock()
for hash, entry := range h.entries {
if entry.exp.Before(time.Now()) {
delete(h.entries, hash)
}
}
if _, ok := h.entries[hash]; ok {
return true
}
return false
}
func (h *sendRecorder) addMessageID(hash, msgID string) {
h.entriesLock.Lock()
defer h.entriesLock.Unlock()
entry, ok := h.entries[hash]
if ok {
entry.msgID = msgID
} else {
logrus.Warn("Cannot add message ID to send hash entry, it may have expired")
}
close(entry.waitCh)
}
func (h *sendRecorder) removeOnFail(hash string) {
h.entriesLock.Lock()
defer h.entriesLock.Unlock()
entry, ok := h.entries[hash]
if !ok || entry.msgID != "" {
return
}
close(entry.waitCh)
delete(h.entries, hash)
}
func (h *sendRecorder) wait(ctx context.Context, hash string, deadline time.Time) (string, bool, error) {
ctx, cancel := context.WithDeadline(ctx, deadline)
defer cancel()
waitCh, err := h.getWaitCh(hash)
if err != nil {
return "", false, nil
}
select {
case <-ctx.Done():
return "", false, ctx.Err()
case <-waitCh:
// ...
}
h.entriesLock.Lock()
defer h.entriesLock.Unlock()
if entry, ok := h.entries[hash]; ok {
return entry.msgID, true, nil
}
return "", false, nil
}
func (h *sendRecorder) getWaitCh(hash string) (<-chan struct{}, error) {
h.entriesLock.Lock()
defer h.entriesLock.Unlock()
if entry, ok := h.entries[hash]; ok {
return entry.waitCh, nil
}
return nil, fmt.Errorf("no entry with hash %s", hash)
}
// getMessageHash returns the hash of the given message.
// This takes into account:
// - the Subject header
// - the From/To/Cc/Bcc headers
// - the Content-Type header of each (leaf) part
// - the Content-Disposition header of each (leaf) part
// - the (decoded) body of each part
func getMessageHash(b []byte) (string, error) {
section := rfc822.Parse(b)
header, err := section.ParseHeader()
if err != nil {
return "", err
}
h := sha256.New()
if _, err := h.Write([]byte(header.Get("Subject"))); err != nil {
return "", err
}
if _, err := h.Write([]byte(header.Get("From"))); err != nil {
return "", err
}
if _, err := h.Write([]byte(header.Get("To"))); err != nil {
return "", err
}
if _, err := h.Write([]byte(header.Get("Cc"))); err != nil {
return "", err
}
if _, err := h.Write([]byte(header.Get("Bcc"))); err != nil {
return "", err
}
if err := section.Walk(func(section *rfc822.Section) error {
children, err := section.Children()
if err != nil {
return err
} else if len(children) > 0 {
return nil
}
header, err := section.ParseHeader()
if err != nil {
return err
}
if _, err := h.Write([]byte(header.Get("Content-Type"))); err != nil {
return err
}
if _, err := h.Write([]byte(header.Get("Content-Disposition"))); err != nil {
return err
}
body, err := section.DecodedBody()
if err != nil {
return err
}
if _, err := h.Write(bytes.TrimSpace(body)); err != nil {
return err
}
return nil
}); err != nil {
return "", err
}
return base64.StdEncoding.EncodeToString(h.Sum(nil)), nil
}

327
internal/user/send_test.go Normal file
View File

@ -0,0 +1,327 @@
package user
import (
"context"
"testing"
"time"
"github.com/stretchr/testify/require"
)
func TestSendHasher_Insert(t *testing.T) {
h := newSendRecorder(sendHashExpiry)
// Insert a message into the hasher.
hash1, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash1)
// Simulate successfully sending the message.
h.addMessageID(hash1, "abc")
// Inserting a message with the same hash should return false.
_, ok, err = h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.False(t, ok)
// Inserting a message with a different hash should return true.
hash2, ok, err := h.tryInsertWait(context.Background(), []byte(literal2), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash2)
}
func TestSendHasher_Insert_Expired(t *testing.T) {
h := newSendRecorder(time.Second)
// Insert a message into the hasher.
hash1, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash1)
// Simulate successfully sending the message.
h.addMessageID(hash1, "abc")
// Wait for the entry to expire.
time.Sleep(time.Second)
// Inserting a message with the same hash should return true because the previous entry has since expired.
hash2, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
// The hashes should be the same.
require.Equal(t, hash1, hash2)
}
func TestSendHasher_Wait_SendSuccess(t *testing.T) {
h := newSendRecorder(sendHashExpiry)
// Insert a message into the hasher.
hash, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash)
// Simulate successfully sending the message after half a second.
go func() {
time.Sleep(time.Millisecond * 500)
h.addMessageID(hash, "abc")
}()
// Inserting a message with the same hash should fail.
_, ok, err = h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.False(t, ok)
}
func TestSendHasher_Wait_SendFail(t *testing.T) {
h := newSendRecorder(sendHashExpiry)
// Insert a message into the hasher.
hash, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash)
// Simulate failing to send the message after half a second.
go func() {
time.Sleep(time.Millisecond * 500)
h.removeOnFail(hash)
}()
// Inserting a message with the same hash should succeed because the first message failed to send.
hash2, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
// The hashes should be the same.
require.Equal(t, hash, hash2)
}
func TestSendHasher_Wait_Timeout(t *testing.T) {
h := newSendRecorder(sendHashExpiry)
// Insert a message into the hasher.
hash, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash)
// We should fail to insert because the message is not sent within the timeout period.
_, _, err = h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.Error(t, err)
}
func TestSendHasher_HasEntry(t *testing.T) {
h := newSendRecorder(sendHashExpiry)
// Insert a message into the hasher.
hash, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash)
// Simulate successfully sending the message.
h.addMessageID(hash, "abc")
// The message was already sent; we should find it in the hasher.
messageID, ok, err := h.hasEntryWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.Equal(t, "abc", messageID)
}
func TestSendHasher_HasEntry_SendSuccess(t *testing.T) {
h := newSendRecorder(sendHashExpiry)
// Insert a message into the hasher.
hash, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash)
// Simulate successfully sending the message after half a second.
go func() {
time.Sleep(time.Millisecond * 500)
h.addMessageID(hash, "abc")
}()
// The message was already sent; we should find it in the hasher.
messageID, ok, err := h.hasEntryWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.Equal(t, "abc", messageID)
}
func TestSendHasher_HasEntry_SendFail(t *testing.T) {
h := newSendRecorder(sendHashExpiry)
// Insert a message into the hasher.
hash, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash)
// Simulate failing to send the message after half a second.
go func() {
time.Sleep(time.Millisecond * 500)
h.removeOnFail(hash)
}()
// The message failed to send; we should not find it in the hasher.
_, ok, err = h.hasEntryWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.False(t, ok)
}
func TestSendHasher_HasEntry_Timeout(t *testing.T) {
h := newSendRecorder(sendHashExpiry)
// Insert a message into the hasher.
hash, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash)
// The message is never sent; we should not find it in the hasher.
_, ok, err = h.hasEntryWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.False(t, ok)
}
func TestSendHasher_HasEntry_Expired(t *testing.T) {
h := newSendRecorder(time.Second)
// Insert a message into the hasher.
hash, ok, err := h.tryInsertWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.True(t, ok)
require.NotEmpty(t, hash)
// Simulate successfully sending the message.
h.addMessageID(hash, "abc")
// Wait for the entry to expire.
time.Sleep(time.Second)
// The entry has expired; we should not find it in the hasher.
_, ok, err = h.hasEntryWait(context.Background(), []byte(literal1), time.Now().Add(time.Second))
require.NoError(t, err)
require.False(t, ok)
}
const literal1 = `From: Sender <sender@pm.me>
To: Receiver <receiver@pm.me>
Content-Type: multipart/mixed; boundary=longrandomstring
--longrandomstring
body
--longrandomstring
Content-Disposition: attachment; filename="attname.txt"
attachment
--longrandomstring--
`
const literal2 = `From: Sender <sender@pm.me>
To: Receiver <receiver@pm.me>
Content-Type: multipart/mixed; boundary=longrandomstring
--longrandomstring
body
--longrandomstring
Content-Disposition: attachment; filename="attname2.txt"
attachment
--longrandomstring--
`
func TestGetMessageHash(t *testing.T) {
tests := []struct {
name string
lit1, lit2 []byte
wantEqual bool
}{
{
name: "empty",
lit1: []byte{},
lit2: []byte{},
wantEqual: true,
},
{
name: "same to",
lit1: []byte("To: someone@pm.me\r\n\r\nHello world!"),
lit2: []byte("To: someone@pm.me\r\n\r\nHello world!"),
wantEqual: true,
},
{
name: "different to",
lit1: []byte("To: someone@pm.me\r\n\r\nHello world!"),
lit2: []byte("To: another@pm.me\r\n\r\nHello world!"),
wantEqual: false,
},
{
name: "same from",
lit1: []byte("From: someone@pm.me\r\n\r\nHello world!"),
lit2: []byte("From: someone@pm.me\r\n\r\nHello world!"),
wantEqual: true,
},
{
name: "different from",
lit1: []byte("From: someone@pm.me\r\n\r\nHello world!"),
lit2: []byte("From: another@pm.me\r\n\r\nHello world!"),
wantEqual: false,
},
{
name: "same subject",
lit1: []byte("Subject: Hello world!\r\n\r\nHello world!"),
lit2: []byte("Subject: Hello world!\r\n\r\nHello world!"),
wantEqual: true,
},
{
name: "different subject",
lit1: []byte("Subject: Hello world!\r\n\r\nHello world!"),
lit2: []byte("Subject: Goodbye world!\r\n\r\nHello world!"),
wantEqual: false,
},
{
name: "same plaintext body",
lit1: []byte("To: someone@pm.me\r\nContent-Type: text/plain\r\n\r\nHello world!"),
lit2: []byte("To: someone@pm.me\r\nContent-Type: text/plain\r\n\r\nHello world!"),
wantEqual: true,
},
{
name: "different plaintext body",
lit1: []byte("To: someone@pm.me\r\nContent-Type: text/plain\r\n\r\nHello world!"),
lit2: []byte("To: someone@pm.me\r\nContent-Type: text/plain\r\n\r\nGoodbye world!"),
wantEqual: false,
},
{
name: "different attachment filenames",
lit1: []byte(literal1),
lit2: []byte(literal2),
wantEqual: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
hash1, err := getMessageHash(tt.lit1)
require.NoError(t, err)
hash2, err := getMessageHash(tt.lit2)
require.NoError(t, err)
if tt.wantEqual {
require.Equal(t, hash1, hash2)
} else {
require.NotEqual(t, hash1, hash2)
}
})
}
}

View File

@ -18,6 +18,7 @@
package user
import (
"bytes"
"context"
"encoding/base64"
"fmt"
@ -26,6 +27,7 @@ import (
"net/url"
"runtime"
"strings"
"time"
"github.com/ProtonMail/gluon/rfc822"
"github.com/ProtonMail/go-rfc5322"
@ -43,8 +45,24 @@ func (user *User) sendMail(authID string, emails []string, from string, to []str
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
// Read the message to send.
b, err := io.ReadAll(r)
if err != nil {
return fmt.Errorf("failed to read message: %w", err)
}
// Check if we already tried to send this message recently.
hash, ok, err := user.sendHash.tryInsertWait(ctx, b, time.Now().Add(90*time.Second))
if err != nil {
return fmt.Errorf("failed to check send hash: %w", err)
} else if !ok {
user.log.Warn("A duplicate message was already sent recently, skipping")
return nil
}
defer user.sendHash.removeOnFail(hash)
// Create a new message parser from the reader.
parser, err := parser.New(r)
parser, err := parser.New(bytes.NewReader(b))
if err != nil {
return fmt.Errorf("failed to create parser: %w", err)
}
@ -103,7 +121,8 @@ func (user *User) sendMail(authID string, emails []string, from string, to []str
return fmt.Errorf("failed to send message: %w", err)
}
user.log.WithField("messageID", sent.ID).Info("Message sent")
// If the message was successfully sent, we can update the message ID in the record.
user.sendHash.addMessageID(hash, sent.ID)
return nil
})

View File

@ -55,6 +55,7 @@ type User struct {
apiUser *safe.Value[liteapi.User]
apiAddrs *safe.Map[string, liteapi.Address]
updateCh *safe.Map[string, *queue.QueuedChannel[imap.Update]]
sendHash *sendRecorder
tasks *xsync.Group
abortable async.Abortable
@ -126,6 +127,7 @@ func New(
apiUser: safe.NewValue(apiUser),
apiAddrs: safe.NewMapFrom(groupBy(apiAddrs, func(addr liteapi.Address) string { return addr.ID }), sortAddr),
updateCh: safe.NewMapFrom(updateCh, nil),
sendHash: newSendRecorder(sendHashExpiry),
tasks: xsync.NewGroup(context.Background()),

View File

@ -0,0 +1,58 @@
Feature: SMTP sending the same message twice
Background:
Given there exists an account with username "user@pm.me" and password "password"
And there exists an account with username "bridgetest@protonmail.com" and password "password"
And bridge starts
And the user logs in with username "user@pm.me" and password "password"
And the user logs in with username "bridgetest@protonmail.com" and password "password"
And user "user@pm.me" connects and authenticates SMTP client "1"
And SMTP client "1" sends the following message from "user@pm.me" to "bridgetest@protonmail.com":
"""
From: Bridge Test <user@pm.me>
To: Internal Bridge <bridgetest@protonmail.com>
Subject: Hello
World
"""
And it succeeds
Scenario: The exact same message is not sent twice
When SMTP client "1" sends the following message from "user@pm.me" to "bridgetest@protonmail.com":
"""
From: Bridge Test <user@pm.me>
To: Internal Bridge <bridgetest@protonmail.com>
Subject: Hello
World
"""
Then it succeeds
When user "user@pm.me" connects and authenticates IMAP client "1"
Then IMAP client "1" eventually sees the following messages in "Sent":
| from | to | subject | body |
| user@pm.me | bridgetest@protonmail.com | Hello | World |
When user "bridgetest@protonmail.com" connects and authenticates IMAP client "2"
Then IMAP client "2" eventually sees the following messages in "Inbox":
| from | to | subject | body |
| user@pm.me | bridgetest@protonmail.com | Hello | World |
Scenario: Slight change means different message and is sent twice
When SMTP client "1" sends the following message from "user@pm.me" to "bridgetest@protonmail.com":
"""
From: Bridge Test <user@pm.me>
To: Internal Bridge <bridgetest@protonmail.com>
Subject: Hello.
World
"""
Then it succeeds
When user "user@pm.me" connects and authenticates IMAP client "1"
Then IMAP client "1" eventually sees the following messages in "Sent":
| from | to | subject | body |
| user@pm.me | bridgetest@protonmail.com | Hello | World |
| user@pm.me | bridgetest@protonmail.com | Hello. | World |
When user "bridgetest@protonmail.com" connects and authenticates IMAP client "2"
Then IMAP client "2" eventually sees the following messages in "Inbox":
| from | to | subject | body |
| user@pm.me | bridgetest@protonmail.com | Hello | World |
| user@pm.me | bridgetest@protonmail.com | Hello. | World |

View File

@ -0,0 +1,48 @@
Feature: SMTP sending with APPENDing to Sent
Background:
Given there exists an account with username "user@pm.me" and password "password"
And there exists an account with username "bridgetest@protonmail.com" and password "password"
And bridge starts
And the user logs in with username "user@pm.me" and password "password"
And user "user@pm.me" connects and authenticates SMTP client "1"
And user "user@pm.me" connects and authenticates IMAP client "1"
Scenario: Send message and append to Sent
# First do sending.
When SMTP client "1" sends the following message from "user@pm.me" to "bridgetest@protonmail.com":
"""
To: Internal Bridge <bridgetest@protonmail.com>
Subject: Manual send and append
Message-ID: bridgemessage42
hello
"""
Then it succeeds
And the body in the "POST" request to "/mail/v4/messages" is:
"""
{
"Message": {
"Subject": "Manual send and append",
"ExternalID": "bridgemessage42"
}
}
"""
And IMAP client "1" eventually sees the following messages in "Sent":
| to | subject | body | message-id |
| bridgetest@protonmail.com | Manual send and append | hello | <bridgemessage42> |
# Then simulate manual append to Sent mailbox - message should be detected as a duplicate.
When IMAP client "1" appends the following message to "Sent":
"""
To: Internal Bridge <bridgetest@protonmail.com>
Subject: Manual send and append
Message-ID: bridgemessage42
hello
"""
Then it succeeds
And IMAP client "1" eventually sees the following messages in "Sent":
| to | subject | body | message-id |
| bridgetest@protonmail.com | Manual send and append | hello | <bridgemessage42> |

View File

@ -36,6 +36,7 @@ type Message struct {
Subject string `bdd:"subject"`
Body string `bdd:"body"`
Attachments string `bdd:"attachments"`
MessageID string `bdd:"message-id"`
From string `bdd:"from"`
To string `bdd:"to"`
@ -96,6 +97,7 @@ func newMessageFromIMAP(msg *imap.Message) Message {
Subject: msg.Envelope.Subject,
Body: body,
Attachments: strings.Join(xslices.Map(m.Attachments, func(att message.Attachment) string { return att.Name }), ", "),
MessageID: msg.Envelope.MessageId,
Unread: !slices.Contains(msg.Flags, imap.SeenFlag),
}