From f87ca36ffd291fb686c6b1c542023aaaab0ba430 Mon Sep 17 00:00:00 2001 From: James Houlahan Date: Fri, 8 May 2020 09:54:11 +0200 Subject: [PATCH] refactor: tidy up DecodeCharset --- Changelog.md | 2 +- pkg/mime/encoding.go | 50 ++++++++++-------------- test/features/smtp/send/failures.feature | 17 +------- test/features/smtp/send/plain.feature | 42 +++++++++++++++++++- 4 files changed, 64 insertions(+), 47 deletions(-) diff --git a/Changelog.md b/Changelog.md index fcd42b12..d341f4bd 100644 --- a/Changelog.md +++ b/Changelog.md @@ -9,7 +9,7 @@ Changelog [format](http://keepachangelog.com/en/1.0.0/) * IMAP mailbox info update when new mailbox is created * IMAP extension Unselect * More logs about event loop activity -* GODT-72 Try ISO-8859-1 encoding if charset is not specified and it isn't UTF-8 +* GODT-72 Use ISO-8859-1 encoding if charset is not specified and it isn't UTF-8 ### Changed * GODT-162 User Agent does not contain bridge version, only client in format `client name/client version (os)` diff --git a/pkg/mime/encoding.go b/pkg/mime/encoding.go index 9a1d94bd..5ebda5ca 100644 --- a/pkg/mime/encoding.go +++ b/pkg/mime/encoding.go @@ -18,7 +18,6 @@ package pmmime import ( - "bytes" "fmt" "io" "mime" @@ -29,10 +28,10 @@ import ( "encoding/base64" + "github.com/pkg/errors" "golang.org/x/text/encoding" "golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/htmlindex" - "golang.org/x/text/transform" ) var wordDec = &mime.WordDecoder{ @@ -161,7 +160,7 @@ func getEncoding(charset string) (enc encoding.Encoding, err error) { enc, _ = htmlindex.Get(preparsed) if enc == nil { - err = fmt.Errorf("can not get encodig for '%s' (or '%s')", charset, preparsed) + err = fmt.Errorf("can not get encoding for '%s' (or '%s')", charset, preparsed) } return } @@ -202,41 +201,34 @@ func EncodeHeader(s string) string { // If it isn't, it checks whether the content is valid latin1 (iso-8859-1), and if so, // reencodes it as utf-8. func DecodeCharset(original []byte, contentTypeParams map[string]string) ([]byte, error) { - var decoder *encoding.Decoder - var err error - + // If the charset is specified, use that. if charset, ok := contentTypeParams["charset"]; ok { - decoder, err = selectDecoder(charset) - } else if utf8.Valid(original) { + decoder, err := selectDecoder(charset) + if err != nil { + return original, errors.Wrap(err, "unknown charset was specified") + } + + return decoder.Bytes(original) + } + + // The charset was not specified. First try utf8. + if utf8.Valid(original) { return original, nil - } else if decoded, err = charmap.ISO8859_1.NewDecoder().Bytes(original); err == nil { - return decoded, nil - } else { - err = fmt.Errorf("non-utf8 content without charset specification") } + // Fallback to latin1. + // In future this should fallback to whatever default encoding user specified. + decoded, err := charmap.ISO8859_1.NewDecoder().Bytes(original) if err != nil { - return original, err + return original, errors.Wrap(err, "failed to decode as latin1") } - utf8 := make([]byte, len(original)) - nDst, nSrc, err := decoder.Transform(utf8, original, false) - for err == transform.ErrShortDst { - if nDst < 1 { - nDst = 1 - } - if nSrc < 1 { - nSrc = 1 - } - utf8 = make([]byte, (nDst/nSrc+1)*len(original)) - nDst, nSrc, err = decoder.Transform(utf8, original, false) + // If the decoded string is not valid utf8, it wasn't latin1, so give up. + if !utf8.Valid(decoded) { + return original, errors.Wrap(err, "failed to decode as latin1") } - if err != nil { - return original, err - } - utf8 = bytes.Trim(utf8, "\x00") - return utf8, nil + return decoded, nil } // DecodeContentEncoding wraps the reader with decoder based on content encoding. diff --git a/test/features/smtp/send/failures.feature b/test/features/smtp/send/failures.feature index db44ce7c..099b2026 100644 --- a/test/features/smtp/send/failures.feature +++ b/test/features/smtp/send/failures.feature @@ -3,21 +3,6 @@ Feature: SMTP wrong messages Given there is connected user "user" And there is SMTP client logged in as "user" - Scenario: Message with no charset and bad character - When SMTP client sends message - """ - From: Bridge Test - To: External Bridge - Subject: Plain text, no charset, wrong base64 external - Content-Disposition: inline - Content-Type: text/plain; - Content-Transfer-Encoding: base64 - - sdfsdfsd - - """ - Then SMTP response is "SMTP error: 554 Error: transaction failed, blame it on the weather: non-utf8 content without charset specification" - Scenario: Message with attachment and wrong boundaries When SMTP client sends message """ @@ -53,4 +38,4 @@ Feature: SMTP wrong messages """ - Then SMTP response is "SMTP error: 554 Error: transaction failed, blame it on the weather: multipart: NextPart: EOF" \ No newline at end of file + Then SMTP response is "SMTP error: 554 Error: transaction failed, blame it on the weather: multipart: NextPart: EOF" diff --git a/test/features/smtp/send/plain.feature b/test/features/smtp/send/plain.feature index e19c0dd4..259a3bf3 100644 --- a/test/features/smtp/send/plain.feature +++ b/test/features/smtp/send/plain.feature @@ -145,7 +145,7 @@ Feature: SMTP sending of plain messages } """ - Scenario: Message without charset + Scenario: Message without charset is utf8 When SMTP client sends message """ From: Bridge Test @@ -156,6 +156,46 @@ Feature: SMTP sending of plain messages This is body of mail without charset. Please assume utf8 + """ + Then SMTP response is "OK" + And mailbox "Sent" for "user" has messages + | time | from | to | subject | + | now | [userAddress] | pm.bridge.qa@gmail.com | Plain text no charset external | + And message is sent with API call: + """ + { + "Message": { + "Subject": "Plain text no charset external", + "Sender": { + "Name": "Bridge Test" + }, + "ToList": [ + { + "Address": "pm.bridge.qa@gmail.com", + "Name": "External Bridge" + } + ], + "CCList": [], + "BCCList": [], + "MIMEType": "text/plain" + } + } + """ + + Scenario: Message without charset is base64-encoded latin1 + When SMTP client sends message + """ + From: Bridge Test + To: External Bridge + Subject: Plain text no charset external + Content-Disposition: inline + Content-Type: text/plain; + Content-Transfer-Encoding: base64 + + dGhpcyBpcyBpbiBsYXRpbjEgYW5kIHRoZXJlIGFyZSBsb3RzIG9mIGVzIHdpdGggYWNjZW50czog + 6enp6enp6enp6enp6enp + + """ Then SMTP response is "OK" And mailbox "Sent" for "user" has messages