refactor: tidy up DecodeCharset

This commit is contained in:
James Houlahan
2020-05-08 09:54:11 +02:00
parent 37f4e46bdc
commit f87ca36ffd
4 changed files with 64 additions and 47 deletions

View File

@ -9,7 +9,7 @@ Changelog [format](http://keepachangelog.com/en/1.0.0/)
* IMAP mailbox info update when new mailbox is created * IMAP mailbox info update when new mailbox is created
* IMAP extension Unselect * IMAP extension Unselect
* More logs about event loop activity * More logs about event loop activity
* GODT-72 Try ISO-8859-1 encoding if charset is not specified and it isn't UTF-8 * GODT-72 Use ISO-8859-1 encoding if charset is not specified and it isn't UTF-8
### Changed ### Changed
* GODT-162 User Agent does not contain bridge version, only client in format `client name/client version (os)` * GODT-162 User Agent does not contain bridge version, only client in format `client name/client version (os)`

View File

@ -18,7 +18,6 @@
package pmmime package pmmime
import ( import (
"bytes"
"fmt" "fmt"
"io" "io"
"mime" "mime"
@ -29,10 +28,10 @@ import (
"encoding/base64" "encoding/base64"
"github.com/pkg/errors"
"golang.org/x/text/encoding" "golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap" "golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/htmlindex" "golang.org/x/text/encoding/htmlindex"
"golang.org/x/text/transform"
) )
var wordDec = &mime.WordDecoder{ var wordDec = &mime.WordDecoder{
@ -161,7 +160,7 @@ func getEncoding(charset string) (enc encoding.Encoding, err error) {
enc, _ = htmlindex.Get(preparsed) enc, _ = htmlindex.Get(preparsed)
if enc == nil { if enc == nil {
err = fmt.Errorf("can not get encodig for '%s' (or '%s')", charset, preparsed) err = fmt.Errorf("can not get encoding for '%s' (or '%s')", charset, preparsed)
} }
return return
} }
@ -202,41 +201,34 @@ func EncodeHeader(s string) string {
// If it isn't, it checks whether the content is valid latin1 (iso-8859-1), and if so, // If it isn't, it checks whether the content is valid latin1 (iso-8859-1), and if so,
// reencodes it as utf-8. // reencodes it as utf-8.
func DecodeCharset(original []byte, contentTypeParams map[string]string) ([]byte, error) { func DecodeCharset(original []byte, contentTypeParams map[string]string) ([]byte, error) {
var decoder *encoding.Decoder // If the charset is specified, use that.
var err error
if charset, ok := contentTypeParams["charset"]; ok { if charset, ok := contentTypeParams["charset"]; ok {
decoder, err = selectDecoder(charset) decoder, err := selectDecoder(charset)
} else if utf8.Valid(original) { if err != nil {
return original, errors.Wrap(err, "unknown charset was specified")
}
return decoder.Bytes(original)
}
// The charset was not specified. First try utf8.
if utf8.Valid(original) {
return original, nil return original, nil
} else if decoded, err = charmap.ISO8859_1.NewDecoder().Bytes(original); err == nil {
return decoded, nil
} else {
err = fmt.Errorf("non-utf8 content without charset specification")
} }
// Fall back to latin1.
// In the future this should fall back to whatever default encoding the user specified.
decoded, err := charmap.ISO8859_1.NewDecoder().Bytes(original)
if err != nil { if err != nil {
return original, err return original, errors.Wrap(err, "failed to decode as latin1")
} }
utf8 := make([]byte, len(original)) // If the decoded string is not valid utf8, it wasn't latin1, so give up.
nDst, nSrc, err := decoder.Transform(utf8, original, false) if !utf8.Valid(decoded) {
for err == transform.ErrShortDst { return original, errors.Wrap(err, "failed to decode as latin1")
if nDst < 1 {
nDst = 1
}
if nSrc < 1 {
nSrc = 1
}
utf8 = make([]byte, (nDst/nSrc+1)*len(original))
nDst, nSrc, err = decoder.Transform(utf8, original, false)
} }
if err != nil {
return original, err
}
utf8 = bytes.Trim(utf8, "\x00")
return utf8, nil return decoded, nil
} }
// DecodeContentEncoding wraps the reader with decoder based on content encoding. // DecodeContentEncoding wraps the reader with decoder based on content encoding.

View File

@ -3,21 +3,6 @@ Feature: SMTP wrong messages
Given there is connected user "user" Given there is connected user "user"
And there is SMTP client logged in as "user" And there is SMTP client logged in as "user"
Scenario: Message with no charset and bad character
When SMTP client sends message
"""
From: Bridge Test <bridgetest@pm.test>
To: External Bridge <pm.bridge.qa@gmail.com>
Subject: Plain text, no charset, wrong base64 external
Content-Disposition: inline
Content-Type: text/plain;
Content-Transfer-Encoding: base64
sdfsdfsd
"""
Then SMTP response is "SMTP error: 554 Error: transaction failed, blame it on the weather: non-utf8 content without charset specification"
Scenario: Message with attachment and wrong boundaries Scenario: Message with attachment and wrong boundaries
When SMTP client sends message When SMTP client sends message
""" """
@ -53,4 +38,4 @@ Feature: SMTP wrong messages
""" """
Then SMTP response is "SMTP error: 554 Error: transaction failed, blame it on the weather: multipart: NextPart: EOF" Then SMTP response is "SMTP error: 554 Error: transaction failed, blame it on the weather: multipart: NextPart: EOF"

View File

@ -145,7 +145,7 @@ Feature: SMTP sending of plain messages
} }
""" """
Scenario: Message without charset Scenario: Message without charset is utf8
When SMTP client sends message When SMTP client sends message
""" """
From: Bridge Test <bridgetest@pm.test> From: Bridge Test <bridgetest@pm.test>
@ -156,6 +156,46 @@ Feature: SMTP sending of plain messages
This is body of mail without charset. Please assume utf8 This is body of mail without charset. Please assume utf8
"""
Then SMTP response is "OK"
And mailbox "Sent" for "user" has messages
| time | from | to | subject |
| now | [userAddress] | pm.bridge.qa@gmail.com | Plain text no charset external |
And message is sent with API call:
"""
{
"Message": {
"Subject": "Plain text no charset external",
"Sender": {
"Name": "Bridge Test"
},
"ToList": [
{
"Address": "pm.bridge.qa@gmail.com",
"Name": "External Bridge"
}
],
"CCList": [],
"BCCList": [],
"MIMEType": "text/plain"
}
}
"""
Scenario: Message without charset is base64-encoded latin1
When SMTP client sends message
"""
From: Bridge Test <bridgetest@pm.test>
To: External Bridge <pm.bridge.qa@gmail.com>
Subject: Plain text no charset external
Content-Disposition: inline
Content-Type: text/plain;
Content-Transfer-Encoding: base64
dGhpcyBpcyBpbiBsYXRpbjEgYW5kIHRoZXJlIGFyZSBsb3RzIG9mIGVzIHdpdGggYWNjZW50czog
6enp6enp6enp6enp6enp
""" """
Then SMTP response is "OK" Then SMTP response is "OK"
And mailbox "Sent" for "user" has messages And mailbox "Sent" for "user" has messages