We build too many walls and not enough bridges

Jakub
2020-04-08 12:59:16 +02:00
commit 17f4d6097a
494 changed files with 62753 additions and 0 deletions

pkg/mime/Changelog.md Normal file

@@ -0,0 +1,24 @@
# Do not modify this file!
It is here for historical reasons only. All changes should be documented in the
Changelog at the root of this repository.
# Changelog
## [2019-12-10] v1.0.2
### Added
* support for shift_JIS (cp932) encoding
## [2019-09-30] v1.0.1
### Changed
* fix divide by zero
## [2019-09-26] v1.0.0
### Changed
* Import-Export#192: filter header parameters
* ignore duplicate parameters (keep the latest definition)
* convert non-UTF-8 RFC 2231 parameters to a single-line UTF-8 RFC 2231 parameter

pkg/mime/encoding.go Normal file

@@ -0,0 +1,254 @@
// Copyright (c) 2020 Proton Technologies AG
//
// This file is part of ProtonMail Bridge.
//
// ProtonMail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ProtonMail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ProtonMail Bridge. If not, see <https://www.gnu.org/licenses/>.
package pmmime
import (
"bytes"
"fmt"
"io"
"mime"
"mime/quotedprintable"
"regexp"
"strings"
"unicode/utf8"
"encoding/base64"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/htmlindex"
"golang.org/x/text/transform"
)
var wordDec = &mime.WordDecoder{
CharsetReader: func(charset string, input io.Reader) (io.Reader, error) {
dec, err := selectDecoder(charset)
if err != nil {
return nil, err
}
if dec == nil { // utf-8
return input, nil
}
return dec.Reader(input), nil
},
}
// getEncoding resolves a MIME charset label to an encoding; the label is trimmed and lowercased before matching.
func getEncoding(charset string) (enc encoding.Encoding, err error) {
preparsed := strings.Trim(strings.ToLower(charset), " \t\r\n")
// koi
re := regexp.MustCompile("(cs)?koi[-_ ]?8?[-_ ]?(r|ru|u|uk)?$")
matches := re.FindAllStringSubmatch(preparsed, -1)
if len(matches) == 1 && len(matches[0]) == 3 {
preparsed = "koi8-"
switch matches[0][2] {
case "u", "uk":
preparsed += "u"
default:
preparsed += "r"
}
}
// windows-XXXX
re = regexp.MustCompile("(cp|(cs)?win(dows)?)[-_ ]?([0-9]{3,4})$")
matches = re.FindAllStringSubmatch(preparsed, -1)
if len(matches) == 1 && len(matches[0]) == 5 {
switch matches[0][4] {
case "874", "1250", "1251", "1252", "1253", "1254", "1255", "1256", "1257", "1258":
preparsed = "windows-" + matches[0][4]
}
}
// iso
re = regexp.MustCompile("iso[-_ ]?([0-9]{4})[-_ ]?([0-9]+|jp)?[-_ ]?(i|e)?")
matches = re.FindAllStringSubmatch(preparsed, -1)
if len(matches) == 1 && len(matches[0]) == 4 {
if matches[0][1] == "2022" && matches[0][2] == "jp" {
preparsed = "iso-2022-jp"
}
if matches[0][1] == "8859" {
switch matches[0][2] {
case "1", "2", "3", "4", "5", "7", "8", "9", "10", "11", "13", "14", "15", "16":
preparsed = "iso-8859-" + matches[0][2]
if matches[0][3] == "i" {
preparsed += "-" + matches[0][3]
}
case "":
preparsed = "iso-8859-1"
}
}
}
// Latin is tricky.
re = regexp.MustCompile("^(cs|csiso)?l(atin)?[-_ ]?([0-9]{1,2})$")
matches = re.FindAllStringSubmatch(preparsed, -1)
if len(matches) == 1 && len(matches[0]) == 4 {
switch matches[0][3] {
case "1":
preparsed = "windows-1252"
case "2", "3", "4", "5":
preparsed = "iso-8859-" + matches[0][3]
case "6":
preparsed = "iso-8859-10"
case "8":
preparsed = "iso-8859-14"
case "9":
preparsed = "iso-8859-15"
case "10":
preparsed = "iso-8859-16"
}
}
// Missing substitutions.
switch preparsed {
case "csutf8", "iso-utf-8", "utf8mb4":
preparsed = "utf-8"
case "cp932", "windows-932", "windows-31J", "ibm-943", "cp943":
preparsed = "shift_jis"
case "eucjp", "ibm-eucjp":
preparsed = "euc-jp"
case "euckr", "ibm-euckr", "cp949":
preparsed = "euc-kr"
case "euccn", "ibm-euccn":
preparsed = "gbk"
case "zht16mswin950", "cp950":
preparsed = "big5"
case "csascii",
"ansi_x3.4-1968",
"ansi_x3.4-1986",
"ansi_x3.110-1983",
"cp850",
"cp858",
"us",
"iso646",
"iso-646",
"iso646-us",
"iso_646.irv:1991",
"cp367",
"ibm367",
"ibm-367",
"iso-ir-6":
preparsed = "ascii"
case "ibm852":
preparsed = "iso-8859-2"
case "iso-ir-199", "iso-celtic":
preparsed = "iso-8859-14"
case "iso-ir-226":
preparsed = "iso-8859-16"
case "macroman":
preparsed = "macintosh"
}
enc, _ = htmlindex.Get(preparsed)
if enc == nil {
err = fmt.Errorf("can not get encodig for '%s' (or '%s')", charset, preparsed)
}
return
}
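// selectDecoder returns a decoder for the given charset label: UTF-7 labels get the dedicated UTF-7 decoder, everything else is resolved through getEncoding.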
func selectDecoder(charset string) (decoder *encoding.Decoder, err error) {
var enc encoding.Encoding
lcharset := strings.Trim(strings.ToLower(charset), " \t\r\n")
switch lcharset {
case "utf7", "utf-7", "unicode-1-1-utf-7":
return NewUtf7Decoder(), nil
default:
enc, err = getEncoding(lcharset)
}
if err == nil {
decoder = enc.NewDecoder()
}
return
}
// DecodeHeader decodes a MIME encoded-word header if needed. It returns an error if the decoded value contains non-UTF-8 characters.
func DecodeHeader(raw string) (decoded string, err error) {
if decoded, err = wordDec.DecodeHeader(raw); err != nil {
decoded = raw
}
if !utf8.ValidString(decoded) {
err = fmt.Errorf("header contains non utf8 chars: %v", err)
}
return
}
// EncodeHeader encodes the string as a UTF-8 Q-encoded (quoted-printable) MIME header.
func EncodeHeader(s string) string {
return mime.QEncoding.Encode("utf-8", s)
}
// DecodeCharset decodes the original bytes using the charset from the content type parameters.
// When the charset is missing, it checks that the content is valid UTF-8.
func DecodeCharset(original []byte, contentTypeParams map[string]string) ([]byte, error) {
var decoder *encoding.Decoder
var err error
if charset, ok := contentTypeParams["charset"]; ok {
decoder, err = selectDecoder(charset)
} else {
if utf8.Valid(original) {
return original, nil
}
err = fmt.Errorf("non-utf8 content without charset specification")
}
if err != nil {
return original, err
}
decoded := make([]byte, len(original))
nDst, nSrc, err := decoder.Transform(decoded, original, false)
for err == transform.ErrShortDst {
if nDst < 1 {
nDst = 1
}
if nSrc < 1 {
nSrc = 1
}
decoded = make([]byte, (nDst/nSrc+1)*len(original))
nDst, nSrc, err = decoder.Transform(decoded, original, false)
}
if err != nil {
return original, err
}
decoded = bytes.Trim(decoded, "\x00")
return decoded, nil
}
// DecodeContentEncoding wraps the reader with decoder based on content encoding.
func DecodeContentEncoding(r io.Reader, contentEncoding string) (d io.Reader) {
switch strings.ToLower(contentEncoding) {
case "quoted-printable":
d = quotedprintable.NewReader(r)
case "base64":
d = base64.NewDecoder(base64.StdEncoding, r)
case "7bit", "8bit", "binary", "": // Nothing to do
d = r
}
return
}
// ParseMediaType wraps mime.ParseMediaType, which does not support RFC 2231 values in non-ASCII / non-UTF-8 encodings, so the value is pre-processed first.
func ParseMediaType(v string) (mediatype string, params map[string]string, err error) {
v, _ = changeEncodingAndKeepLastParamDefinition(v)
return mime.ParseMediaType(v)
}
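The exported helpers above are meant to be used together when reading a message part: decode the headers, undo the transfer encoding, then convert the body charset. A minimal sketch, assuming the package is imported under its repository path (the import path below is an assumption; the sample values come from encoding_test.go further down):

package main

import (
    "fmt"

    pmmime "github.com/ProtonMail/proton-bridge/pkg/mime" // illustrative import path
)

func main() {
    // Decode an RFC 2047 encoded-word header (value taken from encoding_test.go).
    subject, err := pmmime.DecodeHeader("=?UTF-8?B?w4TDi8OPw5bDnA==?= =?UTF-8?B?IMOkw6vDr8O2w7w=?=")
    fmt.Println(subject, err) // ÄËÏÖÜ äëïöü <nil>

    // Convert a KOI8-R body to UTF-8 using the charset content type parameter.
    koi8 := []byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1}
    body, err := pmmime.DecodeCharset(koi8, map[string]string{"charset": "koi8-r"})
    fmt.Println(string(body), err) // азбука <nil>

    // Pre-parse an RFC 2231 parameter that mime.ParseMediaType alone would mishandle.
    _, params, err := pmmime.ParseMediaType("attachment; filename*=big5''%B3%C6%A7%D1%BF%FD.m4a")
    fmt.Println(params["filename"], err) // 備忘錄.m4a <nil>
}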

pkg/mime/encoding_test.go Normal file

@@ -0,0 +1,445 @@
// Copyright (c) 2020 Proton Technologies AG
//
// This file is part of ProtonMail Bridge.
//
// ProtonMail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ProtonMail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ProtonMail Bridge. If not, see <https://www.gnu.org/licenses/>.
package pmmime
import (
"bytes"
//"fmt"
"strings"
"testing"
"golang.org/x/text/encoding/htmlindex"
a "github.com/stretchr/testify/assert"
)
func TestDecodeHeader(t *testing.T) {
testData := []struct{ raw, expected string }{
{
"",
"",
},
{
"=?iso-2022-jp?Q?=1B$B!Z=1B(BTimes_Car_PLUS=1B$B![JV5Q>Z=1B(B?=",
"【Times Car PLUS】返却証",
},
{
`=?iso-2022-jp?Q?iTunes_Movie_=1B$B%K%e!<%j%j!<%9$HCmL\:nIJ=1B(B?=`,
"iTunes Movie ニューリリースと注目作品",
},
{
"=?UTF-8?B?w4TDi8OPw5bDnA==?= =?UTF-8?B?IMOkw6vDr8O2w7w=?=",
"ÄËÏÖÜ äëïöü",
},
{
"=?ISO-8859-2?B?xMtJ1tw=?= =?ISO-8859-2?B?IOTrafb8?=",
"ÄËIÖÜ äëiöü",
},
{
"=?uknown?B?xMtJ1tw=?= =?ISO-8859-2?B?IOTrafb8?=",
"=?uknown?B?xMtJ1tw=?= =?ISO-8859-2?B?IOTrafb8?=",
},
}
for _, val := range testData {
if decoded, err := DecodeHeader(val.raw); strings.Compare(val.expected, decoded) != 0 {
t.Errorf("Incorrect decoding of header %q expected %q but have %q; Error %v", val.raw, val.expected, decoded, err)
} else {
// fmt.Println("Header", val.raw, "successfully decoded", decoded, ". Error", err)
}
}
}
type testParseMediaTypeData struct {
arg, wantMediaType string
wantParams map[string]string
}
func (d *testParseMediaTypeData) run(t *testing.T) {
gotMediaType, params, err := ParseMediaType(d.arg)
a.Nil(t, err)
a.Equal(t, d.wantMediaType, gotMediaType)
a.Equal(t, d.wantParams, params)
}
func TestParseMediaType(t *testing.T) {
testTable := map[string]testParseMediaTypeData{
"TwiceTheSameParameter": {
arg: "attachment; filename=joy.txt; filename=JOY.TXT; title=hi;",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "JOY.TXT", "title": "hi"},
},
"SingleLineUTF8": {
arg: "attachment;\nfilename*=utf-8''%F0%9F%98%81%F0%9F%98%82.txt;\n title=smile",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "😁😂.txt", "title": "smile"},
},
"MultiLineUTF8": {
arg: "attachment;\nfilename*0*=utf-8''%F0%9F%98%81; title=smile;\nfilename*1*=%F0%9F%98%82;\nfilename*2=.txt",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "😁😂.txt", "title": "smile"},
},
"MultiLineFirstNoEncNextUTF8": {
arg: "attachment;\nfilename*0*=utf-8''joy ;\n title*=utf-8''smile; \nfilename*1*=%F0%9F%98%82;\nfilename*2=.txt",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "joy😂.txt", "title": "smile"},
},
"SingleLineBig5": {
arg: "attachment;\nfilename*=big5''%B3%C6%A7%D1%BF%FD.m4a; title*=utf8''memorandum",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "備忘錄.m4a", "title": "memorandum"},
},
"MultiLineBig5": {
arg: "attachment;\nfilename*0*=big5''%B3%C6a; title*0=utf8''memorandum; filename*2=%BF%FD.m4a; \nfilename*1*=%A7%D1b;",
wantMediaType: "attachment",
wantParams: map[string]string{"filename": "備a忘b錄.m4a", "title": "memorandum"},
},
}
for name, testData := range testTable {
t.Run(name, testData.run)
}
}
func TestGetEncoding(t *testing.T) {
// All MIME charsets with aliases can be found here:
// https://www.iana.org/assignments/character-sets/character-sets.xhtml
mimesets := map[string][]string{
"utf-8": []string{ // MIB 16
"utf8",
"csutf8",
"unicode-1-1-utf-8",
"iso-utf-8",
"utf8mb4",
},
"gbk": []string{
"gb2312", // MIB 2025
//"euc-cn": []string{
"euccn",
"ibm-euccn",
},
//"utf7": []string{"utf-7", "unicode-1-1-utf-7"},
"iso-8859-2": []string{ // MIB 5
"iso-ir-101",
"iso_8859-2",
"iso8859-2",
"latin2",
"l2",
"csisolatin2",
"ibm852",
//"FAILEDibm852",
},
"iso-8859-3": []string{ // MIB 6
"iso-ir-109",
"iso_8859-3",
"latin3",
"l3",
"csisolatin3",
},
"iso-8859-4": []string{ // MIB 7
"iso-ir-110",
"iso_8859-4",
"latin4",
"l4",
"csisolatin4",
},
"iso-8859-5": []string{ // MIB 8
"iso-ir-144",
"iso_8859-5",
"cyrillic",
"csisolatincyrillic",
},
"iso-8859-6": []string{ // MIB 9
"iso-ir-127",
"iso_8859-6",
"ecma-114",
"asmo-708",
"arabic",
"csisolatinarabic",
//"iso-8859-6e": []string{ // MIB 81 just direction
"csiso88596e",
"iso-8859-6-e",
//"iso-8859-6i": []string{ // MIB 82
"csiso88596i",
"iso-8859-6-i"},
"iso-8859-7": []string{ // MIB 10
"iso-ir-126",
"iso_8859-7",
"elot_928",
"ecma-118",
"greek",
"greek8",
"csisolatingreek"},
"iso-8859-8": []string{ // MIB 11
"iso-ir-138",
"iso_8859-8",
"hebrew",
"csisolatinhebrew",
//"iso-8859-8e": []string{ // MIB 84 (directionality
"csiso88598e",
"iso-8859-8-e",
},
"iso-8859-8-i": []string{ // MIB 85
"logical",
"csiso88598i",
"iso-8859-8-i", // Hebrew, the "i" means right-to-left, probably unnecessary with ISO cleaning above.
},
"iso-8859-10": []string{ // MIB 13
"iso-ir-157",
"l6",
"iso_8859-10:1992",
"csisolatin6",
"latin6"},
"iso-8859-13": []string{ // MIB 109
"csiso885913"},
"iso-8859-14": []string{ // MIB 110
"iso-ir-199",
"iso_8859-14:1998",
"iso_8859-14",
"latin8",
"iso-celtic",
"l8",
"csiso885914"},
"iso-8859-15": []string{ // MIB 111
"iso_8859-15",
"latin-9",
"csiso885915",
"ISO8859-15"},
"iso-8859-16": []string{ // MIB 112
"iso-ir-226",
"iso_8859-16:2001",
"iso_8859-16",
"latin10",
"l10",
"csiso885916",
},
"windows-874": []string{ // MIB 2109
"cswindows874",
"cp874",
"iso-8859-11",
"tis-620",
},
"windows-1250": []string{ // MIB 2250
"cswindows1250",
"cp1250",
},
"windows-1251": []string{ // MIB 2251
"cswindows1251",
"cp1251",
},
"windows-1252": []string{ // MIB 2252
"cswindows1252",
"cp1252",
"3dwindows-1252",
"we8mswin1252",
"us-ascii", // MIB 3
"ansi_x3.110-1983", // MIB 74 // usascii
//"iso-8859-1": []string{ // MIB 4 succeed by win1252
"iso8859-1",
"iso-ir-100",
"iso_8859-1",
"latin1",
"l1",
"ibm819",
"cp819",
"csisolatin1",
"ansi_x3.4-1968",
"ansi_x3.4-1986",
"cp850",
"cp858", // "cp850" Mostly correct except for the Euro sign.
"iso_646.irv:1991",
"iso646-us",
"us",
"ibm367",
"cp367",
"csascii",
"ascii",
"iso-ir-6",
"we8iso8859p1",
},
"windows-1253": []string{"cswindows1253", "cp1253"}, // MIB 2253
"windows-1254": []string{"cswindows1254", "cp1254"}, // MIB 2254
"windows-1255": []string{"cSwindows1255", "cp1255"}, // MIB 2255
"windows-1256": []string{"cswIndows1256", "cp1256"}, // MIB 2256
"windows-1257": []string{"cswinDows1257", "cp1257"}, // MIB 2257
"windows-1258": []string{"cswindoWs1258", "cp1258"}, // MIB 2257
"koi8-r": []string{"cskoi8r", "koi8r"}, // MIB 2084
"koi8-u": []string{"cskoi8u", "koi8u"}, // MIB 2088
"macintosh": []string{"mac", "macroman", "csmacintosh"}, // MIB 2027
"big5": []string{
"zht16mswin950", // cp950
"cp950",
},
"euc-kr": []string{
"euckr", // MIB 38
"ibm-euckr",
//"uhc": []string{ // Korea
"ks_c_5601-1987",
"ksc5601",
"cp949",
},
"euc-jp": []string{
"eucjp",
"ibm-eucjp",
},
"shift_jis": []string{
"CP932",
"MS932",
"Windows-932",
"Windows-31J",
"MS_Kanji",
"IBM-943",
"CP943",
},
"iso-2022-jp": []string{ // MIB 39
"iso2022jp",
"csiso2022jp",
},
}
for expected, names := range mimesets {
expenc, _ := htmlindex.Get(expected)
if canonical, err := htmlindex.Name(expenc); canonical != expected || err != nil {
t.Fatalf("Error while get canonical name. Expected '%v' but have %v `%#v`: %v", expected, canonical, expenc, err)
}
for _, name := range names {
enc, err := getEncoding(name)
if err != nil || enc == nil {
t.Errorf("Error while getting encoding for %v returned: '%#v' and error: '%v'", name, enc, err)
}
if expenc != enc {
t.Errorf("For %v expected %v '%v' but have '%v'", name, expected, expenc, enc)
}
}
}
}
// sample text for UTF8 http://www.columbia.edu/~fdc/utf8/index.html
func TestEncodeReader(t *testing.T) {
// define test data
testData := []struct {
params map[string]string
original []byte
message string
}{
// russian
{
map[string]string{"charset": "koi8-r"},
// а, з, б, у, к, а, а, б, в, г, д, е, ё
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
"азбукаабвгдеё",
},
{
map[string]string{"charset": "KOI8-R"},
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
"азбукаабвгдеё",
},
{
map[string]string{"charset": "csKOI8R"},
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
"азбукаабвгдеё",
},
{
map[string]string{"charset": "koi8-u"},
[]byte{0xC1, 0xDA, 0xC2, 0xD5, 0xCB, 0xC1, 0xC1, 0xC2, 0xD7, 0xC7, 0xC4, 0xC5, 0xA3},
"азбукаабвгдеё",
},
{
map[string]string{"charset": "iso-8859-5"},
// а , з , б , у , к , а , а , б , в , г , д , е , ё
[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xF1},
"азбукаабвгдеё",
},
{
map[string]string{"charset": "csWrong"},
[]byte{0xD0, 0xD7, 0xD1, 0xE3, 0xDA, 0xD0, 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6},
"",
},
{
map[string]string{"charset": "utf8"},
[]byte{0xD0, 0xB0, 0xD0, 0xB7, 0xD0, 0xB1, 0xD1, 0x83, 0xD0, 0xBA, 0xD0, 0xB0, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD1, 0x91},
"азбукаабвгдеё",
},
// czechoslovakia
{
map[string]string{"charset": "windows-1250"},
[]byte{225, 228, 232, 233, 236, 244},
"áäčéěô",
},
// umlauts
{
map[string]string{"charset": "iso-8859-1"},
[]byte{196, 203, 214, 220, 228, 235, 246, 252},
"ÄËÖÜäëöü",
},
// latvia
{
map[string]string{"charset": "iso-8859-4"},
[]byte{224, 239, 243, 182, 254},
"āīķļū",
},
{ // encoded by https://www.motobit.com/util/charset-codepage-conversion.asp
map[string]string{"charset": "utf7"},
[]byte("He wes Leovena+APA-es sone -- li+APA-e him be Drihten.+A6QDtw- +A7MDuwPOA8MDwwOx- +A7wDvwPF- +A60DtAPJA8MDsQO9- +A7UDuwO7A7cDvQO5A7oDrg-. +BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+BCcENQRABD0ENQQ7BDg- +BDgENwQxBEs- +BDcENAQ1BEEETA- +BDg- +BEIEMAQ8-,+C68LvguuC7ELvwuoC80LpA- +C64Lygu0C78LlQuzC78LsgvH- +C6QLrgu/C7QLzQuuC8oLtAu/- +C6oLywuyC80- +C4cLqQu/C6QLvgu1C6QLwQ- +C44LmQvNC5ULwQuuC80- +C5ULvgujC8sLrgvN-."),
"He wes Leovenaðes sone -- liðe him be Drihten.Τη γλώσσα μου έδωσαν ελληνική. Чернели избы здесь и там,Чернели избы здесь и там,யாமறிந்த மொழிகளிலே தமிழ்மொழி போல் இனிதாவது எங்கும் காணோம்.",
},
// iconv -f UTF8 -t GB2312 utf8.txt | hexdump -v -e '"0x" 1/1 "%x, "'
{ // encoded by iconv; dump by `cat gb2312.txt | hexdump -v -e '"0x" 1/1 "%x "'` and reformat; text from https://zh.wikipedia.org/wiki/GB_2312
map[string]string{"charset": "GB2312"},
[]byte{0x47, 0x42, 0x20, 0x32, 0x33, 0x31, 0x32, 0xb5, 0xc4, 0xb3, 0xf6, 0xcf, 0xd6, 0xa3, 0xac, 0xbb, 0xf9, 0xb1, 0xbe, 0xc2, 0xfa, 0xd7, 0xe3, 0xc1, 0xcb, 0xba, 0xba, 0xd7, 0xd6, 0xb5, 0xc4, 0xbc, 0xc6, 0xcb, 0xe3, 0xbb, 0xfa, 0xb4, 0xa6, 0xc0, 0xed, 0xd0, 0xe8, 0xd2, 0xaa, 0xa3, 0xac, 0xcb, 0xfc, 0xcb, 0xf9, 0xca, 0xd5, 0xc2, 0xbc, 0xb5, 0xc4, 0xba, 0xba, 0xd7, 0xd6, 0xd2, 0xd1, 0xbe, 0xad, 0xb8, 0xb2, 0xb8, 0xc7, 0xd6, 0xd0, 0xb9, 0xfa, 0xb4, 0xf3, 0xc2, 0xbd, 0x39, 0x39, 0x2e, 0x37, 0x35, 0x25, 0xb5, 0xc4, 0xca, 0xb9, 0xd3, 0xc3, 0xc6, 0xb5, 0xc2, 0xca, 0xa1, 0xa3, 0xb5, 0xab, 0xb6, 0xd4, 0xd3, 0xda, 0xc8, 0xcb, 0xc3, 0xfb},
"GB 2312的出现基本满足了汉字的计算机处理需要它所收录的汉字已经覆盖中国大陆99.75%的使用频率。但对于人名",
},
{ // encoded by iconv; text from https://jp.wikipedia.org/wiki/Shift_JIS
map[string]string{"charset": "shift-jis"},
[]byte{0x95, 0xb6, 0x8e, 0x9a, 0x95, 0x84, 0x8d, 0x86, 0x89, 0xbb, 0x95, 0xfb, 0x8e, 0xae, 0x53, 0x68, 0x69, 0x66, 0x74, 0x5f, 0x4a, 0x49, 0x53, 0x82, 0xcc, 0x90, 0xdd, 0x8c, 0x76, 0x8e, 0xd2, 0x82, 0xe7, 0x82, 0xcd, 0x81, 0x41, 0x90, 0xe6, 0x8d, 0x73, 0x82, 0xb5, 0x82, 0xc4, 0x82, 0xe6, 0x82, 0xad, 0x97, 0x98, 0x97, 0x70, 0x82, 0xb3, 0x82, 0xea, 0x82, 0xc4, 0x82, 0xa2, 0x82, 0xbd, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x30, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x31, 0x81, 0x6a, 0x82, 0xcc, 0x38, 0x83, 0x72, 0x83, 0x62, 0x83, 0x67, 0x95, 0x84, 0x8d, 0x86, 0x81, 0x69, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x89, 0x70, 0x90, 0x94, 0x8e, 0x9a, 0x81, 0x45, 0x94, 0xbc, 0x8a, 0x70, 0x83, 0x4a, 0x83, 0x69, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xc6, 0x81, 0x41, 0x4a, 0x49, 0x53, 0x20, 0x43, 0x20, 0x36, 0x32, 0x32, 0x36, 0x81, 0x69, 0x8c, 0xbb, 0x8d, 0xdd, 0x82, 0xcc, 0x4a, 0x49, 0x53, 0x20, 0x58, 0x20, 0x30, 0x32, 0x30, 0x38, 0x81, 0x41, 0x88, 0xc8, 0x89, 0xba, 0x81, 0x75, 0x8a, 0xbf, 0x8e, 0x9a, 0x81, 0x76, 0x81, 0x6a, 0x82, 0xcc, 0x97, 0xbc, 0x95, 0xb6, 0x8e, 0x9a, 0x8f, 0x57, 0x8d, 0x87, 0x82, 0xf0, 0x95, 0x5c, 0x8c, 0xbb, 0x82, 0xb5, 0x82, 0xe6, 0x82, 0xa4, 0x82, 0xc6, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42, 0x82, 0xdc, 0x82, 0xbd, 0x81, 0x41, 0x83, 0x74, 0x83, 0x40, 0x83, 0x43, 0x83, 0x8b, 0x82, 0xcc, 0x91, 0xe5, 0x82, 0xab, 0x82, 0xb3, 0x82, 0xe2, 0x8f, 0x88, 0x97, 0x9d, 0x8e, 0x9e, 0x8a, 0xd4, 0x82, 0xcc, 0x92, 0x5a, 0x8f, 0x6b, 0x82, 0xf0, 0x90, 0x7d, 0x82, 0xe9, 0x82, 0xbd, 0x82, 0xdf, 0x81, 0x41, 0x83, 0x47, 0x83, 0x58, 0x83, 0x50, 0x81, 0x5b, 0x83, 0x76, 0x83, 0x56, 0x81, 0x5b, 0x83, 0x50, 0x83, 0x93, 0x83, 0x58, 0x82, 0xc8, 0x82, 0xb5, 0x82, 0xc5, 0x8d, 0xac, 0x8d, 0xdd, 0x89, 0xc2, 0x94, 0x5c, 0x82, 0xc9, 0x82, 0xb7, 0x82, 0xe9, 0x82, 0xb1, 0x82, 0xc6, 0x82, 0xf0, 0x8a, 0xe9, 0x90, 0x7d, 0x82, 0xb5, 0x82, 0xbd, 0x81, 0x42},
"文字符号化方式Shift_JISの設計者らは、先行してよく利用されていたJIS C 6220現在のJIS X 0201の8ビット符号以下「英数字・半角カナ」と、JIS C 6226現在のJIS X 0208、以下「漢字」の両文字集合を表現しようとした。また、ファイルの大きさや処理時間の短縮を図るため、エスケープシーケンスなしで混在可能にすることを企図した。",
},
// add more from mutations of https://en.wikipedia.org/wiki/World_Wide_Web
}
// run tests
for _, val := range testData {
//fmt.Println("Testing ", val)
expected := []byte(val.message)
decoded, err := DecodeCharset(val.original, val.params)
if len(expected) == 0 {
if err == nil {
t.Error("Expected err but have ", err)
} else {
//fmt.Println("Expected err: ", err)
continue
}
} else {
if err != nil {
t.Error("Expected ok but have ", err)
}
}
if bytes.Equal(decoded, expected) {
// fmt.Println("Successful decoding of ", val.params, ":", string(decoded))
} else {
t.Error("Wrong decoding of ", val.params, ". Expected\n", expected, "\nbut have\n", decoded)
}
if strings.Compare(val.message, string(decoded)) != 0 {
t.Error("Wrong message for ", val.params, ".Expected\n", val.message, "\nbut have\n", string(decoded))
}
}
}

pkg/mime/mediaType.go Normal file

@@ -0,0 +1,364 @@
// Copyright (c) 2020 Proton Technologies AG
//
// This file is part of ProtonMail Bridge.
//
// ProtonMail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ProtonMail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ProtonMail Bridge. If not, see <https://www.gnu.org/licenses/>.
package pmmime
import (
"errors"
"fmt"
"strings"
"unicode"
"github.com/sirupsen/logrus"
)
// changeEncodingAndKeepLastParamDefinition pre-processes a media type value so that the Go
// standard library can parse it: duplicated parameters keep only their last definition, and
// non-UTF-8 RFC 2231 parameters are converted to single-line UTF-8 ones.
func changeEncodingAndKeepLastParamDefinition(v string) (out string, err error) {
log := logrus.WithField("pkg", "pm-mime")
out = v // By default don't do anything with that.
keepOrig := true
i := strings.Index(v, ";")
if i == -1 {
i = len(v)
}
mediatype := strings.TrimSpace(strings.ToLower(v[0:i]))
params := map[string]string{}
var continuation map[string]map[string]string
v = v[i:]
for len(v) > 0 {
v = strings.TrimLeftFunc(v, unicode.IsSpace)
if len(v) == 0 {
break
}
key, value, rest := consumeMediaParam(v)
if key == "" {
break
}
pmap := params
if idx := strings.Index(key, "*"); idx != -1 {
baseName := key[:idx]
if continuation == nil {
continuation = make(map[string]map[string]string)
}
var ok bool
if pmap, ok = continuation[baseName]; !ok {
continuation[baseName] = make(map[string]string)
pmap = continuation[baseName]
}
if isFirstContinuation(key) {
charset, _, err := get2231Charset(value)
if err != nil {
log.Errorln("Filter params:", err)
continue
}
if charset != "utf-8" && charset != "us-ascii" {
keepOrig = false
}
}
}
if _, exists := pmap[key]; exists {
keepOrig = false
}
pmap[key] = value
v = rest
}
if keepOrig {
return
}
if continuation != nil {
for paramKey, contMap := range continuation {
value, err := mergeContinuations(paramKey, contMap)
if err == nil {
params[paramKey+"*"] = value
continue
}
// Fallback.
log.Errorln("Merge param", paramKey, ":", err)
for ck, cv := range contMap {
params[ck] = cv
}
}
}
// Reassemble the media type value from the (possibly rewritten) parameters.
out = mediatype
for k, v := range params {
out += ";"
out += k
out += "="
out += v
}
return
}
func isFirstContinuation(key string) bool {
if idx := strings.Index(key, "*"); idx != -1 {
return key[idx:] == "*" || key[idx:] == "*0*"
}
return false
}
// get2231Charset partially from mime/mediatype.go:211 function `decode2231Enc`.
func get2231Charset(v string) (charset, value string, err error) {
sv := strings.SplitN(v, "'", 3)
if len(sv) != 3 {
err = errors.New("incorrect RFC2231 charset format")
return
}
charset = strings.ToLower(sv[0])
value = sv[2]
return
}
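// mergeContinuations joins the RFC 2231 continuation segments collected for paramKey and converts the result to a single UTF-8 RFC 2231 value.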
func mergeContinuations(paramKey string, contMap map[string]string) (string, error) {
var err error
var charset, value string
// Single value.
if contValue, ok := contMap[paramKey+"*"]; ok {
if charset, value, err = get2231Charset(contValue); err != nil {
return "", err
}
} else {
for n := 0; ; n++ {
contKey := fmt.Sprintf("%s*%d", paramKey, n)
contValue, isLast := contMap[contKey]
if !isLast {
var ok bool
contValue, ok = contMap[contKey+"*"]
if !ok {
return "", errors.New("not valid RFC2231 continuation")
}
}
if n == 0 {
if charset, value, err = get2231Charset(contValue); err != nil || charset == "" {
return "", err
}
} else {
value += contValue
}
if isLast {
break
}
}
}
return convertHexToUTF(charset, value)
}
// convertHexToUTF converts a percent-hex-escaped value in the given charset to a UTF-8 value in RFC 2231 format.
func convertHexToUTF(charset, value string) (string, error) {
raw, err := percentHexUnescape(value)
if err != nil {
return "", err
}
utf8, err := DecodeCharset(raw, map[string]string{"charset": charset})
return "utf-8''" + percentHexEscape(utf8), err
}
// consumeMediaParam copy paste mime/mediatype.go:297.
func consumeMediaParam(v string) (param, value, rest string) {
rest = strings.TrimLeftFunc(v, unicode.IsSpace)
if !strings.HasPrefix(rest, ";") {
return "", "", v
}
rest = rest[1:] // Consume semicolon.
rest = strings.TrimLeftFunc(rest, unicode.IsSpace)
param, rest = consumeToken(rest)
param = strings.ToLower(param)
if param == "" {
return "", "", v
}
rest = strings.TrimLeftFunc(rest, unicode.IsSpace)
if !strings.HasPrefix(rest, "=") {
return "", "", v
}
rest = rest[1:] // Consume equals sign.
rest = strings.TrimLeftFunc(rest, unicode.IsSpace)
value, rest2 := consumeValue(rest)
if value == "" && rest2 == rest {
return "", "", v
}
rest = rest2
return param, value, rest
}
// consumeToken copy paste mime/mediatype.go:238.
// consumeToken consumes a token from the beginning of the provided string,
// per RFC 2045 section 5.1 (referenced from 2183), and returns
// the token consumed and the rest of the string.
// Returns ("", v) on failure to consume at least one character.
func consumeToken(v string) (token, rest string) {
notPos := strings.IndexFunc(v, isNotTokenChar)
if notPos == -1 {
return v, ""
}
if notPos == 0 {
return "", v
}
return v[0:notPos], v[notPos:]
}
// consumeValue copy paste mime/mediatype.go:253
// consumeValue consumes a "value" per RFC 2045, where a value is
// either a 'token' or a 'quoted-string'. On success, consumeValue
// returns the value consumed (and de-quoted/escaped, if a
// quoted-string) and the rest of the string.
// On failure, returns ("", v).
func consumeValue(v string) (value, rest string) {
if v == "" {
return
}
if v[0] != '"' {
return consumeToken(v)
}
// parse a quoted-string
buffer := new(strings.Builder)
for i := 1; i < len(v); i++ {
r := v[i]
if r == '"' {
return buffer.String(), v[i+1:]
}
// When MSIE sends a full file path (in "intranet mode"), it does not
// escape backslashes: "C:\dev\go\foo.txt", not "C:\\dev\\go\\foo.txt".
//
// No known MIME generators emit unnecessary backslash escapes
// for simple token characters like numbers and letters.
//
// If we see an unnecessary backslash escape, assume it is from MSIE
// and intended as a literal backslash. This makes Go servers deal better
// with MSIE without affecting the way they handle conforming MIME
// generators.
if r == '\\' && i+1 < len(v) && !isTokenChar(rune(v[i+1])) {
buffer.WriteByte(v[i+1])
i++
continue
}
if r == '\r' || r == '\n' {
return "", v
}
buffer.WriteByte(v[i])
}
// Did not find end quote.
return "", v
}
// isNotTokenChar copy paste from mime/mediatype.go:234.
func isNotTokenChar(r rune) bool {
return !isTokenChar(r)
}
// isTokenChar copy paste from mime/grammar.go:19.
// isTokenChar reports whether rune is in 'token' as defined by RFC 1521 and RFC 2045.
func isTokenChar(r rune) bool {
// token := 1*<any (US-ASCII) CHAR except SPACE, CTLs,
// or tspecials>
return r > 0x20 && r < 0x7f && !isTSpecial(r)
}
// isTSpecial copy paste from mime/grammar.go:13
// isTSpecial reports whether rune is in 'tspecials' as defined by RFC
// 1521 and RFC 2045.
func isTSpecial(r rune) bool {
return strings.ContainsRune(`()<>@,;:\"/[]?=`, r)
}
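// percentHexEscape percent-hex-escapes every byte of raw; percentHexUnescape below is its counterpart.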
func percentHexEscape(raw []byte) (out string) {
for _, v := range raw {
out += fmt.Sprintf("%%%02x", v) // zero-pad so single-digit bytes stay valid two-character percent escapes
}
return
}
// percentHexUnescape copy paste from mime/mediatype.go:325.
func percentHexUnescape(s string) ([]byte, error) {
// Count %, check that they're well-formed.
percents := 0
for i := 0; i < len(s); {
if s[i] != '%' {
i++
continue
}
percents++
if i+2 >= len(s) || !ishex(s[i+1]) || !ishex(s[i+2]) {
s = s[i:]
if len(s) > 3 {
s = s[0:3]
}
return []byte{}, fmt.Errorf("mime: bogus characters after %%: %q", s)
}
i += 3
}
if percents == 0 {
return []byte(s), nil
}
t := make([]byte, len(s)-2*percents)
j := 0
for i := 0; i < len(s); {
switch s[i] {
case '%':
t[j] = unhex(s[i+1])<<4 | unhex(s[i+2])
j++
i += 3
default:
t[j] = s[i]
j++
i++
}
}
return t, nil
}
// ishex copy paste from mime/mediatype.go:364.
func ishex(c byte) bool {
switch {
case '0' <= c && c <= '9':
return true
case 'a' <= c && c <= 'f':
return true
case 'A' <= c && c <= 'F':
return true
}
return false
}
// unhex copy paste from mime/mediatype.go:376.
func unhex(c byte) byte {
switch {
case '0' <= c && c <= '9':
return c - '0'
case 'a' <= c && c <= 'f':
return c - 'a' + 10
case 'A' <= c && c <= 'F':
return c - 'A' + 10
}
return 0
}
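A short package-internal sketch (not part of the original commit) shows what this pre-processing does to a parameter list before mime.ParseMediaType sees it; the inputs mirror the TwiceTheSameParameter and MultiLineBig5 cases in encoding_test.go:

package pmmime

import "fmt"

// exampleMediaTypePreprocessing is a sketch, written as if it sat next to this
// file, calling the unexported pre-processing function directly.
func exampleMediaTypePreprocessing() {
    // Duplicated parameter: only the last definition survives, so the later
    // mime.ParseMediaType call sees filename=JOY.TXT.
    out, _ := changeEncodingAndKeepLastParamDefinition(
        "attachment; filename=joy.txt; filename=JOY.TXT; title=hi;")
    fmt.Println(out)

    // Non-UTF-8 RFC 2231 continuations: the Big5 segments are merged and
    // re-escaped as a single utf-8'' parameter, which then parses to the
    // filename "備a忘b錄.m4a".
    out, _ = changeEncodingAndKeepLastParamDefinition(
        "attachment;\nfilename*0*=big5''%B3%C6a; filename*2=%BF%FD.m4a; \nfilename*1*=%A7%D1b;")
    fmt.Println(out)
}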

pkg/mime/parser.go Normal file

@@ -0,0 +1,544 @@
// Copyright (c) 2020 Proton Technologies AG
//
// This file is part of ProtonMail Bridge.
//
// ProtonMail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ProtonMail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ProtonMail Bridge. If not, see <https://www.gnu.org/licenses/>.
package pmmime
import (
"bufio"
"bytes"
"io"
"io/ioutil"
"mime"
"mime/multipart"
"net/http"
"net/mail"
"net/textproto"
"regexp"
"strings"
log "github.com/sirupsen/logrus"
)
// VisitAcceptor decides what to do with each part that is processed.
// It is used by MimeVisitor.
type VisitAcceptor interface {
Accept(partReader io.Reader, header textproto.MIMEHeader, hasPlainSibling bool, isFirst, isLast bool) (err error)
}
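// VisitAll passes the whole message to the acceptor as a single top-level part, marked as both the first and the last part.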
func VisitAll(part io.Reader, h textproto.MIMEHeader, accepter VisitAcceptor) (err error) {
mediaType, _, err := getContentType(h)
if err != nil {
return
}
return accepter.Accept(part, h, mediaType == "text/plain", true, true)
}
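// IsLeaf reports whether the header describes a non-multipart (leaf) part.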
func IsLeaf(h textproto.MIMEHeader) bool {
return !strings.HasPrefix(h.Get("Content-Type"), "multipart/")
}
// MimeVisitor is the main object used to parse (visit) and process (accept) all parts of a MIME message.
type MimeVisitor struct {
target VisitAcceptor
}
// Accept reads the part recursively if needed.
// hasPlainSibling is set when the acceptor wants to check alternatives.
func (mv *MimeVisitor) Accept(part io.Reader, h textproto.MIMEHeader, hasPlainSibling bool, isFirst, isLast bool) (err error) {
if !isFirst {
return
}
parentMediaType, params, err := getContentType(h)
if err != nil {
return
}
if err = mv.target.Accept(part, h, hasPlainSibling, true, false); err != nil {
return
}
if !IsLeaf(h) {
var multiparts []io.Reader
var multipartHeaders []textproto.MIMEHeader
if multiparts, multipartHeaders, err = GetMultipartParts(part, params); err != nil {
return
}
hasPlainChild := false
for _, header := range multipartHeaders {
mediaType, _, _ := getContentType(header)
if mediaType == "text/plain" {
hasPlainChild = true
}
}
if hasPlainSibling && parentMediaType == "multipart/related" {
hasPlainChild = true
}
for i, p := range multiparts {
if err = mv.Accept(p, multipartHeaders[i], hasPlainChild, true, true); err != nil {
return
}
if err = mv.target.Accept(part, h, hasPlainSibling, false, i == (len(multiparts)-1)); err != nil {
return
}
}
}
return
}
// NewMimeVisitor returns a new MimeVisitor initialised with the given target acceptor.
func NewMimeVisitor(targetAccepter VisitAcceptor) *MimeVisitor {
return &MimeVisitor{targetAccepter}
}
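// GetRawMimePart returns a reader over the complete raw data together with a reader over the body of the first part delimited by the given boundary.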
func GetRawMimePart(rawdata io.Reader, boundary string) (io.Reader, io.Reader) {
b, _ := ioutil.ReadAll(rawdata)
tee := bytes.NewReader(b)
reader := bufio.NewReader(bytes.NewReader(b))
byteBoundary := []byte(boundary)
bodyBuffer := &bytes.Buffer{}
for {
line, _, err := reader.ReadLine()
if err != nil {
return tee, bytes.NewReader(bodyBuffer.Bytes())
}
if bytes.HasPrefix(line, byteBoundary) {
break
}
}
lineEndingLength := 0
for {
line, isPrefix, err := reader.ReadLine()
if err != nil {
return tee, bytes.NewReader(bodyBuffer.Bytes())
}
if bytes.HasPrefix(line, byteBoundary) {
break
}
lineEndingLength = 0
bodyBuffer.Write(line)
if !isPrefix {
reader.UnreadByte()
reader.UnreadByte()
token, _ := reader.ReadByte()
if token == '\r' {
lineEndingLength++
bodyBuffer.WriteByte(token)
}
lineEndingLength++
bodyBuffer.WriteByte(token)
}
}
ioutil.ReadAll(reader)
data := bodyBuffer.Bytes()
return tee, bytes.NewReader(data[0 : len(data)-lineEndingLength])
}
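// GetAllChildParts recursively flattens a (possibly multipart) part into its leaf parts and their headers, following only the preferred branch of multipart/alternative.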
func GetAllChildParts(part io.Reader, h textproto.MIMEHeader) (parts []io.Reader, headers []textproto.MIMEHeader, err error) {
mediaType, params, err := getContentType(h)
if err != nil {
return
}
if strings.HasPrefix(mediaType, "multipart/") {
var multiparts []io.Reader
var multipartHeaders []textproto.MIMEHeader
if multiparts, multipartHeaders, err = GetMultipartParts(part, params); err != nil {
return
}
if strings.Contains(mediaType, "alternative") {
var chosenPart io.Reader
var chosenHeader textproto.MIMEHeader
if chosenPart, chosenHeader, err = pickAlternativePart(multiparts, multipartHeaders); err != nil {
return
}
var childParts []io.Reader
var childHeaders []textproto.MIMEHeader
if childParts, childHeaders, err = GetAllChildParts(chosenPart, chosenHeader); err != nil {
return
}
parts = append(parts, childParts...)
headers = append(headers, childHeaders...)
} else {
for i, p := range multiparts {
var childParts []io.Reader
var childHeaders []textproto.MIMEHeader
if childParts, childHeaders, err = GetAllChildParts(p, multipartHeaders[i]); err != nil {
return
}
parts = append(parts, childParts...)
headers = append(headers, childHeaders...)
}
}
} else {
parts = append(parts, part)
headers = append(headers, h)
}
return
}
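// GetMultipartParts splits a multipart body into its immediate sub-parts, returning a buffered reader and the header for each.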
func GetMultipartParts(r io.Reader, params map[string]string) (parts []io.Reader, headers []textproto.MIMEHeader, err error) {
mr := multipart.NewReader(r, params["boundary"])
parts = []io.Reader{}
headers = []textproto.MIMEHeader{}
var p *multipart.Part
for {
p, err = mr.NextPart()
if err == io.EOF {
err = nil
break
}
if err != nil {
return
}
b, _ := ioutil.ReadAll(p)
buffer := bytes.NewBuffer(b)
parts = append(parts, buffer)
headers = append(headers, p.Header)
}
return
}
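// pickAlternativePart chooses the preferred alternative: a nested multipart part first, then text/html, then text/plain; the returned part is nil when none match.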
func pickAlternativePart(parts []io.Reader, headers []textproto.MIMEHeader) (part io.Reader, h textproto.MIMEHeader, err error) {
for i, h := range headers {
mediaType, _, err := getContentType(h)
if err != nil {
continue
}
if strings.HasPrefix(mediaType, "multipart/") {
return parts[i], headers[i], nil
}
}
for i, h := range headers {
mediaType, _, err := getContentType(h)
if err != nil {
continue
}
if mediaType == "text/html" {
return parts[i], headers[i], nil
}
}
for i, h := range headers {
mediaType, _, err := getContentType(h)
if err != nil {
continue
}
if mediaType == "text/plain" {
return parts[i], headers[i], nil
}
}
// If we get all the way here, part will be nil.
return
}
// "Parse address comment" as defined in http://tools.wordtothewise.com/rfc/822
// FIXME: Does not work for address groups.
// NOTE: This should be removed for go>1.10 (please check).
func parseAddressComment(raw string) string {
parsed := []string{}
for _, item := range regexp.MustCompile("[,;]").Split(raw, -1) {
re := regexp.MustCompile("[(][^)]*[)]")
comments := strings.Join(re.FindAllString(item, -1), " ")
comments = strings.Replace(comments, "(", "", -1)
comments = strings.Replace(comments, ")", "", -1)
withoutComments := re.ReplaceAllString(item, "")
addr, err := mail.ParseAddress(withoutComments)
if err != nil {
continue
}
if addr.Name == "" {
addr.Name = comments
}
parsed = append(parsed, addr.String())
}
return strings.Join(parsed, ", ")
}
func checkHeaders(headers []textproto.MIMEHeader) bool {
foundAttachment := false
for i := 0; i < len(headers); i++ {
h := headers[i]
mediaType, _, _ := getContentType(h)
if !strings.HasPrefix(mediaType, "text/") {
foundAttachment = true
} else if foundAttachment {
// This means that there is a text part after the first attachment,
// so we will have to convert the body from plain->HTML.
return true
}
}
return false
}
func decodePart(partReader io.Reader, header textproto.MIMEHeader) (decodedPart io.Reader) {
decodedPart = DecodeContentEncoding(partReader, header.Get("Content-Transfer-Encoding"))
if decodedPart == nil {
log.Warnf("Unsupported Content-Transfer-Encoding '%v'", header.Get("Content-Transfer-Encoding"))
decodedPart = partReader
}
return
}
// getContentType parses the part's Content-Type header, assuming 'text/plain' when it is missing.
func getContentType(header textproto.MIMEHeader) (mediatype string, params map[string]string, err error) {
contentType := header.Get("Content-Type")
if contentType == "" {
contentType = "text/plain"
}
return mime.ParseMediaType(contentType)
}
// ===================== MIME Printer ===================================
// Simply print resulting MIME tree into text form.
// TODO move this to file mime_printer.go.
type stack []string
func (s stack) Push(v string) stack {
return append(s, v)
}
func (s stack) Pop() (stack, string) {
l := len(s)
return s[:l-1], s[l-1]
}
func (s stack) Peek() string {
return s[len(s)-1]
}
type MIMEPrinter struct {
result *bytes.Buffer
boundaryStack stack
}
func NewMIMEPrinter() (pd *MIMEPrinter) {
return &MIMEPrinter{
result: bytes.NewBuffer([]byte("")),
boundaryStack: stack{},
}
}
func (pd *MIMEPrinter) Accept(partReader io.Reader, header textproto.MIMEHeader, hasPlainSibling bool, isFirst, isLast bool) (err error) {
if isFirst {
http.Header(header).Write(pd.result)
pd.result.Write([]byte("\n"))
if IsLeaf(header) {
pd.result.ReadFrom(partReader)
} else {
_, params, _ := getContentType(header)
boundary := params["boundary"]
pd.boundaryStack = pd.boundaryStack.Push(boundary)
pd.result.Write([]byte("\nThis is a multi-part message in MIME format.\n--" + boundary + "\n"))
}
} else {
if !isLast {
pd.result.Write([]byte("\n--" + pd.boundaryStack.Peek() + "\n"))
} else {
var boundary string
pd.boundaryStack, boundary = pd.boundaryStack.Pop()
pd.result.Write([]byte("\n--" + boundary + "--\n.\n"))
}
}
return nil
}
func (pd *MIMEPrinter) String() string {
return pd.result.String()
}
// ======================== PlainText Collector =========================
// Collect contents of all non-attachment text/plain parts and return it as a string.
// TODO move this to file collector_plaintext.go.
type PlainTextCollector struct {
target VisitAcceptor
plainTextContents *bytes.Buffer
}
func NewPlainTextCollector(targetAccepter VisitAcceptor) *PlainTextCollector {
return &PlainTextCollector{
target: targetAccepter,
plainTextContents: bytes.NewBuffer([]byte("")),
}
}
func (ptc *PlainTextCollector) Accept(partReader io.Reader, header textproto.MIMEHeader, hasPlainSibling bool, isFirst, isLast bool) (err error) {
if isFirst {
if IsLeaf(header) {
mediaType, params, _ := getContentType(header)
disp, _, _ := mime.ParseMediaType(header.Get("Content-Disposition"))
if mediaType == "text/plain" && disp != "attachment" {
partData, _ := ioutil.ReadAll(partReader)
decodedPart := decodePart(bytes.NewReader(partData), header)
if buffer, err := ioutil.ReadAll(decodedPart); err == nil {
buffer, err = DecodeCharset(buffer, params)
if err != nil {
log.Warnln("Decode charset error:", err)
return err
}
ptc.plainTextContents.Write(buffer)
}
err = ptc.target.Accept(bytes.NewReader(partData), header, hasPlainSibling, isFirst, isLast)
return
}
}
}
err = ptc.target.Accept(partReader, header, hasPlainSibling, isFirst, isLast)
return
}
func (ptc PlainTextCollector) GetPlainText() string {
return ptc.plainTextContents.String()
}
// ======================== Body Collector ==============
// Collect contents of all non-attachment parts and return it as a string.
// TODO move this to file collector_body.go.
type BodyCollector struct {
target VisitAcceptor
htmlBodyBuffer *bytes.Buffer
plainBodyBuffer *bytes.Buffer
htmlHeaderBuffer *bytes.Buffer
plainHeaderBuffer *bytes.Buffer
hasHtml bool
}
func NewBodyCollector(targetAccepter VisitAcceptor) *BodyCollector {
return &BodyCollector{
target: targetAccepter,
htmlBodyBuffer: bytes.NewBuffer([]byte("")),
plainBodyBuffer: bytes.NewBuffer([]byte("")),
htmlHeaderBuffer: bytes.NewBuffer([]byte("")),
plainHeaderBuffer: bytes.NewBuffer([]byte("")),
}
}
func (bc *BodyCollector) Accept(partReader io.Reader, header textproto.MIMEHeader, hasPlainSibling bool, isFirst, isLast bool) (err error) {
// TODO: Collect html and plaintext - if there's html with plain sibling don't include plain/text.
if isFirst {
if IsLeaf(header) {
mediaType, params, _ := getContentType(header)
disp, _, _ := mime.ParseMediaType(header.Get("Content-Disposition"))
if disp != "attachment" {
partData, _ := ioutil.ReadAll(partReader)
decodedPart := decodePart(bytes.NewReader(partData), header)
if buffer, err := ioutil.ReadAll(decodedPart); err == nil {
buffer, err = DecodeCharset(buffer, params)
if err != nil {
log.Warnln("Decode charset error:", err)
return err
}
if mediaType == "text/html" {
bc.hasHtml = true
http.Header(header).Write(bc.htmlHeaderBuffer)
bc.htmlBodyBuffer.Write(buffer)
} else if mediaType == "text/plain" {
http.Header(header).Write(bc.plainHeaderBuffer)
bc.plainBodyBuffer.Write(buffer)
}
}
err = bc.target.Accept(bytes.NewReader(partData), header, hasPlainSibling, isFirst, isLast)
return
}
}
}
err = bc.target.Accept(partReader, header, hasPlainSibling, isFirst, isLast)
return
}
func (bc *BodyCollector) GetBody() (string, string) {
if bc.hasHtml {
return bc.htmlBodyBuffer.String(), "text/html"
} else {
return bc.plainBodyBuffer.String(), "text/plain"
}
}
func (bc *BodyCollector) GetHeaders() string {
if bc.hasHtml {
return bc.htmlHeaderBuffer.String()
} else {
return bc.plainHeaderBuffer.String()
}
}
// ======================== Attachments Collector ==============
// Collect contents of all attachment parts and return them as a string.
// TODO move this to file collector_attachment.go.
type AttachmentsCollector struct {
target VisitAcceptor
attBuffers []string
attHeaders []string
}
func NewAttachmentsCollector(targetAccepter VisitAcceptor) *AttachmentsCollector {
return &AttachmentsCollector{
target: targetAccepter,
attBuffers: []string{},
attHeaders: []string{},
}
}
func (ac *AttachmentsCollector) Accept(partReader io.Reader, header textproto.MIMEHeader, hasPlainSibling bool, isFirst, isLast bool) (err error) {
if isFirst {
if IsLeaf(header) {
mediaType, params, _ := getContentType(header)
disp, _, _ := mime.ParseMediaType(header.Get("Content-Disposition"))
if (mediaType != "text/html" && mediaType != "text/plain") || disp == "attachment" {
partData, _ := ioutil.ReadAll(partReader)
decodedPart := decodePart(bytes.NewReader(partData), header)
if buffer, err := ioutil.ReadAll(decodedPart); err == nil {
buffer, err = DecodeCharset(buffer, params)
if err != nil {
log.Warnln("Decode charset error:", err)
return err
}
headerBuf := new(bytes.Buffer)
http.Header(header).Write(headerBuf)
ac.attHeaders = append(ac.attHeaders, headerBuf.String())
ac.attBuffers = append(ac.attBuffers, string(buffer))
}
err = ac.target.Accept(bytes.NewReader(partData), header, hasPlainSibling, isFirst, isLast)
return
}
}
}
err = ac.target.Accept(partReader, header, hasPlainSibling, isFirst, isLast)
return
}
func (ac AttachmentsCollector) GetAttachments() []string {
return ac.attBuffers
}
func (ac AttachmentsCollector) GetAttHeaders() []string {
return ac.attHeaders
}
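The collectors above are meant to be chained, each wrapping the next acceptor, with MimeVisitor driving the traversal. A minimal sketch of that wiring, following the same pattern as androidParse in parser_test.go below (the import path is an assumption):

package main

import (
    "bytes"
    "fmt"
    "io/ioutil"
    "net/mail"
    "net/textproto"
    "strings"

    pmmime "github.com/ProtonMail/proton-bridge/pkg/mime" // illustrative import path
)

func main() {
    raw := "From: a@b.c\r\nContent-Type: text/plain; charset=utf-8\r\n\r\nhello\r\n"

    m, err := mail.ReadMessage(strings.NewReader(raw))
    if err != nil {
        panic(err)
    }
    body, _ := ioutil.ReadAll(m.Body)

    // Chain the acceptors: printer <- body collector <- attachments collector <- visitor.
    printer := pmmime.NewMIMEPrinter()
    bodyCollector := pmmime.NewBodyCollector(printer)
    attCollector := pmmime.NewAttachmentsCollector(bodyCollector)
    visitor := pmmime.NewMimeVisitor(attCollector)

    if err := pmmime.VisitAll(bytes.NewReader(body), textproto.MIMEHeader(m.Header), visitor); err != nil {
        panic(err)
    }

    text, contentType := bodyCollector.GetBody()
    fmt.Println(contentType)                        // text/plain
    fmt.Println(text)                               // hello (plus the trailing CRLF from the raw body)
    fmt.Println(len(attCollector.GetAttachments())) // 0 attachments in this message
}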

pkg/mime/parser_test.go Normal file

@@ -0,0 +1,228 @@
// Copyright (c) 2020 Proton Technologies AG
//
// This file is part of ProtonMail Bridge.
//
// ProtonMail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ProtonMail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ProtonMail Bridge. If not, see <https://www.gnu.org/licenses/>.
package pmmime
import (
"bytes"
"fmt"
"io/ioutil"
"net/mail"
"net/textproto"
"strings"
"testing"
)
func minimalParse(mimeBody string) (readBody string, plainContents string, err error) {
mm, err := mail.ReadMessage(strings.NewReader(mimeBody))
if err != nil {
return
}
h := textproto.MIMEHeader(mm.Header)
mmBodyData, err := ioutil.ReadAll(mm.Body)
if err != nil {
return
}
printAccepter := NewMIMEPrinter()
plainTextCollector := NewPlainTextCollector(printAccepter)
visitor := NewMimeVisitor(plainTextCollector)
err = VisitAll(bytes.NewReader(mmBodyData), h, visitor)
readBody = printAccepter.String()
plainContents = plainTextCollector.GetPlainText()
return readBody, plainContents, err
}
func androidParse(mimeBody string) (body, headers string, atts, attHeaders []string, err error) {
mm, err := mail.ReadMessage(strings.NewReader(mimeBody))
if err != nil {
return
}
h := textproto.MIMEHeader(mm.Header)
mmBodyData, err := ioutil.ReadAll(mm.Body)
printAccepter := NewMIMEPrinter()
bodyCollector := NewBodyCollector(printAccepter)
attachmentsCollector := NewAttachmentsCollector(bodyCollector)
mimeVisitor := NewMimeVisitor(attachmentsCollector)
err = VisitAll(bytes.NewReader(mmBodyData), h, mimeVisitor)
body, _ = bodyCollector.GetBody()
headers = bodyCollector.GetHeaders()
atts = attachmentsCollector.GetAttachments()
attHeaders = attachmentsCollector.GetAttHeaders()
return
}
func TestParseBoundaryIsEmpty(t *testing.T) {
testMessage :=
`Date: Sun, 10 Mar 2019 11:10:06 -0600
In-Reply-To: <abcbase64@protonmail.com>
X-Original-To: enterprise@protonmail.com
References: <abc64@unicoderns.com> <abc63@protonmail.com> <abc64@protonmail.com> <abc65@mail.gmail.com> <abc66@protonmail.com>
To: "ProtonMail" <enterprise@protonmail.com>
X-Pm-Origin: external
Delivered-To: enterprise@protonmail.com
Content-Type: multipart/mixed; boundary=ac7e36bd45425e70b4dab2128f34172e4dc3f9ff2eeb47e909267d4252794ec7
Reply-To: XYZ <xyz@xyz.com>
Mime-Version: 1.0
Subject: Encrypted Message
Return-Path: <xyz@xyz.com>
From: XYZ <xyz@xyz.com>
X-Pm-Conversationid-Id: gNX9bDPLmBgFZ-C3Tdlb628cas1Xl0m4dql5nsWzQAEI-WQv0ytfwPR4-PWELEK0_87XuFOgetc239Y0pjPYHQ==
X-Pm-Date: Sun, 10 Mar 2019 18:10:06 +0100
Message-Id: <68c11e46-e611-d9e4-edc1-5ec96bac77cc@unicoderns.com>
X-Pm-Transfer-Encryption: TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)
X-Pm-External-Id: <68c11e46-e611-d9e4-edc1-5ec96bac77cc@unicoderns.com>
X-Pm-Internal-Id: _iJ8ETxcqXTSK8IzCn0qFpMUTwvRf-xJUtldRA1f6yHdmXjXzKleG3F_NLjZL3FvIWVHoItTxOuuVXcukwwW3g==
Openpgp: preference=signencrypt
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Thunderbird/60.4.0
X-Pm-Content-Encryption: end-to-end
--ac7e36bd45425e70b4dab2128f34172e4dc3f9ff2eeb47e909267d4252794ec7
Content-Disposition: inline
Content-Transfer-Encoding: quoted-printable
Content-Type: multipart/mixed; charset=utf-8
Content-Type: multipart/mixed; boundary="xnAIW3Turb9YQZ2rXc2ZGZH45WepHIZyy";
protected-headers="v1"
From: XYZ <xyz@xyz.com>
To: "ProtonMail" <enterprise@protonmail.com>
Subject: Encrypted Message
Message-ID: <68c11e46-e611-d9e4-edc1-5ec96bac77cc@unicoderns.com>
References: <abc64@unicoderns.com> <abc63@protonmail.com> <abc64@protonmail.com> <abc65@mail.gmail.com> <abc66@protonmail.com>
In-Reply-To: <abcbase64@protonmail.com>
--xnAIW3Turb9YQZ2rXc2ZGZH45WepHIZyy
Content-Type: text/rfc822-headers; protected-headers="v1"
Content-Disposition: inline
From: XYZ <xyz@xyz.com>
To: ProtonMail <enterprise@protonmail.com>
Subject: Re: Encrypted Message
--xnAIW3Turb9YQZ2rXc2ZGZH45WepHIZyy
Content-Type: multipart/alternative;
boundary="------------F9E5AA6D49692F51484075E3"
Content-Language: en-US
This is a multi-part message in MIME format.
--------------F9E5AA6D49692F51484075E3
Content-Type: text/plain; charset=utf-8
Content-Transfer-Encoding: quoted-printable
Hi ...
--------------F9E5AA6D49692F51484075E3
Content-Type: text/html; charset=utf-8
Content-Transfer-Encoding: quoted-printable
<html>
<head>
</head>
<body text=3D"#000000" bgcolor=3D"#FFFFFF">
<p>Hi .. </p>
</body>
</html>
--------------F9E5AA6D49692F51484075E3--
--xnAIW3Turb9YQZ2rXc2ZGZH45WepHIZyy--
--ac7e36bd45425e70b4dab2128f34172e4dc3f9ff2eeb47e909267d4252794ec7--
`
body, content, err := minimalParse(testMessage)
if err == nil {
t.Fatal("should have error but is", err)
}
t.Log("==BODY==")
t.Log(body)
t.Log("==CONTENT==")
t.Log(content)
}
func TestParse(t *testing.T) {
testMessage :=
`From: John Doe <example@example.com>
MIME-Version: 1.0
Content-Type: multipart/mixed;
boundary="XXXXboundary text"
This is a multipart message in MIME format.
--XXXXboundary text
Content-Type: text/plain; charset=utf-8
this is the body text
--XXXXboundary text
Content-Type: text/html; charset=utf-8
<html><body>this is the html body text</body></html>
--XXXXboundary text
Content-Type: text/plain; charset=utf-8
Content-Disposition: attachment;
filename="test.txt"
this is the attachment text
--XXXXboundary text--
`
body, heads, att, attHeads, err := androidParse(testMessage)
if err != nil {
t.Error("parse error", err)
}
fmt.Println("==BODY:")
fmt.Println(body)
fmt.Println("==BODY HEADERS:")
fmt.Println(heads)
fmt.Println("==ATTACHMENTS:")
fmt.Println(att)
fmt.Println("==ATTACHMENT HEADERS:")
fmt.Println(attHeads)
}
func TestParseAddressComment(t *testing.T) {
parsingExamples := map[string]string{
"": "",
"(Only Comment) here@pm.me": "\"Only Comment\" <here@pm.me>",
"Normal Name (With Comment) <here@pm.me>": "\"Normal Name\" <here@pm.me>",
"<Muhammed.(I am the greatest)Ali@(the)Vegas.WBA>": "\"I am the greatest the\" <Muhammed.Ali@Vegas.WBA>",
}
for raw, expected := range parsingExamples {
parsed := parseAddressComment(raw)
if expected != parsed {
t.Errorf("When parsing %q expected %q but have %q", raw, expected, parsed)
}
}
}

pkg/mime/utf7Decoder.go Normal file

@@ -0,0 +1,188 @@
// Copyright (c) 2020 Proton Technologies AG
//
// This file is part of ProtonMail Bridge.
//
// ProtonMail Bridge is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// ProtonMail Bridge is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with ProtonMail Bridge. If not, see <https://www.gnu.org/licenses/>.
package pmmime
import (
"encoding/base64"
"errors"
"unicode/utf16"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/transform"
)
// utf7Decoder copied from: https://github.com/cention-sany/utf7/blob/master/utf7.go
// We need `encoding.Decoder` instead of function `UTF7DecodeBytes`.
type utf7Decoder struct {
transform.NopResetter
}
// NewUtf7Decoder returns a new decoder for utf7.
func NewUtf7Decoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: utf7Decoder{}}
}
const (
uRepl = '\uFFFD' // Unicode replacement code point
u7min = 0x20 // Minimum self-representing UTF-7 value
u7max = 0x7E // Maximum self-representing UTF-7 value
)
// ErrBadUTF7 is returned to indicate invalid modified UTF-7 encoding.
var ErrBadUTF7 = errors.New("utf7: bad utf-7 encoding")
const modifiedbase64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
var u7enc = base64.NewEncoding(modifiedbase64)
func isModifiedBase64(r byte) bool {
if r >= 'A' && r <= 'Z' {
return true
} else if r >= 'a' && r <= 'z' {
return true
} else if r >= '0' && r <= '9' {
return true
} else if r == '+' || r == '/' {
return true
}
return false
}
func (d utf7Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
var implicit bool
var tmp int
nd, n := len(dst), len(src)
if n == 0 && !atEOF {
return 0, 0, transform.ErrShortSrc
}
for ; nSrc < n; nSrc++ {
if nDst >= nd {
return nDst, nSrc, transform.ErrShortDst
}
if c := src[nSrc]; ((c < u7min || c > u7max) &&
c != '\t' && c != '\r' && c != '\n') ||
c == '~' || c == '\\' {
return nDst, nSrc, ErrBadUTF7 // Illegal code point in ASCII mode.
} else if c != '+' {
dst[nDst] = c // Character is self-representing.
nDst++
continue
}
// Found '+'.
start := nSrc + 1
tmp = nSrc // nSrc still points to '+', tmp points to the end of BASE64.
// Find the end of the Base64 or "+-" segment.
implicit = false
for tmp++; tmp < n && src[tmp] != '-'; tmp++ {
if !isModifiedBase64(src[tmp]) {
if tmp == start {
return nDst, tmp, ErrBadUTF7 // The character after '+' must be modified Base64.
}
// Implicit shift back to ASCII, no need for '-' character.
implicit = true
break
}
}
if tmp == start {
if tmp == n {
// Did not find '-' sign and '+' is the last character.
// Total nSrc does not include '+'.
if atEOF {
return nDst, nSrc, ErrBadUTF7 // '+' can not be at the end.
}
// '+' can not be at the end, the source is too short.
return nDst, nSrc, transform.ErrShortSrc
}
dst[nDst] = '+' // Escape sequence "+-".
nDst++
} else if tmp == n && !atEOF {
// No EOF found, the source is too short.
return nDst, nSrc, transform.ErrShortSrc
} else if b := utf7dec(src[start:tmp]); len(b) > 0 {
if len(b)+nDst > nd {
// Need more space in dst for the decoded modified BASE64 unicode.
// Total nSrc does not include '+'.
return nDst, nSrc, transform.ErrShortDst
}
copy(dst[nDst:], b) // Control or non-ASCII code points in Base64.
nDst += len(b)
if implicit {
if nDst >= nd {
return nDst, tmp, transform.ErrShortDst
}
dst[nDst] = src[tmp] // Implicit shift.
nDst++
}
if tmp == n {
return nDst, tmp, nil
}
} else {
return nDst, nSrc, ErrBadUTF7 // Bad encoding.
}
nSrc = tmp
}
return
}
// utf7dec extracts UTF-16-BE bytes from Base64 data and converts them to UTF-8.
// A nil slice is returned if the encoding is invalid.
func utf7dec(b64 []byte) []byte {
var b []byte
// Allocate a single block of memory large enough to store the Base64 data
// (if padding is required), UTF-16-BE bytes, and decoded UTF-8 bytes.
// Since a 2-byte UTF-16 sequence may expand into a 3-byte UTF-8 sequence,
// double the space allocation for UTF-8.
if n := len(b64); b64[n-1] == '=' {
return nil
} else if n&3 == 0 {
b = make([]byte, u7enc.DecodedLen(n)*3)
} else {
n += 4 - n&3
b = make([]byte, n+u7enc.DecodedLen(n)*3)
copy(b[copy(b, b64):n], []byte("=="))
b64, b = b[:n], b[n:]
}
// Decode Base64 into the first 1/3rd of b.
n, err := u7enc.Decode(b, b64)
if err != nil || n&1 == 1 {
return nil
}
// Decode UTF-16-BE into the remaining 2/3rds of b.
b, s := b[:n], b[n:]
j := 0
for i := 0; i < n; i += 2 {
r := rune(b[i])<<8 | rune(b[i+1])
if utf16.IsSurrogate(r) {
if i += 2; i == n {
return nil
}
r2 := rune(b[i])<<8 | rune(b[i+1])
if r = utf16.DecodeRune(r, r2); r == uRepl {
return nil
}
}
j += utf8.EncodeRune(s[j:], r)
}
return s[:j]
}
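The decoder plugs into the golang.org/x/text transform machinery, so it can be used directly through encoding.Decoder. A small sketch, assuming the package import path; the sample string is a fragment of the UTF-7 case in encoding_test.go:

package main

import (
    "fmt"

    pmmime "github.com/ProtonMail/proton-bridge/pkg/mime" // illustrative import path
)

func main() {
    dec := pmmime.NewUtf7Decoder()

    // "+APA-" is the modified-Base64 form of U+00F0 (ð), as in encoding_test.go.
    out, err := dec.String("He wes Leovena+APA-es sone")
    fmt.Println(out, err) // He wes Leovenaðes sone <nil>
}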