diff --git a/go.mod b/go.mod index 4a01ce28..a69ba058 100644 --- a/go.mod +++ b/go.mod @@ -19,6 +19,7 @@ require ( github.com/ProtonMail/go-imap-id v0.0.0-20190926060100-f94a56b9ecde github.com/ProtonMail/go-vcard v0.0.0-20180326232728-33aaa0a0c8a5 github.com/ProtonMail/gopenpgp/v2 v2.0.1 + github.com/PuerkitoBio/goquery v1.5.1 github.com/abiosoft/ishell v2.0.0+incompatible github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db // indirect github.com/allan-simon/go-singleinstance v0.0.0-20160830203053-79edcfdc2dfc diff --git a/go.sum b/go.sum index 36797887..4d928e97 100644 --- a/go.sum +++ b/go.sum @@ -23,12 +23,16 @@ github.com/ProtonMail/go-vcard v0.0.0-20180326232728-33aaa0a0c8a5 h1:Uga1DHFN4GU github.com/ProtonMail/go-vcard v0.0.0-20180326232728-33aaa0a0c8a5/go.mod h1:oeP9CMN+ajWp5jKp1kue5daJNwMMxLF+ujPaUIoJWlA= github.com/ProtonMail/gopenpgp/v2 v2.0.1 h1:x0uvDhry5WzoHeJO4J3dgMLhG4Z9PeBJ2O+sDOY0LcU= github.com/ProtonMail/gopenpgp/v2 v2.0.1/go.mod h1:wQQCJo7DURO6S9VwH+kSDEYs/B63yZnAEfGlOg8YNBY= +github.com/PuerkitoBio/goquery v1.5.1 h1:PSPBGne8NIUWw+/7vFBV+kG2J/5MOjbzc7154OaKCSE= +github.com/PuerkitoBio/goquery v1.5.1/go.mod h1:GsLWisAFVj4WgDibEWF4pvYnkVQBpKBKeU+7zCJoLcc= github.com/abiosoft/ishell v2.0.0+incompatible h1:zpwIuEHc37EzrsIYah3cpevrIc8Oma7oZPxr03tlmmw= github.com/abiosoft/ishell v2.0.0+incompatible/go.mod h1:HQR9AqF2R3P4XXpMpI0NAzgHf/aS6+zVXRj14cVk9qg= github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db h1:CjPUSXOiYptLbTdr1RceuZgSFDQ7U15ITERUGrUORx8= github.com/abiosoft/readline v0.0.0-20180607040430-155bce2042db/go.mod h1:rB3B4rKii8V21ydCbIzH5hZiCQE7f5E9SzUb/ZZx530= github.com/allan-simon/go-singleinstance v0.0.0-20160830203053-79edcfdc2dfc h1:mZca0/HZ/XWXP9txkfdl2GH6mUzBqAlyJz3u5Lg8fuA= github.com/allan-simon/go-singleinstance v0.0.0-20160830203053-79edcfdc2dfc/go.mod h1:qqsTQiwdyqxU05iDCsi0oN3P4nrVxAmn8xCtODDSf/U= +github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5zzsLTo= 
+github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/certifi/gocertifi v0.0.0-20200211180108-c7c1fbc02894 h1:JLaf/iINcLyjwbtTsCJjc6rtlASgHeIJPrB6QmwURnA= github.com/certifi/gocertifi v0.0.0-20200211180108-c7c1fbc02894/go.mod h1:sGbDF6GwGcLpkNXPUTkMRoywsNa/ol15pxFe6ERfguA= github.com/chzyer/logex v1.1.10 h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE= @@ -165,11 +169,13 @@ github.com/urfave/cli v1.22.4/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtX go.etcd.io/bbolt v1.3.5 h1:XAzx9gjCb0Rxj7EoqcClPD1d5ZBxZJk0jbuoPHenBt0= go.etcd.io/bbolt v1.3.5/go.mod h1:G5EMThwa9y8QZGBClrRx5EY+Yw9kAhnjy3bSjsnlVTQ= golang.org/x/mod v0.1.1-0.20191105210325-c90efee705ee/go.mod h1:QqPTAvyqsEbceGzBzNggFXnrqF1CaUcvgkdR5Ot7KZg= +golang.org/x/net v0.0.0-20180218175443-cbe0f9307d01/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190420063019-afa5a82059c6/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190923162816-aa69164e4478/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200513185701-a91f0712d120/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/net v0.0.0-20200707034311-ab3426394381 h1:VXak5I6aEWmAXeQjA+QSZzlgNrpq9mjcfDemuexIKsU= golang.org/x/net v0.0.0-20200707034311-ab3426394381/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA= diff --git a/pkg/message/parser.go b/pkg/message/parser.go index 025b2927..782d2887 100644 --- a/pkg/message/parser.go +++ b/pkg/message/parser.go @@ -93,6 +93,13 @@ 
func convertForeignEncodings(p *parser.Parser) error { // HELP: Is it correct to only do this to text types? return p.NewWalker(). + RegisterContentTypeHandler("text/html", func(p *parser.Part) error { + if err := p.ConvertToUTF8(); err != nil { + return err + } + + return p.ConvertMetaCharset() + }). RegisterContentTypeHandler("text/.*", func(p *parser.Part) error { return p.ConvertToUTF8() }). diff --git a/pkg/message/parser/part.go b/pkg/message/parser/part.go index 3f8d7e27..6b500565 100644 --- a/pkg/message/parser/part.go +++ b/pkg/message/parser/part.go @@ -18,12 +18,16 @@ package parser import ( + "bytes" "errors" + "mime" "unicode/utf8" pmmime "github.com/ProtonMail/proton-bridge/pkg/mime" + "github.com/PuerkitoBio/goquery" "github.com/emersion/go-message" "github.com/sirupsen/logrus" + "golang.org/x/net/html" "golang.org/x/net/html/charset" "golang.org/x/text/encoding" ) @@ -79,18 +83,51 @@ func (p *Part) ConvertToUTF8() error { return err } - // HELP: Is this okay? What about when the charset is embedded in structured text type eg html/xml? 
if params == nil { params = make(map[string]string) } - params["charset"] = "utf-8" + params["charset"] = "UTF-8" p.Header.SetContentType(t, params) return nil }
+// ConvertMetaCharset rewrites the charset declared by the part's HTML <meta>
+// tags to UTF-8 and stores the re-rendered document in p.Body. It returns any
+// error from parsing or rendering the HTML.
+func (p *Part) ConvertMetaCharset() error {
+	doc, err := html.Parse(bytes.NewReader(p.Body))
+	if err != nil {
+		return err
+	}
+
+	goquery.NewDocumentFromNode(doc).Find("meta").Each(func(_ int, sel *goquery.Selection) {
+		if _, ok := sel.Attr("charset"); ok {
+			sel.SetAttr("charset", "UTF-8")
+		}
+
+		// Only the Content-Type pragma declares a charset in "content"; other
+		// metas (author, robots, ...) carry free text that must stay untouched.
+		val, ok := sel.Attr("content")
+		if equiv, _ := sel.Attr("http-equiv"); !ok || !bytes.EqualFold([]byte(equiv), []byte("content-type")) {
+			return
+		}
+		if t, params, err := mime.ParseMediaType(val); err == nil {
+			params["charset"] = "UTF-8"
+			sel.SetAttr("content", mime.FormatMediaType(t, params))
+		}
+	})
+
+	buf := new(bytes.Buffer)
+	if err := html.Render(buf, doc); err != nil {
+		return err
+	}
+	p.Body = buf.Bytes()
+	return nil
+}
+
func selectSuitableDecoder(p *Part, t string, params map[string]string) *encoding.Decoder { if charset, ok := params["charset"]; ok { logrus.WithField("charset", charset).Debug("The part has a specified charset") diff --git a/pkg/message/parser_test.go b/pkg/message/parser_test.go index 27dea6db..51077df8 100644 --- a/pkg/message/parser_test.go +++ b/pkg/message/parser_test.go @@ -268,7 +268,7 @@ func TestParseTextHTML(t *testing.T) { assert.Equal(t, `"Sender" `, m.Sender.String()) assert.Equal(t, `"Receiver" `, m.ToList[0].String()) - assert.Equal(t, "This is body of HTML mail without attachment", m.Body) + assert.Equal(t, "This is body of HTML mail without attachment", m.Body) assert.Equal(t, "This is body of *HTML mail* without attachment", plainBody) assert.Len(t, attReaders, 0) @@ -283,7 +283,7 @@ func TestParseTextHTMLAlready7Bit(t *testing.T) { assert.Equal(t, `"Sender" `, m.Sender.String()) assert.Equal(t, `"Receiver" `, m.ToList[0].String()) - assert.Equal(t, "This is body of HTML mail without attachment", m.Body) + assert.Equal(t, "This is body of HTML mail without attachment", m.Body) assert.Equal(t, "This is body of *HTML mail* without 
attachment", plainBody) assert.Len(t, attReaders, 0) @@ -298,7 +298,7 @@ func TestParseTextHTMLWithOctetAttachment(t *testing.T) { assert.Equal(t, `"Sender" `, m.Sender.String()) assert.Equal(t, `"Receiver" `, m.ToList[0].String()) - assert.Equal(t, "This is body of HTML mail with attachment", m.Body) + assert.Equal(t, "This is body of HTML mail with attachment", m.Body) assert.Equal(t, "This is body of *HTML mail* with attachment", plainBody) require.Len(t, attReaders, 1) @@ -315,7 +315,7 @@ func TestParseTextHTMLWithPlainAttachment(t *testing.T) { assert.Equal(t, `"Receiver" `, m.ToList[0].String()) // BAD: plainBody should not be empty! - assert.Equal(t, "This is body of HTML mail with attachment", m.Body) + assert.Equal(t, "This is body of HTML mail with attachment", m.Body) assert.Equal(t, "This is body of *HTML mail* with attachment", plainBody) require.Len(t, attReaders, 1) @@ -331,7 +331,7 @@ func TestParseTextHTMLWithImageInline(t *testing.T) { assert.Equal(t, `"Sender" `, m.Sender.String()) assert.Equal(t, `"Receiver" `, m.ToList[0].String()) - assert.Equal(t, "This is body of HTML mail with attachment", m.Body) + assert.Equal(t, "This is body of HTML mail with attachment", m.Body) assert.Equal(t, "This is body of *HTML mail* with attachment", plainBody) // The inline image is an 8x8 mic-dropping gopher. @@ -368,8 +368,7 @@ func TestParseTextHTMLWithEmbeddedForeignEncoding(t *testing.T) { assert.Equal(t, `"Sender" `, m.Sender.String()) assert.Equal(t, `"Receiver" `, m.ToList[0].String()) - // BAD: Bridge does not detect the charset specified in the tag of the html. 
- assert.Equal(t, `latin2 řšřš`, m.Body) + assert.Equal(t, `latin2 řšřš`, m.Body) assert.Equal(t, `latin2 řšřš`, plainBody) assert.Len(t, attReaders, 0) @@ -384,15 +383,14 @@ func TestParseMultipartAlternative(t *testing.T) { assert.Equal(t, `"schizofrenic" `, m.Sender.String()) assert.Equal(t, ``, m.ToList[0].String()) - assert.Equal(t, ` - - + assert.Equal(t, ` + aoeuaoeu - - -`, m.Body) + + +`, m.Body) assert.Equal(t, "*aoeuaoeu*\n\n", plainBody) } @@ -406,15 +404,14 @@ func TestParseMultipartAlternativeNested(t *testing.T) { assert.Equal(t, `"schizofrenic" `, m.Sender.String()) assert.Equal(t, ``, m.ToList[0].String()) - assert.Equal(t, ` - - + assert.Equal(t, ` + multipart 2.2 - - -`, m.Body) + + +`, m.Body) assert.Equal(t, "*multipart 2.1*\n\n", plainBody) }