feat: convert content type in html meta tags

This commit is contained in:
James Houlahan
2020-09-04 12:52:07 +02:00
parent afeed4a801
commit 9b3cc9dc34
5 changed files with 69 additions and 21 deletions

View File

@ -93,6 +93,13 @@ func convertForeignEncodings(p *parser.Parser) error {
// HELP: Is it correct to only do this to text types?
return p.NewWalker().
RegisterContentTypeHandler("text/html", func(p *parser.Part) error {
if err := p.ConvertToUTF8(); err != nil {
return err
}
return p.ConvertMetaCharset()
}).
RegisterContentTypeHandler("text/.*", func(p *parser.Part) error {
return p.ConvertToUTF8()
}).

View File

@ -18,12 +18,16 @@
package parser
import (
	"bytes"
	"errors"
	"mime"
	"strings"
	"unicode/utf8"

	pmmime "github.com/ProtonMail/proton-bridge/pkg/mime"
	"github.com/PuerkitoBio/goquery"
	"github.com/emersion/go-message"
	"github.com/sirupsen/logrus"
	"golang.org/x/net/html"
	"golang.org/x/net/html/charset"
	"golang.org/x/text/encoding"
)
@ -79,18 +83,51 @@ func (p *Part) ConvertToUTF8() error {
return err
}
// HELP: Is this okay? What about when the charset is embedded in structured text type eg html/xml?
if params == nil {
params = make(map[string]string)
}
params["charset"] = "utf-8"
params["charset"] = "UTF-8"
p.Header.SetContentType(t, params)
return nil
}
// ConvertMetaCharset relabels the charset declarations found in the <meta>
// elements of the part's HTML body as UTF-8.
//
// Two forms of declaration are rewritten:
//   - <meta charset="...">: the attribute value is set to "UTF-8";
//   - <meta http-equiv="content-type" content="...">: the charset parameter
//     of the content value is set to "UTF-8".
//
// Any other <meta> element (name="author", name="viewport", ...) is left
// untouched, even if its content attribute happens to parse as a media type.
//
// This method only changes the declarations; it does not transcode p.Body.
// The body is parsed with html.Parse and re-serialized with html.Render, so
// the resulting markup may differ from the input beyond the meta tags.
//
// It returns an error if the body cannot be parsed or rendered; in that case
// p.Body is left unmodified.
func (p *Part) ConvertMetaCharset() error {
	doc, err := html.Parse(bytes.NewReader(p.Body))
	if err != nil {
		return err
	}

	goquery.NewDocumentFromNode(doc).Find("meta").Each(func(_ int, sel *goquery.Selection) {
		// Short form: <meta charset="...">.
		if _, ok := sel.Attr("charset"); ok {
			sel.SetAttr("charset", "UTF-8")
		}

		// Long form: only the content-type pragma carries a charset. Without
		// this check, a bare-token content such as content="John" would be
		// accepted by mime.ParseMediaType and get "; charset=UTF-8" appended.
		equiv, ok := sel.Attr("http-equiv")
		if !ok || !strings.EqualFold(strings.TrimSpace(equiv), "content-type") {
			return
		}

		val, ok := sel.Attr("content")
		if !ok {
			return
		}

		t, params, err := mime.ParseMediaType(val)
		if err != nil {
			// Best effort: leave a malformed declaration as it is.
			return
		}

		params["charset"] = "UTF-8"

		// FormatMediaType returns "" when it cannot produce a valid value;
		// keep the original attribute rather than blanking it.
		if formatted := mime.FormatMediaType(t, params); formatted != "" {
			sel.SetAttr("content", formatted)
		}
	})

	var buf bytes.Buffer
	if err := html.Render(&buf, doc); err != nil {
		return err
	}

	p.Body = buf.Bytes()

	return nil
}
func selectSuitableDecoder(p *Part, t string, params map[string]string) *encoding.Decoder {
if charset, ok := params["charset"]; ok {
logrus.WithField("charset", charset).Debug("The part has a specified charset")

View File

@ -268,7 +268,7 @@ func TestParseTextHTML(t *testing.T) {
assert.Equal(t, `"Sender" <sender@pm.me>`, m.Sender.String())
assert.Equal(t, `"Receiver" <receiver@pm.me>`, m.ToList[0].String())
assert.Equal(t, "<html><body>This is body of <b>HTML mail</b> without attachment</body></html>", m.Body)
assert.Equal(t, "<html><head></head><body>This is body of <b>HTML mail</b> without attachment</body></html>", m.Body)
assert.Equal(t, "This is body of *HTML mail* without attachment", plainBody)
assert.Len(t, attReaders, 0)
@ -283,7 +283,7 @@ func TestParseTextHTMLAlready7Bit(t *testing.T) {
assert.Equal(t, `"Sender" <sender@pm.me>`, m.Sender.String())
assert.Equal(t, `"Receiver" <receiver@pm.me>`, m.ToList[0].String())
assert.Equal(t, "<html><body>This is body of <b>HTML mail</b> without attachment</body></html>", m.Body)
assert.Equal(t, "<html><head></head><body>This is body of <b>HTML mail</b> without attachment</body></html>", m.Body)
assert.Equal(t, "This is body of *HTML mail* without attachment", plainBody)
assert.Len(t, attReaders, 0)
@ -298,7 +298,7 @@ func TestParseTextHTMLWithOctetAttachment(t *testing.T) {
assert.Equal(t, `"Sender" <sender@pm.me>`, m.Sender.String())
assert.Equal(t, `"Receiver" <receiver@pm.me>`, m.ToList[0].String())
assert.Equal(t, "<html><body>This is body of <b>HTML mail</b> with attachment</body></html>", m.Body)
assert.Equal(t, "<html><head></head><body>This is body of <b>HTML mail</b> with attachment</body></html>", m.Body)
assert.Equal(t, "This is body of *HTML mail* with attachment", plainBody)
require.Len(t, attReaders, 1)
@ -315,7 +315,7 @@ func TestParseTextHTMLWithPlainAttachment(t *testing.T) {
assert.Equal(t, `"Receiver" <receiver@pm.me>`, m.ToList[0].String())
// BAD: plainBody should not be empty!
assert.Equal(t, "<html><body>This is body of <b>HTML mail</b> with attachment</body></html>", m.Body)
assert.Equal(t, "<html><head></head><body>This is body of <b>HTML mail</b> with attachment</body></html>", m.Body)
assert.Equal(t, "This is body of *HTML mail* with attachment", plainBody)
require.Len(t, attReaders, 1)
@ -331,7 +331,7 @@ func TestParseTextHTMLWithImageInline(t *testing.T) {
assert.Equal(t, `"Sender" <sender@pm.me>`, m.Sender.String())
assert.Equal(t, `"Receiver" <receiver@pm.me>`, m.ToList[0].String())
assert.Equal(t, "<html><body>This is body of <b>HTML mail</b> with attachment</body></html>", m.Body)
assert.Equal(t, "<html><head></head><body>This is body of <b>HTML mail</b> with attachment</body></html>", m.Body)
assert.Equal(t, "This is body of *HTML mail* with attachment", plainBody)
// The inline image is an 8x8 mic-dropping gopher.
@ -368,8 +368,7 @@ func TestParseTextHTMLWithEmbeddedForeignEncoding(t *testing.T) {
assert.Equal(t, `"Sender" <sender@pm.me>`, m.Sender.String())
assert.Equal(t, `"Receiver" <receiver@pm.me>`, m.ToList[0].String())
// BAD: Bridge does not detect the charset specified in the <meta> tag of the html.
assert.Equal(t, `<html><head><meta charset="ISO-8859-2"></head><body>latin2 řšřš</body></html>`, m.Body)
assert.Equal(t, `<html><head><meta charset="UTF-8"/></head><body>latin2 řšřš</body></html>`, m.Body)
assert.Equal(t, `latin2 řšřš`, plainBody)
assert.Len(t, attReaders, 0)
@ -384,15 +383,14 @@ func TestParseMultipartAlternative(t *testing.T) {
assert.Equal(t, `"schizofrenic" <schizofrenic@pm.me>`, m.Sender.String())
assert.Equal(t, `<pmbridgeietest@outlook.com>`, m.ToList[0].String())
assert.Equal(t, `<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
assert.Equal(t, `<html><head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
</head>
<body>
<b>aoeuaoeu</b>
</body>
</html>
`, m.Body)
</body></html>`, m.Body)
assert.Equal(t, "*aoeuaoeu*\n\n", plainBody)
}
@ -406,15 +404,14 @@ func TestParseMultipartAlternativeNested(t *testing.T) {
assert.Equal(t, `"schizofrenic" <schizofrenic@pm.me>`, m.Sender.String())
assert.Equal(t, `<pmbridgeietest@outlook.com>`, m.ToList[0].String())
assert.Equal(t, `<html>
<head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8">
assert.Equal(t, `<html><head>
<meta http-equiv="content-type" content="text/html; charset=UTF-8"/>
</head>
<body>
<b>multipart 2.2</b>
</body>
</html>
`, m.Body)
</body></html>`, m.Body)
assert.Equal(t, "*multipart 2.1*\n\n", plainBody)
}