feat: handle foreign encodings

This commit is contained in:
James Houlahan
2020-08-05 17:00:06 +02:00
parent f4374a02da
commit ea01c155da
11 changed files with 194 additions and 78 deletions

View File

@ -205,7 +205,7 @@ func joinChildParts(childParts []parser.Parts) parser.Parts {
func bestChoice(childParts []parser.Parts, preferredContentType string) (parser.Parts, error) {
// If one of the parts has preferred content type, use that.
for i := len(childParts) - 1; i >= 0; i-- {
if allHaveContentType(childParts[i], preferredContentType) {
if allPartsHaveContentType(childParts[i], preferredContentType) {
return childParts[i], nil
}
}
@ -214,7 +214,7 @@ func bestChoice(childParts []parser.Parts, preferredContentType string) (parser.
return childParts[len(childParts)-1], nil
}
func allHaveContentType(parts parser.Parts, contentType string) bool {
func allPartsHaveContentType(parts parser.Parts, contentType string) bool {
for _, part := range parts {
t, _, err := part.Header.ContentType()
if err != nil {
@ -272,23 +272,11 @@ func getPlainBody(part *parser.Part) []byte {
}
}
func writeMIMEMessage(p *parser.Parser) (mime string, err error) {
writer := p.
NewWriter().
WithCondition(func(p *parser.Part) (keep bool) {
disp, _, err := p.Header.ContentDisposition()
if err != nil {
return true
}
// TODO: Is it true that we don't want to write attachments? I thought this was for externals...
return disp != "attachment"
})
func writeMIMEMessage(p *parser.Parser) (string, error) {
buf := new(bytes.Buffer)
if err = writer.Write(buf); err != nil {
return
if err := p.NewWriter().Write(buf); err != nil {
return "", err
}
return buf.String(), nil

View File

@ -1,11 +1,11 @@
package parser
import (
"fmt"
"io"
"io/ioutil"
"github.com/emersion/go-message"
"github.com/sirupsen/logrus"
)
type Parser struct {
@ -13,14 +13,19 @@ type Parser struct {
root *Part
}
func New(r io.Reader) (p *Parser, err error) {
p = new(Parser)
func New(r io.Reader) (*Parser, error) {
p := new(Parser)
if err = p.parse(r); err != nil {
return
entity, err := message.Read(r)
if err != nil && !message.IsUnknownCharset(err) {
return nil, err
}
return
if err := p.parseEntity(entity); err != nil {
return nil, err
}
return p, nil
}
func (p *Parser) NewWalker() *Walker {
@ -51,32 +56,25 @@ func (p *Parser) Part(number []int) (part *Part, err error) {
return
}
func (p *Parser) parse(r io.Reader) error {
entity, err := message.Read(r)
if err != nil {
if !message.IsUnknownCharset(err) {
return err
} else {
fmt.Println(err)
}
}
return p.parseEntity(entity)
}
func (p *Parser) enter() {
func (p *Parser) beginPart() {
p.stack = append(p.stack, &Part{})
}
func (p *Parser) exit() {
var built *Part
func (p *Parser) endPart() {
var part *Part
p.stack, built = p.stack[:len(p.stack)-1], p.stack[len(p.stack)-1]
p.stack, part = p.stack[:len(p.stack)-1], p.stack[len(p.stack)-1]
if len(p.stack) > 0 {
p.top().children = append(p.top().children, built)
p.top().children = append(p.top().children, part)
} else {
p.root = built
p.root = part
}
if !part.isUTF8() {
if err := part.convertToUTF8(); err != nil {
logrus.WithError(err).Error("failed to convert part to utf-8")
}
}
}
@ -96,9 +94,9 @@ func (p *Parser) withBody(bytes []byte) {
p.top().Body = bytes
}
func (p *Parser) parseEntity(e *message.Entity) (err error) {
p.enter()
defer p.exit()
func (p *Parser) parseEntity(e *message.Entity) error {
p.beginPart()
defer p.endPart()
p.withHeader(e.Header)

View File

@ -7,6 +7,7 @@ import (
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@ -19,6 +20,30 @@ func newTestParser(t *testing.T, msg string) *Parser {
return p
}
func TestParserSpecifiedLatin1Charset(t *testing.T) {
p := newTestParser(t, "text_plain_latin1.eml")
checkBodies(t, p, "ééééééé")
}
func TestParserUnspecifiedLatin1Charset(t *testing.T) {
p := newTestParser(t, "text_plain_unknown_latin1.eml")
checkBodies(t, p, "ééééééé")
}
func TestParserSpecifiedLatin2Charset(t *testing.T) {
p := newTestParser(t, "text_plain_latin2.eml")
checkBodies(t, p, "řšřšřš")
}
func TestParserEmbeddedLatin2Charset(t *testing.T) {
p := newTestParser(t, "text_html_embedded_latin2_encoding.eml")
checkBodies(t, p, `<html><head><meta charset="ISO-8859-2"></head><body>latin2 řšřš</body></html>`)
}
func f(filename string) io.ReadCloser {
f, err := os.Open(filepath.Join("testdata", filename))
@ -37,3 +62,21 @@ func s(filename string) string {
return string(b)
}
// checkBodies walks every part of p, collects each non-nil body, and asserts
// that the collected bodies are exactly wantBodies, ignoring order.
// A failing walk aborts the test immediately.
func checkBodies(t *testing.T, p *Parser, wantBodies ...string) {
	var expected [][]byte
	for _, want := range wantBodies {
		expected = append(expected, []byte(want))
	}

	var actual [][]byte
	walker := p.NewWalker().RegisterDefaultHandler(func(part *Part) error {
		if part.Body != nil {
			actual = append(actual, part.Body)
		}
		return nil
	})
	require.NoError(t, walker.Walk())

	assert.ElementsMatch(t, expected, actual)
}

View File

@ -2,8 +2,12 @@ package parser
import (
"errors"
"unicode/utf8"
pmmime "github.com/ProtonMail/proton-bridge/pkg/mime"
"github.com/emersion/go-message"
"golang.org/x/net/html/charset"
"golang.org/x/text/encoding"
)
type Parts []*Part
@ -30,35 +34,32 @@ func (p *Part) AddChild(child *Part) {
p.children = append(p.children, child)
}
func (p *Part) write(writer *message.Writer, w *Writer) (err error) {
if len(p.children) > 0 {
for _, child := range p.children {
if err = child.writeAsChild(writer, w); err != nil {
return
}
}
}
if _, err = writer.Write(p.Body); err != nil {
return
}
return
func (p *Part) isUTF8() bool {
return utf8.Valid(p.Body)
}
func (p *Part) writeAsChild(writer *message.Writer, w *Writer) (err error) {
if !w.shouldWrite(p) {
return
}
childWriter, err := writer.CreatePart(p.Header)
// TODO: Do we then need to set charset to utf-8? What if it's embedded in html?
func (p *Part) convertToUTF8() error {
t, params, err := p.Header.ContentType()
if err != nil {
return
return err
}
if err = p.write(childWriter, w); err != nil {
return
var decoder *encoding.Decoder
if knownCharset, ok := params["charset"]; !ok {
encoding, _, _ := charset.DetermineEncoding(p.Body, t)
decoder = encoding.NewDecoder()
} else if decoder, err = pmmime.SelectDecoder(knownCharset); err != nil {
return err
}
return childWriter.Close()
if p.Body, err = decoder.Bytes(p.Body); err != nil {
return err
}
params["charset"] = "utf-8"
p.Header.SetContentType(t, params)
return nil
}

View File

@ -0,0 +1,5 @@
From: Sender <sender@pm.me>
To: Receiver <receiver@pm.me>
Content-Type: text/html
<html><head><meta charset="ISO-8859-2"></head><body>latin2 <20><><EFBFBD><EFBFBD></body></html>

View File

@ -0,0 +1,5 @@
From: Sender <sender@pm.me>
To: Receiver <receiver@pm.me>
Content-Type: text/plain; charset=ISO-8859-1
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>

View File

@ -0,0 +1,5 @@
From: Sender <sender@pm.me>
To: Receiver <receiver@pm.me>
Content-Type: text/plain; charset=ISO-8859-2
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>

View File

@ -0,0 +1,5 @@
From: Sender <sender@pm.me>
To: Receiver <receiver@pm.me>
Content-Type: text/plain
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>

View File

@ -0,0 +1,5 @@
From: Sender <sender@pm.me>
To: Receiver <receiver@pm.me>
Content-Type: text/plain
<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>

View File

@ -19,19 +19,27 @@ func newWriter(root *Part) *Writer {
}
}
// WithCondition registers cond as an additional predicate deciding whether a
// part should be written, and returns w so calls can be chained.
// Each part is checked against every registered condition; if any of them
// returns false, the part is not written.
// NOTE: this seemed like a good idea initially but currently has little
// practical use.
func (w *Writer) WithCondition(cond Condition) *Writer {
	conditions := append(w.cond, cond)
	w.cond = conditions

	return w
}
func (w *Writer) Write(ww io.Writer) (err error) {
msgWriter, err := message.CreateWriter(ww, w.root.Header)
if err != nil {
return
func (w *Writer) Write(ww io.Writer) error {
if w.shouldFilter(w.root) {
w.root.Header.Add("Content-Transfer-Encoding", "base64")
}
if err = w.root.write(msgWriter, w); err != nil {
return
msgWriter, err := message.CreateWriter(ww, w.root.Header)
if err != nil {
return err
}
if err := w.write(msgWriter, w.root); err != nil {
return err
}
return msgWriter.Close()
@ -46,3 +54,56 @@ func (w *Writer) shouldWrite(p *Part) bool {
return true
}
// shouldFilter reports whether p's body needs a transfer encoding applied
// before it is written.
//
// It returns false when the part already declares a "quoted-printable" or
// "base64" Content-Transfer-Encoding. Otherwise it returns true as soon as the
// body contains a byte outside the 7-bit ASCII range, and false if every byte
// is ASCII (including an empty or nil body).
//
// NOTE(review): the header value is compared case-sensitively, so a value such
// as "Base64" is not recognised — confirm whether values are normalised
// before they reach this point.
func (w *Writer) shouldFilter(p *Part) bool {
	switch p.Header.Get("Content-Transfer-Encoding") {
	case "quoted-printable", "base64":
		// A 7-bit-safe transfer encoding is already declared.
		return false
	}

	for _, b := range p.Body {
		// Any byte with the high bit set is non-ASCII. The comparison must be
		// inclusive: 0x80 itself is already outside the ASCII range.
		if b >= 0x80 {
			return true
		}
	}

	return false
}
// write emits every child of p through writeAsChild, in order, and then
// writes p's own body to writer. The first error encountered is returned.
func (w *Writer) write(writer *message.Writer, p *Part) error {
	// Ranging over an empty or nil slice is a no-op, so no length guard is
	// needed.
	for _, child := range p.children {
		if err := w.writeAsChild(writer, child); err != nil {
			return err
		}
	}

	_, err := writer.Write(p.Body)

	return err
}
// writeAsChild writes p (and, through write, its children) as a sub-part of
// writer.
//
// Parts rejected by shouldWrite are skipped and nil is returned. When
// shouldFilter reports that the body needs a transfer encoding, the part's
// Content-Transfer-Encoding header is set to base64 before the part is
// created; note that this mutates p.Header. Any error from creating the part,
// writing it, or closing the child writer is returned.
func (w *Writer) writeAsChild(writer *message.Writer, p *Part) error {
	if !w.shouldWrite(p) {
		return nil
	}

	if w.shouldFilter(p) {
		// Set rather than Add: a part may already carry a value such as
		// "8bit", and adding a second field would leave the header with two
		// conflicting Content-Transfer-Encoding entries.
		p.Header.Set("Content-Transfer-Encoding", "base64")
	}

	childWriter, err := writer.CreatePart(p.Header)
	if err != nil {
		return err
	}

	if err := w.write(childWriter, p); err != nil {
		return err
	}

	return childWriter.Close()
}