diff --git a/pkg/message/parser.go b/pkg/message/parser.go index 77c148a3..cf747573 100644 --- a/pkg/message/parser.go +++ b/pkg/message/parser.go @@ -205,7 +205,7 @@ func joinChildParts(childParts []parser.Parts) parser.Parts { func bestChoice(childParts []parser.Parts, preferredContentType string) (parser.Parts, error) { // If one of the parts has preferred content type, use that. for i := len(childParts) - 1; i >= 0; i-- { - if allHaveContentType(childParts[i], preferredContentType) { + if allPartsHaveContentType(childParts[i], preferredContentType) { return childParts[i], nil } } @@ -214,7 +214,7 @@ func bestChoice(childParts []parser.Parts, preferredContentType string) (parser. return childParts[len(childParts)-1], nil } -func allHaveContentType(parts parser.Parts, contentType string) bool { +func allPartsHaveContentType(parts parser.Parts, contentType string) bool { for _, part := range parts { t, _, err := part.Header.ContentType() if err != nil { @@ -272,23 +272,11 @@ func getPlainBody(part *parser.Part) []byte { } } -func writeMIMEMessage(p *parser.Parser) (mime string, err error) { - writer := p. - NewWriter(). - WithCondition(func(p *parser.Part) (keep bool) { - disp, _, err := p.Header.ContentDisposition() - if err != nil { - return true - } - - // TODO: Is it true that we don't want to write attachments? I thought this was for externals... - return disp != "attachment" - }) - +func writeMIMEMessage(p *parser.Parser) (string, error) { buf := new(bytes.Buffer) - if err = writer.Write(buf); err != nil { - return + if err := p.NewWriter().Write(buf); err != nil { + return "", err } return buf.String(), nil diff --git a/pkg/message/parser/parser.go b/pkg/message/parser/parser.go index 7b91c57c..b5a78af8 100644 --- a/pkg/message/parser/parser.go +++ b/pkg/message/parser/parser.go @@ -1,11 +1,11 @@ package parser import ( - "fmt" "io" "io/ioutil" "github.com/emersion/go-message" + "github.com/sirupsen/logrus" ) type Parser struct { @@ -13,14 +13,19 @@ type Parser struct { root *Part } -func New(r io.Reader) (p *Parser, err error) { - p = new(Parser) +func New(r io.Reader) (*Parser, error) { + p := new(Parser) - if err = p.parse(r); err != nil { - return + entity, err := message.Read(r) + if err != nil && !message.IsUnknownCharset(err) { + return nil, err } - return + if err := p.parseEntity(entity); err != nil { + return nil, err + } + + return p, nil } func (p *Parser) NewWalker() *Walker { @@ -51,32 +56,25 @@ func (p *Parser) Part(number []int) (part *Part, err error) { return } -func (p *Parser) parse(r io.Reader) error { - entity, err := message.Read(r) - if err != nil { - if !message.IsUnknownCharset(err) { - return err - } else { - fmt.Println(err) - } - } - - return p.parseEntity(entity) -} - -func (p *Parser) enter() { +func (p *Parser) beginPart() { p.stack = append(p.stack, &Part{}) } -func (p *Parser) exit() { - var built *Part +func (p *Parser) endPart() { + var part *Part - p.stack, built = p.stack[:len(p.stack)-1], p.stack[len(p.stack)-1] + p.stack, part = p.stack[:len(p.stack)-1], p.stack[len(p.stack)-1] if len(p.stack) > 0 { - p.top().children = append(p.top().children, built) + p.top().children = append(p.top().children, part) } else { - p.root = built + p.root = part + } + + if !part.isUTF8() { + if err := part.convertToUTF8(); err != nil { + logrus.WithError(err).Error("failed to convert part to utf-8") + } } } @@ -96,9 +94,9 @@ func (p *Parser) withBody(bytes []byte) { p.top().Body = bytes } -func (p *Parser) parseEntity(e *message.Entity) (err error) { - p.enter() - defer p.exit() +func (p *Parser) parseEntity(e *message.Entity) error { + p.beginPart() + defer p.endPart() p.withHeader(e.Header) diff --git a/pkg/message/parser/parser_test.go b/pkg/message/parser/parser_test.go index baa83753..94a907f1 100644 --- a/pkg/message/parser/parser_test.go +++ b/pkg/message/parser/parser_test.go @@ -7,6 +7,7 @@ import ( "path/filepath" "testing" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -19,6 +20,30 @@ func newTestParser(t *testing.T, msg string) *Parser { return p } +func TestParserSpecifiedLatin1Charset(t *testing.T) { + p := newTestParser(t, "text_plain_latin1.eml") + + checkBodies(t, p, "ééééééé") +} + +func TestParserUnspecifiedLatin1Charset(t *testing.T) { + p := newTestParser(t, "text_plain_unknown_latin1.eml") + + checkBodies(t, p, "ééééééé") +} + +func TestParserSpecifiedLatin2Charset(t *testing.T) { + p := newTestParser(t, "text_plain_latin2.eml") + + checkBodies(t, p, "řšřšřš") +} + +func TestParserEmbeddedLatin2Charset(t *testing.T) { + p := newTestParser(t, "text_html_embedded_latin2_encoding.eml") + + checkBodies(t, p, `
latin2 řšřš`) +} + func f(filename string) io.ReadCloser { f, err := os.Open(filepath.Join("testdata", filename)) @@ -37,3 +62,21 @@ func s(filename string) string { return string(b) } + +func checkBodies(t *testing.T, p *Parser, wantBodies ...string) { + var partBodies, expectedBodies [][]byte + + require.NoError(t, p.NewWalker().RegisterDefaultHandler(func(p *Part) (err error) { + if p.Body != nil { + partBodies = append(partBodies, p.Body) + } + + return + }).Walk()) + + for _, body := range wantBodies { + expectedBodies = append(expectedBodies, []byte(body)) + } + + assert.ElementsMatch(t, expectedBodies, partBodies) +} diff --git a/pkg/message/parser/part.go b/pkg/message/parser/part.go index e3ae8ff1..ae808806 100644 --- a/pkg/message/parser/part.go +++ b/pkg/message/parser/part.go @@ -2,8 +2,12 @@ package parser import ( "errors" + "unicode/utf8" + pmmime "github.com/ProtonMail/proton-bridge/pkg/mime" "github.com/emersion/go-message" + "golang.org/x/net/html/charset" + "golang.org/x/text/encoding" ) type Parts []*Part @@ -30,35 +34,32 @@ func (p *Part) AddChild(child *Part) { p.children = append(p.children, child) } -func (p *Part) write(writer *message.Writer, w *Writer) (err error) { - if len(p.children) > 0 { - for _, child := range p.children { - if err = child.writeAsChild(writer, w); err != nil { - return - } - } - } - - if _, err = writer.Write(p.Body); err != nil { - return - } - - return +func (p *Part) isUTF8() bool { + return utf8.Valid(p.Body) } -func (p *Part) writeAsChild(writer *message.Writer, w *Writer) (err error) { - if !w.shouldWrite(p) { - return - } - - childWriter, err := writer.CreatePart(p.Header) +// TODO: Do we then need to set charset to utf-8? What if it's embedded in html? +func (p *Part) convertToUTF8() error { + t, params, err := p.Header.ContentType() if err != nil { - return + return err } - if err = p.write(childWriter, w); err != nil { - return + var decoder *encoding.Decoder + + if knownCharset, ok := params["charset"]; !ok { + encoding, _, _ := charset.DetermineEncoding(p.Body, t) + decoder = encoding.NewDecoder() + } else if decoder, err = pmmime.SelectDecoder(knownCharset); err != nil { + return err } - return childWriter.Close() + if p.Body, err = decoder.Bytes(p.Body); err != nil { + return err + } + + params["charset"] = "utf-8" + p.Header.SetContentType(t, params) + + return nil } diff --git a/pkg/message/parser/testdata/text_html_embedded_latin2_encoding.eml b/pkg/message/parser/testdata/text_html_embedded_latin2_encoding.eml new file mode 100644 index 00000000..3bba7086 --- /dev/null +++ b/pkg/message/parser/testdata/text_html_embedded_latin2_encoding.eml @@ -0,0 +1,5 @@ +From: Sender