// Copyright (c) 2025 Proton AG // // This file is part of Proton Mail Bridge. // // Proton Mail Bridge is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // // Proton Mail Bridge is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with Proton Mail Bridge. If not, see . package pmmime import ( "fmt" "io" "mime" "regexp" "strings" "unicode/utf8" "github.com/ProtonMail/gluon/rfc5322" "github.com/ProtonMail/gluon/rfc822" "github.com/ProtonMail/go-proton-api" "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/net/html/charset" "golang.org/x/text/encoding" "golang.org/x/text/encoding/htmlindex" ) var EmptyContentTypeErr = errors.New("empty content type") func init() { rfc822.ParseMediaType = ParseMediaType proton.CharsetReader = CharsetReader rfc5322.CharsetReader = CharsetReader } func CharsetReader(charset string, input io.Reader) (io.Reader, error) { dec, err := SelectDecoder(charset) if err != nil { return nil, err } if dec == nil { // utf-8 return input, nil } return dec.Reader(input), nil } var WordDec = &mime.WordDecoder{ CharsetReader: CharsetReader, } // Expects trimmed lowercase. func getEncoding(charset string) (enc encoding.Encoding, err error) { preparsed := strings.Trim(strings.ToLower(charset), " \t\r\n") // koi re := regexp.MustCompile("(cs)?koi[-_ ]?8?[-_ ]?(r|ru|u|uk)?$") matches := re.FindAllStringSubmatch(preparsed, -1) if len(matches) == 1 && len(matches[0]) == 3 { preparsed = "koi8-" switch matches[0][2] { case "u", "uk": preparsed += "u" default: preparsed += "r" } } // windows-XXXX re = regexp.MustCompile("(cp|(cs)?win(dows)?)[-_ ]?([0-9]{3,4})$") matches = re.FindAllStringSubmatch(preparsed, -1) if len(matches) == 1 && len(matches[0]) == 5 { switch matches[0][4] { case "874", "1250", "1251", "1252", "1253", "1254", "1255", "1256", "1257", "1258": preparsed = "windows-" + matches[0][4] } } // iso re = regexp.MustCompile("iso[-_ ]?([0-9]{4})[-_ ]?([0-9]+|jp)?[-_ ]?(i|e)?") matches = re.FindAllStringSubmatch(preparsed, -1) if len(matches) == 1 && len(matches[0]) == 4 { if matches[0][1] == "2022" && matches[0][2] == "jp" { preparsed = "iso-2022-jp" } if matches[0][1] == "8859" { switch matches[0][2] { case "1", "2", "3", "4", "5", "7", "8", "9", "10", "11", "13", "14", "15", "16": preparsed = "iso-8859-" + matches[0][2] if matches[0][3] == "i" { preparsed += "-" + matches[0][3] } case "": preparsed = "iso-8859-1" } } } // Latin is tricky. re = regexp.MustCompile("^(cs|csiso)?l(atin)?[-_ ]?([0-9]{1,2})$") matches = re.FindAllStringSubmatch(preparsed, -1) if len(matches) == 1 && len(matches[0]) == 4 { switch matches[0][3] { case "1": preparsed = "windows-1252" case "2", "3", "4", "5": preparsed = "iso-8859-" + matches[0][3] case "6": preparsed = "iso-8859-10" case "8": preparsed = "iso-8859-14" case "9": preparsed = "iso-8859-15" case "10": preparsed = "iso-8859-16" } } // Missing substitutions. switch preparsed { case "csutf8", "iso-utf-8", "utf8mb4": preparsed = "utf-8" case "cp932", "windows-932", "windows-31J", "ibm-943", "cp943": preparsed = "shift_jis" case "eucjp", "ibm-eucjp": preparsed = "euc-jp" case "euckr", "ibm-euckr", "cp949": preparsed = "euc-kr" case "euccn", "ibm-euccn": preparsed = "gbk" case "zht16mswin950", "cp950": preparsed = "big5" case "csascii", "ansi_x3.4-1968", "ansi_x3.4-1986", "ansi_x3.110-1983", "cp850", "cp858", "us", "iso646", "iso-646", "iso646-us", "iso_646.irv:1991", "cp367", "ibm367", "ibm-367", "iso-ir-6": preparsed = "ascii" case "ibm852": preparsed = "iso-8859-2" case "iso-ir-199", "iso-celtic": preparsed = "iso-8859-14" case "iso-ir-226": preparsed = "iso-8859-16" case "macroman": preparsed = "macintosh" } enc, _ = htmlindex.Get(preparsed) if enc == nil { err = fmt.Errorf("can not get encoding for '%s' (or '%s')", charset, preparsed) } return } func SelectDecoder(charset string) (decoder *encoding.Decoder, err error) { var enc encoding.Encoding lcharset := strings.Trim(strings.ToLower(charset), " \t\r\n") switch lcharset { case "utf7", "utf-7", "unicode-1-1-utf-7": return NewUtf7Decoder(), nil default: enc, err = getEncoding(lcharset) } if err == nil { decoder = enc.NewDecoder() } return } // DecodeHeader if needed. Returns error if raw contains non-utf8 characters. func DecodeHeader(raw string) (decoded string, err error) { if decoded, err = WordDec.DecodeHeader(raw); err != nil { decoded = raw } if !utf8.ValidString(decoded) { err = fmt.Errorf("header contains non utf8 chars: %v", err) } return } // EncodeHeader using quoted printable and utf8. func EncodeHeader(s string) string { return mime.QEncoding.Encode("utf-8", s) } // DecodeCharset decodes the original using content type parameters. // If the charset parameter is missing it checks that the content is valid utf8. // If it isn't, it checks if it's embedded in the html/xml. // If it isn't, it falls back to windows-1252. // It then reencodes it as utf-8. func DecodeCharset(original []byte, contentType string) ([]byte, error) { // If the contentType itself is specified, use that. if contentType != "" { _, params, err := ParseMediaType(contentType) if err != nil { return nil, err } if charset, ok := params["charset"]; ok { decoder, err := SelectDecoder(charset) if err != nil { return original, errors.Wrap(err, "unknown charset was specified") } return decoder.Bytes(original) } } // The charset was not specified. First try utf8. if utf8.Valid(original) { return original, nil } // encoding will be windows-1252 if it can't be determined properly. encoding, name, certain := charset.DetermineEncoding(original, contentType) if !certain { logrus.WithField("encoding", name).Warn("Determined encoding but was not certain") } // Re-encode as UTF-8. decoded, err := encoding.NewDecoder().Bytes(original) if err != nil { return original, errors.Wrap(err, "failed to decode as windows-1252") } // If the decoded string is not valid utf8, it wasn't windows-1252, so give up. if !utf8.Valid(decoded) { return original, errors.Wrap(err, "failed to decode as windows-1252") } return decoded, nil } // ParseMediaType from MIME doesn't support RFC2231 for non asci / utf8 encodings so we have to pre-parse it. func ParseMediaType(v string) (string, map[string]string, error) { if v == "" { return "", nil, EmptyContentTypeErr } decoded, err := DecodeHeader(v) if err != nil { logrus.WithField("value", v).WithField("pkg", "mime").WithError(err).Error("Cannot decode Headers.") return "", nil, err } v, _ = changeEncodingAndKeepLastParamDefinition(decoded) mediatype, params, err := mime.ParseMediaType(v) if err != nil { logrus.WithField("value", v).WithField("pkg", "mime").WithError(err).Error("Media Type parsing error.") return "", nil, err } return mediatype, params, err }