From fb8301d9f0cb38bde3bd37906a0bf7f69ae6ce8b Mon Sep 17 00:00:00 2001 From: Konstantin Ryabitsev Date: Mon, 8 Jun 2020 11:01:24 -0400 Subject: Use proper charset when reading back mailinfo Git mailinfo output is going to be in the same character set as specified in the message headers, so we should not assume that we can read it back in unicode. Signed-off-by: Konstantin Ryabitsev --- b4/__init__.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/b4/__init__.py b/b4/__init__.py index f5e77d0..b5b4f99 100644 --- a/b4/__init__.py +++ b/b4/__init__.py @@ -775,6 +775,7 @@ class LoreMessage: # Body and body-based info self.body = None + self.charset = 'utf-8' self.has_diff = False self.has_diffstat = False self.trailers = set() @@ -830,6 +831,7 @@ class LoreMessage: mcharset = self.msg.get_content_charset() if not mcharset: mcharset = 'utf-8' + self.charset = mcharset for part in msg.walk(): cte = part.get_content_type() @@ -843,6 +845,7 @@ class LoreMessage: pcharset = mcharset try: payload = payload.decode(pcharset, errors='replace') + self.charset = pcharset except LookupError: # what kind of encoding is that? # Whatever, we'll use utf-8 and hope for the best @@ -1131,6 +1134,7 @@ class LoreMessage: def load_hashes(self): if self.attestation is not None: return + logger.debug('Calculating hashes for: %s', self.full_subject) msg_out = mkstemp() patch_out = mkstemp() cmdargs = ['mailinfo', '--encoding=UTF-8', msg_out[1], patch_out[1]] @@ -1157,8 +1161,8 @@ class LoreMessage: os.unlink(msg_out[1]) p = None - with open(patch_out[1], 'r') as pfh: - patch = pfh.read() + with open(patch_out[1], 'rb') as pfh: + patch = pfh.read().decode(self.charset, errors='replace') if len(patch.strip()): diff = LoreMessage.get_clean_diff(patch) phasher = hashlib.sha256() -- cgit v1.2.3