diff options
author | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2021-05-18 16:33:29 -0400 |
---|---|---|
committer | Konstantin Ryabitsev <konstantin@linuxfoundation.org> | 2021-05-18 16:33:29 -0400 |
commit | 4950093c0c3ee71e7045b545626d2b232271cbc8 (patch) | |
tree | d2edceff7ce0814ae8caf0bce048f12f4bd1fa28 /b4/__init__.py | |
parent | 6bec820cb9d015e99e4f5bf375fba098ba5b2f30 (diff) | |
download | b4-4950093c0c3ee71e7045b545626d2b232271cbc8.tar.gz |
Don't use mboxo for anything
While trying to figure out some odd DKIM failures, I've discovered that
there is an important incompatibility between git's idea of what "mbox"
format is, and Python's mboxo implementation -- at least when it comes
to treating "\nFrom " escapes.
According to the "original mbox" standard, when a message body contains
a "\nFrom " sequence, it should be converted to "\n>From " in order not
to confuse the parser. When reading messages in that format, clients are
supposed to back-convert "\n>From " into their original form. This is
the so-called "mboxo" format, which is what Python's mailbox.mbox
supports:
https://docs.python.org/3/library/mailbox.html#mailbox.mbox
The "mboxrd" format was created to avoid a corruption problem whereas a
body that legitimately contains "\n>From " would be wrongly converted
into "\nFrom " upon parsing the mailbox, so mboxrd standard requires
that, when saving a mailbox, "\n>From " sequences are additionally
escaped as "\n>>From ". This is the format public-inbox supports, so
when we grab mailboxes from remote, they are in mboxrd format.
Git will try to guess the format of the mbox file, but it will ONLY
back-convert "\n>From " sequences when you specifically tell it that
it's "mboxrd" format, even when it's in fact "mboxo":
git am --patch-format=mboxrd
If you don't force the mboxrd format, git-am will preserve all escaped
"\n>From " lines as-is.
We've been previously operating on the assumption that git-am's mbox
support properly implements "mboxo", but this was wrong, resulting in
some commits like the following:
https://git.kernel.org/torvalds/c/137733d08f4a
This large-ish change ditches all internal use of Python's mboxo. When
asked to save mbox files, we will save them without any escaping, the
way git-am (i.e. git-mailsplit) espects them. The same goes when we're
outputting to stdout.
There is also a way now to pass -M to both "b4 am" and "b4 mbox" that
will save things as maildirs -- git-am supports this natively and thus
avoids any possible parsing ambiguities. You can set a config option
b4.save-maildirs=yes to make this the default behaviour.
The fallout of this is fairly benign, if annoying. There is no situation
in which a patch would have "\nFrom " as part of its body, so the
problem only affected commit messages. We will have a handful of these
sprinkled around the trees, and will hopefully not introduce any new
ones once everyone switches to the b4 version that outputs things in the
format git-am expects.
Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>
Diffstat (limited to 'b4/__init__.py')
-rw-r--r-- | b4/__init__.py | 158 |
1 files changed, 98 insertions, 60 deletions
diff --git a/b4/__init__.py b/b4/__init__.py index 43fd078..bc669fe 100644 --- a/b4/__init__.py +++ b/b4/__init__.py @@ -11,6 +11,9 @@ import fnmatch import email.utils import email.policy import email.header +import tempfile +import pathlib + import requests import urllib.parse import datetime @@ -21,8 +24,6 @@ import mailbox # noinspection PyCompatibility import pwd -from pathlib import Path -from tempfile import mkstemp, TemporaryDirectory from contextlib import contextmanager from typing import Optional, Tuple, Set, List @@ -94,6 +95,7 @@ DEFAULT_CONFIG = { 'midmask': LOREADDR + '/r/%s', 'linkmask': LOREADDR + '/r/%s', 'trailer-order': DEFAULT_TRAILER_ORDER, + 'save-maildirs': 'no', # off: do not bother checking attestation # check: print an attaboy when attestation is found # softfail: print a warning when no attestation found @@ -216,14 +218,10 @@ class LoreMailbox: if reused: continue # Try to backfill from that project - tmp_mbox = mkstemp('b4-backfill-mbox')[1] - get_pi_thread_by_msgid(patch.msgid, tmp_mbox, useproject=projmap[entry[1]]) - mbx = mailbox.mbox(tmp_mbox) + backfills = get_pi_thread_by_msgid(patch.msgid, useproject=projmap[entry[1]]) was = len(self.msgid_map) - for msg in mbx: + for msg in backfills: self.add_message(msg) - mbx.close() - os.unlink(tmp_mbox) if len(self.msgid_map) > was: logger.info('Loaded %s messages from %s', len(self.msgid_map)-was, projurl) if self.series[revision].complete: @@ -541,8 +539,8 @@ class LoreSeries: return slug[:100] - def save_am_mbox(self, mbx, noaddtrailers=False, covertrailers=False, trailer_order=None, addmysob=False, - addlink=False, linkmask=None, cherrypick=None, copyccs=False): + def get_am_ready(self, noaddtrailers=False, covertrailers=False, trailer_order=None, addmysob=False, + addlink=False, linkmask=None, cherrypick=None, copyccs=False) -> list: usercfg = get_user_config() config = get_main_config() @@ -584,6 +582,7 @@ class LoreSeries: break at = 1 + msgs = list() for lmsg in self.patches[1:]: if cherrypick is not None: if at not in cherrypick: @@ -625,14 +624,14 @@ class LoreSeries: if noaddtrailers: add_trailers = False msg = lmsg.get_am_message(add_trailers=add_trailers, trailer_order=trailer_order, copyccs=copyccs) - # Pass a policy that avoids most legacy encoding horrors - mbx.add(msg.as_bytes(policy=emlpolicy)) + slug = '%04d_%s' % (lmsg.counter, re.sub(r'\W+', '_', lmsg.subject).strip('_').lower()) + msgs.append((slug, msg)) else: logger.error(' ERROR: missing [%s/%s]!', at, self.expected) at += 1 if attpolicy == 'off': - return mbx + return msgs if attsame and attref: logger.info(' ---') @@ -646,7 +645,7 @@ class LoreSeries: if not can_patatt: logger.info(' NOTE: install patatt for end-to-end signature verification') - return mbx + return msgs def check_applies_clean(self, gitdir, when=None): # Go through indexes and see if this series should apply cleanly @@ -1053,7 +1052,7 @@ class LoreMessage: signtime = self.date self.msg._headers.append((hn, hval)) # noqa - res = dkim.verify(self.msg.as_bytes().replace(b'\n>From ', b'\nFrom ')) + res = dkim.verify(self.msg.as_bytes()) attestor = LoreAttestorDKIM(res, identity, signtime, errors) logger.debug('DKIM verify results: %s=%s', identity, res) @@ -1484,9 +1483,9 @@ class LoreMessage: def get_am_message(self, add_trailers=True, trailer_order=None, copyccs=False): if add_trailers: self.fix_trailers(trailer_order=trailer_order, copyccs=copyccs) - am_body = self.body + am_body = self.body.rstrip('\r\n') am_msg = email.message.EmailMessage() - am_msg.set_payload(am_body.encode('utf-8')) + am_msg.set_payload(am_body.encode() + b'\n') # Clean up headers for hdrname, hdrval in self.msg.items(): lhdrname = hdrname.lower() @@ -1572,6 +1571,10 @@ class LoreSubject: subject = re.sub(r'^\s*\[[^]]*]\s*', '', subject) self.subject = subject + def get_slug(self): + unsafe = '%04d_%s' % (self.counter, self.subject) + return re.sub(r'\W+', '_', unsafe).strip('_').lower() + def __repr__(self): out = list() out.append(' full_subject: %s' % self.full_subject) @@ -1772,7 +1775,7 @@ def git_temp_worktree(gitdir=None, commitish=None): worktree is deleted when the contex manager is closed. Taken from gj_tools.""" dfn = None try: - with TemporaryDirectory() as dfn: + with tempfile.TemporaryDirectory() as dfn: gitargs = ['worktree', 'add', '--detach', '--no-checkout', dfn] if commitish: gitargs.append(commitish) @@ -1796,7 +1799,7 @@ def git_temp_clone(gitdir=None): logger.critical('Current directory is not a git checkout. Try using -g.') return None - with TemporaryDirectory() as dfn: + with tempfile.TemporaryDirectory() as dfn: gitargs = ['clone', '--mirror', '--shared', gitdir, dfn] git_run_command(None, gitargs) yield dfn @@ -1862,9 +1865,9 @@ def get_data_dir(appname: str = 'b4') -> str: if 'XDG_DATA_HOME' in os.environ: datahome = os.environ['XDG_DATA_HOME'] else: - datahome = os.path.join(str(Path.home()), '.local', 'share') + datahome = os.path.join(str(pathlib.Path.home()), '.local', 'share') datadir = os.path.join(datahome, appname) - Path(datadir).mkdir(parents=True, exist_ok=True) + pathlib.Path(datadir).mkdir(parents=True, exist_ok=True) return datadir @@ -1873,9 +1876,9 @@ def get_cache_dir(appname: str = 'b4') -> str: if 'XDG_CACHE_HOME' in os.environ: cachehome = os.environ['XDG_CACHE_HOME'] else: - cachehome = os.path.join(str(Path.home()), '.cache') + cachehome = os.path.join(str(pathlib.Path.home()), '.cache') cachedir = os.path.join(cachehome, appname) - Path(cachedir).mkdir(parents=True, exist_ok=True) + pathlib.Path(cachedir).mkdir(parents=True, exist_ok=True) if _CACHE_CLEANED: return cachedir @@ -1888,12 +1891,16 @@ def get_cache_dir(appname: str = 'b4') -> str: expmin = 600 expage = time.time() - expmin for entry in os.listdir(cachedir): - if entry.find('.mbx') <= 0 and entry.find('.lookup') <= 0: + if entry.find('.mbx') <= 0 and entry.find('.lookup') <= 0 and entry.find('.msgs'): continue - st = os.stat(os.path.join(cachedir, entry)) + fullpath = os.path.join(cachedir, entry) + st = os.stat(fullpath) if st.st_mtime < expage: logger.debug('Cleaning up cache: %s', entry) - os.unlink(os.path.join(cachedir, entry)) + if os.path.isdir(fullpath): + shutil.rmtree(fullpath) + else: + os.unlink(os.path.join(cachedir, entry)) _CACHE_CLEANED = True return cachedir @@ -1985,13 +1992,14 @@ def get_msgid(cmdargs) -> Optional[str]: return msgid -def save_strict_thread(in_mbx, out_mbx, msgid): +def get_strict_thread(msgs, msgid): want = {msgid} got = set() seen = set() maybe = dict() + strict = list() while True: - for msg in in_mbx: + for msg in msgs: c_msgid = LoreMessage.get_clean_msgid(msg) seen.add(c_msgid) if c_msgid in got: @@ -2016,7 +2024,7 @@ def save_strict_thread(in_mbx, out_mbx, msgid): maybe[ref].add(c_msgid) if c_msgid in want: - out_mbx.add(msg) + strict.append(msg) got.add(c_msgid) want.update(refs) want.discard(c_msgid) @@ -2038,19 +2046,41 @@ def save_strict_thread(in_mbx, out_mbx, msgid): if not len(want): break - if not len(out_mbx): + if not len(strict): return None - if len(in_mbx) > len(out_mbx): - logger.debug('Reduced mbox to strict matches only (%s->%s)', len(in_mbx), len(out_mbx)) + if len(msgs) > len(strict): + logger.debug('Reduced mbox to strict matches only (%s->%s)', len(msgs), len(strict)) + return strict -def get_pi_thread_by_url(t_mbx_url, savefile, nocache=False): - cachefile = get_cache_file(t_mbx_url, 'pi.mbx') - if os.path.exists(cachefile) and not nocache: - logger.debug('Using cached copy: %s', cachefile) - shutil.copyfile(cachefile, savefile) - return savefile + +def mailsplit_bytes(bmbox: bytes, outdir: str) -> list: + logger.debug('Mailsplitting the mbox into %s', outdir) + args = ['mailsplit', '--mboxrd', '-o%s' % outdir] + ecode, out = git_run_command(None, args, stdin=bmbox) + msgs = list() + if ecode > 0: + logger.critical('Unable to parse mbox received from the server') + return msgs + # Read in the files + for msg in os.listdir(outdir): + with open(os.path.join(outdir, msg), 'rb') as fh: + msgs.append(email.message_from_binary_file(fh)) + return msgs + + +def get_pi_thread_by_url(t_mbx_url, nocache=False): + msgs = list() + cachedir = get_cache_file(t_mbx_url, 'pi.msgs') + if os.path.exists(cachedir) and not nocache: + logger.info('Using cached copy: %s', cachedir) + for msg in os.listdir(cachedir): + with open(os.path.join(cachedir, msg), 'rb') as fh: + msgs.append(email.message_from_binary_file(fh)) + return msgs + + logger.critical('Grabbing thread from %s', t_mbx_url.split('://')[1]) session = get_requests_session() resp = session.get(t_mbx_url) if resp.status_code != 200: @@ -2061,16 +2091,16 @@ def get_pi_thread_by_url(t_mbx_url, savefile, nocache=False): if not len(t_mbox): logger.critical('No messages found for that query') return None - # Convert mboxrd to mboxo that python understands - t_mbox = t_mbox.replace(b'\n>>From ', b'\n>From ') - with open(savefile, 'wb') as fh: - logger.debug('Saving %s', savefile) - fh.write(t_mbox) - shutil.copyfile(savefile, cachefile) - return savefile + # Convert into individual files using git-mailsplit + with tempfile.TemporaryDirectory(suffix='-mailsplit') as tfd: + msgs = mailsplit_bytes(t_mbox, tfd) + if os.path.exists(cachedir): + shutil.rmtree(cachedir) + shutil.copytree(tfd, cachedir) + return msgs -def get_pi_thread_by_msgid(msgid, savefile, useproject=None, nocache=False): +def get_pi_thread_by_msgid(msgid, useproject=None, nocache=False): qmsgid = urllib.parse.quote_plus(msgid) config = get_main_config() # Grab the head from lore, to see where we are redirected @@ -2092,25 +2122,17 @@ def get_pi_thread_by_msgid(msgid, savefile, useproject=None, nocache=False): t_mbx_url = '%s/%s/t.mbox.gz' % (projurl, qmsgid) logger.debug('t_mbx_url=%s', t_mbx_url) - logger.critical('Grabbing thread from %s', projurl.split('://')[1]) - - tmp_mbox = mkstemp('b4-lookup-mbox')[1] - in_mbxf = get_pi_thread_by_url(t_mbx_url, tmp_mbox, nocache=nocache) - if not in_mbxf: - os.unlink(tmp_mbox) + msgs = get_pi_thread_by_url(t_mbx_url, nocache=nocache) + if not len(msgs): return None - in_mbx = mailbox.mbox(in_mbxf) - out_mbx = mailbox.mbox(savefile) - save_strict_thread(in_mbx, out_mbx, msgid) - in_mbx.close() - out_mbx.close() - os.unlink(in_mbxf) - return savefile + + strict = get_strict_thread(msgs, msgid) + return strict @contextmanager def git_format_patches(gitdir, start, end, prefixes=None, extraopts=None): - with TemporaryDirectory() as tmpd: + with tempfile.TemporaryDirectory() as tmpd: gitargs = ['format-patch', '--cover-letter', '-o', tmpd, '--signature', f'b4 {__VERSION__}'] if prefixes is not None and len(prefixes): gitargs += ['--subject-prefix', ' '.join(prefixes)] @@ -2238,3 +2260,19 @@ def get_gpg_uids(keyid: str) -> list: uids.append(chunks[9]) return uids + + +def save_git_am_mbox(msgs: list, dest): + # Git-am has its own understanding of what "mbox" format is that differs from Python's + # mboxo implementation. Specifically, it never escapes the ">From " lines found in bodies + # unless invoked with --patch-format=mboxrd (this is wrong, because ">From " escapes are also + # required in the original mbox "mboxo" format). + # So, save in the format that git-am expects + # "dest" should be a file handler in writable+binary mode + for msg in msgs: + bmsg = msg.as_bytes(unixfrom=True, policy=emlpolicy) + # public-inbox unixfrom says "mboxrd", so replace it with something else + # so there is no confusion as it's NOT mboxrd + bmsg = bmsg.replace(b'From mboxrd@z ', b'From git@z ') + bmsg = bmsg.rstrip(b'\r\n') + b'\n\n' + dest.write(bmsg) |