From 6de8a6106413068662ca2fa5a98ba2b6aa7f2d7b Mon Sep 17 00:00:00 2001 From: Konstantin Ryabitsev Date: Thu, 26 Mar 2020 18:01:36 -0400 Subject: Add initial "b4 pr" command set While working on "pull-request exploder" stuff for achiving on lore.kernel.org, I realized that this may be a useful set of features for developers as well, so here is a very simple framework for handing pull requests. Examples: b4 pr - downloads that message - parses the pull request - checks that the remote tip is where the message says it should be - makes sure the base-commit is present in the tree - makes sure the tip commit isn't already in one of the branches - checks if FETCH_HEAD already is at that commit - if the above two checks pass, performs a git fetch b4 pr --check Runs all of the checks above, but doesn't perform the actual fetch. Useful if you don't remember if you've already processed a pull request or not. b4 pr --explode Runs basic sanity checks and then: - performs a git fetch - runs a "git format-patch" for base-commit..FETCH_HEAD - adds the same to/from/cc headers as in the pull request - properly threads each patch below the pull request - saves that into a .mbox file The above is handy if you want to comment on something in a pull request and not have to hunt around for sender/cc information. Signed-off-by: Konstantin Ryabitsev --- b4/__init__.py | 119 +++++++++++++++++++++++- b4/command.py | 21 +++++ b4/mbox.py | 115 +++-------------------- b4/pr.py | 282 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 433 insertions(+), 104 deletions(-) create mode 100644 b4/pr.py diff --git a/b4/__init__.py b/b4/__init__.py index 850cce1..8a7cf8b 100644 --- a/b4/__init__.py +++ b/b4/__init__.py @@ -5,6 +5,8 @@ import subprocess import logging import hashlib import re +import sys +import gzip import os import fnmatch import email.utils @@ -13,6 +15,8 @@ import requests import urllib.parse import datetime import time +import shutil +import mailbox from pathlib import Path from tempfile import mkstemp @@ -516,6 +520,13 @@ class LoreMessage: self.trailers = set() self.followup_trailers = set() + # These are populated by pr + self.pr_base_commit = None + self.pr_repo = None + self.pr_ref = None + self.pr_tip_commit = None + self.pr_remote_tip_commit = None + self.attestation = None self.msgid = LoreMessage.get_clean_msgid(self.msg) @@ -607,6 +618,10 @@ class LoreMessage: trailers = set() for tname, tvalue in self.trailers: + if tname.lower() in ('fixes',): + trailers.add((tname, tvalue)) + continue + tmatch = False namedata = email.utils.getaddresses([tvalue])[0] tfrom = re.sub(r'\+[^@]+@', '@', namedata[1].lower()) @@ -678,8 +693,7 @@ class LoreMessage: @staticmethod def clean_header(hdrval): - uval = hdrval.replace('\n', ' ') - new_hdrval = re.sub(r'\s+', ' ', uval) + new_hdrval = re.sub(r'\n?\s+', ' ', str(hdrval)) return new_hdrval.strip() @staticmethod @@ -1354,6 +1368,40 @@ def get_requests_session(): return REQSESSION +def get_msgid_from_stdin(): + if not sys.stdin.isatty(): + message = email.message_from_string(sys.stdin.read()) + return message.get('Message-ID', None) + logger.error('Error: pipe a message or pass msgid as parameter') + sys.exit(1) + + +def get_msgid(cmdargs): + if not cmdargs.msgid: + logger.debug('Getting Message-ID from stdin') + msgid = get_msgid_from_stdin() + if msgid is None: + logger.error('Unable to find a valid message-id in stdin.') + sys.exit(1) + else: + msgid = cmdargs.msgid + + msgid = msgid.strip('<>') + # Handle the case when someone pastes a full URL to the message + matches = re.search(r'^https?://[^/]+/([^/]+)/([^/]+@[^/]+)', msgid, re.IGNORECASE) + if matches: + chunks = matches.groups() + msgid = chunks[1] + # Infer the project name from the URL, if possible + if chunks[0] != 'r': + cmdargs.useproject = chunks[0] + # Handle special case when msgid is prepended by id: or rfc822msgid: + if msgid.find('id:') >= 0: + msgid = re.sub(r'^\w*id:', '', msgid) + + return msgid + + def save_strict_thread(in_mbx, out_mbx, msgid): want = {msgid} got = set() @@ -1392,3 +1440,70 @@ def save_strict_thread(in_mbx, out_mbx, msgid): if len(in_mbx) > len(out_mbx): logger.info('Reduced thread to strict matches only (%s->%s)', len(in_mbx), len(out_mbx)) + + +def get_pi_thread_by_url(t_mbx_url, savefile): + session = get_requests_session() + resp = session.get(t_mbx_url) + if resp.status_code != 200: + logger.critical('Server returned an error: %s', resp.status_code) + return None + t_mbox = gzip.decompress(resp.content) + resp.close() + if not len(t_mbox): + logger.critical('No messages found for that query') + return None + with open(savefile, 'wb') as fh: + logger.debug('Saving %s', savefile) + fh.write(t_mbox) + return savefile + + +def get_pi_thread_by_msgid(msgid, savefile, useproject=None, nocache=False): + config = get_main_config() + cachedir = get_cache_dir() + cachefile = os.path.join(cachedir, '%s.pi.mbx' % urllib.parse.quote_plus(msgid)) + if os.path.exists(cachefile) and not nocache: + logger.debug('Using cached copy: %s', cachefile) + shutil.copyfile(cachefile, savefile) + return savefile + + # Grab the head from lore, to see where we are redirected + midmask = config['midmask'] % msgid + logger.info('Looking up %s', midmask) + session = get_requests_session() + resp = session.head(midmask) + if resp.status_code < 300 or resp.status_code > 400: + logger.critical('That message-id is not known.') + return None + canonical = resp.headers['Location'].rstrip('/') + resp.close() + t_mbx_url = '%s/t.mbox.gz' % canonical + + loc = urllib.parse.urlparse(t_mbx_url) + if useproject: + logger.debug('Modifying query to use %s', useproject) + t_mbx_url = '%s://%s/%s/%s/t.mbox.gz' % ( + loc.scheme, loc.netloc, useproject, msgid) + logger.debug('Will query: %s', t_mbx_url) + logger.critical('Grabbing thread from %s', loc.netloc) + in_mbxf = get_pi_thread_by_url(t_mbx_url, '%s-loose' % savefile) + if not in_mbxf: + return None + in_mbx = mailbox.mbox(in_mbxf) + out_mbx = mailbox.mbox(savefile) + save_strict_thread(in_mbx, out_mbx, msgid) + in_mbx.close() + out_mbx.close() + os.unlink(in_mbxf) + shutil.copyfile(savefile, cachefile) + return savefile + + +def git_format_patches(gitdir, start, end, reroll=None): + gitargs = ['format-patch', '--stdout'] + if reroll is not None: + gitargs += ['-v', str(reroll)] + gitargs += ['%s..%s' % (start, end)] + ecode, out = git_run_command(gitdir, gitargs) + return ecode, out diff --git a/b4/command.py b/b4/command.py index 3997035..f36f3ea 100644 --- a/b4/command.py +++ b/b4/command.py @@ -50,6 +50,11 @@ def cmd_verify(cmdargs): b4.attest.verify_attestation(cmdargs) +def cmd_pr(cmdargs): + import b4.pr + b4.pr.main(cmdargs) + + def cmd(): parser = argparse.ArgumentParser( description='A tool to work with public-inbox patches', @@ -112,6 +117,22 @@ def cmd(): sp_ver.add_argument('mbox', nargs=1, help='Mbox containing patches to attest') sp_ver.set_defaults(func=cmd_verify) + # b4 pr + sp_pr = subparsers.add_parser('pr', help='Fetch a pull request found in a message ID') + sp_pr.add_argument('-g', '--gitdir', default=None, + help='Operate on this git tree instead of current dir') + sp_pr.add_argument('-b', '--branch', default=None, + help='Check out FETCH_HEAD into this branch after fetching') + sp_pr.add_argument('-c', '--check', action='store_true', default=False, + help='Check if pull request has already been applied') + sp_pr.add_argument('-e', '--explode', action='store_true', default=False, + help='Convert a pull request into an mbox full of patches') + sp_pr.add_argument('-o', '--output-mbox', dest='outmbox', default=None, + help='Save exploded messages into this mailbox (default: msgid.mbx)') + sp_pr.add_argument('msgid', nargs='?', + help='Message ID to process, or pipe a raw message') + sp_pr.set_defaults(func=cmd_pr) + cmdargs = parser.parse_args() logger.setLevel(logging.DEBUG) diff --git a/b4/mbox.py b/b4/mbox.py index 7043fa1..3b00fd8 100644 --- a/b4/mbox.py +++ b/b4/mbox.py @@ -13,11 +13,9 @@ import email.message import email.utils import re import time -import shutil import urllib.parse import xml.etree.ElementTree -import gzip import b4 @@ -26,81 +24,8 @@ from tempfile import mkstemp logger = b4.logger -def get_msgid_from_stdin(): - if not sys.stdin.isatty(): - message = email.message_from_string(sys.stdin.read()) - return message.get('Message-ID', None) - logger.error('Error: pipe a message or pass msgid as parameter') - sys.exit(1) - - -def get_pi_thread_by_url(t_mbx_url, savefile): - session = b4.get_requests_session() - resp = session.get(t_mbx_url) - if resp.status_code != 200: - logger.critical('Server returned an error: %s', resp.status_code) - return None - t_mbox = gzip.decompress(resp.content) - resp.close() - if not len(t_mbox): - logger.critical('No messages found for that query') - return None - with open(savefile, 'wb') as fh: - logger.debug('Saving %s', savefile) - fh.write(t_mbox) - return savefile - - -def get_pi_thread_by_msgid(msgid, config, cmdargs): - wantname = cmdargs.wantname - outdir = cmdargs.outdir - if wantname: - savefile = os.path.join(outdir, wantname) - else: - # Save it into msgid.mbox - savefile = '%s.t.mbx' % msgid - savefile = os.path.join(outdir, savefile) - - cachedir = b4.get_cache_dir() - cachefile = os.path.join(cachedir, '%s.pi.mbx' % urllib.parse.quote_plus(msgid)) - if os.path.exists(cachefile) and not cmdargs.nocache: - logger.debug('Using cached copy: %s', cachefile) - shutil.copyfile(cachefile, savefile) - return savefile - - # Grab the head from lore, to see where we are redirected - midmask = config['midmask'] % msgid - logger.info('Looking up %s', midmask) - session = b4.get_requests_session() - resp = session.head(midmask) - if resp.status_code < 300 or resp.status_code > 400: - logger.critical('That message-id is not known.') - return None - canonical = resp.headers['Location'].rstrip('/') - resp.close() - t_mbx_url = '%s/t.mbox.gz' % canonical - - loc = urllib.parse.urlparse(t_mbx_url) - if cmdargs.useproject: - logger.debug('Modifying query to use %s', cmdargs.useproject) - t_mbx_url = '%s://%s/%s/%s/t.mbox.gz' % ( - loc.scheme, loc.netloc, cmdargs.useproject, msgid) - logger.debug('Will query: %s', t_mbx_url) - logger.critical('Grabbing thread from %s', loc.netloc) - in_mbxf = get_pi_thread_by_url(t_mbx_url, '%s-loose' % savefile) - if not in_mbxf: - return None - in_mbx = mailbox.mbox(in_mbxf) - out_mbx = mailbox.mbox(savefile) - b4.save_strict_thread(in_mbx, out_mbx, msgid) - in_mbx.close() - out_mbx.close() - os.unlink(in_mbxf) - shutil.copyfile(savefile, cachefile) - return savefile - - -def mbox_to_am(mboxfile, config, cmdargs): +def mbox_to_am(mboxfile, cmdargs): + config = b4.get_main_config() outdir = cmdargs.outdir wantver = cmdargs.wantver wantname = cmdargs.wantname @@ -342,7 +267,7 @@ def get_newest_series(mboxfile): continue t_mbx_url = '%st.mbox.gz' % link savefile = mkstemp('b4-get')[1] - nt_mboxfile = get_pi_thread_by_url(t_mbx_url, savefile) + nt_mboxfile = b4.get_pi_thread_by_url(t_mbx_url, savefile) nt_mbx = mailbox.mbox(nt_mboxfile) # Append all of these to the existing mailbox new_adds = 0 @@ -371,32 +296,18 @@ def main(cmdargs): # Force nocache mode cmdargs.nocache = True - config = b4.get_main_config() - if not cmdargs.localmbox: - if not cmdargs.msgid: - logger.debug('Getting Message-ID from stdin') - msgid = get_msgid_from_stdin() - if msgid is None: - logger.error('Unable to find a valid message-id in stdin.') - sys.exit(1) + msgid = b4.get_msgid(cmdargs) + wantname = cmdargs.wantname + outdir = cmdargs.outdir + if wantname: + savefile = os.path.join(outdir, wantname) else: - msgid = cmdargs.msgid + # Save it into msgid.mbox + savefile = '%s.t.mbx' % msgid + savefile = os.path.join(outdir, savefile) - msgid = msgid.strip('<>') - # Handle the case when someone pastes a full URL to the message - matches = re.search(r'^https?://[^/]+/([^/]+)/([^/]+@[^/]+)', msgid, re.IGNORECASE) - if matches: - chunks = matches.groups() - msgid = chunks[1] - # Infer the project name from the URL, if possible - if chunks[0] != 'r': - cmdargs.useproject = chunks[0] - # Handle special case when msgid is prepended by id: or rfc822msgid: - if msgid.find('id:') >= 0: - msgid = re.sub(r'^\w*id:', '', msgid) - - mboxfile = get_pi_thread_by_msgid(msgid, config, cmdargs) + mboxfile = b4.get_pi_thread_by_msgid(msgid, savefile, useproject=cmdargs.useproject, nocache=cmdargs.nocache) if mboxfile is None: return @@ -414,7 +325,7 @@ def main(cmdargs): get_newest_series(threadmbox) if cmdargs.subcmd == 'am': - mbox_to_am(threadmbox, config, cmdargs) + mbox_to_am(threadmbox, cmdargs) if not cmdargs.localmbox: os.unlink(threadmbox) else: diff --git a/b4/pr.py b/b4/pr.py new file mode 100644 index 0000000..8831cd0 --- /dev/null +++ b/b4/pr.py @@ -0,0 +1,282 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# SPDX-License-Identifier: GPL-2.0-or-later +# Copyright (C) 2020 by the Linux Foundation +# +__author__ = 'Konstantin Ryabitsev ' + +import os +import sys +import b4 +import re +import mailbox + +from datetime import timedelta +from tempfile import mkstemp +from email import utils + +from email import charset +charset.add_charset('utf-8', None) + +logger = b4.logger + +PULL_BODY_SINCE_ID_RE = [ + re.compile(r'changes since commit ([0-9a-f]{5,40}):', re.M | re.I) +] + +# I like these +PULL_BODY_WITH_COMMIT_ID_RE = [ + re.compile(r'fetch changes up to ([0-9a-f]{5,40}):', re.M | re.I), +] + +# I don't like these +PULL_BODY_REMOTE_REF_RE = [ + re.compile(r'^\s*([\w+-]+(?:://|@)[\w/.@:~-]+)[\s\\]+([\w/._-]+)\s*$', re.M | re.I), + re.compile(r'^\s*([\w+-]+(?:://|@)[\w/.@~-]+)\s*$', re.M | re.I), +] + + +def git_get_commit_id_from_repo_ref(repo, ref): + # We only handle git and http/s URLs + if not (repo.find('git://') == 0 or repo.find('http://') == 0 or repo.find('https://') == 0): + logger.debug('%s uses unsupported protocol', repo) + return None + + logger.debug('getting commit-id from: %s %s', repo, ref) + # Drop the leading "refs/", if any + ref = re.sub(r'^refs/', '', ref) + # Is it a full ref name or a shortname? + if ref.find('heads/') < 0 and ref.find('tags/') < 0: + # Try grabbing it as a head first + lines = b4.git_get_command_lines(None, ['ls-remote', repo, 'refs/heads/%s' % ref]) + if not lines: + # try it as a tag, then + lines = b4.git_get_command_lines(None, ['ls-remote', repo, 'refs/tags/%s^{}' % ref]) + + elif ref.find('tags/') == 0: + lines = b4.git_get_command_lines(None, ['ls-remote', repo, 'refs/%s^{}' % ref]) + + else: + # Grab it as a head and hope for the best + lines = b4.git_get_command_lines(None, ['ls-remote', repo, 'refs/%s' % ref]) + + if not lines: + # Oh well, we tried + logger.debug('did not find commit-id, ignoring pull request') + return None + + commit_id = lines[0].split()[0] + logger.debug('success, commit-id: %s', commit_id) + return commit_id + + +def git_commit_exists(gitdir, commit_id): + gitargs = ['cat-file', '-e', commit_id] + ecode, out = b4.git_run_command(gitdir, gitargs) + return ecode == 0 + + +def git_branch_contains(gitdir, commit_id): + gitargs = ['branch', '--contains', commit_id] + lines = b4.git_get_command_lines(gitdir, gitargs) + return lines + + +def parse_pr_data(msg): + lmsg = b4.LoreMessage(msg) + if lmsg.body is None: + logger.critical('Could not find a plain part in the message body') + return None + + logger.info('Looking at: %s', lmsg.full_subject) + + for since_re in PULL_BODY_SINCE_ID_RE: + matches = since_re.search(lmsg.body) + if matches: + lmsg.pr_base_commit = matches.groups()[0] + break + + for reporef_re in PULL_BODY_REMOTE_REF_RE: + matches = reporef_re.search(lmsg.body) + if matches: + chunks = matches.groups() + lmsg.pr_repo = chunks[0] + if len(chunks) > 1: + lmsg.pr_ref = chunks[1] + else: + lmsg.pr_ref = 'refs/heads/master' + break + + for cid_re in PULL_BODY_WITH_COMMIT_ID_RE: + matches = cid_re.search(lmsg.body) + if matches: + lmsg.pr_tip_commit = matches.groups()[0] + break + + if lmsg.pr_repo and lmsg.pr_ref: + lmsg.pr_remote_tip_commit = git_get_commit_id_from_repo_ref(lmsg.pr_repo, lmsg.pr_ref) + + return lmsg + + +def fetch_remote(gitdir, lmsg, branch=None): + # Do we know anything about this base commit? + if not git_commit_exists(gitdir, lmsg.pr_base_commit): + logger.critical('ERROR: git knows nothing about commit %s', lmsg.pr_base_commit) + logger.critical(' Are you running inside a git checkout and is it up-to-date?') + return 1 + + if lmsg.pr_tip_commit != lmsg.pr_remote_tip_commit: + logger.critical('ERROR: commit-id mismatch between pull request and remote') + logger.critical(' msg=%s, remote=%s', lmsg.pr_tip_commit, lmsg.pr_remote_tip_commit) + return 1 + + # Fetch it now + logger.info(' Fetching %s %s', lmsg.pr_repo, lmsg.pr_ref) + gitargs = ['fetch', lmsg.pr_repo, lmsg.pr_ref] + ecode, out = b4.git_run_command(None, gitargs, logstderr=True) + if ecode > 0: + logger.critical('ERROR: Could not fetch remote:') + logger.critical(out) + return ecode + + logger.info('---') + if branch: + gitargs = ['checkout', '-b', branch, 'FETCH_HEAD'] + logger.info('Fetched into branch %s', branch) + ecode, out = b4.git_run_command(None, gitargs) + if ecode > 0: + logger.critical('ERROR: Failed to create branch') + logger.critical(out) + return ecode + else: + logger.info('Successfully fetched into FETCH_HEAD') + + return 0 + + +def explode(gitdir, lmsg, savefile): + # We always fetch into FETCH_HEAD when exploding + fetch_remote(gitdir, lmsg) + logger.info('Generating patches starting from the base-commit') + reroll = None + if lmsg.revision > 1: + reroll = lmsg.revision + ecode, out = b4.git_format_patches(gitdir, lmsg.pr_base_commit, 'FETCH_HEAD', reroll=reroll) + if ecode > 0: + logger.critical('ERROR: Could not convert pull request into patches') + logger.critical(out) + sys.exit(ecode) + + # Save patches into a temporary file + patchmbx = mkstemp()[1] + with open(patchmbx, 'w') as fh: + fh.write(out) + pmbx = mailbox.mbox(patchmbx) + embx = mailbox.mbox(savefile) + cover = lmsg.get_am_message() + # Add base-commit to the cover + body = cover.get_payload() + body = '%s\nbase-commit: %s\n' % (body, lmsg.pr_base_commit) + cover.set_payload(body) + embx.add(cover.as_bytes(policy=b4.emlpolicy)) + + # Set the pull request message as cover letter + for msg in pmbx: + # Move the original From and Date into the body + body = msg.get_payload() + body = 'From: %s\nDate: %s\n\n%s' % (msg['from'], msg['date'], body) + msg.set_payload(body) + msubj = b4.LoreSubject(msg['subject']) + msg.replace_header('Subject', msubj.full_subject) + # Set from, to, cc, date headers to match the original pull request + msg.replace_header('From', b4.LoreMessage.clean_header(lmsg.msg['From'])) + # Add a number of seconds equalling the counter, in hopes it gets properly threaded + newdate = lmsg.date + timedelta(seconds=msubj.counter) + msg.replace_header('Date', utils.format_datetime(newdate)) + msg.add_header('To', b4.LoreMessage.clean_header(lmsg.msg['To'])) + if lmsg.msg['Cc']: + msg.add_header('Cc', b4.LoreMessage.clean_header(lmsg.msg['Cc'])) + # Set the message-id based on the original pull request msgid + msg.add_header('Message-Id', '' % (msubj.counter, lmsg.msgid)) + msg.add_header('In-Reply-To', '<%s>' % lmsg.msgid) + if lmsg.msg['References']: + msg.add_header('References', '%s <%s>' % ( + b4.LoreMessage.clean_header(lmsg.msg['References']), lmsg.msgid)) + else: + msg.add_header('References', '<%s>' % lmsg.msgid) + if lmsg.msg['List-Id']: + msg.add_header('List-Id', b4.LoreMessage.clean_header(lmsg.msg['List-Id'])) + msg.add_header('X-Mailer', 'b4-explode/%s' % b4.__VERSION__) + logger.info(' %s', msubj.full_subject) + msg.set_charset('utf-8') + embx.add(msg.as_bytes(policy=b4.emlpolicy)) + logger.info('---') + logger.info('Wrote %s patches into %s', len(pmbx), savefile) + pmbx.close() + embx.close() + sys.exit(0) + + +def main(cmdargs): + msgid = b4.get_msgid(cmdargs) + savefile = mkstemp()[1] + mboxfile = b4.get_pi_thread_by_msgid(msgid, savefile) + if mboxfile is None: + os.unlink(savefile) + return + # Find the message with the msgid we were asked about + mbx = mailbox.mbox(mboxfile) + lmsg = None + for msg in mbx: + mmsgid = b4.LoreMessage.get_clean_msgid(msg) + if mmsgid == msgid: + lmsg = parse_pr_data(msg) + + # Got all we need from it + mbx.close() + os.unlink(savefile) + + if lmsg is None: + logger.critical('ERROR: Could not find pull request info in %s', msgid) + sys.exit(1) + + gitdir = cmdargs.gitdir + if not lmsg.pr_tip_commit: + logger.critical('ERROR: could not find tip commit id') + sys.exit(1) + + if cmdargs.explode: + savefile = cmdargs.outmbox + if savefile is None: + savefile = '%s.mbx' % lmsg.msgid + if os.path.exists(savefile): + logger.info('File exists: %s', savefile) + sys.exit(1) + explode(gitdir, lmsg, savefile) + + exists = git_commit_exists(gitdir, lmsg.pr_tip_commit) + + if exists: + # Is it in any branch, or just flapping in the wind? + branches = git_branch_contains(gitdir, lmsg.pr_tip_commit) + if len(branches): + logger.info('Pull request tip commit exists in the following branches:') + for branch in branches: + logger.info(' %s', branch) + if cmdargs.check: + sys.exit(0) + sys.exit(1) + + # Is it at the tip of FETCH_HEAD? + loglines = b4.git_get_command_lines(gitdir, ['log', '-1', '--pretty=oneline', 'FETCH_HEAD']) + if len(loglines) and loglines[0].find(lmsg.pr_tip_commit) == 0: + logger.info('Pull request is at the tip of FETCH_HEAD') + if cmdargs.check: + sys.exit(0) + + elif cmdargs.check: + logger.info('Pull request does not appear to be in this tree.') + sys.exit(0) + + fetch_remote(gitdir, lmsg, branch=cmdargs.branch) -- cgit v1.2.3