Switch to using patchwork-compatible hashes

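Using strict attestation hashes for the auto-thankinator is problematic, because "git am" allows a certain degree of fuzz when applying patches. As a result, when we try to find applied patches by running "git diff" on the actual commits, details such as hunk line numbers and line counts may not be bit-for-bit identical to the patch that was originally sent. Switch to patchwork-compatible hashes, which are computed from a normalised form of the diff, so that applied patches can still be matched.

Signed-off-by: Konstantin Ryabitsev <konstantin@linuxfoundation.org>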
author: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-04-15 12:42:21 -0400
committer: Konstantin Ryabitsev <konstantin@linuxfoundation.org> 2020-04-15 12:42:21 -0400
commit: 28b6825da28519b6ca0e8b3bce57700a1120ca9a (patch)
tree: 7fb99d73ba8103752259eacd93920e343c2e38eb
parent: 4be04f0af9a22087d0052838e013ef2de1ce2ac3 (diff)
download: b4-28b6825da28519b6ca0e8b3bce57700a1120ca9a.tar.gz
3 files changed, 91 insertions, 31 deletions
diff --git a/b4/__init__.py b/b4/__init__.py
index d432ab1..626042a 100644
--- a/b4/__init__.py
+++ b/b4/__init__.py
@@ -620,6 +620,8 @@ class LoreMessage:
         self.pr_remote_tip_commit = None
 
         self.attestation = None
+        # Patchwork hash
+        self.pwhash = None
 
         self.msgid = LoreMessage.get_clean_msgid(self.msg)
         self.lsubject = LoreSubject(msg['Subject'])
@@ -815,20 +817,57 @@ class LoreMessage:
         return msgid
 
     @staticmethod
-    def get_patch_hash(diff):
-        # The aim is to represent the patch as if you did the following:
-        # git diff HEAD~.. | dos2unix | sha256sum
-        #
-        # This subroutine removes anything at the beginning of diff data, like
-        # diffstat or any other auxiliary data, and anything trailing at the end
-        # XXX: This currently doesn't work for git binary patches
-        #
+    def get_patchwork_hash(diff):
+        # Make sure we just have the diff without any extraneous content.
+        diff = LoreMessage.get_clean_diff(diff)
+        """Generate a hash from a diff. Lifted verbatim from patchwork."""
+
+        prefixes = ['-', '+', ' ']
+        hashed = hashlib.sha1()
+
+        for line in diff.split('\n'):
+            if len(line) <= 0:
+                continue
+
+            hunk_match = HUNK_RE.match(line)
+            filename_match = FILENAME_RE.match(line)
+
+            if filename_match:
+                # normalise -p1 top-directories
+                if filename_match.group(1) == '---':
+                    filename = 'a/'
+                else:
+                    filename = 'b/'
+                filename += '/'.join(filename_match.group(2).split('/')[1:])
+
+                line = filename_match.group(1) + ' ' + filename
+            elif hunk_match:
+                # remove line numbers, but leave line counts
+                def fn(x):
+                    if not x:
+                        return 1
+                    return int(x)
+
+                line_nos = list(map(fn, hunk_match.groups()))
+                line = '@@ -%d +%d @@' % tuple(line_nos)
+            elif line[0] in prefixes:
+                # if we have a +, - or context line, leave as-is
+                pass
+            else:
+                # other lines are ignored
+                continue
+
+            hashed.update((line + '\n').encode('utf-8'))
+
+        return hashed.hexdigest()
+
+    @staticmethod
+    def get_clean_diff(diff):
         diff = diff.replace('\r', '')
 
         # For keeping a buffer of lines preceding @@ ... @@
         buflines = list()
-
-        phasher = hashlib.sha256()
+        difflines = ''
 
         # Used for counting where we are in the patch
         pp = 0
@@ -846,21 +885,34 @@ class LoreMessage:
                         break
                     addlines.append(bline)
                 if addlines:
-                    phasher.update(('\n'.join(reversed(addlines)) + '\n').encode('utf-8'))
+                    difflines += '\n'.join(reversed(addlines)) + '\n'
                 buflines = list()
                 # Feed this line to the hasher
-                phasher.update((line + '\n').encode('utf-8'))
+                difflines += line + '\n'
                 continue
             if pp > 0:
                 # Inside the patch
-                phasher.update((line + '\n').encode('utf-8'))
+                difflines += line + '\n'
                 if len(line) and line[0] == '-':
                     continue
                 pp -= 1
                 continue
             # Not anything we recognize, so stick into buflines
             buflines.append(line)
+        return difflines
 
+    @staticmethod
+    def get_patch_hash(diff):
+        # The aim is to represent the patch as if you did the following:
+        # git diff HEAD~.. | dos2unix | sha256sum
+        #
+        # This subroutine removes anything at the beginning of diff data, like
+        # diffstat or any other auxiliary data, and anything trailing at the end
+        # XXX: This currently doesn't work for git binary patches
+        #
+        diff = LoreMessage.get_clean_diff(diff)
+        phasher = hashlib.sha256()
+        phasher.update(diff.encode('utf-8'))
         return phasher.hexdigest()
 
     def load_hashes(self):
@@ -896,6 +948,7 @@ class LoreMessage:
             patch = pfh.read()
             if len(patch.strip()):
                 p = LoreMessage.get_patch_hash(patch)
+                self.pwhash = LoreMessage.get_patchwork_hash(patch)
         os.unlink(patch_out[1])
 
         if i and m and p:
diff --git a/b4/mbox.py b/b4/mbox.py
index 8c4ddec..f64ed6f 100644
--- a/b4/mbox.py
+++ b/b4/mbox.py
@@ -161,7 +161,7 @@ def thanks_record_am(lser):
         if pmsg.attestation is None:
             logger.debug('Unable to get hashes for all patches, not tracking for thanks')
             return
-        patches.append((pmsg.subject, pmsg.attestation.p))
+        patches.append((pmsg.subject, pmsg.pwhash))
 
     lmsg = lser.patches[0]
     if lmsg is None:
diff --git a/b4/ty.py b/b4/ty.py
index b1861e4..888da9b 100644
--- a/b4/ty.py
+++ b/b4/ty.py
@@ -158,7 +158,8 @@ def get_all_commits(gitdir, branch, since='1.week', committer=None):
     for line in lines:
         commit_id, subject = line.split(maxsplit=1)
         ecode, out = git_get_rev_diff(gitdir, commit_id)
-        pwhash = b4.LoreMessage.get_patch_hash(out)
+        pwhash = b4.LoreMessage.get_patchwork_hash(out)
+        logger.debug('phash=%s', pwhash)
         MY_COMMITS[pwhash] = (commit_id, subject)
 
     return MY_COMMITS
@@ -171,6 +172,7 @@ def auto_locate_series(gitdir, jsondata, branch, since='1.week', loose=False):
     # We need to find all of them in the commits
     found = list()
     for patch in jsondata['patches']:
+        logger.debug('Checking %s', patch)
         if patch[1] in patchids:
             logger.debug('Found: %s', patch[0])
             found.append(commits[patch[1]])
@@ -182,6 +184,7 @@ def auto_locate_series(gitdir, jsondata, branch, since='1.week', loose=False):
                     break
 
     if len(found) == len(jsondata['patches']):
+        logger.debug('Found all the patches')
         return found
 
     return None
@@ -376,21 +379,24 @@ def send_selected(cmdargs):
         logger.info('Nothing to do')
         sys.exit(0)
 
-    listing = list()
-    for num in cmdargs.send:
-        try:
-            index = int(num) - 1
-            listing.append(tracked[index])
-        except ValueError:
-            logger.critical('Please provide the number of the message')
-            logger.info('---')
-            write_tracked(tracked)
-            sys.exit(1)
-        except IndexError:
-            logger.critical('Invalid index: %s', num)
-            logger.info('---')
-            write_tracked(tracked)
-            sys.exit(1)
+    if 'all' in cmdargs.discard:
+        listing = tracked
+    else:
+        listing = list()
+        for num in cmdargs.send:
+            try:
+                index = int(num) - 1
+                listing.append(tracked[index])
+            except ValueError:
+                logger.critical('Please provide the number of the message')
+                logger.info('---')
+                write_tracked(tracked)
+                sys.exit(1)
+            except IndexError:
+                logger.critical('Invalid index: %s', num)
+                logger.info('---')
+                write_tracked(tracked)
+                sys.exit(1)
     if not len(listing):
         logger.info('Nothing to do')
         sys.exit(0)
@@ -453,12 +459,13 @@ def get_wanted_branch(cmdargs):
     gitdir = cmdargs.gitdir
     if not cmdargs.branch:
         # Find out our current branch
-        gitargs = ['branch', '--show-current']
+        gitargs = ['rev-parse', '--abbrev-ref', 'HEAD']
         ecode, out = b4.git_run_command(gitdir, gitargs)
         if ecode > 0:
             logger.critical('Not able to get current branch (git branch --show-current)')
             sys.exit(1)
         wantbranch = out.strip()
+        logger.debug('will check branch=%s', wantbranch)
     else:
         # Make sure it's a real branch
         gitargs = ['branch', '--format=%(refname:short)', '--list']
author	Konstantin Ryabitsev <konstantin@linuxfoundation.org>	2020-04-15 12:42:21 -0400
committer	Konstantin Ryabitsev <konstantin@linuxfoundation.org>	2020-04-15 12:42:21 -0400
commit	28b6825da28519b6ca0e8b3bce57700a1120ca9a (patch)
tree	7fb99d73ba8103752259eacd93920e343c2e38eb
parent	4be04f0af9a22087d0052838e013ef2de1ce2ac3 (diff)
download	b4-28b6825da28519b6ca0e8b3bce57700a1120ca9a.tar.gz