From dd6997138e01357d95c0e0212d9af33b977fd012 Mon Sep 17 00:00:00 2001
From: Konstantin Ryabitsev
Date: Tue, 17 Aug 2021 17:00:02 -0400
Subject: Filter duplicate results when querying new series

When querying against /all/, we may get multiple hits for the same
subject, so deal with it early.

Signed-off-by: Konstantin Ryabitsev
---
 b4/mbox.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/b4/mbox.py b/b4/mbox.py
index e11d2ae..bf6618b 100644
--- a/b4/mbox.py
+++ b/b4/mbox.py
@@ -418,6 +418,7 @@ def get_extra_series(msgs: list, direction: int = 1, wantvers: Optional[int] = N
         t_mbx_url = '%s/%s/t.mbox.gz' % (listarc.rstrip('/'), nt_msgid)
         potentials = b4.get_pi_thread_by_url(t_mbx_url, nocache=nocache)
         if potentials:
+            potentials = b4.get_strict_thread(potentials, nt_msgid)
             nt_msgs += potentials
             logger.info(' Added %s messages from that thread', len(potentials))
         else:
@@ -461,6 +462,7 @@ def get_extra_series(msgs: list, direction: int = 1, wantvers: Optional[int] = N
     resp.close()
     ns = {'atom': 'http://www.w3.org/2005/Atom'}
     entries = tree.findall('atom:entry', ns)
+    seen_urls = set()

     for entry in entries:
         title = entry.find('atom:title', ns).text
@@ -497,6 +499,9 @@ def get_extra_series(msgs: list, direction: int = 1, wantvers: Optional[int] = N
             logger.debug('No idea what this is: %s', title)
             continue
         t_mbx_url = '%st.mbox.gz' % link
+        if t_mbx_url in seen_urls:
+            continue
+        seen_urls.add(t_mbx_url)
         logger.info('New revision: %s', title)
         potentials = b4.get_pi_thread_by_url(t_mbx_url, nocache=nocache)
         if potentials:
--
cgit v1.2.3
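
Note (not part of the patch): the change above applies the common seen-set deduplication pattern, tracking each thread mbox URL in a set so that multiple /all/ feed entries resolving to the same thread are only fetched once. The following is a minimal, self-contained Python sketch of that pattern, not b4's actual code; collect_new_revisions() and fetch_thread() are hypothetical stand-ins for the Atom-feed loop and b4.get_pi_thread_by_url().

    def fetch_thread(t_mbx_url):
        # Hypothetical stand-in for b4.get_pi_thread_by_url(); a real
        # implementation would download and parse the t.mbox.gz archive.
        return [t_mbx_url]

    def collect_new_revisions(entry_links):
        # Deduplicate thread URLs before fetching: a query against /all/
        # can return several feed entries for the same subject, all of
        # which point at the same t.mbox.gz URL.
        seen_urls = set()
        collected = []
        for link in entry_links:
            t_mbx_url = '%st.mbox.gz' % link
            if t_mbx_url in seen_urls:
                continue
            seen_urls.add(t_mbx_url)
            collected.extend(fetch_thread(t_mbx_url))
        return collected

    # Example: two entries for the same thread trigger only one fetch.
    print(collect_new_revisions([
        'https://lore.example.org/list/msgid-1/',
        'https://lore.example.org/list/msgid-1/',
    ]))

A set is used rather than a list because membership checks are O(1), and the URL is recorded before fetching so that even a failed fetch is not retried within the same pass.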