scripts/regression-hotspots.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72

#!/usr/bin/env python3
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
#

import sys
import re
import sh
from urllib.request import urlopen, URLError
from io import BytesIO

def get_fixed_regression_bugs():
    url = 'https://bugs.libreoffice.org/buglist.cgi?bug_status=UNCONFIRMED&bug_status=NEW&bug_status=ASSIGNED&bug_status=REOPENED&bug_status=RESOLVED&bug_status=VERIFIED&bug_status=CLOSED&bug_status=NEEDINFO&bug_status=PLEASETEST&columnlist=&keywords=regression%2C%20&keywords_type=allwords&limit=0&list_id=354018&product=LibreOffice&query_format=advanced&resolution=FIXED&ctype=csv&human=0'
    try:
        resp = urlopen(url)
    except URLError:
        sys.stderr.write('Error fetching {}'.format(url))
        sys.exit(1)
    bug_ids=[]
    for line in [raw.decode('utf-8').strip('\n') for raw in BytesIO(resp.read())][1:]:
        bug_ids.append(int(line))
    return bug_ids

def get_dir_counts(file_counts, level):
    dir_counts = {}
    for (filename, count) in file_counts.items():
        fileparts = filename.split('/')
        if len(fileparts) > level:
            dirpart = '/'.join(fileparts[:level])
            if dirpart in dir_counts:
                dir_counts[dirpart]+=count
            else:
                dir_counts[dirpart]=count
    return dir_counts

def print_counts(counts):
    printorder = reversed(sorted((count, name) for (name, count) in counts.items()))
    for count in printorder:
        print('%5d %s' % (count[0], count[1]))

if __name__ == '__main__':
    file_counts = {}
    statregex = re.compile('^ ([^ ]+) \|')
    fixed_regression_ids = get_fixed_regression_bugs()
    sys.stderr.write('found %d fixed regressions: %s\n' % (len(fixed_regression_ids), fixed_regression_ids))
    for bug_id in fixed_regression_ids:
        sys.stderr.write('working on bug %d\n' % bug_id)
        # FIXME: use --numstat instead, which does not abbreviate filenames
        logstat = sh.git('--no-pager', 'log', '--grep', '[fdo|tdf]#%d' % bug_id, '--stat')
        for line in logstat:
            match = statregex.search(str(line))
            if match and match.group(1):
                filename = match.group(1)
                sys.stderr.write('regression fix touched file: %s\n' % filename)
                if filename in file_counts:
                    file_counts[filename]+=1
                else:
                    file_counts[filename]=1
    print('top level dirs:')
    print_counts(get_dir_counts(file_counts, 1))
    print('\nsecond level dirs:')
    print_counts(get_dir_counts(file_counts, 2))
    print('\nthird level dirs:')
    print_counts(get_dir_counts(file_counts, 3))
    print('\nfourth level dirs:')
    print_counts(get_dir_counts(file_counts, 4))
    print('\nfiles:')
    print_counts(file_counts)