#!/usr/bin/env python3
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at https://mozilla.org/MPL/2.0/.
#
# Uses https://github.com/gitpython-developers/GitPython
# Results published in https://wiki.documentfoundation.org/Development/RegressionHotspots
# Run in LibreOffice core directory directing output to a text file. Shouldn't take more than a minute.

import sys
import re
import git
import ssl

from urllib.request import urlopen, URLError
from io import BytesIO

def get_fixed_regression_bugs():
    """Fetch the IDs of fixed regression bugs from the TDF Bugzilla.

    The query URL asks for LibreOffice bugs with resolution FIXED and the
    keyword "regression", delivered as CSV (``ctype=csv``). The first line of
    the response is treated as a header and skipped; every remaining
    non-blank line is expected to hold a single integer bug ID.

    Returns:
        A list of bug IDs as ints, in the order the server returned them.

    Raises:
        ValueError: if a non-blank data line is not an integer.

    Exits the process with status 1 (after writing a message to stderr) if
    the request itself fails.
    """
    url = 'https://bugs.documentfoundation.org/buglist.cgi?f1=component&f2=component&f3=component&f4=component&f5=component&n1=1&n2=1&n3=1&n4=1&n5=1&o1=equals&o2=equals&o3=equals&o4=equals&o5=equals&v1=ci-infra&v2=deletionRequest&v3=FirefoxOS%20app&v4=SI-GUI&v5=WWW&columnlist=&keywords=regression%2C%20&keywords_type=allwords&limit=0&product=LibreOffice&resolution=FIXED&ctype=csv&human=0'

    # NOTE(review): hostname and certificate verification are switched off
    # here, so the response is not authenticated. Kept as-is to preserve
    # existing behaviour - confirm whether this is still required.
    ctx = ssl.create_default_context()
    ctx.check_hostname = False
    ctx.verify_mode = ssl.CERT_NONE

    try:
        # the context manager makes sure the connection is closed again
        with urlopen(url, context=ctx) as resp:
            payload = resp.read()
    except URLError as err:
        # report the reason carried by the caught exception instance
        sys.stderr.write('Error fetching {} -> {}\n'.format(url, err.reason))
        sys.exit(1)

    # drop the CSV header line, then ignore blank lines (e.g. the empty
    # remainder after a trailing newline) instead of crashing in int()
    rows = payload.decode('utf-8').split('\n')[1:]
    return [int(row) for row in rows if row.strip()]

def get_dir_counts(file_counts, level):
    """Aggregate per-file counts into per-directory counts.

    Each key of *file_counts* is split on '/'. Paths with more than *level*
    components contribute their count to the directory formed by their first
    *level* components; shorter paths are left out.

    Returns:
        A new dict mapping directory prefix to the summed count.
    """
    totals = {}
    for path, hits in file_counts.items():
        segments = path.split('/')
        # the path must reach below the requested directory depth
        if len(segments) <= level:
            continue
        prefix = '/'.join(segments[:level])
        totals[prefix] = totals.get(prefix, 0) + hits
    return totals

def print_counts(counts):
    """Print the entries of *counts* with a count of at least 10.

    Entries are written to stdout in descending order of count (equal counts
    are ordered by descending name), one per line as a right-aligned count
    followed by the name, wrapped in a ``<pre class="clamped">`` element.
    """
    ranked = sorted(((hits, label) for label, hits in counts.items()), reverse=True)
    # the wiki page clamps this element and offers a button to expand it
    print('<pre class="clamped">')
    for hits, label in ranked:
        # only the hottest spots are of interest, so counts below 10 are dropped
        if hits >= 10:
            print('%5d %s' % (hits, label))
    print('</pre>')

if __name__ == '__main__':
    # Script entry point: count how often each file (and each directory at
    # several depths) was touched by commits that fixed a regression bug, and
    # print the results to stdout. Progress information goes to stderr.
    file_counts = {}
    # paths matching this pattern are left out of the statistics
    excluderegex = re.compile(r'qa/|qadevOOo/|icon-themes/|extras/source/gallery/|extras/source/palettes/|extras/source/templates/|extras/source/truetype/|\.git-hooks|helpcontent2|dictionaries|translations|download\.lst|\.png|\.patch')
    fixed_regression_ids = get_fixed_regression_bugs()
    sys.stderr.write('found %d fixed regressions: %s\n' % (len(fixed_regression_ids), fixed_regression_ids))
    # membership is tested once per commit below, so use a set instead of
    # scanning the whole list every time
    fixed_regression_id_set = set(fixed_regression_ids)

    # one command wrapper for the current directory, reused for every git call
    repo = git.Git('.')

    # build a dictionary of hashes and bug IDs from all commits targeting a report in FDO/TDF Bugzilla
    # (first commit with fdo# aka freedesktop.org is from 1 Oct 2010)
    # sometimes people accidentally leave out the #, so take that into account in the regexes
    gitbugs = {}
    buglog = repo.execute(['git', 'log', '--grep=(fdo|tdf)#*', '-E', '--oneline', '--since=1.10.2010'])
    if buglog:
        for line in buglog.split('\n'):
            githash = line.partition(' ')[0]
            # the regex search will ignore any commits hit by the grep where fdo|tdf# occurred below
            # the first line - this is desirable as the referred bug ID should appear in the subject line
            bugid = re.search(r"(?:fdo|tdf)#*([0-9]+)", line)
            if bugid:
                gitbugs[githash] = int(bugid.group(1))

    # create a list of bug fix hashes by filtering with the bug IDs we got from the Bugzilla query
    fix_hashes = [commit for commit, bug in gitbugs.items() if bug in fixed_regression_id_set]

    for githash in fix_hashes:
        lognames = repo.execute(['git', 'show', githash, '--pretty=tformat:', '--name-only'])
        if not lognames:
            continue
        for filename in lognames.split('\n'):
            # an empty line is not a file name; excluded paths are not counted either
            if not filename or excluderegex.search(filename):
                continue
            sys.stderr.write('regression fix touched file: %s\n' % filename)
            file_counts[filename] = file_counts.get(filename, 0) + 1

    print('=== files ===\n')
    print_counts(file_counts)
    # same report aggregated per directory, from the deepest level to the top
    for title, level in (('fourth', 4), ('third', 3), ('second', 2), ('top', 1)):
        print('\n=== %s level dirs ===\n' % title)
        print_counts(get_dir_counts(file_counts, level))