Source code for dak.stats

#! /usr/bin/env python3

""" Various statistical pr0nography fun and games """
# Copyright (C) 2000, 2001, 2002, 2003, 2006  James Troup <james@nocrew.org>
# Copyright (C) 2013  Luca Falavigna <dktrkranz@debian.org>

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

################################################################################

# <aj>    can we change the standards instead?
# <neuro> standards?
# <aj>    whatever we're not conforming to
# <aj>    if there's no written standard, why don't we declare linux as
#         the defacto standard
# <aj>    go us!

# [aj's attempt to avoid ABI changes for released architecture(s)]

################################################################################

import subprocess
import sys
import tempfile
import apt_pkg

from datetime import datetime
from email.utils import mktime_tz, parsedate_tz
from mailbox import mbox
from os import listdir
from os.path import isfile, join, splitext
from re import findall, DOTALL, MULTILINE
from sys import stderr
from yaml import safe_load, safe_dump

from daklib import utils
from daklib.dbconn import DBConn, get_suite_architectures, Suite, Architecture

################################################################################

Cnf = None

stats = {}
users = {}
buffer = 0
FORMAT_SWITCH = '2009-08'
blacklisted = ('dak', 'katie')

NEW = (r'^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)'
       r'\|(Moving to new|ACCEPT-TO-NEW)')
new_ACTIONS = r'^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]'
old_ACTIONS = (r'(?:lisa|process-new)\|program start\|(.*?)\|'
               r'(?:lisa|process-new)\|program end')
old_ACTION = r'^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|'
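# Illustrative log lines these patterns are meant to match. These are
# hypothetical examples constructed from the regexes above, not excerpts
# from real logs:
#
#   NEW:         20090812153045|process-unchecked|Moving to new|foo_1.0_amd64.changes
#   new_ACTIONS: 20091001120000|process-new|jdoe|NEW ACCEPT: foo_1.0_amd64.changes
#   old_ACTION:  20090101101500|lisa|Accepting changes|foo_1.0_amd64.changes
#
# old_ACTIONS captures everything between a 'program start' and a
# 'program end' line of lisa/process-new, so old_ACTION is applied
# once per captured batch.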

################################################################################


def usage(exit_code=0):
    print("""Usage: dak stats MODE
Print various stats.

  -h, --help                show this help and exit.

The following MODEs are available:

  arch-space    - displays space used by each architecture
  pkg-nums      - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new           - stores stats about the NEW queue
""")
    sys.exit(exit_code)
################################################################################
def per_arch_space_use():
    session = DBConn().session()
    q = session.execute("""
        SELECT a.arch_string as Architecture, sum(f.size) AS sum
          FROM files f, binaries b, architecture a
         WHERE a.id=b.architecture AND f.id=b.file
         GROUP BY a.arch_string ORDER BY sum""").fetchall()
    for j in q:
        print("%-15.15s %s" % (j[0], j[1]))
    print()
    q = session.execute("SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'").fetchall()
    print("%-15.15s %s" % ("Source", q[0][0]))
################################################################################
def daily_install_stats():
    stats = {}
    f = open("2001-11")
    for line in f.readlines():
        split = line.strip().split('|')

        program = split[1]
        if program != "katie" and program != "process-accepted":
            continue

        action = split[2]
        if action != "installing changes" and action != "installed":
            continue

        date = split[0][:8]
        if date not in stats:
            stats[date] = {}
            stats[date]["packages"] = 0
            stats[date]["size"] = 0.0

        if action == "installing changes":
            stats[date]["packages"] += 1
        elif action == "installed":
            stats[date]["size"] += float(split[5])

    dates = sorted(stats)
    for date in dates:
        packages = stats[date]["packages"]
        size = int(stats[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))
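# daily_install_stats() reads a plain log file named '2001-11' from the
# current directory. A hypothetical line consistent with the field indices
# used above (split[0] timestamp, split[1] program, split[2] action,
# split[5] size in bytes):
#
#   20011112093015|katie|installed|foo|1.0|123456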
################################################################################
def output_format(suite):
    output_suite = []
    for word in suite.split("-"):
        output_suite.append(word[0])
    return "-".join(output_suite)
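# Examples, following directly from the code above:
#   output_format("stable-proposed-updates")  ->  "s-p-u"
#   output_format("unstable")                 ->  "u"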

def number_of_packages():
    arches = {}
    arch_ids = {}
    suites = {}
    suite_ids = {}
    d = {}
    session = DBConn().session()

    # Build up suite mapping
    for i in session.query(Suite).all():
        suites[i.suite_id] = i.suite_name
        suite_ids[i.suite_name] = i.suite_id

    # Build up architecture mapping
    for i in session.query(Architecture).all():
        arches[i.arch_id] = i.arch_string
        arch_ids[i.arch_string] = i.arch_id

    # Pre-create the dictionary
    for suite_id in suites.keys():
        d[suite_id] = {}
        for arch_id in arches.keys():
            d[suite_id][arch_id] = 0

    # Get the raw data for binaries
    # Simulate 'GROUP BY suite, architecture' with a dictionary
    # XXX: Why don't we just get the DB to do this?
    for i in session.execute("""SELECT suite, architecture, COUNT(suite)
                                  FROM bin_associations
                             LEFT JOIN binaries ON bin = binaries.id
                              GROUP BY suite, architecture""").fetchall():
        d[i[0]][i[1]] = i[2]

    # Get the raw data for source
    arch_id = arch_ids["source"]
    for i in session.execute('SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite').fetchall():
        (suite_id, count) = i
        d[suite_id][arch_id] = d[suite_id][arch_id] + count

    ## Print the results
    # Setup
    suite_list = list(suites.values())
    suite_id_list = []
    suite_arches = {}
    for suite in suite_list:
        suite_id = suite_ids[suite]
        suite_arches[suite_id] = {}
        for arch in get_suite_architectures(suite):
            suite_arches[suite_id][arch.arch_string] = ""
        suite_id_list.append(suite_id)
    output_list = [output_format(i) for i in suite_list]
    longest_suite = max(len(suite) for suite in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)

    # Header
    output = (" " * longest_arch) + " |"
    for suite in output_list:
        output = output + suite.center(longest_suite) + " |"
    output = output + "\n" + (len(output) * "-") + "\n"

    # per-arch data
    for arch in arch_list:
        arch_id = arch_ids[arch]
        output = output + arch.center(longest_arch) + " |"
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                count = "%d" % d[suite_id][arch_id]
            else:
                count = "-"
            output = output + count.rjust(longest_suite) + " |"
        output = output + "\n"
    print(output)
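# Rough sketch of the table printed above (counts invented; '-' marks an
# architecture that is not configured for that suite; suite names are
# shortened by output_format()):
#
#          |   u   | s-p-u |
#   ------------------------
#    amd64 |   101 |     7 |
#   source |    99 |     5 |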
################################################################################
def parse_new_uploads(data):
    global stats
    latest_timestamp = stats['timestamp']
    for entry in findall(NEW, data, MULTILINE):
        timestamp = entry[0]
        if stats['timestamp'] >= timestamp:
            continue
        date = parse_timestamp(timestamp)
        if date not in stats:
            stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                           'REJECT': 0, 'PROD': 0}, 'members': {}}
        stats[date]['stats']['NEW'] += 1
        stats['history']['stats']['NEW'] += 1
        latest_timestamp = timestamp
    return latest_timestamp

def parse_actions(data, logdate):
    global stats
    latest_timestamp = stats['timestamp']
    if logdate <= FORMAT_SWITCH:
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith('Accepting'):
                    action = 'ACCEPT'
                elif action.startswith('rejected'):
                    action = 'REJECT'
                timestamp = entry[0]
                if stats['timestamp'] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                   'REJECT': 0, 'PROD': 0}, 'members': {}}
                stats[date]['stats'][action] += 1
                stats['history']['stats'][action] += 1
                if who not in stats[date]['members']:
                    stats[date]['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
                                                   'PROD': 0}
                stats[date]['members'][who][action] += 1
                if who not in stats['history']['members']:
                    stats['history']['members'][who] = {'ACCEPT': 0,
                                                        'REJECT': 0, 'PROD': 0}
                stats['history']['members'][who][action] += 1
                latest_timestamp = timestamp
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats['timestamp'] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                               'REJECT': 0, 'PROD': 0}, 'members': {}}
            member = entry[1]
            if member in blacklisted:
                continue
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                  'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0,
                                                       'REJECT': 0, 'PROD': 0}
            stats[date]['stats'][action] += 1
            stats[date]['members'][member][action] += 1
            stats['history']['stats'][action] += 1
            stats['history']['members'][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp

def parse_prod(logdate):
    global stats
    global users
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()['Dir::TempPath']) as tmpfile:
        with open(mailarchive, 'rb') as fh:
            subprocess.check_call(['xzcat'], stdin=fh, stdout=tmpfile)
        for message in mbox(tmpfile.name):
            if (message['subject']
                    and message['subject'].startswith('Comments regarding')):
                try:
                    member = users[' '.join(message['From'].split()[:-1])]
                except KeyError:
                    continue
                ts = mktime_tz(parsedate_tz(message['date']))
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                   'REJECT': 0, 'PROD': 0}, 'members': {}}
                if member not in stats[date]['members']:
                    stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                      'PROD': 0}
                if member not in stats['history']['members']:
                    stats['history']['members'][member] = {'ACCEPT': 0,
                                                           'REJECT': 0, 'PROD': 0}
                stats[date]['stats']['PROD'] += 1
                stats[date]['members'][member]['PROD'] += 1
                stats['history']['stats']['PROD'] += 1
                stats['history']['members'][member]['PROD'] += 1
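# Note: for a logdate of '2009-08' the maildate computed above is '0908',
# so the archive read is <Dir::Base>/mail/archive/mail-0908.xz.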

def parse_timestamp(timestamp):
    y = int(timestamp[:4])
    m = int(timestamp[4:6])
    return '%d-%02d' % (y, m)
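# Example: parse_timestamp('20090812153045') returns '2009-08'.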

def new_stats(logdir, yaml):
    global Cnf
    global stats
    try:
        with open(yaml, 'r') as fd:
            stats = safe_load(fd)
    except OSError:
        pass
    if not stats:
        stats = {'history': {'stats': {'NEW': 0, 'ACCEPT': 0,
                 'REJECT': 0, 'PROD': 0}, 'members': {}},
                 'timestamp': '19700101000000'}
    latest_timestamp = stats['timestamp']
    for fn in sorted(listdir(logdir)):
        if fn == 'current':
            continue
        log = splitext(fn)[0]
        if log < parse_timestamp(stats['timestamp']):
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            if fn.endswith('.bz2'):
                # This hack is required because python2 does not support
                # multi-stream files (http://bugs.python.org/issue1625)
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['bzcat'], stdin=fh)
            elif fn.endswith('.xz'):
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['xzcat'], stdin=fh)
            elif fn.endswith('.zst'):
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['zstdcat'], stdin=fh)
            else:
                with open(logfile, 'rb') as fd:
                    data = fd.read()
            try:
                data = data.decode()
            except UnicodeDecodeError:
                data = data.decode('latin1')
            ts = parse_new_uploads(data)
            if ts > latest_timestamp:
                latest_timestamp = ts
            ts = parse_actions(data, log)
            if ts > latest_timestamp:
                latest_timestamp = ts
            stderr.write('.')
            stderr.flush()
    stderr.write('\n')
    stderr.flush()
    stats['timestamp'] = latest_timestamp
    with open(yaml, 'w') as fd:
        safe_dump(stats, fd)
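# Shape of the YAML document that new_stats() persists, as built by the
# parse_* functions above (the member name and the counts are invented):
#
#   history:
#     stats: {NEW: 1234, ACCEPT: 1100, REJECT: 120, PROD: 14}
#     members:
#       jdoe: {ACCEPT: 10, REJECT: 2, PROD: 1}
#   timestamp: '20230101000000'
#   2009-08:
#     stats: {NEW: 12, ACCEPT: 10, REJECT: 2, PROD: 0}
#     members:
#       jdoe: {ACCEPT: 3, REJECT: 1, PROD: 0}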
################################################################################
def main():
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [('h', "help", "Stats::Options::Help")]
    for i in ["help"]:
        key = "Stats::Options::%s" % i
        if key not in Cnf:
            Cnf[key] = ""

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    Options = Cnf.subtree("Stats::Options")
    if Options["Help"]:
        usage()

    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    if mode == "arch-space":
        per_arch_space_use()
    elif mode == "pkg-nums":
        number_of_packages()
    elif mode == "daily-install":
        daily_install_stats()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)
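# Typical invocations (the output path for 'new' is just an example):
#   dak stats pkg-nums
#   dak stats arch-space
#   dak stats new /srv/dak/new-stats.yaml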
################################################################################


if __name__ == '__main__':
    main()