Source code for dak.stats

#! /usr/bin/env python3

""" Various statistical pr0nography fun and games """
# Copyright (C) 2000, 2001, 2002, 2003, 2006  James Troup <james@nocrew.org>
# Copyright (C) 2013  Luca Falavigna <dktrkranz@debian.org>

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

################################################################################

# <aj>    can we change the standards instead?
# <neuro> standards?
# <aj>    whatever we're not conforming to
# <aj>    if there's no written standard, why don't we declare linux as
#         the defacto standard
# <aj>    go us!

# [aj's attempt to avoid ABI changes for released architecture(s)]

################################################################################

import subprocess
import sys
import tempfile
import apt_pkg

from datetime import datetime
from email.utils import mktime_tz, parsedate_tz
from mailbox import mbox
from os import listdir
from os.path import isfile, join, splitext
from re import findall, DOTALL, MULTILINE
from sys import stderr
from yaml import safe_load, safe_dump

from daklib import utils
from daklib.dbconn import DBConn, get_suite_architectures, Suite, Architecture

################################################################################

Cnf = None

stats = {}
users = {}
buffer = 0
FORMAT_SWITCH = '2009-08'
blacklisted = ('dak', 'katie')

NEW = (r'^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)'
       r'\|(Moving to new|ACCEPT-TO-NEW)')
new_ACTIONS = r'^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]'
old_ACTIONS = (r'(?:lisa|process-new)\|program start\|(.*?)\|'
               r'(?:lisa|process-new)\|program end')
old_ACTION = r'^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|'
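# Illustrative log lines these patterns are meant to match. These are
# hypothetical examples constructed from the regexes above, not excerpts
# from real logs:
#
#   NEW:         20090812153045|process-unchecked|Moving to new|foo_1.0_amd64.changes
#   new_ACTIONS: 20091001120000|process-new|jdoe|NEW ACCEPT: foo_1.0_amd64.changes
#   old_ACTION:  20090101101500|lisa|Accepting changes|foo_1.0_amd64.changes
#
# old_ACTIONS captures everything between a 'program start' and a
# 'program end' line of lisa/process-new, so old_ACTION is applied
# once per captured batch.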

################################################################################


def usage(exit_code=0):
    print("""Usage: dak stats MODE
Print various stats.

  -h, --help                show this help and exit.

The following MODEs are available:

  arch-space    - displays space used by each architecture
  pkg-nums      - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new           - stores stats about the NEW queue
""")
    sys.exit(exit_code)
################################################################################
def per_arch_space_use():
    session = DBConn().session()
    q = session.execute("""
        SELECT a.arch_string as Architecture, sum(f.size) AS sum
          FROM files f, binaries b, architecture a
         WHERE a.id=b.architecture AND f.id=b.file
         GROUP BY a.arch_string ORDER BY sum""").fetchall()
    for j in q:
        print("%-15.15s %s" % (j[0], j[1]))
    print()
    q = session.execute("SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'").fetchall()
    print("%-15.15s %s" % ("Source", q[0][0]))
################################################################################
def daily_install_stats():
    stats = {}
    f = open("2001-11")
    for line in f.readlines():
        split = line.strip().split('|')

        program = split[1]
        if program != "katie" and program != "process-accepted":
            continue

        action = split[2]
        if action != "installing changes" and action != "installed":
            continue

        date = split[0][:8]
        if date not in stats:
            stats[date] = {}
            stats[date]["packages"] = 0
            stats[date]["size"] = 0.0

        if action == "installing changes":
            stats[date]["packages"] += 1
        elif action == "installed":
            stats[date]["size"] += float(split[5])

    dates = sorted(stats)
    for date in dates:
        packages = stats[date]["packages"]
        size = int(stats[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))
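# daily_install_stats() reads a plain log file named '2001-11' from the
# current directory. A hypothetical line consistent with the field indices
# used above (split[0] timestamp, split[1] program, split[2] action,
# split[5] size in bytes):
#
#   20011112093015|katie|installed|foo|1.0|123456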
################################################################################
def output_format(suite):
    output_suite = []
    for word in suite.split("-"):
        output_suite.append(word[0])
    return "-".join(output_suite)
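# Examples, following directly from the code above:
#   output_format("stable-proposed-updates")  ->  "s-p-u"
#   output_format("unstable")                 ->  "u"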

def number_of_packages():
    arches = {}
    arch_ids = {}
    suites = {}
    suite_ids = {}
    d = {}
    session = DBConn().session()

    # Build up suite mapping
    for i in session.query(Suite).all():
        suites[i.suite_id] = i.suite_name
        suite_ids[i.suite_name] = i.suite_id

    # Build up architecture mapping
    for i in session.query(Architecture).all():
        arches[i.arch_id] = i.arch_string
        arch_ids[i.arch_string] = i.arch_id

    # Pre-create the dictionary
    for suite_id in suites.keys():
        d[suite_id] = {}
        for arch_id in arches.keys():
            d[suite_id][arch_id] = 0

    # Get the raw data for binaries
    # Simulate 'GROUP BY suite, architecture' with a dictionary
    # XXX: Why don't we just get the DB to do this?
    for i in session.execute("""SELECT suite, architecture, COUNT(suite)
                                  FROM bin_associations
                             LEFT JOIN binaries ON bin = binaries.id
                              GROUP BY suite, architecture""").fetchall():
        d[i[0]][i[1]] = i[2]

    # Get the raw data for source
    arch_id = arch_ids["source"]
    for i in session.execute('SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite').fetchall():
        (suite_id, count) = i
        d[suite_id][arch_id] = d[suite_id][arch_id] + count

    ## Print the results
    # Setup
    suite_list = list(suites.values())
    suite_id_list = []
    suite_arches = {}
    for suite in suite_list:
        suite_id = suite_ids[suite]
        suite_arches[suite_id] = {}
        for arch in get_suite_architectures(suite):
            suite_arches[suite_id][arch.arch_string] = ""
        suite_id_list.append(suite_id)
    output_list = [output_format(i) for i in suite_list]
    longest_suite = max(len(suite) for suite in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)

    # Header
    output = (" " * longest_arch) + " |"
    for suite in output_list:
        output = output + suite.center(longest_suite) + " |"
    output = output + "\n" + (len(output) * "-") + "\n"

    # per-arch data
    for arch in arch_list:
        arch_id = arch_ids[arch]
        output = output + arch.center(longest_arch) + " |"
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                count = "%d" % d[suite_id][arch_id]
            else:
                count = "-"
            output = output + count.rjust(longest_suite) + " |"
        output = output + "\n"
    print(output)
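# Rough sketch of the table printed above (counts invented; '-' marks an
# architecture that is not configured for that suite; suite names are
# shortened by output_format()):
#
#          |   u   | s-p-u |
#   ------------------------
#    amd64 |   101 |     7 |
#   source |    99 |     5 |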
################################################################################
def parse_new_uploads(data):
    global stats
    latest_timestamp = stats['timestamp']
    for entry in findall(NEW, data, MULTILINE):
        timestamp = entry[0]
        if stats['timestamp'] >= timestamp:
            continue
        date = parse_timestamp(timestamp)
        if date not in stats:
            stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                           'REJECT': 0, 'PROD': 0}, 'members': {}}
        stats[date]['stats']['NEW'] += 1
        stats['history']['stats']['NEW'] += 1
        latest_timestamp = timestamp
    return latest_timestamp

def parse_actions(data, logdate):
    global stats
    latest_timestamp = stats['timestamp']
    if logdate <= FORMAT_SWITCH:
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith('Accepting'):
                    action = 'ACCEPT'
                elif action.startswith('rejected'):
                    action = 'REJECT'
                timestamp = entry[0]
                if stats['timestamp'] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                   'REJECT': 0, 'PROD': 0}, 'members': {}}
                stats[date]['stats'][action] += 1
                stats['history']['stats'][action] += 1
                if who not in stats[date]['members']:
                    stats[date]['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
                                                   'PROD': 0}
                stats[date]['members'][who][action] += 1
                if who not in stats['history']['members']:
                    stats['history']['members'][who] = {'ACCEPT': 0,
                                                        'REJECT': 0, 'PROD': 0}
                stats['history']['members'][who][action] += 1
                latest_timestamp = timestamp
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats['timestamp'] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                               'REJECT': 0, 'PROD': 0}, 'members': {}}
            member = entry[1]
            if member in blacklisted:
                continue
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                  'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0,
                                                       'REJECT': 0, 'PROD': 0}
            stats[date]['stats'][action] += 1
            stats[date]['members'][member][action] += 1
            stats['history']['stats'][action] += 1
            stats['history']['members'][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp

def parse_prod(logdate):
    global stats
    global users
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()['Dir::TempPath']) as tmpfile:
        with open(mailarchive, 'rb') as fh:
            subprocess.check_call(['xzcat'], stdin=fh, stdout=tmpfile)
        for message in mbox(tmpfile.name):
            if (message['subject']
                    and message['subject'].startswith('Comments regarding')):
                try:
                    member = users[' '.join(message['From'].split()[:-1])]
                except KeyError:
                    continue
                ts = mktime_tz(parsedate_tz(message['date']))
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                   'REJECT': 0, 'PROD': 0}, 'members': {}}
                if member not in stats[date]['members']:
                    stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                      'PROD': 0}
                if member not in stats['history']['members']:
                    stats['history']['members'][member] = {'ACCEPT': 0,
                                                           'REJECT': 0, 'PROD': 0}
                stats[date]['stats']['PROD'] += 1
                stats[date]['members'][member]['PROD'] += 1
                stats['history']['stats']['PROD'] += 1
                stats['history']['members'][member]['PROD'] += 1
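# Note: for a logdate of '2009-08' the maildate computed above is '0908',
# so the archive read is <Dir::Base>/mail/archive/mail-0908.xz.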

def parse_timestamp(timestamp):
    y = int(timestamp[:4])
    m = int(timestamp[4:6])
    return '%d-%02d' % (y, m)
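# Example: parse_timestamp('20090812153045') returns '2009-08'.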

def new_stats(logdir, yaml):
    global Cnf
    global stats
    try:
        with open(yaml, 'r') as fd:
            stats = safe_load(fd)
    except OSError:
        pass
    if not stats:
        stats = {'history': {'stats': {'NEW': 0, 'ACCEPT': 0,
                 'REJECT': 0, 'PROD': 0}, 'members': {}},
                 'timestamp': '19700101000000'}
    latest_timestamp = stats['timestamp']
    for fn in sorted(listdir(logdir)):
        if fn == 'current':
            continue
        log = splitext(fn)[0]
        if log < parse_timestamp(stats['timestamp']):
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            if fn.endswith('.bz2'):
                # This hack is required because python2 does not support
                # multi-stream files (http://bugs.python.org/issue1625)
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['bzcat'], stdin=fh)
            elif fn.endswith('.xz'):
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['xzcat'], stdin=fh)
            elif fn.endswith('.zst'):
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['zstdcat'], stdin=fh)
            else:
                with open(logfile, 'rb') as fd:
                    data = fd.read()
            try:
                data = data.decode()
            except UnicodeDecodeError:
                data = data.decode('latin1')
            ts = parse_new_uploads(data)
            if ts > latest_timestamp:
                latest_timestamp = ts
            ts = parse_actions(data, log)
            if ts > latest_timestamp:
                latest_timestamp = ts
            stderr.write('.')
            stderr.flush()
    stderr.write('\n')
    stderr.flush()
    stats['timestamp'] = latest_timestamp
    with open(yaml, 'w') as fd:
        safe_dump(stats, fd)
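# Shape of the YAML document that new_stats() persists, as built by the
# parse_* functions above (the member name and the counts are invented):
#
#   history:
#     stats: {NEW: 1234, ACCEPT: 1100, REJECT: 120, PROD: 14}
#     members:
#       jdoe: {ACCEPT: 10, REJECT: 2, PROD: 1}
#   timestamp: '20230101000000'
#   2009-08:
#     stats: {NEW: 12, ACCEPT: 10, REJECT: 2, PROD: 0}
#     members:
#       jdoe: {ACCEPT: 3, REJECT: 1, PROD: 0}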
################################################################################
def main():
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [('h', "help", "Stats::Options::Help")]
    for i in ["help"]:
        key = "Stats::Options::%s" % i
        if key not in Cnf:
            Cnf[key] = ""

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    Options = Cnf.subtree("Stats::Options")
    if Options["Help"]:
        usage()

    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    if mode == "arch-space":
        per_arch_space_use()
    elif mode == "pkg-nums":
        number_of_packages()
    elif mode == "daily-install":
        daily_install_stats()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)
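# Typical invocations (the output path for 'new' is just an example):
#   dak stats pkg-nums
#   dak stats arch-space
#   dak stats new /srv/dak/new-stats.yaml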
################################################################################


if __name__ == '__main__':
    main()