1#! /usr/bin/env python3
3"""Various statistical pr0nography fun and games"""
4# Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org>
5# Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org>
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21################################################################################
23# <aj> can we change the standards instead?
24# <neuro> standards?
25# <aj> whatever we're not conforming to
26# <aj> if there's no written standard, why don't we declare linux as
27# the defacto standard
28# <aj> go us!
30# [aj's attempt to avoid ABI changes for released architecture(s)]
32################################################################################
34import subprocess
35import sys
36import tempfile
37from datetime import datetime
38from email.utils import mktime_tz, parsedate_tz
39from mailbox import mbox
40from os import listdir
41from os.path import isfile, join, splitext
42from re import DOTALL, MULTILINE, findall
43from sys import stderr
45import apt_pkg
46from yaml import safe_dump, safe_load
48from daklib import utils
49from daklib.dbconn import Architecture, DBConn, Suite, get_suite_architectures
51################################################################################
# Global apt_pkg-style configuration, initialised in main().
Cnf = None

# Accumulated NEW-queue statistics; loaded from / dumped to YAML by new_stats().
stats = {}
# Mapping from a sender's real name to their login, filled from LDAP in main()
# for 'new' mode and consumed by parse_prod().
users = {}
# NOTE(review): appears unused in this file (and shadows the builtin `buffer`)
# — candidate for removal, confirm no external user first.
buffer = 0
# Month (YYYY-MM) at which the daily log format changed; parse_actions()
# selects the old or new parser by comparing log names against this.
FORMAT_SWITCH = "2009-08"
# Automated submitters whose actions must not count towards member stats.
blacklisted = ("dak", "katie")

# Log line recording a package arriving in the NEW queue.
NEW = (
    r"^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)"
    r"\|(Moving to new|ACCEPT-TO-NEW)"
)
# NEW-queue action in the new (post-FORMAT_SWITCH) log format:
# groups are (timestamp, member, action).
new_ACTIONS = r"^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]"
# One whole lisa/process-new session in the old log format (session body captured).
old_ACTIONS = (
    r"(?:lisa|process-new)\|program start\|(.*?)\|" r"(?:lisa|process-new)\|program end"
)
# A single accept/reject action inside an old-format session.
old_ACTION = r"^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|"
71################################################################################
def usage(exit_code=0):
    """Print the command's help text and terminate with *exit_code*."""
    help_text = """Usage: dak stats MODE
Print various stats.

  -h, --help                show this help and exit.

The following MODEs are available:

  arch-space    - displays space used by each architecture
  pkg-nums      - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new           - stores stats about the NEW queue
"""
    print(help_text)
    sys.exit(exit_code)
92################################################################################
def per_arch_space_use():
    """Print the total archive file size per architecture, then for source.

    One line per architecture (ordered by total size ascending), a blank
    line, and finally the combined size of source artefacts
    (.diff.gz/.tar.gz/.dsc files).
    """
    session = DBConn().session()
    arch_rows = session.execute(
        """
SELECT a.arch_string as Architecture, sum(f.size) AS sum
  FROM files f, binaries b, architecture a
  WHERE a.id=b.architecture AND f.id=b.file
  GROUP BY a.arch_string ORDER BY sum"""
    ).fetchall()
    for arch_name, total_size in arch_rows:
        print("%-15.15s %s" % (arch_name, total_size))
    print()
    source_rows = session.execute(
        "SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'"
    ).fetchall()
    print("%-15.15s %s" % ("Source", source_rows[0][0]))
113################################################################################
def daily_install_stats():
    """Print per-day install counts and sizes from the log file '2001-11'.

    Each log line looks like ``timestamp|program|action|...``; only
    katie/process-accepted lines with the actions "installing changes"
    (counted as one package) or "installed" (field 5 added to the day's
    byte total) are considered.  Output is one line per day, sorted:
    ``YYYYMMDD <packages> <size in MiB>``.
    """
    stats = {}
    # NOTE(review): the log filename is hard-coded — presumably a historical
    # one-off; confirm before relying on this mode.
    # Use a context manager so the file is closed (the original leaked the
    # handle), and iterate lazily instead of readlines() to avoid loading
    # the whole log into memory.
    with open("2001-11") as f:
        for line in f:
            split = line.strip().split("|")
            program = split[1]
            if program != "katie" and program != "process-accepted":
                continue
            action = split[2]
            if action != "installing changes" and action != "installed":
                continue
            date = split[0][:8]
            if date not in stats:
                stats[date] = {"packages": 0, "size": 0.0}
            if action == "installing changes":
                stats[date]["packages"] += 1
            elif action == "installed":
                stats[date]["size"] += float(split[5])

    for date in sorted(stats):
        packages = stats[date]["packages"]
        size = int(stats[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))
144################################################################################
def output_format(suite):
    """Abbreviate a suite name to the initials of its dash-separated parts.

    e.g. "testing-proposed-updates" -> "t-p-u".
    """
    return "-".join(word[0] for word in suite.split("-"))
def number_of_packages():
    """Print a table of package counts: architectures as rows, suites as
    columns (headers abbreviated via output_format()).

    A cell shows the number of binary packages for that suite/arch pair
    (with source counted under the pseudo-architecture "source"); arches
    not enabled for a suite are shown as "-".
    """
    arches = {}  # arch_id -> arch_string
    arch_ids = {}  # arch_string -> arch_id
    suites = {}  # suite_id -> suite_name
    suite_ids = {}  # suite_name -> suite_id
    d = {}  # d[suite_id][arch_id] -> package count
    session = DBConn().session()
    # Build up suite mapping
    for i in session.query(Suite).all():
        suites[i.suite_id] = i.suite_name
        suite_ids[i.suite_name] = i.suite_id
    # Build up architecture mapping
    for i in session.query(Architecture).all():
        arches[i.arch_id] = i.arch_string
        arch_ids[i.arch_string] = i.arch_id
    # Pre-create the dictionary so every suite/arch cell exists (zeroed)
    for suite_id in suites.keys():
        d[suite_id] = {}
        for arch_id in arches.keys():
            d[suite_id][arch_id] = 0
    # Get the raw data for binaries
    # Simulate 'GROUP by suite, architecture' with a dictionary
    # XXX: Why don't we just get the DB to do this?
    for i in session.execute(
        """SELECT suite, architecture, COUNT(suite)
                       FROM bin_associations
                       LEFT JOIN binaries ON bin = binaries.id
                       GROUP BY suite, architecture"""
    ).fetchall():
        d[i[0]][i[1]] = i[2]
    # Get the raw data for source, folded into the "source" pseudo-arch
    arch_id = arch_ids["source"]
    for i in session.execute(
        "SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite"
    ).fetchall():
        (suite_id, count) = i
        d[suite_id][arch_id] = d[suite_id][arch_id] + count
    ## Print the results
    # Setup: remember which arches each suite actually carries, so missing
    # combinations can be rendered as "-" rather than 0.
    suite_list = list(suites.values())
    suite_id_list = []
    suite_arches = {}
    for suite in suite_list:
        suite_id = suite_ids[suite]
        suite_arches[suite_id] = {}
        for arch in get_suite_architectures(suite):
            suite_arches[suite_id][arch.arch_string] = ""
        suite_id_list.append(suite_id)
    output_list = [output_format(i) for i in suite_list]
    longest_suite = max(len(suite) for suite in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)
    # Header row of abbreviated suite names, then a full-width rule
    output = (" " * longest_arch) + " |"
    for suite in output_list:
        output = output + suite.center(longest_suite) + " |"
    output = output + "\n" + (len(output) * "-") + "\n"
    # per-arch data
    for arch in arch_list:
        arch_id = arch_ids[arch]
        output = output + arch.center(longest_arch) + " |"
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                count = "%d" % d[suite_id][arch_id]
            else:
                count = "-"
            output = output + count.rjust(longest_suite) + " |"
        output = output + "\n"
    print(output)
225################################################################################
def parse_new_uploads(data):
    """Count NEW-queue arrivals found in *data* into the global stats.

    Every match of the NEW pattern newer than stats["timestamp"] bumps
    the "NEW" counter of its month bucket and of the all-time "history"
    bucket.  Returns the newest timestamp seen (or the previous latest
    timestamp if nothing newer was found).
    """
    global stats
    latest_timestamp = stats["timestamp"]
    for timestamp, _action in findall(NEW, data, MULTILINE):
        # Timestamps are zero-padded YYYYMMDDhhmmss, so string comparison
        # orders them chronologically.
        if timestamp <= stats["timestamp"]:
            continue
        month = parse_timestamp(timestamp)
        if month not in stats:
            stats[month] = {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            }
        stats[month]["stats"]["NEW"] += 1
        stats["history"]["stats"]["NEW"] += 1
        latest_timestamp = timestamp
    return latest_timestamp
def parse_actions(data, logdate):
    """Scan one daily log's *data* for accept/reject/prod actions.

    *logdate* is the log's YYYY-MM name.  Logs up to FORMAT_SWITCH are
    parsed with the old lisa/process-new patterns (and prods recovered
    from the mail archive via parse_prod()); logs from FORMAT_SWITCH on
    use the new dak patterns — a log dated exactly FORMAT_SWITCH goes
    through both.  Per-month and "history" buckets of the global stats
    are updated; actions by blacklisted (automated) users are skipped.
    Returns the newest timestamp processed.
    """
    global stats
    latest_timestamp = stats["timestamp"]
    if logdate <= FORMAT_SWITCH:
        for batch in findall(old_ACTIONS, data, DOTALL):
            # First token of the session body is the acting user.
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith("Accepting"):
                    action = "ACCEPT"
                elif action.startswith("rejected"):
                    action = "REJECT"
                timestamp = entry[0]
                if stats["timestamp"] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {
                        "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                        "members": {},
                    }
                stats[date]["stats"][action] += 1
                stats["history"]["stats"][action] += 1
                if who not in stats[date]["members"]:
                    stats[date]["members"][who] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
                stats[date]["members"][who][action] += 1
                if who not in stats["history"]["members"]:
                    stats["history"]["members"][who] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                stats["history"]["members"][who][action] += 1
                latest_timestamp = timestamp
        # Old-format logs don't record prods; harvest them from mail instead.
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats["timestamp"] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {
                    "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                    "members": {},
                }
            member = entry[1]
            if member in blacklisted:
                continue
            # stats[date] was already ensured above, before the blacklist
            # check, so the duplicated re-initialisation that used to sit
            # here was dead code and has been dropped.
            if member not in stats[date]["members"]:
                stats[date]["members"][member] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
            if member not in stats["history"]["members"]:
                stats["history"]["members"][member] = {
                    "ACCEPT": 0,
                    "REJECT": 0,
                    "PROD": 0,
                }
            stats[date]["stats"][action] += 1
            stats[date]["members"][member][action] += 1
            stats["history"]["stats"][action] += 1
            stats["history"]["members"][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp
def parse_prod(logdate):
    """Count PROD actions (maintainer-comment mails) for the month *logdate*.

    Old-format logs do not record prods, so they are recovered from the
    compressed mail archive for that month: every message whose subject
    starts with "Comments regarding" and whose sender resolves through
    the global *users* mapping counts as one PROD for that member, in
    both the month bucket and the "history" bucket of the global stats.
    Silently returns if no mail archive exists for the month.
    """
    global stats
    global users
    # "2009-08" -> "0908": the suffix used by the archive file names.
    maildate = "".join([x[-2:] for x in logdate.split("-")])
    mailarchive = join(
        utils.get_conf()["Dir::Base"], "mail/archive", "mail-%s.xz" % maildate
    )
    if not isfile(mailarchive):
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()["Dir::TempPath"]) as tmpfile:
        # mbox() wants a path to a plain file, so decompress to a temp file.
        with open(mailarchive, "rb") as fh:
            subprocess.check_call(["xzcat"], stdin=fh, stdout=tmpfile)
        for message in mbox(tmpfile.name):
            if message["subject"] and message["subject"].startswith(
                "Comments regarding"
            ):
                try:
                    # Sender's display name (address token dropped) -> login.
                    member = users[" ".join(message["From"].split()[:-1])]
                except KeyError:
                    # Sender not in the LDAP mapping: not a team member, skip.
                    continue
                ts = mktime_tz(parsedate_tz(message["date"]))
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {
                        "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                        "members": {},
                    }
                if member not in stats[date]["members"]:
                    stats[date]["members"][member] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                if member not in stats["history"]["members"]:
                    stats["history"]["members"][member] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                stats[date]["stats"]["PROD"] += 1
                stats[date]["members"][member]["PROD"] += 1
                stats["history"]["stats"]["PROD"] += 1
                stats["history"]["members"][member]["PROD"] += 1
def parse_timestamp(timestamp):
    """Reduce a YYYYMMDDhhmmss timestamp string to its "YYYY-MM" month key."""
    year, month = int(timestamp[:4]), int(timestamp[4:6])
    return "%d-%02d" % (year, month)
def new_stats(logdir, yaml):
    """Update NEW-queue statistics from the daily logs in *logdir*.

    Previously collected stats are loaded from the YAML file *yaml* (if
    present), extended with entries from every log file newer than the
    stored timestamp, and written back.  Log files may be plain or
    compressed (.bz2/.xz/.zst); "current" is skipped.  A progress dot is
    written to stderr per processed file.
    """
    global Cnf
    global stats
    try:
        with open(yaml, "r") as fd:
            stats = safe_load(fd)
    except OSError:
        # No previous stats file: fall through and start fresh below.
        pass
    if not stats:
        stats = {
            "history": {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            },
            "timestamp": "19700101000000",
        }
    # Decompressor command per file extension; anything else is read directly.
    # External tools are used because Python 2 did not support multi-stream
    # files (http://bugs.python.org/issue1625).
    decompressors = {".bz2": "bzcat", ".xz": "xzcat", ".zst": "zstdcat"}
    latest_timestamp = stats["timestamp"]
    for fn in sorted(listdir(logdir)):
        if fn == "current":
            continue
        log, ext = splitext(fn)
        # Skip months we have already fully processed.
        if log < parse_timestamp(stats["timestamp"]):
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            if ext in decompressors:
                with open(logfile, "rb") as fh:
                    data = subprocess.check_output([decompressors[ext]], stdin=fh)
            else:
                with open(logfile, "rb") as fd:
                    data = fd.read()
            try:
                data = data.decode()
            except UnicodeDecodeError:
                # Pre-UTF-8 logs: fall back to latin1, which never fails.
                data = data.decode("latin1")
            ts = parse_new_uploads(data)
            if ts > latest_timestamp:
                latest_timestamp = ts
            ts = parse_actions(data, log)
            if ts > latest_timestamp:
                latest_timestamp = ts
            stderr.write(".")
            stderr.flush()
    stderr.write("\n")
    stderr.flush()
    stats["timestamp"] = latest_timestamp
    with open(yaml, "w") as fd:
        safe_dump(stats, fd)
430################################################################################
def main():
    """Command-line entry point: parse options, validate, and run MODE."""
    global Cnf
    global users

    Cnf = utils.get_conf()
    arguments = [("h", "help", "Stats::Options::Help")]
    # Make sure every option key exists in the configuration tree.
    for name in ["help"]:
        key = "Stats::Options::%s" % name
        if key not in Cnf:
            Cnf[key] = ""

    args = apt_pkg.parse_commandline(Cnf, arguments, sys.argv)

    options = Cnf.subtree("Stats::Options")
    if options["Help"]:
        usage()

    # Argument validation: exactly one MODE, except 'new' which also
    # takes an output file.  usage(1) exits, so no fall-through occurs.
    mode = args[0].lower() if args else None
    if not args:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1 and mode != "new":
        utils.warn("dak stats accepts only one MODE argument")
        usage(1)
    elif len(args) == 1 and mode == "new":
        utils.warn("new MODE requires an output file")
        usage(1)

    simple_modes = {
        "arch-space": per_arch_space_use,
        "pkg-nums": number_of_packages,
        "daily-install": daily_install_stats,
    }
    if mode in simple_modes:
        simple_modes[mode]()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)
476################################################################################
# Allow use both as a script and as an importable module.
if __name__ == "__main__":
    main()