Coverage for dak/stats.py: 12%
286 statements
« prev ^ index » next coverage.py v7.6.0, created at 2026-01-04 16:18 +0000
« prev ^ index » next coverage.py v7.6.0, created at 2026-01-04 16:18 +0000
1#! /usr/bin/env python3
3"""Various statistical pr0nography fun and games"""
4# Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org>
5# Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org>
7# This program is free software; you can redistribute it and/or modify
8# it under the terms of the GNU General Public License as published by
9# the Free Software Foundation; either version 2 of the License, or
10# (at your option) any later version.
12# This program is distributed in the hope that it will be useful,
13# but WITHOUT ANY WARRANTY; without even the implied warranty of
14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15# GNU General Public License for more details.
17# You should have received a copy of the GNU General Public License
18# along with this program; if not, write to the Free Software
19# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
21################################################################################
23# <aj> can we change the standards instead?
24# <neuro> standards?
25# <aj> whatever we're not conforming to
26# <aj> if there's no written standard, why don't we declare linux as
27# the defacto standard
28# <aj> go us!
30# [aj's attempt to avoid ABI changes for released architecture(s)]
32################################################################################
34import subprocess
35import sys
36import tempfile
37from datetime import datetime
38from email.utils import mktime_tz, parsedate_tz
39from mailbox import mbox
40from os import listdir
41from os.path import isfile, join, splitext
42from re import DOTALL, MULTILINE, findall
43from sys import stderr
44from typing import Any, NoReturn
46import apt_pkg
47from sqlalchemy import sql
48from yaml import safe_dump, safe_load
50from daklib import utils
51from daklib.dbconn import Architecture, DBConn, Suite, get_suite_architectures
53################################################################################
# Runtime configuration; assigned in main() from utils.get_conf().
Cnf: apt_pkg.Configuration

# Accumulated NEW-queue statistics, keyed by "YYYY-MM" month strings plus
# the special "history" (all-time totals) and "timestamp" (high-water mark,
# YYYYMMDDhhmmss) entries; see new_stats().
stats: dict[str, Any] = {}
# Mapping of "first last" name -> username, filled from LDAP in main().
users: dict[str, str] = {}
# NOTE(review): appears unused in this module -- confirm before removing.
buffer = 0
# Month (YYYY-MM) at which the NEW-queue log format changed; logs at or
# before this date use the old lisa/process-new layout (see parse_actions()).
FORMAT_SWITCH = "2009-08"
# Pseudo-users whose log entries are ignored when attributing actions.
blacklisted = ("dak", "katie")

# Matches a package arriving in the NEW queue (group 1: timestamp).
NEW = (
    r"^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)"
    r"\|(Moving to new|ACCEPT-TO-NEW)"
)
# New-format action line: timestamp | member | "NEW <action>".
new_ACTIONS = r"^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]"
# Old-format session: everything between "program start" and "program end".
old_ACTIONS = (
    r"(?:lisa|process-new)\|program start\|(.*?)\|" r"(?:lisa|process-new)\|program end"
)
# Old-format action line inside a session (group 2: accept/reject verb).
old_ACTION = r"^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|"
73################################################################################
def usage(exit_code=0) -> NoReturn:
    """Print the dak stats help text and terminate with *exit_code*."""
    help_text = """Usage: dak stats MODE
Print various stats.

  -h, --help show this help and exit.

The following MODEs are available:

  arch-space - displays space used by each architecture
  pkg-nums - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new - stores stats about the NEW queue
"""
    print(help_text)
    sys.exit(exit_code)
94################################################################################
def per_arch_space_use() -> None:
    """Print the total archive file size used per architecture, then the
    combined size of all source artifacts (diff.gz/tar.gz/dsc)."""
    session = DBConn().session()
    arch_rows = session.execute(
        sql.text(
            """
SELECT a.arch_string as Architecture, sum(f.size) AS sum
  FROM files f, binaries b, architecture a
  WHERE a.id=b.architecture AND f.id=b.file
  GROUP BY a.arch_string ORDER BY sum"""
        )
    ).fetchall()
    for arch_string, total in arch_rows:
        print("%-15.15s %s" % (arch_string, total))
    print()
    source_rows = session.execute(
        sql.text(
            "SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'"
        )
    ).fetchall()
    print("%-15.15s %s" % ("Source", source_rows[0][0]))
119################################################################################
def daily_install_stats() -> None:
    """Print per-day install counts and sizes (MiB), suitable for graphing.

    Reads the '|'-separated daklog file "2001-11" from the current
    directory.  Only katie/process-accepted records with the actions
    "installing changes" (counts packages) or "installed" (sums byte
    sizes from field 5) are considered.  Output: one "date packages MiB"
    line per day, sorted by date.
    """
    stats: dict[str, dict[str, float]] = {}
    # NOTE(review): the log filename is hard-coded; presumably a historical
    # one-off run -- confirm before relying on this mode.
    # Fix: open via a context manager so the file is always closed, and
    # iterate the file object directly instead of materializing readlines().
    with open("2001-11") as logfile:
        for line in logfile:
            fields = line.strip().split("|")
            program = fields[1]
            if program not in ("katie", "process-accepted"):
                continue
            action = fields[2]
            if action not in ("installing changes", "installed"):
                continue
            date = fields[0][:8]  # YYYYMMDD prefix of the timestamp
            if date not in stats:
                stats[date] = {"packages": 0, "size": 0.0}
            if action == "installing changes":
                stats[date]["packages"] += 1
            else:  # action == "installed"
                stats[date]["size"] += float(fields[5])

    for date in sorted(stats):
        packages = stats[date]["packages"]
        size = int(stats[date]["size"] / 1024.0 / 1024.0)  # bytes -> MiB
        print("%s %s %s" % (date, packages, size))
148################################################################################
def output_format(suite: str) -> str:
    """Abbreviate a suite name to the initials of its dash-separated words,
    e.g. "oldstable-proposed-updates" -> "o-p-u"."""
    return "-".join(word[0] for word in suite.split("-"))
def number_of_packages() -> None:
    """Print a table of package counts: one column per suite, one row per
    architecture ("-" marks architectures not enabled for a suite)."""
    session = DBConn().session()

    # id <-> name maps for suites and architectures.
    suites: dict[int, str] = {}
    suite_ids: dict[str, int] = {}
    for s in session.query(Suite).all():
        suites[s.suite_id] = s.suite_name
        suite_ids[s.suite_name] = s.suite_id
    arches: dict[int, str] = {}
    arch_ids: dict[str, int] = {}
    for a in session.query(Architecture).all():
        arches[a.arch_id] = a.arch_string
        arch_ids[a.arch_string] = a.arch_id

    # counts[suite_id][arch_id] -> number of packages, zero-initialized.
    counts: dict[int, dict[int, int]] = {
        suite_id: dict.fromkeys(arches, 0) for suite_id in suites
    }

    # Binary counts, grouped by suite and architecture in SQL.
    # XXX: Why don't we just get the DB to do this?
    for suite_id, architecture, count in session.execute(
        sql.text(
            """SELECT suite, architecture, COUNT(suite)
                      FROM bin_associations
                 LEFT JOIN binaries ON bin = binaries.id
                  GROUP BY suite, architecture"""
        )
    ).fetchall():
        counts[suite_id][architecture] = count

    # Source counts are folded into the pseudo-architecture "source".
    source_arch_id = arch_ids["source"]
    for suite_id, count in session.execute(
        sql.text("SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite")
    ).fetchall():
        counts[suite_id][source_arch_id] += count

    # Layout preparation.
    suite_list = list(suites.values())
    suite_id_list = [suite_ids[name] for name in suite_list]
    suite_arches: dict[int, set[str]] = {
        suite_ids[name]: {a.arch_string for a in get_suite_architectures(name)}
        for name in suite_list
    }
    output_list = [output_format(name) for name in suite_list]
    longest_suite = max(len(name) for name in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)

    # Header row, followed by a dashed rule of the same width.
    header = (" " * longest_arch) + " |" + "".join(
        name.center(longest_suite) + " |" for name in output_list
    )
    lines = [header, "-" * len(header)]
    # One row per architecture.
    for arch in arch_list:
        arch_id = arch_ids[arch]
        cells = [arch.center(longest_arch) + " |"]
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                cell = "%d" % counts[suite_id][arch_id]
            else:
                cell = "-"
            cells.append(cell.rjust(longest_suite) + " |")
        lines.append("".join(cells))
    print("\n".join(lines) + "\n")
230################################################################################
def parse_new_uploads(data: str) -> str:
    """Count NEW-queue arrivals found in *data*, updating the global stats.

    Entries at or before the stored high-water timestamp are skipped.
    Returns the newest timestamp seen (or the previous high-water mark).
    """
    global stats
    newest: str = stats["timestamp"]
    for timestamp, _action in findall(NEW, data, MULTILINE):
        if timestamp <= stats["timestamp"]:
            continue
        month = parse_timestamp(timestamp)
        if month not in stats:
            stats[month] = {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            }
        stats[month]["stats"]["NEW"] += 1
        stats["history"]["stats"]["NEW"] += 1
        newest = timestamp
    return newest
def parse_actions(data: str, logdate: str) -> str:
    """Parse ACCEPT/REJECT actions from a NEW-queue log into global stats.

    Logs dated at or before FORMAT_SWITCH use the old lisa/process-new
    session layout (PRODs for that period are recovered separately from
    the mail archive via parse_prod()); newer logs use one-line
    "NEW <action>" records.  Per-month and all-time ("history") counters
    are updated, skipping entries at or before the stored high-water
    timestamp and members listed in ``blacklisted``.

    :param data: decoded log file contents
    :param logdate: log month as "YYYY-MM" (from the log filename)
    :return: the newest timestamp seen (or the previous high-water mark)
    """
    global stats
    latest_timestamp: str = stats["timestamp"]
    if logdate <= FORMAT_SWITCH:
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith("Accepting"):
                    action = "ACCEPT"
                elif action.startswith("rejected"):
                    action = "REJECT"
                timestamp = entry[0]
                if stats["timestamp"] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {
                        "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                        "members": {},
                    }
                stats[date]["stats"][action] += 1
                stats["history"]["stats"][action] += 1
                if who not in stats[date]["members"]:
                    stats[date]["members"][who] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
                stats[date]["members"][who][action] += 1
                if who not in stats["history"]["members"]:
                    stats["history"]["members"][who] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                stats["history"]["members"][who][action] += 1
                latest_timestamp = timestamp
        # Old-format logs do not record PRODs; pull them from the mails.
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats["timestamp"] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {
                    "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                    "members": {},
                }
            member = entry[1]
            if member in blacklisted:
                continue
            # Fix: a duplicate "if date not in stats" initialization that
            # used to sit here was dead code (the entry is always created
            # above before the blacklist check) and has been removed.
            if member not in stats[date]["members"]:
                stats[date]["members"][member] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
            if member not in stats["history"]["members"]:
                stats["history"]["members"][member] = {
                    "ACCEPT": 0,
                    "REJECT": 0,
                    "PROD": 0,
                }
            stats[date]["stats"][action] += 1
            stats[date]["members"][member][action] += 1
            stats["history"]["stats"][action] += 1
            stats["history"]["members"][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp
def parse_prod(logdate: str) -> None:
    """Count PROD actions for month *logdate* ("YYYY-MM") from the mail
    archive.

    Old-format logs do not record PRODs, so they are reconstructed from
    "Comments regarding ..." mails in the xz-compressed monthly archive.
    Updates the per-month and "history" counters in the global ``stats``;
    returns silently when no archive exists for the month.
    """
    global stats
    global users
    yymm = "".join(part[-2:] for part in logdate.split("-"))
    archive_path = join(
        utils.get_conf()["Dir::Base"], "mail/archive", "mail-%s.xz" % yymm
    )
    if not isfile(archive_path):
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()["Dir::TempPath"]) as plain:
        # Decompress the whole archive to a temp file so mbox can read it.
        with open(archive_path, "rb") as compressed:
            subprocess.check_call(["xzcat"], stdin=compressed, stdout=plain)
        for message in mbox(plain.name):
            subject = message["subject"]
            if not (subject and subject.startswith("Comments regarding")):
                continue
            try:
                # Sender name without the trailing email address part.
                member = users[" ".join(message["From"].split()[:-1])]
            except KeyError:
                continue
            message_date = parsedate_tz(message["date"])
            assert message_date is not None
            ts = mktime_tz(message_date)
            timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {
                    "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                    "members": {},
                }
            # Bump both the per-month and the all-time counters.
            for scope in (stats[date], stats["history"]):
                if member not in scope["members"]:
                    scope["members"][member] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
                scope["stats"]["PROD"] += 1
                scope["members"][member]["PROD"] += 1
def parse_timestamp(timestamp: str) -> str:
    """Reduce a YYYYMMDDhhmmss timestamp string to a "YYYY-MM" month key."""
    year = int(timestamp[:4])
    month = int(timestamp[4:6])
    return f"{year}-{month:02d}"
def new_stats(logdir: str, yaml: str) -> None:
    """Update NEW-queue statistics from the logs in *logdir*, persisting
    them to the YAML file *yaml*.

    Previously stored stats are loaded first (a fresh structure is built
    when the file is missing or empty), and only log files at or after the
    stored high-water month are (re)parsed.  Progress dots go to stderr.
    """
    global Cnf
    global stats
    try:
        with open(yaml, "r") as fd:
            stats = safe_load(fd)
    except OSError:
        pass
    if not stats:
        stats = {
            "history": {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            },
            "timestamp": "19700101000000",
        }
    latest_timestamp = stats["timestamp"]
    # Compressed-log suffix -> decompressor command.
    decompressors = {".bz2": "bzcat", ".xz": "xzcat", ".zst": "zstdcat"}
    for fn in sorted(listdir(logdir)):
        if fn == "current":
            continue
        log = splitext(fn)[0]
        if log < parse_timestamp(stats["timestamp"]):
            continue  # already fully processed in an earlier run
        logfile = join(logdir, fn)
        if not isfile(logfile):
            continue
        data = None
        for suffix, tool in decompressors.items():
            if fn.endswith(suffix):
                # Decompress externally: python2 could not handle
                # multi-stream files (http://bugs.python.org/issue1625)
                with open(logfile, "rb") as fh:
                    data = subprocess.check_output([tool], stdin=fh)
                break
        if data is None:
            with open(logfile, "rb") as fh:
                data = fh.read()
        try:
            data_str = data.decode()
        except UnicodeDecodeError:
            data_str = data.decode("latin1")
        # Arguments evaluate left to right, preserving the original order:
        # uploads first, then actions.
        latest_timestamp = max(
            latest_timestamp,
            parse_new_uploads(data_str),
            parse_actions(data_str, log),
        )
        stderr.write(".")
        stderr.flush()
    stderr.write("\n")
    stderr.flush()
    stats["timestamp"] = latest_timestamp
    with open(yaml, "w") as fd:
        safe_dump(stats, fd)
437################################################################################
def main() -> None:
    """Entry point: parse the command line and dispatch to the chosen MODE."""
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [("h", "help", "Stats::Options::Help")]
    for option in ["help"]:
        key = "Stats::Options::%s" % option
        if key not in Cnf:
            Cnf[key] = ""  # type: ignore[index]

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)  # type: ignore[attr-defined]

    Options = Cnf.subtree("Stats::Options")  # type: ignore[attr-defined]
    if Options["Help"]:
        usage()

    # Exactly one MODE argument, except "new" which takes an output file.
    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    simple_modes = {
        "arch-space": per_arch_space_use,
        "pkg-nums": number_of_packages,
        "daily-install": daily_install_stats,
    }
    if mode in simple_modes:
        simple_modes[mode]()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)
483################################################################################
# Script entry point when run as `dak stats ...`.
if __name__ == "__main__":
    main()