#! /usr/bin/env python3

""" Various statistical pr0nography fun and games """
# Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org>
# Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org>

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

################################################################################

# <aj> can we change the standards instead?
# <neuro> standards?
# <aj> whatever we're not conforming to
# <aj> if there's no written standard, why don't we declare linux as
#      the defacto standard
# <aj> go us!

# [aj's attempt to avoid ABI changes for released architecture(s)]

################################################################################
import subprocess
import sys
import tempfile
import apt_pkg

from datetime import datetime
from email.utils import mktime_tz, parsedate_tz
from mailbox import mbox
from os import listdir
from os.path import isfile, join, splitext
from re import findall, DOTALL, MULTILINE
from sys import stderr
from yaml import safe_load, safe_dump

from daklib import utils
from daklib.dbconn import DBConn, get_suite_architectures, Suite, Architecture
################################################################################

Cnf = None

stats = {}
users = {}
buffer = 0
FORMAT_SWITCH = '2009-08'
blacklisted = ('dak', 'katie')

NEW = (r'^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)'
       r'\|(Moving to new|ACCEPT-TO-NEW)')
new_ACTIONS = r'^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]'
old_ACTIONS = (r'(?:lisa|process-new)\|program start\|(.*?)\|'
               r'(?:lisa|process-new)\|program end')
old_ACTION = r'^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|'
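# Note on the patterns above: dak log lines are '|'-separated and start with a
# 14-digit YYYYMMDDHHMMSS timestamp, followed by the program name and the
# action; the alternations pair the historical and current names of the same
# tools (e.g. lisa/process-new).  FORMAT_SWITCH marks the month the log format
# changed, and parse_actions() below picks the old or new patterns accordingly.
# An illustrative (made-up) line the NEW pattern would match:
#   20090817123456|process-unchecked|Moving to new|foo_1.0-1_amd64.changes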
################################################################################


def usage(exit_code=0):
    print("""Usage: dak stats MODE
Print various stats.

  -h, --help                show this help and exit.

The following MODEs are available:

  arch-space    - displays space used by each architecture
  pkg-nums      - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new           - stores stats about the NEW queue
""")
    sys.exit(exit_code)
################################################################################


def per_arch_space_use():
    session = DBConn().session()
    q = session.execute("""
SELECT a.arch_string as Architecture, sum(f.size) AS sum
  FROM files f, binaries b, architecture a
  WHERE a.id=b.architecture AND f.id=b.file
  GROUP BY a.arch_string ORDER BY sum""").fetchall()
    for j in q:
        print("%-15.15s %s" % (j[0], j[1]))
    print()
    q = session.execute("SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'").fetchall()
    print("%-15.15s %s" % ("Source", q[0][0]))
################################################################################


def daily_install_stats():
    stats = {}
    # NB: reads the pipe-separated install log from the hard-coded file "2001-11"
    f = open("2001-11")
    for line in f.readlines():
        split = line.strip().split('|')
        program = split[1]
        if program != "katie" and program != "process-accepted":
            continue
        action = split[2]
        if action != "installing changes" and action != "installed":
            continue
        date = split[0][:8]
        if date not in stats:
            stats[date] = {}
            stats[date]["packages"] = 0
            stats[date]["size"] = 0.0
        if action == "installing changes":
            stats[date]["packages"] += 1
        elif action == "installed":
            stats[date]["size"] += float(split[5])

    dates = sorted(stats)
    for date in dates:
        packages = stats[date]["packages"]
        size = int(stats[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))
################################################################################


def output_format(suite):
    output_suite = []
    for word in suite.split("-"):
        output_suite.append(word[0])
    return "-".join(output_suite)
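
# Illustrative example (not from the original source): output_format()
# abbreviates each hyphen-separated component of a suite name to its first
# letter, so "testing-proposed-updates" becomes "t-p-u".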


def number_of_packages():
    arches = {}
    arch_ids = {}
    suites = {}
    suite_ids = {}
    d = {}
    session = DBConn().session()
    # Build up suite mapping
    for i in session.query(Suite).all():
        suites[i.suite_id] = i.suite_name
        suite_ids[i.suite_name] = i.suite_id
    # Build up architecture mapping
    for i in session.query(Architecture).all():
        arches[i.arch_id] = i.arch_string
        arch_ids[i.arch_string] = i.arch_id
    # Pre-create the dictionary
    for suite_id in suites.keys():
        d[suite_id] = {}
        for arch_id in arches.keys():
            d[suite_id][arch_id] = 0
    # Get the raw data for binaries
    # Simulate 'GROUP BY suite, architecture' with a dictionary
    # XXX: Why don't we just get the DB to do this?
    for i in session.execute("""SELECT suite, architecture, COUNT(suite)
                                  FROM bin_associations
                             LEFT JOIN binaries ON bin = binaries.id
                              GROUP BY suite, architecture""").fetchall():
        d[i[0]][i[1]] = i[2]
    # Get the raw data for source
    arch_id = arch_ids["source"]
    for i in session.execute('SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite').fetchall():
        (suite_id, count) = i
        d[suite_id][arch_id] = d[suite_id][arch_id] + count
    ## Print the results
    # Setup
    suite_list = list(suites.values())
    suite_id_list = []
    suite_arches = {}
    for suite in suite_list:
        suite_id = suite_ids[suite]
        suite_arches[suite_id] = {}
        for arch in get_suite_architectures(suite):
            suite_arches[suite_id][arch.arch_string] = ""
        suite_id_list.append(suite_id)
    output_list = [output_format(i) for i in suite_list]
    longest_suite = max(len(suite) for suite in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)
    # Header
    output = (" " * longest_arch) + " |"
    for suite in output_list:
        output = output + suite.center(longest_suite) + " |"
    output = output + "\n" + (len(output) * "-") + "\n"
    # per-arch data
    for arch in arch_list:
        arch_id = arch_ids[arch]
        output = output + arch.center(longest_arch) + " |"
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                count = "%d" % d[suite_id][arch_id]
            else:
                count = "-"
            output = output + count.rjust(longest_suite) + " |"
        output = output + "\n"
    print(output)
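
# For reference, the table printed above has architectures as rows and
# output_format()-abbreviated suite names as columns, with '-' where an
# architecture is not part of a suite.  A sketch with made-up numbers:
#          | o-p-u |   u   | t-p-u |
#   --------------------------------
#    amd64 |    42 |   117 |     - |
#   source |    40 |   115 |     3 |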
################################################################################


def parse_new_uploads(data):
    global stats
    latest_timestamp = stats['timestamp']
    for entry in findall(NEW, data, MULTILINE):
        timestamp = entry[0]
        if stats['timestamp'] >= timestamp:
            continue
        date = parse_timestamp(timestamp)
        if date not in stats:
            stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                     'REJECT': 0, 'PROD': 0}, 'members': {}}
        stats[date]['stats']['NEW'] += 1
        stats['history']['stats']['NEW'] += 1
        latest_timestamp = timestamp
    return latest_timestamp


def parse_actions(data, logdate):
    global stats
    latest_timestamp = stats['timestamp']
    # Logs up to FORMAT_SWITCH use the old lisa/process-new format; later logs
    # use the new one.  On the boundary month both branches are evaluated.
    if logdate <= FORMAT_SWITCH:
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith('Accepting'):
                    action = 'ACCEPT'
                elif action.startswith('rejected'):
                    action = 'REJECT'
                timestamp = entry[0]
                if stats['timestamp'] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                             'REJECT': 0, 'PROD': 0}, 'members': {}}
                stats[date]['stats'][action] += 1
                stats['history']['stats'][action] += 1
                if who not in stats[date]['members']:
                    stats[date]['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
                                                   'PROD': 0}
                stats[date]['members'][who][action] += 1
                if who not in stats['history']['members']:
                    stats['history']['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
                                                        'PROD': 0}
                stats['history']['members'][who][action] += 1
                latest_timestamp = timestamp
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats['timestamp'] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                         'REJECT': 0, 'PROD': 0}, 'members': {}}
            member = entry[1]
            if member in blacklisted:
                continue
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                         'REJECT': 0, 'PROD': 0}, 'members': {}}
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                  'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0,
                                                       'REJECT': 0, 'PROD': 0}
            stats[date]['stats'][action] += 1
            stats[date]['members'][member][action] += 1
            stats['history']['stats'][action] += 1
            stats['history']['members'][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp


def parse_prod(logdate):
    global stats
    global users
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()['Dir::TempPath']) as tmpfile:
        with open(mailarchive, 'rb') as fh:
            subprocess.check_call(['xzcat'], stdin=fh, stdout=tmpfile)
        for message in mbox(tmpfile.name):
            if (message['subject']
                    and message['subject'].startswith('Comments regarding')):
                try:
                    # Map the sender's real name (the From: header minus its
                    # trailing address) to a known user; skip unknown senders.
                    member = users[' '.join(message['From'].split()[:-1])]
                except KeyError:
                    continue
                ts = mktime_tz(parsedate_tz(message['date']))
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                             'REJECT': 0, 'PROD': 0}, 'members': {}}
                if member not in stats[date]['members']:
                    stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                      'PROD': 0}
                if member not in stats['history']['members']:
                    stats['history']['members'][member] = {'ACCEPT': 0,
                                                           'REJECT': 0, 'PROD': 0}
                stats[date]['stats']['PROD'] += 1
                stats[date]['members'][member]['PROD'] += 1
                stats['history']['stats']['PROD'] += 1
                stats['history']['members'][member]['PROD'] += 1


def parse_timestamp(timestamp):
    y = int(timestamp[:4])
    m = int(timestamp[4:6])
    return '%d-%02d' % (y, m)
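
# Illustrative example (not from the original source):
# parse_timestamp('20090817123456') returns '2009-08', the same YYYY-MM form
# used by FORMAT_SWITCH and by the per-month log file names.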


def new_stats(logdir, yaml):
    global Cnf
    global stats
    try:
        with open(yaml, 'r') as fd:
            stats = safe_load(fd)
    except OSError:
        pass
    if not stats:
        stats = {'history': {'stats': {'NEW': 0, 'ACCEPT': 0,
                                       'REJECT': 0, 'PROD': 0}, 'members': {}},
                 'timestamp': '19700101000000'}
    latest_timestamp = stats['timestamp']
    for fn in sorted(listdir(logdir)):
        if fn == 'current':
            continue
        log = splitext(fn)[0]
        if log < parse_timestamp(stats['timestamp']):
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            if fn.endswith('.bz2'):
                # This hack is required because python2 does not support
                # multi-stream files (http://bugs.python.org/issue1625)
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['bzcat'], stdin=fh)
            elif fn.endswith('.xz'):
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['xzcat'], stdin=fh)
            elif fn.endswith('.zst'):
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['zstdcat'], stdin=fh)
            else:
                with open(logfile, 'rb') as fd:
                    data = fd.read()
            try:
                data = data.decode()
            except UnicodeDecodeError:
                data = data.decode('latin1')
            ts = parse_new_uploads(data)
            if ts > latest_timestamp:
                latest_timestamp = ts
            ts = parse_actions(data, log)
            if ts > latest_timestamp:
                latest_timestamp = ts
            stderr.write('.')
            stderr.flush()
    stderr.write('\n')
    stderr.flush()
    stats['timestamp'] = latest_timestamp
    with open(yaml, 'w') as fd:
        safe_dump(stats, fd)
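
# For orientation, the YAML file written above has roughly this shape (keys
# taken from the code, values illustrative):
#   timestamp: '20230101120000'
#   history:
#     stats: {NEW: 0, ACCEPT: 0, REJECT: 0, PROD: 0}
#     members: {}
#   '2023-01':
#     stats: {NEW: 3, ACCEPT: 2, REJECT: 1, PROD: 0}
#     members:
#       someone: {ACCEPT: 2, REJECT: 1, PROD: 0}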
################################################################################


def main():
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [('h', "help", "Stats::Options::Help")]
    for i in ["help"]:
        key = "Stats::Options::%s" % i
        if key not in Cnf:
            Cnf[key] = ""

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    Options = Cnf.subtree("Stats::Options")
    if Options["Help"]:
        usage()

    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    if mode == "arch-space":
        per_arch_space_use()
    elif mode == "pkg-nums":
        number_of_packages()
    elif mode == "daily-install":
        daily_install_stats()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)


################################################################################


if __name__ == '__main__':
    main()