Package dak :: Module stats
[hide private]
[frames] | [no frames]

Source Code for Module dak.stats

  1  #! /usr/bin/env python3 
  2   
  3  """Various statistical pr0nography fun and games""" 
  4  # Copyright (C) 2000, 2001, 2002, 2003, 2006  James Troup <james@nocrew.org> 
  5  # Copyright (C) 2013  Luca Falavigna <dktrkranz@debian.org> 
  6   
  7  # This program is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11   
 12  # This program is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16   
 17  # You should have received a copy of the GNU General Public License 
 18  # along with this program; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20   
 21  ################################################################################ 
 22   
 23  # <aj>    can we change the standards instead? 
 24  # <neuro> standards? 
 25  # <aj>    whatever we're not conforming to 
 26  # <aj>    if there's no written standard, why don't we declare linux as 
 27  #         the defacto standard 
 28  # <aj>    go us! 
 29   
 30  # [aj's attempt to avoid ABI changes for released architecture(s)] 
 31   
 32  ################################################################################ 
 33   
 34  import subprocess 
 35  import sys 
 36  import tempfile 
 37  from datetime import datetime 
 38  from email.utils import mktime_tz, parsedate_tz 
 39  from mailbox import mbox 
 40  from os import listdir 
 41  from os.path import isfile, join, splitext 
 42  from re import DOTALL, MULTILINE, findall 
 43  from sys import stderr 
 44   
 45  import apt_pkg 
 46  from yaml import safe_dump, safe_load 
 47   
 48  from daklib import utils 
 49  from daklib.dbconn import Architecture, DBConn, Suite, get_suite_architectures 
 50   
 51  ################################################################################ 
 52   
 53  Cnf = None 
 54   
 55  stats = {} 
 56  users = {} 
 57  buffer = 0 
 58  FORMAT_SWITCH = "2009-08" 
 59  blacklisted = ("dak", "katie") 
 60   
 61  NEW = ( 
 62      r"^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)" 
 63      r"\|(Moving to new|ACCEPT-TO-NEW)" 
 64  ) 
 65  new_ACTIONS = r"^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]" 
 66  old_ACTIONS = ( 
 67      r"(?:lisa|process-new)\|program start\|(.*?)\|" r"(?:lisa|process-new)\|program end" 
 68  ) 
 69  old_ACTION = r"^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|" 
 70   
 71  ################################################################################ 
 72   
 73   
74 -def usage(exit_code=0):
75 print( 76 """Usage: dak stats MODE 77 Print various stats. 78 79 -h, --help show this help and exit. 80 81 The following MODEs are available: 82 83 arch-space - displays space used by each architecture 84 pkg-nums - displays the number of packages by suite/architecture 85 daily-install - displays daily install stats suitable for graphing 86 new - stores stats about the NEW queue 87 """ 88 ) 89 sys.exit(exit_code)
90 91 92 ################################################################################ 93 94
95 -def per_arch_space_use():
96 session = DBConn().session() 97 q = session.execute( 98 """ 99 SELECT a.arch_string as Architecture, sum(f.size) AS sum 100 FROM files f, binaries b, architecture a 101 WHERE a.id=b.architecture AND f.id=b.file 102 GROUP BY a.arch_string ORDER BY sum""" 103 ).fetchall() 104 for j in q: 105 print("%-15.15s %s" % (j[0], j[1])) 106 print() 107 q = session.execute( 108 "SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'" 109 ).fetchall() 110 print("%-15.15s %s" % ("Source", q[0][0]))
111 112 113 ################################################################################ 114 115
116 -def daily_install_stats():
117 stats = {} 118 f = open("2001-11") 119 for line in f.readlines(): 120 split = line.strip().split("|") 121 program = split[1] 122 if program != "katie" and program != "process-accepted": 123 continue 124 action = split[2] 125 if action != "installing changes" and action != "installed": 126 continue 127 date = split[0][:8] 128 if date not in stats: 129 stats[date] = {} 130 stats[date]["packages"] = 0 131 stats[date]["size"] = 0.0 132 if action == "installing changes": 133 stats[date]["packages"] += 1 134 elif action == "installed": 135 stats[date]["size"] += float(split[5]) 136 137 dates = sorted(stats) 138 for date in dates: 139 packages = stats[date]["packages"] 140 size = int(stats[date]["size"] / 1024.0 / 1024.0) 141 print("%s %s %s" % (date, packages, size))
142 143 144 ################################################################################ 145 146
147 -def output_format(suite):
148 output_suite = [] 149 for word in suite.split("-"): 150 output_suite.append(word[0]) 151 return "-".join(output_suite)
152 153
154 -def number_of_packages():
155 arches = {} 156 arch_ids = {} 157 suites = {} 158 suite_ids = {} 159 d = {} 160 session = DBConn().session() 161 # Build up suite mapping 162 for i in session.query(Suite).all(): 163 suites[i.suite_id] = i.suite_name 164 suite_ids[i.suite_name] = i.suite_id 165 # Build up architecture mapping 166 for i in session.query(Architecture).all(): 167 arches[i.arch_id] = i.arch_string 168 arch_ids[i.arch_string] = i.arch_id 169 # Pre-create the dictionary 170 for suite_id in suites.keys(): 171 d[suite_id] = {} 172 for arch_id in arches.keys(): 173 d[suite_id][arch_id] = 0 174 # Get the raw data for binaries 175 # Simultate 'GROUP by suite, architecture' with a dictionary 176 # XXX: Why don't we just get the DB to do this? 177 for i in session.execute( 178 """SELECT suite, architecture, COUNT(suite) 179 FROM bin_associations 180 LEFT JOIN binaries ON bin = binaries.id 181 GROUP BY suite, architecture""" 182 ).fetchall(): 183 d[i[0]][i[1]] = i[2] 184 # Get the raw data for source 185 arch_id = arch_ids["source"] 186 for i in session.execute( 187 "SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite" 188 ).fetchall(): 189 (suite_id, count) = i 190 d[suite_id][arch_id] = d[suite_id][arch_id] + count 191 ## Print the results 192 # Setup 193 suite_list = list(suites.values()) 194 suite_id_list = [] 195 suite_arches = {} 196 for suite in suite_list: 197 suite_id = suite_ids[suite] 198 suite_arches[suite_id] = {} 199 for arch in get_suite_architectures(suite): 200 suite_arches[suite_id][arch.arch_string] = "" 201 suite_id_list.append(suite_id) 202 output_list = [output_format(i) for i in suite_list] 203 longest_suite = max(len(suite) for suite in output_list) 204 arch_list = sorted(arches.values()) 205 longest_arch = max(len(arch) for arch in arch_list) 206 # Header 207 output = (" " * longest_arch) + " |" 208 for suite in output_list: 209 output = output + suite.center(longest_suite) + " |" 210 output = output + "\n" + (len(output) * "-") + "\n" 211 # per-arch data 
212 for arch in arch_list: 213 arch_id = arch_ids[arch] 214 output = output + arch.center(longest_arch) + " |" 215 for suite_id in suite_id_list: 216 if arch in suite_arches[suite_id]: 217 count = "%d" % d[suite_id][arch_id] 218 else: 219 count = "-" 220 output = output + count.rjust(longest_suite) + " |" 221 output = output + "\n" 222 print(output)
223 224 225 ################################################################################ 226 227
228 -def parse_new_uploads(data):
229 global stats 230 latest_timestamp = stats["timestamp"] 231 for entry in findall(NEW, data, MULTILINE): 232 timestamp = entry[0] 233 if stats["timestamp"] >= timestamp: 234 continue 235 date = parse_timestamp(timestamp) 236 if date not in stats: 237 stats[date] = { 238 "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0}, 239 "members": {}, 240 } 241 stats[date]["stats"]["NEW"] += 1 242 stats["history"]["stats"]["NEW"] += 1 243 latest_timestamp = timestamp 244 return latest_timestamp
245 246
247 -def parse_actions(data, logdate):
248 global stats 249 latest_timestamp = stats["timestamp"] 250 if logdate <= FORMAT_SWITCH: 251 for batch in findall(old_ACTIONS, data, DOTALL): 252 who = batch.split()[0] 253 if who in blacklisted: 254 continue 255 for entry in findall(old_ACTION, batch, MULTILINE): 256 action = entry[1] 257 if action.startswith("Accepting"): 258 action = "ACCEPT" 259 elif action.startswith("rejected"): 260 action = "REJECT" 261 timestamp = entry[0] 262 if stats["timestamp"] >= timestamp: 263 continue 264 date = parse_timestamp(entry[0]) 265 if date not in stats: 266 stats[date] = { 267 "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0}, 268 "members": {}, 269 } 270 stats[date]["stats"][action] += 1 271 stats["history"]["stats"][action] += 1 272 if who not in stats[date]["members"]: 273 stats[date]["members"][who] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0} 274 stats[date]["members"][who][action] += 1 275 if who not in stats["history"]["members"]: 276 stats["history"]["members"][who] = { 277 "ACCEPT": 0, 278 "REJECT": 0, 279 "PROD": 0, 280 } 281 stats["history"]["members"][who][action] += 1 282 latest_timestamp = timestamp 283 parse_prod(logdate) 284 if logdate >= FORMAT_SWITCH: 285 for entry in findall(new_ACTIONS, data, MULTILINE): 286 action = entry[2] 287 timestamp = entry[0] 288 if stats["timestamp"] >= timestamp: 289 continue 290 date = parse_timestamp(timestamp) 291 if date not in stats: 292 stats[date] = { 293 "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0}, 294 "members": {}, 295 } 296 member = entry[1] 297 if member in blacklisted: 298 continue 299 if date not in stats: 300 stats[date] = { 301 "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0}, 302 "members": {}, 303 } 304 if member not in stats[date]["members"]: 305 stats[date]["members"][member] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0} 306 if member not in stats["history"]["members"]: 307 stats["history"]["members"][member] = { 308 "ACCEPT": 0, 309 "REJECT": 0, 310 "PROD": 0, 311 } 312 
stats[date]["stats"][action] += 1 313 stats[date]["members"][member][action] += 1 314 stats["history"]["stats"][action] += 1 315 stats["history"]["members"][member][action] += 1 316 latest_timestamp = timestamp 317 return latest_timestamp
318 319
320 -def parse_prod(logdate):
321 global stats 322 global users 323 maildate = "".join([x[-2:] for x in logdate.split("-")]) 324 mailarchive = join( 325 utils.get_conf()["Dir::Base"], "mail/archive", "mail-%s.xz" % maildate 326 ) 327 if not isfile(mailarchive): 328 return 329 with tempfile.NamedTemporaryFile(dir=utils.get_conf()["Dir::TempPath"]) as tmpfile: 330 with open(mailarchive, "rb") as fh: 331 subprocess.check_call(["xzcat"], stdin=fh, stdout=tmpfile) 332 for message in mbox(tmpfile.name): 333 if message["subject"] and message["subject"].startswith( 334 "Comments regarding" 335 ): 336 try: 337 member = users[" ".join(message["From"].split()[:-1])] 338 except KeyError: 339 continue 340 ts = mktime_tz(parsedate_tz(message["date"])) 341 timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S") 342 date = parse_timestamp(timestamp) 343 if date not in stats: 344 stats[date] = { 345 "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0}, 346 "members": {}, 347 } 348 if member not in stats[date]["members"]: 349 stats[date]["members"][member] = { 350 "ACCEPT": 0, 351 "REJECT": 0, 352 "PROD": 0, 353 } 354 if member not in stats["history"]["members"]: 355 stats["history"]["members"][member] = { 356 "ACCEPT": 0, 357 "REJECT": 0, 358 "PROD": 0, 359 } 360 stats[date]["stats"]["PROD"] += 1 361 stats[date]["members"][member]["PROD"] += 1 362 stats["history"]["stats"]["PROD"] += 1 363 stats["history"]["members"][member]["PROD"] += 1
364 365
366 -def parse_timestamp(timestamp):
367 y = int(timestamp[:4]) 368 m = int(timestamp[4:6]) 369 return "%d-%02d" % (y, m)
370 371
372 -def new_stats(logdir, yaml):
373 global Cnf 374 global stats 375 try: 376 with open(yaml, "r") as fd: 377 stats = safe_load(fd) 378 except OSError: 379 pass 380 if not stats: 381 stats = { 382 "history": { 383 "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0}, 384 "members": {}, 385 }, 386 "timestamp": "19700101000000", 387 } 388 latest_timestamp = stats["timestamp"] 389 for fn in sorted(listdir(logdir)): 390 if fn == "current": 391 continue 392 log = splitext(fn)[0] 393 if log < parse_timestamp(stats["timestamp"]): 394 continue 395 logfile = join(logdir, fn) 396 if isfile(logfile): 397 if fn.endswith(".bz2"): 398 # This hack is required becaue python2 does not support 399 # multi-stream files (http://bugs.python.org/issue1625) 400 with open(logfile, "rb") as fh: 401 data = subprocess.check_output(["bzcat"], stdin=fh) 402 elif fn.endswith(".xz"): 403 with open(logfile, "rb") as fh: 404 data = subprocess.check_output(["xzcat"], stdin=fh) 405 elif fn.endswith(".zst"): 406 with open(logfile, "rb") as fh: 407 data = subprocess.check_output(["zstdcat"], stdin=fh) 408 else: 409 with open(logfile, "rb") as fd: 410 data = fd.read() 411 try: 412 data = data.decode() 413 except UnicodeDecodeError: 414 data = data.decode("latin1") 415 ts = parse_new_uploads(data) 416 if ts > latest_timestamp: 417 latest_timestamp = ts 418 ts = parse_actions(data, log) 419 if ts > latest_timestamp: 420 latest_timestamp = ts 421 stderr.write(".") 422 stderr.flush() 423 stderr.write("\n") 424 stderr.flush() 425 stats["timestamp"] = latest_timestamp 426 with open(yaml, "w") as fd: 427 safe_dump(stats, fd)
428 429 430 ################################################################################ 431 432
433 -def main():
434 global Cnf 435 global users 436 437 Cnf = utils.get_conf() 438 Arguments = [("h", "help", "Stats::Options::Help")] 439 for i in ["help"]: 440 key = "Stats::Options::%s" % i 441 if key not in Cnf: 442 Cnf[key] = "" 443 444 args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv) 445 446 Options = Cnf.subtree("Stats::Options") 447 if Options["Help"]: 448 usage() 449 450 if len(args) < 1: 451 utils.warn("dak stats requires a MODE argument") 452 usage(1) 453 elif len(args) > 1: 454 if args[0].lower() != "new": 455 utils.warn("dak stats accepts only one MODE argument") 456 usage(1) 457 elif args[0].lower() == "new": 458 utils.warn("new MODE requires an output file") 459 usage(1) 460 mode = args[0].lower() 461 462 if mode == "arch-space": 463 per_arch_space_use() 464 elif mode == "pkg-nums": 465 number_of_packages() 466 elif mode == "daily-install": 467 daily_install_stats() 468 elif mode == "new": 469 users = utils.get_users_from_ldap() 470 new_stats(Cnf["Dir::Log"], args[1]) 471 else: 472 utils.warn("unknown mode '%s'" % (mode)) 473 usage(1)
474 475 476 ################################################################################ 477 478 479 if __name__ == "__main__": 480 main() 481