Package dak :: Module stats
[hide private]
[frames] | no frames]

Source Code for Module dak.stats

  1  #! /usr/bin/env python3 
  2   
  3  """ Various statistical pr0nography fun and games """ 
  4  # Copyright (C) 2000, 2001, 2002, 2003, 2006  James Troup <james@nocrew.org> 
  5  # Copyright (C) 2013  Luca Falavigna <dktrkranz@debian.org> 
  6   
  7  # This program is free software; you can redistribute it and/or modify 
  8  # it under the terms of the GNU General Public License as published by 
  9  # the Free Software Foundation; either version 2 of the License, or 
 10  # (at your option) any later version. 
 11   
 12  # This program is distributed in the hope that it will be useful, 
 13  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 14  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 15  # GNU General Public License for more details. 
 16   
 17  # You should have received a copy of the GNU General Public License 
 18  # along with this program; if not, write to the Free Software 
 19  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 20   
 21  ################################################################################ 
 22   
 23  # <aj>    can we change the standards instead? 
 24  # <neuro> standards? 
 25  # <aj>    whatever we're not conforming to 
 26  # <aj>    if there's no written standard, why don't we declare linux as 
 27  #         the defacto standard 
 28  # <aj>    go us! 
 29   
 30  # [aj's attempt to avoid ABI changes for released architecture(s)] 
 31   
 32  ################################################################################ 
 33   
 34  import subprocess 
 35  import sys 
 36  import tempfile 
 37  import apt_pkg 
 38   
 39  from datetime import datetime 
 40  from email.utils import mktime_tz, parsedate_tz 
 41  from mailbox import mbox 
 42  from os import listdir 
 43  from os.path import isfile, join, splitext 
 44  from re import findall, DOTALL, MULTILINE 
 45  from sys import stderr 
 46  from yaml import safe_load, safe_dump 
 47   
 48  from daklib import utils 
 49  from daklib.dbconn import DBConn, get_suite_architectures, Suite, Architecture 
 50   
 51  ################################################################################ 
 52   
 53  Cnf = None 
 54   
 55  stats = {} 
 56  users = {} 
 57  buffer = 0 
 58  FORMAT_SWITCH = '2009-08' 
 59  blacklisted = ('dak', 'katie') 
 60   
 61  NEW = (r'^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)' 
 62         r'\|(Moving to new|ACCEPT-TO-NEW)') 
 63  new_ACTIONS = r'^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]' 
 64  old_ACTIONS = (r'(?:lisa|process-new)\|program start\|(.*?)\|' 
 65                 r'(?:lisa|process-new)\|program end') 
 66  old_ACTION = r'^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|' 
 67   
 68  ################################################################################ 
 69   
 70   
def usage(exit_code=0):
    """Print the command-line help for ``dak stats`` and exit.

    :param exit_code: process exit status handed to ``sys.exit`` (default 0).
    """
    help_text = """Usage: dak stats MODE
Print various stats.

  -h, --help                show this help and exit.

The following MODEs are available:

  arch-space    - displays space used by each architecture
  pkg-nums      - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new           - stores stats about the NEW queue
"""
    print(help_text)
    sys.exit(exit_code)
85 86 ################################################################################ 87 88
def per_arch_space_use():
    """Print the summed file size per architecture, then for source files.

    One line is printed per architecture (name left-aligned and truncated to
    15 characters, followed by the sum of ``files.size`` of its binaries),
    ordered by that sum.  After a blank line, a final "Source" line gives the
    summed size of files whose name ends in ``.diff.gz``, ``.tar.gz`` or
    ``.dsc``.
    """
    session = DBConn().session()
    per_arch = session.execute("""
SELECT a.arch_string as Architecture, sum(f.size) AS sum
  FROM files f, binaries b, architecture a
  WHERE a.id=b.architecture AND f.id=b.file
  GROUP BY a.arch_string ORDER BY sum""").fetchall()
    for row in per_arch:
        print("%-15.15s %s" % (row[0], row[1]))
    print()
    # The dots are escaped so that they match a literal "." only; unescaped,
    # "." is the regex wildcard and e.g. "foo_dsc" would have been counted.
    source = session.execute(
        r"SELECT sum(size) FROM files"
        r" WHERE filename ~ '\.(diff\.gz|tar\.gz|dsc)$'").fetchall()
    print("%-15.15s %s" % ("Source", source[0][0]))
101 102 ################################################################################ 103 104
def daily_install_stats(logfile="2001-11"):
    """Print per-day install counts and sizes from a '|'-separated log file.

    Only lines whose second field is ``katie`` or ``process-accepted`` and
    whose third field is ``installing changes`` or ``installed`` are used.
    Each ``installing changes`` line counts one package for its day (first 8
    characters of the first field); each ``installed`` line adds its sixth
    field, a size in bytes, to the day's total.  Output is one
    ``DATE PACKAGES SIZE_IN_MIB`` line per day, sorted by date.

    :param logfile: path of the log to read.  Defaults to ``"2001-11"``, the
        file name that used to be hard-coded here.
    """
    daily = {}
    # Stream the file and make sure the handle is closed again.
    with open(logfile) as fh:
        for line in fh:
            fields = line.strip().split('|')
            if len(fields) < 3:
                # Not a "timestamp|program|action|..." record.
                continue
            if fields[1] not in ("katie", "process-accepted"):
                continue
            action = fields[2]
            if action == "installing changes":
                size = None
            elif action == "installed":
                try:
                    size = float(fields[5])
                except (IndexError, ValueError):
                    # Size field missing or not numeric: skip the record.
                    continue
            else:
                continue
            day = daily.setdefault(fields[0][:8],
                                   {"packages": 0, "size": 0.0})
            if size is None:
                day["packages"] += 1
            else:
                day["size"] += size

    for date in sorted(daily):
        packages = daily[date]["packages"]
        size = int(daily[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))
131 132 ################################################################################ 133 134
def output_format(suite):
    """Abbreviate a suite name to the initials of its dash-separated words.

    For example ``"oldstable-proposed-updates"`` becomes ``"o-p-u"``.

    :param suite: suite name.
    :return: the first character of every ``-``-separated word, re-joined
        with ``-``.  Empty words (empty name, doubled or trailing dash) yield
        an empty initial instead of raising ``IndexError``.
    """
    # word[:1] rather than word[0]: slicing is safe on an empty string.
    return "-".join(word[:1] for word in suite.split("-"))
140 141
def number_of_packages():
    """Print a table of package counts per architecture (rows) and suite.

    Binary counts come from ``bin_associations`` joined with ``binaries``;
    source counts from ``src_associations`` are added to the ``source``
    architecture.  Column headers are the abbreviated suite names produced
    by ``output_format``.  A cell shows ``-`` when the architecture is not
    among those returned by ``get_suite_architectures`` for that suite.

    :raises KeyError: if there is no architecture named ``source``.
    """
    session = DBConn().session()

    # id <-> name mappings for suites and architectures
    suites = {}
    suite_ids = {}
    for suite in session.query(Suite).all():
        suites[suite.suite_id] = suite.suite_name
        suite_ids[suite.suite_name] = suite.suite_id
    arches = {}
    arch_ids = {}
    for arch in session.query(Architecture).all():
        arches[arch.arch_id] = arch.arch_string
        arch_ids[arch.arch_string] = arch.arch_id

    # counts[suite_id][arch_id], pre-filled with zeroes
    counts = {suite_id: dict.fromkeys(arches, 0) for suite_id in suites}

    # Binary packages, grouped by the database
    for row in session.execute("""SELECT suite, architecture, COUNT(suite)
                                  FROM bin_associations
                                  LEFT JOIN binaries ON bin = binaries.id
                                  GROUP BY suite, architecture""").fetchall():
        counts[row[0]][row[1]] = row[2]

    # Source packages are accounted to the "source" architecture
    source_id = arch_ids["source"]
    for row in session.execute('SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite').fetchall():
        (suite_id, count) = row
        counts[suite_id][source_id] += count

    # Which architectures belong to which suite
    suite_names = list(suites.values())
    suite_id_list = []
    suite_arches = {}
    for name in suite_names:
        suite_id = suite_ids[name]
        suite_arches[suite_id] = {
            arch.arch_string for arch in get_suite_architectures(name)}
        suite_id_list.append(suite_id)

    # default=0 keeps max() from raising ValueError on an empty table
    headers = [output_format(name) for name in suite_names]
    suite_width = max((len(header) for header in headers), default=0)
    arch_list = sorted(arches.values())
    arch_width = max((len(arch) for arch in arch_list), default=0)

    # Header line and a dashed rule of the same length
    header_line = (" " * arch_width) + " |" + "".join(
        header.center(suite_width) + " |" for header in headers)
    lines = [header_line, "-" * len(header_line)]

    # One row per architecture
    for arch in arch_list:
        arch_id = arch_ids[arch]
        cells = []
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                cell = "%d" % counts[suite_id][arch_id]
            else:
                cell = "-"
            cells.append(cell.rjust(suite_width) + " |")
        lines.append(arch.center(arch_width) + " |" + "".join(cells))

    # Every line is newline-terminated; print() then adds a blank line.
    print("\n".join(lines) + "\n")
207 208 ################################################################################ 209 210
def parse_new_uploads(data):
    """Count uploads entering NEW in one log and add them to ``stats``.

    Every match of the ``NEW`` pattern whose timestamp is later than
    ``stats['timestamp']`` increments the ``NEW`` counter of its month
    (creating the month entry if needed) and of ``stats['history']``.

    :param data: decoded contents of a log file.
    :return: timestamp of the last counted entry, or ``stats['timestamp']``
        when nothing was counted.
    """
    already_seen = stats['timestamp']
    newest = already_seen
    for match in findall(NEW, data, MULTILINE):
        stamp = match[0]
        if stamp <= already_seen:
            continue
        month = stats.setdefault(
            parse_timestamp(stamp),
            {'stats': {'NEW': 0, 'ACCEPT': 0,
                       'REJECT': 0, 'PROD': 0}, 'members': {}})
        month['stats']['NEW'] += 1
        stats['history']['stats']['NEW'] += 1
        newest = stamp
    return newest
226 227
def parse_actions(data, logdate):
    """Count NEW-queue decisions found in one log and add them to ``stats``.

    Logs dated up to and including ``FORMAT_SWITCH`` are scanned with the
    old session-based patterns (and ``parse_prod`` is run for that month);
    logs dated from ``FORMAT_SWITCH`` onwards are scanned with
    ``new_ACTIONS``.  The switch month itself goes through both.  Entries
    not newer than ``stats['timestamp']`` and actors listed in
    ``blacklisted`` are not counted.

    :param data: decoded contents of a log file.
    :param logdate: month of the log as ``YYYY-MM``.
    :return: timestamp of the last counted entry, or ``stats['timestamp']``
        when nothing was counted.
    """
    def month_entry(month):
        # Fetch the per-month record, creating an empty one on first use.
        return stats.setdefault(
            month, {'stats': {'NEW': 0, 'ACCEPT': 0,
                              'REJECT': 0, 'PROD': 0}, 'members': {}})

    def member_entry(record, name):
        # Make sure ``name`` has a counter set inside ``record``.
        return record['members'].setdefault(
            name, {'ACCEPT': 0, 'REJECT': 0, 'PROD': 0})

    already_seen = stats['timestamp']
    latest_timestamp = already_seen
    history = stats['history']

    if logdate <= FORMAT_SWITCH:
        old_names = {'Accepting changes': 'ACCEPT', 'rejected': 'REJECT'}
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for timestamp, text in findall(old_ACTION, batch, MULTILINE):
                action = old_names[text]
                if timestamp <= already_seen:
                    continue
                month = month_entry(parse_timestamp(timestamp))
                month['stats'][action] += 1
                history['stats'][action] += 1
                member_entry(month, who)[action] += 1
                member_entry(history, who)[action] += 1
                latest_timestamp = timestamp
        parse_prod(logdate)

    if logdate >= FORMAT_SWITCH:
        for timestamp, member, action in findall(new_ACTIONS, data, MULTILINE):
            if timestamp <= already_seen:
                continue
            # The month record is created even when the member turns out
            # to be blacklisted.
            month = month_entry(parse_timestamp(timestamp))
            if member in blacklisted:
                continue
            in_month = member_entry(month, member)
            in_history = member_entry(history, member)
            month['stats'][action] += 1
            in_month[action] += 1
            history['stats'][action] += 1
            in_history[action] += 1
            latest_timestamp = timestamp

    return latest_timestamp
289 290
def parse_prod(logdate):
    """Count "prod" mails of one month and add them to ``stats``.

    Reads ``<Dir::Base>/mail/archive/mail-YYMM.xz`` (if it exists),
    decompresses it with ``xzcat`` into a temporary file under
    ``Dir::TempPath`` and walks it as an mbox.  Every message whose subject
    starts with ``Comments regarding`` and whose sender (the ``From`` header
    minus its last whitespace-separated word) is a key of ``users``
    increments the ``PROD`` counters of the message's month and of
    ``stats['history']``, overall and for that member.

    Messages without a ``From`` header, or with a missing or unparsable
    ``Date`` header, are skipped.

    :param logdate: month to process, as ``YYYY-MM``.
    :raises subprocess.CalledProcessError: if ``xzcat`` fails.
    """
    conf = utils.get_conf()
    # '2009-08' -> '0908'
    maildate = ''.join(part[-2:] for part in logdate.split('-'))
    mailarchive = join(conf['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    with tempfile.NamedTemporaryFile(dir=conf['Dir::TempPath']) as tmpfile:
        with open(mailarchive, 'rb') as fh:
            subprocess.check_call(['xzcat'], stdin=fh, stdout=tmpfile)
        archive = mbox(tmpfile.name)
        try:
            for message in archive:
                subject = message['subject']
                if not subject or not subject.startswith('Comments regarding'):
                    continue
                sender = message['From']
                if not sender:
                    continue
                try:
                    member = users[' '.join(sender.split()[:-1])]
                except KeyError:
                    continue
                # parsedate_tz() returns None for dates it cannot parse,
                # which mktime_tz() would not accept.
                parsed = parsedate_tz(message['date'] or '')
                if parsed is None:
                    continue
                ts = mktime_tz(parsed)
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                             'REJECT': 0, 'PROD': 0},
                                   'members': {}}
                if member not in stats[date]['members']:
                    stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                      'PROD': 0}
                if member not in stats['history']['members']:
                    stats['history']['members'][member] = {'ACCEPT': 0,
                                                           'REJECT': 0,
                                                           'PROD': 0}
                stats[date]['stats']['PROD'] += 1
                stats[date]['members'][member]['PROD'] += 1
                stats['history']['stats']['PROD'] += 1
                stats['history']['members'][member]['PROD'] += 1
        finally:
            # Release the mbox's file handle before the temp file goes away.
            archive.close()
325 326
def parse_timestamp(timestamp):
    """Turn a ``YYYYMMDD...`` timestamp string into a ``YYYY-MM`` month key.

    :param timestamp: string whose first four characters are the year and
        whose next two are the month.
    :return: year (without zero padding) and two-digit month joined by ``-``.
    :raises ValueError: if either part is not an integer.
    """
    year, month = int(timestamp[:4]), int(timestamp[4:6])
    return '{:d}-{:02d}'.format(year, month)
331 332
def new_stats(logdir, yaml):
    """Update the NEW-queue statistics file from the logs in ``logdir``.

    Previous results are loaded from ``yaml`` when it can be opened;
    otherwise, or when it is empty, counting starts from scratch.  Every
    regular file in ``logdir`` except ``current`` whose name (minus its last
    extension) is not older than the month of the stored timestamp is read,
    fed to ``parse_new_uploads`` and ``parse_actions``, and a progress dot is
    written to stderr.  Finally the newest timestamp seen is stored and the
    statistics are dumped back to ``yaml``.

    :param logdir: directory holding the log files.
    :param yaml: path of the YAML statistics file to read and rewrite.
    """
    global stats

    try:
        with open(yaml, 'r') as fd:
            stats = safe_load(fd)
    except OSError:
        # No readable statistics file: fall through to the empty structure.
        pass
    if not stats:
        stats = {'history': {'stats': {'NEW': 0, 'ACCEPT': 0,
                                       'REJECT': 0, 'PROD': 0}, 'members': {}},
                 'timestamp': '19700101000000'}

    # Compressed logs go through external tools; for .bz2 this is the
    # historical workaround for multi-stream files
    # (http://bugs.python.org/issue1625).
    decompressors = (('.bz2', 'bzcat'), ('.xz', 'xzcat'), ('.zst', 'zstdcat'))

    latest_timestamp = stats['timestamp']
    for fn in sorted(listdir(logdir)):
        if fn == 'current':
            continue
        log = splitext(fn)[0]
        if log < parse_timestamp(stats['timestamp']):
            continue
        logfile = join(logdir, fn)
        if not isfile(logfile):
            continue

        tool = next((cmd for suffix, cmd in decompressors
                     if fn.endswith(suffix)), None)
        with open(logfile, 'rb') as fh:
            if tool is None:
                raw = fh.read()
            else:
                raw = subprocess.check_output([tool], stdin=fh)
        try:
            data = raw.decode()
        except UnicodeDecodeError:
            data = raw.decode('latin1')

        latest_timestamp = max(latest_timestamp, parse_new_uploads(data))
        latest_timestamp = max(latest_timestamp, parse_actions(data, log))
        stderr.write('.')
        stderr.flush()

    stderr.write('\n')
    stderr.flush()
    stats['timestamp'] = latest_timestamp
    with open(yaml, 'w') as fd:
        safe_dump(stats, fd)
385 386 ################################################################################ 387 388
def main():
    """Entry point of ``dak stats``: parse the command line and run a MODE.

    Exactly one MODE argument is accepted, except for ``new`` which needs a
    second argument naming the output file.  Invalid invocations print a
    warning followed by the usage text and exit with status 1.
    """
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [('h', "help", "Stats::Options::Help")]
    for option in ("help",):
        key = "Stats::Options::%s" % option
        if key not in Cnf:
            Cnf[key] = ""

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    Options = Cnf.subtree("Stats::Options")
    if Options["Help"]:
        usage()

    if not args:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) == 1:
        if args[0].lower() == "new":
            utils.warn("new MODE requires an output file")
            usage(1)
    elif args[0].lower() != "new":
        utils.warn("dak stats accepts only one MODE argument")
        usage(1)
    mode = args[0].lower()

    # Modes that take no further arguments
    simple_modes = {
        "arch-space": per_arch_space_use,
        "pkg-nums": number_of_packages,
        "daily-install": daily_install_stats,
    }
    if mode in simple_modes:
        simple_modes[mode]()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)

################################################################################


if __name__ == '__main__':
    main()