1#! /usr/bin/env python3 

2 

3"""Various statistical pr0nography fun and games""" 

4# Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org> 

5# Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org> 

6 

7# This program is free software; you can redistribute it and/or modify 

8# it under the terms of the GNU General Public License as published by 

9# the Free Software Foundation; either version 2 of the License, or 

10# (at your option) any later version. 

11 

12# This program is distributed in the hope that it will be useful, 

13# but WITHOUT ANY WARRANTY; without even the implied warranty of 

14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

15# GNU General Public License for more details. 

16 

17# You should have received a copy of the GNU General Public License 

18# along with this program; if not, write to the Free Software 

19# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 

20 

21################################################################################ 

22 

23# <aj> can we change the standards instead? 

24# <neuro> standards? 

25# <aj> whatever we're not conforming to 

26# <aj> if there's no written standard, why don't we declare linux as 

27# the defacto standard 

28# <aj> go us! 

29 

30# [aj's attempt to avoid ABI changes for released architecture(s)] 

31 

32################################################################################ 

33 

34import subprocess 

35import sys 

36import tempfile 

37from datetime import datetime 

38from email.utils import mktime_tz, parsedate_tz 

39from mailbox import mbox 

40from os import listdir 

41from os.path import isfile, join, splitext 

42from re import DOTALL, MULTILINE, findall 

43from sys import stderr 

44 

45import apt_pkg 

46from yaml import safe_dump, safe_load 

47 

48from daklib import utils 

49from daklib.dbconn import Architecture, DBConn, Suite, get_suite_architectures 

50 

51################################################################################ 

52 

# Global configuration object; initialised from utils.get_conf() in main().
Cnf = None

# Accumulated NEW-queue statistics, keyed by "YYYY-MM" month strings plus
# two special keys: "history" (running totals) and "timestamp" (the
# high-water mark of the last processed log entry, "YYYYMMDDhhmmss").
stats = {}
# Mapping from LDAP real names to team member logins; filled in main()
# for the "new" mode.
users = {}
buffer = 0  # NOTE(review): appears unused in this file (and shadows the builtin) -- confirm before removing
# Month ("YYYY-MM") at which the log format switched from the old
# lisa/process-new style to the new dak style.
FORMAT_SWITCH = "2009-08"
# Pseudo-users whose log actions must not be credited to a person.
blacklisted = ("dak", "katie")

# Matches a log line recording a package entering the NEW queue:
# group 1 is the 14-digit timestamp.
NEW = (
    r"^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)"
    r"\|(Moving to new|ACCEPT-TO-NEW)"
)
# New-format action line: (timestamp, member, action) groups.
new_ACTIONS = r"^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]"
# Old-format session: everything between "program start" and "program end",
# the acting member being the first word of the captured span.
old_ACTIONS = (
    r"(?:lisa|process-new)\|program start\|(.*?)\|" r"(?:lisa|process-new)\|program end"
)
# Single action line inside an old-format session: (timestamp, action).
old_ACTION = r"^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|"

70 

71################################################################################ 

72 

73 

def usage(exit_code=0):
    """Print the command-line help for ``dak stats`` and exit.

    :param exit_code: process exit status (0 for plain --help, 1 on misuse)
    """
    help_text = """Usage: dak stats MODE
Print various stats.

  -h, --help show this help and exit.

The following MODEs are available:

  arch-space - displays space used by each architecture
  pkg-nums - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new - stores stats about the NEW queue
"""
    print(help_text)
    sys.exit(exit_code)

90 

91 

92################################################################################ 

93 

94 

def per_arch_space_use():
    """Print the total archive space used per architecture, then the
    space used by source artefacts (.diff.gz/.tar.gz/.dsc files)."""
    session = DBConn().session()
    arch_rows = session.execute(
        """
SELECT a.arch_string as Architecture, sum(f.size) AS sum
  FROM files f, binaries b, architecture a
  WHERE a.id=b.architecture AND f.id=b.file
  GROUP BY a.arch_string ORDER BY sum"""
    ).fetchall()
    for row in arch_rows:
        print("%-15.15s %s" % (row[0], row[1]))
    print()
    source_rows = session.execute(
        "SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'"
    ).fetchall()
    print("%-15.15s %s" % ("Source", source_rows[0][0]))

111 

112 

113################################################################################ 

114 

115 

def daily_install_stats():
    """Print per-day install statistics read from the log file "2001-11".

    Each output line is "<YYYYMMDD> <packages> <MiB>": the number of
    "installing changes" entries and the summed sixth field (bytes) of
    "installed" entries, converted to whole mebibytes.
    """
    stats = {}
    # Use a context manager so the log file is always closed -- the
    # original opened it and never closed the handle.
    with open("2001-11") as f:
        for line in f:
            split = line.strip().split("|")
            program = split[1]
            # Only katie / process-accepted entries are install records.
            if program != "katie" and program != "process-accepted":
                continue
            action = split[2]
            if action != "installing changes" and action != "installed":
                continue
            date = split[0][:8]  # YYYYMMDD prefix of the timestamp
            if date not in stats:
                stats[date] = {"packages": 0, "size": 0.0}
            if action == "installing changes":
                stats[date]["packages"] += 1
            elif action == "installed":
                # Field 5 holds the byte size of the installed upload.
                stats[date]["size"] += float(split[5])

    for date in sorted(stats):
        packages = stats[date]["packages"]
        size = int(stats[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))

142 

143 

144################################################################################ 

145 

146 

def output_format(suite):
    """Abbreviate a suite name to the initials of its dash-separated
    components, e.g. "testing-proposed-updates" -> "t-p-u"."""
    return "-".join(component[0] for component in suite.split("-"))

152 

153 

def number_of_packages():
    """Print a table of package counts: one row per architecture, one
    column per suite (suite names abbreviated via output_format)."""
    arches = {}  # arch_id -> arch_string
    arch_ids = {}  # arch_string -> arch_id
    suites = {}  # suite_id -> suite_name
    suite_ids = {}  # suite_name -> suite_id
    d = {}  # d[suite_id][arch_id] -> package count
    session = DBConn().session()
    # Build up suite mapping
    for i in session.query(Suite).all():
        suites[i.suite_id] = i.suite_name
        suite_ids[i.suite_name] = i.suite_id
    # Build up architecture mapping
    for i in session.query(Architecture).all():
        arches[i.arch_id] = i.arch_string
        arch_ids[i.arch_string] = i.arch_id
    # Pre-create the dictionary so every (suite, arch) cell exists
    for suite_id in suites.keys():
        d[suite_id] = {}
        for arch_id in arches.keys():
            d[suite_id][arch_id] = 0
    # Get the raw data for binaries
    # Simulate 'GROUP by suite, architecture' with a dictionary
    # XXX: Why don't we just get the DB to do this?
    for i in session.execute(
        """SELECT suite, architecture, COUNT(suite)
    FROM bin_associations
    LEFT JOIN binaries ON bin = binaries.id
    GROUP BY suite, architecture"""
    ).fetchall():
        d[i[0]][i[1]] = i[2]
    # Get the raw data for source; counts are folded into the
    # pseudo-architecture "source".
    arch_id = arch_ids["source"]
    for i in session.execute(
        "SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite"
    ).fetchall():
        (suite_id, count) = i
        d[suite_id][arch_id] = d[suite_id][arch_id] + count
    ## Print the results
    # Setup: record which architectures each suite actually carries,
    # so missing combinations can be shown as "-" instead of 0.
    suite_list = list(suites.values())
    suite_id_list = []
    suite_arches = {}  # suite_id -> {arch_string: ""} membership map
    for suite in suite_list:
        suite_id = suite_ids[suite]
        suite_arches[suite_id] = {}
        for arch in get_suite_architectures(suite):
            suite_arches[suite_id][arch.arch_string] = ""
        suite_id_list.append(suite_id)
    output_list = [output_format(i) for i in suite_list]
    longest_suite = max(len(suite) for suite in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)
    # Header row: blank architecture column, then one centred suite column.
    output = (" " * longest_arch) + " |"
    for suite in output_list:
        output = output + suite.center(longest_suite) + " |"
    # The dashed separator is exactly as wide as the header built so far.
    output = output + "\n" + (len(output) * "-") + "\n"
    # per-arch data
    for arch in arch_list:
        arch_id = arch_ids[arch]
        output = output + arch.center(longest_arch) + " |"
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                count = "%d" % d[suite_id][arch_id]
            else:
                count = "-"
            output = output + count.rjust(longest_suite) + " |"
        output = output + "\n"
    print(output)

223 

224 

225################################################################################ 

226 

227 

def parse_new_uploads(data):
    """Count NEW-queue arrivals found in *data*, updating the global stats.

    Entries at or before the stored high-water mark stats["timestamp"]
    are skipped.  Returns the newest timestamp seen (or the previous
    high-water mark when nothing new was found).
    """
    global stats
    latest_timestamp = stats["timestamp"]
    for match in findall(NEW, data, MULTILINE):
        when = match[0]
        if when <= stats["timestamp"]:
            continue
        month = parse_timestamp(when)
        # Ensure the per-month bucket exists before counting into it.
        stats.setdefault(
            month,
            {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            },
        )
        stats[month]["stats"]["NEW"] += 1
        stats["history"]["stats"]["NEW"] += 1
        latest_timestamp = when
    return latest_timestamp

245 

246 

def parse_actions(data, logdate):
    """Parse ACCEPT/REJECT actions on the NEW queue from one monthly log.

    :param data: decoded text of the log file
    :param logdate: the log's "YYYY-MM" name, compared against
        FORMAT_SWITCH to pick the old and/or new parsing format
    :return: the newest timestamp processed (string, "YYYYMMDDhhmmss")

    Updates both the per-month and the "history" counters in the global
    stats dict; for old-format months PROD actions are additionally
    recovered from the mail archive via parse_prod().
    """
    global stats
    latest_timestamp = stats["timestamp"]
    if logdate <= FORMAT_SWITCH:
        # Old format: actions are grouped into "program start".."program end"
        # sessions; the acting member is the first word of the session.
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith("Accepting"):
                    action = "ACCEPT"
                elif action.startswith("rejected"):
                    action = "REJECT"
                timestamp = entry[0]
                # Skip anything already covered by a previous run.
                if stats["timestamp"] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {
                        "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                        "members": {},
                    }
                stats[date]["stats"][action] += 1
                stats["history"]["stats"][action] += 1
                if who not in stats[date]["members"]:
                    stats[date]["members"][who] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
                stats[date]["members"][who][action] += 1
                if who not in stats["history"]["members"]:
                    stats["history"]["members"][who] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                stats["history"]["members"][who][action] += 1
                latest_timestamp = timestamp
        # PROD actions are not in the old logs, only in archived mail.
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        # New format: each action is one self-contained log line.
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats["timestamp"] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {
                    "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                    "members": {},
                }
            member = entry[1]
            # NOTE: blacklisted entries still create the (empty) per-month
            # bucket above, matching the original behaviour.
            if member in blacklisted:
                continue
            # (The original repeated the "date not in stats" creation here,
            # but the bucket was already ensured above -- dead code removed.)
            if member not in stats[date]["members"]:
                stats[date]["members"][member] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
            if member not in stats["history"]["members"]:
                stats["history"]["members"][member] = {
                    "ACCEPT": 0,
                    "REJECT": 0,
                    "PROD": 0,
                }
            stats[date]["stats"][action] += 1
            stats[date]["members"][member][action] += 1
            stats["history"]["stats"][action] += 1
            stats["history"]["members"][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp

318 

319 

def parse_prod(logdate):
    """Count PROD actions for the month *logdate* ("YYYY-MM").

    Old-format logs do not record prods, so they are reconstructed from
    the archived outgoing "Comments regarding ..." mails for that month.
    Silently returns if no mail archive exists for the month.
    """
    global stats
    global users
    # "YYYY-MM" -> "YYMM": the suffix used by the mail archive file names.
    maildate = "".join([x[-2:] for x in logdate.split("-")])
    mailarchive = join(
        utils.get_conf()["Dir::Base"], "mail/archive", "mail-%s.xz" % maildate
    )
    if not isfile(mailarchive):
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()["Dir::TempPath"]) as tmpfile:
        # Decompress to a real temp file so mbox() can open it by name.
        with open(mailarchive, "rb") as fh:
            subprocess.check_call(["xzcat"], stdin=fh, stdout=tmpfile)
        for message in mbox(tmpfile.name):
            if message["subject"] and message["subject"].startswith(
                "Comments regarding"
            ):
                try:
                    # Map the sender's real name (everything before the
                    # address) to a team member; skip unknown senders.
                    member = users[" ".join(message["From"].split()[:-1])]
                except KeyError:
                    continue
                # Convert the RFC 2822 Date header to the log timestamp form.
                ts = mktime_tz(parsedate_tz(message["date"]))
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {
                        "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                        "members": {},
                    }
                if member not in stats[date]["members"]:
                    stats[date]["members"][member] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                if member not in stats["history"]["members"]:
                    stats["history"]["members"][member] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                stats[date]["stats"]["PROD"] += 1
                stats[date]["members"][member]["PROD"] += 1
                stats["history"]["stats"]["PROD"] += 1
                stats["history"]["members"][member]["PROD"] += 1

364 

365 

def parse_timestamp(timestamp):
    """Reduce a "YYYYMMDDhhmmss" log stamp to its "YYYY-MM" month key."""
    return "%d-%02d" % (int(timestamp[:4]), int(timestamp[4:6]))

370 

371 

def _read_log(logfile, fn):
    """Return the decoded text of *logfile*, decompressing by extension.

    Uses external bzcat/xzcat/zstdcat because python2's modules could not
    handle multi-stream files (http://bugs.python.org/issue1625); falls
    back to latin1 when the data is not valid UTF-8.
    """
    decompressors = {".bz2": "bzcat", ".xz": "xzcat", ".zst": "zstdcat"}
    ext = splitext(fn)[1]
    if ext in decompressors:
        with open(logfile, "rb") as fh:
            data = subprocess.check_output([decompressors[ext]], stdin=fh)
    else:
        with open(logfile, "rb") as fh:
            data = fh.read()
    try:
        return data.decode()
    except UnicodeDecodeError:
        return data.decode("latin1")


def new_stats(logdir, yaml):
    """Update the NEW-queue statistics stored in the YAML file *yaml*.

    :param logdir: directory containing the monthly dak log files
    :param yaml: path of the YAML stats file to read and rewrite

    Loads the existing stats (starting fresh if the file is missing or
    empty), folds in every log newer than the stored high-water mark via
    parse_new_uploads()/parse_actions(), then writes the stats back.
    Progress is indicated with one dot per processed log on stderr.
    """
    global Cnf
    global stats
    try:
        with open(yaml, "r") as fd:
            stats = safe_load(fd)
    except OSError:
        # No stats file yet: start from scratch below.
        pass
    if not stats:
        stats = {
            "history": {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            },
            "timestamp": "19700101000000",
        }
    latest_timestamp = stats["timestamp"]
    for fn in sorted(listdir(logdir)):
        if fn == "current":
            continue
        log = splitext(fn)[0]
        # Skip months already fully processed in a previous run.
        if log < parse_timestamp(stats["timestamp"]):
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            data = _read_log(logfile, fn)
            ts = parse_new_uploads(data)
            if ts > latest_timestamp:
                latest_timestamp = ts
            ts = parse_actions(data, log)
            if ts > latest_timestamp:
                latest_timestamp = ts
            stderr.write(".")
            stderr.flush()
    stderr.write("\n")
    stderr.flush()
    stats["timestamp"] = latest_timestamp
    with open(yaml, "w") as fd:
        safe_dump(stats, fd)

428 

429 

430################################################################################ 

431 

432 

def main():
    """Entry point: parse command-line options and dispatch to a MODE."""
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [("h", "help", "Stats::Options::Help")]
    # Ensure every option key exists so apt_pkg parsing cannot fail on it.
    for i in ["help"]:
        key = "Stats::Options::%s" % i
        if key not in Cnf:
            Cnf[key] = ""

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    Options = Cnf.subtree("Stats::Options")
    if Options["Help"]:
        usage()

    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        # Only the "new" mode takes a second argument (its output file).
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    if mode == "arch-space":
        per_arch_space_use()
    elif mode == "pkg-nums":
        number_of_packages()
    elif mode == "daily-install":
        daily_install_stats()
    elif mode == "new":
        # The "new" mode needs the LDAP name->login map for parse_prod().
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)


################################################################################


if __name__ == "__main__":
    main()