Coverage for dak/stats.py: 12%

286 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2026-01-04 16:18 +0000

1#! /usr/bin/env python3 

2 

3"""Various statistical pr0nography fun and games""" 

4# Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org> 

5# Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org> 

6 

7# This program is free software; you can redistribute it and/or modify 

8# it under the terms of the GNU General Public License as published by 

9# the Free Software Foundation; either version 2 of the License, or 

10# (at your option) any later version. 

11 

12# This program is distributed in the hope that it will be useful, 

13# but WITHOUT ANY WARRANTY; without even the implied warranty of 

14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

15# GNU General Public License for more details. 

16 

17# You should have received a copy of the GNU General Public License 

18# along with this program; if not, write to the Free Software 

19# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 

20 

21################################################################################ 

22 

23# <aj> can we change the standards instead? 

24# <neuro> standards? 

25# <aj> whatever we're not conforming to 

26# <aj> if there's no written standard, why don't we declare linux as 

27# the defacto standard 

28# <aj> go us! 

29 

30# [aj's attempt to avoid ABI changes for released architecture(s)] 

31 

32################################################################################ 

33 

34import subprocess 

35import sys 

36import tempfile 

37from datetime import datetime 

38from email.utils import mktime_tz, parsedate_tz 

39from mailbox import mbox 

40from os import listdir 

41from os.path import isfile, join, splitext 

42from re import DOTALL, MULTILINE, findall 

43from sys import stderr 

44from typing import Any, NoReturn 

45 

46import apt_pkg 

47from sqlalchemy import sql 

48from yaml import safe_dump, safe_load 

49 

50from daklib import utils 

51from daklib.dbconn import Architecture, DBConn, Suite, get_suite_architectures 

52 

53################################################################################ 

54 

# Global configuration; populated in main() via utils.get_conf().
Cnf: apt_pkg.Configuration

# Accumulated NEW-queue statistics; loaded from / dumped to YAML by new_stats().
stats: dict[str, Any] = {}
# Mapping of real names to team member identifiers; filled by
# utils.get_users_from_ldap() in main() and consumed by parse_prod().
users: dict[str, str] = {}
# NOTE(review): not referenced anywhere in the visible part of this module --
# looks vestigial; candidate for removal.
buffer = 0
# Month ("YYYY-MM") at which the log format changed; parse_actions() applies
# the old-style patterns to logs up to this date and new-style ones after it.
FORMAT_SWITCH = "2009-08"
# Pseudo-users whose log entries are not attributed to any member.
blacklisted = ("dak", "katie")

# Matches a log line recording a package entering the NEW queue;
# group 1 is the YYYYMMDDHHMMSS timestamp.
NEW = (
    r"^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)"
    r"\|(Moving to new|ACCEPT-TO-NEW)"
)
# Post-FORMAT_SWITCH entry: captures (timestamp, member, action) from
# "NEW <action>" log lines.
new_ACTIONS = r"^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]"
# Pre-FORMAT_SWITCH: captures one whole lisa/process-new session (DOTALL);
# the first word of the captured body is the acting member.
old_ACTIONS = (
    r"(?:lisa|process-new)\|program start\|(.*?)\|" r"(?:lisa|process-new)\|program end"
)
# Pre-FORMAT_SWITCH per-entry pattern: (timestamp, "Accepting changes"|"rejected").
old_ACTION = r"^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|"

72 

73################################################################################ 

74 

75 

def usage(exit_code=0) -> NoReturn:
    """Print the command-line help text and terminate with *exit_code*."""
    help_text = """Usage: dak stats MODE
Print various stats.

  -h, --help show this help and exit.

The following MODEs are available:

  arch-space - displays space used by each architecture
  pkg-nums - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new - stores stats about the NEW queue
"""
    print(help_text)
    sys.exit(exit_code)

92 

93 

94################################################################################ 

95 

96 

def per_arch_space_use() -> None:
    """Print total archive space used per architecture, then by sources."""
    session = DBConn().session()
    # Binary space, grouped per architecture.
    arch_rows = session.execute(
        sql.text(
            """
SELECT a.arch_string as Architecture, sum(f.size) AS sum
 FROM files f, binaries b, architecture a
 WHERE a.id=b.architecture AND f.id=b.file
 GROUP BY a.arch_string ORDER BY sum"""
        )
    ).fetchall()
    for row in arch_rows:
        print("%-15.15s %s" % (row[0], row[1]))
    print()
    # Source space: everything that looks like a source artefact by filename.
    src_rows = session.execute(
        sql.text(
            "SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'"
        )
    ).fetchall()
    print("%-15.15s %s" % ("Source", src_rows[0][0]))

117 

118 

119################################################################################ 

120 

121 

def daily_install_stats() -> None:
    """Print per-day install statistics suitable for graphing.

    Reads the log file literally named "2001-11" from the current working
    directory (NOTE(review): the hard-coded name looks suspicious -- confirm
    against the deployment) and prints one "<YYYYMMDD> <packages> <MiB>"
    line per day.
    """
    # Local accumulator; named to avoid shadowing the module-level `stats`.
    day_stats: dict[str, dict[str, float]] = {}
    # Context manager guarantees the log file is closed (the original leaked
    # the handle), and iterating the file streams it line by line.
    with open("2001-11") as logfile:
        for line in logfile:
            fields = line.strip().split("|")
            # Guard against blank/truncated lines instead of crashing.
            if len(fields) < 3:
                continue
            program = fields[1]
            if program != "katie" and program != "process-accepted":
                continue
            action = fields[2]
            if action != "installing changes" and action != "installed":
                continue
            date = fields[0][:8]
            if date not in day_stats:
                day_stats[date] = {"packages": 0, "size": 0.0}
            if action == "installing changes":
                day_stats[date]["packages"] += 1
            elif action == "installed":
                # Field 5 carries the installed size in bytes.
                day_stats[date]["size"] += float(fields[5])

    for date in sorted(day_stats):
        packages = day_stats[date]["packages"]
        # Report size in whole MiB.
        size = int(day_stats[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))

146 

147 

148################################################################################ 

149 

150 

def output_format(suite: str) -> str:
    """Abbreviate a suite name to the initials of its dash-separated words,
    e.g. "stable-proposed-updates" -> "s-p-u"."""
    return "-".join(word[0] for word in suite.split("-"))

156 

157 

def number_of_packages() -> None:
    # Print a table of package counts: one row per architecture, one column
    # per suite; "-" marks architectures not enabled for a suite.
    arches: dict[int, str] = {}  # arch id -> arch name
    arch_ids: dict[str, int] = {}  # arch name -> arch id
    suites: dict[int, str] = {}  # suite id -> suite name
    suite_ids: dict[str, int] = {}  # suite name -> suite id
    session = DBConn().session()
    # Build up suite mapping
    for s in session.query(Suite).all():
        suites[s.suite_id] = s.suite_name
        suite_ids[s.suite_name] = s.suite_id
    # Build up architecture mapping
    for a in session.query(Architecture).all():
        arches[a.arch_id] = a.arch_string
        arch_ids[a.arch_string] = a.arch_id

    # Pre-create the dictionary: every (suite, arch) cell starts at 0 so the
    # table below can be printed without missing-key checks.
    d: dict[int, dict[int, int]] = {
        suite_id: {arch_id: 0 for arch_id in arches.keys()}
        for suite_id in suites.keys()
    }

    # Get the raw data for binaries
    # Simulate 'GROUP by suite, architecture' with a dictionary
    # XXX: Why don't we just get the DB to do this?
    for i in session.execute(
        sql.text(
            """SELECT suite, architecture, COUNT(suite)
 FROM bin_associations
 LEFT JOIN binaries ON bin = binaries.id
 GROUP BY suite, architecture"""
        )
    ).fetchall():
        d[i[0]][i[1]] = i[2]
    # Get the raw data for source; counts land in the pseudo-arch "source".
    arch_id = arch_ids["source"]
    for i in session.execute(
        sql.text("SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite")
    ).fetchall():
        (suite_id, count) = i
        d[suite_id][arch_id] = d[suite_id][arch_id] + count
    ## Print the results
    # Setup
    suite_list = list(suites.values())
    suite_id_list = []
    suite_arches: dict[int, set[str]] = {}
    for suite in suite_list:
        suite_id = suite_ids[suite]
        suite_arches[suite_id] = {a.arch_string for a in get_suite_architectures(suite)}
        suite_id_list.append(suite_id)
    # Column headers use abbreviated suite names (see output_format()).
    output_list = [output_format(i) for i in suite_list]
    longest_suite = max(len(suite) for suite in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)
    # Header
    output = (" " * longest_arch) + " |"
    for suite in output_list:
        output = output + suite.center(longest_suite) + " |"
    # The dashed separator is exactly as wide as the header accumulated so
    # far -- do not reorder this line relative to the loop above.
    output = output + "\n" + (len(output) * "-") + "\n"
    # per-arch data
    for arch in arch_list:
        arch_id = arch_ids[arch]
        output = output + arch.center(longest_arch) + " |"
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                count = "%d" % d[suite_id][arch_id]
            else:
                # Architecture not part of this suite at all.
                count = "-"
            output = output + count.rjust(longest_suite) + " |"
        output = output + "\n"
    print(output)

228 

229 

230################################################################################ 

231 

232 

def parse_new_uploads(data: str) -> str:
    """Count uploads entering the NEW queue found in log *data*.

    Entries not newer than the recorded stats["timestamp"] are skipped.
    Returns the newest timestamp seen (or the stored one if nothing newer
    matched).
    """
    newest: str = stats["timestamp"]
    for match in findall(NEW, data, MULTILINE):
        ts = match[0]
        # Already accounted for in a previous run.
        if ts <= stats["timestamp"]:
            continue
        month = parse_timestamp(ts)
        stats.setdefault(
            month,
            {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            },
        )
        stats[month]["stats"]["NEW"] += 1
        stats["history"]["stats"]["NEW"] += 1
        newest = ts
    return newest

250 

251 

def parse_actions(data: str, logdate: str) -> str:
    """Accumulate ACCEPT/REJECT (and old-format PROD) actions from log *data*.

    :param data: decoded contents of one daily log file
    :param logdate: the log's "YYYY-MM" name, used to select the log format
        relative to FORMAT_SWITCH (logs dated exactly FORMAT_SWITCH are
        scanned with both formats, as in the original code)
    :return: the newest YYYYMMDDHHMMSS timestamp processed

    Fix: the new-format branch contained a second, identical
    ``if date not in stats`` initialisation that could never fire (the first
    one had just run); the dead duplicate is removed.
    """
    latest_timestamp: str = stats["timestamp"]

    def _ensure_date(date: str) -> None:
        # Create the per-month bucket on first sight.
        if date not in stats:
            stats[date] = {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            }

    def _ensure_member(bucket: dict, member: str) -> None:
        # Create the per-member counters on first sight.
        if member not in bucket["members"]:
            bucket["members"][member] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}

    if logdate <= FORMAT_SWITCH:
        # Old format: one regex match per lisa/process-new session; the first
        # word of the session body is the acting member.
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith("Accepting"):
                    action = "ACCEPT"
                elif action.startswith("rejected"):
                    action = "REJECT"
                timestamp = entry[0]
                if stats["timestamp"] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                _ensure_date(date)
                stats[date]["stats"][action] += 1
                stats["history"]["stats"][action] += 1
                _ensure_member(stats[date], who)
                stats[date]["members"][who][action] += 1
                _ensure_member(stats["history"], who)
                stats["history"]["members"][who][action] += 1
                latest_timestamp = timestamp
        # Old-format PROD actions only show up in the mail archive.
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats["timestamp"] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            # The date bucket is created before the blacklist check so that a
            # month seen only via blacklisted entries still exists -- this
            # ordering is preserved from the original code.
            _ensure_date(date)
            member = entry[1]
            if member in blacklisted:
                continue
            _ensure_member(stats[date], member)
            _ensure_member(stats["history"], member)
            stats[date]["stats"][action] += 1
            stats[date]["members"][member][action] += 1
            stats["history"]["stats"][action] += 1
            stats["history"]["members"][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp

323 

324 

def parse_prod(logdate: str) -> None:
    # Count PROD actions (reviewer comments mailed to uploaders) for the
    # month *logdate* by scanning the matching xz-compressed mail archive.
    global stats
    global users
    # "YYYY-MM" -> "YYMM": the suffix used in the archive file names.
    maildate = "".join([x[-2:] for x in logdate.split("-")])
    mailarchive = join(
        utils.get_conf()["Dir::Base"], "mail/archive", "mail-%s.xz" % maildate
    )
    if not isfile(mailarchive):
        # No archive for that month -- nothing to do.
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()["Dir::TempPath"]) as tmpfile:
        # Decompress into a temp file so mbox() can seek within it.
        with open(mailarchive, "rb") as fh:
            subprocess.check_call(["xzcat"], stdin=fh, stdout=tmpfile)
        for message in mbox(tmpfile.name):
            if message["subject"] and message["subject"].startswith(
                "Comments regarding"
            ):
                try:
                    # Drop the last From-header token (presumably the email
                    # address -- TODO confirm) and resolve the remaining real
                    # name through the LDAP users map; unknown senders skip.
                    member = users[" ".join(message["From"].split()[:-1])]
                except KeyError:
                    continue
                message_date = parsedate_tz(message["date"])
                assert message_date is not None
                ts = mktime_tz(message_date)
                # Normalise to the same YYYYMMDDHHMMSS form the logs use.
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {
                        "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                        "members": {},
                    }
                if member not in stats[date]["members"]:
                    stats[date]["members"][member] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                if member not in stats["history"]["members"]:
                    stats["history"]["members"][member] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                stats[date]["stats"]["PROD"] += 1
                stats[date]["members"][member]["PROD"] += 1
                stats["history"]["stats"]["PROD"] += 1
                stats["history"]["members"][member]["PROD"] += 1

371 

372 

def parse_timestamp(timestamp: str) -> str:
    """Reduce a YYYYMMDDHHMMSS timestamp string to its "YYYY-MM" month key."""
    year, month = int(timestamp[:4]), int(timestamp[4:6])
    return f"{year}-{month:02d}"

377 

378 

def new_stats(logdir: str, yaml: str) -> None:
    """Update the NEW-queue statistics from the daily logs under *logdir*.

    Previously accumulated figures are loaded from the YAML file *yaml* (if
    readable), extended with log entries newer than the recorded timestamp,
    and written back to the same file.
    """
    global stats
    # Best-effort load of the previous run's state.
    try:
        with open(yaml, "r") as fd:
            stats = safe_load(fd)
    except OSError:
        pass
    if not stats:
        # First run: empty history, timestamp at the epoch.
        stats = {
            "history": {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            },
            "timestamp": "19700101000000",
        }
    latest_timestamp = stats["timestamp"]
    decompressors = ((".bz2", "bzcat"), (".xz", "xzcat"), (".zst", "zstdcat"))
    for fn in sorted(listdir(logdir)):
        if fn == "current":
            continue
        log = splitext(fn)[0]
        if log < parse_timestamp(stats["timestamp"]):
            # Entirely older than what was already processed.
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            for suffix, tool in decompressors:
                if fn.endswith(suffix):
                    # External decompressors are used because python2's bz2
                    # module did not support multi-stream files
                    # (http://bugs.python.org/issue1625).
                    with open(logfile, "rb") as fh:
                        data = subprocess.check_output([tool], stdin=fh)
                    break
            else:
                with open(logfile, "rb") as fh:
                    data = fh.read()
            try:
                data_str = data.decode()
            except UnicodeDecodeError:
                data_str = data.decode("latin1")
            # Fold each parser's newest timestamp into the running maximum.
            latest_timestamp = max(latest_timestamp, parse_new_uploads(data_str))
            latest_timestamp = max(latest_timestamp, parse_actions(data_str, log))
            # Progress indicator: one dot per processed log file.
            stderr.write(".")
            stderr.flush()
    stderr.write("\n")
    stderr.flush()
    stats["timestamp"] = latest_timestamp
    with open(yaml, "w") as fd:
        safe_dump(stats, fd)

435 

436 

437################################################################################ 

438 

439 

def main() -> None:
    """Entry point: parse the command line and dispatch to the chosen MODE.

    Fix: the block as rendered contained coverage-report branch annotations
    fused into two code lines ("448 ↛ 446..." and "454 ↛ 457..."), making it
    syntactically invalid; this restores clean source.
    """
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [("h", "help", "Stats::Options::Help")]
    # Ensure every declared option key exists before apt_pkg parses argv.
    for i in ["help"]:
        key = "Stats::Options::%s" % i
        if key not in Cnf:
            Cnf[key] = ""  # type: ignore[index]

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)  # type: ignore[attr-defined]

    Options = Cnf.subtree("Stats::Options")  # type: ignore[attr-defined]
    if Options["Help"]:
        usage()

    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        # Extra arguments are only legal for "new" (its output file).
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        # "new" with no second argument is missing its output file.
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    if mode == "arch-space":
        per_arch_space_use()
    elif mode == "pkg-nums":
        number_of_packages()
    elif mode == "daily-install":
        daily_install_stats()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)

481 

482 

483################################################################################ 

484 

485 

# Allow running this module directly as a script.
if __name__ == "__main__":
    main()