#! /usr/bin/env python3

""" Various statistical pr0nography fun and games """
# Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org>
# Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org>

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

################################################################################

# <aj> can we change the standards instead?
# <neuro> standards?
# <aj> whatever we're not conforming to
# <aj> if there's no written standard, why don't we declare linux as
#      the defacto standard
# <aj> go us!

# [aj's attempt to avoid ABI changes for released architecture(s)]

################################################################################


import subprocess
import sys
import tempfile
import apt_pkg

from datetime import datetime
from email.utils import mktime_tz, parsedate_tz
from mailbox import mbox
from os import listdir
from os.path import isfile, join, splitext
from re import findall, DOTALL, MULTILINE
from sys import stderr
from yaml import safe_load, safe_dump

from daklib import utils
from daklib.dbconn import DBConn, get_suite_architectures, Suite, Architecture

################################################################################

Cnf = None

stats = {}
users = {}
buffer = 0
FORMAT_SWITCH = '2009-08'
blacklisted = ('dak', 'katie')

NEW = (r'^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)'
       r'\|(Moving to new|ACCEPT-TO-NEW)')
new_ACTIONS = r'^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]'
old_ACTIONS = (r'(?:lisa|process-new)\|program start\|(.*?)\|'
               r'(?:lisa|process-new)\|program end')
old_ACTION = r'^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|'
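# Illustrative (hypothetical) log lines for the patterns above -- the exact
# field layout is only an assumption derived from the regexes themselves,
# not a format specification:
#   NEW         e.g. "20091001120000|process-unchecked|Moving to new|..."
#   new_ACTIONS e.g. "20091001120000|process-new|jdoe|NEW ACCEPT: foo_1.0"
#   old_ACTION  e.g. "20090101120000|process-new|Accepting changes|..."
# FORMAT_SWITCH marks the month ('2009-08') at which parse_actions() switches
# from the old lisa/process-new log format to the new one.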


################################################################################


def usage(exit_code=0):
    print("""Usage: dak stats MODE
Print various stats.

  -h, --help                show this help and exit.

The following MODEs are available:

  arch-space    - displays space used by each architecture
  pkg-nums      - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new           - stores stats about the NEW queue
""")
    sys.exit(exit_code)

################################################################################


def per_arch_space_use():
    session = DBConn().session()
    q = session.execute("""
SELECT a.arch_string as Architecture, sum(f.size) AS sum
  FROM files f, binaries b, architecture a
  WHERE a.id=b.architecture AND f.id=b.file
  GROUP BY a.arch_string ORDER BY sum""").fetchall()
    for j in q:
        print("%-15.15s %s" % (j[0], j[1]))
    print()
    q = session.execute("SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'").fetchall()
    print("%-15.15s %s" % ("Source", q[0][0]))
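# per_arch_space_use() prints one line per architecture followed by a total for
# source artifacts, e.g. (numbers purely illustrative):
#   amd64           123456789012
#   Source          98765432109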


################################################################################


def daily_install_stats():
    stats = {}
    f = open("2001-11")
    for line in f.readlines():
        split = line.strip().split('|')
        program = split[1]
        if program != "katie" and program != "process-accepted":
            continue
        action = split[2]
        if action != "installing changes" and action != "installed":
            continue
        date = split[0][:8]
        if date not in stats:
            stats[date] = {}
            stats[date]["packages"] = 0
            stats[date]["size"] = 0.0
        if action == "installing changes":
            stats[date]["packages"] += 1
        elif action == "installed":
            stats[date]["size"] += float(split[5])

    dates = sorted(stats)
    for date in dates:
        packages = stats[date]["packages"]
        size = int(stats[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))
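# daily_install_stats() reads a pipe-separated log (the hard-coded file
# "2001-11") and expects at least: timestamp|program|action|...|...|size.
# A hypothetical matching line: "20011101120000|katie|installed|x|y|1234567"
# (only fields 0, 1, 2 and 5 are actually used above).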


################################################################################


def output_format(suite):
    output_suite = []
    for word in suite.split("-"):
        output_suite.append(word[0])
    return "-".join(output_suite)
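# output_format() abbreviates a suite name to the initials of its
# dash-separated parts, e.g. output_format("testing-proposed-updates") -> "t-p-u".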



def number_of_packages():
    arches = {}
    arch_ids = {}
    suites = {}
    suite_ids = {}
    d = {}
    session = DBConn().session()
    # Build up suite mapping
    for i in session.query(Suite).all():
        suites[i.suite_id] = i.suite_name
        suite_ids[i.suite_name] = i.suite_id
    # Build up architecture mapping
    for i in session.query(Architecture).all():
        arches[i.arch_id] = i.arch_string
        arch_ids[i.arch_string] = i.arch_id
    # Pre-create the dictionary
    for suite_id in suites.keys():
        d[suite_id] = {}
        for arch_id in arches.keys():
            d[suite_id][arch_id] = 0
    # Get the raw data for binaries
    # Simulate 'GROUP BY suite, architecture' with a dictionary
    # XXX: Why don't we just get the DB to do this?
    for i in session.execute("""SELECT suite, architecture, COUNT(suite)
                                  FROM bin_associations
                             LEFT JOIN binaries ON bin = binaries.id
                              GROUP BY suite, architecture""").fetchall():
        d[i[0]][i[1]] = i[2]
    # Get the raw data for source
    arch_id = arch_ids["source"]
    for i in session.execute('SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite').fetchall():
        (suite_id, count) = i
        d[suite_id][arch_id] = d[suite_id][arch_id] + count
    ## Print the results
    # Setup
    suite_list = list(suites.values())
    suite_id_list = []
    suite_arches = {}
    for suite in suite_list:
        suite_id = suite_ids[suite]
        suite_arches[suite_id] = {}
        for arch in get_suite_architectures(suite):
            suite_arches[suite_id][arch.arch_string] = ""
        suite_id_list.append(suite_id)
    output_list = [output_format(i) for i in suite_list]
    longest_suite = max(len(suite) for suite in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)
    # Header
    output = (" " * longest_arch) + " |"
    for suite in output_list:
        output = output + suite.center(longest_suite) + " |"
    output = output + "\n" + (len(output) * "-") + "\n"
    # per-arch data
    for arch in arch_list:
        arch_id = arch_ids[arch]
        output = output + arch.center(longest_arch) + " |"
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                count = "%d" % d[suite_id][arch_id]
            else:
                count = "-"
            output = output + count.rjust(longest_suite) + " |"
        output = output + "\n"
    print(output)
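# number_of_packages() prints a table with one column per suite (abbreviated
# via output_format()), one row per architecture, and "-" where an
# architecture is not part of the suite.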


################################################################################


def parse_new_uploads(data):
    global stats
    latest_timestamp = stats['timestamp']
    for entry in findall(NEW, data, MULTILINE):
        timestamp = entry[0]
        if stats['timestamp'] >= timestamp:
            continue
        date = parse_timestamp(timestamp)
        if date not in stats:
            stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                     'REJECT': 0, 'PROD': 0}, 'members': {}}
        stats[date]['stats']['NEW'] += 1
        stats['history']['stats']['NEW'] += 1
        latest_timestamp = timestamp
    return latest_timestamp


def parse_actions(data, logdate):
    global stats
    latest_timestamp = stats['timestamp']
    if logdate <= FORMAT_SWITCH:
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith('Accepting'):
                    action = 'ACCEPT'
                elif action.startswith('rejected'):
                    action = 'REJECT'
                timestamp = entry[0]
                if stats['timestamp'] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                             'REJECT': 0, 'PROD': 0}, 'members': {}}
                stats[date]['stats'][action] += 1
                stats['history']['stats'][action] += 1
                if who not in stats[date]['members']:
                    stats[date]['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
                                                   'PROD': 0}
                stats[date]['members'][who][action] += 1
                if who not in stats['history']['members']:
                    stats['history']['members'][who] = {'ACCEPT': 0, 'REJECT': 0,
                                                        'PROD': 0}
                stats['history']['members'][who][action] += 1
                latest_timestamp = timestamp
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats['timestamp'] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                         'REJECT': 0, 'PROD': 0}, 'members': {}}
            member = entry[1]
            if member in blacklisted:
                continue
            if date not in stats:
                stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                         'REJECT': 0, 'PROD': 0}, 'members': {}}
            if member not in stats[date]['members']:
                stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                  'PROD': 0}
            if member not in stats['history']['members']:
                stats['history']['members'][member] = {'ACCEPT': 0,
                                                       'REJECT': 0, 'PROD': 0}
            stats[date]['stats'][action] += 1
            stats[date]['members'][member][action] += 1
            stats['history']['stats'][action] += 1
            stats['history']['members'][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp


def parse_prod(logdate):
    global stats
    global users
    maildate = ''.join([x[-2:] for x in logdate.split('-')])
    mailarchive = join(utils.get_conf()['Dir::Base'], 'mail/archive',
                       'mail-%s.xz' % maildate)
    if not isfile(mailarchive):
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()['Dir::TempPath']) as tmpfile:
        with open(mailarchive, 'rb') as fh:
            subprocess.check_call(['xzcat'], stdin=fh, stdout=tmpfile)
        for message in mbox(tmpfile.name):
            if (message['subject']
                    and message['subject'].startswith('Comments regarding')):
                try:
                    member = users[' '.join(message['From'].split()[:-1])]
                except KeyError:
                    continue
                ts = mktime_tz(parsedate_tz(message['date']))
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {'stats': {'NEW': 0, 'ACCEPT': 0,
                                             'REJECT': 0, 'PROD': 0}, 'members': {}}
                if member not in stats[date]['members']:
                    stats[date]['members'][member] = {'ACCEPT': 0, 'REJECT': 0,
                                                      'PROD': 0}
                if member not in stats['history']['members']:
                    stats['history']['members'][member] = {'ACCEPT': 0,
                                                           'REJECT': 0, 'PROD': 0}
                stats[date]['stats']['PROD'] += 1
                stats[date]['members'][member]['PROD'] += 1
                stats['history']['stats']['PROD'] += 1
                stats['history']['members'][member]['PROD'] += 1
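# parse_prod() (above) counts PROD actions by scanning the mail archive for the
# month in question: a log date such as '2009-08' is mapped to 'mail-0908.xz'
# under Dir::Base/mail/archive, and every "Comments regarding ..." message is
# attributed to the sender as resolved through the LDAP-derived 'users' mapping.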



def parse_timestamp(timestamp):
    y = int(timestamp[:4])
    m = int(timestamp[4:6])
    return '%d-%02d' % (y, m)
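# parse_timestamp() reduces a 14-digit log timestamp to a month key,
# e.g. parse_timestamp("20090801123456") -> "2009-08".  Because the raw
# timestamps are fixed-width digit strings, the string comparisons against
# stats['timestamp'] above also order chronologically.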



def new_stats(logdir, yaml):
    global Cnf
    global stats
    try:
        with open(yaml, 'r') as fd:
            stats = safe_load(fd)
    except OSError:
        pass
    if not stats:
        stats = {'history': {'stats': {'NEW': 0, 'ACCEPT': 0,
                                       'REJECT': 0, 'PROD': 0}, 'members': {}},
                 'timestamp': '19700101000000'}
    latest_timestamp = stats['timestamp']
    for fn in sorted(listdir(logdir)):
        if fn == 'current':
            continue
        log = splitext(fn)[0]
        if log < parse_timestamp(stats['timestamp']):
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            if fn.endswith('.bz2'):
                # This hack is required because python2 does not support
                # multi-stream files (http://bugs.python.org/issue1625)
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['bzcat'], stdin=fh)
            elif fn.endswith('.xz'):
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['xzcat'], stdin=fh)
            elif fn.endswith('.zst'):
                with open(logfile, 'rb') as fh:
                    data = subprocess.check_output(['zstdcat'], stdin=fh)
            else:
                with open(logfile, 'rb') as fd:
                    data = fd.read()
            try:
                data = data.decode()
            except UnicodeDecodeError:
                data = data.decode('latin1')
            ts = parse_new_uploads(data)
            if ts > latest_timestamp:
                latest_timestamp = ts
            ts = parse_actions(data, log)
            if ts > latest_timestamp:
                latest_timestamp = ts
            stderr.write('.')
            stderr.flush()
    stderr.write('\n')
    stderr.flush()
    stats['timestamp'] = latest_timestamp
    with open(yaml, 'w') as fd:
        safe_dump(stats, fd)
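# new_stats() persists the collected data back to the given YAML file.  Based
# on how 'stats' is built above, the resulting document has roughly this shape
# (keys and counts illustrative only):
#   timestamp: '20090801123456'
#   history:
#     stats: {NEW: 0, ACCEPT: 0, REJECT: 0, PROD: 0}
#     members: {}
#   '2009-08':
#     stats: {NEW: 0, ACCEPT: 0, REJECT: 0, PROD: 0}
#     members: {}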


################################################################################


def main():
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [('h', "help", "Stats::Options::Help")]
    for i in ["help"]:
        key = "Stats::Options::%s" % i
        if key not in Cnf:
            Cnf[key] = ""

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    Options = Cnf.subtree("Stats::Options")
    if Options["Help"]:
        usage()

    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    if mode == "arch-space":
        per_arch_space_use()
    elif mode == "pkg-nums":
        number_of_packages()
    elif mode == "daily-install":
        daily_install_stats()
    elif mode == "new":
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)

################################################################################


if __name__ == '__main__':
    main()