1#! /usr/bin/env python3 

2 

3"""Various statistical pr0nography fun and games""" 

4# Copyright (C) 2000, 2001, 2002, 2003, 2006 James Troup <james@nocrew.org> 

5# Copyright (C) 2013 Luca Falavigna <dktrkranz@debian.org> 

6 

7# This program is free software; you can redistribute it and/or modify 

8# it under the terms of the GNU General Public License as published by 

9# the Free Software Foundation; either version 2 of the License, or 

10# (at your option) any later version. 

11 

12# This program is distributed in the hope that it will be useful, 

13# but WITHOUT ANY WARRANTY; without even the implied warranty of 

14# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

15# GNU General Public License for more details. 

16 

17# You should have received a copy of the GNU General Public License 

18# along with this program; if not, write to the Free Software 

19# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 

20 

21################################################################################ 

22 

23# <aj> can we change the standards instead? 

24# <neuro> standards? 

25# <aj> whatever we're not conforming to 

26# <aj> if there's no written standard, why don't we declare linux as 

27# the defacto standard 

28# <aj> go us! 

29 

30# [aj's attempt to avoid ABI changes for released architecture(s)] 

31 

32################################################################################ 

33 

34import subprocess 

35import sys 

36import tempfile 

37from datetime import datetime 

38from email.utils import mktime_tz, parsedate_tz 

39from mailbox import mbox 

40from os import listdir 

41from os.path import isfile, join, splitext 

42from re import DOTALL, MULTILINE, findall 

43from sys import stderr 

44 

45import apt_pkg 

46from yaml import safe_dump, safe_load 

47 

48from daklib import utils 

49from daklib.dbconn import Architecture, DBConn, Suite, get_suite_architectures 

50 

51################################################################################ 

52 

# Global configuration object; initialised from utils.get_conf() in main().
Cnf = None

# Accumulated NEW-queue statistics, keyed by "YYYY-MM" month strings plus
# two special keys: "history" (running totals) and "timestamp" (the
# high-water mark of the last processed log entry, "YYYYMMDDhhmmss").
stats = {}
# Mapping from LDAP real names to team member logins; filled in main()
# for the "new" mode.
users = {}
buffer = 0  # NOTE(review): appears unused in this file (and shadows the builtin) -- confirm before removing
# Month ("YYYY-MM") at which the log format switched from the old
# lisa/process-new style to the new dak style.
FORMAT_SWITCH = "2009-08"
# Pseudo-users whose log actions must not be credited to a person.
blacklisted = ("dak", "katie")

# Matches a log line recording a package entering the NEW queue:
# group 1 is the 14-digit timestamp.
NEW = (
    r"^(\d{14})\|(?:jennifer|process-unchecked|.*?\|dak)"
    r"\|(Moving to new|ACCEPT-TO-NEW)"
)
# New-format action line: (timestamp, member, action) groups.
new_ACTIONS = r"^(\d{14})\|[^\|]*\|(\S+)\|NEW (\S+)[:\|]"
# Old-format session: everything between "program start" and "program end",
# the acting member being the first word of the captured span.
old_ACTIONS = (
    r"(?:lisa|process-new)\|program start\|(.*?)\|" r"(?:lisa|process-new)\|program end"
)
# Single action line inside an old-format session: (timestamp, action).
old_ACTION = r"^(\d{14})\|(?:lisa|process-new)\|(Accepting changes|rejected)\|"

70 

71################################################################################ 

72 

73 

def usage(exit_code=0):
    """Print the command-line help for ``dak stats`` and exit.

    :param exit_code: process exit status (0 for plain --help, 1 on misuse)
    """
    help_text = """Usage: dak stats MODE
Print various stats.

  -h, --help show this help and exit.

The following MODEs are available:

  arch-space - displays space used by each architecture
  pkg-nums - displays the number of packages by suite/architecture
  daily-install - displays daily install stats suitable for graphing
  new - stores stats about the NEW queue
"""
    print(help_text)
    sys.exit(exit_code)

90 

91 

92################################################################################ 

93 

94 

def per_arch_space_use():
    """Print the total archive space used per architecture, then the
    space used by source artefacts (.diff.gz/.tar.gz/.dsc files)."""
    session = DBConn().session()
    arch_rows = session.execute(
        """
SELECT a.arch_string as Architecture, sum(f.size) AS sum
  FROM files f, binaries b, architecture a
  WHERE a.id=b.architecture AND f.id=b.file
  GROUP BY a.arch_string ORDER BY sum"""
    ).fetchall()
    for row in arch_rows:
        print("%-15.15s %s" % (row[0], row[1]))
    print()
    source_rows = session.execute(
        "SELECT sum(size) FROM files WHERE filename ~ '.(diff.gz|tar.gz|dsc)$'"
    ).fetchall()
    print("%-15.15s %s" % ("Source", source_rows[0][0]))

111 

112 

113################################################################################ 

114 

115 

def daily_install_stats():
    """Print per-day install statistics read from the log file "2001-11".

    Each output line is "<YYYYMMDD> <packages> <MiB>": the number of
    "installing changes" entries and the summed sixth field (bytes) of
    "installed" entries, converted to whole mebibytes.
    """
    stats = {}
    # Use a context manager so the log file is always closed -- the
    # original opened it and never closed the handle.
    with open("2001-11") as f:
        for line in f:
            split = line.strip().split("|")
            program = split[1]
            # Only katie / process-accepted entries are install records.
            if program != "katie" and program != "process-accepted":
                continue
            action = split[2]
            if action != "installing changes" and action != "installed":
                continue
            date = split[0][:8]  # YYYYMMDD prefix of the timestamp
            if date not in stats:
                stats[date] = {"packages": 0, "size": 0.0}
            if action == "installing changes":
                stats[date]["packages"] += 1
            elif action == "installed":
                # Field 5 holds the byte size of the installed upload.
                stats[date]["size"] += float(split[5])

    for date in sorted(stats):
        packages = stats[date]["packages"]
        size = int(stats[date]["size"] / 1024.0 / 1024.0)
        print("%s %s %s" % (date, packages, size))

142 

143 

144################################################################################ 

145 

146 

def output_format(suite):
    """Abbreviate a suite name to the initials of its dash-separated
    components, e.g. "testing-proposed-updates" -> "t-p-u"."""
    return "-".join(component[0] for component in suite.split("-"))

152 

153 

def number_of_packages():
    """Print a table of package counts: one row per architecture, one
    column per suite (suite names abbreviated via output_format)."""
    arches = {}  # arch_id -> arch_string
    arch_ids = {}  # arch_string -> arch_id
    suites = {}  # suite_id -> suite_name
    suite_ids = {}  # suite_name -> suite_id
    d = {}  # d[suite_id][arch_id] -> package count
    session = DBConn().session()
    # Build up suite mapping
    for i in session.query(Suite).all():
        suites[i.suite_id] = i.suite_name
        suite_ids[i.suite_name] = i.suite_id
    # Build up architecture mapping
    for i in session.query(Architecture).all():
        arches[i.arch_id] = i.arch_string
        arch_ids[i.arch_string] = i.arch_id
    # Pre-create the dictionary so every (suite, arch) cell exists
    for suite_id in suites.keys():
        d[suite_id] = {}
        for arch_id in arches.keys():
            d[suite_id][arch_id] = 0
    # Get the raw data for binaries
    # Simulate 'GROUP by suite, architecture' with a dictionary
    # XXX: Why don't we just get the DB to do this?
    for i in session.execute(
        """SELECT suite, architecture, COUNT(suite)
    FROM bin_associations
    LEFT JOIN binaries ON bin = binaries.id
    GROUP BY suite, architecture"""
    ).fetchall():
        d[i[0]][i[1]] = i[2]
    # Get the raw data for source; counts are folded into the
    # pseudo-architecture "source".
    arch_id = arch_ids["source"]
    for i in session.execute(
        "SELECT suite, COUNT(suite) FROM src_associations GROUP BY suite"
    ).fetchall():
        (suite_id, count) = i
        d[suite_id][arch_id] = d[suite_id][arch_id] + count
    ## Print the results
    # Setup: record which architectures each suite actually carries,
    # so missing combinations can be shown as "-" instead of 0.
    suite_list = list(suites.values())
    suite_id_list = []
    suite_arches = {}  # suite_id -> {arch_string: ""} membership map
    for suite in suite_list:
        suite_id = suite_ids[suite]
        suite_arches[suite_id] = {}
        for arch in get_suite_architectures(suite):
            suite_arches[suite_id][arch.arch_string] = ""
        suite_id_list.append(suite_id)
    output_list = [output_format(i) for i in suite_list]
    longest_suite = max(len(suite) for suite in output_list)
    arch_list = sorted(arches.values())
    longest_arch = max(len(arch) for arch in arch_list)
    # Header row: blank architecture column, then one centred suite column.
    output = (" " * longest_arch) + " |"
    for suite in output_list:
        output = output + suite.center(longest_suite) + " |"
    # The dashed separator is exactly as wide as the header built so far.
    output = output + "\n" + (len(output) * "-") + "\n"
    # per-arch data
    for arch in arch_list:
        arch_id = arch_ids[arch]
        output = output + arch.center(longest_arch) + " |"
        for suite_id in suite_id_list:
            if arch in suite_arches[suite_id]:
                count = "%d" % d[suite_id][arch_id]
            else:
                count = "-"
            output = output + count.rjust(longest_suite) + " |"
        output = output + "\n"
    print(output)

223 

224 

225################################################################################ 

226 

227 

def parse_new_uploads(data):
    """Count NEW-queue arrivals found in *data*, updating the global stats.

    Entries at or before the stored high-water mark stats["timestamp"]
    are skipped.  Returns the newest timestamp seen (or the previous
    high-water mark when nothing new was found).
    """
    global stats
    latest_timestamp = stats["timestamp"]
    for match in findall(NEW, data, MULTILINE):
        when = match[0]
        if when <= stats["timestamp"]:
            continue
        month = parse_timestamp(when)
        # Ensure the per-month bucket exists before counting into it.
        stats.setdefault(
            month,
            {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            },
        )
        stats[month]["stats"]["NEW"] += 1
        stats["history"]["stats"]["NEW"] += 1
        latest_timestamp = when
    return latest_timestamp

245 

246 

def parse_actions(data, logdate):
    """Parse ACCEPT/REJECT actions on the NEW queue from one monthly log.

    :param data: decoded text of the log file
    :param logdate: the log's "YYYY-MM" name, compared against
        FORMAT_SWITCH to pick the old and/or new parsing format
    :return: the newest timestamp processed (string, "YYYYMMDDhhmmss")

    Updates both the per-month and the "history" counters in the global
    stats dict; for old-format months PROD actions are additionally
    recovered from the mail archive via parse_prod().
    """
    global stats
    latest_timestamp = stats["timestamp"]
    if logdate <= FORMAT_SWITCH:
        # Old format: actions are grouped into "program start".."program end"
        # sessions; the acting member is the first word of the session.
        for batch in findall(old_ACTIONS, data, DOTALL):
            who = batch.split()[0]
            if who in blacklisted:
                continue
            for entry in findall(old_ACTION, batch, MULTILINE):
                action = entry[1]
                if action.startswith("Accepting"):
                    action = "ACCEPT"
                elif action.startswith("rejected"):
                    action = "REJECT"
                timestamp = entry[0]
                # Skip anything already covered by a previous run.
                if stats["timestamp"] >= timestamp:
                    continue
                date = parse_timestamp(entry[0])
                if date not in stats:
                    stats[date] = {
                        "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                        "members": {},
                    }
                stats[date]["stats"][action] += 1
                stats["history"]["stats"][action] += 1
                if who not in stats[date]["members"]:
                    stats[date]["members"][who] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
                stats[date]["members"][who][action] += 1
                if who not in stats["history"]["members"]:
                    stats["history"]["members"][who] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                stats["history"]["members"][who][action] += 1
                latest_timestamp = timestamp
        # PROD actions are not in the old logs, only in archived mail.
        parse_prod(logdate)
    if logdate >= FORMAT_SWITCH:
        # New format: each action is one self-contained log line.
        for entry in findall(new_ACTIONS, data, MULTILINE):
            action = entry[2]
            timestamp = entry[0]
            if stats["timestamp"] >= timestamp:
                continue
            date = parse_timestamp(timestamp)
            if date not in stats:
                stats[date] = {
                    "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                    "members": {},
                }
            member = entry[1]
            # NOTE: blacklisted entries still create the (empty) per-month
            # bucket above, matching the original behaviour.
            if member in blacklisted:
                continue
            # (The original repeated the "date not in stats" creation here,
            # but the bucket was already ensured above -- dead code removed.)
            if member not in stats[date]["members"]:
                stats[date]["members"][member] = {"ACCEPT": 0, "REJECT": 0, "PROD": 0}
            if member not in stats["history"]["members"]:
                stats["history"]["members"][member] = {
                    "ACCEPT": 0,
                    "REJECT": 0,
                    "PROD": 0,
                }
            stats[date]["stats"][action] += 1
            stats[date]["members"][member][action] += 1
            stats["history"]["stats"][action] += 1
            stats["history"]["members"][member][action] += 1
            latest_timestamp = timestamp
    return latest_timestamp

318 

319 

def parse_prod(logdate):
    """Count PROD actions for the month *logdate* ("YYYY-MM").

    Old-format logs do not record prods, so they are reconstructed from
    the archived outgoing "Comments regarding ..." mails for that month.
    Silently returns if no mail archive exists for the month.
    """
    global stats
    global users
    # "YYYY-MM" -> "YYMM": the suffix used by the mail archive file names.
    maildate = "".join([x[-2:] for x in logdate.split("-")])
    mailarchive = join(
        utils.get_conf()["Dir::Base"], "mail/archive", "mail-%s.xz" % maildate
    )
    if not isfile(mailarchive):
        return
    with tempfile.NamedTemporaryFile(dir=utils.get_conf()["Dir::TempPath"]) as tmpfile:
        # Decompress to a real temp file so mbox() can open it by name.
        with open(mailarchive, "rb") as fh:
            subprocess.check_call(["xzcat"], stdin=fh, stdout=tmpfile)
        for message in mbox(tmpfile.name):
            if message["subject"] and message["subject"].startswith(
                "Comments regarding"
            ):
                try:
                    # Map the sender's real name (everything before the
                    # address) to a team member; skip unknown senders.
                    member = users[" ".join(message["From"].split()[:-1])]
                except KeyError:
                    continue
                # Convert the RFC 2822 Date header to the log timestamp form.
                ts = mktime_tz(parsedate_tz(message["date"]))
                timestamp = datetime.fromtimestamp(ts).strftime("%Y%m%d%H%M%S")
                date = parse_timestamp(timestamp)
                if date not in stats:
                    stats[date] = {
                        "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                        "members": {},
                    }
                if member not in stats[date]["members"]:
                    stats[date]["members"][member] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                if member not in stats["history"]["members"]:
                    stats["history"]["members"][member] = {
                        "ACCEPT": 0,
                        "REJECT": 0,
                        "PROD": 0,
                    }
                stats[date]["stats"]["PROD"] += 1
                stats[date]["members"][member]["PROD"] += 1
                stats["history"]["stats"]["PROD"] += 1
                stats["history"]["members"][member]["PROD"] += 1

364 

365 

def parse_timestamp(timestamp):
    """Reduce a "YYYYMMDDhhmmss" log stamp to its "YYYY-MM" month key."""
    return "%d-%02d" % (int(timestamp[:4]), int(timestamp[4:6]))

370 

371 

def _read_log(logfile, fn):
    """Return the decoded text of *logfile*, decompressing by extension.

    Uses external bzcat/xzcat/zstdcat because python2's modules could not
    handle multi-stream files (http://bugs.python.org/issue1625); falls
    back to latin1 when the data is not valid UTF-8.
    """
    decompressors = {".bz2": "bzcat", ".xz": "xzcat", ".zst": "zstdcat"}
    ext = splitext(fn)[1]
    if ext in decompressors:
        with open(logfile, "rb") as fh:
            data = subprocess.check_output([decompressors[ext]], stdin=fh)
    else:
        with open(logfile, "rb") as fh:
            data = fh.read()
    try:
        return data.decode()
    except UnicodeDecodeError:
        return data.decode("latin1")


def new_stats(logdir, yaml):
    """Update the NEW-queue statistics stored in the YAML file *yaml*.

    :param logdir: directory containing the monthly dak log files
    :param yaml: path of the YAML stats file to read and rewrite

    Loads the existing stats (starting fresh if the file is missing or
    empty), folds in every log newer than the stored high-water mark via
    parse_new_uploads()/parse_actions(), then writes the stats back.
    Progress is indicated with one dot per processed log on stderr.
    """
    global Cnf
    global stats
    try:
        with open(yaml, "r") as fd:
            stats = safe_load(fd)
    except OSError:
        # No stats file yet: start from scratch below.
        pass
    if not stats:
        stats = {
            "history": {
                "stats": {"NEW": 0, "ACCEPT": 0, "REJECT": 0, "PROD": 0},
                "members": {},
            },
            "timestamp": "19700101000000",
        }
    latest_timestamp = stats["timestamp"]
    for fn in sorted(listdir(logdir)):
        if fn == "current":
            continue
        log = splitext(fn)[0]
        # Skip months already fully processed in a previous run.
        if log < parse_timestamp(stats["timestamp"]):
            continue
        logfile = join(logdir, fn)
        if isfile(logfile):
            data = _read_log(logfile, fn)
            ts = parse_new_uploads(data)
            if ts > latest_timestamp:
                latest_timestamp = ts
            ts = parse_actions(data, log)
            if ts > latest_timestamp:
                latest_timestamp = ts
            stderr.write(".")
            stderr.flush()
    stderr.write("\n")
    stderr.flush()
    stats["timestamp"] = latest_timestamp
    with open(yaml, "w") as fd:
        safe_dump(stats, fd)

428 

429 

430################################################################################ 

431 

432 

def main():
    """Entry point: parse command-line options and dispatch to a MODE."""
    global Cnf
    global users

    Cnf = utils.get_conf()
    Arguments = [("h", "help", "Stats::Options::Help")]
    # Ensure every option key exists so apt_pkg parsing cannot fail on it.
    for i in ["help"]:
        key = "Stats::Options::%s" % i
        if key not in Cnf:
            Cnf[key] = ""

    args = apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    Options = Cnf.subtree("Stats::Options")
    if Options["Help"]:
        usage()

    if len(args) < 1:
        utils.warn("dak stats requires a MODE argument")
        usage(1)
    elif len(args) > 1:
        # Only the "new" mode takes a second argument (its output file).
        if args[0].lower() != "new":
            utils.warn("dak stats accepts only one MODE argument")
            usage(1)
    elif args[0].lower() == "new":
        utils.warn("new MODE requires an output file")
        usage(1)
    mode = args[0].lower()

    if mode == "arch-space":
        per_arch_space_use()
    elif mode == "pkg-nums":
        number_of_packages()
    elif mode == "daily-install":
        daily_install_stats()
    elif mode == "new":
        # The "new" mode needs the LDAP name->login map for parse_prod().
        users = utils.get_users_from_ldap()
        new_stats(Cnf["Dir::Log"], args[1])
    else:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)


################################################################################


if __name__ == "__main__":
    main()