#!/usr/bin/env python3

"""
Generate an HTML page and PNG charts with the age of packages in NEW

To be run once a day.

The tool stores persistent status in the RRD database and the pickled queue
in order to speed up daily runs.
Directories and database files are created as needed.

If the RRD directory is deleted, the script rebuilds past years from the
historical logs, which can take a while; otherwise it only parses fresh data.
"""
# Copyright 2020-2024 Federico Ceratto <federico@debian.org>
# License: GPL-2+
# Tests are in tests/test_graph_new.py
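# Typical invocation, as a sketch (the option values are placeholders,
# see usage() and _handle_conf() below):
#   dak graph-new --rrd <RRD directory> --outdir <HTML output directory>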

from datetime import datetime, timedelta
from glob import glob
from textwrap import dedent
from typing import Optional, Iterator
from os import makedirs
import pickle
import os.path
import sys
import subprocess

import rrdtool  # debdeps: python3-rrdtool

import apt_pkg
from daklib import utils

SECONDS_IN_DAY = 86400
debug_mode = False


def debug(msg: str) -> None:
    """Verbose messages for debugging"""
    if debug_mode:
        print(msg)


def write_index(out_dir: str) -> None:
    """Generates index.html page in the output directory"""
    index_html = """
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <link rel="shortcut icon" href="https://www.debian.org/favicon.ico" />
    <title>Debian NEW queue</title>
  </head>
  <body>
    <p>How much time a new package spends in the NEW queue.</p>
    <p>Divided in percentiles: 0 (aka age of the oldest package), 50 (aka median) and 99</p>

    <img src="new_queue_wait_time_all.png">
    <p></p>
    <img src="new_queue_wait_time_year.png">
    <p></p>
    <img src="new_queue_wait_time_month.png">
    <p></p>
    <p>Generated on {}</p>
  </body>
</html>
"""
    fname = os.path.join(out_dir, "index.html")
    with open(fname, "w") as f:
        f.write(index_html.format(datetime.utcnow()))


def init_rrd(rrdfn: str, t: datetime) -> None:
    """Initialize empty RRD"""
    print(f"RRD file {rrdfn} not found. Initializing it now.")
    rrdtool.create(
        rrdfn,
        "--start",
        t.strftime("%s"),
        "--step",
        str(SECONDS_IN_DAY),  # 1 update per day
        "DS:p0:GAUGE:172800:U:U",  # 0 percentile, 2-day heartbeat
        "DS:p50:GAUGE:172800:U:U",  # 50 percentile
        "DS:p99:GAUGE:172800:U:U",  # 99 percentile
        "RRA:AVERAGE:0.5:1:3650",  # one daily value, kept for 10 years
    )


def gen_stats(t: datetime, q: dict) -> tuple:
    """Extract percentiles of package ages"""
    s = sorted(q.values())  # queue entry times, oldest first
    p0 = t - s[0]  # 0 percentile: age of the oldest package
    p50 = t - s[int(len(s) * 0.50)]  # 50 percentile: median age
    p99 = t - s[int(len(s) * 0.99)]  # 99 percentile
    return (p0, p50, p99)

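# Worked example for gen_stats(), assuming four packages that entered NEW
# 100, 30, 10 and 2 days before t: the returned ages are p0=100 days,
# p50=10 days and p99=2 days.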

def plot_graph(
    out_dir: str, fname: str, rrdfn: str, start, end: datetime, title: str
) -> None:
    fname = os.path.join(out_dir, fname)
    print(f"Writing {fname}")
    args = (
        "--start",
        start.strftime("%s"),
        "--end",
        end.strftime("%s"),
        "--width",
        "900",
        "--height",
        "200",
        "--title",
        "Days spent in NEW over {}".format(title),
        "-v",
        "days",
        f"DEF:p0={rrdfn}:p0:AVERAGE",
        f"DEF:p50={rrdfn}:p50:AVERAGE",
        f"DEF:p99={rrdfn}:p99:AVERAGE",
        "LINE1:p0#FF8080:0 percentile",
        "LINE2:p50#8F8080:50 percentile",
        "LINE3:p99#8FFF80:99 percentile",
    )
    # print(" ".join(args))
    rrdtool.graph(fname, *args)


def _handle_conf() -> tuple:
    """Load configuration parameters from Cnf"""
    Cnf = utils.get_conf()
    Arguments = [
        ("h", "help", "GraphNew::Options::Help"),
        ("r", "rrd", "GraphNew::Options::Rrd", "HasArg"),
        ("o", "outdir", "GraphNew::Options::Outdir", "HasArg"),
        ("b", "dak_base_dir", "Dir::Base", "HasArg"),
    ]
    apt_pkg.parse_commandline(Cnf, Arguments, sys.argv)

    try:
        Options = Cnf.subtree("GraphNew::Options")
    except KeyError:
        msg = dedent(
            """
            The GraphNew::Options subtree is missing from the config file
            Please use --rrd ... --outdir ... (see --help) or add:

            GraphNew
            {
              Options
              {
                Rrd "<RRD directory>";
                Outdir "<HTML output directory>";
              }
            }
            """
        )
        print(msg, file=sys.stderr)
        sys.exit(1)

    if "Help" in Options:
        usage()

    if "GraphNew::Options::Rrd" in Cnf:
        rrd_dir = Cnf["GraphNew::Options::Rrd"]
    elif "Dir::Rrd" in Cnf:
        rrd_dir = Cnf["Dir::Rrd"]
    else:
        print("No RRD directory configured.\n", file=sys.stderr)
        sys.exit(1)

    try:
        outdir = Cnf["GraphNew::Options::Outdir"]
    except Exception:
        print("No output directory configured\n", file=sys.stderr)
        sys.exit(1)

    log_dir = Cnf.get("Dir::Log")
    assert log_dir, "Dir::Log is missing from dak.conf or empty"
    return rrd_dir, outdir, log_dir


def skip_file(fn: str, last_update) -> bool:
    """Skip files that have already been processed"""
    if last_update is None:
        return False
    fn = os.path.split(fn)[1]
    basename, ext = os.path.splitext(fn)
    if ext == "":
        return False  # always process the last, uncompressed file
    filetime = datetime.strptime(basename, "%Y-%m")
    return filetime < last_update


def extract_queue_events(
    log_dir: str, last_update: Optional[datetime]
) -> Iterator[tuple]:
    """Extracts NEW queue events from log files"""
    compressed_glob = os.path.join(log_dir, "20*xz")
    debug(f"Scanning for compressed logfiles using glob '{compressed_glob}'")
    compressed_fns = sorted(glob(compressed_glob))
    fns = compressed_fns[5:]  # the 5 oldest logfiles have no events
    current_fn = os.path.join(log_dir, datetime.utcnow().strftime("%Y-%m"))
    debug(f"Also adding uncompressed log {current_fn}")
    fns.append(current_fn)
    print("%d files to process" % len(fns))
    events_per_month_cnt = None
    for fn in fns:
        if skip_file(fn, last_update):
            debug(f"Skipping {fn}")
            continue

        if events_per_month_cnt is not None:
            debug(f"Events in the month: {events_per_month_cnt}")

        events_per_month_cnt = 0
        debug(f"Processing {fn}")
        cmd = (
            """xzgrep -h -e ACCEPT-TO-NEW -e "|NEW ACCEPT|" -e "|REJECT|" """
            f"""-e "|NEW REJECT|" -e "|Policy Queue " {fn}"""
        )
        data = subprocess.check_output(cmd, shell=True).decode()
        for line in data.splitlines():
            line = line.rstrip()
            if line.startswith("#"):
                continue
            try:
                # pipe-separated: <timestamp>|<stage>|<actor>|<event>|...
                ts, _, _, event, remaining = line.split("|", 4)
            except ValueError:
                continue

            if event.startswith("Policy Queue "):
                _, pname = remaining.split("|", 1)
            else:
                pname = remaining

            assert "|" not in pname, repr(line)
            assert len(ts) == 14

            event_time = datetime.strptime(ts, "%Y%m%d%H%M%S")
            events_per_month_cnt += 1
            yield pname, event, event_time


def process_events(
    events, last_update: Optional[datetime], init_time: Optional[datetime], queue
) -> tuple:
    """Process logged events like ACCEPT-TO-NEW, ACCEPT, REJECT and
    update the RRD database accordingly"""
    glitch_cnt = 0
    processed_events_cnt = 0
    previous_day = None
    rrdtool_updates = []
    for pname, event, event_time in events:
        if last_update and last_update >= event_time:
            continue

        processed_events_cnt += 1
        if init_time is None:
            # Initialise RRD once
            i = event_time - timedelta(days=1)
            debug(f"Very first event at: {event_time}")
            init_time = i
            previous_day = event_time.date()

        pname = pname.rsplit(".", 1)[0]

        # Update queue dict
        exit_events = set(
            (
                "NEW ACCEPT",
                "NEW REJECT",
                "Policy Queue ACCEPT",
                "REJECT",
                "Policy Queue REJECT",
            )
        )
        if event == "ACCEPT-TO-NEW":
            queue[pname] = event_time
        elif event in exit_events:
            # Approved or rejected, exits NEW
            if pname in queue:
                queue.pop(pname)
            else:
                glitch_cnt += 1
        elif event == "ACCEPT":
            pass

        if event_time.date() != previous_day:
            # On day change run statistics, update RRD and queue.pickle file
            stats = gen_stats(event_time, queue)
            ux = event_time.strftime("%s")
            v = f"{ux}:{stats[0].days}:{stats[1].days}:{stats[2].days}"
            rrdtool_updates.append(v)

            previous_day = event_time.date()

    debug(f"glitches count: {glitch_cnt}")
    debug(f"processed events count: {processed_events_cnt}")
    debug(f"queue len: {len(queue)}")
    return rrdtool_updates, init_time


def generate_output(rrdfn: str, out_dir: str, init_time) -> None:
    """Generate png charts and index.html in the output directory"""
    end_time = rrdtool.lastupdate(rrdfn)["date"]
    ofn = "new_queue_wait_time_all.png"
    plot_graph(out_dir, ofn, rrdfn, init_time, end_time, "all")

    start_time = end_time - timedelta(days=365)
    ofn = "new_queue_wait_time_year.png"
    plot_graph(out_dir, ofn, rrdfn, start_time, end_time, "one year")

    start_time = end_time - timedelta(days=30)
    ofn = "new_queue_wait_time_month.png"
    plot_graph(out_dir, ofn, rrdfn, start_time, end_time, "one month")
    write_index(out_dir)


def main(verbose_debug=False) -> None:
    global debug_mode
    debug_mode = verbose_debug
    rrd_dir, out_dir, log_dir = _handle_conf()
    makedirs(rrd_dir, exist_ok=True)
    makedirs(out_dir, exist_ok=True)
    rrdfn = os.path.join(rrd_dir, "graph_new.rrd")
    queue_fn = os.path.join(rrd_dir, "graph_new.pickle")
    init_time: Optional[datetime] = None
    last_update: Optional[datetime] = None
    if os.path.isfile(rrdfn):
        init_time = datetime.fromtimestamp(rrdtool.first(rrdfn))
        last_update = rrdtool.lastupdate(rrdfn)["date"]
        with open(queue_fn, "rb") as f:
            queue = pickle.load(f)
    else:
        queue = {}

    events = tuple(extract_queue_events(log_dir, last_update))
    print(f"{len(events)} events to process")
    if events:
        debug(f"First event to process at: {events[0][2]}")

    rrdtool_updates, begin_time = process_events(events, last_update, init_time, queue)
    if rrdtool_updates:
        debug(f"First RRD update: {rrdtool_updates[0]}")

    with open(queue_fn, "wb") as f:
        pickle.dump(queue, f)

    if init_time is None:
        assert begin_time, "This happens only if no events are found at all"
        init_time = begin_time
        init_rrd(rrdfn, begin_time)

    for u in rrdtool_updates:
        rrdtool.update(rrdfn, u)

    generate_output(rrdfn, out_dir, init_time)


def usage(exit_code=0) -> None:
    msg = dedent(
        """\
        Usage: dak graph-new
        Graphs the age of packages in the NEW queue.

          -h, --help        show this help and exit.
          -r, --rrd=key     Directory where rrd files to be updated are stored
          -o, --outdir=key  Directory where the output is stored
        """
    )
    print(msg)
    sys.exit(exit_code)


# Uncomment this for manual/debugging runs
# if __name__ == "__main__":
#     main(verbose_debug=True)