Coverage for dak/check_archive.py: 14%
260 statements
« prev ^ index » next coverage.py v7.6.0, created at 2026-05-10 21:38 +0000
« prev ^ index » next coverage.py v7.6.0, created at 2026-05-10 21:38 +0000
1#! /usr/bin/env python3
3"""Various different sanity checks
5@contact: Debian FTP Master <ftpmaster@debian.org>
6@copyright: (C) 2000, 2001, 2002, 2003, 2004, 2006 James Troup <james@nocrew.org>
7@license: GNU General Public License version 2 or later
8"""
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24################################################################################
26# And, lo, a great and menacing voice rose from the depths, and with
27# great wrath and vehemence it's voice boomed across the
28# land... ``hehehehehehe... that *tickles*''
29# -- aj on IRC
31################################################################################
33import errno
34import os
35import stat
36import sys
37import time
38from collections.abc import Iterable
39from typing import NoReturn, cast
41import apt_pkg
42from sqlalchemy import sql
43from sqlalchemy.engine import CursorResult
45from daklib import utils
46from daklib.config import Config
47from daklib.dak_exceptions import InvalidDscError
48from daklib.dbconn import (
49 Archive,
50 ArchiveFile,
51 DBConn,
52 DBSource,
53 DSCFile,
54 PoolFile,
55 get_component_names,
56 get_or_set_metadatakey,
57 get_suite,
58 get_suite_architectures,
59)
61################################################################################
# Module-level state shared between the individual checks.
db_files: dict = {}  #: Cache of filenames as known by the database
waste = 0.0  #: How many bytes are "wasted" by files not referenced in database
excluded: dict = {}  #: Files which are excluded from files check
current_file: str | None = None  #: File being processed; read by Ent() when reporting
future_files: dict[str, int] = {}  #: Filename -> MTime, filled in by Ent() for future timestamps
current_time = time.time()  #: now(), captured once when the module is loaded
70################################################################################
def usage(exit_code=0) -> NoReturn:
    """
    Print the command-line help for ``dak check-archive`` and terminate.

    :param exit_code: status handed to sys.exit() once the help text has
                      been written to stdout.
    """
    help_text = """Usage: dak check-archive MODE
Run various sanity checks of the archive and/or database.

  -h, --help                show this help and exit.

The following MODEs are available:

  checksums          - validate the checksums stored in the database
  files              - check files in the database against what's in the archive
  dsc-syntax         - validate the syntax of .dsc files in the archive
  missing-overrides  - check for missing overrides
  source-in-one-dir  - ensure the source for each package is in one directory
  timestamps         - check for future timestamps in .deb's
  files-in-dsc       - ensure each .dsc references appropriate Files
  validate-indices   - ensure files mentioned in Packages & Sources exist
  files-not-symlinks - check files in the database aren't symlinks
  validate-builddeps - validate build-dependencies of .dsc files in the archive
  add-missing-source-checksums - add missing checksums for source packages
"""
    print(help_text)
    sys.exit(exit_code)
98################################################################################
def check_files() -> None:
    """
    Cross-check the files known to the database against the archive on disk.

    One line per problem is written to stdout:

     - ``MISSING-ARCHIVE-FILE``: a binary or source file is associated with a
       suite but has no files_archive_map entry for that suite's archive.
     - ``MISSING-FILE``: an ArchiveFile row whose path does not exist on disk.
     - ``UNEXPECTED-FILE``: a file found below an archive's pool/ directory
       that is not the path of any ArchiveFile row.
    """
    session = DBConn().session()

    binaries_not_in_archive = """
    SELECT archive.name, suite.suite_name, f.filename
      FROM binaries b
      JOIN bin_associations ba ON b.id = ba.bin
      JOIN suite ON ba.suite = suite.id
      JOIN archive ON suite.archive_id = archive.id
      JOIN files f ON b.file = f.id
     WHERE NOT EXISTS (SELECT 1 FROM files_archive_map af
                        WHERE af.archive_id = suite.archive_id
                          AND af.file_id = b.file)
     ORDER BY archive.name, suite.suite_name, f.filename
    """
    sources_not_in_archive = """
    SELECT archive.name, suite.suite_name, f.filename
      FROM source s
      JOIN src_associations sa ON s.id = sa.source
      JOIN suite ON sa.suite = suite.id
      JOIN archive ON suite.archive_id = archive.id
      JOIN dsc_files df ON s.id = df.source
      JOIN files f ON df.file = f.id
     WHERE NOT EXISTS (SELECT 1 FROM files_archive_map af
                        WHERE af.archive_id = suite.archive_id
                          AND af.file_id = df.file)
     ORDER BY archive.name, suite.suite_name, f.filename
    """
    # Binaries are reported first, then sources.
    for statement in (binaries_not_in_archive, sources_not_in_archive):
        for row in session.execute(sql.text(statement)):
            print("MISSING-ARCHIVE-FILE {0} {1} {2}".format(*row))

    archive_file_query = (
        session.query(ArchiveFile)
        .join(ArchiveFile.archive)
        .join(ArchiveFile.file)
        .order_by(Archive.archive_name, PoolFile.filename)
    )

    # Every path the database expects to exist; used below to spot strays.
    known_paths = set()
    for archive_file in archive_file_query:
        location = archive_file.path
        known_paths.add(location)
        if os.path.exists(location):
            continue
        print(
            "MISSING-FILE {0} {1} {2}".format(
                archive_file.archive.archive_name,
                archive_file.file.filename,
                location,
            )
        )

    for archive in session.query(Archive).order_by(Archive.archive_name):
        pool_root = os.path.join(archive.path, "pool")
        for directory, _subdirs, names in os.walk(pool_root):
            for name in names:
                candidate = os.path.join(directory, name)
                if candidate not in known_paths:
                    print(
                        "UNEXPECTED-FILE {0} {1}".format(
                            archive.archive_name, candidate
                        )
                    )
169################################################################################
def check_dscs() -> None:
    """
    Parse every .dsc file in the archive and check for its validity.

    Each source in the database (ordered by source name and version) is
    parsed with signing rules enforced.  Every failure is reported through
    utils.warn() and counted; a summary warning is emitted at the end if at
    least one .dsc was found to be invalid.  An OSError other than ENOENT is
    re-raised.
    """
    invalid = 0
    sources = (
        DBConn().session().query(DBSource).order_by(DBSource.source, DBSource.version)
    )

    for source in sources:
        dsc_path = source.poolfile.fullpath
        try:
            utils.parse_changes(dsc_path, signing_rules=1, dsc_file=True)
        except InvalidDscError:
            problem = "syntax error in .dsc file %s" % dsc_path
        except UnicodeDecodeError:
            problem = (
                "found invalid dsc file (%s), not properly utf-8 encoded" % dsc_path
            )
        except OSError as e:
            # Only a missing file counts as an invalid .dsc; anything else
            # (permissions, I/O errors, ...) is propagated.
            if e.errno != errno.ENOENT:
                raise
            problem = "missing dsc file (%s)" % dsc_path
        except Exception as e:
            problem = "miscellaneous error parsing dsc file (%s): %s" % (
                dsc_path,
                str(e),
            )
        else:
            continue

        utils.warn(problem)
        invalid += 1

    if invalid:
        utils.warn("Found %s invalid .dsc files." % (invalid))
205################################################################################
def check_override() -> None:
    """
    Check for missing overrides in stable and unstable.

    For each of the two suites a header is printed, followed by the names of
    the binary packages and then the source packages that are associated
    with the suite but have no override entry for it.
    """
    session = DBConn().session()

    binaries_without_override = sql.text(
        """
SELECT DISTINCT b.package FROM binaries b, bin_associations ba
 WHERE b.id = ba.bin AND ba.suite = :suiteid AND NOT EXISTS
       (SELECT 1 FROM override o WHERE o.suite = :suiteid AND o.package = b.package)"""
    )
    sources_without_override = sql.text(
        """
SELECT DISTINCT s.source FROM source s, src_associations sa
  WHERE s.id = sa.source AND sa.suite = :suiteid AND NOT EXISTS
       (SELECT 1 FROM override o WHERE o.suite = :suiteid and o.package = s.source)"""
    )

    for suite_name in ["stable", "unstable"]:
        print(suite_name)
        print("-" * len(suite_name))
        print()

        suite = get_suite(suite_name)
        assert suite is not None

        # Binaries are listed before sources.
        for statement in (binaries_without_override, sources_without_override):
            result = session.execute(statement, {"suiteid": suite.suite_id})
            for row in result.fetchall():
                print(row[0])
246################################################################################
def check_source_in_one_dir() -> None:
    """
    Ensure that the source files for any given package are all in one
    directory so that 'apt-get source' works...

    A source package is counted as broken when one of its files lives in a
    directory other than that of its first file and that directory does not
    contain an entry named like the first file.  Each offending file is
    printed, followed by a final count of broken source packages.
    """
    cnf = Config()
    session = DBConn().session()

    # Not the most enterprising method, but hey...
    broken_count = 0

    for source in session.query(DBSource).all():
        reference_dir = ""
        reference_file = ""
        scattered = False

        pool_files = (
            session.query(PoolFile).join(DSCFile).filter_by(source_id=source.source_id)
        )
        for pool_file in pool_files.all():
            full_name = os.path.join(cnf["Dir::Root"], pool_file.filename)
            directory = os.path.dirname(full_name)

            if reference_dir == "":
                # The first file seen defines the reference directory.
                reference_dir, reference_file = directory, full_name
                continue
            if directory == reference_dir:
                continue

            # A different directory is tolerated if it holds an entry with
            # the first file's name.
            symlink = directory + "/" + os.path.basename(reference_file)
            if os.path.exists(symlink):
                continue

            scattered = True
            print(
                "WOAH, we got a live one here... %s [%s] {%s}"
                % (full_name, source.source_id, symlink)
            )

        if scattered:
            broken_count += 1

    print(
        "Found %d source packages where the source is not all in one directory."
        % (broken_count)
    )
295################################################################################
def check_checksums() -> None:
    """
    Validate size and checksums of all files known to the database.

    For every PoolFile the on-disk size and the MD5, SHA-1 and SHA-256
    digests are compared with the values stored in the database; each
    mismatch is reported through utils.warn().  Files that cannot be opened
    are reported and skipped.
    """
    print("Getting file information from database...")
    q = DBConn().session().query(PoolFile)

    print("Checking file checksums & sizes...")
    for f in q:
        filename = f.fullpath

        try:
            # Pool files are binary data, so open them as such.
            fi = open(filename, "rb")
        except OSError:
            # Only I/O failures are "can't open"; KeyboardInterrupt and
            # friends must still be able to abort the run.
            utils.warn("can't open '%s'." % (filename))
            continue

        # The context manager closes the handle even if hashing raises.
        with fi:
            # Stat the open handle so size and digests describe the same file.
            size = os.fstat(fi.fileno()).st_size
            if size != f.filesize:
                utils.warn(
                    "**WARNING** size mismatch for '%s' ('%s' [current] vs. '%s' [db])."
                    % (filename, size, f.filesize)
                )

            digest_checks = (
                ("md5sum", apt_pkg.md5sum, f.md5sum),
                ("sha1sum", apt_pkg.sha1sum, f.sha1sum),  # type: ignore[attr-defined]
                ("sha256sum", apt_pkg.sha256sum, f.sha256sum),  # type: ignore[attr-defined]
            )
            for label, hasher, expected in digest_checks:
                # Each hash consumes the file, so rewind before every pass.
                fi.seek(0)
                actual = hasher(fi)
                if actual != expected:
                    utils.warn(
                        "**WARNING** %s mismatch for '%s' ('%s' [current] vs. '%s' [db])."
                        % (label, filename, actual, expected)
                    )

    print("Done.")
349################################################################################
350#
def Ent(Kind, Name, Link, Mode, UID, GID, Size, MTime: int, Major, Minor) -> None:
    """
    Record and report an archive member whose modification time lies in the
    future.

    Members with an MTime not later than the module-level ``current_time``
    are ignored.  Otherwise the MTime is stored in ``future_files`` under the
    module-level ``current_file`` and a line describing the member is printed.
    """
    assert current_file is not None

    if not MTime > current_time:
        return

    future_files[current_file] = MTime
    details = (
        current_file,
        Kind,
        Name,
        Link,
        Mode,
        UID,
        GID,
        Size,
        MTime,
        Major,
        Minor,
    )
    print("%s: %s '%s','%s',%u,%u,%u,%u,%u,%u,%u" % details)
def check_timestamps() -> None:
    """
    Check all files for timestamps in the future; common from hardware
    (e.g. alpha) which have far-future dates as their default dates.

    NOTE: this check is currently disabled.  The function returns
    immediately; the former implementation is only kept as commented-out
    code after the return statement.
    """
    return
384 # global current_file
385 #
386 # q = DBConn().session().query(PoolFile).filter(PoolFile.filename.like(".deb$"))
387 #
388 # db_files.clear()
389 # count = 0
390 #
391 # for pf in q.all():
392 # filename = os.path.abspath(os.path.join(pf.location.path, pf.filename))
393 # if os.access(filename, os.R_OK):
394 # with open(filename) as f:
395 # current_file = filename
396 # print("Processing %s." % (filename), file=sys.stderr)
397 # apt_inst.debExtract(f, Ent, "control.tar.gz")
398 # f.seek(0)
399 # apt_inst.debExtract(f, Ent, "data.tar.gz")
400 # count += 1
401 #
402 # print("Checked %d files (out of %d)." % (count, len(db_files)))
405################################################################################
def check_files_in_dsc() -> None:
    """
    Ensure each .dsc lists appropriate files in its Files field (according
    to the format announced in its Format field).

    NOTE: this check is currently disabled.  The function returns
    immediately; the former implementation is only kept as commented-out
    code after the return statement.
    """
    return
415 # count = 0
416 #
417 # print("Building list of database files...")
418 # q = DBConn().session().query(PoolFile).filter(PoolFile.filename.like(".dsc$"))
419 #
420 # if q.count() > 0:
421 # print("Checking %d files..." % q.count())
422 # else:
423 # print("No files to check.")
424 #
425 # cnf = Config()
426 # for pf in q.all():
427 # filename = os.path.abspath(os.path.join(cnf["Dir::Root"], pf.filename))
428 #
429 # try:
430 # # NB: don't enforce .dsc syntax
431 # dsc = utils.parse_changes(filename, dsc_file=True)
432 # except:
433 # utils.fubar("error parsing .dsc file '%s'." % (filename))
434 #
435 # reasons = utils.check_dsc_files(filename, dsc)
436 # for r in reasons:
437 # utils.warn(r)
438 #
439 # if len(reasons) > 0:
440 # count += 1
441 #
442 # if count:
443 # utils.warn("Found %s invalid .dsc files." % (count))
446################################################################################
def validate_sources(suite: str, component: str) -> None:
    """
    Ensure files mentioned in Sources exist

    Reads the (possibly compressed) Sources index of the given suite and
    component below Dir::Root and checks every entry of each stanza's Files
    field.  Missing files are reported on stdout.  For directories containing
    "potato" the file is additionally looked up in the pool, and either the
    symlink that would be needed or an error is reported.
    """
    cnf = Config()
    index_path = "%s/dists/%s/%s/source/Sources" % (cnf["Dir::Root"], suite, component)
    index_path = utils.find_possibly_compressed_file(index_path)
    print("Processing %s..." % (index_path))

    with apt_pkg.TagFile(index_path) as Sources:
        while Sources.step():  # type: ignore[attr-defined]
            section: apt_pkg.TagSection = Sources.section  # type: ignore[attr-defined]
            source = section.find("Package")
            directory = section.find("Directory")
            files = section.find("Files")

            for entry in files.split("\n"):
                # Each entry is "<md5> <size> <name>"; only the name is used.
                _md5, _size, name = entry.split()
                target = "%s/%s/%s" % (cnf["Dir::Root"], directory, name)
                if os.path.exists(target):
                    continue

                if directory.find("potato") == -1:
                    print("W: %s missing." % (target))
                    continue

                pool_location = utils.poolify(source)
                pool_filename = "%s/%s/%s" % (
                    cnf["Dir::Pool"],
                    pool_location,
                    name,
                )
                if not os.path.exists(pool_filename):
                    print("E: %s missing (%s)." % (target, pool_filename))
                    continue

                # Work out the symlink; it is only reported here, not created.
                pool_filename = os.path.normpath(pool_filename)
                target = os.path.normpath(target)
                src = utils.clean_symlink(pool_filename, target, cnf["Dir::Root"])
                print("Symlinking: %s -> %s" % (target, src))
488########################################
def validate_packages(suite: str, component: str, architecture: str) -> None:
    """
    Ensure files mentioned in Packages exist

    Reads the (possibly compressed) Packages index of the given suite,
    component and architecture below Dir::Root and prints a warning for every
    stanza whose Filename does not exist below Dir::Root.
    """
    cnf = Config()
    index_path = utils.find_possibly_compressed_file(
        "%s/dists/%s/%s/binary-%s/Packages"
        % (cnf["Dir::Root"], suite, component, architecture)
    )
    print("Processing %s..." % (index_path))

    with apt_pkg.TagFile(index_path) as Packages:
        while Packages.step():  # type: ignore[attr-defined]
            section: apt_pkg.TagSection = Packages.section  # type: ignore[attr-defined]
            package_path = "%s/%s" % (cnf["Dir::Root"], section.find("Filename"))
            if os.path.exists(package_path):
                continue
            print("W: %s missing." % (package_path))
512########################################
def check_indices_files_exist() -> None:
    """
    Ensure files mentioned in Packages & Sources exist

    Walks stable, testing and unstable, and for every component validates the
    Sources index (architecture "source") and the Packages index of each
    other architecture of the suite.  The "all" architecture is skipped.
    """
    for suite in ["stable", "testing", "unstable"]:
        for component in get_component_names():
            arch_names = [
                entry.arch_string.lower() for entry in get_suite_architectures(suite)
            ]
            for arch in arch_names:
                if arch == "all":
                    continue
                if arch == "source":
                    validate_sources(suite, component)
                else:
                    validate_packages(suite, component, arch)
531################################################################################
def check_files_not_symlinks() -> None:
    """
    Check files in the database aren't symlinks

    NOTE: this check is currently disabled.  The function returns
    immediately; the former implementation is only kept as commented-out
    code after the return statement.
    """
    return
540 # print("Building list of database files... ", end=" ")
541 # q = DBConn().session().query(PoolFile).filter(PoolFile.filename.like(".dsc$"))
542 #
543 # for pf in q.all():
544 # filename = os.path.abspath(os.path.join(pf.location.path, pf.filename))
545 # if os.access(filename, os.R_OK) == 0:
546 # utils.warn("%s: doesn't exist." % (filename))
547 # else:
548 # if os.path.islink(filename):
549 # utils.warn("%s: is a symlink." % (filename))
552################################################################################
def chk_bd_process_dir(dirname: str, filenames: Iterable[str]) -> None:
    """
    Validate the build-dependency fields of every .dsc file in a directory.

    :param dirname: directory that contains the files
    :param filenames: names of the files inside ``dirname``; entries not
                      ending in ".dsc" are ignored

    Each .dsc is parsed and its Build-Depends and Build-Depends-Indep fields
    (when present and non-empty) are handed to apt_pkg.parse_src_depends().
    A field that fails to parse is reported on stdout as an "E:" line.
    """
    for name in filenames:
        if not name.endswith(".dsc"):
            continue
        filename = os.path.abspath(dirname + "/" + name)
        dsc = utils.parse_changes(filename, dsc_file=True)
        for field_name in ["build-depends", "build-depends-indep"]:
            field = dsc.get(field_name)
            if not field:
                continue
            try:
                apt_pkg.parse_src_depends(field)
            except Exception:
                # Report any parse failure and carry on (best effort), but
                # unlike a bare "except:" let KeyboardInterrupt/SystemExit
                # abort the walk.
                print("E: [%s] %s: %s" % (filename, field_name, field))
570################################################################################
def check_build_depends() -> None:
    """
    Validate build-dependencies of .dsc files in the archive

    Walks the whole tree below Dir::Root and passes every directory with its
    file names to chk_bd_process_dir().
    """
    archive_root = Config()["Dir::Root"]
    for directory, _subdirs, names in os.walk(archive_root):
        chk_bd_process_dir(directory, names)
580################################################################################
# Statement used by add_missing_source_checksums().
#
# For every source that has no source_metadata row with key :checksum_key, a
# row is inserted whose value starts with a newline followed by one
# " <checksum> <size> <basename>" line per file referenced through dsc_files
# (the file identified by source.file itself is excluded), ordered by
# basename.  :checksum_type selects which checksum column is used
# ('Files' -> md5sum, 'Checksums-Sha1' -> sha1sum,
# 'Checksums-Sha256' -> sha256sum).
_add_missing_source_checksums_query = R"""
INSERT INTO source_metadata
  (src_id, key_id, value)
SELECT
  s.id,
  :checksum_key,
  E'\n' ||
  (SELECT STRING_AGG(' ' || tmp.checksum || ' ' || tmp.size || ' ' || tmp.basename, E'\n' ORDER BY tmp.basename)
   FROM
     (SELECT
          CASE :checksum_type
            WHEN 'Files' THEN f.md5sum
            WHEN 'Checksums-Sha1' THEN f.sha1sum
            WHEN 'Checksums-Sha256' THEN f.sha256sum
          END AS checksum,
          f.size,
          SUBSTRING(f.filename FROM E'/([^/]*)\\Z') AS basename
      FROM files f JOIN dsc_files ON f.id = dsc_files.file
      WHERE dsc_files.source = s.id AND f.id != s.file
     ) AS tmp
  )

  FROM
    source s
  WHERE NOT EXISTS (SELECT 1 FROM source_metadata md WHERE md.src_id=s.id AND md.key_id = :checksum_key);
"""
def add_missing_source_checksums() -> None:
    """
    Add missing source checksums to source_metadata

    Runs the insert statement once for each of the Files, Checksums-Sha1 and
    Checksums-Sha256 metadata keys, prints how many rows were added for a key
    when there were any, and commits once all three have been processed.
    """
    session = DBConn().session()
    statement = sql.text(_add_missing_source_checksums_query)

    for field in ("Files", "Checksums-Sha1", "Checksums-Sha256"):
        key_id = get_or_set_metadatakey(field, session).key_id
        result = cast(
            CursorResult,
            session.execute(
                statement,
                {"checksum_key": key_id, "checksum_type": field},
            ),
        )
        inserted = result.rowcount
        if inserted > 0:
            print("Added {0} missing entries for {1}".format(inserted, field))

    session.commit()
628################################################################################
def main() -> None:
    """
    Command-line entry point of ``dak check-archive``.

    Parses the command line, prints the usage text when help is requested or
    when the number of arguments is not exactly one, initialises the database
    connection and runs the check selected by the (case-insensitive) MODE
    argument.  An unknown mode is reported and ends with the usage text and
    exit status 1.
    """
    cnf = Config()

    Arguments = [("h", "help", "Check-Archive::Options::Help")]
    # Make sure every option has a value before the command line is parsed.
    for option in ["help"]:
        key = "Check-Archive::Options::%s" % option
        if key not in cnf:
            cnf[key] = ""

    args = apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)  # type: ignore[attr-defined]

    Options = cnf.subtree("Check-Archive::Options")
    if Options["Help"]:
        usage()

    if len(args) != 1:
        if len(args) < 1:
            utils.warn("dak check-archive requires at least one argument")
        else:
            utils.warn("dak check-archive accepts only one argument")
        usage(1)
    mode = args[0].lower()

    # Initialize DB
    DBConn()

    checks = {
        "checksums": check_checksums,
        "files": check_files,
        "dsc-syntax": check_dscs,
        "missing-overrides": check_override,
        "source-in-one-dir": check_source_in_one_dir,
        "timestamps": check_timestamps,
        "files-in-dsc": check_files_in_dsc,
        "validate-indices": check_indices_files_exist,
        "files-not-symlinks": check_files_not_symlinks,
        "validate-builddeps": check_build_depends,
        "add-missing-source-checksums": add_missing_source_checksums,
    }
    selected = checks.get(mode)
    if selected is None:
        utils.warn("unknown mode '%s'" % (mode))
        usage(1)
    selected()
686################################################################################
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()