Coverage for dak/archive_dedup_pool.py: 56%
53 statements
« prev ^ index » next coverage.py v7.6.0, created at 2026-01-04 16:18 +0000
« prev ^ index » next coverage.py v7.6.0, created at 2026-01-04 16:18 +0000
1#! /usr/bin/env python3
2# vim:set et ts=4 sw=4:
4"""De-duplicates files in the pool directory
6@contact: Debian FTP Master <ftpmaster@debian.org>
7@copyright: 2017 Bastian Blank <waldi@debian.org>
8@license: GNU General Public License version 2 or later
9"""
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24################################################################################
26import errno
27import os
28import sys
29from typing import TYPE_CHECKING
31import apt_pkg
32from sqlalchemy import sql
34from daklib import daklog
35from daklib.config import Config
36from daklib.dbconn import DBConn
38if TYPE_CHECKING:
39 from sqlalchemy.orm import Session
# Parsed "Archive-Dedup-Pool::Options" configuration subtree; assigned in main().
Options: apt_pkg.Configuration
# Logger instance; created in main() and used by dedup_one() to record each
# de-duplicated file.
Logger: daklog.Logger
44################################################################################
45################################################################################
46################################################################################
def usage(exit_code=0):
    """Print the command-line help text and terminate the process.

    @param exit_code: status handed to C{sys.exit} after the text is printed
    """
    help_lines = (
        "Usage: dak archive-dedup-pool [OPTION]...",
        " -h, --help show this help and exit.",
        " -V, --version display the version number and exit",
        # empty last element keeps the help text's own trailing newline
        "",
    )
    print("\n".join(help_lines))
    sys.exit(exit_code)
59################################################################################
def dedup_one(size: int, reference: str, *filenames: str) -> None:
    """Replace duplicate files with hard links to ``reference``.

    @param size: file size in bytes as recorded in the database
    @param reference: path of the file every duplicate gets linked to
    @param filenames: paths of the files to replace by hard links
    @raise RuntimeError: if the on-disk size of ``reference`` or of one of
        ``filenames`` differs from ``size``
    @raise OSError: if a stat, link, rename or unlink call fails
    """
    stat_reference = os.stat(reference)

    # safety net
    if stat_reference.st_size != size:
        raise RuntimeError(
            "Size of {} does not match database: {} != {}".format(
                reference, size, stat_reference.st_size
            )
        )

    for filename in filenames:
        stat_filename = os.stat(filename)

        # If the file is already hard-linked to the reference, ignore it.
        # Only device and inode are compared: stat_reference was taken before
        # the loop, and linking an earlier file changes the reference's link
        # count and ctime, so comparing the complete stat results would miss
        # files that already share the inode.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net
        if stat_filename.st_size != size:
            raise RuntimeError(
                "Size of {} does not match database: {} != {}".format(
                    filename, size, stat_filename.st_size
                )
            )

        # Create the link under a temporary name first and rename it over the
        # duplicate, so ``filename`` is never missing in between.
        link_path = filename + ".new"
        os.link(reference, link_path)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(link_path, filename)
        finally:
            # Remove the temporary name if it is still there; it being gone
            # already (ENOENT) is the expected case and not an error.
            try:
                os.unlink(link_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
101################################################################################
def dedup(session: "Session") -> None:
    """Look up duplicated pool files in the database and hard-link them.

    The query collects, per combination of size, sha256sum and archive, the
    pool paths of all matching files ordered by creation time, and keeps only
    the groups that contain more than one path.  Each group is handed to
    dedup_one() with its first path as the reference.

    @param session: database session the query is executed on
    """
    query = sql.text(
        """
SELECT DISTINCT *
 FROM (
 SELECT
 f.size,
 array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
 -- we aggregate all files with the same size, sha256sum and archive
 PARTITION BY f.size, f.sha256sum, a.id
 -- the oldest should be first
 ORDER by f.created
 -- we always want to see all rows
 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
 )
 AS filenames
 FROM
 files AS f INNER JOIN
 files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
 component c ON fa.component_id = c.id INNER JOIN
 archive a ON fa.archive_id = a.id
 ) AS f
 -- we only care about entries with more than one filename
 WHERE array_length(filenames, 1) > 1
 """
    )
    duplicate_groups = session.execute(query).mappings()

    for group in duplicate_groups:
        # first array element is the reference, the rest are its duplicates
        dedup_one(group["size"], *group["filenames"])
137################################################################################
def main():
    """Entry point of ``dak archive-dedup-pool``.

    Parses the command line, prints the usage text and exits when help was
    requested, and otherwise runs dedup() on a database session.  The result
    of the option parsing and the logger are stored in the module-level
    ``Options`` and ``Logger``.
    """
    global Options, Logger

    cnf = Config()
    session = DBConn().session()

    Arguments = [("h", "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)  # type: ignore[attr-defined]

    # make sure every option has a value, so the lookup below cannot fail
    for i in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % i
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    Logger = daklog.Logger("archive-dedup-pool")

    try:
        dedup(session)
    finally:
        # close the log even when dedup() aborts, e.g. on a size mismatch
        Logger.close()
167################################################################################
# Run the de-duplication only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()