1#! /usr/bin/env python3
2# vim:set et ts=4 sw=4:
4"""De-duplicates files in the pool directory
6@contact: Debian FTP Master <ftpmaster@debian.org>
7@copyright: 2017 Bastian Blank <waldi@debian.org>
8@license: GNU General Public License version 2 or later
9"""
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24################################################################################
26import errno
27import os
28import sys
30import apt_pkg
32from daklib import daklog
33from daklib.config import Config
34from daklib.dbconn import DBConn
# Module-level placeholders that main() fills in at start-up:
# Options receives the "Archive-Dedup-Pool::Options" configuration subtree,
# Logger receives the daklog.Logger that dedup_one() writes its entries to.
Options = None
Logger = None
39################################################################################
40################################################################################
41################################################################################
def usage(exit_code=0):
    """Print the command line help and terminate the process.

    @param exit_code: status passed on to sys.exit() (0 by default)

    NOTE(review): the text lists -V/--version, but main() in this file only
    registers -h/--help with the command line parser -- confirm whether that
    line should stay.
    """
    help_lines = (
        "Usage: dak archive-dedup-pool [OPTION]...",
        " -h, --help show this help and exit.",
        " -V, --version display the version number and exit",
        # trailing empty entry keeps the final newline of the help text
        "",
    )
    print("\n".join(help_lines))
    sys.exit(exit_code)
54################################################################################
def dedup_one(size, reference, *filenames):
    """Replace duplicate files by hard links to a reference file.

    Every entry of C{filenames} that is not already the same file as
    C{reference} is replaced by a hard link to C{reference}.  The link is
    first created as C{<filename>.new} and then renamed over C{filename}.

    @param size: expected size in bytes of all files involved
    @param reference: path of the file the others get linked to
    @param filenames: paths of the files to be replaced by hard links
    @raise RuntimeError: if the on-disk size of C{reference} or of one of
        C{filenames} differs from C{size}
    @raise OSError: if a file can not be stat'ed, linked or renamed
    """
    stat_reference = os.stat(reference)

    # safety net
    if stat_reference.st_size != size:
        raise RuntimeError(
            "Size of {} does not match database: {} != {}".format(
                reference, size, stat_reference.st_size
            )
        )

    for filename in filenames:
        stat_filename = os.stat(filename)

        # If the file is already hard-linked to the reference, ignore it.
        # Identity is decided by device and inode only; comparing the whole
        # stat results would also compare link count and timestamps.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net
        if stat_filename.st_size != size:
            raise RuntimeError(
                "Size of {} does not match database: {} != {}".format(
                    filename, size, stat_filename.st_size
                )
            )

        temp_path = filename + ".new"
        os.link(reference, temp_path)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(temp_path, filename)
        finally:
            # After a successful rename the temporary name no longer exists;
            # it is only left behind when logging or renaming failed.
            try:
                os.unlink(temp_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
96################################################################################
def dedup(session):
    """De-duplicate identical pool files, one archive at a time.

    Asks the database for every group of files that share size, sha256sum
    and archive and that contains more than one path.  Each group is handed
    to dedup_one(); the paths are ordered by creation time, so the first
    path of a group is used as the reference the others get linked to.

    @param session: database session; only its execute() method is used
    """
    query = """
SELECT DISTINCT *
    FROM (
        SELECT
            f.size,
            array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
                -- we aggregate all files with the same size, sha256sum and archive
                PARTITION BY f.size, f.sha256sum, a.id
                -- the oldest should be first
                ORDER by f.created
                -- we always want to see all rows
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            )
            AS filenames
        FROM
            files AS f INNER JOIN
            files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
            component c ON fa.component_id = c.id INNER JOIN
            archive a ON fa.archive_id = a.id
    ) AS f
    -- we only care about entries with more than one filename
    WHERE array_length(filenames, 1) > 1
    """

    for row in session.execute(query):
        dedup_one(row["size"], *row["filenames"])
130################################################################################
def main():
    """Entry point of C{dak archive-dedup-pool}.

    Parses the command line, prints the usage text and exits if help was
    requested, and otherwise runs the de-duplication while logging under
    the name "archive-dedup-pool".
    """
    global Options, Logger

    cnf = Config()

    Arguments = [("h", "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)

    # make sure every option key exists so the lookups below can not fail
    for i in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % i
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    # The database is only needed for the real work, so the session is
    # opened after the help request has been handled.
    session = DBConn().session()

    Logger = daklog.Logger("archive-dedup-pool")
    try:
        dedup(session)
    finally:
        # close the log even if de-duplication aborts with an exception
        Logger.close()
160################################################################################
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()