1#! /usr/bin/env python3
2# vim:set et ts=4 sw=4:
4""" De-duplicates files in the pool directory
6@contact: Debian FTP Master <ftpmaster@debian.org>
7@copyright: 2017 Bastian Blank <waldi@debian.org>
8@license: GNU General Public License version 2 or later
9"""
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
24################################################################################
26import apt_pkg
27import errno
28import os
29import sys
31from daklib.dbconn import DBConn
32from daklib import daklog
33from daklib.config import Config
# Option subtree of the configuration ("Archive-Dedup-Pool::Options");
# assigned by main() after the command line has been parsed.
Options = None
# daklog.Logger instance used by dedup_one() to record each de-duplication;
# assigned by main() before dedup() runs.
Logger = None
38################################################################################
39################################################################################
40################################################################################
def usage(exit_code=0):
    """Print the command-line help text and terminate the program.

    The text lists only -h/--help, which is the single option that main()
    registers with the command-line parser; options that are not
    registered there are not advertised.

    @type exit_code: int
    @param exit_code: status handed to sys.exit()

    @raise SystemExit: always, carrying exit_code
    """
    print("""Usage: dak archive-dedup-pool [OPTION]...
  -h, --help                show this help and exit.
""")
    sys.exit(exit_code)
50################################################################################
def dedup_one(size, reference, *filenames):
    """Replace duplicate files by hard links to a reference file.

    Every path in filenames that is not yet the same file as reference is
    replaced by a hard link to reference.  The link is first created under
    a temporary name (filename + '.new') and then renamed over the target.

    @type size: int
    @param size: expected size in bytes of all files, as recorded in the
                 database

    @type reference: str
    @param reference: path of the file all others get linked to

    @type filenames: str
    @param filenames: paths of the files to turn into hard links

    @raise RuntimeError: if the on-disk size of reference or of a file that
                         needs linking differs from size
    @raise OSError: if a file cannot be stat'ed, linked or renamed
    """
    stat_reference = os.stat(reference)

    # safety net
    if stat_reference.st_size != size:
        raise RuntimeError('Size of {} does not match database: {} != {}'.format(
            reference, size, stat_reference.st_size))

    for filename in filenames:
        stat_filename = os.stat(filename)

        # If the file is already hard-linked to the reference, ignore it.
        # Only device and inode are compared: the reference was stat'ed
        # once before the loop, and every link created below changes its
        # link count and ctime, so comparing the full stat results would
        # stop recognising already-linked files after the first link.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net
        if stat_filename.st_size != size:
            raise RuntimeError('Size of {} does not match database: {} != {}'.format(
                filename, size, stat_filename.st_size))

        tempfile = filename + '.new'
        os.link(reference, tempfile)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(tempfile, filename)
        finally:
            # Remove the temporary link if it is still around (for example
            # because logging or the rename failed); after a successful
            # rename it no longer exists, which is not an error.
            try:
                os.unlink(tempfile)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
85################################################################################
def dedup(session):
    """De-duplicate all pool files that share size, checksum and archive.

    The query groups the pool paths of all files having the same size,
    sha256sum and archive, ordered by creation time, and keeps only groups
    with more than one path.  Each group is handed to dedup_one(), so the
    first (oldest) path becomes the reference the others are linked to.

    @param session: database session providing execute()
    """
    query = """
SELECT DISTINCT *
    FROM (
        SELECT
            f.size,
            array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
                -- we aggregate all files with the same size, sha256sum and archive
                PARTITION BY f.size, f.sha256sum, a.id
                -- the oldest should be first
                ORDER by f.created
                -- we always want to see all rows
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            )
            AS filenames
        FROM
            files AS f INNER JOIN
            files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
            component c ON fa.component_id = c.id INNER JOIN
            archive a ON fa.archive_id = a.id
    ) AS f
    -- we only care about entries with more than one filename
    WHERE array_length(filenames, 1) > 1
    """

    for row in session.execute(query):
        dedup_one(row['size'], *row['filenames'])
116################################################################################
def main():
    """Parse the command line and run the pool de-duplication.

    Sets the module globals Options and Logger.  With -h/--help the usage
    text is printed and the program exits before anything is logged or
    de-duplicated.  The log is closed even when de-duplication fails.
    """
    global Options, Logger

    cnf = Config()
    session = DBConn().session()

    Arguments = [('h', "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)

    # Give every known option an empty value when it was not set.
    for i in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % i
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    Logger = daklog.Logger("archive-dedup-pool")

    # Close the log on every exit path, including an exception raised
    # while de-duplicating.
    try:
        dedup(session)
    finally:
        Logger.close()
145################################################################################
# Run the de-duplication only when executed as a script, not when imported.
if __name__ == '__main__':
    main()