1
2
3
4 """ De-duplicates files in the pool directory
5
6 @contact: Debian FTP Master <ftpmaster@debian.org>
7 @copyright: 2017 Bastian Blank <waldi@debian.org>
8 @license: GNU General Public License version 2 or later
9 """
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 import apt_pkg
27 import errno
28 import os
29 import sys
30
31 from daklib.dbconn import DBConn
32 from daklib import daklog
33 from daklib.config import Config
34
# Option subtree for this command; stays None until main() has parsed the
# command line and assigned it.
Options = None
# Shared logger; stays None until main() creates it.  dedup_one() writes its
# "deduplicate" entries through this object.
Logger = None
37
38
39
40
41
42
def usage(exit_code=0):
    """Print the command-line help text and terminate the process.

    @type exit_code: int
    @param exit_code: status handed to sys.exit() after the help is printed
    """
    # Only -h/--help is listed: it is the only option main() registers with
    # the command-line parser, so advertising anything else would be wrong.
    print("""Usage: dak archive-dedup-pool [OPTION]...
-h, --help show this help and exit.
""")
    sys.exit(exit_code)
49
50
51
52
def dedup_one(size, reference, *filenames):
    """Replace duplicate files by hard links to a reference file.

    Every path in C{filenames} that is not already the same file as
    C{reference} is replaced by a hard link to C{reference}.

    @param size: file size in bytes that both the reference and every
        duplicate are required to have on disk
    @param reference: path of the file all duplicates get linked to
    @param filenames: paths of the files to replace by hard links

    @raise RuntimeError: if the on-disk size of C{reference} or of one of
        C{filenames} differs from C{size}
    @raise OSError: if a stat, link, rename or unlink call fails
    """
    stat_reference = os.stat(reference)

    # safety net: refuse to link against a reference of unexpected size
    if stat_reference.st_size != size:
        raise RuntimeError('Size of {} does not match database: {} != {}'.format(
            reference, size, stat_reference.st_size))

    for filename in filenames:
        stat_filename = os.stat(filename)

        # Already a hard link to the reference: nothing to do.  Only device
        # and inode are compared; comparing the whole stat results would also
        # compare fields such as link count and timestamps, which can change
        # between the two os.stat() calls.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net: never replace a file whose size is unexpected
        if stat_filename.st_size != size:
            raise RuntimeError('Size of {} does not match database: {} != {}'.format(
                filename, size, stat_filename.st_size))

        # Create the link under a temporary name and rename it over the
        # duplicate, so `filename` is replaced in a single rename step.
        temp_link = filename + '.new'
        os.link(reference, temp_link)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(temp_link, filename)
        finally:
            # After a successful rename the temporary name is gone; a missing
            # file is therefore expected here and not an error.
            try:
                os.unlink(temp_link)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
84
85
86
87
def dedup(session):
    """De-duplicate all files known to the database.

    Runs one query that collects, for every (size, sha256sum, archive)
    combination, the full pool paths of all matching files, ordered by
    creation time.  Each group holding more than one path is then passed to
    dedup_one().

    @param session: database session used to run the query
    """
    query = """
SELECT DISTINCT *
FROM (
SELECT
f.size,
array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
-- we aggregate all files with the same size, sha256sum and archive
PARTITION BY f.size, f.sha256sum, a.id
-- the oldest should be first
ORDER by f.created
-- we always want to see all rows
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
)
AS filenames
FROM
files AS f INNER JOIN
files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
component c ON fa.component_id = c.id INNER JOIN
archive a ON fa.archive_id = a.id
) AS f
-- we only care about entries with more than one filename
WHERE array_length(filenames, 1) > 1
"""
    duplicate_groups = session.execute(query)

    for row in duplicate_groups:
        # The paths are ordered oldest first, so the oldest one is handed
        # over as the first path argument.
        dedup_one(row['size'], *row['filenames'])
115
116
117
118
def main():
    """Entry point of the archive-dedup-pool command.

    Parses the command line, prints the usage text when help was requested,
    and otherwise runs the de-duplication with logging enabled.  Sets the
    module-level C{Options} and C{Logger}.
    """
    global Options, Logger

    cnf = Config()

    Arguments = [('h', "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)

    # Make sure every option key exists so the lookups below cannot fail.
    for i in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % i
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    # The session is opened only after --help has been handled (usage()
    # exits), so printing the help text does not open one.
    session = DBConn().session()

    Logger = daklog.Logger("archive-dedup-pool")
    try:
        dedup(session)
    finally:
        # Close the log even when de-duplication aborts with an exception.
        Logger.close()
144
145
146
147
# Run the command only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
150