1
2
3
4 """De-duplicates files in the pool directory
5
6 @contact: Debian FTP Master <ftpmaster@debian.org>
7 @copyright: 2017 Bastian Blank <waldi@debian.org>
8 @license: GNU General Public License version 2 or later
9 """
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26 import errno
27 import os
28 import sys
29
30 import apt_pkg
31
32 from daklib import daklog
33 from daklib.config import Config
34 from daklib.dbconn import DBConn
35
# Command-line options subtree; stays None until main() assigns it.
Options = None
# Log handle; main() replaces None with a daklog.Logger, which
# dedup_one() then uses to record each replaced file.
Logger = None
38
39
40
41
42
43
def usage(exit_code=0):
    """Print the command-line help text and terminate the process.

    @type exit_code: int
    @param exit_code: status passed to sys.exit() after the text is printed

    This function never returns; sys.exit() raises SystemExit.
    """
    print(
        """Usage: dak archive-dedup-pool [OPTION]...
-h, --help show this help and exit.
-V, --version display the version number and exit
"""
    )
    sys.exit(exit_code)
52
53
54
55
56
def dedup_one(size, reference, *filenames):
    """Replace each file in filenames with a hard link to reference.

    @type size: int
    @param size: size in bytes that every file involved is expected to have

    @type reference: str
    @param reference: path of the file that all the others get linked to

    @type filenames: str
    @param filenames: paths of the files to turn into links to reference

    @raise RuntimeError: if reference, or a file that still needs replacing,
        does not have the expected size
    @raise OSError: if stat, link, rename or unlink fails
    """
    stat_reference = os.stat(reference)

    # Safety net: never link anything to a reference of unexpected size.
    if stat_reference.st_size != size:
        raise RuntimeError(
            "Size of {} does not match database: {} != {}".format(
                reference, size, stat_reference.st_size
            )
        )

    for filename in filenames:
        stat_filename = os.stat(filename)

        # Already a hard link to the reference: nothing to do.  Only device
        # and inode are compared.  The reference was stat()ed before the loop
        # and every os.link() below changes its link count, so comparing the
        # complete stat results would miss files that are already linked.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # Safety net: only replace files of the expected size.
        if stat_filename.st_size != size:
            raise RuntimeError(
                "Size of {} does not match database: {} != {}".format(
                    filename, size, stat_filename.st_size
                )
            )

        # Create the link under a temporary name and rename it over the
        # duplicate, so the final path is replaced in a single step.
        temp_link = filename + ".new"
        os.link(reference, temp_link)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(temp_link, filename)
        finally:
            # After a successful rename the temporary name is gone, so ENOENT
            # is expected here; it only still exists if something failed.
            try:
                os.unlink(temp_link)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise
94
95
96
97
98
def dedup(session):
    """Find groups of identical pool files and de-duplicate each group.

    Runs one query that collects, per (size, sha256sum, archive), the paths
    of all matching files ordered by creation time, keeps only groups with
    more than one path, and passes each group to dedup_one().  The first
    path of a group (the oldest file) becomes the link reference.

    @param session: database session; only its execute() method is used
    """
    query = """
SELECT DISTINCT *
FROM (
SELECT
f.size,
array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
-- we aggregate all files with the same size, sha256sum and archive
PARTITION BY f.size, f.sha256sum, a.id
-- the oldest should be first
ORDER by f.created
-- we always want to see all rows
ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
)
AS filenames
FROM
files AS f INNER JOIN
files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
component c ON fa.component_id = c.id INNER JOIN
archive a ON fa.archive_id = a.id
) AS f
-- we only care about entries with more than one filename
WHERE array_length(filenames, 1) > 1
"""
    results = session.execute(query)

    for row in results:
        # filenames[0] is the reference, the rest are the duplicates.
        dedup_one(row["size"], *row["filenames"])
128
129
130
131
132
def main():
    """Entry point: parse the command line and de-duplicate the pool.

    Sets the module-level Options and Logger.  If the help option is set,
    usage() is called, which prints the help text and exits.
    """
    global Options, Logger

    cnf = Config()
    session = DBConn().session()

    arguments = [("h", "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, arguments, sys.argv)

    # Give every option a value so the lookups below work even when the
    # option was not set anywhere.
    for option in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % option
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    Logger = daklog.Logger("archive-dedup-pool")
    try:
        dedup(session)
    finally:
        # Close the log even when de-duplication aborts with an exception.
        Logger.close()
158
159
160
161
162
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
165