1#! /usr/bin/env python3 

2# vim:set et ts=4 sw=4: 

3 

4"""De-duplicates files in the pool directory 

5 

6@contact: Debian FTP Master <ftpmaster@debian.org> 

7@copyright: 2017 Bastian Blank <waldi@debian.org> 

8@license: GNU General Public License version 2 or later 

9""" 

10# This program is free software; you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation; either version 2 of the License, or 

13# (at your option) any later version. 

14 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19 

20# You should have received a copy of the GNU General Public License 

21# along with this program; if not, write to the Free Software 

22# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 

23 

24################################################################################ 

25 

26import errno 

27import os 

28import sys 

29 

30import apt_pkg 

31 

32from daklib import daklog 

33from daklib.config import Config 

34from daklib.dbconn import DBConn 

35 

# Module-level state, bound by main() before any work is done.

# Configuration subtree "Archive-Dedup-Pool::Options" (parsed command line).
Options = None

# daklog.Logger instance that dedup_one() writes its "deduplicate" entries to.
Logger = None

38 

39################################################################################ 

40################################################################################ 

41################################################################################ 

42 

43 

def usage(exit_code=0):
    """Print the command-line help and terminate the program.

    Only options that main() actually registers with the command-line
    parser are listed here.

    @type exit_code: int
    @param exit_code: process exit status handed to sys.exit()
    """
    print(
        """Usage: dak archive-dedup-pool [OPTION]...
  -h, --help                show this help and exit.
"""
    )
    sys.exit(exit_code)

52 

53 

54################################################################################ 

55 

56 

def dedup_one(size, reference, *filenames):
    """Replace every file in C{filenames} by a hard link to C{reference}.

    Files that already share device and inode with the reference are left
    untouched.

    @type size: int
    @param size: size in bytes recorded for these files; checked against
        the on-disk size of every file before it is touched

    @type reference: str
    @param reference: path of the file all other paths get linked to

    @type filenames: str
    @param filenames: paths to replace by hard links to C{reference}

    @raise RuntimeError: if the on-disk size of C{reference} or of one of
        C{filenames} differs from C{size}
    @raise OSError: if a stat, link, rename or unlink call fails
    """
    stat_reference = os.stat(reference)

    # safety net
    if stat_reference.st_size != size:
        raise RuntimeError(
            "Size of {} does not match database: {} != {}".format(
                reference, size, stat_reference.st_size
            )
        )

    for filename in filenames:
        stat_filename = os.stat(filename)

        # If the file is already hard-linked to the reference, ignore it.
        # Only device and inode are compared: stat_reference was taken
        # before the loop, and its link count and ctime go stale as soon as
        # the first os.link() below succeeds, so comparing the complete
        # stat results would miss already-linked files later in the list.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net
        if stat_filename.st_size != size:
            raise RuntimeError(
                "Size of {} does not match database: {} != {}".format(
                    filename, size, stat_filename.st_size
                )
            )

        # Create the link under a temporary name, then rename it over the
        # existing file so the old content is replaced in a single step.
        temp_name = filename + ".new"
        os.link(reference, temp_name)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(temp_name, filename)
        finally:
            # After a successful rename the temporary name no longer exists
            # (ENOENT, ignored); in every other case drop the leftover link.
            try:
                os.unlink(temp_name)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise

94 

95 

96################################################################################ 

97 

98 

99def dedup(session): 

100 results = session.execute( 

101 """ 

102SELECT DISTINCT * 

103 FROM ( 

104 SELECT 

105 f.size, 

106 array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER ( 

107 -- we aggregate all files with the same size, sha256sum and archive 

108 PARTITION BY f.size, f.sha256sum, a.id 

109 -- the oldest should be first 

110 ORDER by f.created 

111 -- we always want to see all rows 

112 ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING 

113 ) 

114 AS filenames 

115 FROM 

116 files AS f INNER JOIN 

117 files_archive_map AS fa ON f.id = fa.file_id INNER JOIN 

118 component c ON fa.component_id = c.id INNER JOIN 

119 archive a ON fa.archive_id = a.id 

120 ) AS f 

121 -- we only care about entries with more than one filename 

122 WHERE array_length(filenames, 1) > 1 

123 """ 

124 ) 

125 

126 for i in results: 126 ↛ 127line 126 didn't jump to line 127, because the loop on line 126 never started

127 dedup_one(i["size"], *i["filenames"]) 

128 

129 

130################################################################################ 

131 

132 

def main():
    """Parse the command line and run the pool de-duplication.

    Binds the module globals C{Options} and C{Logger}.  With C{-h} /
    C{--help} the usage text is printed and the program exits before any
    file is touched.
    """
    global Options, Logger

    cnf = Config()
    session = DBConn().session()

    arguments = [("h", "help", "Archive-Dedup-Pool::Options::Help")]
    apt_pkg.parse_commandline(cnf.Cnf, arguments, sys.argv)

    # Default every option that was not given to the empty string.
    for option in ("help",):
        key = "Archive-Dedup-Pool::Options::" + option
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")
    if Options["Help"]:
        usage()

    Logger = daklog.Logger("archive-dedup-pool")
    dedup(session)
    Logger.close()

158 

159 

160################################################################################ 

161 

162 

# Run the de-duplication only when executed as a script, not on import.
if __name__ == "__main__":
    main()