#! /usr/bin/env python3
# vim:set et ts=4 sw=4:

""" De-duplicates files in the pool directory

@contact: Debian FTP Master <ftpmaster@debian.org>
@copyright: 2017 Bastian Blank <waldi@debian.org>
@license: GNU General Public License version 2 or later
"""

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

################################################################################

25 

import apt_pkg
import errno
import os
import sys

from daklib.dbconn import DBConn
from daklib import daklog
from daklib.config import Config

34 

# Module-level state assigned by main(): the parsed command-line options
# subtree and the daklog logger used by dedup_one().
Options = None
Logger = None

37 

################################################################################
################################################################################
################################################################################

41 

42 

def usage(exit_code=0):
    """Print the command-line help text to stdout and terminate.

    @type exit_code: int
    @param exit_code: process exit status handed to sys.exit()

    @raise SystemExit: always, carrying exit_code
    """
    print("""Usage: dak archive-dedup-pool [OPTION]...
  -h, --help                show this help and exit.
  -V, --version             display the version number and exit
""")
    sys.exit(exit_code)

49 

################################################################################

51 

52 

def dedup_one(size, reference, *filenames):
    """Replace duplicate files with hard links to a reference file.

    Each duplicate is replaced by creating a hard link to the reference under
    a temporary name (``<filename>.new``) and renaming that link over the
    duplicate.

    @type size: int
    @param size: expected size in bytes of every file, as recorded in the
                 database

    @type reference: str
    @param reference: path of the file all duplicates are linked to

    @type filenames: str
    @param filenames: paths of the duplicates to replace

    @raise RuntimeError: if the on-disk size of the reference or of a
                         duplicate differs from size
    @raise OSError: propagated from the underlying stat/link/rename calls
    """
    stat_reference = os.stat(reference)

    # safety net
    if stat_reference.st_size != size:
        raise RuntimeError('Size of {} does not match database: {} != {}'.format(
            reference, size, stat_reference.st_size))

    for filename in filenames:
        stat_filename = os.stat(filename)

        # If the file is already a hard link to the reference, ignore it.
        # Compare identity (device + inode) only: stat_reference was taken
        # before the loop, and every link created below changes the
        # reference's link count and ctime, so whole-stat equality would
        # stop matching files that are in fact already linked.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net
        if stat_filename.st_size != size:
            raise RuntimeError('Size of {} does not match database: {} != {}'.format(
                filename, size, stat_filename.st_size))

        temp_link = filename + '.new'
        os.link(reference, temp_link)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(temp_link, filename)
        finally:
            # After a successful rename the temporary name is gone; only
            # a failure before/during the rename leaves it behind.
            try:
                os.unlink(temp_link)
            except FileNotFoundError:
                pass

84 

################################################################################

86 

87 

def dedup(session):
    """De-duplicate all groups of identical files found in the database.

    Runs one query that groups pool file paths by size, sha256sum and
    archive, ordered by creation time, and passes every group with more
    than one path to dedup_one().  The first path of a group (the oldest
    file) serves as the reference.

    @param session: database session providing execute()
    """
    results = session.execute("""
SELECT DISTINCT *
    FROM (
        SELECT
            f.size,
            array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
                -- we aggregate all files with the same size, sha256sum and archive
                PARTITION BY f.size, f.sha256sum, a.id
                -- the oldest should be first
                ORDER by f.created
                -- we always want to see all rows
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            )
            AS filenames
        FROM
            files AS f INNER JOIN
            files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
            component c ON fa.component_id = c.id INNER JOIN
            archive a ON fa.archive_id = a.id
    ) AS f
    -- we only care about entries with more than one filename
    WHERE array_length(filenames, 1) > 1
        """)

    for row in results:
        dedup_one(row['size'], *row['filenames'])

115 

################################################################################

117 

118 

def main():
    """Command entry point: parse options and de-duplicate the pool.

    Sets the module globals Options and Logger.  With -h/--help the usage
    text is printed and the program exits before any work is done.
    """
    global Options, Logger

    cnf = Config()
    session = DBConn().session()

    Arguments = [('h', "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)

    # Give every known option an (empty) default so it can be read below
    # even when it was not passed on the command line.
    for i in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % i
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    Logger = daklog.Logger("archive-dedup-pool")
    try:
        dedup(session)
    finally:
        # Close the log even if de-duplication aborts with an exception.
        Logger.close()

144 

################################################################################

146 

147 

# Run the command only when executed as a script, not when imported.
if __name__ == '__main__':
    main()