Coverage for dak/archive_dedup_pool.py: 56%

53 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2026-01-04 16:18 +0000

1#! /usr/bin/env python3 

2# vim:set et ts=4 sw=4: 

3 

4"""De-duplicates files in the pool directory 

5 

6@contact: Debian FTP Master <ftpmaster@debian.org> 

7@copyright: 2017 Bastian Blank <waldi@debian.org> 

8@license: GNU General Public License version 2 or later 

9""" 

10# This program is free software; you can redistribute it and/or modify 

11# it under the terms of the GNU General Public License as published by 

12# the Free Software Foundation; either version 2 of the License, or 

13# (at your option) any later version. 

14 

15# This program is distributed in the hope that it will be useful, 

16# but WITHOUT ANY WARRANTY; without even the implied warranty of 

17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 

18# GNU General Public License for more details. 

19 

20# You should have received a copy of the GNU General Public License 

21# along with this program; if not, write to the Free Software 

22# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 

23 

24################################################################################ 

25 

26import errno 

27import os 

28import sys 

29from typing import TYPE_CHECKING 

30 

31import apt_pkg 

32from sqlalchemy import sql 

33 

34from daklib import daklog 

35from daklib.config import Config 

36from daklib.dbconn import DBConn 

37 

38if TYPE_CHECKING: 

39 from sqlalchemy.orm import Session 

40 

# Module-level globals. They are only declared (annotated) here; main()
# assigns both of them before dedup() is run.
Options: apt_pkg.Configuration
# Used by dedup_one() to record every file it replaces by a hard link.
Logger: daklog.Logger

43 

44################################################################################ 

45################################################################################ 

46################################################################################ 

47 

48 

def usage(exit_code=0):
    """Print the command-line help on stdout and terminate the process.

    @param exit_code: status passed to sys.exit(); defaults to 0
    """
    help_text = """Usage: dak archive-dedup-pool [OPTION]...
 -h, --help show this help and exit.
 -V, --version display the version number and exit
"""
    print(help_text)
    sys.exit(exit_code)

57 

58 

59################################################################################ 

60 

61 

def dedup_one(size: int, reference: str, *filenames: str) -> None:
    """Replace each file in *filenames* by a hard link to *reference*.

    Files that already share the reference's inode are left untouched.
    Every other file is replaced atomically: a hard link to the reference
    is created next to it as ``<filename>.new`` and then renamed over it.

    @param size: expected size in bytes, as passed in by the caller
    @param reference: path of the file all others get linked to
    @param filenames: paths of the files to replace by hard links

    @raise RuntimeError: if the reference or one of the files does not
        have the expected size
    @raise OSError: if stat, link, rename or unlink fails
    """
    stat_reference = os.stat(reference)

    # safety net
    if stat_reference.st_size != size:
        raise RuntimeError(
            "Size of {} does not match database: {} != {}".format(
                reference, size, stat_reference.st_size
            )
        )

    for filename in filenames:
        stat_filename = os.stat(filename)

        # If the file is already hard-linked to the reference, ignore it.
        # Only device and inode are compared: a full stat_result comparison
        # also covers st_nlink and the timestamps, which change on the
        # reference as soon as this loop adds the first link and would make
        # the snapshot taken above compare unequal to files sharing its inode.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net
        if stat_filename.st_size != size:
            raise RuntimeError(
                "Size of {} does not match database: {} != {}".format(
                    filename, size, stat_filename.st_size
                )
            )

        link_path = filename + ".new"
        os.link(reference, link_path)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(link_path, filename)
        finally:
            # After a successful rename the temporary name is gone (ENOENT);
            # if anything above failed it is still there and gets removed.
            try:
                os.unlink(link_path)
            except OSError as e:
                if e.errno != errno.ENOENT:
                    raise

99 

100 

101################################################################################ 

102 

103 

def dedup(session: "Session") -> None:
    """De-duplicate all pool files the database reports as identical.

    The query returns one row per group of files that share size,
    sha256sum and archive, with the file paths ordered by creation time.
    Each row is handed to dedup_one(), which uses the first path as the
    reference for the remaining ones.

    @param session: database session the query is executed on
    """
    query = sql.text(
        """
SELECT DISTINCT *
    FROM (
        SELECT
            f.size,
            array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
                -- we aggregate all files with the same size, sha256sum and archive
                PARTITION BY f.size, f.sha256sum, a.id
                -- the oldest should be first
                ORDER by f.created
                -- we always want to see all rows
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            )
            AS filenames
        FROM
            files AS f INNER JOIN
            files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
            component c ON fa.component_id = c.id INNER JOIN
            archive a ON fa.archive_id = a.id
    ) AS f
    -- we only care about entries with more than one filename
    WHERE array_length(filenames, 1) > 1
        """
    )
    duplicate_groups = session.execute(query).mappings()

    for row in duplicate_groups:
        dedup_one(row["size"], *row["filenames"])

135 

136 

137################################################################################ 

138 

139 

def main():
    """Entry point of ``dak archive-dedup-pool``.

    Parses the command line, prints the usage text and exits if help was
    requested, otherwise opens the log and de-duplicates the pool.
    Sets the module-level ``Options`` and ``Logger``.
    """
    global Options, Logger

    cnf = Config()
    session = DBConn().session()

    Arguments = [("h", "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)  # type: ignore[attr-defined]

    # Make sure every option has a value so the lookup below cannot fail.
    for i in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % i
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    Logger = daklog.Logger("archive-dedup-pool")

    # dedup() raises on size mismatches and on I/O errors; close the log
    # in every case so it is not left open when the run aborts.
    try:
        dedup(session)
    finally:
        Logger.close()

165 

166 

167################################################################################ 

168 

169 

# Run main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()