Package dak :: Module archive_dedup_pool
[hide private]
[frames | no frames]

Source Code for Module dak.archive_dedup_pool

  1  #! /usr/bin/env python3 
  2  # vim:set et ts=4 sw=4: 
  3   
  4  """De-duplicates files in the pool directory 
  5   
  6  @contact: Debian FTP Master <ftpmaster@debian.org> 
  7  @copyright: 2017 Bastian Blank <waldi@debian.org> 
  8  @license: GNU General Public License version 2 or later 
  9  """ 
 10  # This program is free software; you can redistribute it and/or modify 
 11  # it under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation; either version 2 of the License, or 
 13  # (at your option) any later version. 
 14   
 15  # This program is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with this program; if not, write to the Free Software 
 22  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 23   
 24  ################################################################################ 
 25   
 26  import errno 
 27  import os 
 28  import sys 
 29   
 30  import apt_pkg 
 31   
 32  from daklib import daklog 
 33  from daklib.config import Config 
 34  from daklib.dbconn import DBConn 
 35   
 36  Options = None 
 37  Logger = None 
 38   
 39  ################################################################################ 
 40  ################################################################################ 
 41  ################################################################################ 
 42   
 43   
def usage(exit_code=0):
    """Print the command-line help text and terminate the process.

    @type exit_code: int
    @param exit_code: status handed to C{sys.exit}; defaults to 0
    """
    # NOTE(review): -V/--version is listed here, but main() only registers
    # the --help option -- confirm whether it is handled elsewhere.
    help_text = """Usage: dak archive-dedup-pool [OPTION]...
  -h, --help                show this help and exit.
  -V, --version             display the version number and exit
"""
    print(help_text)
    sys.exit(exit_code)
52 53 54 ################################################################################ 55 56
def dedup_one(size, reference, *filenames):
    """Replace duplicate files with hard links to a reference file.

    Every path in C{filenames} that is not already the same file as
    C{reference} is replaced by a new hard link to C{reference}.  The link
    is first created under a temporary name (C{filename + ".new"}) and then
    renamed over the duplicate.

    @type size: int
    @param size: expected size of every file; compared against C{st_size}

    @type reference: str
    @param reference: path of the file the duplicates get linked to

    @type filenames: str
    @param filenames: paths of the duplicates to replace

    @raise RuntimeError: if the on-disk size of C{reference} or of a
        duplicate differs from C{size}
    @raise OSError: if a file cannot be stat'ed, linked or renamed, or if
        removing the temporary link fails for a reason other than it
        being absent
    """
    stat_reference = os.stat(reference)

    # safety net: do not touch anything if the reference itself is unexpected
    if stat_reference.st_size != size:
        raise RuntimeError(
            "Size of {} does not match database: {} != {}".format(
                reference, size, stat_reference.st_size
            )
        )

    for filename in filenames:
        stat_filename = os.stat(filename)

        # If the file is already a hard link to the reference, ignore it.
        # Only device and inode identify a file, so compare just those
        # instead of the complete stat result.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net
        if stat_filename.st_size != size:
            raise RuntimeError(
                "Size of {} does not match database: {} != {}".format(
                    filename, size, stat_filename.st_size
                )
            )

        temp_path = filename + ".new"
        os.link(reference, temp_path)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(temp_path, filename)
        finally:
            # After a successful rename the temporary name no longer exists,
            # so a missing file is the expected case; any other failure to
            # clean up is propagated.
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                pass
94 95 96 ################################################################################ 97 98
def dedup(session):
    """De-duplicate every group of identical files known to the database.

    Runs one query that groups file paths sharing the same size, sha256sum
    and archive, ordered by creation time, and keeps only groups holding
    more than one path.  Each group is passed to L{dedup_one}, so the first
    path of a group (the oldest) is the link reference.

    @param session: database session; its C{execute} method is called with
        the SQL text and the result is iterated
    """
    query = """
SELECT DISTINCT *
    FROM (
        SELECT
            f.size,
            array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
                -- we aggregate all files with the same size, sha256sum and archive
                PARTITION BY f.size, f.sha256sum, a.id
                -- the oldest should be first
                ORDER by f.created
                -- we always want to see all rows
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            )
            AS filenames
        FROM
            files AS f INNER JOIN
            files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
            component c ON fa.component_id = c.id INNER JOIN
            archive a ON fa.archive_id = a.id
    ) AS f
    -- we only care about entries with more than one filename
    WHERE array_length(filenames, 1) > 1
        """

    for row in session.execute(query):
        dedup_one(row["size"], *row["filenames"])
128 129 130 ################################################################################ 131 132
def main():
    """Parse the command line and run the pool de-duplication.

    Assigns the module-level C{Options} and C{Logger} globals.  When the
    help option is set, L{usage} is called, which exits the process.
    """
    global Options, Logger

    cnf = Config()

    Arguments = [("h", "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)

    # give every known option a value so the lookup below cannot fail
    for i in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % i
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    # Open the database session only after --help has been handled, so
    # that printing the usage text does not need a database connection.
    session = DBConn().session()

    Logger = daklog.Logger("archive-dedup-pool")
    try:
        dedup(session)
    finally:
        # close the log even when de-duplication aborts with an exception
        Logger.close()
################################################################################


# Run the command only when this file is executed as a script.
if __name__ == "__main__":
    main()