Package dak :: Module archive_dedup_pool
[hide private]
[frames | no frames]

Source Code for Module dak.archive_dedup_pool

  1  #! /usr/bin/env python3 
  2  # vim:set et ts=4 sw=4: 
  3   
  4  """ De-duplicates files in the pool directory 
  5   
  6  @contact: Debian FTP Master <ftpmaster@debian.org> 
  7  @copyright: 2017 Bastian Blank <waldi@debian.org> 
  8  @license: GNU General Public License version 2 or later 
  9  """ 
 10  # This program is free software; you can redistribute it and/or modify 
 11  # it under the terms of the GNU General Public License as published by 
 12  # the Free Software Foundation; either version 2 of the License, or 
 13  # (at your option) any later version. 
 14   
 15  # This program is distributed in the hope that it will be useful, 
 16  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 17  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 18  # GNU General Public License for more details. 
 19   
 20  # You should have received a copy of the GNU General Public License 
 21  # along with this program; if not, write to the Free Software 
 22  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 23   
 24  ################################################################################ 
 25   
 26  import apt_pkg 
 27  import errno 
 28  import os 
 29  import sys 
 30   
 31  from daklib.dbconn import DBConn 
 32  from daklib import daklog 
 33  from daklib.config import Config 
 34   
 35  Options = None 
 36  Logger = None 
 37   
 38  ################################################################################ 
 39  ################################################################################ 
 40  ################################################################################ 
 41   
 42   
def usage(exit_code=0):
    """Print the command-line help text and terminate the process.

    @type exit_code: int
    @param exit_code: status handed to C{sys.exit}; defaults to 0
    """
    # Only options that main() actually registers with the command-line
    # parser are listed; advertising an unregistered option would send
    # users to a flag that is rejected.
    print("""Usage: dak archive-dedup-pool [OPTION]...
  -h, --help                show this help and exit.
""")
    sys.exit(exit_code)
49 50 ################################################################################ 51 52
def dedup_one(size, reference, *filenames):
    """Replace each of C{filenames} by a hard link to C{reference}.

    @type size: int
    @param size: expected size in bytes of every file involved

    @type reference: str
    @param reference: path of the file all others get linked to

    @type filenames: str
    @param filenames: paths of the files to replace by hard links

    @raise RuntimeError: if the on-disk size of C{reference} or of one of
        C{filenames} differs from C{size}
    @raise OSError: if a file cannot be stat'ed, linked, renamed or removed
    """
    stat_reference = os.stat(reference)

    # safety net
    if stat_reference.st_size != size:
        raise RuntimeError('Size of {} does not match database: {} != {}'.format(
            reference, size, stat_reference.st_size))

    for filename in filenames:
        stat_filename = os.stat(filename)

        # If the file is already a hard link to the reference, ignore it.
        # Compare device and inode only: comparing complete stat results
        # also compares timestamps and link count, which may change between
        # the two stat() calls.
        if os.path.samestat(stat_reference, stat_filename):
            continue

        # safety net
        if stat_filename.st_size != size:
            raise RuntimeError('Size of {} does not match database: {} != {}'.format(
                filename, size, stat_filename.st_size))

        # Create the link under a temporary name first, then rename it over
        # the target, so the target path is never missing.
        temp_path = filename + '.new'
        os.link(reference, temp_path)
        try:
            Logger.log(["deduplicate", filename, reference])
            os.rename(temp_path, filename)
        finally:
            # After a successful rename the temporary name is gone, which
            # is fine; any other failure to remove it is propagated.
            try:
                os.unlink(temp_path)
            except FileNotFoundError:
                pass
84 85 ################################################################################ 86 87
def dedup(session):
    """Find groups of identical pool files and hard-link them together.

    Runs one query that groups files by size, sha256sum and archive, and
    passes every group with more than one path to L{dedup_one}; the path
    with the oldest C{created} value comes first and so becomes the
    reference.

    @param session: database session providing C{execute}
    """
    query = """
        SELECT DISTINCT *
            FROM (
                SELECT
                    f.size,
                    array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER (
                        -- we aggregate all files with the same size, sha256sum and archive
                        PARTITION BY f.size, f.sha256sum, a.id
                        -- the oldest should be first
                        ORDER by f.created
                        -- we always want to see all rows
                        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
                    )
                    AS filenames
                FROM
                    files AS f INNER JOIN
                    files_archive_map AS fa ON f.id = fa.file_id INNER JOIN
                    component c ON fa.component_id = c.id INNER JOIN
                    archive a ON fa.archive_id = a.id
                ) AS f
            -- we only care about entries with more than one filename
            WHERE array_length(filenames, 1) > 1
    """

    for row in session.execute(query):
        # First array element is the reference, the rest are duplicates.
        dedup_one(row['size'], *row['filenames'])
115 116 ################################################################################ 117 118
def main():
    """Entry point of C{dak archive-dedup-pool}.

    Parses the command line, prints the help text and exits when help was
    requested, and otherwise runs the de-duplication with logging set up.
    """
    global Options, Logger

    cnf = Config()
    session = DBConn().session()

    Arguments = [('h', "help", "Archive-Dedup-Pool::Options::Help")]

    apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv)

    # Give every option a default so the lookups below never fail.
    for i in ["help"]:
        key = "Archive-Dedup-Pool::Options::%s" % i
        if key not in cnf:
            cnf[key] = ""

    Options = cnf.subtree("Archive-Dedup-Pool::Options")

    if Options["Help"]:
        usage()

    Logger = daklog.Logger("archive-dedup-pool")
    try:
        dedup(session)
    finally:
        # Close the log even when de-duplication aborts with an exception.
        Logger.close()
################################################################################


# Run the command only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()