Source code for dak.archive_dedup_pool

#! /usr/bin/env python3
# vim:set et ts=4 sw=4:

""" De-duplicates files in the pool directory

@contact: Debian FTP Master <ftpmaster@debian.org>
@copyright: 2017 Bastian Blank <waldi@debian.org>
@license: GNU General Public License version 2 or later
"""
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

################################################################################

import apt_pkg
import errno
import os
import sys

from daklib.dbconn import DBConn
from daklib import daklog
from daklib.config import Config

Options = None
Logger = None

################################################################################
################################################################################
################################################################################


[docs]def usage(exit_code=0): print("""Usage: dak archive-dedup-pool [OPTION]... -h, --help show this help and exit. -V, --version display the version number and exit """) sys.exit(exit_code)
################################################################################
[docs]def dedup_one(size, reference, *filenames): stat_reference = os.stat(reference) # safety net if stat_reference.st_size != size: raise RuntimeError('Size of {} does not match database: {} != {}'.format( reference, size, stat_reference.st_size)) for filename in filenames: stat_filename = os.stat(filename) # if file is already a hard-linked, ignore if stat_reference == stat_filename: continue # safety net if stat_filename.st_size != size: raise RuntimeError('Size of {} does not match database: {} != {}'.format( filename, size, stat_filename.st_size)) tempfile = filename + '.new' os.link(reference, tempfile) try: Logger.log(["deduplicate", filename, reference]) os.rename(tempfile, filename) finally: try: os.unlink(tempfile) except OSError as e: if e.errno != errno.ENOENT: raise
################################################################################
[docs]def dedup(session): results = session.execute(""" SELECT DISTINCT * FROM ( SELECT f.size, array_agg(a.path || '/pool/' || c.name || '/' || f.filename) OVER ( -- we aggregate all files with the same size, sha256sum and archive PARTITION BY f.size, f.sha256sum, a.id -- the oldest should be first ORDER by f.created -- we always want to see all rows ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) AS filenames FROM files AS f INNER JOIN files_archive_map AS fa ON f.id = fa.file_id INNER JOIN component c ON fa.component_id = c.id INNER JOIN archive a ON fa.archive_id = a.id ) AS f -- we only care about entries with more than one filename WHERE array_length(filenames, 1) > 1 """) for i in results: dedup_one(i['size'], *i['filenames'])
################################################################################
[docs]def main(): global Options, Logger cnf = Config() session = DBConn().session() Arguments = [('h', "help", "Archive-Dedup-Pool::Options::Help")] apt_pkg.parse_commandline(cnf.Cnf, Arguments, sys.argv) for i in ["help"]: key = "Archive-Dedup-Pool::Options::%s" % i if key not in cnf: cnf[key] = "" Options = cnf.subtree("Archive-Dedup-Pool::Options") if Options["Help"]: usage() Logger = daklog.Logger("archive-dedup-pool") dedup(session) Logger.close()
################################################################################ if __name__ == '__main__': main()