Make the image build reproducible

Change the mkimage script so that the produced image is
reproducible. This involves:

   - removing the ldconfig aux-cache as it changes on every build.
   - setting the mtimes of the files to a specific date so that the
     resulting tar file will have the same contents.
   - The `strings` guard around the unsafe-io tweak seems to be
     non-deterministic. It was sometimes not adding the tweak
     for the same file. Remove it as we don't care about older
     than jessie.
   - Importing the image by constructing a docker image with
     a specific timestamp and doing `docker load`.

Also change the buildall script to build each image twice and
confirm that the same tarball is produced, and that the layers
in the imported images match as a result.

Add a dockerdiff script that checks that two images are equivalent,
and tries to show the differences if not. This is useful when the
build script reports differences, as it can point to what the
differences are.
This commit is contained in:
James Westby
2017-02-02 10:49:46 +01:00
committed by James Westby
parent 6befeedf99
commit 44030c910b
4 changed files with 192 additions and 23 deletions
+76 -10
View File
@@ -1,7 +1,37 @@
#!/bin/bash
# Build a minideb image for each supported dist
#
# First we build the image as a tarball, then we import it and tag it.
#
# However we aim to allow our images to be reproduced. That means
# we need more control over the import process. We also build and import
# each image twice to confirm that our images are still reproducible.
#
# To reproduce an image you have to:
#
# - Produce exactly the same base tarball. `mkimage` will take care of that
# for the same package inputs.
# - Import the image with the same config (`CMD` etc.)
# - Have the same creation date on the image.
#
# That last requirement leads us to some extra work to re-use timestamps.
#
# The steps are:
#
# 1. Pull image from Dockerhub and save creation date and image_id
# 2. Build image locally and import it, setting creation date to the pulled one
# 3. Build the image again and import it, also setting creation date to the pulled one
# 4. Compare the built image ids. Error if they are not the same (Docker thinks images are different, thanks to checksum)
# 5. Compare built image id with pulled image id. Both will have same creation date but may differ in checksum so ids may be different
# - If the image is the same as the pulled one then nothing changed in this build
# - If the image differs from the pulled one then:
# - Re-import the locally built image with the current timestamp so it will be shown as a new image
# - Tag the built image with the target tag, ready to push.
set -e
set -u
set -o pipefail
DISTS="jessie
unstable
@@ -13,19 +43,55 @@ GCR_BASENAME=gcr.io/bitnami-containers/minideb
mkdir -p build
# Print all arguments, space-separated on one line, to stderr.
log() {
  >&2 echo "$@"
}
for DIST in $DISTS; do
[ -f debootstrap/$DIST ] || (echo "buildall: Unknown distribution: $DIST" && exit 1)
echo "============================================"
echo "Building $BASENAME:$DIST"
echo "============================================"
current_ts="$(date -u +%Y-%m-%dT%H:%M:%S.%NZ)"
if docker pull $BASENAME:$DIST > /dev/null; then
target_ts="$(docker inspect $BASENAME:$DIST | jq --raw-output ".[0].Created")"
pulled_image_id="$(docker inspect $BASENAME:$DIST | jq --raw-output ".[0].Id")"
else
target_ts="$current_ts"
pulled_image_id=
fi
log "============================================"
log "Building $BASENAME:$DIST"
log "============================================"
./mkimage build/$DIST.tar $DIST
IMPORTED=$(docker import --change "CMD /bin/bash" build/$DIST.tar)
echo "============================================"
echo "Running tests for $BASENAME:$DIST"
echo "============================================"
./test $IMPORTED
docker tag $IMPORTED $BASENAME:$DIST
docker tag $IMPORTED $GCR_BASENAME:$DIST
built_image_id=$(./import build/$DIST.tar "$target_ts")
log "============================================"
log "Running tests for $BASENAME:$DIST"
log "============================================"
./test $built_image_id
log "============================================"
log "Rebuilding $BASENAME:$DIST to test reproducibility"
log "============================================"
./mkimage build/${DIST}-repro.tar $DIST
repro_image_id=$(./import build/${DIST}-repro.tar "$target_ts")
if [ "$repro_image_id" != "$built_image_id" ]; then
log "$BASENAME:$DIST differs after a rebuild. Examine $built_image_id and $repro_image_id"
log "to find the differences and fix the build to be reproducible again."
log "Running \`./dockerdiff $built_image_id $repro_image_id\` might be useful."
exit 1
fi
rm build/${DIST}-repro.tar
if [ -n "$pulled_image_id" ]; then
if [ "$built_image_id" != "$pulled_image_id" ]; then
log "Image changed $built_image_id (new) != $pulled_image_id (old)"
# Re-import with the current timestamp so that the image shows
# as new
built_image_id="$(./import build/$DIST.tar "$current_ts")"
else
log "Image didn't change"
continue
fi
fi
docker tag $built_image_id $BASENAME:$DIST
docker tag $built_image_id $GCR_BASENAME:$DIST
log "Tagged $built_image_id as $BASENAME:$DIST $GCR_BASENAME:$DIST"
done
docker tag $BASENAME:$LATEST $BASENAME:latest
docker tag $GCR_BASENAME:$LATEST $GCR_BASENAME:latest
Executable
+46
View File
@@ -0,0 +1,46 @@
#!/bin/bash
# Compare two docker images, reporting what changed.
# The script will exit 1 if there are differences between the images
# other than their tags.
#
# It will also try and show what the differences are, comparing
# - the image config
# - the installed dpkg packages
# - changed file metadata
# - changed file checksums
set -e
set -u
set -o pipefail
IMAGE1=$1
IMAGE2=$2
# Print the `docker inspect` JSON of an image with the RepoTags and
# RepoDigests fields deleted, so that tagging alone does not show up as a
# difference.
# Arguments: $1 - image reference or id
# Outputs:   the filtered JSON object on stdout
inspect() {
  # Quote the reference so it always reaches docker as a single argument.
  docker inspect "$1" | jq ".[0]|del(.RepoTags,.RepoDigests)"
}
# List the packages installed in an image by running `dpkg -l` in a
# throwaway (`--rm`) container.
# Arguments: $1 - image reference or id
# Outputs:   the `dpkg -l` listing on stdout
dpkgl() {
  # Quote the reference so it always reaches docker as a single argument.
  docker run --rm "$1" dpkg -l
}
# Print an `ls -ld` line for every path found on the root filesystem of an
# image, in sorted order. The command runs inside a throwaway container;
# `-xdev` keeps find on the root filesystem and the /proc and /sys paths
# themselves are left out of the listing.
# Arguments: $1 - image reference or id
# Outputs:   the sorted `ls -ld` listing on stdout
lslr() {
  # Quote the reference so it always reaches docker as a single argument.
  docker run --rm "$1" bash -c 'find / -xdev -not -path /proc -a -not -path /sys -print0 | sort -z | xargs -0 ls -ld'
}
# Print the md5sum of every regular file found on the root filesystem of an
# image, in sorted path order. The command runs inside a throwaway
# container; /etc/hosts and /etc/hostname are left out of the listing.
# Arguments: $1 - image reference or id
# Outputs:   the sorted `md5sum` listing on stdout
md5() {
  # Quote the reference so it always reaches docker as a single argument.
  docker run --rm "$1" bash -c 'find / -xdev -not -path /etc/hosts -a -not -path /etc/hostname -a -type f -print0 | sort -z | xargs -0 md5sum'
}
# Run one of the reporting functions against both images and show a unified
# diff of the two outputs, labelled with the image names.
# Globals:   IMAGE1, IMAGE2 (read)
# Arguments: $1 - name of a function/command that takes an image as its
#                 only argument and prints a report on stdout
# Returns:   diff's exit status: 0 when the outputs are identical,
#            non-zero otherwise
_diff() {
  local cmd=$1
  # Everything is quoted so each image name and the command name are passed
  # through as exactly one word each.
  diff -u --label "$IMAGE1" --label "$IMAGE2" \
    <("$cmd" "$IMAGE1") <("$cmd" "$IMAGE2")
}
# Compare the image configs first. Only when they differ, also print the
# package, file-metadata and file-checksum diffs to help locate the change.
# Each of those is followed by `|| true` because _diff returns non-zero when
# it finds differences, which would otherwise stop the script (set -e)
# before the remaining reports are shown. Finally exit 1 to signal that the
# images are not equivalent.
if ! _diff inspect; then
_diff dpkgl || true
_diff lslr || true
_diff md5 || true
exit 1
fi
Executable
+42
View File
@@ -0,0 +1,42 @@
#!/bin/bash
# Import a tarball as a docker image, specifying the desired image
# creation date.
# This is useful as there's no other way to manipulate the creation
# date, and the date is part of the calculation of the image id.
# This means that the only way to reproduce an image is to specify
# the same timestamp.
set -e
set -u
set -o pipefail
CONF_TEMPLATE='{"architecture":"amd64","comment":"from Bitnami with love","config":{"Hostname":"","Domainname":"","User":"","AttachStdin":false,"AttachStdout":false,"AttachStderr":false,"Tty":false,"OpenStdin":false,"StdinOnce":false,"Env":null,"Cmd":["/bin/bash"],"Image":"","Volumes":null,"WorkingDir":"","Entrypoint":null,"OnBuild":null,"Labels":null},"container_config":{"Hostname":"","Domainname":"","User":"","AttachStdin":false,"AttachStdout":false,"AttachStderr":false,"Tty":false,"OpenStdin":false,"StdinOnce":false,"Env":null,"Cmd":null,"Image":"","Volumes":null,"WorkingDir":"","Entrypoint":null,"OnBuild":null,"Labels":null},"created":"%TIMESTAMP%","docker_version":"1.13.0","history":[{"created":"%TIMESTAMP%","comment":"from Bitnami with love"}],"os":"linux","rootfs":{"type":"layers","diff_ids":["sha256:%LAYERSUM%"]}}'
MANIFEST_TEMPLATE='[{"Config":"%CONF_SHA%.json","RepoTags":null,"Layers":["%LAYERSUM%/layer.tar"]}]'
SOURCE=${1:?Specify the tarball to import}
TIMESTAMP=${2:?Specify the timestamp to use}
# Wrap the source tarball in a minimal `docker load` archive (one layer, a
# config JSON and a manifest.json) and load it, so that the image is created
# with the requested creation timestamp.
#
# Globals:   SOURCE            (read) path of the tarball to import
#            TIMESTAMP         (read) value substituted for %TIMESTAMP%
#            CONF_TEMPLATE     (read) image config with %TIMESTAMP% and
#                                     %LAYERSUM% placeholders
#            MANIFEST_TEMPLATE (read) manifest with %CONF_SHA% and
#                                     %LAYERSUM% placeholders
# Outputs:   the id reported by `docker load` on stdout
# Exits:     1 if the source cannot be read, the temp dir cannot be created,
#            or docker reports an id other than sha256:<sha256 of config>.
#            In the last case the temp dir is kept for inspection.
import() {
  # Declarations are kept separate from the command substitutions below:
  # `local var="$(cmd)"` always returns the status of `local`, hiding
  # failures of cmd.
  local tdir layersum conf conf_sha manifest id

  tdir="$(mktemp -d)" || exit 1

  # The layer directory is named after the sha256 of the tarball, which is
  # also the layer's diff_id in the config.
  layersum="$(sha256sum < "$SOURCE" | awk '{print $1}')"
  if [ -z "$layersum" ]; then
    echo "Failed to checksum $SOURCE" >&2
    rm -r -- "$tdir"
    exit 1
  fi

  mkdir "$tdir/$layersum"
  cp -- "$SOURCE" "$tdir/$layersum/layer.tar"
  printf '%s' '1.0' > "$tdir/$layersum/VERSION"

  # The config file is named after its own sha256, which is the id we
  # expect docker to report after loading.
  conf="$(printf '%s' "$CONF_TEMPLATE" \
    | sed -e "s/%TIMESTAMP%/$TIMESTAMP/g" -e "s/%LAYERSUM%/$layersum/g")"
  conf_sha="$(printf '%s' "$conf" | sha256sum | awk '{print $1}')"
  printf '%s' "$conf" > "$tdir/${conf_sha}.json"

  manifest="$(printf '%s' "$MANIFEST_TEMPLATE" \
    | sed -e "s/%CONF_SHA%/$conf_sha/g" -e "s/%LAYERSUM%/$layersum/g")"
  printf '%s' "$manifest" > "$tdir/manifest.json"

  tar cf "$tdir/import.tar" -C "$tdir" manifest.json "${conf_sha}.json" "$layersum"

  # The id is taken from the fourth whitespace-separated field of the
  # `docker load` output.
  id="$(docker load -i "$tdir/import.tar" | awk '{print $4}')"
  if [ "$id" != "sha256:$conf_sha" ]; then
    echo "Failed to load $id correctly, expected id to be $conf_sha, source in $tdir" >&2
    exit 1
  fi

  rm -r -- "$tdir"
  echo "$id"
}
import
+28 -13
View File
@@ -1,6 +1,7 @@
#!/bin/bash
set -e
set -u
set -o pipefail
ROOT=$(cd $(dirname $0) && pwd)
@@ -40,10 +41,15 @@ fi
chroot "$rootfsDir" apt-get update
chroot "$rootfsDir" apt-get upgrade -y -o Dpkg::Options::="--force-confdef"
chroot "rootfsDir" dpkg -l | tee "$TARGET.manifest"
chroot "$rootfsDir" dpkg -l | tee "$TARGET.manifest"
echo "Applying docker-specific tweaks"
# These are copied from the docker contrib/mkimage/debootstrap script.
# Modifications:
# - remove `strings` check for applying the --force-unsafe-io tweak.
# This was sometimes wrongly detected as not applying, and we aren't
# interested in building versions that this guard would apply to,
# so simply apply the tweak unconditionally.
# get path to "chroot" in our current PATH
chrootPath="$(type -P chroot)"
@@ -82,18 +88,15 @@ chmod +x "$rootfsDir/usr/sbin/policy-rc.d"
# don't even have kernels installed
rm -f "$rootfsDir/etc/apt/apt.conf.d/01autoremove-kernels"
# Ubuntu 10.04 sucks... :)
if strings "$rootfsDir/usr/bin/dpkg" | grep -q unsafe-io; then
# force dpkg not to call sync() after package extraction (speeding up installs)
echo >&2 "+ echo force-unsafe-io > '$rootfsDir/etc/dpkg/dpkg.cfg.d/docker-apt-speedup'"
cat > "$rootfsDir/etc/dpkg/dpkg.cfg.d/docker-apt-speedup" <<-'EOF'
# For most Docker users, package installs happen during "docker build", which
# doesn't survive power loss and gets restarted clean afterwards anyhow, so
# this minor tweak gives us a nice speedup (much nicer on spinning disks,
# obviously).
force-unsafe-io
EOF
fi
# force dpkg not to call sync() after package extraction (speeding up installs)
echo >&2 "+ echo force-unsafe-io > '$rootfsDir/etc/dpkg/dpkg.cfg.d/docker-apt-speedup'"
cat > "$rootfsDir/etc/dpkg/dpkg.cfg.d/docker-apt-speedup" <<-'EOF'
# For most Docker users, package installs happen during "docker build", which
# doesn't survive power loss and gets restarted clean afterwards anyhow, so
# this minor tweak gives us a nice speedup (much nicer on spinning disks,
# obviously).
force-unsafe-io
EOF
if [ -d "$rootfsDir/etc/apt/apt.conf.d" ]; then
# _keep_ us lean by effectively running "apt-get clean" after every install
@@ -188,12 +191,24 @@ EOF
chmod 0755 "$rootfsDir/usr/sbin/install_packages"
# Capture the most recent date that a package in the image was changed.
# We don't care about the particular date, or which package it comes from,
# we just need a date that isn't very far in the past.
BUILD_DATE="$(find $rootfsDir/usr/share/doc -name changelog.Debian.gz -exec dpkg-parsechangelog -SDate -l'{}' \; | xargs -l -i date --date="{}" +%s | sort -n | tail -n 1)"
echo "Trimming down"
for DIR in $DIRS_TO_TRIM; do
rm -r "$rootfsDir/$DIR"/*
done
# Remove the aux-cache as it isn't reproducible. It doesn't seem to
# cause any problems to remove it.
rm "$rootfsDir/var/cache/ldconfig/aux-cache"
find "$rootfsDir/usr/share/doc" -mindepth 1 -not -name copyright -not -type d -delete
find "$rootfsDir/usr/share/doc" -mindepth 1 -type d -empty -delete
# Set the mtime on all files to be no older than $BUILD_DATE.
# This is required to have the same metadata on files so that the
# same tarball is produced. We assume that it is not important
# that any file have a newer mtime than this.
find "$rootfsDir" -depth -newermt "@$BUILD_DATE" -print0 | xargs -0r touch --no-dereference --date="@$BUILD_DATE"
echo "Total size"
du -skh "$rootfsDir"
echo "Package sizes"