Add script to dump system files used by build

In order to move towards builds which only use resources in the Hafnium
repo, this patch adds a script which runs the build with strace and
dumps all files touched in the process. Files in the Hafnium directory
and in /tmp are automatically filtered out.

Bug: 132428451
Test: ./build/strace_open.sh opened_files.txt
Change-Id: I03a2df4eedf40c456b65920ec8bf98ad08e747c6
diff --git a/build/docker/Dockerfile b/build/docker/Dockerfile
index 93c8caa..0361e9c 100644
--- a/build/docker/Dockerfile
+++ b/build/docker/Dockerfile
@@ -40,4 +40,5 @@
 		python \
 		python-git                     `# for Linux checkpatch` \
 		python-ply                     `# for Linux checkpatch` \
+		strace                         `# for strace_open.sh` \
 	&& rm -rf /var/lib/apt/lists/*
diff --git a/build/parse_strace_open.py b/build/parse_strace_open.py
new file mode 100755
index 0000000..5dd878b
--- /dev/null
+++ b/build/parse_strace_open.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python
+# Copyright 2019 The Hafnium Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Script which parses the output of `strace` and dumps a list of files
+that were touched by the traced processes outside of whitelisted folders.
+It assumes that strace was invoked with the following arguments:
+    -e trace=%file,chdir,%process   record required syscalls
+    -qq                             silence 'exit code' records
+    -o <file>                       output format is different when writing
+                                    to a file from printing to the console
+"""
+
+import argparse
+import os
+import sys
+
+# Syscalls which spawn a new process; their result is the PID of the child.
+FORK_SYSCALLS = ["clone", "fork", "vfork"]
+
+# Syscalls which take the path of a file; the path is the first argument,
+# except for openat() where it is the second.
+OPEN_SYSCALLS = [
+    "access",
+    "creat",
+    "lstat",
+    "mkdir",
+    "open",
+    "openat",
+    "readlink",
+    "stat",
+]
+
+def get_unfinished(line):
+    """Return the text before the "<unfinished ...>" marker, or None if the
+       marker does not occur in `line`."""
+    head, marker, _ = line.partition("<unfinished ...>")
+    # An empty `marker` means partition() did not find the separator.
+    return head if marker else None
+
+def get_resumed(line):
+    """Return the text after the first " resumed>" marker, or None if the
+       marker does not occur in `line`."""
+    _, marker, tail = line.partition(" resumed>")
+    # An empty `marker` means partition() did not find the separator.
+    return tail if marker else None
+
+def merge_unfinished_lines(lines):
+    """Join strace records which were split by an interrupting syscall.
+
+    A record cut short by "<unfinished ...>" is glued to the text following
+    the "resumed>" marker of the same PID's next record; the merged record
+    keeps the position at which its first fragment appeared.
+    """
+    merged = []
+    # PID -> index in `merged` of that process's still-open record.
+    pending = {}
+
+    for record in lines:
+        pid = int(record.split()[0])
+        tail = get_resumed(record)
+        if tail is None:
+            # A fresh record; the process must not have one open already.
+            assert(pid not in pending)
+            head = get_unfinished(record)
+            if head is not None:
+                pending[pid] = len(merged)
+            merged.append(record if head is None else head)
+            continue
+
+        # A continuation; there must be an open record to append to.
+        assert(pid in pending)
+        head = get_unfinished(tail)
+        # Close the open record unless this fragment is cut short as well.
+        index = pending[pid] if head is not None else pending.pop(pid)
+        merged[index] += tail if head is None else head
+    return merged
+
+def abs_path(cwd, path):
+    """Resolve `path` against the working directory `cwd` and canonicalize it.
+
+    An absolute `path` is used as is. The result is normalized and has its
+    symbolic links resolved as seen by the machine running this script. An
+    empty `path` resolves to `cwd` instead of raising an IndexError.
+    """
+    # os.path.join() discards `cwd` when `path` is already absolute, and
+    # os.path.realpath() makes the result absolute and normalized on its own.
+    return os.path.realpath(os.path.join(cwd, path))
+
+def get_touched_files(lines, orig_cwd):
+    """Parse strace output and return all files that an open()-like syscall was
+       called on (even if it failed), as a set of (path, executable) tuples.
+       `lines` are merged records of the form "<pid> <syscall info>";
+       `orig_cwd` is the working directory of the first traced process."""
+    files = set()
+
+    # Map from PID to the current working directory.
+    cwd = {}
+
+    # Map from PID to executable name
+    executable = {}
+
+    # Map from PID to the PID of the process which forked it.
+    fork_of = {}
+
+    for line in lines:
+        # Split line: <pid>  <syscall info>
+        line = line.split()
+        pid = int(line[0])
+        syscall = " ".join(line[1:])
+
+        # If seeing a PID for the first time, derive its working directory
+        # from its parent.
+        if pid not in cwd:
+            if not cwd:
+                # Very first line of strace output. Set working directory from
+                # command line arguments (should match cwd of strace).
+                cwd[pid] = orig_cwd
+            else:
+                # There should have been a fork/clone syscall which spawned this
+                # process. Inherit its working directory.
+                cwd[pid] = cwd[fork_of[pid]]
+
+        # We are looking for lines which match:
+        #   name(arg1, arg2, ..., argN) = result
+        # A failed call has "ERRNO (description)" after the result, so the
+        # arguments end at the last bracket directly in front of " = ".
+        left_bracket = syscall.find("(")
+        right_bracket = syscall.rfind(") = ")
+        if left_bracket < 0 or right_bracket < left_bracket:
+            continue
+
+        syscall_name = syscall[:left_bracket]
+        syscall_result = syscall[right_bracket + 4:]
+        # Only a successful call changes the state of the process.
+        succeeded = not syscall_result.startswith(("-", "?"))
+
+        syscall_args = syscall[left_bracket+1:right_bracket].split(",")
+        syscall_args = [arg.strip() for arg in syscall_args]
+
+        if syscall_name in FORK_SYSCALLS:
+            # If this is a fork, keep track of the parent-child relationship.
+            # The child's PID is the syscall's return code.
+            if succeeded:
+                new_pid = int(syscall_result)
+                fork_of[new_pid] = pid
+                executable[new_pid] = executable[pid]
+        elif syscall_name == "chdir":
+            # If this is a change of working directory, keep track of it.
+            # It is in the first argument in quotes.
+            if succeeded:
+                cwd[pid] = abs_path(cwd[pid], syscall_args[0][1:-1])
+        elif syscall_name == "execve":
+            # If this is executing a new program, record its name.
+            # It is in the first argument in quotes.
+            if succeeded:
+                executable[pid] = syscall_args[0][1:-1]
+        elif syscall_name in OPEN_SYSCALLS:
+            # If this is a syscall touching a file, record the path and the
+            # name of the program which touched it. The result is ignored,
+            # i.e. the path is recorded even if the syscall failed.
+            # openat() resolves its second argument against the folder in
+            # its first; only AT_FDCWD (the working directory) is supported.
+            arg_idx = 1 if syscall_name == "openat" else 0
+            assert(arg_idx == 0 or syscall_args[0] == "AT_FDCWD")
+            fname = abs_path(cwd[pid], syscall_args[arg_idx][1:-1])
+            files.add((fname, executable[pid]))
+    return files
+
+def filter_results(files, root_dir):
+    """Return the entries of `files` whose path is not whitelisted."""
+    # Anything in the Hafnium directory is allowed, and Clang puts its
+    # intermediate files in /tmp.
+    whitelisted = (root_dir + "/", "/tmp/")
+    # str.startswith() accepts a tuple and matches any of its prefixes.
+    return [entry for entry in files if not entry[0].startswith(whitelisted)]
+
+def main(args):
+    """Read strace output from stdin and print the sorted non-whitelisted
+       files as "<path> (<executable>)". `args` is argv, program name first."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("root_dir",
+                        help="Root directory of Hafnium, cwd of strace")
+    # Parse the arguments which were passed in; tolerate unknown extras.
+    options, _ = parser.parse_known_args(args[1:])
+    # Drop blank lines, they carry no PID and would break the parser.
+    stdin = [line.strip() for line in sys.stdin if line.strip()]
+    files = get_touched_files(merge_unfinished_lines(stdin), options.root_dir)
+    files = sorted(filter_results(files, options.root_dir))
+    print("\n".join("{} ({})".format(path, exe) for path, exe in files))
+
+if __name__ == "__main__":
+    main(sys.argv)
diff --git a/build/run_in_container.sh b/build/run_in_container.sh
index e3c6bd0..ae0850e 100755
--- a/build/run_in_container.sh
+++ b/build/run_in_container.sh
@@ -42,14 +42,30 @@
 	"${SCRIPT_DIR}/docker"
 IMAGE_ID="$(cat ${IID_FILE})"
 
-# Check if script was invoked with '-i' as first argument. If so, run
-# container in interactive mode.
+# Parse command line arguments
 INTERACTIVE=false
-if [ "${1:-}" == "-i" ]
-then
-	INTERACTIVE=true
-	shift
-fi
+ALLOW_PTRACE=false
+# Consume leading flags; stop at the first argument which is not a flag.
+while true; do
+	case "${1:-}" in
+	-i)
+		INTERACTIVE=true
+		shift
+		;;
+	-p)
+		ALLOW_PTRACE=true
+		shift
+		;;
+	-*)
+		echo "ERROR: Unknown command line flag: $1" 1>&2
+		echo "Usage: $0 [-i] [-p] <command>" 1>&2
+		exit 1
+		;;
+	*)
+		break
+		;;
+	esac
+done
 
 ARGS=()
 # Run with a pseduo-TTY for nicer logging.
@@ -59,6 +75,12 @@
 then
 	ARGS+=(-i)
 fi
+# Allow ptrace() syscall if invoked with '-p'.
+if [ "${ALLOW_PTRACE}" == "true" ]
+then
+	echo "WARNING: Docker seccomp profile is disabled!" 1>&2
+	ARGS+=(--cap-add=SYS_PTRACE --security-opt seccomp=unconfined)
+fi
 # Set environment variable informing the build that we are running inside
 # a container.
 ARGS+=(-e HAFNIUM_HERMETIC_BUILD=inside)
diff --git a/build/strace_open.sh b/build/strace_open.sh
new file mode 100755
index 0000000..e960daf
--- /dev/null
+++ b/build/strace_open.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+# Copyright 2019 The Hafnium Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -euxo pipefail
+
+SCRIPT_NAME="$(realpath "${BASH_SOURCE[0]}")"
+SCRIPT_DIR="$(realpath "$(dirname "${BASH_SOURCE[0]}")")"
+ROOT_DIR="$(realpath "${SCRIPT_DIR}/..")"
+
+# If a hermetic build is requested, re-run this script in the container
+if [ "${HAFNIUM_HERMETIC_BUILD:-}" == "true" ]
+then
+	exec "${ROOT_DIR}/build/run_in_container.sh" -p "${SCRIPT_NAME}" "$@"
+fi
+
+if [ $# != 1 ]
+then
+	echo "Usage: $0 <output_file>" 1>&2
+	exit 1
+fi
+
+MAKE="$(which make)"
+STRACE="$(which strace)"
+
+# Set up a temp directory and register a cleanup function on exit.
+TMP_DIR="$(mktemp -d)"
+function cleanup() {
+	rm -rf "${TMP_DIR}"
+}
+trap cleanup EXIT
+
+STRACE_LOG="${TMP_DIR}/strace.log"
+# 'make clobber' first, presumably so that the traced build starts from scratch.
+echo "Building with strace"
+pushd "${ROOT_DIR}"
+"${MAKE}" clobber
+"${STRACE}" \
+	-o "${STRACE_LOG}" \
+	-f -qq \
+	-e trace=%file,chdir,%process \
+	"${MAKE}"
+popd
+
+echo "Processing strace output"
+"${SCRIPT_DIR}/parse_strace_open.py" "${ROOT_DIR}" < "${STRACE_LOG}" > "$1"