From 0e8056290ea2d67aeb3495c62db4af498a9c5182 Mon Sep 17 00:00:00 2001 From: David Korczynski Date: Wed, 16 Oct 2024 14:26:58 -0700 Subject: [PATCH 1/6] infra: add script to capture replayable commands Signed-off-by: David Korczynski --- infra/base-images/base-builder/Dockerfile | 1 + infra/base-images/base-builder/bash_parser.py | 96 +++++++++++++++++++ infra/base-images/base-builder/compile | 6 ++ 3 files changed, 103 insertions(+) create mode 100644 infra/base-images/base-builder/bash_parser.py diff --git a/infra/base-images/base-builder/Dockerfile b/infra/base-images/base-builder/Dockerfile index 4fa7a91000a2..6d79c621ea8c 100644 --- a/infra/base-images/base-builder/Dockerfile +++ b/infra/base-images/base-builder/Dockerfile @@ -165,6 +165,7 @@ COPY bazel_build_fuzz_tests \ install_rust.sh \ install_swift.sh \ python_coverage_helper.py \ + bash_parser.py \ srcmap \ write_labels.py \ /usr/local/bin/ diff --git a/infra/base-images/base-builder/bash_parser.py b/infra/base-images/base-builder/bash_parser.py new file mode 100644 index 000000000000..583e0d76716e --- /dev/null +++ b/infra/base-images/base-builder/bash_parser.py @@ -0,0 +1,96 @@ +import os +import sys +import bashlex + +from glob import glob + + +s1="""# build project +autoconf +autoheader +./configure +make -j$(nproc) libhts.a test/fuzz/hts_open_fuzzer.o + +# build fuzzers +$CXX $CXXFLAGS -o "$OUT/hts_open_fuzzer" test/fuzz/hts_open_fuzzer.o $LIB_FUZZING_ENGINE libhts.a -lz -lbz2 -llzma -lcurl -lcrypto -lpthread""" + +def find_all_bash_scripts_in_src(): + all_scripts = [] + all_scripts = [y for x in os.walk('/src/') for y in glob(os.path.join(x[0], '*.sh'))] + scripts_we_care_about = [] + to_ignore = {'aflplusplus', 'honggfuzz', '/fuzztest', '/centipede'} + for s in all_scripts: + if any([x for x in to_ignore if x in s]): + continue + scripts_we_care_about.append(s) + #for root, subFolder, files in os.walk('/src/'): + # for item in files: + # if item.endswith(".sh") : + # all_scripts.append(item) + 
print(scripts_we_care_about) + return scripts_we_care_about + + +def should_include_command(ast_tree): + if 'configure' in ast_tree.parts[0].word: + return False + if 'autoheader' in ast_tree.parts[0].word: + return False + if 'autoconf' in ast_tree.parts[0].word: + return False + if 'cmake' in ast_tree.parts[0].word: + return False + + if len(ast_tree.parts) > 1 and 'make' in ast_tree.parts[0].word and 'clean' in ast_tree.parts[1].word: + return False + return True + + +def is_local_redirection(ast_node, all_scripts): + #print("Checking") + if len(ast_node.parts) >= 2: + if ast_node.parts[0].word == '.': + suffixes_matching = [] + #print(ast_node.parts[1].word) + for bash_script in all_scripts: + #print("- %s"%(bash_script)) + if bash_script.endswith(ast_node.parts[1].word): + suffixes_matching.append(bash_script) + #print(suffixes_matching) + return suffixes_matching + return [] + +def parse_script(bash_script, all_scripts) -> str: + new_script = '' + with open(bash_script, 'r', encoding='utf-8') as f: + build_script = f.read() + #print(build_script) + parts = bashlex.parse(build_script) + for part in parts: + try: + if not should_include_command(part): + continue + except: + continue + + matches = is_local_redirection(part, all_scripts) + if len(matches) == 1: + new_script += parse_script(matches[0], all_scripts) + '\n' + continue + #print(part.dump()) + idx_start = part.pos[0] + idx_end = part.pos[1] + new_script += build_script[idx_start:idx_end] + new_script += '\n' + #print("[%s]"%(build_script[idx_start:idx_end])) + return new_script + + +if __name__ == "__main__": + all_scripts = find_all_bash_scripts_in_src() + replay_bash_script = parse_script(sys.argv[1], all_scripts) + + print("REPLAYABLE BASH SCRIPT") + print("#"*60) + print(replay_bash_script) + print("#"*60) \ No newline at end of file diff --git a/infra/base-images/base-builder/compile b/infra/base-images/base-builder/compile index f023ca2e76c1..51979faa851f 100755 --- 
a/infra/base-images/base-builder/compile +++ b/infra/base-images/base-builder/compile @@ -231,6 +231,10 @@ if [ "${OSS_FUZZ_ON_DEMAND}" != "0" ]; then exit 0 fi +#echo 'export SHELLOPTS' | cat - $SRC/build.sh > temp && mv temp $SRC/build.sh +python3 -m pip install bashlex +python3 /usr/local/bin/bash_parser.py $SRC/build.sh + BUILD_CMD="bash -eux $SRC/build.sh" # Set +u temporarily to continue even if GOPATH and OSSFUZZ_RUSTPATH are undefined. @@ -248,6 +252,7 @@ if [ "$FUZZING_LANGUAGE" = "rust" ]; then cp -r /rust/rustup/toolchains/$rustdef/lib/rustlib/src/rust/library/ /rustc/$rustch/ fi +#export SHELLOPTS if [ "${BUILD_UID-0}" -ne "0" ]; then adduser -u $BUILD_UID --disabled-password --gecos '' builder chown -R builder $SRC $OUT $WORK @@ -263,6 +268,7 @@ else $COPY_SOURCES_CMD 2>/dev/null || true fi fi +#unset SHELLOPTS if [ "$SANITIZER" = "introspector" ]; then unset CXXFLAGS From 4a4969df129e3b08104ce2017401f092c7a9dd88 Mon Sep 17 00:00:00 2001 From: David Korczynski Date: Wed, 16 Oct 2024 14:30:41 -0700 Subject: [PATCH 2/6] nits Signed-off-by: David Korczynski --- infra/base-images/base-builder/bash_parser.py | 23 ++++--------------- 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/infra/base-images/base-builder/bash_parser.py b/infra/base-images/base-builder/bash_parser.py index 583e0d76716e..afdf437d94a9 100644 --- a/infra/base-images/base-builder/bash_parser.py +++ b/infra/base-images/base-builder/bash_parser.py @@ -1,21 +1,9 @@ -import os import sys import bashlex from glob import glob - -s1="""# build project -autoconf -autoheader -./configure -make -j$(nproc) libhts.a test/fuzz/hts_open_fuzzer.o - -# build fuzzers -$CXX $CXXFLAGS -o "$OUT/hts_open_fuzzer" test/fuzz/hts_open_fuzzer.o $LIB_FUZZING_ENGINE libhts.a -lz -lbz2 -llzma -lcurl -lcrypto -lpthread""" - def find_all_bash_scripts_in_src(): - all_scripts = [] all_scripts = [y for x in os.walk('/src/') for y in glob(os.path.join(x[0], '*.sh'))] scripts_we_care_about = [] to_ignore = 
{'aflplusplus', 'honggfuzz', '/fuzztest', '/centipede'} @@ -23,10 +11,7 @@ def find_all_bash_scripts_in_src(): if any([x for x in to_ignore if x in s]): continue scripts_we_care_about.append(s) - #for root, subFolder, files in os.walk('/src/'): - # for item in files: - # if item.endswith(".sh") : - # all_scripts.append(item) + print(scripts_we_care_about) return scripts_we_care_about @@ -47,6 +32,8 @@ def should_include_command(ast_tree): def is_local_redirection(ast_node, all_scripts): + """Return the list of scripts corresponding to the command, in case + the command is an execution of a local script.""" #print("Checking") if len(ast_node.parts) >= 2: if ast_node.parts[0].word == '.': @@ -64,7 +51,6 @@ def parse_script(bash_script, all_scripts) -> str: new_script = '' with open(bash_script, 'r', encoding='utf-8') as f: build_script = f.read() - #print(build_script) parts = bashlex.parse(build_script) for part in parts: try: @@ -77,7 +63,8 @@ def parse_script(bash_script, all_scripts) -> str: if len(matches) == 1: new_script += parse_script(matches[0], all_scripts) + '\n' continue - #print(part.dump()) + + # Extract the command from the script string idx_start = part.pos[0] idx_end = part.pos[1] new_script += build_script[idx_start:idx_end] From 73c1ab296137589bf16d077ec6d4cb90c2ad3e46 Mon Sep 17 00:00:00 2001 From: David Korczynski Date: Wed, 16 Oct 2024 14:31:42 -0700 Subject: [PATCH 3/6] nit Signed-off-by: David Korczynski --- infra/base-images/base-builder/bash_parser.py | 1 + 1 file changed, 1 insertion(+) diff --git a/infra/base-images/base-builder/bash_parser.py b/infra/base-images/base-builder/bash_parser.py index afdf437d94a9..44af269148ba 100644 --- a/infra/base-images/base-builder/bash_parser.py +++ b/infra/base-images/base-builder/bash_parser.py @@ -1,3 +1,4 @@ +import os import sys import bashlex From 53263d4bbda68c886db14f51c49b3777b6c2ae86 Mon Sep 17 00:00:00 2001 From: David Korczynski Date: Wed, 16 Oct 2024 15:02:49 -0700 Subject: [PATCH 4/6] 
write script to out Signed-off-by: David Korczynski --- infra/base-images/base-builder/bash_parser.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/infra/base-images/base-builder/bash_parser.py b/infra/base-images/base-builder/bash_parser.py index 44af269148ba..b8e4c4b68bf0 100644 --- a/infra/base-images/base-builder/bash_parser.py +++ b/infra/base-images/base-builder/bash_parser.py @@ -35,17 +35,33 @@ def should_include_command(ast_tree): def is_local_redirection(ast_node, all_scripts): """Return the list of scripts corresponding to the command, in case the command is an execution of a local script.""" - #print("Checking") + # print("Checking") + + # Capture local script called with ./random/path/build.sh if len(ast_node.parts) >= 2: if ast_node.parts[0].word == '.': suffixes_matching = [] #print(ast_node.parts[1].word) for bash_script in all_scripts: #print("- %s"%(bash_script)) - if bash_script.endswith(ast_node.parts[1].word): + cmd_to_exec = ast_node.parts[1].word.replace('$SRC', 'src') + if bash_script.endswith(cmd_to_exec): suffixes_matching.append(bash_script) #print(suffixes_matching) return suffixes_matching + # Capture a local script called with $SRC/random/path/build.sh + if len(ast_node.parts) >= 1: + if '$SRC' in ast_node.parts[0].word: + suffixes_matching = [] + print(ast_node.parts[0].word) + for bash_script in all_scripts: + print("- %s"%(bash_script)) + cmd_to_exec = ast_node.parts[0].word.replace('$SRC', 'src') + if bash_script.endswith(cmd_to_exec): + suffixes_matching.append(bash_script) + print(suffixes_matching) + return suffixes_matching + return [] def parse_script(bash_script, all_scripts) -> str: @@ -81,4 +97,6 @@ def parse_script(bash_script, all_scripts) -> str: print("REPLAYABLE BASH SCRIPT") print("#"*60) print(replay_bash_script) - print("#"*60) \ No newline at end of file + print("#"*60) + with open('/out/replay-build-script.sh', 'w') as f: + f.write(replay_bash_script) \ No newline at end of 
file From d707cc733be4f28168832fa969b8b21304fe03e9 Mon Sep 17 00:00:00 2001 From: David Korczynski Date: Thu, 17 Oct 2024 16:03:28 -0700 Subject: [PATCH 5/6] handle individual nodes Signed-off-by: David Korczynski --- infra/base-images/base-builder/bash_parser.py | 274 ++++++++++++------ 1 file changed, 190 insertions(+), 84 deletions(-) diff --git a/infra/base-images/base-builder/bash_parser.py b/infra/base-images/base-builder/bash_parser.py index b8e4c4b68bf0..7d66ff5a38f7 100644 --- a/infra/base-images/base-builder/bash_parser.py +++ b/infra/base-images/base-builder/bash_parser.py @@ -4,99 +4,205 @@ from glob import glob + def find_all_bash_scripts_in_src(): - all_scripts = [y for x in os.walk('/src/') for y in glob(os.path.join(x[0], '*.sh'))] - scripts_we_care_about = [] - to_ignore = {'aflplusplus', 'honggfuzz', '/fuzztest', '/centipede'} - for s in all_scripts: - if any([x for x in to_ignore if x in s]): - continue - scripts_we_care_about.append(s) - - print(scripts_we_care_about) - return scripts_we_care_about - - -def should_include_command(ast_tree): - if 'configure' in ast_tree.parts[0].word: - return False - if 'autoheader' in ast_tree.parts[0].word: - return False - if 'autoconf' in ast_tree.parts[0].word: - return False - if 'cmake' in ast_tree.parts[0].word: - return False - - if len(ast_tree.parts) > 1 and 'make' in ast_tree.parts[0].word and 'clean' in ast_tree.parts[1].word: - return False + all_scripts = [ + y for x in os.walk('/src/') for y in glob(os.path.join(x[0], '*.sh')) + ] + scripts_we_care_about = [] + to_ignore = {'aflplusplus', 'honggfuzz', '/fuzztest', '/centipede'} + for s in all_scripts: + if any([x for x in to_ignore if x in s]): + continue + scripts_we_care_about.append(s) + + print(scripts_we_care_about) + return scripts_we_care_about + + +def should_discard_command(ast_tree) -> bool: + """Returns True if the command should be avoided, otherwise False""" + try: + first_word = ast_tree.parts[0].word + except: # pylint: 
disable=bare-except + return False + if 'configure' in first_word: + return True + if 'autoheader' in first_word: + return True + if 'autoconf' in first_word: + return True + if 'cmake' in first_word: + return True + if 'autogen.sh' in first_word: + return True + + try: + second_word = ast_tree.parts[1].word + except: # pylint: disable=bare-except + return False + + if 'make' in first_word and 'clean' in second_word: return True + return False + def is_local_redirection(ast_node, all_scripts): - """Return the list of scripts corresponding to the command, in case + """Return the list of scripts corresponding to the command, in case the command is an execution of a local script.""" - # print("Checking") - - # Capture local script called with ./random/path/build.sh - if len(ast_node.parts) >= 2: - if ast_node.parts[0].word == '.': - suffixes_matching = [] - #print(ast_node.parts[1].word) - for bash_script in all_scripts: - #print("- %s"%(bash_script)) - cmd_to_exec = ast_node.parts[1].word.replace('$SRC', 'src') - if bash_script.endswith(cmd_to_exec): - suffixes_matching.append(bash_script) - #print(suffixes_matching) - return suffixes_matching - # Capture a local script called with $SRC/random/path/build.sh - if len(ast_node.parts) >= 1: - if '$SRC' in ast_node.parts[0].word: - suffixes_matching = [] - print(ast_node.parts[0].word) - for bash_script in all_scripts: - print("- %s"%(bash_script)) - cmd_to_exec = ast_node.parts[0].word.replace('$SRC', 'src') - if bash_script.endswith(cmd_to_exec): - suffixes_matching.append(bash_script) - print(suffixes_matching) - return suffixes_matching - - return [] + # print("Checking") -def parse_script(bash_script, all_scripts) -> str: - new_script = '' - with open(bash_script, 'r', encoding='utf-8') as f: - build_script = f.read() - parts = bashlex.parse(build_script) - for part in parts: - try: - if not should_include_command(part): - continue - except: - continue - - matches = is_local_redirection(part, all_scripts) - if 
len(matches) == 1: - new_script += parse_script(matches[0], all_scripts) + '\n' - continue - - # Extract the command from the script string + # Capture local script called with ./random/path/build.sh + + if len(ast_node.parts) >= 2: + try: + ast_node.parts[0].word + except: + return [] + if ast_node.parts[0].word == '.': + suffixes_matching = [] + #print(ast_node.parts[1].word) + for bash_script in all_scripts: + #print("- %s"%(bash_script)) + cmd_to_exec = ast_node.parts[1].word.replace('$SRC', 'src') + if bash_script.endswith(cmd_to_exec): + suffixes_matching.append(bash_script) + #print(suffixes_matching) + return suffixes_matching + # Capture a local script called with $SRC/random/path/build.sh + if len(ast_node.parts) >= 1: + if '$SRC' in ast_node.parts[0].word: + suffixes_matching = [] + print(ast_node.parts[0].word) + for bash_script in all_scripts: + print("- %s" % (bash_script)) + cmd_to_exec = ast_node.parts[0].word.replace('$SRC', 'src') + if bash_script.endswith(cmd_to_exec): + suffixes_matching.append(bash_script) + print(suffixes_matching) + return suffixes_matching + + return [] + + +def handle_ast_command(ast_node, all_scripts_in_fs, raw_script): + """Generate bash script string for command node""" + new_script = '' + if should_discard_command(ast_node): + return '' + + matches = is_local_redirection(ast_node, all_scripts_in_fs) + if len(matches) == 1: + new_script += parse_script(matches[0], all_scripts_in_fs) + '\n' + return '' + + # Extract the command from the script string + idx_start = ast_node.pos[0] + idx_end = ast_node.pos[1] + new_script += raw_script[idx_start:idx_end] + #new_script += '\n' + + # If mkdir is used, then ensure that '-p' is provided, as + # otherwise we will run into failures. We don't have to worry + # about multiple uses of -p as `mkdir -p -p -p` is valid. 
+ new_script = new_script.replace('mkdir', 'mkdir -p') + return new_script + + +def handle_ast_list(ast_node, all_scripts_in_fs, raw_script): + new_script = '' + try_hard = 1 + + if not try_hard: + list_start = ast_node.pos[0] + list_end = ast_node.pos[1] + new_script += raw_script[list_start:list_end] # + '\n' + else: + # This is more refined logic. Ideally, this should work, but it's a bit + # more intricate to get right due to e.g. white-space between positions + # and more extensive parsing needed. We don't necessarily need this + # level of success rate for what we're trying to achieve, so am disabling + # this for now. + for part in ast_node.parts: + if part.kind == 'list': + new_script += handle_ast_list(part, all_scripts_in_fs, raw_script) + elif part.kind == 'command': + new_script += handle_ast_command(part, all_scripts_in_fs, raw_script) + else: idx_start = part.pos[0] idx_end = part.pos[1] - new_script += build_script[idx_start:idx_end] - new_script += '\n' - #print("[%s]"%(build_script[idx_start:idx_end])) - return new_script + new_script += raw_script[idx_start:idx_end] + new_script += ' ' + + # Make sure what was created is valid syntax, and otherwise return empty + try: + bashlex.parse(new_script) + except: # pylint: disable=bare-except + # Maybe return the original here instead of skipping? 
+ return '' + return new_script + + +def handle_ast_compound(ast_node, all_scripts_in_fs, raw_script): + new_script = '' + try_hard = 1 + list_start = ast_node.pos[0] + list_end = ast_node.pos[1] + new_script += raw_script[list_start:list_end] + '\n' + return new_script + + +def handle_node(ast_node, all_scripts_in_fs, build_script): + """Generates a bash script string for a given node""" + if ast_node.kind == 'command': + #if should_discard_command(part): + # continue + + #matches = is_local_redirection(part, all_scripts) + #if len(matches) == 1: + # new_script += parse_script(matches[0], all_scripts) + '\n' + # continue + + # Extract the command from the script string + #idx_start = part.pos[0] + #idx_end = part.pos[1] + #new_script += build_script[idx_start:idx_end] + #new_script += '\n' + #print("[%s]"%(build_script[idx_start:idx_end])) + return handle_ast_command(ast_node, all_scripts_in_fs, build_script) + elif ast_node.kind == 'list': + return handle_ast_list(ast_node, all_scripts_in_fs, build_script) + elif ast_node.kind == 'compound': + print('todo: handle compound') + return handle_ast_compound(ast_node, all_scripts_in_fs, build_script) + else: + raise Exception(f'Missing node handling: {ast_node.kind}') + + +def parse_script(bash_script, all_scripts) -> str: + """Top-level bash script parser""" + new_script = '' + with open(bash_script, 'r', encoding='utf-8') as f: + build_script = f.read() + parts = bashlex.parse(build_script) + for part in parts: + new_script += handle_node(part, all_scripts, build_script) + new_script += '\n' + print("-" * 45) + print(part.kind) + print(part.dump()) + + return new_script if __name__ == "__main__": - all_scripts = find_all_bash_scripts_in_src() - replay_bash_script = parse_script(sys.argv[1], all_scripts) - - print("REPLAYABLE BASH SCRIPT") - print("#"*60) - print(replay_bash_script) - print("#"*60) - with open('/out/replay-build-script.sh', 'w') as f: - f.write(replay_bash_script) \ No newline at end of file + 
all_scripts = find_all_bash_scripts_in_src() + replay_bash_script = parse_script(sys.argv[1], all_scripts) + + print("REPLAYABLE BASH SCRIPT") + print("#" * 60) + print(replay_bash_script) + print("#" * 60) + with open('/out/replay-build-script.sh', 'w') as f: + f.write(replay_bash_script) From f0e8c3fe2a10a49559830da81a8893a6beefa12b Mon Sep 17 00:00:00 2001 From: David Korczynski Date: Thu, 17 Oct 2024 16:03:47 -0700 Subject: [PATCH 6/6] remove unused code Signed-off-by: David Korczynski --- infra/base-images/base-builder/bash_parser.py | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/infra/base-images/base-builder/bash_parser.py b/infra/base-images/base-builder/bash_parser.py index 7d66ff5a38f7..dd2b43eb3314 100644 --- a/infra/base-images/base-builder/bash_parser.py +++ b/infra/base-images/base-builder/bash_parser.py @@ -156,20 +156,6 @@ def handle_ast_compound(ast_node, all_scripts_in_fs, raw_script): def handle_node(ast_node, all_scripts_in_fs, build_script): """Generates a bash script string for a given node""" if ast_node.kind == 'command': - #if should_discard_command(part): - # continue - - #matches = is_local_redirection(part, all_scripts) - #if len(matches) == 1: - # new_script += parse_script(matches[0], all_scripts) + '\n' - # continue - - # Extract the command from the script string - #idx_start = part.pos[0] - #idx_end = part.pos[1] - #new_script += build_script[idx_start:idx_end] - #new_script += '\n' - #print("[%s]"%(build_script[idx_start:idx_end])) return handle_ast_command(ast_node, all_scripts_in_fs, build_script) elif ast_node.kind == 'list': return handle_ast_list(ast_node, all_scripts_in_fs, build_script)