Skip to content

Commit

Permalink
[Fuzzing] Use initial contents in ClusterFuzz (#7192)
Browse files Browse the repository at this point in the history
The ClusterFuzz bundler now looks through all of our test suites and
packages all testcases that are suitable for ClusterFuzz to use. This adds
more variety to the wasm files we fuzz there, as the test suite has
corner cases that the main fuzzer is unlikely to generate.

This adds a comment in the JS whenever it uses initial content, to
make debugging easier, something like

[10, 20, 30] /* using initial content 17.wasm */

(this is the reason for the change to extract_wasms.py)
  • Loading branch information
kripken authored Jan 7, 2025
1 parent f9d78d8 commit 8d0f662
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 4 deletions.
45 changes: 45 additions & 0 deletions scripts/bundle_clusterfuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
'''

import os
import subprocess
import sys
import tarfile

Expand All @@ -87,7 +88,9 @@
# Delete the argument, as importing |shared| scans it.
sys.argv.pop()

from test import fuzzing # noqa
from test import shared # noqa
from test import support # noqa

# Pick where to get the builds
if build_dir:
Expand All @@ -97,6 +100,14 @@
binaryen_bin = shared.options.binaryen_bin
binaryen_lib = shared.options.binaryen_lib

# ClusterFuzz's run.py uses these features. Keep this in sync with that, so that
# we only bundle initial content that makes sense for it.
features = [
'-all',
'--disable-shared-everything',
'--disable-fp16',
]

with tarfile.open(output_file, "w:gz") as tar:
# run.py
run = os.path.join(shared.options.binaryen_root, 'scripts', 'clusterfuzz', 'run.py')
Expand Down Expand Up @@ -128,6 +139,40 @@
print(f' ......... : {path}')
tar.add(path, arcname=f'lib/{name}')

# Add tests we will use as initial content under initial/. We put all the
# tests from the test suite there.
#
# Two scratch files are created in the current directory while doing so
# (temp.wasm for each candidate module, num.txt for the final count). They
# are only needed until tar.add() has copied them into the archive, so they
# are removed again in the finally block below, even if bundling fails.
print(' .. initial content: ')
temp_wasm = 'temp.wasm'
num_txt = 'num.txt'
index = 0
all_tests = shared.get_all_tests()
try:
    for i, test in enumerate(all_tests):
        if not fuzzing.is_fuzzable(test):
            continue
        for wast, _asserts in support.split_wast(test):
            if not wast:
                continue
            support.write_wast(temp_wasm, wast)
            # If the file is not valid for our features, skip it. In the
            # same operation, also convert to binary if this was text
            # (binary is more compact).
            cmd = shared.WASM_OPT + ['-q', temp_wasm, '-o', temp_wasm] + features
            if subprocess.run(cmd, stderr=subprocess.PIPE).returncode:
                continue

            # Looks good. The archive name uses a dense running index so
            # that the consumer can pick a random testcase by number.
            tar.add(temp_wasm, arcname=f'initial/{index}.wasm')
            index += 1
        print(f'\r {100 * i / len(all_tests):.2f}%', end='', flush=True)
    print(f' (num: {index})')

    # Write initial/num.txt which contains the number of testcases in that
    # directory (saves run.py from needing to listdir each time).
    with open(num_txt, 'w') as f:
        f.write(f'{index}')
    tar.add(num_txt, arcname='initial/num.txt')
finally:
    # Do not leave the scratch files behind in the working directory.
    for scratch in (temp_wasm, num_txt):
        if os.path.exists(scratch):
            os.remove(scratch)


print('Done.')
print('To run the tests on this bundle, do:')
print()
Expand Down
2 changes: 1 addition & 1 deletion scripts/clusterfuzz/extract_wasms.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ def repl(text):


# Replace the wasm files and write them out.
js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\);', repl, js)
js = re.sub(r'var \w+ = new Uint8Array\(\[([\d,]+)\]\)', repl, js)

# Write out the new JS.
with open(f'{out}.js', 'w') as f:
Expand Down
37 changes: 34 additions & 3 deletions scripts/clusterfuzz/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@
# testcase.
JS_SHELL_PATH = os.path.join(ROOT_DIR, 'scripts', 'fuzz_shell.js')

# The path to the directory with initial contents.
INITIAL_CONTENT_PATH = os.path.join(ROOT_DIR, 'initial')

# The file that contains the number of initial contents
INITIAL_CONTENT_NUM_PATH = os.path.join(ROOT_DIR, 'initial', 'num.txt')

# The arguments we provide to wasm-opt to generate wasm files.
FUZZER_ARGS = [
# Generate a wasm from random data.
Expand All @@ -76,7 +82,8 @@
'--fuzz-passes',
# Enable all features but disable ones not yet ready for fuzzing. This may
# be a smaller set than fuzz_opt.py, as that enables a few experimental
# flags, while here we just fuzz with d8's --wasm-staging.
# flags, while here we just fuzz with d8's --wasm-staging. This should be
# synchronized with bundle_clusterfuzz.
'-all',
'--disable-shared-everything',
'--disable-fp16',
Expand All @@ -92,6 +99,17 @@ def get_file_name(prefix, index):
# (We also use urandom below, which uses this under the hood.)
system_random = random.SystemRandom()

# How many initial content testcases were bundled for us in the "initial/"
# subdir. The count is read from the num.txt file there rather than by
# listing the directory.
with open(INITIAL_CONTENT_NUM_PATH) as f:
    count_text = f.read()
num_initial_contents = int(count_text)


def get_random_initial_content():
    """Return the path of a randomly chosen bundled initial content file.

    Files are named `<index>.wasm` under INITIAL_CONTENT_PATH, with indexes
    running from 0 up to (but not including) num_initial_contents. Each index
    is equally likely to be picked.
    """
    chosen = system_random.randrange(num_initial_contents)
    file_name = f'{chosen}.wasm'
    return os.path.join(INITIAL_CONTENT_PATH, file_name)


# In production ClusterFuzz we retry whenever we see a wasm-opt error. We are
# not looking for wasm-opt issues there, and just use it to generate testcases
# for VMs. For local testing, however, we may want to disable retrying, which
Expand All @@ -117,9 +135,19 @@ def get_wasm_contents(i, output_dir):
with open(input_data_file_path, 'wb') as file:
file.write(os.urandom(random_size))

# Generate wasm from the random data.
# Generate a command to use wasm-opt with the proper args to generate
# wasm content from the input data.
cmd = [FUZZER_BINARY_PATH] + FUZZER_ARGS
cmd += ['-o', wasm_file_path, input_data_file_path]

# Sometimes use a file from the initial content testcases.
if system_random.random() < 0.5:
initial_content = get_random_initial_content()
cmd += ['--initial-fuzz=' + initial_content]
else:
initial_content = None

# Generate wasm from the random data.
try:
subprocess.check_call(cmd)
except subprocess.CalledProcessError:
Expand Down Expand Up @@ -148,7 +176,10 @@ def get_wasm_contents(i, output_dir):

# Convert to a string, and wrap into a typed array.
wasm_contents = ','.join([str(c) for c in wasm_contents])
return f'new Uint8Array([{wasm_contents}])'
js = f'new Uint8Array([{wasm_contents}])'
if initial_content:
js = f'{js} /* using initial content {os.path.basename(initial_content)} */'
return js


# Returns the contents of a .js fuzz file, given the index of the testcase and
Expand Down
60 changes: 60 additions & 0 deletions test/unit/test_cluster_fuzz.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,6 +282,15 @@ def test_file_contents(self):
seen_calls = []
seen_second_builds = []
seen_JSPIs = []
seen_initial_contents = []

# Initial contents are noted in comments like this:
#
# /* using initial content 42.wasm */
#
# Note that we may see more than one in a file, as we may have more than
# one wasm in each testcase: each wasm has a chance.
initial_content_regex = re.compile(r'[/][*] using initial content ([^ ]+) [*][/]')

for i in range(1, N + 1):
fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')
Expand All @@ -302,6 +311,8 @@ def test_file_contents(self):
assert '/* async */' in js
assert '/* await */' in js

seen_initial_contents.append(re.findall(initial_content_regex, js))

# There is always one build and one call (those are in the default
# fuzz_shell.js), and we add a couple of operations, each with equal
# probability to be a build or a call, so over the 100 testcases here we
Expand Down Expand Up @@ -346,6 +357,55 @@ def test_file_contents(self):

print()

# Flatten the data to help some of the below, from
# [['a.wasm', 'b.wasm'], ['c.wasm']]
# into
# ['a.wasm', 'b.wasm', 'c.wasm']
flat_initial_contents = [item for items in seen_initial_contents for item in items]

# Initial content appears 50% of the time for each wasm file. Each
# testcase has 1.333 wasm files on average.
print('Initial contents are distributed as ~ mean 0.68')
print(f'mean initial contents: {len(flat_initial_contents) / N}')
# Initial contents should be mostly unique (we have many, many testcases
# and we pick just 100 or so). And we must see more than one unique one.
unique_initial_contents = set(flat_initial_contents)
print(f'unique initial contents: {len(unique_initial_contents)} should be almost equal to {len(flat_initial_contents)}')
self.assertGreater(len(unique_initial_contents), 1)
# Not all testcases have initial contents.
num_initial_contents = [len(items) for items in seen_initial_contents]
self.assertEqual(min(num_initial_contents), 0)
# Some do (this is redundant given that the set of unique initial
# contents was asserted on before, so this just confirms/checks that).
self.assertGreaterEqual(max(num_initial_contents), 1)

print()

# Execute the files in V8. Almost all should execute properly (some
# small number may trap during startup, say on a segment out of bounds).
if shared.V8:
valid_executions = 0
for i in range(1, N + 1):
fuzz_file = os.path.join(temp_dir.name, f'fuzz-binaryen-{i}.js')

cmd = [shared.V8, '--wasm-staging', fuzz_file]
proc = subprocess.run(cmd, stdout=subprocess.PIPE)

# An execution is valid if we exited without error, and if we
# managed to run some code before exiting (modules with no
# exports will be considered "invalid" here, but that is very
# rare, and in a sense they are actually not useful).
if proc.returncode == 0 and b'[fuzz-exec] calling ' in proc.stdout:
valid_executions += 1

print('Valid executions are distributed as ~ mean 0.99')
print(f'mean valid executions: {valid_executions / N}')
# Assert on having at least half execute properly. Given the true mean
# is ~0.99, for half of 100 to fail is incredibly unlikely.
self.assertGreater(valid_executions, N / 2)

print()

# "zzz" in test name so that this runs last. If it runs first, it can be
# confusing as it appears next to the logging of which bundle we use (see
# setUpClass).
Expand Down

0 comments on commit 8d0f662

Please sign in to comment.