Skip to content

Commit

Permalink
Inject profiling for function calls to 'halide_copy_to_host' and 'hal…
Browse files Browse the repository at this point in the history
…ide_copy_to_device'. (halide#7913)

* Inject profiling for function calls to 'halide_copy_to_host' and 'halide_copy_to_device'.

* WIP: I get segfaults. The device_interface pointer is bogus.

* Figured it out...

* Allow global sync on d3d12.

* Cleanly time all buffer copies as well.

* Cleanup old comment.

* Following Andrews suggestion for suffixing buffer copies in the profiler.

* Sort the profiler report lines into three sections: funcs, buffer copy to device, and buffer copy to host.

* Inject profiling for function calls to 'halide_copy_to_host' and 'halide_copy_to_device'.

* WIP: I get segfaults. The device_interface pointer is bogus.

* Figured it out...

* Allow global sync on d3d12.

* Cleanly time all buffer copies as well.

* Cleanup old comment.

* Following Andrews suggestion for suffixing buffer copies in the profiler.

* Sort the profiler report lines into three sections: funcs, buffer copy to device, and buffer copy to host.

* Attempt to fix output parsing.

* Fix crash for copy_to_device

* halide_device_sync_global(NULL) -> success

* Fixed the buffer copy bug. Added a new test that will cause buffer copies in two directions within the compiled pipeline. This will catch this better in the future. Tweaked the profile report section header printing.

* Clang-format, my dear friend...
  • Loading branch information
mcourteaux authored and ardier committed Mar 3, 2024
1 parent 0e8c3bd commit 0b813fa
Show file tree
Hide file tree
Showing 12 changed files with 306 additions and 32 deletions.
1 change: 1 addition & 0 deletions src/CodeGen_Internal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ bool function_takes_user_context(const std::string &name) {
"halide_device_malloc",
"halide_device_and_host_malloc",
"halide_device_sync",
"halide_device_sync_global",
"halide_do_par_for",
"halide_do_loop_task",
"halide_do_task",
Expand Down
8 changes: 7 additions & 1 deletion src/OffloadGPULoops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,13 @@ class InjectGpuOffload : public IRMutator {
Call::make(Handle(), Call::make_struct, args, Call::Intrinsic),
Call::make(Handle(), Call::make_struct, arg_is_buffer, Call::Intrinsic),
};
return call_extern_and_assert("halide_" + api_unique_name + "_run", run_args);
Stmt run_and_assert = call_extern_and_assert("halide_" + api_unique_name + "_run", run_args);
if (target.has_feature(Target::Profile) || target.has_feature(Target::ProfileByTimer)) {
Expr device_interface = make_device_interface_call(loop->device_api, MemoryType::Auto);
Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface});
return Block::make(run_and_assert, sync_and_assert);
}
return run_and_assert;
}

public:
Expand Down
72 changes: 72 additions & 0 deletions src/Profiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "ExprUsesVar.h"
#include "IRMutator.h"
#include "IROperator.h"
#include "InjectHostDevBufferCopies.h"
#include "Profiling.h"
#include "Scope.h"
#include "Simplify.h"
Expand Down Expand Up @@ -422,6 +423,77 @@ class InjectProfiling : public IRMutator {
}
return IfThenElse::make(std::move(condition), std::move(then_case), std::move(else_case));
}

Stmt visit(const LetStmt *op) override {
if (const Call *call = op->value.as<Call>()) {
Stmt start_profiler;
if (call->name == "halide_copy_to_host" || call->name == "halide_copy_to_device") {
std::string buffer_name;
if (const Variable *var = call->args.front().as<Variable>()) {
buffer_name = var->name;
if (ends_with(buffer_name, ".buffer")) {
buffer_name = buffer_name.substr(0, buffer_name.size() - 7);
} else {
internal_error << "Expected to find a variable ending in .buffer as first argument to function call " << call->name << "\n";
}
} else {
internal_error << "Expected to find a variable as first argument of the function call " << call->name << ".\n";
}
bool requires_sync = false;
if (call->name == "halide_copy_to_host") {
int copy_to_host_id = get_func_id(buffer_name + " (copy to host)");
start_profiler = set_current_func(copy_to_host_id);
requires_sync = false;
} else if (call->name == "halide_copy_to_device") {
int copy_to_device_id = get_func_id(buffer_name + " (copy to device)");
start_profiler = set_current_func(copy_to_device_id);
requires_sync = true;
} else {
internal_error << "Unexpected function name.\n";
}
if (start_profiler.defined()) {
// The copy functions are followed by an assert, which we will wrap in the timed body.
const AssertStmt *copy_assert = nullptr;
Stmt other;
if (const Block *block = op->body.as<Block>()) {
if (const AssertStmt *assert = block->first.as<AssertStmt>()) {
copy_assert = assert;
other = block->rest;
}
} else if (const AssertStmt *assert = op->body.as<AssertStmt>()) {
copy_assert = assert;
}
if (copy_assert) {
std::vector<Stmt> steps;
steps.push_back(AssertStmt::make(copy_assert->condition, copy_assert->message));
if (requires_sync) {
internal_assert(call->name == "halide_copy_to_device");
Expr device_interface = call->args.back(); // The last argument to the copy_to_device calls is the device_interface.
Stmt sync_and_assert = call_extern_and_assert("halide_device_sync_global", {device_interface});
steps.push_back(sync_and_assert);
}
steps.push_back(set_current_func(stack.back()));

if (other.defined()) {
steps.push_back(mutate(other));
}
return Block::make(start_profiler,
LetStmt::make(op->name, mutate(op->value),
Block::make(steps)));
} else {
internal_error << "No assert found after buffer copy.\n";
}
}
}
}

Stmt body = mutate(op->body);
Expr value = mutate(op->value);
if (body.same_as(op->body) && value.same_as(op->value)) {
return op;
}
return LetStmt::make(op->name, value, body);
}
};

} // namespace
Expand Down
9 changes: 9 additions & 0 deletions src/runtime/HalideRuntime.h
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,15 @@ extern int halide_device_release_crop(void *user_context,
* should rarely be necessary, except maybe for profiling. */
extern int halide_device_sync(void *user_context, struct halide_buffer_t *buf);

/**
* Wait for current GPU operations to complete. Calling this explicitly
* should rarely be necessary, except maybe for profiling.
* This variation of the synchronizing is useful when a synchronization is desirable
* without specifying any buffer to synchronize on.
* Calling this with a null device_interface is always illegal.
*/
extern int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface);

/** Allocate device memory to back a halide_buffer_t. */
extern int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
const struct halide_device_interface_t *device_interface);
Expand Down
8 changes: 6 additions & 2 deletions src/runtime/d3d12compute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2786,8 +2786,12 @@ WEAK int halide_d3d12compute_device_sync(void *user_context, struct halide_buffe
return d3d12_context.error();
}

d3d12_buffer *dbuffer = peel_buffer(buffer);
d3d12compute_device_sync_internal(d3d12_context.device, dbuffer);
if (buffer != nullptr) {
d3d12_buffer *dbuffer = peel_buffer(buffer);
d3d12compute_device_sync_internal(d3d12_context.device, dbuffer);
} else {
d3d12compute_device_sync_internal(d3d12_context.device, nullptr);
}

return halide_error_code_success;
}
Expand Down
15 changes: 15 additions & 0 deletions src/runtime/device_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,21 @@ WEAK int halide_device_sync(void *user_context, struct halide_buffer_t *buf) {
}
}

/**
* Wait for current GPU operations to complete. Calling this explicitly
* should rarely be necessary, except maybe for profiling.
* This variation of the synchronizing is useful when a synchronization is desirable
* without specifying any buffer to synchronize on.
*/
WEAK int halide_device_sync_global(void *user_context, const struct halide_device_interface_t *device_interface) {
if (device_interface == nullptr) {
return halide_error_code_no_device_interface;
}
// This function calls immediately the device_interface implementation to syncrhonize on
// "no buffer" (i.e., nullptr buffer) to trigger a "global" device sync.
return device_interface->impl->device_sync(user_context, nullptr);
}

/** Allocate device memory to back a halide_buffer_t. */
WEAK int halide_device_malloc(void *user_context, struct halide_buffer_t *buf,
const halide_device_interface_t *device_interface) {
Expand Down
128 changes: 111 additions & 17 deletions src/runtime/profiler_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,14 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
};
}
}
bool support_colors = false;
const char *term = getenv("TERM");
if (term) {
// Check if the terminal supports colors
if (strstr(term, "color") || strstr(term, "xterm")) {
support_colors = true;
}
}

for (halide_profiler_pipeline_stats *p = s->pipelines; p;
p = (halide_profiler_pipeline_stats *)(p->next)) {
Expand Down Expand Up @@ -385,14 +393,31 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
if (print_f_states) {
int f_stats_count = 0;
halide_profiler_func_stats **f_stats = (halide_profiler_func_stats **)__builtin_alloca(p->num_funcs * sizeof(halide_profiler_func_stats *));
const char *substr_copy_to_device = " (copy to device)";
const char *substr_copy_to_host = " (copy to host)";

int max_func_name_length = 23; // length of the section header
int num_copy_to_device = 0;
int num_copy_to_host = 0;

int max_func_name_length = 0;
uint64_t total_func_time = 0;
uint64_t total_copy_to_device_time = 0;
uint64_t total_copy_to_host_time = 0;
for (int i = 0; i < p->num_funcs; i++) {
halide_profiler_func_stats *fs = p->funcs + i;
int name_len = strlen(fs->name);
if (name_len > max_func_name_length) {
max_func_name_length = name_len;
}
if (strstr(fs->name, substr_copy_to_device)) {
num_copy_to_device++;
total_copy_to_device_time += fs->time;
} else if (strstr(fs->name, substr_copy_to_host)) {
num_copy_to_host++;
total_copy_to_host_time += fs->time;
} else {
total_func_time += fs->time;
}
}

for (int i = 0; i < p->num_funcs; i++) {
Expand All @@ -418,18 +443,8 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
}
}

for (int i = 0; i < f_stats_count; i++) {
size_t cursor = 0;
sstr.clear();
halide_profiler_func_stats *fs = f_stats[i];

sstr << " " << fs->name << ": ";
cursor += max_func_name_length + 5;
while (sstr.size() < cursor) {
sstr << " ";
}

float ft = fs->time / (p->runs * 1000000.0f);
const auto print_time_and_percentage = [&sstr, p](uint64_t time, size_t &cursor, bool light) {
float ft = time / (p->runs * 1000000.0f);
if (ft < 10000) {
sstr << " ";
}
Expand All @@ -451,16 +466,40 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
sstr << " ";
}

int percent = 0;
int perthousand = 0;
if (p->time != 0) {
percent = (100 * fs->time) / p->time;
perthousand = (1000 * time) / p->time;
}
sstr << "(";
if (perthousand < 100) {
sstr << " ";
}
sstr << "(" << percent << "%)";
cursor += 8;
int percent = perthousand / 10;
sstr << percent << "." << (perthousand - percent * 10) << "%)";
if (!light) {
cursor += 10;
while (sstr.size() < cursor) {
sstr << " ";
}
}
};

auto print_report_entry = [&](halide_profiler_func_stats *fs, const char *suffix_cut) {
size_t cursor = 0;
sstr.clear();

sstr << " " << fs->name;
if (suffix_cut) {
sstr.erase(strlen(suffix_cut));
}
sstr << ": ";
cursor += max_func_name_length + 7;
while (sstr.size() < cursor) {
sstr << " ";
}

print_time_and_percentage(fs->time, cursor, false);

if (!serial) {
float threads = fs->active_threads_numerator / (fs->active_threads_denominator + 1e-10);
sstr << "threads: " << threads;
Expand Down Expand Up @@ -494,6 +533,61 @@ WEAK void halide_profiler_report_unlocked(void *user_context, halide_profiler_st
sstr << "\n";

halide_print(user_context, sstr.str());
};

if (num_copy_to_host == 0 && num_copy_to_device == 0) {
for (int i = 0; i < f_stats_count; i++) {
halide_profiler_func_stats *fs = f_stats[i];
print_report_entry(fs, nullptr);
}
} else {
const auto print_section_header = [&](const char *name, uint64_t total_time) {
size_t cursor = 0;
sstr.clear();
sstr << " ";
if (support_colors) {
sstr << "\033[90m\033[3m";
cursor += 9;
}
sstr << "[" << name << " ";
cursor += max_func_name_length + 7;
while (sstr.size() < cursor) {
sstr << ":";
}
print_time_and_percentage(total_time, cursor, true);
sstr << " ::::]";
if (support_colors) {
sstr << "\033[0m";
}
sstr << "\n";
halide_print(user_context, sstr.str());
};

print_section_header("funcs", total_func_time);
for (int i = 0; i < f_stats_count; i++) {
halide_profiler_func_stats *fs = f_stats[i];
if (!strstr(fs->name, substr_copy_to_device) && !strstr(fs->name, substr_copy_to_host)) {
print_report_entry(fs, nullptr);
}
}
if (num_copy_to_device) {
print_section_header("buffer copies to device", total_copy_to_device_time);
for (int i = 0; i < f_stats_count; i++) {
halide_profiler_func_stats *fs = f_stats[i];
if (strstr(fs->name, substr_copy_to_device)) {
print_report_entry(fs, substr_copy_to_device);
}
}
}
if (num_copy_to_host) {
print_section_header("buffer copies to host", total_copy_to_host_time);
for (int i = 0; i < f_stats_count; i++) {
halide_profiler_func_stats *fs = f_stats[i];
if (strstr(fs->name, substr_copy_to_host)) {
print_report_entry(fs, substr_copy_to_host);
}
}
}
}
}
}
Expand Down
1 change: 1 addition & 0 deletions src/runtime/runtime_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
(void *)&halide_device_malloc,
(void *)&halide_device_release,
(void *)&halide_device_sync,
(void *)&halide_device_sync_global,
(void *)&halide_disable_timer_interrupt,
(void *)&halide_do_par_for,
(void *)&halide_do_parallel_tasks,
Expand Down
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ tests(GROUPS correctness
debug_to_file_reorder.cpp
deferred_loop_level.cpp
deinterleave4.cpp
device_buffer_copies_with_profile.cpp
device_buffer_copy.cpp
device_copy_at_inner_loop.cpp
device_crop.cpp
Expand Down
Loading

0 comments on commit 0b813fa

Please sign in to comment.