Skip to content

Commit

Permalink
[WIP] Adopt Treelite 4.0, rewrite compiler
Browse files Browse the repository at this point in the history
  • Loading branch information
hcho3 committed Feb 12, 2024
1 parent 9d135f4 commit fc9d14f
Show file tree
Hide file tree
Showing 61 changed files with 1,946 additions and 2,166 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ repos:
language: python
args: [
"--linelength=100", "--recursive",
"--filter=-build/c++11,-build/include,-build/namespaces_literals,+build/include_what_you_use,+build/include_order",
"--filter=-build/c++11,-build/include,-build/namespaces_literals,-runtime/references,+build/include_what_you_use,+build/include_order",
"--root=include"]
additional_dependencies: [cpplint]
types_or: [c++]
Expand Down
6 changes: 3 additions & 3 deletions cmake/ExternalLibs.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
include(FetchContent)

# Treelite
find_package(Treelite 3.4.0)
find_package(Treelite 4.0.0)
if (Treelite_FOUND)
set(TREELITE_FROM_SYSTEM_ROOT TRUE)
set(TREELITE_LIB treelite::treelite)
Expand All @@ -10,11 +10,11 @@ else ()
FetchContent_Declare(
treelite
GIT_REPOSITORY https://github.com/dmlc/treelite.git
GIT_TAG 3.9.0
GIT_TAG 4.0.0
)
set(Treelite_BUILD_STATIC_LIBS ON)
FetchContent_MakeAvailable(treelite)
set_target_properties(treelite treelite_runtime PROPERTIES EXCLUDE_FROM_ALL TRUE)
set_target_properties(treelite PROPERTIES EXCLUDE_FROM_ALL TRUE)
target_include_directories(treelite_static PUBLIC
$<BUILD_INTERFACE:${treelite_SOURCE_DIR}/include>
$<BUILD_INTERFACE:${treelite_BINARY_DIR}/include>
Expand Down
2 changes: 1 addition & 1 deletion include/tl2cgen/annotator.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ class BranchAnnotator {
* \code
* BranchAnnotator annotator;
* annotator.Load(fi); // load from a stream
* std::vector<std::vector<size_t>> annot = annotator.Get();
* std::vector<std::vector<std::uint64_t>> annot = annotator.Get();
* // access the frequency count for a specific node in a tree
* TL2CGEN_LOG(INFO) << "Tree " << tree_id << ", Node " << node_id << ": "
* << annot[tree_id][node_id];
Expand Down
37 changes: 15 additions & 22 deletions include/tl2cgen/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,6 @@
typedef void* TL2cgenModelHandle;
/*! \brief Handle to branch annotation data */
typedef void* TL2cgenAnnotationHandle;
/*! \brief Handle to compiler class */
typedef void* TL2cgenCompilerHandle;
/*! \brief Handle to a data matrix */
typedef void* TL2cgenDMatrixHandle;
/*! \brief Handle to predictor class */
Expand Down Expand Up @@ -158,41 +156,36 @@ TL2CGEN_DLL int TL2cgenAnnotationFree(TL2cgenAnnotationHandle handle);
* \defgroup compiler Compiler interface
* \{
*/
/*!
* \brief Create a compiler with a given name
* \param name Name of compiler
* \param params_json_str JSON string representing the parameters for the compiler
* \param out Created compiler
* \return 0 for success, -1 for failure
*/
TL2CGEN_DLL int TL2cgenCompilerCreate(
char const* name, char const* params_json_str, TL2cgenCompilerHandle* out);
/*!
* \brief Generate prediction code from a tree ensemble model. The code will
* be C99 compliant. One header file (.h) will be generated, along with
* one or more source files (.c).
*
* Usage example:
* \code
* TL2cgenCompilerGenerateCode(compiler, model, "./my/model");
* TL2cgenCompilerGenerateCode(model, compiler_params_json_str, "./my/model");
* // files to generate: ./my/model/header.h, ./my/model/main.c
* // if parallel compilation is enabled:
* // ./my/model/header.h, ./my/model/main.c, ./my/model/tu0.c,
* // ./my/model/tu1.c, and so forth
* \endcode
* \param compiler Handle for compiler
* \param model Handle for tree ensemble model
* \param compiler_params_json_str JSON string representing the parameters for the compiler
* \param dirpath Directory to store header and source files
* \return 0 for success, -1 for failure
*/
TL2CGEN_DLL int TL2cgenCompilerGenerateCode(
TL2cgenCompilerHandle compiler, TL2cgenModelHandle model, char const* dirpath);
TL2CGEN_DLL int TL2cgenGenerateCode(
TL2cgenModelHandle model, char const* compiler_params_json_str, char const* dirpath);
/*!
* \brief Delete compiler from memory
* \param handle Compiler to remove
* \brief Obtain human-readable representation of Abstract Syntax Tree (AST) generated by
* the compiler. Useful for debugging the code generation process.
* \param model Handle for tree ensemble model
* \param compiler_params_json_str JSON string representing the parameters for the compiler
* \param out_dump_str Pointer to store the returned string
* \return 0 for success, -1 for failure
*/
TL2CGEN_DLL int TL2cgenCompilerFree(TL2cgenCompilerHandle handle);
TL2CGEN_DLL int TL2cgenDumpAST(
TL2cgenModelHandle model, char const* compiler_params_json_str, char const** out_dump_str);
/*! \} */

/*!
Expand Down Expand Up @@ -304,12 +297,12 @@ TL2CGEN_DLL int TL2cgenPredictorPredictBatch(TL2cgenPredictorHandle predictor,
*
* Note. To access the element values from the output vector, you should convert the opaque
* handle (\ref TL2cgenPredictorOutputHandle type) to an appropriate pointer LeafOutputType*,
* where the type is either float, double, or uint32_t. So carry out the following steps:
* where the type is either float or double. So carry out the following steps:
* 1. Call \ref TL2cgenPredictorQueryLeafOutputType to obtain the type of the leaf output.
* It will return a string ("float32", "float64", or "uint32") representing the type.
* It will return a string ("float32" or "float64") representing the type.
* 2. Extract the void* pointer from the output vector object by calling
* \ref TL2cgenPredictorGetRawPointerFromOutputVector.
* 3. Depending on the type string, cast the void* pointer to float*, double*, or uint32_t*.
* 3. Depending on the type string, cast the void* pointer to float* or double*.
* 4. Now access the array with the casted pointer. The array's length is given by
* \ref TL2cgenPredictorQueryResultSize.
* \param predictor Predictor
Expand Down Expand Up @@ -369,7 +362,7 @@ TL2CGEN_DLL int TL2cgenPredictorQueryNumFeature(TL2cgenPredictorHandle predictor
* \param out Name of post prediction transformation
* \return 0 for success, -1 for failure
*/
TL2CGEN_DLL int TL2cgenPredictorQueryPredTransform(
TL2CGEN_DLL int TL2cgenPredictorQueryPostprocessor(
TL2cgenPredictorHandle predictor, char const** out);

/*!
Expand Down
75 changes: 22 additions & 53 deletions include/tl2cgen/compiler.h
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
/*!
* Copyright (c) 2023 by Contributors
* Copyright (c) 2023-2024 by Contributors
* \file compiler.h
* \brief Interface of compiler that compiles a tree ensemble model
* \brief Compiler that generates C code from a tree ensemble model
* \author Hyunsu Cho
*/
#ifndef TL2CGEN_COMPILER_H_
#define TL2CGEN_COMPILER_H_

#include <filesystem>
#include <functional>
#include <memory>
#include <string>
Expand All @@ -18,61 +19,29 @@ namespace treelite {
class Model; // forward declaration
} // namespace treelite

namespace tl2cgen {

namespace compiler {
namespace tl2cgen::compiler {

struct CompilerParam; // forward declaration

/*!
 * \brief Output of a compilation run: a set of named files plus a file prefix.
 */
struct CompiledModel {
  /*!
   * \brief Contents of a single generated file.
   *
   * Holds either text (in \c content) or raw bytes (in \c content_binary);
   * \c is_binary tells which of the two is populated. The entry is move-only,
   * and it can only be built from rvalues, so that large payloads are never
   * copied.
   */
  struct FileEntry {
    std::string content;
    std::vector<char> content_binary;
    bool is_binary{false};

    /*! \brief Empty text entry */
    FileEntry() {}

    /*! \brief Text entry; takes ownership of the given string */
    explicit FileEntry(std::string&& text) : content(std::move(text)) {}
    /*! \brief Binary entry; takes ownership of the given byte buffer */
    explicit FileEntry(std::vector<char>&& bytes)
        : content_binary(std::move(bytes)), is_binary{true} {}

    // Constructing from an lvalue would force a copy, so it is disallowed
    explicit FileEntry(std::string const&) = delete;
    explicit FileEntry(std::vector<char> const&) = delete;

    // Move-only type
    FileEntry(FileEntry&&) = default;
    FileEntry& operator=(FileEntry&&) = default;
    FileEntry(FileEntry const&) = delete;
    FileEntry& operator=(FileEntry const&) = delete;
  };

  /*! \brief Generated files, keyed by a string (NOTE(review): presumably the file name) */
  std::unordered_map<std::string, FileEntry> files;
  /*! \brief Prefix associated with the generated files */
  std::string file_prefix;
};

} // namespace compiler
/*!
* \brief Compile tree model into C code
* \param model Model to compile
* \param param Parameters to control code generation
* \param dirpath Path to directory to store the generated C code (represented as human-readable text).
* Depending on the parameters, this directory may contain more than one source file.
*/
void CompileModel(
treelite::Model const& model, CompilerParam const& param, std::filesystem::path const& dirpath);

/*!
* \brief Abstract interface of a compiler. Concrete compilers implement Compile() and
*        QueryParam(); instances are obtained by name through Create().
*/
class Compiler {
public:
/*! \brief Virtual destructor, so that a compiler can be destroyed through a base pointer */
virtual ~Compiler() = default;
/*!
* \brief Convert a tree ensemble model into a compiled model
* \param model Tree ensemble model to compile
* \return Compiled model
*/
virtual compiler::CompiledModel Compile(treelite::Model const& model) = 0;
/*!
* \brief Query the parameters used to initialize the compiler
* \return Parameters used
*/
virtual compiler::CompilerParam QueryParam() const = 0;
/*!
* \brief Create a compiler from given name
* \param name Name of compiler
* \param param_json_str JSON string representing compiler configuration
* \return The created compiler
*         (NOTE(review): raw pointer; ownership presumably passes to the caller -- confirm)
*/
static Compiler* Create(std::string const& name, char const* param_json_str);
};
/*!
* \brief Obtain human-readable representation of Abstract Syntax Tree (AST) generated by
* the compiler. Useful for debugging the code generation process.
* \param model Tree ensemble model
* \param param Parameters to control code generation
* \return Text dump of AST
*/
std::string DumpAST(treelite::Model const& model, CompilerParam const& param);

} // namespace tl2cgen
} // namespace tl2cgen::compiler

#endif // TL2CGEN_COMPILER_H_
28 changes: 8 additions & 20 deletions include/tl2cgen/compiler_param.h
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
/*!
* Copyright (c) 2023 by Contributors
* Copyright (c) 2024 by Contributors
* \file compiler_param.h
* \brief Parameters for tree compiler
* \brief Compiler parameters
* \author Hyunsu Cho
*/
#ifndef TL2CGEN_COMPILER_PARAM_H_
#define TL2CGEN_COMPILER_PARAM_H_

#include <limits>
#include <string>

namespace tl2cgen::compiler {

/*! \brief parameters for tree compiler */
/*! \brief Parameters to control code generation */
struct CompilerParam {
/*!
* \defgroup compiler_param Parameters for tree compiler
Expand All @@ -24,30 +23,19 @@ struct CompilerParam {
* Use :py:func:`tl2cgen.annotate_branch` to generate this file.
* \endverbatim
*/
std::string annotate_in;
std::string annotate_in{"NULL"};
/*! \brief Whether to quantize threshold points (0: no, >0: yes) */
int quantize;
int quantize{0};
/*! \brief Option to enable parallel compilation;
if set to nonzero, the trees will be evenly distributed
into ``[parallel_comp]`` files. Set this option to improve
compilation time and reduce memory consumption during
compilation. */
int parallel_comp;
int parallel_comp{0};
/*! \brief If >0, produce extra messages */
int verbose;
int verbose{0};
/*! \brief Native lib name (without extension) */
std::string native_lib_name;
/*! \brief Parameter for folding rarely visited subtrees (no if/else blocks);
all nodes whose data counts are lower than that of the root node
of the decision tree by ``[code_folding_req]`` will be
folded. To disable folding, set to ``+inf``. If hessian sums are
available, they will be used as proxies of data counts. */
double code_folding_req;
/*! \brief Only applicable when ``compiler`` is set to ``failsafe``. If set to a positive value,
the fail-safe compiler will not emit large constant arrays to the C code. Instead,
the arrays will be emitted as an ELF binary (Linux only). For large arrays, it is
much faster to directly dump ELF binaries than to pass them to a C compiler. */
int dump_array_as_elf;
std::string native_lib_name{"predictor"};
/*! \} */

static CompilerParam ParseFromJSON(char const* param_json_str);
Expand Down
Loading

0 comments on commit fc9d14f

Please sign in to comment.