Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Kokkos View core-dump utility tool #98

Draft
wants to merge 40 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
fab189e
Implemented the checkpoint writer part of this effort
DavidPoliakoff Aug 13, 2020
fbdc34b
This is the worst thing I've ever done. I love it
DavidPoliakoff Aug 14, 2020
3c395a7
Renamed, now write the allocations in order
DavidPoliakoff Aug 14, 2020
0e2eb92
Re-added synchronize
DavidPoliakoff Aug 14, 2020
99c3bc8
Fixed the remove_if totally intuitive interface
DavidPoliakoff Aug 14, 2020
ccde1eb
Printout on finish
DavidPoliakoff Aug 14, 2020
a97f0d2
Okay, remove_if is the dumbest interface I've ever seen
DavidPoliakoff Aug 14, 2020
ceb3e00
Protobuf added to build system
DavidPoliakoff Aug 17, 2020
c509412
Bulk add of Protobuf to repo
DavidPoliakoff Aug 17, 2020
c999c5f
Updated protobuf
DavidPoliakoff Aug 17, 2020
141d087
Fixed build recipes
DavidPoliakoff Aug 17, 2020
b272e01
Added actual protocol definition
DavidPoliakoff Aug 17, 2020
39ce44f
Removed old protobuf
DavidPoliakoff Aug 17, 2020
ce2ae52
Bulk re-add protobuf
DavidPoliakoff Aug 17, 2020
8b10421
Updated reader to use protobuf
DavidPoliakoff Aug 17, 2020
3e63690
Updated writer
DavidPoliakoff Aug 17, 2020
704dcd7
Redesign to split messages
DavidPoliakoff Aug 17, 2020
88a4ed9
Big reader update
DavidPoliakoff Aug 18, 2020
6f1e0de
Added example
DavidPoliakoff Aug 18, 2020
fb7c270
Now printing the isgnal
DavidPoliakoff Aug 18, 2020
a324485
Start of type info for generated files
DavidPoliakoff Aug 18, 2020
8692c7e
Push with some build system work
DavidPoliakoff Aug 18, 2020
76a456c
Updated to add SIGINT handler
DavidPoliakoff Aug 18, 2020
0a43924
Quick commit
DavidPoliakoff Aug 18, 2020
e4e7a22
Small fix
DavidPoliakoff Aug 18, 2020
211d84c
Did what Brian said
DavidPoliakoff Aug 18, 2020
6d1cb8f
Maybe?
DavidPoliakoff Aug 18, 2020
4fc04bc
No reraising SIGINT
DavidPoliakoff Aug 19, 2020
22632d9
Added debug dump
DavidPoliakoff Aug 19, 2020
642124d
Ignore signals
DavidPoliakoff Aug 19, 2020
a505fd0
Safer name handling (though it'll leak)
DavidPoliakoff Aug 19, 2020
66c7c87
Maybe this was null?
DavidPoliakoff Aug 19, 2020
04b1071
Another attempt
DavidPoliakoff Aug 19, 2020
f50af4c
More error checking
DavidPoliakoff Aug 19, 2020
9cb39a1
More debug output
DavidPoliakoff Aug 19, 2020
ebba4ce
First take on handling corrupted allocations
DavidPoliakoff Aug 19, 2020
faf8d41
Warning if no checkpoints
DavidPoliakoff Aug 20, 2020
0b89d61
Output to rank-specified file
DavidPoliakoff Aug 20, 2020
5e897ac
File close
DavidPoliakoff Aug 20, 2020
b3f448f
Temp commit - always write the checkpoint when you hit the kernel
DavidPoliakoff Aug 25, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
33 changes: 33 additions & 0 deletions debugging/checkpointing/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST))))

CXX=nvcc
CXXFLAGS=-O0 -std=c++11 -g -I. -L$(PROTOBUF_DIR)/lib64 -I$(PROTOBUF_DIR)/include -lprotobuf
SHARED_CXXFLAGS=-shared -Xcompiler -fPIC
PROTOBUF_DIR=${MAKEFILE_PATH}/protobuf-install

all: kp_checkpoint.so reader

NVCC_WRAPPER=nvcc_wrapper
KOKKOS_DIR=$(HOME)/installs/kokkos/cuda
KOKKOS_FLAGS=-I$(KOKKOS_DIR)/include -L$(KOKKOS_DIR)/lib64 -lkokkoscore -lkokkoscontainers

protobuf-build:
mkdir ${MAKEFILE_PATH}/protobuf-build

protobuf-build/Makefile: protobuf-build
cd protobuf-build && cmake -DCMAKE_INSTALL_PREFIX=${MAKEFILE_PATH}/protobuf-install -DBUILD_SHARED_LIBS=OFF -DCMAKE_CXX_FLAGS="-fPIC" -DCMAKE_BUILD_TYPE=Release -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_CXX_COMPILER=g++ ${MAKEFILE_PATH}/protobuf/cmake/

protobuf-install/lib64/libprotobuf.a: protobuf-build/Makefile
cd ${MAKEFILE_PATH}/protobuf-build && make -j8 install

kp_checkpoint.so: ${MAKEFILE_PATH}kp_kernel_logger.cpp protobuf-install/lib64/libprotobuf.a
$(CXX) $(SHARED_CXXFLAGS) $(CXXFLAGS) -o $@ ${MAKEFILE_PATH}kp_kernel_logger.cpp ${MAKEFILE_PATH}/protocols/checkpointing.pb.cc

reader: ${MAKEFILE_PATH}reader.cpp protobuf-install/lib64/libprotobuf.a
$(CXX) $(CXXFLAGS) -o $@ ${MAKEFILE_PATH}reader.cpp ${MAKEFILE_PATH}/protocols/checkpointing.pb.cc

example: ${MAKEFILE_PATH}example.cpp
$(NVCC_WRAPPER) $(CXXFLAGS) -o $@ ${MAKEFILE_PATH}example.cpp ${MAKEFILE_PATH}/protocols/checkpointing.pb.cc ${KOKKOS_FLAGS}

clean:
rm *.so
9 changes: 9 additions & 0 deletions debugging/checkpointing/checkpointing.proto
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
syntax = "proto2";

package kokkos_checkpointing;

message View {
required string name = 1;
required int64 size = 2;
required bytes data = 3;
};
69 changes: 69 additions & 0 deletions debugging/checkpointing/example.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@

#include <Kokkos_Core.hpp>
#include <fstream>
#include <iostream>
#include <protocols/checkpointing.pb.h>
struct ptr_info {
std::string who;
void* what;
size_t how_much;
bool where;
mutable void* canonical;
};

std::pair<void*,size_t> get_ptr_data(std::ifstream& input){
size_t message_size;
input >> message_size;

char* raw_input = new char[message_size];
input.read(raw_input, message_size);

ptr_info info;
kokkos_checkpointing::View v;
bool worked = v.ParseFromString(std::string(raw_input,message_size));
info.who = v.name();
info.how_much = v.size();
char* raw_data = reinterpret_cast<char*>(malloc(info.how_much));
memcpy(raw_data, const_cast<char*>(v.data().c_str()), info.how_much);
return std::make_pair(raw_data, info.how_much);
}
template<typename Hv, typename Dv>
void show(Hv h, Dv d){
std::cout << d.label() << "["<<h.extent(0) << "] (" << std::hex << h.data()<<") "<< std::dec << h[0]<<std::endl;
for(int x = 0 ; x < h.extent(0); ++x){
if(h(x)!=0){
std::cout << "Nonzero\n";
}
}
std::cout << h(h.extent(0)-2) << std::endl;
}
int main(int argc, char* argv[]){
Kokkos::initialize(argc, argv);
{
GOOGLE_PROTOBUF_VERIFY_VERSION;

std::string filename = "checkpoint.kokkos";
if (argc > 1) {
filename = argv[1];
}
std::ifstream input(filename);
size_t num_alloc;
input >> num_alloc;
using puppies_scalar_type= float;
using puppies_view_type= Kokkos::View<puppies_scalar_type*, Kokkos::CudaSpace>; /** you need to fill in the number of dimensions, sorry */
auto puppies_data = get_ptr_data(input);
puppies_view_type::HostMirror puppies_mirror( reinterpret_cast<puppies_scalar_type*>(puppies_data.first),puppies_data.second / sizeof(puppies_scalar_type)); /** you may need to adapt this for Views > 1D */
puppies_view_type puppies("puppies",puppies_data.second / sizeof(puppies_scalar_type));
Kokkos::deep_copy(puppies,puppies_mirror);
Kokkos::fence();
show(puppies_mirror,puppies);
auto puppies_1_data = get_ptr_data(input);
puppies_view_type::HostMirror puppies_1_mirror( reinterpret_cast<puppies_scalar_type*>(puppies_1_data.first),puppies_1_data.second / sizeof(puppies_scalar_type)); /** you may need to adapt this for Views > 1D */
puppies_view_type puppies_1("puppies_1",puppies_1_data.second / sizeof(puppies_scalar_type));
Kokkos::deep_copy(puppies_1,puppies_1_mirror);
Kokkos::fence();
show(puppies_1_mirror,puppies_1);

}
Kokkos::finalize();
}
282 changes: 282 additions & 0 deletions debugging/checkpointing/kp_kernel_logger.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 3.0
// Copyright (2020) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact David Poliakoff ([email protected])
//
// ************************************************************************
//@HEADER

#include <cstdio>
#include <inttypes.h>
#include <vector>
#include <iostream>
#include <signal.h>
#include <protocols/checkpointing.pb.h>
#include <fstream>
#include <string>
#include <cstring>
#include <cuda_runtime.h>
#include <set>
#include <map>
#include <algorithm>
#include <list>
struct ptr_info {
std::string who;
void* what;
size_t how_much;
bool where;
void* canonical; // look, I'm in a hurry, I'm sorry
};
std::string rank_string = "0";
struct variable_data {
std::vector<ptr_info> instances;
void insert(ptr_info in){
instances.push_back(in);
}
void remove(void* out){
auto er =
std::remove_if(instances.begin(), instances.end(), [=](const ptr_info& test){
bool rem= (test.what == out);
return rem;
});
if(er != instances.end()) { instances.erase(er); }
}
void remove(ptr_info& out){
auto er = std::remove_if(instances.begin(), instances.end(), [=](const ptr_info& test){
return test.what == out.what;
});
if(er != instances.end()) { instances.erase(er); }
}
};

std::string trigger;
std::string output;
bool debug;
bool have_checkpointed;
struct SpaceHandle {
char name[64];
};
std::map<std::string, variable_data> allocations;

bool should_reraise(int signo){
return ((signo != 999) && (signo != SIGINT));
}

void dump_checkpoint(){
std::ofstream out(output);
int size_sum = 0;
for(auto& alloc : allocations){
size_sum += alloc.second.instances.size();
}
out << size_sum << " ";
std::cout << "Starting allocation dump\n";
if(!have_checkpointed){
std::cout << "WARNING: have not yet checkpointed memory, you're likely to see a checkpoint file with a lot of [corrupted] allocations. Check your setting of CHECKPOINT_ATTR to make sure it matches your kernel name\n"; std::cout << "WARNING: have not yet checkpointed memory, you're likely to see a checkpoint file with a lot of [corrupted] allocations. Check your setting of CHECKPOINT_ATTR to make sure it matches your kernel name\n";
}
for(auto& variable_handle: allocations){
auto& alloc_list = variable_handle.second;
for(auto& alloc: alloc_list.instances){
kokkos_checkpointing::View v;
if(debug) {
std::cout << "Dumping an alloc with name: "<<alloc.who<<std::endl;
}

if(alloc.canonical){
v.set_size(alloc.how_much);
v.set_name(alloc.who);
v.set_data(alloc.canonical, alloc.how_much);
}
else{
v.set_size(0);
v.set_name(alloc.who + " [corrupted]");
v.set_data("");
}
size_t message_size =v.ByteSizeLong();
out << message_size;
bool success = v.SerializeToOstream(&out);
if(!success){
std::cout << "Error serializing a View named "<<alloc.who<<std::endl;
exit(1);
}
//out << alloc.who << " "<< alloc.how_much;
//out.write((char*)alloc.canonical,alloc.how_much);
}
}
out.close();
}

void checkpoint_handler(int signo){
static bool second;
if(second){
if(should_reraise(signo)){
sigignore(signo);
raise(signo);
}
}
// dump_checkpoint();
// std::cout <<"Finished writing on rank "<<rank_string<<", signal was "<<signo<<std::endl;
else{
second = true;
}
if(should_reraise(signo)){
sigignore(signo);
raise(signo);
}
}

void at_exit(){
checkpoint_handler(999);
}

extern "C" void kokkosp_init_library(const int loadSeq,
const uint64_t interfaceVer,
const uint32_t devInfoCount,
void* deviceInfo) {
std::cout << "KokkosP: Initialized checkpoint tool\n";
char* index = getenv("OMPI_COMM_WORLD_RANK");
trigger = getenv("CHECKPOINT_TRIGGER_ATTR");
output = getenv("CHECKPOINT_OUTPUT") ? getenv("CHECKPOINT_OUTPUT") : "checkpoint.kokkos";
debug = (getenv("CHECKPOINT_DEBUG") != nullptr);
if(index){
output+=".rank"+std::string(index);
rank_string = index;
}
signal(SIGSEGV, checkpoint_handler);
signal(SIGTERM, checkpoint_handler);
signal(SIGINT, checkpoint_handler);
signal(SIGABRT, checkpoint_handler);
atexit(at_exit);
}

bool operator<(const ptr_info& l, const ptr_info& r){
return l.what < r.what;
}


extern "C" void kokkosp_finalize_library() {
}

void checkpoint(){
cudaDeviceSynchronize();
have_checkpointed = true;
for(auto& variable_handle: allocations){
auto& alloc_list = variable_handle.second;
for(auto& alloc: alloc_list.instances){
if(!alloc.canonical){
alloc.canonical = malloc(alloc.how_much);
}
if(alloc.where){
cudaMemcpy(alloc.canonical, alloc.what, alloc.how_much, cudaMemcpyDefault);
}
else{
memcpy(alloc.canonical, alloc.what, alloc.how_much);
}
}
}
}

extern "C" void kokkosp_begin_parallel_for(const char* name, const uint32_t devID, uint64_t* kID) {
if(name && (std::string(name) == trigger)){
checkpoint();
dump_checkpoint();
}
}




extern "C" void kokkosp_end_parallel_for(const uint64_t kID) {
}

extern "C" void kokkosp_begin_parallel_scan(const char* name, const uint32_t devID, uint64_t* kID) {
if(name && (std::string(name) == trigger)){
checkpoint();
dump_checkpoint();
}
}

extern "C" void kokkosp_end_parallel_scan(const uint64_t kID) {
}

extern "C" void kokkosp_begin_parallel_reduce(const char* name, const uint32_t devID, uint64_t* kID) {
if(name && (std::string(name) == trigger)){
checkpoint();
dump_checkpoint();
}
}

extern "C" void kokkosp_end_parallel_reduce(const uint64_t kID) {
}

extern "C" void kokkosp_push_profile_region(char* regionName) {
}

extern "C" void kokkosp_pop_profile_region() {
}




extern "C" void kokkosp_allocate_data(SpaceHandle handle, const char* name, void* ptr, uint64_t size) {
bool device = (handle.name[0] == 'C');
ptr_info info;
char* newname = new char[128];
if(name){
strncpy(newname,name, 127);
}
else {
strcpy(newname,"[default name]");
}
info.who = std::string(newname);
info.what = ptr + 128;
info.how_much = size;
info.where = device;
info.canonical = nullptr;
allocations[name].insert(info);
}

extern "C" void kokkosp_deallocate_data(SpaceHandle handle, const char* name, void* ptr, uint64_t size) {
if(allocations.find(name)!=allocations.end()){
allocations[name].remove(ptr + 128);
}
}

extern "C" void kokkosp_begin_deep_copy(
SpaceHandle dst_handle, const char* dst_name, const void* dst_ptr,
SpaceHandle src_handle, const char* src_name, const void* src_ptr,
uint64_t size) {
}
Loading