diff --git a/README.md b/README.md index 640a4a018..4741ab7bb 100644 --- a/README.md +++ b/README.md @@ -3,15 +3,16 @@ This is a fork of Caffe that enables training of heatmap regressor ConvNets for the general problem of regressing (x,y) positions in images. To start training: -- Prepare your input images as images +- Prepare your input images - Create two label files, one for training and another for testing, in the format: - train/FILE.jpg 123,144,165,123,66,22 0 + train/FILE.jpg 123,144,165,123,66,22 372.296,720,1,480,0.53333 0 This is a space-delimited file where - the first arg is the path to your image - the second arg is a comma-delimited list of (x,y) coordinates you wish to regress - - the third arg is a coordinate 'cluster' (from which you have the option to evenly sample images in training). You can set this to 0. + - the third arg is a comma-delimited list of crops & scaling factors of the input image (in order x_left,x_right,y_left,y_right,scaling_fact). These are only used if you set a mean image for mean subtraction. You can set these to 0 otherwise. + - the fourth arg is a coordinate 'cluster' (from which you have the option to evenly sample images in training). You can set this to 0. 
- Modify file paths in models/heatmap-flic-fusion/train_val.txt - Start training: sh train_heatmap.sh heatmap-flic-fusion 1 @@ -24,15 +25,23 @@ To start training: - visualise: show visualisations for crops, rotations etc (recommended for testing) - source: label file - root_img_dir: directory with images (recommend you store images on ramdisk) -- cropsize: size of random crop -- outside: size that crops are resized to +- meanfile: proto file containing the mean image(s) to be subtracted (optional) +- cropsize: size of random crop (randomly cropped from the original image) +- outsize: size that crops are resized to - sample_per_cluster: sample evenly across clusters - random_crop: do random crop (if false, do center crop) - label_height/width: width of regressed heatmap (must match net config) - segmentation: segment images on the fly (assumes images are in a segs/ directory) -- dont_flip_first: when mirroring images for augmentation, this option allows you to turn off label mirroring (e.g. if the first joint is a head) - angle_max: max rotation angle for training augmentation +### Pose estimation-specific parameters: +- flip_joint_labels: when horizontally flipping images for augmentation, if this is set to true the code also swaps left<->right labels (this is important e.g. for observer-centric pose estimation). This assumes that the left,right joint labels are listed consecutively (e.g. wrist_left,wrist_right,elbow_left,elbow_right) +- dont_flip_first: This option allows you to turn off label mirroring for the first label. E.g. for labels head,wrist_right,wrist_left,elbow_right,elbow_left,shoulder_right,shoulder_left, the first joint is head and should not be swapped with wrist_right. + + +## Notes: +- Ensure that the cropsize is set so that the crop normally covers most of the positions in the image that you wish to regress. E.g. for FLIC we prepared 256x256 cropped input images around the torso point. 
+ diff --git a/models/heatmap-flic-fusion/train_val.prototxt b/models/heatmap-flic-fusion/train_val.prototxt index 760f2ecdb..1f00faff3 100644 --- a/models/heatmap-flic-fusion/train_val.prototxt +++ b/models/heatmap-flic-fusion/train_val.prototxt @@ -13,12 +13,12 @@ layer { batchsize: 14 cropsize: 248 outsize: 256 -# multfact: 282 sample_per_cluster: false random_crop: true label_width: 64 label_height: 64 segmentation: false + flip_joint_labels: true dont_flip_first: true angle_max: 40 } @@ -36,7 +36,6 @@ layer { batchsize: 1 cropsize: 248 outsize: 256 -# multfact: 282 sample_per_cluster: false random_crop: false label_width: 64 @@ -556,6 +555,6 @@ layer { bottom: "label" bottom: "data" top: "loss_fusion" - visualise: true + visualise: false loss_weight: 3 } diff --git a/src/caffe/layers/data_heatmap.cpp b/src/caffe/layers/data_heatmap.cpp index 3dc242413..712232de1 100644 --- a/src/caffe/layers/data_heatmap.cpp +++ b/src/caffe/layers/data_heatmap.cpp @@ -49,7 +49,7 @@ void DataHeatmapLayer::DataLayerSetUp(const vector*>& bottom, const int outsize = heatmap_data_param.outsize(); const int label_batchsize = batchsize; sample_per_cluster_ = heatmap_data_param.sample_per_cluster(); - root_img_dir_ = heatmap_data_param.root_img_dir(); + root_img_dir_ = heatmap_data_param.root_img_dir(); // initialise rng seed @@ -242,7 +242,7 @@ void DataHeatmapLayer::DataLayerSetUp(const vector*>& bottom, // init data - this->transformed_data_.Reshape(batchsize, this->datum_channels_, outsize, outsize); + this->transformed_data_.Reshape(batchsize, this->datum_channels_, outsize, outsize); top[0]->Reshape(batchsize, this->datum_channels_, outsize, outsize); for (int i = 0; i < this->PREFETCH_COUNT; ++i) this->prefetch_[i].data_.Reshape(batchsize, this->datum_channels_, outsize, outsize); @@ -285,7 +285,7 @@ void DataHeatmapLayer::load_batch(Batch* batch) { // Pointers to blobs' float data Dtype* top_data = batch->data_.mutable_cpu_data(); - Dtype* top_label = 
batch->label_.mutable_cpu_data(); + Dtype* top_label = batch->label_.mutable_cpu_data(); cv::Mat img, img_res, img_annotation_vis, img_mean_vis, img_vis, img_res_vis, mean_img_this, seg, segTmp; @@ -297,17 +297,18 @@ void DataHeatmapLayer::load_batch(Batch* batch) { const int label_width = heatmap_data_param.label_width(); const float angle_max = heatmap_data_param.angle_max(); const bool dont_flip_first = heatmap_data_param.dont_flip_first(); + const bool flip_joint_labels = heatmap_data_param.flip_joint_labels(); const int multfact = heatmap_data_param.multfact(); const bool segmentation = heatmap_data_param.segmentation(); const int size = heatmap_data_param.cropsize(); const int outsize = heatmap_data_param.outsize(); const int num_aug = 1; - const float resizeFact = (float)outsize / (float)size; - const bool random_crop = heatmap_data_param.random_crop(); + const float resizeFact = (float)outsize / (float)size; + const bool random_crop = heatmap_data_param.random_crop(); // Shortcuts to global vars const bool sub_mean = this->sub_mean_; - const int channels = this->datum_channels_; + const int channels = this->datum_channels_; // What coordinates should we flip when mirroring images? 
// For pose estimation with joints assumes i=0,1 are for head, and i=2,3 left wrist, i=4,5 right wrist etc @@ -319,7 +320,7 @@ void DataHeatmapLayer::load_batch(Batch* batch) { if (visualise) { cv::namedWindow("original image", cv::WINDOW_AUTOSIZE); - cv::namedWindow("cropped image", cv::WINDOW_AUTOSIZE); + cv::namedWindow("cropped image", cv::WINDOW_AUTOSIZE); cv::namedWindow("interim resize image", cv::WINDOW_AUTOSIZE); cv::namedWindow("resulting image", cv::WINDOW_AUTOSIZE); } @@ -340,7 +341,7 @@ void DataHeatmapLayer::load_batch(Batch* batch) { std::string img_path = this->root_img_dir_ + img_name; LOG(INFO) << "img: " << img_path << " class: " << cur_class; - img = cv::imread(img_path, CV_LOAD_IMAGE_COLOR); + img = cv::imread(img_path, CV_LOAD_IMAGE_COLOR); // show image if (visualise) @@ -479,16 +480,19 @@ void DataHeatmapLayer::load_batch(Batch* batch) { // "flip" annotation joint numbers // assumes i=0,1 are for head, and i=2,3 left wrist, i=4,5 right wrist etc // where coordinates are (x,y) - float tmp_x, tmp_y; - for (int i = flip_start_ind; i < label_num_channels; i += 4) + if (flip_joint_labels) { - CHECK_LT(i+3, label_num_channels); - tmp_x = cur_label_aug[i]; - tmp_y = cur_label_aug[i + 1]; - cur_label_aug[i] = cur_label_aug[i + 2]; - cur_label_aug[i + 1] = cur_label_aug[i + 3]; - cur_label_aug[i + 2] = tmp_x; - cur_label_aug[i + 3] = tmp_y; + float tmp_x, tmp_y; + for (int i = flip_start_ind; i < label_num_channels; i += 4) + { + CHECK_LT(i + 3, label_num_channels); + tmp_x = cur_label_aug[i]; + tmp_y = cur_label_aug[i + 1]; + cur_label_aug[i] = cur_label_aug[i + 2]; + cur_label_aug[i + 1] = cur_label_aug[i + 3]; + cur_label_aug[i + 2] = tmp_x; + cur_label_aug[i + 3] = tmp_y; + } } } @@ -621,7 +625,7 @@ void DataHeatmapLayer::load_batch(Batch* batch) { const int img_size = channel_size * channels; // store image data - DLOG(INFO) << "storing image"; + DLOG(INFO) << "storing image"; for (int c = 0; c < channels; c++) { for (int i = 0; i < outsize; 
i++) @@ -658,9 +662,9 @@ void DataHeatmapLayer::load_batch(Batch* batch) { dataMatrix.at((int)j, (int)i) = gaussian; } } - } + } - } // jittered versions loop + } // jittered versions loop DLOG(INFO) << "next image"; diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index b0c179846..8cea8f9a6 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -626,6 +626,7 @@ message HeatmapDataParameter { optional uint32 label_height = 1015 [ default = 1 ]; optional bool dont_flip_first = 1016 [ default = true ]; optional float angle_max = 1017 [ default = 0 ]; + optional bool flip_joint_labels = 1018 [ default = true ]; } // Message that stores parameters used by HDF5DataLayer