Clarify readme & add param to turn label mirroring off

kingvision · Oct 17, 2015 · a2bf4b6 · a2bf4b6
1 parent 908d2cb
commit a2bf4b6
Show file tree

Hide file tree

Showing 4 changed files with 42 additions and 29 deletions.
diff --git a/README.md b/README.md
@@ -3,15 +3,16 @@
 This is a fork of Caffe that enables training of heatmap regressor ConvNets for the general problem of regressing (x,y) positions in images.
 
 To start training: 
-- Prepare your input images as images
+- Prepare your input images
 - Create two label files, one for training and another for testing, in the format:
 
-  train/FILE.jpg 123,144,165,123,66,22 0
+  train/FILE.jpg 123,144,165,123,66,22 372.296,720,1,480,0.53333 0
 
   This is a space-delimited file where 
   - the first arg is the path to your image
   - the second arg is a comma-delimited list of (x,y) coordinates you wish to regress
-  - the third arg is a coordinate 'cluster' (from which you have the option to evenly sample images in training). You can set this to 0.
+  - the third arg is a comma-delimited list of crops & scaling factors of the input image (in order x_left,x_right,y_left,y_right,scaling_fact). These are only used if you set a mean image for mean subtraction. Otherwise, you can set these to 0.
+  - the fourth arg is a coordinate 'cluster' (from which you have the option to evenly sample images in training). You can set this to 0.
 
 - Modify file paths in models/heatmap-flic-fusion/train_val.txt 
 - Start training: sh train_heatmap.sh heatmap-flic-fusion 1
@@ -24,15 +25,23 @@ To start training:
 - visualise: show visualisations for crops, rotations etc (recommended for testing)
 - source: label file
 - root_img_dir: directory with images (recommend you store images on ramdisk)
-- cropsize: size of random crop
-- outside: size that crops are resized to
+- meanfile: proto file containing the mean image(s) to be subtracted (optional)
+- cropsize: size of random crop (randomly cropped from the original image)
+- outsize: size that crops are resized to
 - sample_per_cluster: sample evenly across clusters
 - random_crop: do random crop (if false, do center crop)
 - label_height/width: width of regressed heatmap (must match net config)
 - segmentation: segment images on the fly (assumes images are in a segs/ directory)
-- dont_flip_first: when mirroring images for augmentation, this option allows you to turn off label mirroring (e.g. if the first joint is a head)
 - angle_max: max rotation angle for training augmentation
 
+### Pose estimation-specific parameters:
+- flip_joint_labels: when horizontally flipping images for augmentation, if this is set to true the code also swaps left<->right labels (this is important e.g. for observer-centric pose estimation). This assumes that the left and right joint labels are listed consecutively (e.g. wrist_left,wrist_right,elbow_left,elbow_right).
+- dont_flip_first: This option allows you to turn off label mirroring for the first label. E.g. for labels head,wrist_right,wrist_left,elbow_right,elbow_left,shoulder_right,shoulder_left, the first joint is head and should not be swapped with wrist_right.
+
+
+## Notes:
+- Ensure that the cropsize is set so that the crop normally covers most of the positions in the image that you wish to regress. E.g. for FLIC we prepared 256x256 cropped input images around the torso point.
+
 
 
 

diff --git a/models/heatmap-flic-fusion/train_val.prototxt b/models/heatmap-flic-fusion/train_val.prototxt
@@ -13,12 +13,12 @@ layer {
     batchsize: 14
     cropsize: 248
     outsize: 256
-#    multfact: 282
     sample_per_cluster: false
     random_crop: true
     label_width: 64
     label_height: 64
     segmentation: false
+    flip_joint_labels: true
     dont_flip_first: true
     angle_max: 40    
   }
@@ -36,7 +36,6 @@ layer {
     batchsize: 1
     cropsize: 248
     outsize: 256
-#    multfact: 282
     sample_per_cluster: false
     random_crop: false
     label_width: 64
@@ -556,6 +555,6 @@ layer {
   bottom: "label"
   bottom: "data"  
   top: "loss_fusion"
-  visualise: true
+  visualise: false
   loss_weight: 3
 }
diff --git a/src/caffe/layers/data_heatmap.cpp b/src/caffe/layers/data_heatmap.cpp
@@ -49,7 +49,7 @@ void DataHeatmapLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
     const int outsize = heatmap_data_param.outsize();
     const int label_batchsize = batchsize;
     sample_per_cluster_ = heatmap_data_param.sample_per_cluster();
-    root_img_dir_ = heatmap_data_param.root_img_dir();    
+    root_img_dir_ = heatmap_data_param.root_img_dir();
 
 
     // initialise rng seed
@@ -242,7 +242,7 @@ void DataHeatmapLayer<Dtype>::DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
 
 
     // init data
-    this->transformed_data_.Reshape(batchsize, this->datum_channels_, outsize, outsize);    
+    this->transformed_data_.Reshape(batchsize, this->datum_channels_, outsize, outsize);
     top[0]->Reshape(batchsize, this->datum_channels_, outsize, outsize);
     for (int i = 0; i < this->PREFETCH_COUNT; ++i)
         this->prefetch_[i].data_.Reshape(batchsize, this->datum_channels_, outsize, outsize);
@@ -285,7 +285,7 @@ void DataHeatmapLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
 
     // Pointers to blobs' float data
     Dtype* top_data = batch->data_.mutable_cpu_data();
-    Dtype* top_label = batch->label_.mutable_cpu_data(); 
+    Dtype* top_label = batch->label_.mutable_cpu_data();
 
     cv::Mat img, img_res, img_annotation_vis, img_mean_vis, img_vis, img_res_vis, mean_img_this, seg, segTmp;
 
@@ -297,17 +297,18 @@ void DataHeatmapLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
     const int label_width = heatmap_data_param.label_width();
     const float angle_max = heatmap_data_param.angle_max();
     const bool dont_flip_first = heatmap_data_param.dont_flip_first();
+    const bool flip_joint_labels = heatmap_data_param.flip_joint_labels();
     const int multfact = heatmap_data_param.multfact();
     const bool segmentation = heatmap_data_param.segmentation();
     const int size = heatmap_data_param.cropsize();
     const int outsize = heatmap_data_param.outsize();
     const int num_aug = 1;
-    const float resizeFact = (float)outsize / (float)size;    
-    const bool random_crop = heatmap_data_param.random_crop();    
+    const float resizeFact = (float)outsize / (float)size;
+    const bool random_crop = heatmap_data_param.random_crop();
 
     // Shortcuts to global vars
     const bool sub_mean = this->sub_mean_;
-    const int channels = this->datum_channels_;    
+    const int channels = this->datum_channels_;
 
     // What coordinates should we flip when mirroring images?
     // For pose estimation with joints assumes i=0,1 are for head, and i=2,3 left wrist, i=4,5 right wrist etc
@@ -319,7 +320,7 @@ void DataHeatmapLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
     if (visualise)
     {
         cv::namedWindow("original image", cv::WINDOW_AUTOSIZE);
-        cv::namedWindow("cropped image", cv::WINDOW_AUTOSIZE);  
+        cv::namedWindow("cropped image", cv::WINDOW_AUTOSIZE);
         cv::namedWindow("interim resize image", cv::WINDOW_AUTOSIZE);
         cv::namedWindow("resulting image", cv::WINDOW_AUTOSIZE);
     }
@@ -340,7 +341,7 @@ void DataHeatmapLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
 
         std::string img_path = this->root_img_dir_ + img_name;
         LOG(INFO) << "img: " << img_path << "  class: " << cur_class;
-        img = cv::imread(img_path, CV_LOAD_IMAGE_COLOR);       
+        img = cv::imread(img_path, CV_LOAD_IMAGE_COLOR);
 
         // show image
         if (visualise)
@@ -479,16 +480,19 @@ void DataHeatmapLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
                     // "flip" annotation joint numbers
                     // assumes i=0,1 are for head, and i=2,3 left wrist, i=4,5 right wrist etc
                     // where coordinates are (x,y)
-                    float tmp_x, tmp_y;
-                    for (int i = flip_start_ind; i < label_num_channels; i += 4)
+                    if (flip_joint_labels)
                     {
-                        CHECK_LT(i+3, label_num_channels);
-                        tmp_x = cur_label_aug[i];
-                        tmp_y = cur_label_aug[i + 1];
-                        cur_label_aug[i] = cur_label_aug[i + 2];
-                        cur_label_aug[i + 1] = cur_label_aug[i + 3];
-                        cur_label_aug[i + 2] = tmp_x;
-                        cur_label_aug[i + 3] = tmp_y;
+                        float tmp_x, tmp_y;
+                        for (int i = flip_start_ind; i < label_num_channels; i += 4)
+                        {
+                            CHECK_LT(i + 3, label_num_channels);
+                            tmp_x = cur_label_aug[i];
+                            tmp_y = cur_label_aug[i + 1];
+                            cur_label_aug[i] = cur_label_aug[i + 2];
+                            cur_label_aug[i + 1] = cur_label_aug[i + 3];
+                            cur_label_aug[i + 2] = tmp_x;
+                            cur_label_aug[i + 3] = tmp_y;
+                        }
                     }
                 }
 
@@ -621,7 +625,7 @@ void DataHeatmapLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
             const int img_size = channel_size * channels;
 
             // store image data
-            DLOG(INFO) << "storing image";            
+            DLOG(INFO) << "storing image";
             for (int c = 0; c < channels; c++)
             {
                 for (int i = 0; i < outsize; i++)
@@ -658,9 +662,9 @@ void DataHeatmapLayer<Dtype>::load_batch(Batch<Dtype>* batch) {
                             dataMatrix.at<float>((int)j, (int)i) = gaussian;
                     }
                 }
-            }       
+            }
 
-        } // jittered versions loop     
+        } // jittered versions loop
 
         DLOG(INFO) << "next image";
 

diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
@@ -626,6 +626,7 @@ message HeatmapDataParameter {
   optional uint32 label_height = 1015 [ default = 1 ];
   optional bool dont_flip_first = 1016 [ default = true ];
   optional float angle_max = 1017 [ default = 0 ];  
+  optional bool flip_joint_labels = 1018 [ default = true ];
 }
 
 // Message that stores parameters used by HDF5DataLayer