# test_model.py

import caffe
import numpy as np
import cv2
import sg_utils as utils
import cap_eval_utils
from IPython.core.debugger import Tracer
# import caffe

def load_model(prototxt_file, model_file, base_image_size, mean, vocab):
  """
  Bundle a Caffe network together with the settings used to run it.

  The network is instantiated in caffe.TEST mode from prototxt_file and
  model_file. The returned dict has the keys 'net', 'base_image_size',
  'means' (the mean argument) and 'vocab'.
  """
  net = caffe.Net(prototxt_file, model_file, caffe.TEST)
  return {
    'net': net,
    'base_image_size': base_image_size,
    'means': mean,
    'vocab': vocab,
  }

def output_words_image(threshold_metric, output_metric, min_words, threshold, vocab, is_functional):
  """
  Pick the words to report for a single image.

  Words are ranked by decreasing threshold_metric. A ranked word is kept when
  its threshold_metric is >= threshold, or when the running count (inclusive,
  in rank order) of True entries of is_functional is still below
  1 + min_words.

  NOTE(review): despite its name, the caller in this file passes an
  is_functional mask that is True for words that are NOT in the functional
  word list, so min_words effectively counts non-functional words.

  Returns a list of (word, output_metric value, threshold_metric value)
  tuples in rank order.
  """
  # Vocabulary indices from highest to lowest threshold_metric
  ranking = np.argsort(threshold_metric)[::-1]

  passes_threshold = threshold_metric[ranking] >= threshold
  within_quota = np.cumsum(is_functional[ranking]) < 1 + min_words
  keep = np.logical_or(passes_threshold, within_quota)

  words = vocab['words']
  return [(words[k], output_metric[k], threshold_metric[k]) for k in ranking[keep]]

def output_words(imdb, detection_file, eval_file, vocab,
  threshold_metric_name, output_metric_name, threshold, min_words, output_file = None,
  functional_words = ('a', 'on', 'of', 'the', 'in', 'with', 'and', 'is', 'to', 'an', 'two', 'at', 'next', 'are')):
  """
  Output the words as generated by the model. Loads the detections from
  detection_file, score precision mapping from eval_file and output the words
  in output_file. Words in the output_file are sorted according to the
  threshold_metric_name and report the output_metric_name.

  threshold_metric_name and output_metric_name are keys into the loaded
  detections; besides the keys stored in detection_file, 'prec' is available
  (the per-word precision computed here from the eval_file mapping).
  threshold and min_words are forwarded to output_words_image.

  When output_file is not None, one line per image is written in the form
  '<image index>: <word> (<output metric>), ...'.

  Returns the per-image list of results from output_words_image, so the
  selection is usable even when no output_file is given.
  """
  dt = utils.load_variables(detection_file)
  pt = utils.load_variables(eval_file)

  # True for vocabulary words that are NOT in functional_words; this is the
  # mask output_words_image counts against min_words.
  is_content_word = np.array([w not in functional_words for w in vocab['words']])

  # Map every detection score to a precision, one vocabulary word at a time
  mil_prob = dt['mil_prob']
  prec = np.zeros(mil_prob.shape)
  n_words = prec.shape[1]
  for jj in range(n_words):
    prec[:,jj] = cap_eval_utils.compute_precision_score_mapping(
      pt['details']['score'][:,jj].copy(),
      pt['details']['precision'][:,jj].copy(),
      mil_prob[:,jj])
    utils.tic_toc_print(1, 'compute precision score mapping: {:4d} / {:4d}'.format(jj, n_words))
  dt['prec'] = prec

  out_all = []
  for i in range(imdb.num_images):
    out_all.append(output_words_image(
      dt[threshold_metric_name][i,:], dt[output_metric_name][i,:],
      min_words, threshold, vocab, is_content_word))
    utils.tic_toc_print(1, 'output words image: {:4d} / {:4d}'.format(i, imdb.num_images))

  if output_file is not None:
    with open(output_file, 'wt') as f:
      for i in range(imdb.num_images):
        f.write('{:d}: '.format(imdb.image_index[i]))
        for word, value, _ in out_all[i]:
          f.write('{:s} ({:.2f}), '.format(word, value))
        f.write('\n')

  return out_all

def test_model(imdb, model, detection_file = None):
  """
  Tests model and stores detections on disk.

  Every image listed in imdb.image_index is read with cv2.imread and passed
  through test_img using model['net'], model['base_image_size'] and
  model['means']. The two outputs of test_img are collected row by row into
  the arrays sc and mil_prob of shape
  (imdb.num_images, len(model['vocab']['words'])).

  When detection_file is not None, sc, mil_prob, model['vocab'] and imdb are
  saved to it (overwriting) under the names 'sc', 'mil_prob', 'vocab' and
  'imdb'. Nothing is returned.

  Raises IOError if an image cannot be read.
  """
  n_words = len(model['vocab']['words'])
  n_images = len(imdb.image_index)
  # Builtin float is what the removed np.float alias referred to (float64)
  sc = np.zeros((imdb.num_images, n_words), dtype=float)
  mil_prob = np.zeros((imdb.num_images, n_words), dtype=float)
  for i in range(n_images):
    image_path = imdb.image_path_at(i)
    im = cv2.imread(image_path)
    if im is None:
      # cv2.imread signals failure by returning None rather than raising
      raise IOError('Unable to read image: {}'.format(image_path))
    sc[i,:], mil_prob[i,:] = test_img(im, model['net'], model['base_image_size'], model['means'])
    utils.tic_toc_print(60, 'test_img : {:6d}/{:6d}'.format(i, n_images))

  if detection_file is not None:
    utils.save_variables(detection_file, [sc, mil_prob, model['vocab'], imdb],
      ['sc', 'mil_prob', 'vocab', 'imdb'], overwrite = True)

def benchmark(imdb, vocab, gt_label, num_references, detection_file, eval_file = None):
  """
  Benchmark the detections stored in detection_file against gt_label.

  For every vocabulary word i, cap_eval_utils.calc_pr_ovr is run on column i
  of gt_label and of the stored 'mil_prob' array, and
  cap_eval_utils.human_agreement on column i of gt_label; num_references is
  forwarded to both. From the resulting curves three operating points are
  extracted per word:
    prec_at_human_rec - precision at the first index where recall >= human recall
                        (NaN when recall never reaches it)
    rec_at_human_prec - recall at the last index where precision >= human precision
                        (0 when precision never reaches it)
    rec_at_half_prec  - recall at the last index where precision >= 0.5
                        (0 when precision never reaches it)

  A summary row (mean ap, prec_at_human_rec and human_prec, times 100) is
  printed for each distinct entry of vocab['poss'] and for all words together.

  When eval_file is not None, details, the summary rows, vocab and imdb are
  saved to it (overwriting) under 'details', 'agg', 'vocab' and 'imdb'.

  Returns the details dict with keys 'precision', 'recall', 'ap', 'score',
  'prec_at_human_rec', 'rec_at_human_prec', 'human_prec', 'human_rec' and
  'rec_at_half_prec'.
  """
  dt = utils.load_variables(detection_file)
  mil_prob = dt['mil_prob']

  # Builtin float is what the removed np.float alias referred to (float64)
  n_words           = len(vocab['words'])
  P                 = np.zeros(mil_prob.shape, dtype = float)
  R                 = np.zeros(mil_prob.shape, dtype = float)
  score             = np.zeros(mil_prob.shape, dtype = float)
  ap                = np.zeros((1, n_words), dtype   = float)

  human_prec        = np.zeros((1, n_words), dtype   = float)
  human_rec         = np.zeros((1, n_words), dtype   = float)

  # NaN marks words whose recall never reaches the human recall
  prec_at_human_rec = np.full((1, n_words), np.nan, dtype = float)
  rec_at_human_prec = np.zeros((1, n_words), dtype   = float)
  rec_at_half_prec  = np.zeros((1, n_words), dtype   = float)

  for i in range(n_words):
    utils.tic_toc_print(1, 'benchmarking : {:4d} / {:4d}'.format(i, n_words))
    P[:,i], R[:,i], score[:,i], ap[0,i] = cap_eval_utils.calc_pr_ovr(gt_label[:,i], mil_prob[:,i], num_references)
    human_prec[0,i], human_rec[0,i]  = cap_eval_utils.human_agreement(gt_label[:,i], num_references)

    # np.flatnonzero yields indices in ascending order, so [0] / [-1] are the
    # first / last positions satisfying each condition.
    ind = np.flatnonzero(R[:,i] >= human_rec[0,i])
    if ind.size > 0:
      prec_at_human_rec[0,i] = P[ind[0], i]

    ind = np.flatnonzero(P[:,i] >= human_prec[0,i])
    if ind.size > 0:
      rec_at_human_prec[0,i] = R[ind[-1], i]

    ind = np.flatnonzero(P[:,i] >= 0.5)
    if ind.size > 0:
      rec_at_half_prec[0,i]  = R[ind[-1], i]

  details = {'precision': P, 'recall': R, 'ap': ap, 'score': score,
    'prec_at_human_rec': prec_at_human_rec, 'rec_at_human_prec': rec_at_human_prec,
    'human_prec': human_prec, 'human_rec': human_rec, 'rec_at_half_prec': rec_at_half_prec}

  # Collect statistics over the POS, then over the whole vocabulary
  agg = []
  for pos in list(set(vocab['poss'])):
    ind = [i for i,x in enumerate(vocab['poss']) if pos == x]
    agg.append(_summarize_words(pos, ind, ap, prec_at_human_rec, human_prec))
  agg.append(_summarize_words('all', list(range(n_words)), ap, prec_at_human_rec, human_prec))

  if eval_file is not None:
    utils.save_variables(eval_file, [details, agg, vocab, imdb],
      ['details', 'agg', 'vocab', 'imdb'], overwrite = True)

  return details

def _summarize_words(pos, ind, ap, prec_at_human_rec, human_prec):
  """
  Print and return the mean metrics (times 100) over the word indices ind.

  Returns a dict with keys 'pos', 'ap', 'prec_at_human_rec' and 'human_prec'.
  """
  row = {'pos': pos, 'ap': 100*np.mean(ap[0, ind]),
    'prec_at_human_rec': 100*np.mean(prec_at_human_rec[0, ind]),
    'human_prec': 100*np.mean(human_prec[0, ind])}
  # Single-argument print call behaves the same as a statement on Python 2
  print("    {:5s} [{:4d}]     :     {:5.2f}     {:5.2f}     {:5.2f}".format(
    pos, len(ind), row['ap'], row['prec_at_human_rec'], row['human_prec']))
  return row

def test_img(im, net, base_image_size, means):
  """
  Run a single image through the Caffe net and fetch its outputs.

  The image is mean-subtracted, passed through upsample_image with
  base_image_size, and fed to net as the 'data' input. Returns (sc, mil_prob):
  copies of the 'mil_max' and 'mil' blobs, each flattened to shape (1, size).
  """
  # Work on a float32 copy so the caller's image is not modified
  centered = im.astype(np.float32, copy=True)
  centered -= means

  # Only the padded square canvas is needed from upsample_image
  canvas = upsample_image(centered, base_image_size)[0]

  # Move the channel axis to the front and add a leading batch axis of size 1
  batch = np.transpose(canvas, axes = (2, 0, 1))[np.newaxis, :, :, :]

  net.forward(data=batch.astype(np.float32, copy=False))

  sc = net.blobs['mil_max'].data.copy()
  mil_prob = net.blobs['mil'].data.copy()

  # Flatten each output into a single row
  return sc.reshape((1, sc.size)), mil_prob.reshape((1, mil_prob.size))


def upsample_image(im, sz):
  """
  Rescale im by sz / max(height, width) and pad it to an sz x sz x 3 canvas.

  The resized image (bilinear interpolation, same factor on both axes) is
  written into the top-left corner of a zero-filled float canvas.

  Returns (I_out, I, SZ): the padded canvas, the resized image, and the
  resized image's shape.
  """
  h = im.shape[0]
  w = im.shape[1]
  # Builtin float replaces the np.float alias, which newer NumPy removed
  scale = float(sz) / float(max(h, w))
  I_out = np.zeros((sz, sz, 3), dtype = float)
  I = cv2.resize(im, None, None, fx = scale, fy = scale, interpolation=cv2.INTER_LINEAR)
  SZ = I.shape
  I_out[0:I.shape[0], 0:I.shape[1],:] = I
  return I_out, I, SZ