diff --git a/docs/commands.md b/docs/commands.md index e303fc644..e48ca36fa 100755 --- a/docs/commands.md +++ b/docs/commands.md @@ -8,19 +8,160 @@ Equivalently, running the library can be done via the `lmms_eval` entrypoint at This mode supports a number of command-line arguments, the details of which can be also be seen via running with `-h` or `--help`: -* `--model` : Selects which model type or provider is evaluated. Must be a mdoels registered under lmms_eval/models. For example, `--model qwen_vl` or `--model llava`. +- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#model-apis-and-inference-servers) for a full list of enabled model names and supported libraries or APIs. * `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example `--model_args pretrained=liuhaotian/llava-v1.5-7b,batch_size=1`. For a full list of what keyword arguments, see the initialization of the corresponding model class in `lmms_eval/models/`. * `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. You can use `--tasks list` to see all the available tasks. If you add your own tasks but not shown on the list, you can try to set `--verbosity=DEBUG` to view the error message. You can also use `--tasks list_with_num` to check every tasks and the number of question each task contains. However, `list_with_num` will download all the available datasets and may require lots of memory and time. -* `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length. +- `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer. -* `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well. +- `--gen_kwargs` : takes an arg string in same format as `--model_args` and creates a dictionary of keyword arguments. These will be passed to the models for all called `generate_until` (free-form or greedy generation task) tasks, to set options such as the sampling temperature or `top_p` / `top_k`. For a list of what args are supported for each model type, reference the respective library's documentation (for example, the documentation for `transformers.AutoModelForCausalLM.generate()`.) These kwargs will be applied to all `generate_until` tasks called--we do not currently support unique gen_kwargs or batch_size values per task in a single run of the library. To control these on a per-task level, set them in that task's YAML file. -* `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. 
Must be used with `--output_path`. +- `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length. -* `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models. +- `--max_batch_size` : Sets the maximum batch size to try to fit in memory, if `--batch_size auto` is passed. + +- `--device` : Sets which device to place the model onto. Must be a string, for example, `"cuda", "cuda:0", "cpu", "mps"`. Defaults to "cuda", and can be ignored if running multi-GPU or running a non-local model type. + +- `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well. + +- `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`. + +- `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models. + +- `--use_cache` : Should be a path where a sqlite db file can be written to. Takes a string of format `/path/to/sqlite_cache_` in order to create a cache db at `/path/to/sqlite_cache_rank{i}.db` for each process (0-NUM_GPUS). This allows results of prior runs to be cached, so that there is no need to re-run results in order to re-score or re-run a given (model, task) pair again. + +- `--cache_requests` : Can be "true", "refresh", or "delete". "true" means that the cache should be used. "refresh" means that you wish to regenerate the cache, which you should run if you change your dataset configuration for a given task. "delete" will delete the cache. Cached files are stored under lm_eval/cache/.cache unless you specify a different path via the environment variable: `LM_HARNESS_CACHE_PATH`. e.g. `LM_HARNESS_CACHE_PATH=~/Documents/cache_for_lm_harness`. + +- `--check_integrity` : If this flag is used, the library tests for each task selected are run to confirm task integrity. + +- `--write_out` : Used for diagnostic purposes to observe the format of task documents passed to a model. If this flag is used, then prints the prompt and gold target string for the first document of each task. + +- `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (non-default settings the task YAML file) for each task which was run, at the completion of an evaluation. Useful for when one is modifying a task's configuration YAML locally to transmit the exact configurations used for debugging or for reproducibility purposes. + +- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. 
Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`. + +- `--system_instruction`: Specifies a system instruction string to prepend to the prompt. + +- `--apply_chat_template` : This flag specifies whether to apply a chat template to the prompt. It can be used in the following ways: + - `--apply_chat_template` : When used without an argument, applies the only available chat template to the prompt. For Hugging Face models, if no dedicated chat template exists, the default chat template will be applied. + - `--apply_chat_template template_name` : If the model has multiple chat templates, apply the specified template to the prompt. + + For Hugging Face models, the default chat template can be found in the [`default_chat_template`](https://github.com/huggingface/transformers/blob/fc35907f95459d7a6c5281dfadd680b6f7b620e3/src/transformers/tokenization_utils_base.py#L1912) property of the Transformers Tokenizer. + +- `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on. + +- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results. + +* `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42. + +* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run``` + +* `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments: + * `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token, + * `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`, + * `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`, + * `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`, + * `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`, + * `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set, + * `public_repo` - whether the repository is public, can be `True` or `False`, + * `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`. + * `point_of_contact` - Point of contact for the results dataset, e.g., `yourname@example.com`. + * `gated` - whether to gate the details dataset, can be `True` or `False`. 
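+
+Several of the flags above (`--model_args`, `--gen_kwargs`, `--wandb_args`, `--hf_hub_log_args`) accept a comma-separated `key=value` string. As a rough sketch of how such a string maps onto keyword arguments, the snippet below uses a hypothetical `parse_kv_string` helper (not the library's own parser) and ignores values that themselves contain commas or brackets:
+
+```python
+import ast
+
+
+def parse_kv_string(arg_string: str) -> dict:
+    """Parse 'arg1=val1,arg2=val2' into a dict of keyword arguments."""
+    kwargs = {}
+    for pair in filter(None, arg_string.split(",")):
+        key, _, value = pair.partition("=")
+        try:
+            # coerce simple literals such as 1, 0.5, True, None
+            kwargs[key.strip()] = ast.literal_eval(value)
+        except (ValueError, SyntaxError):
+            # anything else (model paths, dtype names, ...) stays a string
+            kwargs[key.strip()] = value
+    return kwargs
+
+
+# e.g. --model_args pretrained=liuhaotian/llava-v1.5-7b,batch_size=1
+print(parse_kv_string("pretrained=liuhaotian/llava-v1.5-7b,batch_size=1"))
+# {'pretrained': 'liuhaotian/llava-v1.5-7b', 'batch_size': 1}
+```
+
+In practice, quote the whole argument string on the shell so that commas and equals signs are passed through to the program unmodified.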
+
+## External Library Usage
+
+We also support using the library's external API for use within model training loops or other scripts.
+
+`lmms_eval` supplies two functions for external import and use: `lmms_eval.evaluate()` and `lmms_eval.simple_evaluate()`.
+
+`simple_evaluate()` can be used by simply creating an `lmms_eval.api.model.LM` subclass that implements the methods described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs/model_guide.md), and wrapping your custom model in that class as follows:
+
+```python
+import lmms_eval
+...
+
+my_model = initialize_my_model()  # create your model (could be running finetuning with some custom modeling code)
+...
+# instantiate an LM subclass that takes your initialized model and can run
+# - `Your_LM.loglikelihood()`
+# - `Your_LM.loglikelihood_rolling()`
+# - `Your_LM.generate_until()`
+lm_obj = Your_LM(model=my_model, batch_size=16)
+
+# indexes all tasks from the `lmms_eval/tasks` subdirectory.
+# Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
+# to include a set of tasks in a separate directory.
+task_manager = lmms_eval.tasks.TaskManager()
+
+# Setting `task_manager` to the one above is optional and should generally be done
+# if you want to include tasks from paths other than ones in `lmms_eval/tasks`.
+# `simple_evaluate` will instantiate its own task_manager if it is set to None here.
+results = lmms_eval.simple_evaluate(  # call simple_evaluate
+    model=lm_obj,
+    tasks=["taskname1", "taskname2"],
+    num_fewshot=0,
+    task_manager=task_manager,
+    ...
+)
+```
+
+See the `simple_evaluate()` and `evaluate()` functions in [lmms_eval/evaluator.py](../lmms_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to `simple_evaluate` share the same role as the command-line flags described previously.
+
+Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling, simplification, and abstraction provided by `simple_evaluate()`.
+
+As a brief example usage of `evaluate()`:
+
+```python
+import lmms_eval
+
+# suppose you've defined a custom `lmms_eval.api.task.Task` subclass in your own external codebase
+from my_tasks import MyTask1
+...
+
+# create your model (could be running finetuning with some custom modeling code)
+my_model = initialize_my_model()
+...
+
+# instantiate an LM subclass that takes your initialized model and can run
+# - `Your_LM.loglikelihood()`
+# - `Your_LM.loglikelihood_rolling()`
+# - `Your_LM.generate_until()`
+lm_obj = Your_LM(model=my_model, batch_size=16)
+
+# optional: the task_manager indexes tasks including ones
+# specified by the user through `include_path`.
+task_manager = lmms_eval.tasks.TaskManager(
+    include_path="/path/to/custom/yaml"
+)
+
+# To get a task dict for `evaluate`
+task_dict = lmms_eval.tasks.get_task_dict(
+    [
+        "mmlu",  # A stock task
+        "my_custom_task",  # A custom task
+        {
+            "task": ...,  # A dict that configures a task
+            "doc_to_text": ...,
+        },
+        MyTask1,  # A task object from `lmms_eval.api.task.Task`
+    ],
+    task_manager,  # A task manager that allows lmms_eval to
+                   # load the task during evaluation.
+                   # If none is provided, `get_task_dict`
+                   # will instantiate one itself, but this
+                   # only includes the stock tasks so users
+                   # will need to set this if including
+                   # custom paths is required.
+)
+
+results = lmms_eval.evaluate(
+    lm=lm_obj,
+    task_dict=task_dict,
+    ...
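+    # any remaining keyword arguments (for example `limit`, `bootstrap_iters`, or
+    # `log_samples`; check the `evaluate()` signature in lmms_eval/evaluator.py for
+    # the exact names supported by your version)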
+) +``` ## Usage with SRT API diff --git a/lmms_eval/evaluator.py b/lmms_eval/evaluator.py index a4c67974d..97cc994ee 100755 --- a/lmms_eval/evaluator.py +++ b/lmms_eval/evaluator.py @@ -35,6 +35,8 @@ create_iterator, get_datetime_str, get_git_commit_hash, + handle_non_serializable, + hash_string, make_table, positional_deprecated, run_task_tests, @@ -370,6 +372,7 @@ def evaluate( for task_output in eval_tasks: task: Task = task_output.task task_name = task_output.task_name + task.args = cli_args name_to_task[task_name] = task @@ -453,327 +456,159 @@ def evaluate( if lm.world_size > 1: lm.accelerator.wait_for_everyone() - ### Collect values of metrics on all datapoints ### - metrics_info = collections.defaultdict(list) + RANK = lm.rank + WORLD_SIZE = lm.world_size ### Postprocess outputs ### # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately) for task_output in eval_tasks: task = task_output.task - task_name = task_output.task_name task.apply_filters() + ### Collect values of metrics on all datapoints ### + # # unpack results and sort back in order and return control to Task # TODO: make it possible to use a different metric per filter + # Pre-process task.instances to group by doc_id + instances_by_doc_id = collections.defaultdict(list) + for instance in task.instances: + instances_by_doc_id[instance.doc_id].append(instance) + # Sort instances within each group + for instances in instances_by_doc_id.values(): + instances.sort(key=lambda x: x.idx) # iterate over different filters used - for key in task.instances[0].filtered_resps.keys(): - # hack: remove image columns to speed avoid loading images and speed up postprocessing - # reason: doc_iterator will actually load image if it's in the doc. - docs = task.test_docs() if task.has_test_docs() else task.validation_docs() - if not task.config["process_results_use_image"]: - remove_cols = [] - features = docs.features - # If it is an Image instance or a Sequence of Image instance. 
Remove it - for feature in features: - if isinstance(features[feature], Image): - remove_cols.append(feature) - elif isinstance(features[feature], Sequence) and isinstance(features[feature].feature, Image): - remove_cols.append(feature) - if remove_cols: - docs = docs.remove_columns(remove_cols) - - ####################### Processing with Full Docs Mode ####################### - full_docs = task.config["full_docs"] - - doc_iterator = itertools.islice(enumerate(docs), lm.rank, limit, lm.world_size) - # Instead of converting the iterator to a list, use `itertools.tee` to create a parallel iterator for counting - # doc_iterator, doc_iterator_for_counting = itertools.tee(doc_iterator) - # Don't use above one, this would crash if doc_iterator_for_counting contains too many objects and very slow - doc_iterator_for_counting = itertools.islice(range(len(task.test_docs())), lm.rank, limit, lm.world_size) if task.has_test_docs() else itertools.islice(range(len(task.validation_docs())), lm.rank, limit, lm.world_size) - total_docs = sum(1 for _ in doc_iterator_for_counting) - pbar = tqdm(total=total_docs, desc=f"Postprocessing", disable=(lm.rank != 0)) + for filter_key in task.instances[0].filtered_resps.keys(): + doc_iterator = task.doc_iterator(rank=RANK, limit=limit, world_size=WORLD_SIZE) for doc_id, doc in doc_iterator: - # subset instances to only this document id ; sort by idx - requests = list(filter(lambda x: x.doc_id == doc_id, task.instances)) - requests.sort(key=lambda x: x.idx) - if full_docs: - metrics = task.process_results(doc, [req.filtered_resps[key] for req in requests], full_docs=docs) - else: - metrics = task.process_results(doc, [req.filtered_resps[key] for req in requests]) + requests = instances_by_doc_id[doc_id] + metrics = task.process_results(doc, [req.filtered_resps[filter_key] for req in requests]) if log_samples: target = task.doc_to_target(doc) + saved_doc = {key: value for key, value in doc.items() if "image" not in key} + filtered_arguments = [] + for req in requests: + # check if req.args is a list of tuples, and each item in the list is a serializable object + for value in req.args: + if isinstance(value, (str, int, float, bool, list, dict, type(None))): + filtered_arguments.append(value) + # else: + # filtered_arguments.append(_handle_non_serializable(value)) + example = { "doc_id": doc_id, + "doc": saved_doc, "target": target, - "doc": doc, - "arguments": [tuple(a for a in req.args if isinstance(a, (int, str))) for req in requests], # do not include image + "arguments": filtered_arguments, "resps": [req.resps for req in requests], - "filtered_resps": [req.filtered_resps[key] for req in requests], + "filtered_resps": [req.filtered_resps[filter_key] for req in requests], + "doc_hash": hash_string( + json.dumps( + requests[0].doc, + indent=2, + default=handle_non_serializable, + ensure_ascii=False, + ) + ), + "prompt_hash": hash_string(requests[0].arguments[0]), + "target_hash": hash_string(str(target)), } example.update(metrics) - samples[task_name].append(example) - + task_output.logged_samples.append(example) for metric, value in metrics.items(): - metrics_info[(task_name, key, metric)].append(value) - - pbar.update(1) + task_output.sample_metrics[(metric, filter_key)].append(value) - pbar.close() - - if lm.world_size > 1: - # if multigpu, then gather data across all ranks + if WORLD_SIZE > 1: + # if multigpu, then gather data across all ranks to rank 0 # first gather logged samples across all ranks - for task_name, task_samples in list(samples.items()): - 
full_samples = [None] * lm.world_size - torch.distributed.all_gather_object(full_samples, task_samples) - samples[task_name] = list(itertools.chain.from_iterable(full_samples)) - # then collect metrics across all ranks - metrics_info_torch = collections.defaultdict(list) - for (task_name, key, metric), items in metrics_info.items(): - numitem = 0 - if type(items[0]) == tuple: - numitem = len(items[0]) - - if isinstance(items[0], (str, list, dict)): - # handle the string case - gathered_items = [None] * lm.accelerator.num_processes - torch.distributed.all_gather_object(gathered_items, items) - - gathered_item = list(itertools.chain.from_iterable(gathered_items)) - else: - # distributed gather requires all ranks to have same dimensions - # so we pad out with float32 min value - pad_value = torch.finfo(torch.float32).min - metrics_tensor = torch.tensor(items, device=lm.device) - - original_dtype = metrics_tensor.dtype # store original dtype - torch_device_tensor = lm.accelerator.pad_across_processes(metrics_tensor.to(torch.float32), pad_index=pad_value) - gathered_item = lm.accelerator.gather(torch_device_tensor) - - if numitem > 0: - gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value] - else: - gathered_filtered = gathered_item[gathered_item != pad_value] - - gathered_item = gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist() - # reconvert if we were passed a tuple of values - if numitem > 0: - gathered_item = [tuple(g) for g in gathered_item] - - if lm.rank == 0: - metrics_info_torch[(task_name, key, metric)] = gathered_item - - metrics_info = metrics_info_torch - # Ensure all ranks wait for rank 0 to finish aggregation - torch.distributed.barrier() - lm.accelerator.wait_for_everyone() - - # Synchronize processes with a temp file in case the evluation metric requires gpus - # TODO: fix barriers' taking up gpu computation - os.makedirs(cli_args.output_path, exist_ok=True) - if os.path.exists(f"{cli_args.output_path}/rank{int(os.environ.get('RANK', 0))}_metric_eval_done.txt"): - os.remove(f"{cli_args.output_path}/rank{int(os.environ.get('RANK', 0))}_metric_eval_done.txt") - - if lm.rank == 0: - ### Get task ordering for correct sample-wide aggregation - group_to_task = {} - for group in task_hierarchy.keys(): - if group not in task_order: - task_order[group] = 0 - - if len(task_hierarchy[group]) > 0: - group_to_task[group] = task_hierarchy[group].copy() - - for task in task_hierarchy[group]: - if task in task_order: - task_order[task] += 1 - else: - task_order[task] = 1 + task_order[group] + for task_output in eval_tasks: + if log_samples: + # for task_name, task_samples in list(samples.items()): + full_samples = [None] * WORLD_SIZE if RANK == 0 else None + per_rank_samples = [] + for sample in task_output.logged_samples: + per_rank_samples.append(sample) + + torch.distributed.gather_object( + obj=per_rank_samples, + object_gather_list=full_samples, + dst=0, + ) - if task in task_hierarchy: - group_to_task[group].remove(task) - group_to_task[group].extend(task_hierarchy[task]) + if RANK == 0: + task_output.logged_samples = list(itertools.chain.from_iterable(full_samples)) - task_to_group = {} - for group in group_to_task: - for task in group_to_task[group]: - if task in task_to_group: - task_to_group[task].append(group) - else: - task_to_group[task] = [group] + # then collect metrics across all ranks + for metrics in task_output.sample_metrics: + metric_list = [None] * WORLD_SIZE if RANK == 0 else None + torch.distributed.gather_object( + 
obj=task_output.sample_metrics[metrics], + object_gather_list=metric_list, + dst=0, + ) + if RANK == 0: + task_output.sample_metrics[metrics] = list(itertools.chain.from_iterable(metric_list)) + if RANK == 0: ### Aggregate results over all datapoints ### # aggregate results ; run bootstrap CIs - for (task_name, key, metric), items in metrics_info.items(): - task = name_to_task[task_name] - metric_key = metric + "," + key - - if type(task) == tuple: - group_name, task = task - else: - group_name = None - - if metric not in task.aggregation(): - continue - - agg_fn = task.aggregation()[metric] - - # Bo: for models that need to know the args to save to correct path - if inspect.getfullargspec(agg_fn).args == ["results", "args"]: - results[task_name][metric_key] = agg_fn(items, cli_args) - else: - # Bo: for models only need agg items - results[task_name][metric_key] = agg_fn(items) - - results[task_name]["samples"] = len(items) - - # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap - # so we run them less iterations. still looking for a cleaner way to do this - if bootstrap_iters > 0: - stderr = lmms_eval.api.metrics.stderr_for_metric( - metric=task.aggregation()[metric], - bootstrap_iters=min(bootstrap_iters, 100) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters, - ) - - if stderr is not None and len(items) > 1: - results[task_name][metric + "_stderr" + "," + key] = stderr(items) - else: - results[task_name][metric + "_stderr" + "," + key] = "N/A" - + for task_output in eval_tasks: + task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) + ( + results, + samples, + configs, + versions, + num_fewshot, + higher_is_better, + ) = consolidate_results(eval_tasks) + + ### Calculate group metrics ### if bool(results): - for group, task_list in reversed(task_hierarchy.items()): - if task_list == []: - total_size = results[group]["samples"] - else: - total_size = 0 - - for task in task_list: - metrics = results[task] - - current_size = metrics.pop("samples") - # TODO: There should be a way for users - # to toggle between weighted and - # unweighted averaging - # For unweighted averaging, use: - # current_size = 1 - - all_stderr = [] - for metric in [key for key in metrics.keys() if "_stderr" not in key]: - stderr = "_stderr,".join(metric.split(",")) - stderr_score = results[task][stderr] - var_score = stderr_score**2 if stderr_score != "N/A" else 0 - metric_score = results[task][metric] - - all_stderr.append(stderr) - - if metric_score is None: - results[group][metric] = None - results[group][stderr] = 0 - continue - - if metric in results[group]: - if isinstance(results[group][metric], str) == False: - results[group][metric] = (results[group][metric] * total_size + metric_score * current_size) / (total_size + current_size) - # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$ - results[group][stderr] = ((total_size - 1) * results[group][stderr] + (current_size - 1) * var_score) / (total_size + current_size - 1) + total_size * current_size / ( - (total_size + current_size) * (total_size + current_size - 1) - ) * (results[group][metric] - metric_score) ** 2 - else: - # accuracy = re.search(r'acc: ([\d.]+)%', results[group][metric]).group(1) - # score = re.search(r'score: ([\d.]+)', results[group][metric]).group(1) - # group_accuracy = float(accuracy) - # group_score = float(score) - # group_accuracy = (group_accuracy * total_size + metric_score * current_size) / total_size - # group_score = (group_score * 
total_size + metric_score * current_size) / total_size - # results[group][metric] = "Acc: " + str(group_accuracy) + " Score: " + str(group_score) - results[group][metric] = "group_results" - results[group][stderr] = 0 - else: - results[group][metric] = metric_score - results[group][stderr] = var_score - - total_size += current_size - - for stderr in all_stderr: - results[group][stderr] = np.sqrt(results[group][stderr]) - - results[group]["samples"] = total_size - - def print_tasks(task_hierarchy, task_order, task_version, task_group_alias): - results_agg = collections.defaultdict(dict) - groups_agg = collections.defaultdict(dict) - for group_name, task_list in task_hierarchy.items(): - order = task_order[group_name] - results_agg[group_name] = results[group_name].copy() - results_agg[group_name]["tab"] = order - - if (order < max(task_order.values())) and (len(task_list) > 0): - groups_agg[group_name] = results[group_name].copy() - groups_agg[group_name]["tab"] = order - - if task_list != []: - for task in sorted(task_list): - if task in task_hierarchy: - _task_hierarchy = {task: task_hierarchy[task]} - else: - _task_hierarchy = {task: []} - - _results_agg, _groups_agg, task_version = print_tasks(_task_hierarchy, task_order, task_version, task_group_alias) - - results_agg = {**results_agg, **_results_agg} - groups_agg = {**groups_agg, **_groups_agg} - - return results_agg, groups_agg, task_version - - results_agg, groups_agg, versions = print_tasks(task_hierarchy, task_order, versions, task_group_alias) - - for task in results_agg: - task_results = results_agg[task] - - if "samples" in task_results: - task_results.pop("samples") - - tab_string = "" - if "tab" in task_results: - tab = task_results.pop("tab") - tab_string = " " * tab + "- " if tab > 0 else "" - - if task in task_group_alias: - task_alias = task_group_alias[task] - results_agg[task]["alias"] = tab_string + task_alias - else: - results_agg[task]["alias"] = tab_string + task - - for group in groups_agg: - group_results = groups_agg[group] - - if "samples" in group_results: - group_results.pop("samples") - - tab_string = "" - if "tab" in group_results: - tab = group_results.pop("tab") - tab_string = " " * tab + "- " if tab > 0 else "" - - if group in task_group_alias: - group_alias = task_group_alias[group] - groups_agg[group]["alias"] = tab_string + group_alias - else: - groups_agg[group]["alias"] = tab_string + group - - for group_name, task_list in task_hierarchy.items(): - if task_list != []: - num_fewshot[group_name] = num_fewshot[task_list[0]] + results, versions, show_group_table, *_ = consolidate_group_results(results, versions, task_dict) + + results_agg, group_agg = prepare_print_tasks(task_dict, results) + subtask_list = get_subtask_list(task_dict) + + # collect all higher_is_better values for metrics + # in the group's subtasks. + # TODO: clean this up ; unify with the below metric_list loop? + _higher_is_better = {} + for group, task_list in subtask_list.items(): + if len(task_list) != 0: # subtask list will list "task_name": [] for solo tasks + for task in task_list: + for m, h in higher_is_better[task].items(): + if m not in _higher_is_better.keys(): + _higher_is_better[m] = h + + if m in _higher_is_better and _higher_is_better[m] is not None and _higher_is_better[m] != h: + eval_logger.warning(f"Higher_is_better values for metric {m} in group {group} are not consistent. 
Defaulting to None.") + _higher_is_better[m] = None + higher_is_better[group] = _higher_is_better results_dict = { "results": dict(results_agg.items()), - **({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}), + **({"groups": dict(group_agg.items())} if (bool(group_agg) & show_group_table) else {}), + "group_subtasks": dict(reversed(subtask_list.items())), "configs": dict(sorted(configs.items())), "versions": dict(sorted(versions.items())), "n-shot": dict(sorted(num_fewshot.items())), + "higher_is_better": dict(sorted(higher_is_better.items())), + "n-samples": { + task_output.task_name: { + "original": len(task_output.task.eval_docs), + "effective": min( + limit if limit else len(task_output.task.eval_docs), + len(task_output.task.eval_docs), + ), + } + for task_output in eval_tasks + }, } if log_samples: results_dict["samples"] = dict(samples) + + return results_dict + else: - results_dict = None + return None with open(f"{cli_args.output_path}/rank{int(os.environ.get('RANK', 0))}_metric_eval_done.txt", "w") as f: f.write(f"rank {int(os.environ.get('RANK', 0))} eval done") diff --git a/lmms_eval/evaluator_utils.py b/lmms_eval/evaluator_utils.py index 50b58ebee..48b5c9780 100644 --- a/lmms_eval/evaluator_utils.py +++ b/lmms_eval/evaluator_utils.py @@ -1,5 +1,6 @@ # credit to https://github.com/EleutherAI/lm-evaluation-harness import collections +import inspect import math import pathlib import sys @@ -67,6 +68,7 @@ def __init__( self.sample_len = None self.sample_metrics = collections.defaultdict(list) self.agg_metrics = collections.defaultdict(list) + self.args = None @classmethod def from_taskdict(cls, task_name: str, task): @@ -103,18 +105,22 @@ def from_taskdict(cls, task_name: str, task): def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None: for (metric, filter_key), items in self.sample_metrics.items(): - agg_fn = self.task.aggregation()[metric] - metric_key = f"{metric},{filter_key}" - self.agg_metrics[metric_key] = agg_fn(items) - self.sample_len = len(items) # TODO: same sample size for each metric? - if isinstance(bootstrap_iters, int): - stderr_fn = stderr_for_metric( - metric=agg_fn, - bootstrap_iters=min(bootstrap_iters, 100) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters, - ) - self.agg_metrics[f"{metric}_stderr,{filter_key}"] = stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A" - else: - raise ValueError(f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations.") + if metric in self.task.aggregation(): + agg_fn = self.task.aggregation()[metric] + metric_key = f"{metric},{filter_key}" + if "args" in inspect.signature(agg_fn).parameters: + self.agg_metrics[metric_key] = agg_fn(items, args=self.task.args) + else: + self.agg_metrics[metric_key] = agg_fn(items) + self.sample_len = len(items) # TODO: same sample size for each metric? + if isinstance(bootstrap_iters, int): + stderr_fn = stderr_for_metric( + metric=agg_fn, + bootstrap_iters=min(bootstrap_iters, 100) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters, + ) + self.agg_metrics[f"{metric}_stderr,{filter_key}"] = stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A" + else: + raise ValueError(f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. 
Set to 0 to turn off stderr calculations.") def __repr__(self): return f"TaskOutput(task_name={self.task_name}, " f"group_name={self.group_name}, " f"version={self.version}, " f"n_shot={self.n_shot}, " f"task_alias={self.task_alias}, " f"group_alias={self.group_alias})" diff --git a/lmms_eval/tasks/__init__.py b/lmms_eval/tasks/__init__.py index be368c55d..3749248ed 100755 --- a/lmms_eval/tasks/__init__.py +++ b/lmms_eval/tasks/__init__.py @@ -450,7 +450,7 @@ def _get_task_and_group(self, task_dir: str): if attr in config: if attr == "group" and print_info: self.logger.debug( - "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. " + "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lmms-eval. " "`tag` will be used to allow to call a collection of tasks just like `group`. " "`group` will be removed in order to not cause confusion with the new ConfigurableGroup " "which will be the offical way to create groups with addition of group-wide configuations." @@ -470,7 +470,7 @@ def _get_task_and_group(self, task_dir: str): "yaml_path": -1, } elif tasks_and_groups[tag]["type"] != "tag": - self.logger.info(f"The tag {tag} is already registered as a group, this tag will not be registered. " "This may affect tasks you want to call.") + self.logger.warning(f"The tag {tag} is already registered as a group, this tag will not be registered. " "This may affect tasks you want to call.") break else: tasks_and_groups[tag]["task"].append(task) diff --git a/lmms_eval/utils.py b/lmms_eval/utils.py index 4cbbc39ea..17324c8b3 100755 --- a/lmms_eval/utils.py +++ b/lmms_eval/utils.py @@ -40,6 +40,10 @@ from loguru import logger as eval_logger SPACING = " " * 47 +HIGHER_IS_BETTER_SYMBOLS = { + True: "↑", + False: "↓", +} def is_json(string): @@ -434,7 +438,7 @@ def get_original(self, grouped_dict): return res -def make_table(result_dict, column: str = "results"): +def make_table(result_dict, column: str = "results", sort_results: bool = False): """Generate table of results.""" from pytablewriter import LatexTableWriter, MarkdownTableWriter @@ -449,6 +453,7 @@ def make_table(result_dict, column: str = "results"): "Filter", "n-shot", "Metric", + "", "Value", "", "Stderr", @@ -459,54 +464,53 @@ def make_table(result_dict, column: str = "results"): md_writer.headers = all_headers latex_writer.headers = all_headers - # Set column alignments for LaTeX - latex_writer.column_alignments = ["center"] * len(all_headers) - - # Set padding for LaTeX columns (this will add space between columns) - latex_writer.column_format = " ".join(["|c"] * len(all_headers)) + "|" - values = [] - for k, dic in result_dict[column].items(): - version = result_dict["versions"][k] - n = str(result_dict["n-shot"][k]) + keys = result_dict[column].keys() + if sort_results: + # sort entries alphabetically by task or group name. + # NOTE: we default here to false, because order matters for multi-level table printing a la mmlu. 
+ # sorting here would mess that up + keys = sorted(keys) + for k in keys: + dic = result_dict[column][k] + version = result_dict["versions"].get(k, " N/A") + n = str(result_dict.get("n-shot", " ").get(k, " ")) + higher_is_better = result_dict.get("higher_is_better", {}).get(k, {}) if "alias" in dic: k = dic.pop("alias") - for (mf), v in dic.items(): + metric_items = dic.items() + metric_items = sorted(metric_items) + + for (mf), v in metric_items: m, _, f = mf.partition(",") if m.endswith("_stderr"): continue - points = "N/A" - if v is not None: - if isinstance(v, str): - points = v - else: - # if 0 <= v <= 1: - # # v *= 100 - points = "%.4f" % v + hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "") + + v = "%.4f" % v if isinstance(v, float) else v + if v == "" or v is None: + v = "N/A" if m + "_stderr" + "," + f in dic: - if v is None: - se = "N/A" - else: - se = dic[m + "_stderr" + "," + f] - if se != "N/A": - se = "%.4f" % se - values.append([k, version, f, n, m, points, "±", se]) + # if dic[m + "_stderr" + "," + f] != []: + se = dic[m + "_stderr" + "," + f] + se = " N/A" if se == "N/A" or se == [] else "%.4f" % se + if v != []: + values.append([k, version, f, n, m, hib, v, "±", se]) else: - values.append([k, version, f, n, m, points, "", ""]) - k = "" - version = "" + values.append([k, version, f, n, m, hib, v, "", ""]) + # k = "" + # version = "" md_writer.value_matrix = values latex_writer.value_matrix = values - # Print LaTeX table to see how it looks + # todo: make latex table look good # print(latex_writer.dumps()) - # Return Markdown table (note: column width and text alignment may not be supported) return md_writer.dumps() diff --git a/pyproject.toml b/pyproject.toml index f9c7079d8..155b85d04 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,9 +7,9 @@ build-backend = "setuptools.build_meta" [project] name = "lmms_eval" -version = "0.2.1" +version = "0.2.3" authors = [ - { name = "LMMMs-Lab Evaluation Team", email = "lmms_eval@outlook.com" }, + { name = "LMMMs-Lab Evaluation Team", email = "lmms-lab@outlook.com" }, ] description = "A framework for evaluating large multi-modality language models" readme = "README.md" @@ -45,7 +45,6 @@ dependencies = [ "opencv-python-headless", "av", "hf_transfer", - "pywsd", "nltk", "sentencepiece==0.1.99", "yt-dlp", @@ -61,7 +60,6 @@ dependencies = [ "Jinja2", "openpyxl", "loguru", - "Levenshtein", "hf_transfer", "tenacity==8.3.0", "wandb>=0.16.0", @@ -71,15 +69,18 @@ dependencies = [ "packaging", "decord", "zss", + "protobuf==3.20", +] + +[project.optional-dependencies] +metrics = [ "pywsd", "spacy", "anls", "rouge", "capture_metric", - "protobuf==3.20", + "Levenshtein", ] - -[project.optional-dependencies] vila = [ "s2wrapper@git+https://github.com/bfshi/scaling_on_scales" ] @@ -138,4 +139,4 @@ lmms-eval = "lmms_eval.__main__:cli_evaluate" [project.urls] Homepage = "https://lmms-lab.github.io" -Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval" \ No newline at end of file +Repository = "https://github.com/EvolvingLMMs-Lab/lmms-eval"