From 97e2b7327236666bd025bbbbc98eb383683fce17 Mon Sep 17 00:00:00 2001
From: Amy Thompson <52806925+amyjaynethompson@users.noreply.github.com>
Date: Tue, 25 Jun 2024 09:45:15 +0100
Subject: [PATCH] Data id fix (#2681)

Fix the dataset identifiers written to the output JSON file after datasets have been filtered.
---
 newsfragments/2681.bugfix                     |  1 +
 src/dials/algorithms/correlation/analysis.py  |  3 +-
 tests/algorithms/correlation/test_analysis.py | 47 ++++++++++++++++++-
 3 files changed, 49 insertions(+), 2 deletions(-)
 create mode 100644 newsfragments/2681.bugfix

diff --git a/newsfragments/2681.bugfix b/newsfragments/2681.bugfix
new file mode 100644
index 0000000000..4b113a304a
--- /dev/null
+++ b/newsfragments/2681.bugfix
@@ -0,0 +1 @@
+``dials.correlation_matrix``: Correctly select datasets for the output JSON after filtering when used by multiplex.
diff --git a/src/dials/algorithms/correlation/analysis.py b/src/dials/algorithms/correlation/analysis.py
index cf41589e0e..5a50e33f29 100644
--- a/src/dials/algorithms/correlation/analysis.py
+++ b/src/dials/algorithms/correlation/analysis.py
@@ -377,7 +377,8 @@ def convert_to_importable_json(self, linkage_matrix: np.ndarray) -> OrderedDict:
         linkage_mat_as_dict = linkage_matrix_to_dict(linkage_matrix)
         for d in linkage_mat_as_dict.values():
             # Difference in indexing between linkage_mat_as_dict and datasets, so have i-1
-            d["datasets"] = [self.ids_to_identifiers_map[i - 1] for i in d["datasets"]]
+            real_num = [self.labels[i - 1] for i in d["datasets"]]
+            d["datasets"] = [self.ids_to_identifiers_map[i] for i in real_num]
 
         return linkage_mat_as_dict
 
diff --git a/tests/algorithms/correlation/test_analysis.py b/tests/algorithms/correlation/test_analysis.py
index 3f0f2ec9c4..a79944a30b 100644
--- a/tests/algorithms/correlation/test_analysis.py
+++ b/tests/algorithms/correlation/test_analysis.py
@@ -1,7 +1,10 @@
 from __future__ import annotations
 
+import json
 import pathlib
 
+import pytest
+
 from dials.algorithms.correlation.analysis import CorrelationMatrix
 from dials.command_line.correlation_matrix import phil_scope
 from dials.util.multi_dataset_handling import (
@@ -11,7 +14,8 @@
 from dials.util.options import ArgumentParser, reflections_and_experiments_from_files
 
 
-def test_corr_mat(dials_data, run_in_tmp_path):
+@pytest.fixture()
+def proteinase_k(dials_data):
     mcp = dials_data("vmxi_proteinase_k_sweeps", pathlib=True)
     params = phil_scope.extract()
     input_data = []
@@ -41,7 +45,48 @@ def test_corr_mat(dials_data, run_in_tmp_path):
     assert len(experiments) == len(reflections)
     assert len(experiments) > 1
     experiments, reflections = assign_unique_identifiers(experiments, reflections)
+    yield experiments, reflections, params
+
+
+def test_corr_mat(proteinase_k, run_in_tmp_path):
+    experiments, reflections, params = proteinase_k
     matrices = CorrelationMatrix(experiments, reflections, params)
     matrices.calculate_matrices()
     matrices.output_json()
     assert pathlib.Path("dials.correlation_matrix.json").is_file()
+
+
+def test_filtered_corr_mat(proteinase_k, run_in_tmp_path):
+    experiments, reflections, params = proteinase_k
+    ids_to_identifiers_map = {}
+    for table in reflections:
+        ids_to_identifiers_map.update(table.experiment_identifiers())
+
+    # Simulate filtering by multiplex: remove dataset id 2 from every input
+    id_to_remove = [ids_to_identifiers_map[2]]
+    ids_to_identifiers_map.pop(2)
+    reflections.pop(2)
+    experiments.remove_on_experiment_identifiers(id_to_remove)
+
+    matrices = CorrelationMatrix(
+        experiments, reflections, params, ids_to_identifiers_map
+    )
+    matrices.calculate_matrices()
+    matrices.output_json()
+    assert pathlib.Path("dials.correlation_matrix.json").is_file()
+
+    expected_ids = [[1, 3], [0, 1, 3]]
+
+    # Check the clusters report the expected dataset ids after filtering
+    for i, j in zip(matrices.correlation_clusters, expected_ids):
+        assert i.labels == j
+
+    # Check the JSON output lists the experiment identifiers for those ids
+    with open(pathlib.Path("dials.correlation_matrix.json")) as f:
+        data = json.load(f)
+
+    assert len(data["correlation_matrix_clustering"]) == len(expected_ids)
+    for i, j in zip(data["correlation_matrix_clustering"], expected_ids):
+        assert len(data["correlation_matrix_clustering"][i]["datasets"]) == len(j)
+        for a, e in zip(data["correlation_matrix_clustering"][i]["datasets"], j):
+            assert a == ids_to_identifiers_map[e]