Subset.as_dataframe() hides targets instead of raising an error (#258)

* `Subset.as_dataframe()` hides targets instead of raising an error. Resolves #257.
* Add tests for the new dataframe behavior (review comment: #258 (comment)).
polaris-hub · Jan 31, 2025 · fe3bff3 · fe3bff3
1 parent 6747c58
commit fe3bff3
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 6 deletions.
diff --git a/polaris/dataset/_subset.py b/polaris/dataset/_subset.py
@@ -200,16 +200,19 @@ def as_dataframe(self) -> pd.DataFrame:
             This method loads the entire dataset in memory.
         """
         # Create an empty dataframe
-        cols = self.input_cols + self.target_cols
+        cols = self.input_cols
+        if not self._hide_targets:
+            cols += self.target_cols
         df = pd.DataFrame(columns=cols)
 
         # Fill the dataframe
-        targets = self.targets
-        if not self.is_multi_task:
-            targets = {self.target_cols[0]: targets}
+        if not self._hide_targets:
+            targets = self.targets
+            if not self.is_multi_task:
+                targets = {self.target_cols[0]: targets}
 
-        for k in targets:
-            df[k] = targets[k]
+            for k in targets:
+                df[k] = targets[k]
 
         inputs = self.inputs
         if not self.is_multi_input:

diff --git a/tests/test_subset.py b/tests/test_subset.py
@@ -1,5 +1,6 @@
 import datamol as dm
 import numpy as np
+import pandas as pd
 import pytest
 
 from polaris.dataset import Subset
@@ -52,6 +53,14 @@ def test_access_to_test_set(test_single_task_benchmark):
     assert all(isinstance(y, float) for x, y in train)
     assert all(isinstance(train[i][1], float) for i in range(len(train)))
 
+    # as_dataframe should work for both, but contain no targets for test
+    train_df = train.as_dataframe()
+    assert isinstance(train_df, pd.DataFrame)
+    assert "expt" in train_df.columns
+    test_df = test.as_dataframe()
+    assert isinstance(test_df, pd.DataFrame)
+    assert "expt" not in test_df.columns
+
 
 def test_input_featurization(test_single_task_benchmark):
     # Without a transformation, we expect a SMILES string