modin-project · YarShev · Dec 13, 2023 · Dec 13, 2023 · Dec 13, 2023 · anmyachev
@@ -709,11 +709,19 @@
                 return
             new_columns = self._validate_set_axis(new_columns, self._columns_cache)
         if isinstance(self._dtypes, ModinDtypes):
-            new_value = self._dtypes.set_index(new_columns)
-            self.set_dtypes_cache(new_value)
+            try:
+                new_dtypes = self._dtypes.set_index(new_columns)
+            except NotImplementedError:
+                # can raise on duplicated labels
+                new_dtypes = None
         elif isinstance(self._dtypes, pandas.Series):
-            self.dtypes.index = new_columns
+            new_dtypes = self.dtypes.set_axis(new_columns)
+        else:
+            new_dtypes = None
         self.set_columns_cache(new_columns)
+        # we have to set new dtypes cache after columns,
+        # so the 'self.columns' and 'new_dtypes.index' indices would match
+        self.set_dtypes_cache(new_dtypes)
         self.synchronize_labels(axis=1)
 
     columns = property(_get_columns, _set_columns)

@@ -294,6 +294,11 @@ def set_index(
         Calling this method on a descriptor that returns ``None`` for ``.columns_order``
         will result in information loss.
         """
+        if len(new_index) != len(set(new_index)):
+            raise NotImplementedError(
+                "Duplicated column names are not yet supported by DtypesDescriptor"
+            )
+
         if self.columns_order is None:
             # we can't map new columns to old columns and lose all dtypes :(
             return DtypesDescriptor(

@@ -2171,6 +2171,17 @@ def test_set_index_dataframe(self, initial_dtypes, result_dtypes):
                 assert df._dtypes._value.equals(result_dtypes)
         assert df.dtypes.index.equals(pandas.Index(["col1", "col2", "col3"]))
 
+    def test_set_index_with_dupl_labels(self):
+        """Verify that setting duplicated columns doesn't propagate any errors to a user."""
+        df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [3.5, 4.4, 5.5, 6.6]})
+        # making sure that dtypes are represented by an unmaterialized dtypes-descriptor
+        df._query_compiler._modin_frame.set_dtypes_cache(None)
+
+        df.columns = ["a", "a"]
+        assert df.dtypes.equals(
+            pandas.Series([np.dtype(int), np.dtype("float64")], index=["a", "a"])
+        )
+
 
 class TestZeroComputationDtypes:
     """