diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 49b3eab3..b349f00a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,7 +25,7 @@ repos:
         additional_dependencies: [tomli]
         args: [--in-place, --config, ./pyproject.toml]
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.18.0
+    rev: v3.16.0
     hooks:
       - id: pyupgrade
   - repo: https://github.com/hadialqattan/pycln
diff --git a/src/qibotn/backends/cutensornet.py b/src/qibotn/backends/cutensornet.py
index 1d385207..d124b617 100644
--- a/src/qibotn/backends/cutensornet.py
+++ b/src/qibotn/backends/cutensornet.py
@@ -12,6 +12,7 @@ class CuTensorNet(NumpyBackend):  # pragma: no cover

     def __init__(self, runcard):
         super().__init__()
+        from cuquantum import cudaDataType, ComputeType, __version__  # pylint: disable=import-error
         from cuquantum import cutensornet as cutn  # pylint: disable=import-error

         if runcard is not None:
@@ -58,22 +59,21 @@ def __init__(self, runcard):
             self.expectation_enabled = False

         self.name = "qibotn"
-        self.cuquantum = cuquantum
         self.cutn = cutn
         self.platform = "cutensornet"
-        self.versions["cuquantum"] = self.cuquantum.__version__
+        self.versions["cuquantum"] = __version__
         self.supports_multigpu = True
         self.handle = self.cutn.create()

         global CUDA_TYPES
         CUDA_TYPES = {
             "complex64": (
-                self.cuquantum.cudaDataType.CUDA_C_32F,
-                self.cuquantum.ComputeType.COMPUTE_32F,
+                cudaDataType.CUDA_C_32F,
+                ComputeType.COMPUTE_32F,
             ),
             "complex128": (
-                self.cuquantum.cudaDataType.CUDA_C_64F,
-                self.cuquantum.ComputeType.COMPUTE_64F,
+                cudaDataType.CUDA_C_64F,
+                ComputeType.COMPUTE_64F,
             ),
         }
diff --git a/src/qibotn/eval.py b/src/qibotn/eval.py
index 245aa5ea..ebef1039 100644
--- a/src/qibotn/eval.py
+++ b/src/qibotn/eval.py
@@ -62,6 +62,7 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
         Dense vector of quantum circuit.
     """
+    import cuquantum.cutensornet as cutn
     from cuquantum import Network
     from mpi4py import MPI
@@ -71,21 +72,30 @@ def dense_vector_tn_MPI(qibo_circ, datatype, n_samples=8):
     size = comm.Get_size()

     device_id = rank % getDeviceCount()
+    cp.cuda.Device(device_id).use()

     # Perform circuit conversion
-    myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)
+    if rank == 0:
+        myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype)

-    operands = myconvertor.state_vector_operands()
+        operands = myconvertor.state_vector_operands()
+    else:
+        operands = None

-    # Assign the device for each process.
-    device_id = rank % getDeviceCount()
+    operands = comm.bcast(operands, root=0)

     # Create network object.
     network = Network(*operands, options={"device_id": device_id})

     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
+        optimize={
+            "samples": n_samples,
+            "slicing": {
+                "min_slices": max(32, size),
+                "memory_model": cutn.MemoryModel.CUTENSOR,
+            },
+        }
     )

     # Select the best path from all ranks.
@@ -136,6 +146,7 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8):

     Returns:
         Dense vector of quantum circuit.
""" + import cuquantum.cutensornet as cutn from cupy.cuda import nccl from cuquantum import Network from mpi4py import MPI @@ -155,14 +166,25 @@ def dense_vector_tn_nccl(qibo_circ, datatype, n_samples=8): comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank) # Perform circuit conversion - myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - operands = myconvertor.state_vector_operands() + if rank == 0: + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + operands = myconvertor.state_vector_operands() + else: + operands = None + + operands = comm_mpi.bcast(operands, root) network = Network(*operands) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. path, info = network.contract_path( - optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}} + optimize={ + "samples": n_samples, + "slicing": { + "min_slices": max(32, size), + "memory_model": cutn.MemoryModel.CUTENSOR, + }, + } ) # Select the best path from all ranks. @@ -226,6 +248,7 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl Returns: Expectation of quantum circuit due to pauli string. """ + import cuquantum.cutensornet as cutn from cupy.cuda import nccl from cuquantum import Network from mpi4py import MPI @@ -245,16 +268,28 @@ def expectation_pauli_tn_nccl(qibo_circ, datatype, pauli_string_pattern, n_sampl comm_nccl = nccl.NcclCommunicator(size, nccl_id, rank) # Perform circuit conversion - myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - operands = myconvertor.expectation_operands( - pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) - ) + if rank == 0: + + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + operands = myconvertor.expectation_operands( + pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) + ) + else: + operands = None + + operands = comm_mpi.bcast(operands, root) network = Network(*operands) # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction. path, info = network.contract_path( - optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}} + optimize={ + "samples": n_samples, + "slicing": { + "min_slices": max(32, size), + "memory_model": cutn.MemoryModel.CUTENSOR, + }, + } ) # Select the best path from all ranks. @@ -318,6 +353,7 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample Returns: Expectation of quantum circuit due to pauli string. """ + import cuquantum.cutensornet as cutn from cuquantum import Network from mpi4py import MPI # this line initializes MPI @@ -326,24 +362,34 @@ def expectation_pauli_tn_MPI(qibo_circ, datatype, pauli_string_pattern, n_sample rank = comm.Get_rank() size = comm.Get_size() + # Assign the device for each process. device_id = rank % getDeviceCount() + cp.cuda.Device(device_id).use() # Perform circuit conversion - myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) + if rank == 0: + myconvertor = QiboCircuitToEinsum(qibo_circ, dtype=datatype) - operands = myconvertor.expectation_operands( - pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) - ) + operands = myconvertor.expectation_operands( + pauli_string_gen(qibo_circ.nqubits, pauli_string_pattern) + ) + else: + operands = None - # Assign the device for each process. - device_id = rank % getDeviceCount() + operands = comm.bcast(operands, root) # Create network object. 
     network = Network(*operands, options={"device_id": device_id})

     # Compute the path on all ranks with 8 samples for hyperoptimization. Force slicing to enable parallel contraction.
     path, info = network.contract_path(
-        optimize={"samples": n_samples, "slicing": {"min_slices": max(32, size)}}
+        optimize={
+            "samples": n_samples,
+            "slicing": {
+                "min_slices": max(32, size),
+                "memory_model": cutn.MemoryModel.CUTENSOR,
+            },
+        }
     )

     # Select the best path from all ranks.
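
For reviewers unfamiliar with the pattern these hunks repeat four times, the sketch below pulls the pieces together: pin each rank to a GPU, build the einsum operands on rank 0 only, broadcast them, force enough slices for every rank, and reduce the partial contractions. This is a minimal illustration following cuQuantum's coarse-grained MPI sample, not qibotn's exact code; `operands_factory` is a hypothetical stand-in for `QiboCircuitToEinsum(qibo_circ, dtype=datatype).state_vector_operands()`, and the tail mirrors the unchanged "# Select the best path from all ranks." step.

import cupy as cp
import cuquantum.cutensornet as cutn
from cuquantum import Network
from mpi4py import MPI


def contract_distributed(operands_factory, n_samples=8):
    """Contract a tensor network across MPI ranks, one GPU per rank (sketch)."""
    root = 0
    comm = MPI.COMM_WORLD
    rank, size = comm.Get_rank(), comm.Get_size()

    # Bind this rank to a GPU before any CuPy allocation lands on device 0.
    device_id = rank % cp.cuda.runtime.getDeviceCount()
    cp.cuda.Device(device_id).use()

    # Convert once on the root rank and broadcast, so every rank holds an
    # identical einsum expression and tensor list.
    operands = operands_factory() if rank == root else None
    operands = comm.bcast(operands, root=root)

    network = Network(*operands, options={"device_id": device_id})

    # Hyper-optimize the path on every rank; force at least one slice per
    # rank and size workspaces with the cuTENSOR memory model.
    path, info = network.contract_path(
        optimize={
            "samples": n_samples,
            "slicing": {
                "min_slices": max(32, size),
                "memory_model": cutn.MemoryModel.CUTENSOR,
            },
        }
    )

    # Keep the cheapest path found on any rank and re-plan with it everywhere.
    opt_cost, sender = comm.allreduce(sendobj=(info.opt_cost, rank), op=MPI.MINLOC)
    info = comm.bcast(info, sender)
    path, info = network.contract_path(
        optimize={"path": info.path, "slicing": info.slices}
    )

    # Split the slice range evenly; early ranks absorb the remainder.
    chunk, extra = info.num_slices // size, info.num_slices % size
    begin = rank * chunk + min(rank, extra)
    end = begin + chunk + (1 if rank < extra else 0)

    # Each rank contracts its slices; summing the partial results over the
    # communicator yields the full contraction on every rank.
    partial = network.contract(slices=range(begin, end))
    result = comm.allreduce(sendobj=partial, op=MPI.SUM)
    network.free()
    return result

With 8 ranks, `max(32, size)` resolves to at least 32 slices, so each rank contracts 4 of them and the final allreduce assembles the dense result; with more ranks than 32 slices, `min_slices` grows with `size` so no rank sits idle.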