I can train normally on cuda:0, but when I try to run on another GPU with the command

fast_reid/tools/train_net.py --config-file ./fast_reid/configs/MOT17/sbs_S50.yml MODEL.DEVICE "cuda:2"

I get the following error:

Traceback (most recent call last):
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/engine/train_loop.py", line 146, in train
    self.run_step()
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/engine/defaults.py", line 359, in run_step
    self._trainer.run_step()
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/engine/train_loop.py", line 346, in run_step
    loss_dict = self.model(data)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/modeling/meta_arch/baseline.py", line 100, in forward
    images = self.preprocess_image(batched_inputs)
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/modeling/meta_arch/baseline.py", line 130, in preprocess_image
    images.sub_(self.pixel_mean).div_(self.pixel_std)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:2!

Traceback (most recent call last):
  File "fast_reid/tools/train_net.py", line 60, in <module>
    args=(args,),
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/engine/launch.py", line 71, in launch
    main_func(*args)
  File "fast_reid/tools/train_net.py", line 47, in main
    return trainer.train()
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/engine/defaults.py", line 350, in train
    super().train(self.start_epoch, self.max_epoch, self.iters_per_epoch)
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/engine/train_loop.py", line 146, in train
    self.run_step()
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/engine/defaults.py", line 359, in run_step
    self._trainer.run_step()
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/engine/train_loop.py", line 346, in run_step
    loss_dict = self.model(data)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
    return forward_call(*input, **kwargs)
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/modeling/meta_arch/baseline.py", line 100, in forward
    images = self.preprocess_image(batched_inputs)
  File "/mnt/2206b677-3750-478c-b7c0-489c4f8ede41/lmz/BoT-SORT/fast_reid/fastreid/modeling/meta_arch/baseline.py", line 130, in preprocess_image
    images.sub_(self.pixel_mean).div_(self.pixel_std)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:2!
Exception in thread Thread-1:
Traceback (most recent call last):
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/site-packages/torch/utils/data/_utils/pin_memory.py", line 28, in _pin_memory_loop
    r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/multiprocessing/queues.py", line 113, in get
    return _ForkingPickler.loads(res)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/site-packages/torch/multiprocessing/reductions.py", line 295, in rebuild_storage_fd
    fd = df.detach()
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/multiprocessing/resource_sharer.py", line 57, in detach
    with _resource_sharer.get_connection(self._id) as conn:
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/multiprocessing/resource_sharer.py", line 87, in get_connection
    c = Client(address, authkey=process.current_process().authkey)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/multiprocessing/connection.py", line 498, in Client
    answer_challenge(c, authkey)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/multiprocessing/connection.py", line 742, in answer_challenge
    message = connection.recv_bytes(256)         # reject large message
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/home/user/anaconda3/envs/botlmz/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
ConnectionResetError: [Errno 104] Connection reset by peer
How can I solve this?
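A possible cause and workaround (my reading of the traceback, not a fix confirmed for this repo): the crash happens in preprocess_image at images.sub_(self.pixel_mean).div_(self.pixel_std), where one of the tensors is still on cuda:0 while the other is on cuda:2, so some part of the pipeline is not following MODEL.DEVICE and falls back to the default GPU. A common way to sidestep this class of error is to expose only the target GPU to the process and leave the device at its default, so that anything hard-wired to cuda:0 still lands on the intended card, for example:

CUDA_VISIBLE_DEVICES=2 python fast_reid/tools/train_net.py --config-file ./fast_reid/configs/MOT17/sbs_S50.yml MODEL.DEVICE "cuda:0"

With CUDA_VISIBLE_DEVICES=2, physical GPU 2 is the only device PyTorch can see and it appears inside the process as cuda:0. The later ConnectionResetError in the pin-memory thread is most likely just the DataLoader workers being torn down after the main process crashed, not a separate problem.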