Hi, I have about 1,220,000 training samples and 500,000 evaluation samples.
Training runs without errors, but evaluation fails: it seems mmsegmentation tries to save a very large file with pickle.dump. How can I solve this problem?
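For scale, here is a rough back-of-envelope estimate of the raw prediction volume; the 512x512 output size and int64 dtype below are illustrative assumptions, not values from my config:

# Rough, illustrative estimate only: assumes every prediction is kept as a
# full-resolution label map and all of them are collected before evaluation.
import numpy as np

num_eval_samples = 500_000                     # evaluation set size from above
height, width = 512, 512                       # assumed output size (illustrative)
bytes_per_pixel = np.dtype(np.int64).itemsize  # argmax outputs are commonly int64

total_bytes = num_eval_samples * height * width * bytes_per_pixel
print(f"~{total_bytes / 1024**4:.2f} TiB of raw predictions")  # roughly one tebibyte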
2021-07-13 11:41:29,769 - mmseg - INFO - Iter [1700/1000000] lr: 7.189e-01, eta: 26 days, 21:45:55, time: 2.376, data_time: 0.075, memory: 10210, decode_0.loss_seg: 0.0184, decode_0.acc_seg: 98.1039, decode_1.loss_seg: 0.0455, decode_1.acc_seg: 98.1037, loss: 0.0639
2021-07-13 11:45:19,160 - mmseg - INFO - Iter [1800/1000000] lr: 7.188e-01, eta: 26 days, 21:10:08, time: 2.294, data_time: 0.073, memory: 10210, decode_0.loss_seg: 0.0185, decode_0.acc_seg: 98.1001, decode_1.loss_seg: 0.0455, decode_1.acc_seg: 98.1014, loss: 0.0640
2021-07-13 11:49:05,741 - mmseg - INFO - Iter [1900/1000000] lr: 7.188e-01, eta: 26 days, 20:12:44, time: 2.266, data_time: 0.071, memory: 10210, decode_0.loss_seg: 0.0187, decode_0.acc_seg: 98.0748, decode_1.loss_seg: 0.0460, decode_1.acc_seg: 98.0779, loss: 0.0647
[>>>>>>>>>>>>>>>>>>> ] 509440/509690, 358.7 task/s, elapsed: 1420s, ETA: 1s
Traceback (most recent call last):
  File "tools/train.py", line 166, in <module>
    main()
  File "tools/train.py", line 162, in main
    meta=meta)
  File "/mnt/lustre/menghao/projects/mmsegmentation/mmseg/apis/train.py", line 116, in train_segmentor
    runner.run(data_loaders, cfg.workflow)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/runner/iter_based_runner.py", line 131, in run
    iter_runner(iter_loaders[i], **kwargs)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
    self.call_hook('after_train_iter')
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
    getattr(hook, fn_name)(self)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/runner/hooks/evaluation.py", line 172, in after_train_iter
    self._do_evaluate(runner)
  File "/mnt/lustre/menghao/projects/mmsegmentation/mmseg/core/evaluation/eval_hooks.py", line 87, in _do_evaluate
    gpu_collect=self.gpu_collect)
  File "/mnt/lustre/menghao/projects/mmsegmentation/mmseg/apis/test.py", line 160, in multi_gpu_test
    results = collect_results_cpu(results, len(dataset), tmpdir)
  File "/mnt/lustre/menghao/projects/mmsegmentation/mmseg/apis/test.py", line 185, in collect_results_cpu
    mmcv.dump(result_part, osp.join(tmpdir, 'part{}.pkl'.format(rank)))
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/fileio/io.py", line 80, in dump
    handler.dump_to_path(obj, file, **kwargs)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/fileio/handlers/pickle_handler.py", line 26, in dump_to_path
    obj, filepath, mode='wb', **kwargs)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/fileio/handlers/base.py", line 25, in dump_to_path
    self.dump_to_fileobj(obj, f, **kwargs)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/fileio/handlers/pickle_handler.py", line 22, in dump_to_fileobj
    pickle.dump(obj, file, **kwargs)
MemoryError
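For context, the sketch below is not the actual mmcv/mmseg code, just a simplified illustration of the CPU result-collection pattern visible in the traceback: every rank pickles its own result list into a shared tmpdir, and one rank then loads and merges all the parts, so with ~500,000 full-size prediction arrays the pickle buffer and the merged list can exhaust host memory.

# Simplified sketch only (synchronization between ranks omitted); the real
# implementation lives in mmseg/apis/test.py / mmcv.
import os.path as osp
import pickle

def collect_results_cpu_sketch(result_part, size, tmpdir, rank, world_size):
    # Each rank serializes its full list of per-image predictions; with
    # hundreds of thousands of large arrays this step alone can raise
    # MemoryError, as in the traceback above.
    with open(osp.join(tmpdir, f'part{rank}.pkl'), 'wb') as f:
        pickle.dump(result_part, f)

    if rank != 0:
        return None
    # One rank then holds every part in memory at once while merging.
    part_list = []
    for i in range(world_size):
        with open(osp.join(tmpdir, f'part{i}.pkl'), 'rb') as f:
            part_list.append(pickle.load(f))
    merged = [res for part in part_list for res in part]
    return merged[:size]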
[>>>>>>>>>>>>>>>>>>> ] 509568/509690, 343.9 task/s, elapsed: 1482s, ETA: 0s
Traceback (most recent call last):
  File "tools/train.py", line 166, in <module>
    main()
  File "tools/train.py", line 162, in main
    meta=meta)
  File "/mnt/lustre/menghao/projects/mmsegmentation/mmseg/apis/train.py", line 116, in train_segmentor
    runner.run(data_loaders, cfg.workflow)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/runner/iter_based_runner.py", line 131, in run
    iter_runner(iter_loaders[i], **kwargs)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/runner/iter_based_runner.py", line 66, in train
    self.call_hook('after_train_iter')
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/runner/base_runner.py", line 307, in call_hook
    getattr(hook, fn_name)(self)
  File "/mnt/lustre/menghao/.local/lib/python3.6/site-packages/mmcv/runner/hooks/evaluation.py", line 172, in after_train_iter
    self._do_evaluate(runner)
  File "/mnt/lustre/menghao/projects/mmsegmentation/mmseg/core/evaluation/eval_hooks.py", line 87, in _do_evaluate
    gpu_collect=self.gpu_collect)
  File "/mnt/lustre/menghao/projects/mmsegmentation/mmseg/apis/test.py", line 138, in multi_gpu_test
    for i, data in enumerate(data_loader):
  File "/mnt/lustre/share/platform/env/miniconda3.6/envs/pt1.3v1/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 793, in __next__
    self._shutdown_workers()
  File "/mnt/lustre/share/platform/env/miniconda3.6/envs/pt1.3v1/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 893, in _shutdown_workers
    self._worker_result_queue.put((None, None))
  File "/mnt/lustre/share/platform/env/miniconda3.6/envs/pt1.3v1/lib/python3.6/multiprocessing/queues.py", line 87, in put
    self._start_thread()
  File "/mnt/lustre/share/platform/env/miniconda3.6/envs/pt1.3v1/lib/python3.6/multiprocessing/queues.py", line 169, in _start_thread
    self._thread.start()
  File "/mnt/lustre/share/platform/env/miniconda3.6/envs/pt1.3v1/lib/python3.6/threading.py", line 846, in start
    _start_new_thread(self._bootstrap, ())
RuntimeError: can't start new thread
Hi @MengHao666
Thanks for your feedback!
Please try setting efficient_test=True in the evaluation config, or pass --options evaluation.efficient_test=True on the command line.
Looking forward to your reply.
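In the config this looks roughly like the snippet below; the interval and metric values are just placeholders for whatever you already use, and efficient_test=True is the only relevant change (it trades CPU memory for disk by saving intermediate per-image results to temporary files during evaluation):

# In your config file (interval and metric are placeholders):
evaluation = dict(
    interval=4000,         # keep your existing evaluation interval
    metric='mIoU',         # keep your existing metric
    efficient_test=True)   # store intermediate results on disk instead of holding them all in memory

# Or override it without editing the config (adapt to your launch script):
# python tools/train.py <your_config.py> --options evaluation.efficient_test=True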