yoloV10 training error: Conv2d MUDNN failed in: RunFusion #58

Open
SChaoZh opened this issue Aug 30, 2024 · 2 comments

Comments

@SChaoZh

SChaoZh commented Aug 30, 2024

In yoloV10, I used musify-text to replace cuda with musa, then replaced torch.musa with torch_musa. Training then fails with: Conv2d MUDNN failed in: RunFusion
(screenshot: 2024-08-30 22-43-33)

RuntimeError Traceback (most recent call last)
Cell In[18], line 6
3 model = YOLOv10('./model/yolov10s.pt')
5 # yolo detect train data=coco.yaml model=yolov10n/s/m/b/l/x.yaml epochs=500 batch=256 imgsz=640 device=0,1,2,3,4,5,6,7
----> 6 model.train(model='./ultralytics/cfg/models/v10/yolov10s_mk.yaml',
7 data='./ultralytics/cfg/datasets/MK_persion.yaml',
8 epochs=128, batch=32, imgsz=320,
9 device='musa')

File /musaAI/yolov10/ultralytics/engine/model.py:657, in Model.train(self, trainer, **kwargs)
654 pass
656 self.trainer.hub_session = self.session # attach optional HUB session
--> 657 self.trainer.train()
658 # Update model and cfg after training
659 if RANK in (-1, 0):

File /musaAI/yolov10/ultralytics/engine/trainer.py:214, in BaseTrainer.train(self)
211 ddp_cleanup(self, str(file))
213 else:
--> 214 self._do_train(world_size)

File /musaAI/yolov10/ultralytics/engine/trainer.py:328, in BaseTrainer._do_train(self, world_size)
326 if world_size > 1:
327 self._setup_ddp(world_size)
--> 328 self._setup_train(world_size)
330 nb = len(self.train_loader) # number of batches
331 nw = max(round(self.args.warmup_epochs * nb), 100) if self.args.warmup_epochs > 0 else -1 # warmup iterations

File /musaAI/yolov10/ultralytics/engine/trainer.py:272, in BaseTrainer._setup_train(self, world_size)
270 if self.amp and RANK in (-1, 0): # Single-GPU and DDP
271 callbacks_backup = callbacks.default_callbacks.copy() # backup callbacks as check_amp() resets them
--> 272 self.amp = torch.tensor(check_amp(self.model), device=self.device)
273 callbacks.default_callbacks = callbacks_backup # restore callbacks
274 if RANK > -1 and world_size > 1: # DDP

File /musaAI/yolov10/ultralytics/utils/checks.py:654, in check_amp(model)
651 try:
652 from ultralytics import YOLO
--> 654 assert amp_allclose(YOLO("yolov8n.pt"), im)
655 LOGGER.info(f"{prefix}checks passed ✅")
656 except ConnectionError:

File /musaAI/yolov10/ultralytics/utils/checks.py:643, in check_amp.<locals>.amp_allclose(m, im)
641 a = m(im, device=device, verbose=False)[0].boxes.data # FP32 inference
642 with torch_musa.amp.autocast(True):
--> 643 b = m(im, device=device, verbose=False)[0].boxes.data # AMP inference
644 del m
645 return a.shape == b.shape and torch.allclose(a, b.float(), atol=0.5)

File /musaAI/yolov10/ultralytics/engine/model.py:166, in Model.__call__(self, source, stream, **kwargs)
143 def __call__(
144 self,
145 source: Union[str, Path, int, list, tuple, np.ndarray, torch.Tensor] = None,
146 stream: bool = False,
147 **kwargs,
148 ) -> list:
149 """
150 An alias for the predict method, enabling the model instance to be callable.
151
(...)
164 (List[ultralytics.engine.results.Results]): A list of prediction results, encapsulated in the Results class.
165 """
--> 166 return self.predict(source, stream, **kwargs)

File /musaAI/yolov10/ultralytics/engine/model.py:441, in Model.predict(self, source, stream, predictor, **kwargs)
439 if prompts and hasattr(self.predictor, "set_prompts"): # for SAM-type models
440 self.predictor.set_prompts(prompts)
--> 441 return self.predictor.predict_cli(source=source) if is_cli else self.predictor(source=source, stream=stream)

File /musaAI/yolov10/ultralytics/engine/predictor.py:168, in BasePredictor.__call__(self, source, model, stream, *args, **kwargs)
166 return self.stream_inference(source, model, *args, **kwargs)
167 else:
--> 168 return list(self.stream_inference(source, model, *args, **kwargs))

File /opt/conda/envs/py39/lib/python3.9/site-packages/torch/utils/_contextlib.py:35, in _wrap_generator.<locals>.generator_context(*args, **kwargs)
32 try:
33 # Issuing None to a generator fires it up
34 with ctx_factory():
---> 35 response = gen.send(None)
37 while True:
38 try:
39 # Forward the response to our caller and get its next request

File /musaAI/yolov10/ultralytics/engine/predictor.py:248, in BasePredictor.stream_inference(self, source, model, *args, **kwargs)
246 # Inference
247 with profilers[1]:
--> 248 preds = self.inference(im, *args, **kwargs)
249 if self.args.embed:
250 yield from [preds] if isinstance(preds, torch.Tensor) else preds # yield embedding tensors

File /musaAI/yolov10/ultralytics/engine/predictor.py:142, in BasePredictor.inference(self, im, *args, **kwargs)
136 """Runs inference on a given image using the specified model and arguments."""
137 visualize = (
138 increment_path(self.save_dir / Path(self.batch[0][0]).stem, mkdir=True)
139 if self.args.visualize and (not self.source_type.tensor)
140 else False
141 )
--> 142 return self.model(im, augment=self.args.augment, visualize=visualize, embed=self.args.embed, *args, **kwargs)

File /opt/conda/envs/py39/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /musaAI/yolov10/ultralytics/nn/autobackend.py:423, in AutoBackend.forward(self, im, augment, visualize, embed)
421 # PyTorch
422 if self.pt or self.nn_module:
--> 423 y = self.model(im, augment=augment, visualize=visualize, embed=embed)
425 # TorchScript
426 elif self.jit:

File /opt/conda/envs/py39/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /musaAI/yolov10/ultralytics/nn/tasks.py:94, in BaseModel.forward(self, x, *args, **kwargs)
92 if isinstance(x, dict): # for cases of training and validating while training.
93 return self.loss(x, *args, **kwargs)
---> 94 return self.predict(x, *args, **kwargs)

File /musaAI/yolov10/ultralytics/nn/tasks.py:112, in BaseModel.predict(self, x, profile, visualize, augment, embed)
110 if augment:
111 return self._predict_augment(x)
--> 112 return self._predict_once(x, profile, visualize, embed)

File /musaAI/yolov10/ultralytics/nn/tasks.py:133, in BaseModel._predict_once(self, x, profile, visualize, embed)
131 if profile:
132 self._profile_one_layer(m, x, dt)
--> 133 x = m(x) # run
134 y.append(x if m.i in self.save else None) # save output
135 if visualize:

File /opt/conda/envs/py39/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /musaAI/yolov10/ultralytics/nn/modules/conv.py:54, in Conv.forward_fuse(self, x)
52 def forward_fuse(self, x):
53 """Perform transposed convolution of 2D data."""
---> 54 return self.act(self.conv(x))

File /opt/conda/envs/py39/lib/python3.9/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []

File /opt/conda/envs/py39/lib/python3.9/site-packages/torch/nn/modules/conv.py:463, in Conv2d.forward(self, input)
462 def forward(self, input: Tensor) -> Tensor:
--> 463 return self._conv_forward(input, self.weight, self.bias)

File /opt/conda/envs/py39/lib/python3.9/site-packages/torch/nn/modules/conv.py:459, in Conv2d._conv_forward(self, input, weight, bias)
455 if self.padding_mode != 'zeros':
456 return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
457 weight, bias, self.stride,
458 _pair(0), self.dilation, self.groups)
--> 459 return F.conv2d(input, weight, bias, self.stride,
460 self.padding, self.dilation, self.groups)

RuntimeError: Conv2d MUDNN failed in: RunFusion
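For what it's worth, the FP32 pass in amp_allclose (checks.py line 641) completes and only the AMP pass under torch_musa.amp.autocast fails. Below is a minimal sketch, not taken from the repo and with illustrative tensor shapes, that tries to reproduce the failing op outside Ultralytics, assuming torch_musa is installed and exposes the "musa" device:

# Minimal repro sketch -- not from the YOLOv10 repo; shapes are illustrative.
import torch
import torch_musa  # registers the "musa" device with PyTorch

device = torch.device("musa")
conv = torch.nn.Conv2d(3, 16, kernel_size=3, stride=2, padding=1).to(device)
x = torch.randn(1, 3, 320, 320, device=device)

y_fp32 = conv(x)  # FP32 path, analogous to checks.py line 641 (succeeds above)

with torch_musa.amp.autocast(True):  # same context manager as checks.py line 642
    y_amp = conv(x)  # AMP path, where "Conv2d MUDNN failed in: RunFusion" is raised

print(y_fp32.shape, y_amp.shape)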

@SChaoZh
Author

SChaoZh commented Aug 30, 2024

musa_version_query output
musa_toolkits:
(screenshot: 2024-08-30 22-48-53)

{
"version": "2.0.0",
"git branch": "HEAD",
"git tag": "No tag",
"commit id": "35e9fd8519c162c704ee473093d53fe23fdaf224",
"commit date": "2023-12-25 17:27:09 +0800"
}
mcc:
{
"version": "2.0.0",
"git branch": "HEAD",
"git tag": "20231225_master",
"commit id": "228d4651d8fcb8511ca196a5740eef83326ce1cb",
"commit date": "2023-12-21 14:59:02 +0800"
}
mccl:
{
"version": "2.11.4",
"build archs": "--cuda-gpu-arch=mp_21",
"git branch": "HEAD",
"git tag": "No tag",
"commit id": "75a3be1470912dc394a17d0353de402d5393816a",
"commit date": "2023-12-06 19:19:46 +0800"
}
muAlg_dev:
{
"version": "0.3.0",
"git branch": "HEAD",
"commit id": "6690c26e73d430782e6b7148c57d1a0774819195",
"commit date": "2024-01-16 20:53:00 +0800"
}
muPP:
{
"version": "1.4.0",
"build archs": "21",
"git branch": "HEAD",
"git tag": "No tag",
"commit id": "1eacf78a1806cb989bb6972887cc27118bb1ffa2",
"commit date": "2023-12-25 10:15:25 +0800"
}
muThrust_dev:
{
"version": "0.3.0",
"git branch": "HEAD",
"commit id": "c6feacf2b4730028f109e059dbe26a8fb1d63763",
"commit date": "2024-01-16 20:53:06 +0800"
}
mublas:
{
"version": "1.3.0",
"build archs": "21",
"git branch": "HEAD",
"git tag": "20231225_develop",
"commit id": "cbbfe9445e6e8c5686c7ab6678b0b6a15a6b0fab",
"commit date": "2023-12-22 11:49:45 +0800"
}
mudnn:
{
"version": "2.4.0",
"git branch": "HEAD",
"git tag": "No tag",
"commit id": "6958e9e891d46f09bb1fd6c62f30a079286b0416",
"commit date": "2024-01-02 15:19:12 +0800"
}
mufft:
{
"version": "1.2.0",
"build archs": "21",
"git branch": "HEAD",
"git tag": "No tag",
"commit id": "c7397544276d036e32b89a1819b14cac9214b71d",
"commit date": "2023-11-08 11:54:16 +0800"
}
murand:
{
"version": "1.0.0",
"build archs": "21",
"git branch": "HEAD",
"git tag": "20231226_develop",
"commit id": "d2d60a4706c8e03840da689c4f1c5c99ec413966",
"commit date": "2023-11-09 12:45:33 +0800"
}
musify:
{
"version": "0.2.0",
"git branch": "HEAD",
"commit id": "564a5bcaa337a822e25b92678d43d8e37d268938",
"commit date": "2023-08-23 20:42:57 +0800"
}
musparse:
{
"version": "0.4.0",
"build archs": "21",
"git branch": "develop",
"git tag": "No tag",
"commit id": "2bf7506b3f36eca4d6160310773599febd44a179",
"commit date": "2024-01-11 12:14:57 +0800"
}
musa_runtime:
{
"version": "1.5.1",
"git branch": "HEAD",
"git tag": "No tag",
"commit id": "4a0bc1adf2b7feb5854754904febf3c62c3b010a",
"commit date": "2023-12-25 12:01:39 +0800"
}
driver_dependency:
{
"git branch": "HEAD",
"git tag": "20231219_develop",
"commit id": "4ee484c1e76a6894112f68421bb043990d166857",
"commit date": "2023-12-19 14:54:26 +0800"
}
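For completeness, the same environment as seen from Python (assuming torch_musa mirrors the torch.cuda-style helpers under torch.musa, as in its examples):

import torch
import torch_musa  # extends torch with the "musa" backend

print("torch:", torch.__version__)
print("musa available:", torch.musa.is_available())    # assumed torch.cuda-style helper
print("musa device count:", torch.musa.device_count())
print("musa device 0:", torch.musa.get_device_name(0))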

@1823616178

mudnn may not support this neural network.
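If the fused/AMP path is indeed the problem, one untested workaround sketch is to disable AMP so that check_amp() is never reached (trainer.py line 270 only calls it when self.amp is truthy). Paths and arguments below are copied from the original report; only amp=False is new:

from ultralytics import YOLOv10  # import as used in the reporter's notebook

model = YOLOv10('./model/yolov10s.pt')
model.train(
    model='./ultralytics/cfg/models/v10/yolov10s_mk.yaml',
    data='./ultralytics/cfg/datasets/MK_persion.yaml',
    epochs=128, batch=32, imgsz=320,
    device='musa',
    amp=False,  # skips the check_amp() call and mixed-precision autocast during training
)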
