File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1854, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1851 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1853 with self.accelerator.accumulate(model):
-> 1854 tr_loss_step = self.training_step(model, inputs)
1856 if (
1857 args.logging_nan_inf_filter
1858 and not is_torch_tpu_available()
1859 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1860 ):
1861 # if loss is nan or inf simply add the average of previous logged losses
1862 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2723, in Trainer.training_step(self, model, inputs)
2720 return loss_mb.reduce_mean().detach().to(self.args.device)
2722 with self.compute_loss_context_manager():
-> 2723 loss = self.compute_loss(model, inputs)
2725 if self.args.n_gpu > 1:
2726 loss = loss.mean() # mean() to average on multi-gpu parallel training
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2746, in Trainer.compute_loss(self, model, inputs, return_outputs)
2744 else:
2745 labels = None
-> 2746 outputs = model(**inputs)
2747 # Save past state if it exists
2748 # TODO: this needs to be fixed and made cleaner later.
2749 if self.args.past_index >= 0:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:161, in DataParallel.forward(self, *inputs, **kwargs)
156 if t.device != self.src_device_obj:
157 raise RuntimeError("module must have its parameters and buffers "
158 "on device {} (device_ids[0]) but found one of "
159 "them on device: {}".format(self.src_device_obj, t.device))
--> 161 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
162 # for forward function without any inputs, empty list and dict will be created
163 # so the module can be executed on one device which is the first one in device_ids
164 if not inputs and not kwargs:
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:53, in scatter_kwargs(inputs, kwargs, target_gpus, dim)
51 r"""Scatter with support for kwargs dictionary"""
52 inputs = scatter(inputs, target_gpus, dim) if inputs else []
---> 53 kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
54 if len(inputs) < len(kwargs):
55 inputs.extend(() for _ in range(len(kwargs) - len(inputs)))
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:44, in scatter(inputs, target_gpus, dim)
38 # After scatter_map is called, a scatter_map cell will exist. This cell
39 # has a reference to the actual function scatter_map, which has references
40 # to a closure that has a reference to the scatter_map cell (because the
41 # fn is recursive). To avoid this reference cycle, we set the function to
42 # None, clearing the cell
43 try:
---> 44 res = scatter_map(inputs)
45 finally:
46 scatter_map = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:35, in scatter.<locals>.scatter_map(obj)
33 return [list(i) for i in zip(*map(scatter_map, obj))]
34 if isinstance(obj, dict) and len(obj) > 0:
---> 35 return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))]
36 return [obj for targets in target_gpus]
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:31, in scatter.<locals>.scatter_map(obj)
29 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
30 if isinstance(obj, tuple) and len(obj) > 0:
---> 31 return list(zip(*map(scatter_map, obj)))
32 if isinstance(obj, list) and len(obj) > 0:
33 return [list(i) for i in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:27, in scatter.<locals>.scatter_map(obj)
25 def scatter_map(obj):
26 if isinstance(obj, torch.Tensor):
---> 27 return Scatter.apply(target_gpus, None, dim, obj)
28 if _is_namedtuple(obj):
29 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.10/site-packages/torch/autograd/function.py:506, in Function.apply(cls, *args, **kwargs)
503 if not torch._C._are_functorch_transforms_active():
504 # See NOTE: [functorch vjp and autograd interaction]
505 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506 return super().apply(*args, **kwargs) # type: ignore[misc]
508 if cls.setup_context == _SingleLevelFunction.setup_context:
509 raise RuntimeError(
510 'In order to use an autograd.Function with functorch transforms '
511 '(vmap, grad, jvp, jacrev, ...), it must override the setup_context '
512 'staticmethod. For more details, please see '
513 'https://pytorch.org/docs/master/notes/extending.func.html')
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:96, in Scatter.forward(ctx, target_gpus, chunk_sizes, dim, input)
93 if torch.cuda.is_available() and ctx.input_device == -1:
94 # Perform CPU to GPU copies in a background stream
95 streams = [_get_stream(device) for device in target_gpus]
---> 96 outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
97 # Synchronize with the copy stream
98 if streams is not None:
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/comm.py:189, in scatter(tensor, devices, chunk_sizes, dim, streams, out)
187 if out is None:
188 devices = [_get_device_index(d) for d in devices]
--> 189 return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
190 else:
191 if devices is not None:
RuntimeError: CUDA error: device kernel image is invalid
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
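The error text itself suggests rerunning with CUDA_LAUNCH_BLOCKING=1 so kernel launches become synchronous and the reported stack trace points at the actual failing call rather than a later API call. A minimal sketch of how that could look, assuming the variable is set before any CUDA work happens (the surrounding training code is only a placeholder):

```python
import os

# CUDA_LAUNCH_BLOCKING must be set before the first CUDA call;
# setting it before importing torch is the safest way to guarantee that.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # imported only after the environment variable is in place

# ... then build the Trainer and call trainer.train() exactly as before;
# the next failure should report the precise offending kernel launch.
```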
I simply followed the steps you described.
Start training
RuntimeError Traceback (most recent call last)
Cell In[6], line 32
29 trainer.load_base_model()
31 print("Start training")
---> 32 trainer.train()
34 print("Merge model and save")
35 trainer.merge_and_save()
File /kaggle/input/llama-fine/llm_qlora-main/QloraTrainer.py:95, in QloraTrainer.train(self)
77 trainer = transformers.Trainer(
78 model=model,
79 train_dataset=data["train"],
(...)
92 data_collator=transformers.DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
93 )
94 model.config.use_cache = False # silence the warnings. Please re-enable for inference!
---> 95 trainer.train()
97 model_save_path = f"{self.config['model_output_dir']}/{self.config['model_name']}_adapter"
98 trainer.save_model(model_save_path)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1537, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1535 hf_hub_utils.enable_progress_bars()
1536 else:
-> 1537 return inner_training_loop(
1538 args=args,
1539 resume_from_checkpoint=resume_from_checkpoint,
1540 trial=trial,
1541 ignore_keys_for_eval=ignore_keys_for_eval,
1542 )
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1854, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1851 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1853 with self.accelerator.accumulate(model):
-> 1854 tr_loss_step = self.training_step(model, inputs)
1856 if (
1857 args.logging_nan_inf_filter
1858 and not is_torch_tpu_available()
1859 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1860 ):
1861 # if loss is nan or inf simply add the average of previous logged losses
1862 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2723, in Trainer.training_step(self, model, inputs)
2720 return loss_mb.reduce_mean().detach().to(self.args.device)
2722 with self.compute_loss_context_manager():
-> 2723 loss = self.compute_loss(model, inputs)
2725 if self.args.n_gpu > 1:
2726 loss = loss.mean() # mean() to average on multi-gpu parallel training
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2746, in Trainer.compute_loss(self, model, inputs, return_outputs)
2744 else:
2745 labels = None
-> 2746 outputs = model(**inputs)
2747 # Save past state if it exists
2748 # TODO: this needs to be fixed and made cleaner later.
2749 if self.args.past_index >= 0:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:161, in DataParallel.forward(self, *inputs, **kwargs)
156 if t.device != self.src_device_obj:
157 raise RuntimeError("module must have its parameters and buffers "
158 "on device {} (device_ids[0]) but found one of "
159 "them on device: {}".format(self.src_device_obj, t.device))
--> 161 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
162 # for forward function without any inputs, empty list and dict will be created
163 # so the module can be executed on one device which is the first one in device_ids
164 if not inputs and not kwargs:
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:178, in DataParallel.scatter(self, inputs, kwargs, device_ids)
177 def scatter(self, inputs, kwargs, device_ids):
--> 178 return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:53, in scatter_kwargs(inputs, kwargs, target_gpus, dim)
51 r"""Scatter with support for kwargs dictionary"""
52 inputs = scatter(inputs, target_gpus, dim) if inputs else []
---> 53 kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
54 if len(inputs) < len(kwargs):
55 inputs.extend(() for _ in range(len(kwargs) - len(inputs)))
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:44, in scatter(inputs, target_gpus, dim)
38 # After scatter_map is called, a scatter_map cell will exist. This cell
39 # has a reference to the actual function scatter_map, which has references
40 # to a closure that has a reference to the scatter_map cell (because the
41 # fn is recursive). To avoid this reference cycle, we set the function to
42 # None, clearing the cell
43 try:
---> 44 res = scatter_map(inputs)
45 finally:
46 scatter_map = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:35, in scatter.<locals>.scatter_map(obj)
33 return [list(i) for i in zip(*map(scatter_map, obj))]
34 if isinstance(obj, dict) and len(obj) > 0:
---> 35 return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))]
36 return [obj for targets in target_gpus]
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:31, in scatter.<locals>.scatter_map(obj)
29 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
30 if isinstance(obj, tuple) and len(obj) > 0:
---> 31 return list(zip(*map(scatter_map, obj)))
32 if isinstance(obj, list) and len(obj) > 0:
33 return [list(i) for i in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:27, in scatter.<locals>.scatter_map(obj)
25 def scatter_map(obj):
26 if isinstance(obj, torch.Tensor):
---> 27 return Scatter.apply(target_gpus, None, dim, obj)
28 if _is_namedtuple(obj):
29 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.10/site-packages/torch/autograd/function.py:506, in Function.apply(cls, *args, **kwargs)
503 if not torch._C._are_functorch_transforms_active():
504 # See NOTE: [functorch vjp and autograd interaction]
505 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506 return super().apply(*args, **kwargs) # type: ignore[misc]
508 if cls.setup_context == _SingleLevelFunction.setup_context:
509 raise RuntimeError(
510 'In order to use an autograd.Function with functorch transforms '
511 '(vmap, grad, jvp, jacrev, ...), it must override the setup_context '
512 'staticmethod. For more details, please see '
513 'https://pytorch.org/docs/master/notes/extending.func.html')
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:96, in Scatter.forward(ctx, target_gpus, chunk_sizes, dim, input)
93 if torch.cuda.is_available() and ctx.input_device == -1:
94 # Perform CPU to GPU copies in a background stream
95 streams = [_get_stream(device) for device in target_gpus]
---> 96 outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
97 # Synchronize with the copy stream
98 if streams is not None:
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/comm.py:189, in scatter(tensor, devices, chunk_sizes, dim, streams, out)
187 if out is None:
188 devices = [_get_device_index(d) for d in devices]
--> 189 return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
190 else:
191 if devices is not None:
RuntimeError: CUDA error: device kernel image is invalid
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
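Since "device kernel image is invalid" is raised by the CUDA runtime itself rather than by the Trainer code, a quick check of the installed PyTorch CUDA build and the GPUs the session exposes can be a useful first step before digging into the traceback. This is only a generic diagnostic sketch, not a confirmed diagnosis of this particular issue:

```python
import torch

# Report the PyTorch build, the CUDA toolkit it was compiled against,
# and the compute capability of every visible GPU.
print("torch:", torch.__version__)
print("torch CUDA build:", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
for i in range(torch.cuda.device_count()):
    name = torch.cuda.get_device_name(i)
    major, minor = torch.cuda.get_device_capability(i)
    print(f"GPU {i}: {name}, compute capability {major}.{minor}")
```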