File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1854, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1851 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1853 with self.accelerator.accumulate(model):
-> 1854 tr_loss_step = self.training_step(model, inputs)
1856 if (
1857 args.logging_nan_inf_filter
1858 and not is_torch_tpu_available()
1859 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1860 ):
1861 # if loss is nan or inf simply add the average of previous logged losses
1862 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2723, in Trainer.training_step(self, model, inputs)
2720 return loss_mb.reduce_mean().detach().to(self.args.device)
2722 with self.compute_loss_context_manager():
-> 2723 loss = self.compute_loss(model, inputs)
2725 if self.args.n_gpu > 1:
2726 loss = loss.mean() # mean() to average on multi-gpu parallel training
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2746, in Trainer.compute_loss(self, model, inputs, return_outputs)
2744 else:
2745 labels = None
-> 2746 outputs = model(**inputs)
2747 # Save past state if it exists
2748 # TODO: this needs to be fixed and made cleaner later.
2749 if self.args.past_index >= 0:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:161, in DataParallel.forward(self, *inputs, **kwargs)
156 if t.device != self.src_device_obj:
157 raise RuntimeError("module must have its parameters and buffers "
158 "on device {} (device_ids[0]) but found one of "
159 "them on device: {}".format(self.src_device_obj, t.device))
--> 161 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
162 # for forward function without any inputs, empty list and dict will be created
163 # so the module can be executed on one device which is the first one in device_ids
164 if not inputs and not kwargs:
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:53, in scatter_kwargs(inputs, kwargs, target_gpus, dim)
51 r"""Scatter with support for kwargs dictionary"""
52 inputs = scatter(inputs, target_gpus, dim) if inputs else []
---> 53 kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
54 if len(inputs) < len(kwargs):
55 inputs.extend(() for _ in range(len(kwargs) - len(inputs)))
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:44, in scatter(inputs, target_gpus, dim)
38 # After scatter_map is called, a scatter_map cell will exist. This cell
39 # has a reference to the actual function scatter_map, which has references
40 # to a closure that has a reference to the scatter_map cell (because the
41 # fn is recursive). To avoid this reference cycle, we set the function to
42 # None, clearing the cell
43 try:
---> 44 res = scatter_map(inputs)
45 finally:
46 scatter_map = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:35, in scatter.<locals>.scatter_map(obj)
33 return [list(i) for i in zip(*map(scatter_map, obj))]
34 if isinstance(obj, dict) and len(obj) > 0:
---> 35 return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))]
36 return [obj for targets in target_gpus]
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:31, in scatter.<locals>.scatter_map(obj)
29 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
30 if isinstance(obj, tuple) and len(obj) > 0:
---> 31 return list(zip(*map(scatter_map, obj)))
32 if isinstance(obj, list) and len(obj) > 0:
33 return [list(i) for i in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:27, in scatter.<locals>.scatter_map(obj)
25 def scatter_map(obj):
26 if isinstance(obj, torch.Tensor):
---> 27 return Scatter.apply(target_gpus, None, dim, obj)
28 if _is_namedtuple(obj):
29 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.10/site-packages/torch/autograd/function.py:506, in Function.apply(cls, *args, **kwargs)
503 if not torch._C._are_functorch_transforms_active():
504 # See NOTE: [functorch vjp and autograd interaction]
505 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506 return super().apply(*args, **kwargs) # type: ignore[misc]
508 if cls.setup_context == _SingleLevelFunction.setup_context:
509 raise RuntimeError(
510 'In order to use an autograd.Function with functorch transforms '
511 '(vmap, grad, jvp, jacrev, ...), it must override the setup_context '
512 'staticmethod. For more details, please see '
513 'https://pytorch.org/docs/master/notes/extending.func.html')
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:96, in Scatter.forward(ctx, target_gpus, chunk_sizes, dim, input)
93 if torch.cuda.is_available() and ctx.input_device == -1:
94 # Perform CPU to GPU copies in a background stream
95 streams = [_get_stream(device) for device in target_gpus]
---> 96 outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
97 # Synchronize with the copy stream
98 if streams is not None:
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/comm.py:189, in scatter(tensor, devices, chunk_sizes, dim, streams, out)
187 if out is None:
188 devices = [_get_device_index(d) for d in devices]
--> 189 return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
190 else:
191 if devices is not None:
RuntimeError: CUDA error: device kernel image is invalid
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
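The error text itself suggests rerunning with CUDA_LAUNCH_BLOCKING=1 so kernel launches become synchronous and the reported stack trace points at the actual failing call rather than a later API call. A minimal sketch of how that could look, assuming the variable is set before any CUDA work happens (the surrounding training code is only a placeholder):

```python
import os

# CUDA_LAUNCH_BLOCKING must be set before the first CUDA call;
# setting it before importing torch is the safest way to guarantee that.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch  # imported only after the environment variable is in place

# ... then build the Trainer and call trainer.train() exactly as before;
# the next failure should report the precise offending kernel launch.
```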
I simply followed the steps you described.
Start training
RuntimeError Traceback (most recent call last)
Cell In[6], line 32
29 trainer.load_base_model()
31 print("Start training")
---> 32 trainer.train()
34 print("Merge model and save")
35 trainer.merge_and_save()
File /kaggle/input/llama-fine/llm_qlora-main/QloraTrainer.py:95, in QloraTrainer.train(self)
77 trainer = transformers.Trainer(
78 model=model,
79 train_dataset=data["train"],
(...)
92 data_collator=transformers.DataCollatorForLanguageModeling(self.tokenizer, mlm=False),
93 )
94 model.config.use_cache = False # silence the warnings. Please re-enable for inference!
---> 95 trainer.train()
97 model_save_path = f"{self.config['model_output_dir']}/{self.config['model_name']}_adapter"
98 trainer.save_model(model_save_path)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1537, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1535 hf_hub_utils.enable_progress_bars()
1536 else:
-> 1537 return inner_training_loop(
1538 args=args,
1539 resume_from_checkpoint=resume_from_checkpoint,
1540 trial=trial,
1541 ignore_keys_for_eval=ignore_keys_for_eval,
1542 )
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1854, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1851 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1853 with self.accelerator.accumulate(model):
-> 1854 tr_loss_step = self.training_step(model, inputs)
1856 if (
1857 args.logging_nan_inf_filter
1858 and not is_torch_tpu_available()
1859 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1860 ):
1861 # if loss is nan or inf simply add the average of previous logged losses
1862 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2723, in Trainer.training_step(self, model, inputs)
2720 return loss_mb.reduce_mean().detach().to(self.args.device)
2722 with self.compute_loss_context_manager():
-> 2723 loss = self.compute_loss(model, inputs)
2725 if self.args.n_gpu > 1:
2726 loss = loss.mean() # mean() to average on multi-gpu parallel training
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:2746, in Trainer.compute_loss(self, model, inputs, return_outputs)
2744 else:
2745 labels = None
-> 2746 outputs = model(**inputs)
2747 # Save past state if it exists
2748 # TODO: this needs to be fixed and made cleaner later.
2749 if self.args.past_index >= 0:
File /opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1501, in Module._call_impl(self, *args, **kwargs)
1496 # If we don't have any hooks, we want to skip the rest of the logic in
1497 # this function, and just call forward.
1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1499 or _global_backward_pre_hooks or _global_backward_hooks
1500 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501 return forward_call(*args, **kwargs)
1502 # Do not call functions when jit is used
1503 full_backward_hooks, non_full_backward_hooks = [], []
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:161, in DataParallel.forward(self, *inputs, **kwargs)
156 if t.device != self.src_device_obj:
157 raise RuntimeError("module must have its parameters and buffers "
158 "on device {} (device_ids[0]) but found one of "
159 "them on device: {}".format(self.src_device_obj, t.device))
--> 161 inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids)
162 # for forward function without any inputs, empty list and dict will be created
163 # so the module can be executed on one device which is the first one in device_ids
164 if not inputs and not kwargs:
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:178, in DataParallel.scatter(self, inputs, kwargs, device_ids)
177 def scatter(self, inputs, kwargs, device_ids):
--> 178 return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:53, in scatter_kwargs(inputs, kwargs, target_gpus, dim)
51 r"""Scatter with support for kwargs dictionary"""
52 inputs = scatter(inputs, target_gpus, dim) if inputs else []
---> 53 kwargs = scatter(kwargs, target_gpus, dim) if kwargs else []
54 if len(inputs) < len(kwargs):
55 inputs.extend(() for _ in range(len(kwargs) - len(inputs)))
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:44, in scatter(inputs, target_gpus, dim)
38 # After scatter_map is called, a scatter_map cell will exist. This cell
39 # has a reference to the actual function scatter_map, which has references
40 # to a closure that has a reference to the scatter_map cell (because the
41 # fn is recursive). To avoid this reference cycle, we set the function to
42 # None, clearing the cell
43 try:
---> 44 res = scatter_map(inputs)
45 finally:
46 scatter_map = None
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:35, in scatter.<locals>.scatter_map(obj)
33 return [list(i) for i in zip(*map(scatter_map, obj))]
34 if isinstance(obj, dict) and len(obj) > 0:
---> 35 return [type(obj)(i) for i in zip(*map(scatter_map, obj.items()))]
36 return [obj for targets in target_gpus]
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:31, in scatter.<locals>.scatter_map(obj)
29 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
30 if isinstance(obj, tuple) and len(obj) > 0:
---> 31 return list(zip(*map(scatter_map, obj)))
32 if isinstance(obj, list) and len(obj) > 0:
33 return [list(i) for i in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/scatter_gather.py:27, in scatter.<locals>.scatter_map(obj)
25 def scatter_map(obj):
26 if isinstance(obj, torch.Tensor):
---> 27 return Scatter.apply(target_gpus, None, dim, obj)
28 if _is_namedtuple(obj):
29 return [type(obj)(*args) for args in zip(*map(scatter_map, obj))]
File /opt/conda/lib/python3.10/site-packages/torch/autograd/function.py:506, in Function.apply(cls, *args, **kwargs)
503 if not torch._C._are_functorch_transforms_active():
504 # See NOTE: [functorch vjp and autograd interaction]
505 args = _functorch.utils.unwrap_dead_wrappers(args)
--> 506 return super().apply(*args, **kwargs) # type: ignore[misc]
508 if cls.setup_context == _SingleLevelFunction.setup_context:
509 raise RuntimeError(
510 'In order to use an autograd.Function with functorch transforms '
511 '(vmap, grad, jvp, jacrev, ...), it must override the setup_context '
512 'staticmethod. For more details, please see '
513 'https://pytorch.org/docs/master/notes/extending.func.html')
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/_functions.py:96, in Scatter.forward(ctx, target_gpus, chunk_sizes, dim, input)
93 if torch.cuda.is_available() and ctx.input_device == -1:
94 # Perform CPU to GPU copies in a background stream
95 streams = [_get_stream(device) for device in target_gpus]
---> 96 outputs = comm.scatter(input, target_gpus, chunk_sizes, ctx.dim, streams)
97 # Synchronize with the copy stream
98 if streams is not None:
File /opt/conda/lib/python3.10/site-packages/torch/nn/parallel/comm.py:189, in scatter(tensor, devices, chunk_sizes, dim, streams, out)
187 if out is None:
188 devices = [_get_device_index(d) for d in devices]
--> 189 return tuple(torch._C._scatter(tensor, devices, chunk_sizes, dim, streams))
190 else:
191 if devices is not None:
RuntimeError: CUDA error: device kernel image is invalid
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.
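Since "device kernel image is invalid" is raised by the CUDA runtime itself rather than by the Trainer code, a quick check of the installed PyTorch CUDA build and the GPUs the session exposes can be a useful first step before digging into the traceback. This is only a generic diagnostic sketch, not a confirmed diagnosis of this particular issue:

```python
import torch

# Report the PyTorch build, the CUDA toolkit it was compiled against,
# and the compute capability of every visible GPU.
print("torch:", torch.__version__)
print("torch CUDA build:", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
for i in range(torch.cuda.device_count()):
    name = torch.cuda.get_device_name(i)
    major, minor = torch.cuda.get_device_capability(i)
    print(f"GPU {i}: {name}, compute capability {major}.{minor}")
```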