---
library_name: transformers
tags: []
---
## Uses
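The snippet below defines `Qwen2ForCausalRM`, a `Qwen2ForCausalLM` wrapped with a scalar value head, loads the reward-model checkpoint, and scores a chat-formatted input. The reward for each sequence is the value-head output at its last non-padding token.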
```python
import torch
import torch.nn as nn
from transformers import Qwen2ForCausalLM, AutoTokenizer
class ValueHead(nn.Module):
    r"""
    A value head that maps each output token's hidden state to a scalar score.
    """

    def __init__(self, config, **kwargs):
        super().__init__()
        if not hasattr(config, "summary_dropout_prob"):
            summary_dropout_prob = kwargs.pop("summary_dropout_prob", 0.1)
        else:
            summary_dropout_prob = config.summary_dropout_prob
        self.dropout = (
            nn.Dropout(summary_dropout_prob) if summary_dropout_prob else nn.Identity()
        )
        # some models such as OPT have a projection layer before the word embeddings, e.g. OPT-350m
        if hasattr(config, "hidden_size"):
            hidden_size = config.hidden_size
        if hasattr(config, "word_embed_proj_dim"):
            hidden_size = config.word_embed_proj_dim
        elif hasattr(config, "is_encoder_decoder"):
            if config.is_encoder_decoder and hasattr(config, "decoder"):
                if hasattr(config.decoder, "hidden_size"):
                    hidden_size = config.decoder.hidden_size
        self.summary = nn.Linear(hidden_size, 1)
        self.flatten = nn.Flatten()

    def forward(self, hidden_states):
        output = self.dropout(hidden_states)
        # upcast to the value head's dtype (fp32) for numerical stability
        if output.dtype != self.summary.weight.dtype:
            output = output.to(self.summary.weight.dtype)
        output = self.summary(output)
        return output
class Qwen2ForCausalRM(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.v_head = ValueHead(config)

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        return_past_key_values=False,
        **kwargs,
    ):
        r"""
        Runs a forward pass through the wrapped model and returns the LM logits, the loss,
        and the value-head outputs.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
                Contains pre-computed hidden states (keys and values in the attention blocks) as computed by
                the model (see the `past_key_values` input) to speed up sequential decoding.
            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
            return_past_key_values (`bool`): Whether the computed past key values should also be returned.
            kwargs (`dict`, *optional*):
                Additional keyword arguments, which are passed to the wrapped model.
        """
        # the value head consumes the last hidden state, so always request hidden states
        kwargs["output_hidden_states"] = True
        kwargs["past_key_values"] = past_key_values
        base_model_output = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )
        last_hidden_state = base_model_output.hidden_states[-1]
        lm_logits = base_model_output.logits
        loss = base_model_output.loss
        if last_hidden_state.device != self.v_head.summary.weight.device:
            last_hidden_state = last_hidden_state.to(self.v_head.summary.weight.device)
        # one scalar per token: shape (batch_size, sequence_length)
        value = self.v_head(last_hidden_state).squeeze(-1)
        # force upcast in fp32 if logits are in half-precision
        if lm_logits.dtype != torch.float32:
            lm_logits = lm_logits.float()
        if return_past_key_values:
            return (lm_logits, loss, value, base_model_output.past_key_values)
        return (lm_logits, loss, value)
model_path = "CodeDPO/qwen_coder_2.5_rm"
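# optional: torch_dtype=torch.bfloat16 alongside device_map="auto" reduces the memory
# footprint of a 32B checkpoint (assumes bf16-capable hardware)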
model = Qwen2ForCausalRM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_chat = [
    {"role": "user", "content": "Hello, how are you?"},
    {
        "role": "assistant",
        "content": "I'm doing great. How can I help you today?",
    },
    {
        "role": "user",
        "content": "I'd like to show off how chat templating works!",
    },
]
input_tokens = tokenizer.apply_chat_template(
    input_chat,
    tokenize=True,
    return_dict=True,
    padding=True,
    return_tensors="pt",
).to(model.device)
_, _, values = model(
    **input_tokens,
    output_hidden_states=True,
    return_dict=True,
    use_cache=False,
)
masks = input_tokens["attention_mask"]
# pick the value at the last non-padding token (the EOS) of each sequence
chosen_scores = values.gather(
    dim=-1, index=(masks.sum(dim=-1, keepdim=True) - 1)
)
chosen_scores = chosen_scores.squeeze()
print(chosen_scores)
```
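
A reward model is typically used to rank candidate completions for the same prompt. The sketch below is not part of the original card: it reuses the `model` and `tokenizer` loaded above and scores two hypothetical candidate answers one at a time. With a single unpadded sequence, the reward is simply the value at the last token, which matches the attention-mask `gather` in the snippet above.

```python
# Minimal best-of-n style reranking sketch; `model` and `tokenizer` come from the
# snippet above, and the prompt/candidate strings are made-up placeholders.
prompt = "Write a Python function that reverses a string."
candidates = [
    "def reverse_string(s):\n    return s[::-1]",
    "def reverse_string(s):\n    return s",  # intentionally weaker answer
]

scores = []
for answer in candidates:
    chat = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": answer},
    ]
    tokens = tokenizer.apply_chat_template(
        chat, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        _, _, values = model(**tokens, use_cache=False)
    # single unpadded sequence: the reward is the value at the last token
    scores.append(values[0, -1].item())

print(scores)
print("best candidate:\n", candidates[scores.index(max(scores))])
```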