---
library_name: transformers
tags: []
---
## Uses
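The snippet below defines `Qwen2ForCausalRM`, a `Qwen2ForCausalLM` wrapped with a scalar value head, loads the reward-model checkpoint, and scores a chat-formatted input. The reward for each sequence is the value-head output at its last non-padding token.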
```python
import torch
import torch.nn as nn
from transformers import Qwen2ForCausalLM, AutoTokenizer
class ValueHead(nn.Module):
    r"""
    A value head that maps each output token's hidden state to a scalar score.
    """

    def __init__(self, config, **kwargs):
        super().__init__()
        if not hasattr(config, "summary_dropout_prob"):
            summary_dropout_prob = kwargs.pop("summary_dropout_prob", 0.1)
        else:
            summary_dropout_prob = config.summary_dropout_prob
        self.dropout = (
            nn.Dropout(summary_dropout_prob) if summary_dropout_prob else nn.Identity()
        )
        # some models such as OPT have a projection layer before the word embeddings, e.g. OPT-350m
        if hasattr(config, "hidden_size"):
            hidden_size = config.hidden_size
        if hasattr(config, "word_embed_proj_dim"):
            hidden_size = config.word_embed_proj_dim
        elif hasattr(config, "is_encoder_decoder"):
            if config.is_encoder_decoder and hasattr(config, "decoder"):
                if hasattr(config.decoder, "hidden_size"):
                    hidden_size = config.decoder.hidden_size
        self.summary = nn.Linear(hidden_size, 1)
        self.flatten = nn.Flatten()

    def forward(self, hidden_states):
        output = self.dropout(hidden_states)
        # upcast to the value head's dtype (fp32) for numerical stability
        if output.dtype != self.summary.weight.dtype:
            output = output.to(self.summary.weight.dtype)
        output = self.summary(output)
        return output
class Qwen2ForCausalRM(Qwen2ForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.v_head = ValueHead(config)

    def forward(
        self,
        input_ids=None,
        past_key_values=None,
        attention_mask=None,
        return_past_key_values=False,
        **kwargs,
    ):
        r"""
        Runs a forward pass through the wrapped model and returns the LM logits, the loss,
        and the value-head outputs.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.
            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*):
                Contains pre-computed hidden states (keys and values in the attention blocks) as computed by
                the model (see the `past_key_values` input) to speed up sequential decoding.
            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
            return_past_key_values (`bool`): Whether the computed past key values should also be returned.
            kwargs (`dict`, *optional*):
                Additional keyword arguments, which are passed to the wrapped model.
        """
        # the value head consumes the last hidden state, so always request hidden states
        kwargs["output_hidden_states"] = True
        kwargs["past_key_values"] = past_key_values
        base_model_output = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )
        last_hidden_state = base_model_output.hidden_states[-1]
        lm_logits = base_model_output.logits
        loss = base_model_output.loss
        if last_hidden_state.device != self.v_head.summary.weight.device:
            last_hidden_state = last_hidden_state.to(self.v_head.summary.weight.device)
        # one scalar per token: shape (batch_size, sequence_length)
        value = self.v_head(last_hidden_state).squeeze(-1)
        # force upcast in fp32 if logits are in half-precision
        if lm_logits.dtype != torch.float32:
            lm_logits = lm_logits.float()
        if return_past_key_values:
            return (lm_logits, loss, value, base_model_output.past_key_values)
        return (lm_logits, loss, value)
model_path = "CodeDPO/qwen_coder_2.5_rm"
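# optional: torch_dtype=torch.bfloat16 alongside device_map="auto" reduces the memory
# footprint of a 32B checkpoint (assumes bf16-capable hardware)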
model = Qwen2ForCausalRM.from_pretrained(model_path, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
input_chat = [
    {"role": "user", "content": "Hello, how are you?"},
    {
        "role": "assistant",
        "content": "I'm doing great. How can I help you today?",
    },
    {
        "role": "user",
        "content": "I'd like to show off how chat templating works!",
    },
]
input_tokens = tokenizer.apply_chat_template(
    input_chat,
    tokenize=True,
    return_dict=True,
    padding=True,
    return_tensors="pt",
).to(model.device)
_, _, values = model(
    **input_tokens,
    output_hidden_states=True,
    return_dict=True,
    use_cache=False,
)
masks = input_tokens["attention_mask"]
# pick the value at the last non-padding token (the EOS) of each sequence
chosen_scores = values.gather(
    dim=-1, index=(masks.sum(dim=-1, keepdim=True) - 1)
)
chosen_scores = chosen_scores.squeeze()
print(chosen_scores)
```
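
A reward model is typically used to rank candidate completions for the same prompt. The sketch below is not part of the original card: it reuses the `model` and `tokenizer` loaded above and scores two hypothetical candidate answers one at a time. With a single unpadded sequence, the reward is simply the value at the last token, which matches the attention-mask `gather` in the snippet above.

```python
# Minimal best-of-n style reranking sketch; `model` and `tokenizer` come from the
# snippet above, and the prompt/candidate strings are made-up placeholders.
prompt = "Write a Python function that reverses a string."
candidates = [
    "def reverse_string(s):\n    return s[::-1]",
    "def reverse_string(s):\n    return s",  # intentionally weaker answer
]

scores = []
for answer in candidates:
    chat = [
        {"role": "user", "content": prompt},
        {"role": "assistant", "content": answer},
    ]
    tokens = tokenizer.apply_chat_template(
        chat, tokenize=True, return_dict=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        _, _, values = model(**tokens, use_cache=False)
    # single unpadded sequence: the reward is the value at the last token
    scores.append(values[0, -1].item())

print(scores)
print("best candidate:\n", candidates[scores.index(max(scores))])
```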