Example usage:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TextStreamer
model_name = "meta-llama/Llama-2-13b-hf"
hf_token = "<your token here>"  # access token for the gated Llama-2 repository
tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)
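# Load the base model in 4-bit using the quantization config above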
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", quantization_config=bnb_config, token=hf_token)
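# Attach the diacritics LoRA adapter (requires the peft package)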
model.load_adapter("sqreept/ro_diacritics_llama2_13b_lora", adapter_name="ro_diacritics_llama2_13b_lora")
def full_prompt(text):
    # Build the chat-formatted prompt (system instruction + user question) for the adapter
    system_prompt = "You are a professional Romanian diacritics corrector."
    question = "What is the corrected version of the below text?\n<text>" + text + "</text>\n"
    messages = [
        {
            "role": "system",
            "content": system_prompt,
        },
        {
            "role": "user",
            "content": question,
        },
    ]
    return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
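# Example sentence typed without Romanian diacritics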
prompt = full_prompt("Stiinta este in general clara dar cateodata este si confuza")
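# Stream generated tokens to stdout, hiding the prompt and special tokens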
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
inputs = tokenizer(prompt, return_tensors="pt")["input_ids"].to(model.device)
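# Make sure the diacritics adapter is the active one before generating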
model.set_adapter("ro_diacritics_llama2_13b_lora")
outputs = model.generate(inputs=inputs, streamer=streamer, max_new_tokens=256, do_sample=True, temperature=0.7, top_p=0.6, repetition_penalty=1.2)
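# Decode only the newly generated tokens, skipping the prompt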
response = tokenizer.decode(outputs[0,inputs[0].shape[-1]:], skip_special_tokens=True)
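print(response)  # the corrected text with diacritics restored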