"""-*- indent-tabs-mode:nil; coding: utf-8 -*-.
Copyright (C) 2024
HardenedLinux community
This is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program.
If not, see <http://www.gnu.org/licenses/>.
"""
from typing import Any, Iterator
from llama_cpp import Llama
from sentencepiece import SentencePieceProcessor


def get_prompt(message: str,
               chat_history: list[tuple[str, str]],
               system_prompt: str) -> str:
    """Create a prompt for the model to generate a response from."""
    texts = [f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"]
    # Replay previous turns so the model sees the conversation context.
    for user_msg, assistant_msg in chat_history:
        texts.append(f"<|start_header_id|>user<|end_header_id|>\n\n{user_msg.strip()}<|eot_id|>")
        texts.append(f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant_msg.strip()}<|eot_id|>")
    texts.append(f"<|start_header_id|>user<|end_header_id|>\n\n{message.strip()}<|eot_id|>")
    texts.append("<|start_header_id|>assistant<|end_header_id|>\n\n")
    return "".join(texts)


class LLAMA_wrapper:
    """Wrapper for llama model."""

    def __init__(self, config: dict = {}):
        """Initialize the model."""
        self.config = config
        self.model = Llama(model_path=config["model_name"],
                           n_ctx=config["max_input"],
                           n_batch=config["max_input"],
                           n_gpu_layers=30)
        self.tokenizer = SentencePieceProcessor(model_file="tokenizer.model")

    def get_input_token_length(self,
                               message: str,
                               chat_history: list[tuple[str, str]],
                               system_prompt: str) -> int:
        """Get the input token length for a prompt."""
        prompt = get_prompt(message, chat_history, system_prompt)
        input_ids = self.tokenizer.EncodeAsIds(prompt)
        return len(input_ids)

    def generate(self,
                 prompt: str,
                 max_new_tokens: int = 1024,
                 temperature: float = 0.8,
                 top_p: float = 0.95,
                 top_k: int = 50,
                 ) -> Iterator[str]:
        """Generate a response from a prompt, streaming the cumulative text."""
        inputs = self.model.tokenize(bytes(prompt, "utf-8"))
        generate_kwargs = dict(top_p=top_p, top_k=top_k, temp=temperature)
        generator = self.model.generate(inputs, **generate_kwargs)
        outputs = []
        for token in generator:
            # Stop at end-of-sequence or once the token budget is exhausted.
            if token == self.model.token_eos() or len(outputs) >= max_new_tokens:
                break
            b_text = self.model.detokenize([token])
            text = str(b_text, encoding="utf-8")
            outputs.append(text)
            yield "".join(outputs)

    def run(self,
            message: str,
            chat_history: list[tuple[str, str]],
            system_prompt: str,
            max_new_tokens: int = 1024,
            temperature: float = 0.8,
            top_p: float = 0.95,
            top_k: int = 50,
            ) -> Iterator[str]:
        """Generate a streamed response from a chat message and history."""
        prompt = get_prompt(message, chat_history, system_prompt)
        return self.generate(prompt, max_new_tokens, temperature, top_p, top_k)

    def __call__(self, prompt: str, **kwargs: Any) -> str:
        """Generate a full (non-streaming) completion for a prompt."""
        return self.model(prompt, **kwargs)["choices"][0]["text"]
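

if __name__ == "__main__":
    # Minimal usage sketch: the checkpoint path and context size below are
    # placeholders, and a SentencePiece "tokenizer.model" file is assumed to
    # exist in the working directory (as required by __init__).
    config = {
        "model_name": "path/to/model.gguf",  # placeholder GGUF checkpoint path
        "max_input": 4096,                   # used for both n_ctx and n_batch
    }
    llm = LLAMA_wrapper(config)
    history: list[tuple[str, str]] = []
    system_prompt = "You are a helpful assistant."
    response = ""
    for partial in llm.run("Hello, who are you?", history, system_prompt):
        response = partial  # each yield is the cumulative text so far
    print(response)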