import time

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

from dolphin.configuration_dolphin import DolphinConfig
from dolphin.modeling_dolphin import DolphinForCausalLM


def inference_instruct(mycontext, question, device="cuda:0"):
    """Greedy decoding over a context compressed into Dolphin's memory tokens."""
    MEMORY_SIZE = 32  # number of [memory_i] slots the context is distilled into
    start_time = time.time()
    generated_token_ids = []
    # Split the prompt at <context> and reserve MEMORY_SIZE placeholder ids (-1);
    # the model fills these positions from the encoded context.
    prompt = f" <context>{question}"
    text_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<context>")]
    input_ids = (
        torch.tensor(
            text_chunks[0] + [-1] * MEMORY_SIZE + text_chunks[1], dtype=torch.long
        )
        .unsqueeze(0)
        .to(device)
    )

    # Tokenize the context with the MEMORY_SIZE [memory_i] tokens appended; these
    # are the slots the model compresses the long context into.
    context_tokenized = tokenizer(
        mycontext + "".join([f"[memory_{i}]" for i in range(MEMORY_SIZE)]),
        return_tensors="pt",
    )
    context_tokenized = {k: v.to(device) for k, v in context_tokenized.items()}
    context_token_count = context_tokenized["input_ids"].shape[1] - MEMORY_SIZE

    # Greedy decoding, capped at the context length.
    for _ in range(context_token_count):
        next_token = (
            model(
                input_ids,
                context_input_ids=context_tokenized["input_ids"],
                context_attention_mask=context_tokenized["attention_mask"],
            )
            .logits[:, -1]
            .argmax(-1)
        )
        if next_token.item() == 151643:  # <|endoftext|> id in the Qwen tokenizer family
            break
        generated_token_ids.append(next_token.item())
        input_ids = torch.cat([input_ids, next_token.unsqueeze(1)], dim=-1)
    result = tokenizer.decode(generated_token_ids)
    print(f"Time taken: {time.time() - start_time:.2f}s")
    return result


if __name__ == "__main__":
    device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
    # Register the custom Dolphin classes so the Auto* factories can construct them.
    AutoConfig.register("dolphin", DolphinConfig)
    AutoModelForCausalLM.register(DolphinConfig, DolphinForCausalLM)

    tokenizer = AutoTokenizer.from_pretrained("NexaAIDev/Dolphin")
    model = AutoModelForCausalLM.from_pretrained(
        "NexaAIDev/Dolphin",
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map=device_name,
    )

    mycontext = "Nexa AI is a Cupertino-based company founded in May 2023 that researches and develops models and tools for on-device AI applications. The company was founded by Alex and Zack. The company is known for its Octopus-series models, which rival large-scale language models in capabilities such as function-calling, multimodality, and action-planning, while remaining efficient and compact for edge device deployment. Nexa AI's mission is to advance on-device AI in collaboration with the global developer community. To this end, the company has created an on-device model hub for users to find, share, and collaborate on open-source AI models optimized for edge devices, as well as an SDK for developers to run and deploy AI models locally."
    question = "Who founded Nexa AI?"

    result = inference_instruct(mycontext, question, device=device_name)
    print("Result:", result)
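
    # Hypothetical follow-up call (not in the original snippet): the helper can be
    # reused for further questions over the same context string, which the sample
    # context answers via its mention of the Octopus-series models.
    followup = inference_instruct(mycontext, "What is Nexa AI known for?", device=device_name)
    print("Result:", followup)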