Spaces:

jackkuo
/

ADMP-LS

Running

App Files Files Community

ADMP-LS / servers /Retrieve /utils /token_util.py

jackkuo

reinit repo

82bf89e 3 months ago

raw

history blame contribute delete

2.26 kB

	import tiktoken


	def num_tokens_from_messages(messages, model="gpt-4o"):
	"""
	Returns the number of tokens used by a list of messages.

	Args:
	messages (list): A list of messages.
	model (str): The name of the model to use for tokenization.

	Returns:
	int: The number of tokens used by the messages.
	"""
	try:
	encoding = tiktoken.encoding_for_model(model)
	except KeyError:
	print("Warning: model not found. Using cl100k_base encoding.")
	encoding = tiktoken.get_encoding("cl100k_base")
	if model == "gpt-3.5-turbo":
	return num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301")
	elif model == "gpt-4o":
	return num_tokens_from_messages(messages, model="gpt-4-0314")
	elif model == "gpt-3.5-turbo-0301":
	tokens_per_message = (
	4 # every message follows <\|start\|>{role/name}\n{content}<\|end\|>\n
	)
	tokens_per_name = -1 # if there's a name, the role is omitted
	elif model == "gpt-4-0314":
	tokens_per_message = 3
	tokens_per_name = 1
	else:
	raise NotImplementedError(
	f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
	)
	num_tokens = 0
	for message in messages:
	num_tokens += tokens_per_message
	for key, value in message.items():
	num_tokens += len(encoding.encode(value))
	if key == "name":
	num_tokens += tokens_per_name
	num_tokens += 3 # every reply is primed with <\|start\|>assistant<\|message\|>
	return num_tokens


	def num_tokens_from_text(text: str, model: str = "gpt-4o") -> int:
	"""
	Returns the number of tokens used by a text.

	Args:
	text (str): The text to tokenize.
	model (str): The name of the model to use for tokenization.
	"""
	try:
	encoding = tiktoken.encoding_for_model(model)
	except KeyError:
	print("Warning: model not found. Using cl100k_base encoding.")
	encoding = tiktoken.get_encoding("cl100k_base")
	num_tokens = 0
	if text:
	num_tokens += len(encoding.encode(text))
	return num_tokens