import modal
import json
import time

modal.enable_output()

app = modal.App("fistalfinetuner")
volume = modal.Volume.from_name("fistal-models", create_if_missing=True)

modal_image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("git")
    .pip_install(
        "torch>=2.6.0",
        "torchvision",
        "torchaudio",
        extra_index_url="https://download.pytorch.org/whl/cu121",
    )
    .pip_install(
        "transformers",
        "datasets==4.3.0",
        "accelerate",
        "trl",
        "bitsandbytes",
        "peft",
        "unsloth_zoo",
    )
    .pip_install("unsloth @ git+https://github.com/unslothai/unsloth.git")
)


@app.function(
    image=modal_image,
    gpu="T4",
    timeout=3600,
    volumes={"/models": volume},
    retries=modal.Retries(max_retries=0, backoff_coefficient=1.0),
)
def train_with_modal(ft_data: str, model_name: str):
    """Fine-tune a model on a Modal GPU and save the merged weights to the Volume."""
    import torch

    if not torch.cuda.is_available():
        return {"status": "error", "message": "No GPU available!"}

    from unsloth import FastLanguageModel, is_bf16_supported
    from transformers import TrainingArguments
    from trl import SFTTrainer
    from datasets import Dataset
    import os

    # Parse the JSONL payload: one {"messages": [...]} chat record per line.
    data = []
    for line in ft_data.strip().split("\n"):
        if line.strip():
            data.append(json.loads(line))

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=512,
        load_in_4bit=True,
        dtype=None,
    )

    print("Configuring LoRA...")
    model = FastLanguageModel.get_peft_model(
        model,
        r=128,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        random_state=2001,
        use_gradient_checkpointing="unsloth",
        loftq_config=None,
        use_rslora=False,
    )

    def format_example(example):
        # Render each chat transcript into a single training string.
        text = tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
            add_generation_prompt=False,
        )
        return {"text": text}

    dataset = Dataset.from_list(data)
    dataset = dataset.map(format_example)

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        # Keep this consistent with the max_seq_length=512 the model was
        # loaded with above (the original had 2000 here, which the model's
        # shorter context would silently cap).
        max_seq_length=512,
        dataset_num_proc=2,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=8,
            warmup_steps=5,
            num_train_epochs=1,
            max_steps=30,
            learning_rate=2e-4,
            fp16=not is_bf16_supported(),
            bf16=is_bf16_supported(),
            logging_steps=1,
            optim="adamw_8bit",
            lr_scheduler_type="linear",
            output_dir="/tmp/training_output",
            seed=42,
            report_to="none",
            dataloader_num_workers=0,
        ),
    )

    print("Training started...")
    trainer.train()
    print("Training complete!")

    timestamp = int(time.time())
    volume_path = f"/models/finetuned-{timestamp}"
    os.makedirs(volume_path, exist_ok=True)
    print(f"Saving to: {volume_path}")

    # Merge the LoRA adapters into the base weights and save in 16-bit.
    model.save_pretrained_merged(volume_path, tokenizer, save_method="merged_16bit")
    print("Model saved!")

    model.config.save_pretrained(volume_path)
    trainer.save_model(volume_path)
    tokenizer.save_pretrained(volume_path)

    # Persist the new files so other Modal functions can read them.
    volume.commit()
    print("Volume has been committed!")

    # Free GPU memory before the container is reused.
    del model
    del trainer
    import gc

    gc.collect()
    torch.cuda.empty_cache()

    return {
        "status": "success",
        "volume_path": volume_path,
        "timestamp": timestamp,
    }


@app.function(
    image=modal_image,
    volumes={"/models": volume},
    timeout=900,
    secrets=[modal.Secret.from_name("huggingface-secret")],
)
def upload_to_hf_from_volume(volume_path: str, timestamp: int, repo_name: str):
    """
    Upload a model directly from the Modal Volume to HuggingFace.

    This runs on Modal's network, so nothing is downloaded to the local machine.
""" from huggingface_hub import HfApi, create_repo import os print(f"📤 Uploading from {volume_path} to HuggingFace...") if not os.path.exists(volume_path): raise FileNotFoundError(f"Model not found at: {volume_path}") hf_token = os.environ.get("HF_TOKEN") if not hf_token: raise ValueError("HF_TOKEN not found in Modal secrets") hf_api = HfApi() repo_id = f"mahreenfathima/finetuned-{repoName}-{timestamp}" print(f"Creating HuggingFace repo: {repo_id}") create_repo( repo_id=repo_id, token=hf_token, private=False, exist_ok=True, repo_type="model" ) print(f"Uploading files to {repo_id}...") hf_api.upload_folder( folder_path=volume_path, repo_id=repo_id, token=hf_token, commit_message=f"Fine-tuned model (timestamp: {timestamp})" ) model_url = f"https://huggingface.co/{repo_id}" print(f"✅ Successfully uploaded to {model_url}") return { "model_url": model_url, "repo_id": repo_id } @app.function( gpu="T4", timeout=600, image=modal_image ) def evaluate_model(repo_id: str, test_inputs: list[str]): """Load model and run inference on test cases""" from unsloth import FastLanguageModel from transformers import AutoTokenizer import torch print(f"Loading model: {repo_id}") model, tokenizer = FastLanguageModel.from_pretrained( model_name=repo_id, max_seq_length=512, load_in_4bit=True, dtype=None, ) if tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token outputs = [] for test_input in test_inputs: print(f"Processing: {test_input[:50]}...") inputs = tokenizer(test_input, return_tensors="pt").to(model.device) with torch.no_grad(): output = model.generate( **inputs, max_new_tokens=100, temperature=0.5, do_sample=True ) decoded = tokenizer.decode(output[0], skip_special_tokens=True) if decoded.startswith(test_input): decoded = decoded[len(test_input):].strip() outputs.append(decoded) return outputs