import torch
from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

app = FastAPI()

model_name = "Qwen/Qwen2.5-0.5B"

# Running on CPU, so the load_in_4bit option is dropped
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float32,  # use FP32 when running on CPU
    device_map="cpu",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Build the generation pipeline once at startup rather than on every request
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)


@app.get("/")
def home():
    return {"message": "Qwen2.5-0.5B API is running in full precision on CPU"}


@app.post("/generate")
def generate_text(prompt: str, max_length: int = 50):
    output = generator(
        prompt,
        max_length=max_length,
        do_sample=True,
        # fall back to EOS if the tokenizer defines no pad token
        pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
    )
    return {"generated_text": output[0]["generated_text"]}
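
# A minimal sketch of how to launch and call this API. It assumes the file is
# saved as main.py and that uvicorn (the usual ASGI server for FastAPI) is
# installed; host/port are uvicorn defaults, not anything set by this script:
#
#   uvicorn main:app --host 127.0.0.1 --port 8000
#
#   curl -X POST "http://127.0.0.1:8000/generate?prompt=Hello&max_length=50"
#
# Note: prompt and max_length arrive as query parameters, because FastAPI
# treats plain scalar parameters on a POST route as query parameters rather
# than a JSON body.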