import sys

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer


def test_m2m():
    """Sanity-check M2M100 by translating sentences back into their own language.

    Loads facebook/m2m100_418M and, for each (language, sentence) sample,
    sets the tokenizer source language to that language and forces the same
    language as the generation target. The round trip often normalizes
    grammar, which is what the deliberately broken inputs probe.
    Results are printed; nothing is returned.
    """
    model_name = "facebook/m2m100_418M"
    print(f"Loading {model_name}...")

    tokenizer = M2M100Tokenizer.from_pretrained(model_name)
    model = M2M100ForConditionalGeneration.from_pretrained(model_name)

    # (language code, input sentence) pairs; both sentences are
    # intentionally ungrammatical to see whether self-translation fixes them.
    samples = [
        ("en", "he go to school yesterday"),
        ("pl", "on iść do szkoła wczoraj"),
    ]

    print("\nStarting M2M Tests (Self-Translation):\n")

    for lang, input_text in samples:
        # Tell the tokenizer which language the input is written in.
        tokenizer.src_lang = lang
        batch = tokenizer(input_text, return_tensors="pt")

        # Force the decoder to emit the SAME language (self-translation).
        output_ids = model.generate(
            **batch,
            forced_bos_token_id=tokenizer.get_lang_id(lang),
        )
        result = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

        print(f"[{lang}]")
        print(f"Input: {input_text}")
        print(f"Output: {result}")
        print("-" * 20)


if __name__ == "__main__":
    test_m2m()