- Removed M2M100 Grammar Correction model completely to reduce bloat and complexity. - Implemented 'Style Prompting' in Settings -> AI Engine to handle punctuation natively via Whisper. - Added Style Presets: Standard (Default), Casual, and Custom. - Optimized Build: Bootstrapper no longer requires transformers/sentencepiece. - Fixed 'torch' NameError in Low VRAM mode. - Fixed Bootstrapper missing-dependency detection. - Updated UI to reflect removed features. - Included compiled v1.0.2 executable in dist/.
39 lines
1.2 KiB
Python
39 lines
1.2 KiB
Python
|
|
import sys
|
|
from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
|
|
|
|
def test_m2m():
    """Smoke-test M2M100 grammar correction via self-translation.

    Loads the 418M-parameter checkpoint, then translates each sample
    sentence back into its own source language; regenerating the text
    this way tends to normalize the grammar. Results are printed to
    stdout for manual inspection.
    """
    model_name = "facebook/m2m100_418M"
    print(f"Loading {model_name}...")

    tokenizer = M2M100Tokenizer.from_pretrained(model_name)
    model = M2M100ForConditionalGeneration.from_pretrained(model_name)

    # (language code, deliberately ungrammatical input) pairs
    samples = [
        ("en", "he go to school yesterday"),
        ("pl", "on iść do szkoła wczoraj"),  # Intentional broken grammar in Polish
    ]

    print("\nStarting M2M Tests (Self-Translation):\n")

    for code, sentence in samples:
        tokenizer.src_lang = code
        batch = tokenizer(sentence, return_tensors="pt")

        # Force the decoder to emit the SAME language as the source,
        # turning translation into a grammar-normalizing round trip.
        output_ids = model.generate(
            **batch,
            forced_bos_token_id=tokenizer.get_lang_id(code),
        )

        result = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

        print(f"[{code}]")
        print(f"Input: {sentence}")
        print(f"Output: {result}")
        print("-" * 20)
|
|
|
|
if __name__ == "__main__":
|
|
test_m2m()
|