ワンクリックで
qtpass-localization-audit
// QtPass localization audit - structural checks on .ts files (placeholders, HTML balance, mnemonics, mixed-script artifacts)
// QtPass localization audit - structural checks on .ts files (placeholders, HTML balance, mnemonics, mixed-script artifacts)
QtPass CI/CD workflow - run GitHub Actions locally with act, linters, formatters
QtPass localization workflow - translation files, updating, adding languages
QtPass GitHub interaction - PRs, issues, branches, merging
Bug fixing workflow for QtPass - find, fix, test, PR
Documentation guide for QtPass - README, FAQ, localization
Release workflow for QtPass - versioning, builds, publishing
| name | qtpass-localization-audit |
| description | QtPass localization audit - structural checks on .ts files (placeholders, HTML balance, mnemonics, mixed-script artifacts) |
| license | GPL-3.0-or-later |
| metadata | {"audience":"developers","workflow":"localization"} |
Structural audits for .ts translation files. Use this before merging large translation contributions (machine-translated batches from Qwen, GPT, Gemini, etc.) and after reviewer-supplied refinements that could disturb non-obvious bits like placeholder format or HTML structure.
The audit catches issues that produce broken UI but read as plausible translations:
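- Placeholder mismatches (`%1`, `%2`).
- Unbalanced or missing HTML tags: `<br>`/`<strong>`/`<a>`/`<p>` etc.
- Missing keyboard mnemonics: `&Letter`.

The Quick audit covers placeholders, mnemonics, HTML tag balance, and mixed-script artifacts only.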
It does not detect wrong-string mappings (translation block paired with the wrong source) or cross-language leaks. For those, run the scripts in Targeted patterns — Find wrong-string mappings and Find Slovenian leaks (or any cross-locale leak) — or the full standard cleanup workflow below.
python3 <<'EOF'
import os
import re
from html import unescape

LOC = 'localization/localization_lv_LV.ts'  # change as needed

# English sources that carry a keyboard mnemonic (&X); their translations
# must keep one too.
mnemonic_sources = {
    '&Use pass', 'Nati&ve Git/GPG',
    '&Show', '&Hide', '&Restore', '&Quit',
    'Mi&nimize', 'Ma&ximize',
}

# Locale-to-script map for mixed-script detection. Add ranges as needed.
# Ranges are written as \u escapes on purpose: several block boundaries are
# unassigned or invisible code points that get silently dropped when the
# script is copied from a rendered page, which corrupts the character class.
LOCALE_TO_SCRIPT_RANGES = {
    'si': '\u0D80-\u0DFF',  # Sinhala
    'ta': '\u0B80-\u0BFF',  # Tamil
    'hi': '\u0900-\u097F',  # Devanagari
    'ru': '\u0400-\u04FF',  # Cyrillic
    'uk': '\u0400-\u04FF',
    'bg': '\u0400-\u04FF',
    'sr': '\u0400-\u04FF',
    'el': '\u0370-\u03FF',  # Greek
    'zh': '\u4E00-\u9FFF',  # CJK unified ideographs
    'ja': '\u3040-\u309F\u30A0-\u30FF\u4E00-\u9FFF',  # Hiragana, Katakana, Kanji
    'ko': '\uAC00-\uD7AF',  # Hangul syllables
    'ar': '\u0600-\u06FF',  # Arabic
    'he': '\u0590-\u05FF',  # Hebrew
}

HTML_TAGS = ('html', 'body', 'p', 'br', 'strong', 'a', 'h3', 'pre', 'code',
             'b', 'ol', 'li')

MESSAGE_RE = re.compile(
    r'<message[^>]*>.*?<source>(.*?)</source>.*?<translation([^>]*)>(.*?)</translation>',
    re.DOTALL,
)


def script_range_for(path):
    """Return the script character range for a .ts path, or None for Latin-script locales."""
    name = os.path.basename(path).replace('localization_', '').replace('.ts', '')
    return LOCALE_TO_SCRIPT_RANGES.get(name.split('_')[0])


def iter_messages(content):
    """Yield (source, translation) for every live, non-empty translation.

    .ts files are XML, so '&' and '<' are stored as '&amp;' / '&lt;'. Both
    strings are entity-decoded here; without that the mnemonic lookup and the
    HTML tag counts never see a real '&' or '<' and silently report nothing.
    """
    for m in MESSAGE_RE.finditer(content):
        attrs = m.group(2)
        trans = unescape(m.group(3))
        if 'vanished' in attrs or not trans.strip():
            continue
        yield unescape(m.group(1)), trans


def placeholder_mismatch(src, trans):
    """Return True when the %1/%2/%n placeholders of src and trans disagree.

    %1, %2, ... must match exactly. %n is the Qt plural placeholder: if the
    source uses it the translation must too, but its count may differ across
    plural forms. A '% 1' (space after %) is always an error.
    """
    if sorted(re.findall(r'%\d+', src)) != sorted(re.findall(r'%\d+', trans)):
        return True
    if bool(re.search(r'%n\b', src)) != bool(re.search(r'%n\b', trans)):
        return True
    return bool(re.search(r'%\s+(?:\d|n)', trans))


def html_mismatch(src, trans):
    """Return (tag, src_open, src_close, trans_open, trans_close) for the first unbalanced tag, else None."""
    for tag in HTML_TAGS:
        s_o = len(re.findall(rf'<{tag}[ />]', src))
        s_c = len(re.findall(rf'</{tag}>', src))
        t_o = len(re.findall(rf'<{tag}[ />]', trans))
        t_c = len(re.findall(rf'</{tag}>', trans))
        if s_o != t_o or s_c != t_c:
            return tag, s_o, s_c, t_o, t_c
    return None


def audit(content, script_range=None):
    """Print each finding in content and return (placeholder, mnemonic, html, mixed) counts.

    script_range is a regex character-class body (e.g. a value from
    LOCALE_TO_SCRIPT_RANGES); when falsy the mixed-script check is skipped.
    """
    # Compile once, not per message.
    mixed_re = None
    if script_range:
        mixed_re = re.compile(f'[{script_range}]+[a-z]+|[a-z]+[{script_range}]+')

    ph = mn = html = mixed = 0
    for src, trans in iter_messages(content):
        # 1. Placeholder integrity.
        if placeholder_mismatch(src, trans):
            ph += 1
            print(f' [PH] {src[:60]}\n -> {trans[:80]}')
        # 2. Missing mnemonics on known mnemonic-bearing sources.
        if src in mnemonic_sources and not re.search(r'&[^\s&;]', trans):
            mn += 1
            print(f' [MN] {src} -> {trans[:60]}')
        # 3. HTML tag balance (per-tag count must match source).
        bad = html_mismatch(src, trans)
        if bad is not None:
            tag, s_o, s_c, t_o, t_c = bad
            html += 1
            print(f' [HTML] {src[:40]} | <{tag}> s={s_o}/{s_c} t={t_o}/{t_c}')
        # 4. Mixed-script (Latin letters glued to a non-Latin word, or vice versa).
        if mixed_re is not None and mixed_re.search(trans):
            mixed += 1
            print(f' [MIX] {src[:40]} | {trans[:60]}')
    return ph, mn, html, mixed


def main():
    """Audit the file named by LOC and print the summary line."""
    # .ts files are UTF-8; do not depend on the platform default encoding.
    with open(LOC, encoding='utf-8') as f:
        content = f.read()
    ph, mn, html, mixed = audit(content, script_range_for(LOC))
    print(f'\nplaceholder={ph} mnemonic={mn} html={html} mixed-script={mixed}')


if __name__ == '__main__':
    main()
EOF
For a quick across-the-board check (no per-issue printing, just counts):
python3 <<'EOF'
import glob
import os
import re
from html import unescape

# English sources that carry a keyboard mnemonic (&X); their translations
# must keep one too.
mnemonic_sources = {
    '&Use pass', 'Nati&ve Git/GPG', '&Show', '&Hide',
    '&Restore', '&Quit', 'Mi&nimize', 'Ma&ximize',
}

HTML_TAGS = ('html', 'body', 'p', 'br', 'strong', 'a', 'h3', 'pre', 'code',
             'b', 'ol', 'li')

MESSAGE_RE = re.compile(
    r'<message[^>]*>.*?<source>(.*?)</source>.*?<translation([^>]*)>(.*?)</translation>',
    re.DOTALL,
)


def html_unbalanced(src, t):
    """Return True when any tracked tag's open/close counts differ between src and t."""
    for tag in HTML_TAGS:
        if (len(re.findall(rf'<{tag}[ />]', src)) != len(re.findall(rf'<{tag}[ />]', t))
                or len(re.findall(rf'</{tag}>', src)) != len(re.findall(rf'</{tag}>', t))):
            return True
    return False


def count_issues(content):
    """Return (placeholder, mnemonic, html) issue counts for one .ts file's text.

    Vanished and empty translations are skipped. Source and translation are
    entity-decoded first: .ts files are XML, so '&'/'<' appear as
    '&amp;'/'&lt;' and the mnemonic and tag checks cannot match otherwise.
    """
    ph = mn = html = 0
    for m in MESSAGE_RE.finditer(content):
        a = m.group(2)
        t = unescape(m.group(3))
        if 'vanished' in a or not t.strip():
            continue
        src = unescape(m.group(1))
        # %1/%2 must match exactly; %n must be present on both sides or
        # neither; '% 1' (space after %) is always broken.
        if (sorted(re.findall(r'%\d+', src)) != sorted(re.findall(r'%\d+', t))
                or bool(re.search(r'%n\b', src)) != bool(re.search(r'%n\b', t))
                or re.search(r'%\s+(?:\d|n)', t)):
            ph += 1
        if src in mnemonic_sources and not re.search(r'&[^\s&;]', t):
            mn += 1
        if html_unbalanced(src, t):
            html += 1
    return ph, mn, html


def main():
    """Print per-locale counts for every file with findings, then the totals."""
    total_ph = total_mn = total_html = 0
    for f in sorted(glob.glob('localization/localization_*.ts')):
        loc = os.path.basename(f).replace('localization_', '').replace('.ts', '')
        # .ts files are UTF-8; close the handle instead of leaking one per file.
        with open(f, encoding='utf-8') as fh:
            ph, mn, html = count_issues(fh.read())
        if ph or mn or html:
            print(f'{loc:8s} ph={ph:3d} mn={mn:3d} html={html:3d}')
        total_ph += ph
        total_mn += mn
        total_html += html
    print(f'---\ntotal: ph={total_ph} mn={total_mn} html={total_html}')


if __name__ == '__main__':
    main()
EOF
The audit finds structural breakage; semantic problems need eye review. Patterns we've repeatedly caught from machine-translated contributions:
- `Atgriezt paroli` (= "Return password") for "Repeat password" in lv_LV; `Iekrētāja lekts` (≈ "Mr. [Non-word] sheet") for "Clipboard cleared".
- `% 1` (with space) instead of `%1` — Qt won't substitute and renders the literal.
- `sadržju` (= "content") in a Slovenian file; Latvian `Parole` for "password" appearing in Lithuanian.
- `Pārencēšana` (made-up Latvian for "re-encryption" — should be `Pāršifrēšana`); `būtošana` (made-up from `būt` = "to be") for "status".
- `පොලීසිය` (Sinhala for "police") for "search"; `මුදල්`/`මූල්ය` ("money"/"financial") for unrelated concepts; `හාර්ය` ("wife") for "key".
- `kopkopu` ("duplicate-duplicate") in lv_LV for "backup"; same fragment repeated 3× in a single string for sl_SI.
- `%顯示` instead of `&顯示` for `&Show`.
- Stray `},{` characters (machine-translation tooling fragment).
- `<b>Clipboard</b>` left untranslated inside an otherwise-Sinhala paragraph.
- The `ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789` source must NOT be translated. Some MT pipelines insert hyphens (`A-Z-...`) or substitute regional letters (`abcdeļš` for the middle of the alphabet) — both break password generation.

When a friend-or-bot contributes a batch of machine translations:
Commit the contribution as-is (one commit, attribute properly):
# Create a dedicated branch for the contribution, based on upstream/main.
git checkout -b LangFromContributor upstream/main
# NOTE(review): `git checkout -- <file>` discards uncommitted working-tree
# changes to that file. Presumably this runs BEFORE the contributed file is
# copied in; run after, it would throw the contribution away — confirm order.
git checkout -- localization/localization_<lang>.ts # in case of unrelated working-tree noise
# Stage only the contributed translation file.
git add localization/localization_<lang>.ts
# -S signs the commit; the message credits the contributor.
git commit -S -m "i18n(<lang>): translations from <Contributor>"
Run the audit (single-file form above).
For each finding, decide:
- If the correct wording is unclear, reset the entry to `<translation type="unfinished"></translation>` for native review.
- Otherwise fix it in place (see the `qtpass-localization` skill).

Commit the cleanup as a second commit on the same branch (this keeps the contribution diff reviewable).
Re-run the audit to confirm 0 placeholder / 0 HTML / 0 mnemonic / 0 mixed-script.
lrelease6 smoke test:
# Compile the .ts file with lrelease6 and show only the last 3 lines of its output.
lrelease6 localization/localization_<lang>.ts | tail -3
# Delete the generated .qm afterwards; -f keeps rm quiet if it does not exist.
rm -f localization/localization_<lang>.qm
Push and PR, attributing the contributor in the commit body / PR description.
The patterns below cover checks the Quick audit intentionally skips — wrong-string mappings, cross-language leaks, and verbatim-untranslated source fallback. Run them when reviewing a translation contribution or before merging a Weblate sync.
Useful for scanning whether a file has actual coverage or is mostly source-fallback. Filter for proper-noun / technical-token false positives:
import glob
import re

# Proper nouns / technical tokens that are legitimately identical in every locale.
ignore = {
    'QtPass', 'GPG', 'Git', 'OTP', 'PWGen', 'pass', 'Pass', 'Email', '%1', '%2', '…', '⌕',
    'Aa', 'LTR', 'Ctrl+G', 'Ctrl+N', 'PGP', 'GnuPG', 'YubiKey', 'Push', 'Git push', 'Git pull',
    'gpg', 'git', 'Git:', 'qrencode', 'pwgen',
    'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789',
    'login\nURL\ne-mail',
    'QProcess::FailedToStart', 'QProcess::Crashed', 'QProcess::Timedout',
    'QProcess::ReadError', 'QProcess::WriteError', 'QProcess::UnknownError',
}

MESSAGE_RE = re.compile(
    r'<message[^>]*>.*?<source>(.*?)</source>.*?<translation([^>]*)>(.*?)</translation>',
    re.DOTALL,
)


def find_untranslated(content):
    """Return the sources in content whose translation is a verbatim copy of the source.

    Vanished/empty translations, entries in `ignore`, sources containing
    'href', and sources of two characters or fewer are skipped.
    """
    hits = []
    for m in MESSAGE_RE.finditer(content):
        src, a, t = m.group(1), m.group(2), m.group(3)
        if 'vanished' in a or not t.strip():
            continue
        # The equality test ignores surrounding whitespace, so the ignore
        # lookup must use the same stripped key.
        key = src.strip()
        if key == t.strip() and key not in ignore and 'href' not in src and len(key) > 2:
            hits.append(src)
    return hits


def main():
    """Print every verbatim-untranslated source in all localization files."""
    for f in sorted(glob.glob('localization/localization_*.ts')):
        # .ts files are UTF-8; close each handle rather than leaking it.
        with open(f, encoding='utf-8') as fh:
            content = fh.read()
        for src in find_untranslated(content):
            print(f'{f}: {src[:60]!r}')


if __name__ == '__main__':
    main()
Replace the keyword set with distinguishing words from the unintended language:
# Detect Slovenian leaks in non-sl_SI files
import glob
import re

slovenian = ['mogoče', 'prepričani', 'želite', 'izbrisati', 'odložišče', 'počiščeno',
             'ključa', 'obesku', 'zagon', 'uspel', 'nepričakovano']

MESSAGE_RE = re.compile(
    r'<message[^>]*>.*?<source>(.*?)</source>.*?<translation([^>]*)>(.*?)</translation>',
    re.DOTALL,
)


def find_leaks(content, words=slovenian):
    """Return (source, translation) pairs whose translation contains any of words.

    Matching is a case-insensitive substring test against the lower-cased
    translation, so entries in words should be lower case. Vanished and empty
    translations are skipped; each message is reported at most once.
    """
    hits = []
    for m in MESSAGE_RE.finditer(content):
        src, a, t = m.group(1), m.group(2), m.group(3)
        if 'vanished' in a or not t.strip():
            continue
        lowered = t.lower()  # lower-case once, not once per keyword
        if any(w in lowered for w in words):
            hits.append((src, t))
    return hits


def main():
    """Scan every localization file except sl_SI and print suspected leaks."""
    for f in sorted(glob.glob('localization/localization_*.ts')):
        if 'sl_SI' in f:
            continue
        # .ts files are UTF-8; close each handle rather than leaking it.
        with open(f, encoding='utf-8') as fh:
            content = fh.read()
        for src, t in find_leaks(content):
            print(f'{f}: {src[:50]} -> {t[:60]}')


if __name__ == '__main__':
    main()
Look for translations whose %-placeholder set doesn't match the source's. Already covered by the main placeholder check, but a separate scan focused on mismatched length / radically different translation can also help:
import glob
import re

MESSAGE_RE = re.compile(
    r'<message[^>]*>.*?<source>(.*?)</source>.*?<translation([^>]*)>(.*?)</translation>',
    re.DOTALL,
)


def find_length_outliers(content):
    """Return (source, translation) pairs with a suspicious length ratio.

    A pair is suspicious when the source is longer than 10 characters and the
    translation is more than 3x longer than the source, or less than a third
    of its length. Vanished and empty translations are skipped.
    """
    hits = []
    for m in MESSAGE_RE.finditer(content):
        src, a, t = m.group(1), m.group(2), m.group(3)
        if 'vanished' in a or not t.strip():
            continue
        # Suspicious: translation 3x longer than source, or vice versa
        if len(src) > 10 and (len(t) > 3 * len(src) or len(t) * 3 < len(src)):
            hits.append((src, t))
    return hits


def main():
    """Print every length-ratio outlier in all localization files."""
    for f in sorted(glob.glob('localization/localization_*.ts')):
        # .ts files are UTF-8; close each handle rather than leaking it.
        with open(f, encoding='utf-8') as fh:
            content = fh.read()
        for src, t in find_length_outliers(content):
            print(f'{f}: len-ratio {len(src)}->{len(t)}: {src[:50]!r}')


if __name__ == '__main__':
    main()
The audit can be skipped for:

- Pure wording refinements (e.g. `krātuve`→`glabātuve`) on a single locale: structure is preserved by definition.
- Removing `type="unfinished"` (finalising) on previously-validated translations.

For everything else — especially MT-source contributions — running the audit takes ~5 seconds and reliably catches issues that would otherwise need a second PR.