ci: tighten crowdin and i18n PR checks

This commit is contained in:
春河晴
2026-03-13 00:35:11 +09:00
parent 6bac2b9331
commit 4b7ee3923c
6 changed files with 132 additions and 15 deletions

View File

@@ -2,6 +2,10 @@
name: PR Precheck
on: [pull_request]
permissions:
contents: read
issues: write
jobs:
conflict-check:
runs-on: ubuntu-24.04
@@ -11,19 +15,37 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
- name: Check Conflicts
id: check-conflicts
env:
BASE_REF: ${{ github.event.pull_request.base.ref }}
run: |
git fetch origin main
$conflicts = git diff --name-only --diff-filter=U origin/main...HEAD
if ($conflicts) {
echo "conflict=true" >> $env:GITHUB_OUTPUT
Write-Host "Conflicts detected in files: $conflicts"
} else {
echo "conflict=false" >> $env:GITHUB_OUTPUT
Write-Host "No conflicts detected"
}
shell: pwsh
set -euo pipefail
git fetch origin "$BASE_REF":"refs/remotes/origin/$BASE_REF" --depth=1
git config user.email "github-actions[bot]@users.noreply.github.com"
git config user.name "github-actions[bot]"
if git merge --no-commit --no-ff "origin/$BASE_REF" > /tmp/precheck-merge.log 2>&1; then
echo "conflict=false" >> "$GITHUB_OUTPUT"
echo "No conflicts detected against origin/$BASE_REF"
git merge --abort > /dev/null 2>&1 || true
exit 0
fi
if git diff --name-only --diff-filter=U | grep -q .; then
echo "conflict=true" >> "$GITHUB_OUTPUT"
echo "Conflicts detected against origin/$BASE_REF:"
git diff --name-only --diff-filter=U
else
echo "conflict=false" >> "$GITHUB_OUTPUT"
echo "Merge check returned non-zero without unmerged files against origin/$BASE_REF"
cat /tmp/precheck-merge.log
fi
git merge --abort > /dev/null 2>&1 || true
shell: bash
labeler:
runs-on: ubuntu-24.04
needs: conflict-check

View File

@@ -1,5 +1,15 @@
name: Ruff PR Check
on: [ pull_request ]
on:
pull_request:
paths:
- "*.py"
- "**/*.py"
- "pyproject.toml"
- "ruff.toml"
- ".ruff.toml"
- "setup.cfg"
- "tox.ini"
- ".pre-commit-config.yaml"
jobs:
ruff:
runs-on: ubuntu-24.04
@@ -18,4 +28,3 @@ jobs:
- name: Run Ruff Format Check
run: ruff format --check --diff
shell: pwsh

View File

@@ -85,6 +85,11 @@ Prompt 加载规则:
3. 在代码中用 `t()` / `tn()` 替换硬编码字符串。
4. 运行 `python scripts/i18n_validate.py` 校验结构。
对于非 `zh-CN` 的目标 locale:
- 不要手工把中文 source 文案直接复制进目标语言文件后提交。
- 英文 locale 文件中不应保留中文字符;这类残留会被校验脚本拦截。
## 校验脚本
运行:
@@ -101,6 +106,7 @@ python scripts/i18n_validate.py
- 各语言 key 集合是否与 `zh-CN` 对齐
- 占位符集合是否一致
- plural 结构是否一致
- 非 `zh-CN` locale 是否直接保留了包含中文字符的 source 文案
- prompt 模板已存在时,其占位符集合必须与 `prompts/zh-CN/` 对齐
对于 prompt 模板:

View File

@@ -1,6 +1,6 @@
{
"startup.agreement_confirm_prompt": "Type \"confirmed\" or the Chinese word \"同意\", or set environment variables \"EULA_AGREE={eula_hash}\" and \"PRIVACY_AGREE={privacy_hash}\" to continue",
"startup.agreement_confirm_retry": "Please type \"confirmed\" or \"同意\" to continue",
"startup.agreement_confirm_prompt": "Type \"confirmed\", or set environment variables \"EULA_AGREE={eula_hash}\" and \"PRIVACY_AGREE={privacy_hash}\" to continue",
"startup.agreement_confirm_retry": "Please type \"confirmed\" to continue",
"startup.agreement_reconfirm": "The EULA or Privacy Policy has been updated. Please review and confirm again before continuing.",
"startup.agreement_updated": "Updated the {agreement_name} confirmation file {file_hash}",
"startup.brain_external_world_failed": "Failed to start the brain and external world: {error}",

View File

@@ -0,0 +1,39 @@
from __future__ import annotations
from importlib.util import module_from_spec, spec_from_file_location
from pathlib import Path
import json
# Path to the production validator; the tests load it straight from disk
# because scripts/ is not an importable package.
SCRIPT_PATH = Path(__file__).resolve().parents[2] / "scripts" / "i18n_validate.py"
MODULE_SPEC = spec_from_file_location("i18n_validate_script", SCRIPT_PATH)
# Fail fast at collection time if the script was moved or cannot be loaded.
assert MODULE_SPEC is not None
assert MODULE_SPEC.loader is not None
# Execute the script as a throwaway module so its functions are callable here.
I18N_VALIDATE = module_from_spec(MODULE_SPEC)
MODULE_SPEC.loader.exec_module(I18N_VALIDATE)
def write_locale_file(locales_root: Path, locale: str, file_name: str, payload: dict[str, object]) -> None:
    """Write *payload* as pretty-printed UTF-8 JSON to ``locales_root/locale/file_name``.

    Parent directories are created on demand so tests can build a locale tree
    from a bare temporary path.
    """
    target_dir = locales_root / locale
    target_dir.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(payload, ensure_ascii=False, indent=2)
    (target_dir / file_name).write_text(serialized, encoding="utf-8")
def test_validate_json_locales_rejects_han_characters_in_english_locale(tmp_path: Path) -> None:
    """An English locale file that still contains Han characters must be flagged."""
    locales_root = tmp_path / "locales"
    source_payload = {"consent.prompt": "输入\"同意\"继续"}
    english_payload = {"consent.prompt": "Type \"confirmed\" or \"同意\" to continue"}
    write_locale_file(locales_root, "zh-CN", "core.json", source_payload)
    write_locale_file(locales_root, "en-US", "core.json", english_payload)
    errors = I18N_VALIDATE.validate_json_locales(locales_root)
    matching = [error for error in errors if "consent.prompt" in error and "仍包含中文字符" in error]
    assert matching
def test_validate_json_locales_rejects_untranslated_han_source_in_other_target_locales(tmp_path: Path) -> None:
    """A non-English target locale that copies the Chinese source verbatim must be flagged."""
    locales_root = tmp_path / "locales"
    untranslated = {"greeting": "你好,世界"}
    write_locale_file(locales_root, "zh-CN", "core.json", untranslated)
    write_locale_file(locales_root, "ja", "core.json", untranslated)
    errors = I18N_VALIDATE.validate_json_locales(locales_root)
    matching = [error for error in errors if "greeting" in error and "直接保留了包含中文字符的 source 文案" in error]
    assert matching

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
from pathlib import Path
from string import Formatter
import re
import sys
PROJECT_ROOT = Path(__file__).resolve().parents[1]
@@ -23,6 +24,7 @@ from src.common.prompt_i18n import ( # noqa: E402
)
FORMATTER = Formatter()
HAN_CHARACTER_PATTERN = re.compile(r"[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]")
def extract_placeholders(template: str) -> set[str]:
@@ -34,6 +36,41 @@ def extract_placeholders(template: str) -> set[str]:
return placeholders
def contains_han_characters(text: str) -> bool:
    """Return True when *text* contains at least one BMP Han (CJK) character."""
    return bool(HAN_CHARACTER_PATTERN.search(text))
def iter_translation_strings(value: TranslationValue) -> list[str]:
    """Flatten a translation value into a list of strings.

    A plain string yields a single-element list; a plural mapping yields its
    texts ordered by plural-category name so comparisons are deterministic.
    """
    if not isinstance(value, str):
        return [value[category] for category in sorted(value)]
    return [value]
def locale_requires_latin_only_validation(locale: str) -> bool:
    """Return True for English locales ("en" or any "en-*" variant).

    These locales must not contain Han characters; matching is
    case-insensitive.
    """
    lowered = locale.lower()
    return lowered.startswith("en-") or lowered == "en"
def validate_locale_content(
    key: str,
    source_value: TranslationValue,
    target_value: TranslationValue,
    locale: str,
    errors: list[str],
) -> None:
    """Append content-level errors for one translated key, in place.

    Two checks run against the flattened source/target texts:
    1. A target text byte-identical to a Han-containing source text is
       treated as an untranslated copy of the Chinese source.
    2. English locales must not contain Han characters at all.
    """
    source_texts = iter_translation_strings(source_value)
    target_texts = iter_translation_strings(target_value)
    untranslated_copy = False
    # strict=False: plural-category count mismatches are reported by the
    # structural checks, so unpaired texts are simply skipped here.
    for source_text, target_text in zip(source_texts, target_texts, strict=False):
        if source_text == target_text and contains_han_characters(source_text):
            untranslated_copy = True
            break
    if untranslated_copy:
        errors.append(f"[{locale}] key '{key}' 直接保留了包含中文字符的 source 文案,请通过 Crowdin 提供目标语言翻译")
    if locale_requires_latin_only_validation(locale):
        if any(contains_han_characters(text) for text in target_texts):
            errors.append(f"[{locale}] key '{key}' 仍包含中文字符,请移除源语言残留后再提交")
def validate_translation_pair(
key: str,
source_value: TranslationValue,
@@ -103,7 +140,11 @@ def validate_json_locales(locales_root: Path | None = None) -> list[str]:
errors.append(f"[{locale}] 存在多余 key: {key}")
for key in sorted(source_keys & locale_keys):
validate_translation_pair(key, source_catalog[key], catalog[key], locale, errors)
source_value = source_catalog[key]
target_value = catalog[key]
validate_translation_pair(key, source_value, target_value, locale, errors)
if isinstance(source_value, str) == isinstance(target_value, str):
validate_locale_content(key, source_value, target_value, locale, errors)
return errors